about | summary | refs | log | tree | commit | diff
path: root/download/dump2msgp.py
diff options
context:
space:
mode:
author ache <ache@ache.one> 2021-08-31 08:21:42 +0200
committer ache <ache@ache.one> 2021-08-31 08:21:42 +0200
commit eed13c1b587c292f86d4b302918418ce78637126 (patch)
tree a2c008669214d2e9122c1d5b33c1bf079d9b4ecd /download/dump2msgp.py
parent Fix argument name (diff)
Create a new database
Diffstat (limited to 'download/dump2msgp.py')
-rw-r--r-- download/dump2msgp.py 12
1 files changed, 11 insertions, 1 deletions
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
index 70b483c..c0186af 100644
--- a/download/dump2msgp.py
+++ b/download/dump2msgp.py
@@ -255,6 +255,7 @@ def extractAll(f, errorF, ignore):
hasForbidden = False
hasText = False
tf = None
+ isEnd = False
dict_ = dict()
@@ -271,12 +272,16 @@ def extractAll(f, errorF, ignore):
hasText = False
isFr = False
title = ""
+ isEnd = False
elif "</page>" in line:
tf = None
hasForbidden = False
hasText = False
isFr = False
title = ""
+ isEnd = False
+ if isEnd:
+ continue
if "<title>" in line:
title = line[line.find('>') + 1:]
@@ -308,7 +313,12 @@ def extractAll(f, errorF, ignore):
elif not hasForbidden and "== {{langue|" in line:
isFr = False
if not hasForbidden and isFr and tf:
- tf.write(line)
+ try:
+ ind = line.index('</text>')
+ tf.write(line[:ind])
+ isEnd = True
+ except:
+ tf.write(line)
return dict_