about | summary | refs | log | tree | commit | diff
path: root/download/dump2msgp.py
diff options
context:
space:
mode:
Diffstat (limited to 'download/dump2msgp.py')
-rw-r--r-- download/dump2msgp.py 12
1 files changed, 11 insertions, 1 deletions
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
index 70b483c..c0186af 100644
--- a/download/dump2msgp.py
+++ b/download/dump2msgp.py
@@ -255,6 +255,7 @@ def extractAll(f, errorF, ignore):
hasForbidden = False
hasText = False
tf = None
+ isEnd = False
dict_ = dict()
@@ -271,12 +272,16 @@ def extractAll(f, errorF, ignore):
hasText = False
isFr = False
title = ""
+ isEnd = False
elif "</page>" in line:
tf = None
hasForbidden = False
hasText = False
isFr = False
title = ""
+ isEnd = False
+ if isEnd:
+ continue
if "<title>" in line:
title = line[line.find('>') + 1:]
@@ -308,7 +313,12 @@ def extractAll(f, errorF, ignore):
elif not hasForbidden and "== {{langue|" in line:
isFr = False
if not hasForbidden and isFr and tf:
- tf.write(line)
+ try:
+ ind = line.index('</text>')
+ tf.write(line[:ind])
+ isEnd = True
+ except:
+ tf.write(line)
return dict_