diff options
Diffstat (limited to 'download/dump2msgp.py')
-rw-r--r-- | download/dump2msgp.py | 12 |
1 file changed, 11 insertions, 1 deletion
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
index 70b483c..c0186af 100644
--- a/download/dump2msgp.py
+++ b/download/dump2msgp.py
@@ -255,6 +255,7 @@ def extractAll(f, errorF, ignore):
     hasForbidden = False
     hasText = False
     tf = None
+    isEnd = False
 
     dict_ = dict()
 
@@ -271,12 +272,16 @@ def extractAll(f, errorF, ignore):
             hasText = False
             isFr = False
             title = ""
+            isEnd = False
         elif "</page>" in line:
             tf = None
             hasForbidden = False
             hasText = False
             isFr = False
             title = ""
+            isEnd = False
+        if isEnd:
+            continue
 
         if "<title>" in line:
             title = line[line.find('>') + 1:]
@@ -308,7 +313,12 @@ def extractAll(f, errorF, ignore):
             elif not hasForbidden and "== {{langue|" in line:
                 isFr = False
             if not hasForbidden and isFr and tf:
-                tf.write(line)
+                try:
+                    ind = line.index('</text>')
+                    tf.write(line[:ind])
+                    isEnd = True
+                except:
+                    tf.write(line)
 
     return dict_