diff options
-rw-r--r-- | download/dump2msgp.py | 8 |
1 files changed, 5 insertions, 3 deletions
diff --git a/download/dump2msgp.py b/download/dump2msgp.py index c0186af..50372cf 100644 --- a/download/dump2msgp.py +++ b/download/dump2msgp.py @@ -14,7 +14,7 @@ Extract words from the Wiktionnary archive """ -DEFAULT_OUTPUT = 'dicofr.msgpk' +DEFAULT_OUTPUT = 'dfr.msgpk' template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws', @@ -259,6 +259,8 @@ def extractAll(f, errorF, ignore): dict_ = dict() + cleanr = re.compile('<.*?>') + for line in f: if "</page>" in line and tf: tf.seek(0) @@ -315,10 +317,10 @@ def extractAll(f, errorF, ignore): if not hasForbidden and isFr and tf: try: ind = line.index('</text>') - tf.write(line[:ind]) + tf.write(cleanr.sub('', line[:ind].replace('<br>', '\n'))) isEnd = True except: - tf.write(line) + tf.write(cleanr.sub('', line.replace('<br>', '\n'))) return dict_ |