From 458c38e0764d785a38050c41dcc9262a0604859b Mon Sep 17 00:00:00 2001 From: ache Date: Sun, 5 Sep 2021 01:21:03 +0200 Subject: Clean HTML --- download/dump2msgp.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/download/dump2msgp.py b/download/dump2msgp.py index c0186af..50372cf 100644 --- a/download/dump2msgp.py +++ b/download/dump2msgp.py @@ -14,7 +14,7 @@ Extract words from the Wiktionnary archive """ -DEFAULT_OUTPUT = 'dicofr.msgpk' +DEFAULT_OUTPUT = 'dfr.msgpk' template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws', @@ -259,6 +259,8 @@ def extractAll(f, errorF, ignore): dict_ = dict() + cleanr = re.compile('<.*?>') + for line in f: if "" in line and tf: tf.seek(0) @@ -315,10 +317,10 @@ def extractAll(f, errorF, ignore): if not hasForbidden and isFr and tf: try: ind = line.index('') - tf.write(line[:ind]) + tf.write(cleanr.sub('', line[:ind].replace('<br>', '\n'))) isEnd = True except: - tf.write(line) + tf.write(cleanr.sub('', line.replace('<br>', '\n'))) return dict_ -- cgit v1.2.3