aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ache <ache@ache.one> 2021-09-05 01:21:03 +0200
committer ache <ache@ache.one> 2021-09-05 01:21:43 +0200
commit 458c38e0764d785a38050c41dcc9262a0604859b (patch)
tree 29823fdb103538f9059d4d7826a70a6fae30c970
parent Last rename dicofr => dfr (diff)
Clean HTML
-rw-r--r-- download/dump2msgp.py 8
1 files changed, 5 insertions, 3 deletions
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
index c0186af..50372cf 100644
--- a/download/dump2msgp.py
+++ b/download/dump2msgp.py
@@ -14,7 +14,7 @@ Extract words from the Wiktionnary archive
"""
-DEFAULT_OUTPUT = 'dicofr.msgpk'
+DEFAULT_OUTPUT = 'dfr.msgpk'
template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws',
@@ -259,6 +259,8 @@ def extractAll(f, errorF, ignore):
dict_ = dict()
+ cleanr = re.compile('&lt;.*?&gt;')
+
for line in f:
if "</page>" in line and tf:
tf.seek(0)
@@ -315,10 +317,10 @@ def extractAll(f, errorF, ignore):
if not hasForbidden and isFr and tf:
try:
ind = line.index('</text>')
- tf.write(line[:ind])
+ tf.write(cleanr.sub('', line[:ind].replace('&lt;br&gt;', '\n')))
isEnd = True
except:
- tf.write(line)
+ tf.write(cleanr.sub('', line.replace('&lt;br&gt;', '\n')))
return dict_