aboutsummaryrefslogtreecommitdiff
path: root/download/dump2msgp.py
diff options
context:
space:
mode:
Diffstat (limited to 'download/dump2msgp.py')
-rw-r--r--download/dump2msgp.py65
1 files changed, 52 insertions, 13 deletions
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
index 50372cf..e76115c 100644
--- a/download/dump2msgp.py
+++ b/download/dump2msgp.py
@@ -1,3 +1,39 @@
+#!/bin/env python
+
+"""dfr - Dump to msgpack
+
+Extract words from the Wiktionary archive. All the parsing is done here.
+The output of this script is a MessagePack file that stores all the information
+in an easily editable, developer-friendly format.
+
+More information on MessagePack (msgpack) :
+<https://msgpack.org/>
+
+The script accepts the following command-line options.
+
+
+ + --output
+ The filename of the msgpack file to write.
+
+ + --input
+ The filename of the decompressed wiktionary dump.
+
+ + --error
+ The name of the file in which parsing errors are logged.
+ Wiktionary is a community-edited platform, so there are a lot of
+ formatting mistakes. This script reports everything that it
+ doesn't understand in that file.
+
+ + --ignore
+ By default, this script stops on the first error. But as said earlier,
+ there are a lot of mistakes in the Wiktionary archive dump, so this option
+ tells the script to ignore errors and just continue.
+ Errors are still logged, though.
+
+
+"""
+
+
import tempfile as tmp
import re
import sys
@@ -8,12 +44,6 @@ from sectionList import listInfoSection
from template import template
-"""
-
-Extract words from the Wiktionnary archive
-
-"""
-
DEFAULT_OUTPUT = 'dfr.msgpk'
@@ -259,14 +289,17 @@ def extractAll(f, errorF, ignore):
dict_ = dict()
- cleanr = re.compile('&lt;.*?&gt;')
+ i = 0
+
+ clearHTML = re.compile('&lt;.*&gt;', re.IGNORECASE)
for line in f:
+ i += 1
if "</page>" in line and tf:
tf.seek(0)
- i = extract(tf, title, errorF)
+ tmpInfo = extract(tf, title, errorF)
- dict_[title] = i
+ dict_[title] = tmpInfo
tf.close()
tf = None
@@ -307,7 +340,7 @@ def extractAll(f, errorF, ignore):
tf.seek(0)
while line2 := tf.readline():
print(line2, end='')
- print(line)
+ print(f"{i}: {line}")
exit(-1)
else:
@@ -315,12 +348,18 @@ def extractAll(f, errorF, ignore):
elif not hasForbidden and "== {{langue|" in line:
isFr = False
if not hasForbidden and isFr and tf:
+ start = ""
+ try:
+ start = line.split()[0]
+ except:
+ pass
+ tLine = clearHTML.sub('', line.replace('&lt;br&gt;', '\n' + start + ' '))
try:
- ind = line.index('</text>')
- tf.write(cleanr.sub('', line[:ind].replace('&lt;br&gt;', '\n')))
+ ind = tLine.index('</text>')
+ tf.write(tLine[:ind])
isEnd = True
except:
- tf.write(cleanr.sub('', line.replace('&lt;br&gt;', '\n')))
+ tf.write(tLine)
return dict_