1 files changed, 52 insertions, 13 deletions
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
index 50372cf..e76115c 100644
--- a/download/dump2msgp.py
+++ b/download/dump2msgp.py
@@ -1,3 +1,39 @@
+#!/bin/env python
+
+"""dfr - Dump to msgpack
+
+Extract words from the Wiktionary archive. All the parsing is done here.
+The output of this script is a MessagePack file that stores all the information
+in an easily editable, developer-friendly format.
+
+More information on MessagePack (msgpack):
+<https://msgpack.org/>
+
+The script accepts the following command-line options:
+
+
+ + --output
+    The filename of the msgpack file to write.
+
+ + --input
+    The filename of the decompressed wiktionary dump.
+
+ + --error
+    The name of the file in which parsing errors are logged.
+    Wiktionary is a community-edited platform, so it contains a lot of
+    formatting mistakes. This script reports everything that it
+    doesn't understand in that file.
+
+ + --ignore
+    By default, this script stops on the first error. However, as noted above,
+    the Wiktionary archive dump contains a lot of mistakes, so this option
+    tells the script to ignore errors and just continue.
+    Errors are still logged, though.
+
+
+"""
+
+
 import tempfile as tmp
 import re
 import sys
@@ -8,12 +44,6 @@ from sectionList import listInfoSection
 from template import template
 
 
-"""
-
-Extract words from the Wiktionnary archive
-
-"""
-
 DEFAULT_OUTPUT = 'dfr.msgpk'
 
 
@@ -259,14 +289,17 @@ def extractAll(f, errorF, ignore):
 
     dict_ = dict()
 
-    cleanr = re.compile('&lt;.*?&gt;')
+    i = 0
+
+    clearHTML = re.compile('&lt;.*&gt;', re.IGNORECASE)
 
     for line in f:
+        i += 1
         if "</page>" in line and tf:
             tf.seek(0)
-            i = extract(tf, title, errorF)
+            tmpInfo = extract(tf, title, errorF)
 
-            dict_[title] = i
+            dict_[title] = tmpInfo
             tf.close()
 
             tf = None
@@ -307,7 +340,7 @@ def extractAll(f, errorF, ignore):
                         tf.seek(0)
                         while line2 := tf.readline():
                             print(line2, end='')
-                        print(line)
+                        print(f"{i}: {line}")
 
                         exit(-1)
             else:
@@ -315,12 +348,18 @@ def extractAll(f, errorF, ignore):
         elif not hasForbidden and "== {{langue|" in line:
             isFr = False
         if not hasForbidden and isFr and tf:
+            start = ""
+            try:
+                start = line.split()[0]
+            except:
+                pass
+            tLine = clearHTML.sub('', line.replace('&lt;br&gt;', '\n' + start + ' '))
             try:
-                ind = line.index('</text>')
-                tf.write(cleanr.sub('', line[:ind].replace('&lt;br&gt;', '\n')))
+                ind = tLine.index('</text>')
+                tf.write(tLine[:ind])
                 isEnd = True
             except:
-                tf.write(cleanr.sub('', line.replace('&lt;br&gt;', '\n')))
+                tf.write(tLine)
 
     return dict_