From 7e1d9e251b517153db8b639133c9e3bee266ce1b Mon Sep 17 00:00:00 2001
From: ache <ache@ache.one>
Date: Sun, 3 Oct 2021 02:31:32 +0200
Subject: Rename files

---
 dfr/dump2msgp.py | 397 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 397 insertions(+)
 create mode 100644 dfr/dump2msgp.py

(limited to 'dfr/dump2msgp.py')

diff --git a/dfr/dump2msgp.py b/dfr/dump2msgp.py
new file mode 100644
index 0000000..1d3e1a2
--- /dev/null
+++ b/dfr/dump2msgp.py
@@ -0,0 +1,397 @@
+#!/bin/env python
+
+"""dfr - Dump to msgpack
+
+Extract words from the Wiktionary archive. All the parsing is done here.
+The product of this script is a MessagePack file that stores all the
+information in an easily editable and developer-friendly format.
+
+More information on MessagePack (msgpack) :
+<https://msgpack.org/>
+
+This script understands the following command line options.
+
+
+ + --output
+    The filename of the msgpack file to write.
+
+ + --input
+    The filename of the decompressed wiktionary dump.
+
+ + --error
+    The filename that will log errors related to parsing.
+    Wiktionary is a community-edited platform, so there are a lot of
+    formatting mistakes. This script will report everything that it
+    doesn't understand in that file.
+
+ + --ignore
+    By default, this script stops on the first error. But as said earlier,
+    there are a lot of mistakes in the Wiktionary archive dump, so this
+    option is intended to ignore errors and just continue.
+    Errors are still logged though.
+
+
+"""
+
+
+import tempfile as tmp
+import re
+import sys
+import msgpack
+import argparse
+
+if __name__ == '__main__':
+    from sectionList import listInfoSection
+    from template import template
+else:
+    from dfr.sectionList import listInfoSection
+    from dfr.template import template
+
+
+# 'match' value of each listInfoSection item -> index of that item.
+dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)}
+
+DEFAULT_OUTPUT = 'dfr.msgpk'  # output filename used when none is given
+
+# Templates rendered as their first argument, or as the word when absent.
+template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws',
+                   'in', 'siècle2', 'fchim', 'nobr', 'wp', 'r',
+                   'clé de tri', 'contexte', 'emploi', 'l', 'polytonique',
+                   'pron-API', 'registre', 'scmp', 'siècle', 'x']
+
+# Templates rendered from their third '|'-separated field ('' when absent).
+template_second_lambda_trd = {
+    'refnec': (lambda x: '(Référence nécessaire : ' + x + ')'),
+    'refnéc': (lambda x: '(Référence nécessaire : ' + x + ')'),
+}
+
+# Templates rendered from their second '|'-separated field ('' when absent).
+template_second_lambda_snd = {
+    'term': (lambda x: '(' + x.title() + ')'),
+    'terme': (lambda x: '(' + x.title() + ')'),
+    'ex': (lambda x: '^{' + (x or 'e') + '}'),
+    'exp': (lambda x: '^{' + (x or 'e') + '}'),
+    'e': (lambda x: '^{' + (x or 'e') + '}'),
+    'er': (lambda x: '^{' + (x or 'er') + '}'),
+    'ère': (lambda x: '^{' + (x or 'ère') + '}'),
+    'ème': (lambda x: '^{' + (x or 'ème') + '}'),
+    'Ier': (lambda x: '^{' + (x or 'Ier') + '}'),
+    'III': (lambda x: '^{' + (x or 'III') + '}'),
+    'small': (lambda x: '_{' + x + '}'),
+    'indice': (lambda x: '_{' + x + '}'),
+    'graphie': (lambda x: '«' + x + '»'),
+    'petites capitales': (lambda x: x.upper()),
+    'isbn': (lambda x: 'cf. ISBN ' + x),
+    'OCLC': (lambda x: 'cf. OCLC ' + x),
+    'variante de': (lambda x: 'Variante de ' + x),
+    'variante  de': (lambda x: 'Variante de ' + x),
+    'variante ortho de': (lambda x: 'Variante orthographique de ' + x),
+    'variante  ortho de': (lambda x: 'Variante orthographique de ' + x),
+    'variante ortho  de': (lambda x: 'Variante orthographique de ' + x),
+    'variante orthographique de': (lambda x: 'Variante orthographique de ' + x),
+    'sic !': (lambda x: '^{sic ' + x + '}'),
+    'sic': (lambda x: '^{sic ' + x + '}'),
+    'incise': (lambda x: '_' + x + '_'),
+    'n°': (lambda x: 'n°' + x),
+    'superlatif de': (lambda x: 'Superlatif de ' + x),
+    'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
+}
+
+# A page whose title contains one of these characters is not parsed.
+interdit = " :"
+
+
+def transclusion(trans, info, errorF):
+    """Render one wiki template call ``{{name|arg|...}}`` as plain text.
+
+    Nested templates are expanded first. Except for `template` (exact name
+    only), the name is tried as written and then lowercased, so that table
+    entries spelled with capitals ('OCLC', 'Ier', 'pron-API') can be reached.
+    trans includes its braces, only info['mot'] is read and errorF is the
+    optional error log. Returns '' for an unknown template (logged if errorF).
+    """
+    trans = trans[2:-2]
+    while '{{' in trans:
+        l0 = trans.rfind('{{')
+        l1 = trans.find('}}', l0)
+        if l1 == -1:
+            break
+        l1 += 2
+        t = transclusion(trans[l0:l1], info, errorF)
+        trans = trans[:l0] + t + trans[l1:]
+
+    s = [field.strip() for field in trans.split('|')]
+    name, low = s[0], s[0].lower()
+    if name in template:
+        return template[name]
+    if name in template_second or low in template_second:
+        return s[1] if len(s) > 1 else info['mot']
+
+    if low.startswith('citation'):
+        cit = name.split('/')[1:]
+        if len(cit) in (2, 3):
+            return 'Par ' + ', '.join(cit)
+        if len(cit) == 4:
+            return '/'.join(cit[:3]) + ', ' + cit[3]
+        return '/'.join(cit) if len(cit) > 4 else ''
+
+    for table, pos in ((template_second_lambda_snd, 1),
+                       (template_second_lambda_trd, 2)):
+        fmt = table.get(name) or table.get(low)
+        if fmt:
+            return fmt(s[pos] if len(s) > pos else '')
+
+    if errorF:
+        with open(errorF, 'a') as err:
+            print(name, file=err)
+            print("Incompréhension de la transclusion {} du mot {}".format(
+                  trans, info['mot']), file=err)
+    return ''
+
+
+def extract(f, w, errorF):
+    """Parse the text of one page into a list of entries for the word w.
+
+    f is a text file object holding the French section of the page and
+    errorF the optional filename of the error log. An entry is started by
+    each ``=== {{S|<name>`` heading whose <name> is a key of dictMatch and
+    runs up to the next line starting with '==='. Every entry is a dict
+    with the keys 'mot', 'cat-gram', 'def', 'API', 'infos', 'genre' and
+    'accord'; the list of entries is returned.
+
+    The heading that closes an entry is kept in `pending` for the next
+    iteration rather than re-read with ``seek(tell() - len(line))``: text
+    file offsets are not character counts, so that seek landed on the
+    wrong byte whenever the heading held a non-ASCII character.
+    """
+    heading = re.compile(r'^=== *{{ *S\|([^|]+)')
+    infoFin = []
+    # Heading read by state 1 that state 0 still has to examine.
+    pending = None
+
+    while True:
+        info = {'mot': w,
+                'cat-gram': None,
+                'def': [],
+                'API': None,
+                'infos': [],
+                'genre': '',
+                'accord': None}
+
+        # State 0: look for the next heading of a known section.
+        found = False
+        while line := (pending or f.readline()):
+            pending = None
+            if not line.startswith(('=== ', '==={')):
+                continue
+            r = heading.match(line)
+            if r is None:
+                # Not a {{S|...}} heading: report it and keep looking.
+                if errorF:
+                    with open(errorF, 'a') as err:
+                        print(f"^[1] Problème à l'initialisation du mot {w}",
+                              file=err)
+                        print(f'line: [{line}]', file=err)
+            elif r.group(1).strip() in dictMatch:
+                info['cat-gram'] = r.group(1).strip()
+                found = True
+                break
+        if not found:
+            break
+
+        # State 1: read the entry up to the next heading.
+        while line := f.readline():
+            if line.startswith('==='):
+                pending = line
+                break
+            if line.startswith('{{fr-'):
+                # '{{fr-...' template: 1st field -> 'accord', 2nd -> 'API'.
+                end = line.find('}}')
+                if end == -1:
+                    continue
+                infos = [x.strip() for x in line[:end].split('|')]
+                info['infos'] = infos
+                info['accord'] = infos[0]
+                if len(infos) > 1:
+                    info['API'] = infos[1]
+            elif line.startswith("'''"):
+                # Line starting in bold: pronunciation template and gender.
+                p0 = line.find('{{pron')
+                p1 = line.find('}}', p0)
+                if p0 != -1 and p1 != -1:
+                    # The closing braces are left out of the fields, and a
+                    # template without argument no longer raises IndexError.
+                    p = line[p0:p1].split('|')
+                    if len(p) > 1:
+                        info['API'] = p[1]
+                if '{{m}}' in line:
+                    info['genre'] = 'mas'
+                elif '{{f}}' in line:
+                    info['genre'] = 'fem'
+            elif line.startswith('# '):
+                info['def'].append({'def': wikiToMd(line[2:], info, errorF)})
+            elif line.startswith('#* '):
+                if info['def']:
+                    example = wikiToMd(line[3:], info, errorF)
+                    info['def'][-1].setdefault('ex', []).append(example)
+                elif errorF:
+                    # Reported in errorF like the other parsing errors.
+                    with open(errorF, 'a') as err:
+                        print("Exemple sans définition pour le mot {}".format(
+                              info['mot']), file=err)
+            elif line.startswith('#') and not line.startswith('##'):
+                info['def'].append({'def': wikiToMd(line[1:], info, errorF)})
+
+        infoFin.append(info)
+
+    return infoFin
+
+
+def wikiToMd(line, info, errorF):
+    """Turn one line of wikitext into lightly marked-up plain text.
+
+    Three passes run in this order:
+      1. every ``{{...}}`` template is replaced by what transclusion()
+         returns for it, starting from the last one of the line;
+      2. every ``[[a|b]]`` link is replaced by b, or by a when the link has
+         a single field;
+      3. the bold marker (three quotes) becomes ``*`` and the italic marker
+         (two quotes) is removed.
+
+    A pass stops at the first opening marker left without a closing one.
+    info and errorF are only forwarded to transclusion().
+    Returns the converted text.
+    """
+    text = line.strip()
+
+    # Pass 1: templates
+    while (start := text.rfind('{{')) != -1:
+        stop = text.find('}}', start)
+        if stop == -1:
+            break
+        rendered = transclusion(text[start:stop + 2], info, errorF)
+        text = text[:start] + rendered + text[stop + 2:]
+
+    # Pass 2: links
+    while (start := text.rfind('[[')) != -1:
+        stop = text.find(']]', start)
+        if stop == -1:
+            break
+        fields = text[start + 2:stop].split('|')
+        label = fields[1] if len(fields) > 1 else fields[0]
+        text = text[:start] + label + text[stop + 2:]
+
+    # Pass 3: style
+    return text.replace("'''", '*').replace("''", '')
+
+
+def extractAll(f, errorF, ignore):
+    """Extract the French entries of every page of a Wiktionary XML dump.
+
+    f is an iterable of lines of the decompressed dump, errorF the optional
+    filename of the error log and ignore tells whether to carry on after an
+    error. Returns a dict mapping each page title to the list built by
+    extract() from the French section of that page.
+
+    Pages whose title holds a character of `interdit` are skipped. The
+    French section of a page is copied to a temporary file, which is parsed
+    when the closing ``</page>`` tag is met.
+
+    A second French section found in the same page is an error: it is
+    always reported (in errorF, or on stdout without it) and the script
+    stops unless ignore is true, as the module docstring states.
+    """
+    title = ""
+    isFr = hasForbidden = hasText = isEnd = False
+    tf = None
+    dict_ = {}
+    # Escaped HTML tags; greedy, so it spans first '&lt;' to last '&gt;'.
+    clearHTML = re.compile('&lt;.*&gt;', re.IGNORECASE)
+
+    for i, line in enumerate(f, 1):
+        if "</page>" in line:
+            if tf:
+                tf.seek(0)
+                dict_[title] = extract(tf, title, errorF)
+                tf.close()
+            # Reset the state for the next page.
+            tf = None
+            title = ""
+            isFr = hasForbidden = hasText = isEnd = False
+        if isEnd:
+            continue
+
+        if "<title>" in line:
+            title = line[line.find('>') + 1:]
+            title = title[:title.find('<')]
+            if any(c in title for c in interdit):
+                hasForbidden = True
+        if hasForbidden:
+            continue
+
+        if "<text bytes=\"" in line and "\" xml:space=\"preserve\">" in line:
+            hasText = True
+        if "== {{langue|fr}}" in line and hasText:
+            isFr = True
+            if tf:
+                message = f"{title}: Erreur tf encore ouvert !"
+                if errorF:
+                    with open(errorF, 'a') as err:
+                        print(message, file=err)
+                else:
+                    print(message)
+                if not ignore:
+                    if not errorF:
+                        # No log file: show what was collected so far.
+                        tf.seek(0)
+                        print(tf.read(), end='')
+                        print(f"{i}: {line}")
+                    sys.exit(-1)
+            else:
+                tf = tmp.NamedTemporaryFile(mode="w+t", encoding="utf-8")
+        elif "== {{langue|" in line:
+            isFr = False
+        if isFr and tf:
+            # '&lt;br&gt;' starts a new line repeating the first word of the
+            # current one; the other escaped tags are removed.
+            words = line.split()
+            start = words[0] if words else ""
+            tLine = clearHTML.sub('', line.replace('&lt;br&gt;', '\n' + start + ' '))
+            ind = tLine.find('</text>')
+            if ind == -1:
+                tf.write(tLine)
+            else:
+                # End of the page text: the rest of the page is ignored.
+                tf.write(tLine[:ind])
+                isEnd = True
+
+    if tf:
+        tf.close()  # dump truncated inside a page
+    return dict_
+
+
+if __name__ == '__main__':
+    # Command line entry point: parse a dump and write the msgpack file.
+    parser = argparse.ArgumentParser(description='wiktionary dump to msgpack')
+    # 'store' and not 'store_const': the option has to take the filename.
+    parser.add_argument('-o', '--out', '--output', dest='outputF',
+                        action='store', default=DEFAULT_OUTPUT,
+                        help='the output filename')
+    parser.add_argument('-i', '--input', dest='inputF', action='store',
+                        help='the input filename, a dump of wiktionary')
+    parser.add_argument('-e', '--error', dest='errorF', action='store',
+                        help='the filename to log errors')
+    parser.add_argument('--ignore', dest='ignoreError', action='store_true',
+                        help='log parsing errors and continue')
+
+    arg = parser.parse_args()
+
+    if arg.inputF is None:
+        print('A wiktionary dump is needed', file=sys.stderr)
+        sys.exit(-1)
+
+    with open(arg.inputF, 'r', encoding='utf-8') as f:
+        res = extractAll(f, arg.errorF, arg.ignoreError)
+
+    with open(arg.outputF, 'wb') as out:
+        out.write(msgpack.packb(res))
-- 
cgit v1.2.3