Rename files

author: ache <ache@ache.one> 2021-10-03 02:31:32 +0200
committer: ache <ache@ache.one> 2021-10-03 02:31:54 +0200
commit: 7e1d9e251b517153db8b639133c9e3bee266ce1b (patch)
tree: c1e02297807107096cf5a8962ed51342a69f0626 /download/dump2msgp.py
parent: Command every scripts (diff)
1 files changed, 0 insertions, 390 deletions
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
deleted file mode 100644
index e76115c..0000000
--- a/download/dump2msgp.py
+++ /dev/null
@@ -1,390 +0,0 @@
-#!/bin/env python
-
-"""dfr - Dump to msgpack
-
-Extract words from the Wiktionary archive. All the parsing is done here.
-The product of this script is a MessagePack file that stores all the
-information in an easily editable, developer-friendly format.
-
-More information on MessagePack (msgpack) :
-<https://msgpack.org/>
-
-This script accepts the following command-line options.
-
-
- + --output
-    The filename of the msgpack file to write.
-
- + --input
-    The filename of the decompressed wiktionary dump.
-
- + --error
-    The filename that will log errors related to parsing.
-    Wiktionary is a community-edited platform, so there are many
-    formatting mistakes. This script reports everything that it
-    doesn't understand in that file.
-
- + --ignore
-    By default, this script stops on the first error. But as noted above,
-    there are many mistakes in the Wiktionary archive dump, so this option
-    is intended to ignore errors and just continue.
-    Errors are still logged though.
-
-
-"""
-
-
-import tempfile as tmp
-import re
-import sys
-import msgpack
-import argparse
-
-from sectionList import listInfoSection
-from template import template
-
-
-# Output filename used when none is given on the command line.
-DEFAULT_OUTPUT = 'dfr.msgpk'
-
-
-# Template names that transclusion() renders as their first argument, or as
-# the current word when the template has no argument.
-template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws',
-                   'in', 'siècle2', 'fchim', 'nobr', 'wp', 'r',
-                   'clé de tri', 'contexte', 'emploi', 'l', 'polytonique',
-                   'pron-API', 'registre', 'scmp', 'siècle', 'x',
-                   ]
-
-
-# Template name -> formatter. transclusion() calls the formatter with the
-# template's second argument, or '' when that argument is missing.
-template_second_lambda_trd = {
-    'refnec': (lambda x: '(Référence nécessaire : ' + x + ')'),
-    'refnéc': (lambda x: '(Référence nécessaire : ' + x + ')'),
-}
-
-# Template name -> formatter. transclusion() calls the formatter with the
-# template's first argument, or '' when that argument is missing.
-#
-# The fallback text is parenthesised on purpose: a conditional expression
-# binds more loosely than '+', so "'^{' + x if x else 'e' + '}'" would drop
-# the closing brace whenever x is non-empty.
-template_second_lambda_snd = {
-    'term': (lambda x: '(' + x.title() + ')'),
-    'terme': (lambda x: '(' + x.title() + ')'),
-    'ex': (lambda x: '^{' + (x or 'e') + '}'),
-    'exp': (lambda x: '^{' + (x or 'e') + '}'),
-    'e': (lambda x: '^{' + (x or 'e') + '}'),
-    'er': (lambda x: '^{' + (x or 'er') + '}'),
-    'ère': (lambda x: '^{' + (x or 'ère') + '}'),
-    'ème': (lambda x: '^{' + (x or 'ème') + '}'),
-    'Ier': (lambda x: '^{' + (x or 'Ier') + '}'),
-    'III': (lambda x: '^{' + (x or 'III') + '}'),
-    'small': (lambda x: '_{' + x + '}'),
-    'indice': (lambda x: '_{' + x + '}'),
-    'graphie': (lambda x: '«' + x + '»'),
-    'petites capitales': (lambda x: x.upper()),
-    'isbn': (lambda x: 'cf. ISBN ' + x),
-    'OCLC': (lambda x: 'cf. OCLC ' + x),
-    'variante de': (lambda x: 'Variante de ' + x),
-    'variante  de': (lambda x: 'Variante de ' + x),
-    'variante ortho de': (lambda x: 'Variante orthographique de ' + x),
-    'variante  ortho de': (lambda x: 'Variante orthographique de ' + x),
-    'variante ortho  de': (lambda x: 'Variante orthographique de ' + x),
-    'variante orthographique de': (lambda x: 'Variante orthographique de ' + x),
-    'sic !': (lambda x: '^{sic ' + x + '}'),
-    'sic': (lambda x: '^{sic ' + x + '}'),
-    'incise': (lambda x: '_' + x + '_'),
-    'n°': (lambda x: 'n°' + x),
-    'superlatif de': (lambda x: 'Superlatif de ' + x),
-    'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
-}
-
-# Index of every known section, keyed by the section's 'match' value.
-dictMatch = dict(
-    (section['match'], position)
-    for position, section in enumerate(listInfoSection)
-)
-
-# Characters whose presence in a page title makes extractAll skip the page.
-interdit = " :"
-
-
-def transclusion(trans, info, errorF):
-    """Render a single wiki template call as plain text.
-
-    Args:
-        trans: The template source, including its surrounding ``{{`` and
-            ``}}`` (both are stripped before parsing).
-        info: Dictionary describing the current word; ``info['mot']`` is used
-            as fallback text and in the error message.
-        errorF: Path of the error log to append to, or a falsy value to
-            disable logging.
-
-    Returns:
-        The replacement text, or ``''`` when the template is not recognised.
-    """
-    trans = trans[2:-2]
-
-    # Expand nested templates first, starting from the last '{{' so the
-    # innermost call is always resolved before the one enclosing it.
-    while '{{' in trans:
-        l0 = trans.rfind('{{')
-        l1 = trans.find('}}', l0)
-        if l1 == -1:
-            break
-        l1 += 2
-        trans = trans[:l0] + transclusion(trans[l0:l1], info, errorF) + trans[l1:]
-
-    s = [part.strip() for part in trans.split('|')]
-    name = s[0]
-    if name in template:
-        return template[name]
-
-    # Some table keys contain capitals ('pron-API', 'Ier', 'III', 'OCLC'), so
-    # a lower-cased lookup alone can never reach them: try the exact name
-    # first, then the lower-cased one.
-    candidates = (name, name.lower())
-
-    if any(candidate in template_second for candidate in candidates):
-        return s[1] if len(s) > 1 else info['mot']
-
-    if name.lower().startswith('citation'):
-        # The citation fields are '/'-separated inside the template name.
-        cit = name.split('/')
-        if len(cit) == 4:
-            return 'Par ' + cit[1] + ', ' + cit[2] + ', ' + cit[3]
-        if len(cit) == 3:
-            return 'Par ' + cit[1] + ', ' + cit[2]
-        if len(cit) == 5:
-            return cit[1] + '/' + cit[2] + '/' + cit[3] + ', ' + cit[4]
-        if len(cit) <= 2:
-            return ''
-        return '/'.join(cit[1:])
-
-    for candidate in candidates:
-        if candidate in template_second_lambda_snd:
-            return template_second_lambda_snd[candidate](s[1] if len(s) > 1 else '')
-
-    for candidate in candidates:
-        if candidate in template_second_lambda_trd:
-            return template_second_lambda_trd[candidate](s[2] if len(s) > 2 else '')
-
-    if errorF:
-        with open(errorF, 'a') as err:
-            print(name, file=err)
-            print("Incompréhension de la transclusion {} du mot {}".format(trans,
-                  info['mot']), file=err)
-    return ''
-
-
-# Header of a grammatical section, e.g. "=== {{S|nom|fr}} ===". Group 1 is
-# the section name (everything up to the next '|').
-_SECTION_HEADER = re.compile(r'^=== *{{ *S\|([^|]+)')
-
-
-def _log_lines(errorF, *messages):
-    """Append each message to the error log; do nothing if errorF is falsy."""
-    if errorF:
-        with open(errorF, 'a') as err:
-            for message in messages:
-                print(message, file=err)
-
-
-def extract(f, w, errorF):
-    """Parse the wikitext of one page into a list of word entries.
-
-    Args:
-        f: Seekable text file positioned at the start of the page's wikitext.
-            It is read with ``readline()`` and rewound with ``tell()`` /
-            ``seek()``.
-        w: The word (page title) being parsed.
-        errorF: Path of the error log to append to, or a falsy value.
-
-    Returns:
-        A list with one dictionary per recognised grammatical section, with
-        the keys 'mot', 'cat-gram', 'def', 'API', 'infos', 'genre' and
-        'accord'.
-    """
-    infoFin = []
-
-    while True:
-        info = {'mot': w,
-                'cat-gram': None,
-                'def': [],
-                'API': None,
-                'infos': [],
-                'genre': '',
-                'accord': None}
-
-        # State 0: skip forward to the next section whose name is known.
-        found = False
-        while line := f.readline():
-            if not line.startswith(('=== ', '==={')):
-                continue
-            header = _SECTION_HEADER.match(line)
-            if header is None:
-                _log_lines(
-                    errorF,
-                    f"^[1] Problème à l'initialisation du mot {info['mot']}",
-                    f'line: [{line}]')
-                continue
-            nat = header.group(1).strip()
-            if nat in dictMatch:
-                info['cat-gram'] = nat
-                found = True
-                break
-
-        if not found:
-            break
-
-        # State 1: read the section body until the next '===' header.
-        rewind_to = None
-        while True:
-            # Remember where the line starts: a text-mode position cannot be
-            # derived afterwards by subtracting a character count.
-            line_start = f.tell()
-            line = f.readline()
-            if not line:
-                break
-
-            if line.startswith('==='):
-                # Next section: hand the header back to state 0.
-                rewind_to = line_start
-                break
-
-            if line.startswith('{{fr-'):
-                end = line.find('}}')
-                if end == -1:
-                    continue
-                infos = [part.strip() for part in line[:end].split('|')]
-                info['infos'] = infos
-                info['accord'] = infos[0]
-                if len(infos) > 1:
-                    info['API'] = infos[1]
-
-            if line.rstrip().startswith("'''"):
-                if '{{pron' in line:
-                    p0 = line.find('{{pron')
-                    p1 = line.find('}}', p0)
-                    if p1 > 0:
-                        p = line[p0:p1 + 2].split('|')
-                        # '{{pron}}' carries no pronunciation argument.
-                        if len(p) > 1:
-                            info['API'] = p[1]
-                if '{{m}}' in line:
-                    info['genre'] = 'mas'
-                elif '{{f}}' in line:
-                    info['genre'] = 'fem'
-
-            if line.startswith('# '):
-                info['def'].append({'def': wikiToMd(line[2:], info, errorF)})
-            elif line.startswith('#* '):
-                if not info['def']:
-                    # Keep the historical default log file when no error
-                    # file was requested.
-                    _log_lines(
-                        errorF or 'wiki_err.log',
-                        "Exemple sans définition pour le mot {}".format(
-                            info['mot']))
-                elif 'ex' in info['def'][-1]:
-                    info['def'][-1]['ex'].append(wikiToMd(line[3:], info, errorF))
-                else:
-                    info['def'][-1]['ex'] = [wikiToMd(line[3:], info, errorF)]
-            elif line.startswith('#') and not line.startswith('##'):
-                info['def'].append({'def': wikiToMd(line[1:], info, errorF)})
-
-        if rewind_to is not None:
-            f.seek(rewind_to)
-        infoFin.append(info)
-
-    return infoFin
-
-
-def wikiToMd(line, info, errorF):
-    """Convert one line of wikitext to lightly marked-up plain text.
-
-    Templates are expanded through transclusion(), ``[[target|label]]`` links
-    are reduced to their label (or to the target when there is no label),
-    then ``'''`` becomes ``*`` and ``''`` is removed.
-    """
-    text = line.strip()
-
-    # Templates: always expand the last '{{' so nested calls go first.
-    while True:
-        opening = text.rfind('{{')
-        if opening == -1:
-            break
-        closing = text.find('}}', opening)
-        if closing == -1:
-            break
-        closing += 2
-        rendered = transclusion(text[opening:closing], info, errorF)
-        text = text[:opening] + rendered + text[closing:]
-
-    # Links: same right-to-left scan as for templates.
-    while True:
-        opening = text.rfind('[[')
-        if opening == -1:
-            break
-        closing = text.find(']]', opening)
-        if closing == -1:
-            break
-        closing += 2
-        parts = text[opening + 2:closing - 2].split('|')
-        shown = parts[1] if parts[1:] else parts[0]
-        text = text[:opening] + shown + text[closing:]
-
-    # Style markers.
-    text = text.replace("'''", '*')
-    return text.replace("''", '')
-
-
-def extractAll(f, errorF, ignore):
-    """Extract every French entry from a Wiktionary XML dump.
-
-    Args:
-        f: Iterable over the lines of the decompressed dump.
-        errorF: Path of the error log to append to, or a falsy value to
-            report on standard output instead.
-        ignore: When true, an inconsistent page is logged and parsing goes
-            on; otherwise the script exits with status -1.
-
-    Returns:
-        A dictionary mapping each page title to the list returned by
-        extract() for that page.
-    """
-    title = ""
-    isFr = False
-    hasForbidden = False
-    hasText = False
-    tf = None
-    isEnd = False
-
-    dict_ = dict()
-
-    clearHTML = re.compile('&lt;.*&gt;', re.IGNORECASE)
-
-    for i, line in enumerate(f, start=1):
-        if "</page>" in line:
-            if tf:
-                # The French part of the page was buffered: parse it now.
-                tf.seek(0)
-                dict_[title] = extract(tf, title, errorF)
-                tf.close()
-
-            tf = None
-            hasForbidden = False
-            hasText = False
-            isFr = False
-            title = ""
-            isEnd = False
-        if isEnd:
-            continue
-
-        if "<title>" in line:
-            title = line[line.find('>') + 1:]
-            title = title[:title.find('<')]
-
-            if any(c in title for c in interdit):
-                hasForbidden = True
-        if not hasForbidden and "<text bytes=\"" in line and "\" xml:space=\"preserve\">" in line:
-            hasText = True
-        if not hasForbidden and "== {{langue|fr}}" in line and hasText:
-            isFr = True
-            if tf:
-                # A second French header while the buffer is still open:
-                # always report it, and stop unless errors are ignored.
-                message = f"{title}: Erreur tf encore ouvert !"
-                if errorF:
-                    with open(errorF, 'a') as err:
-                        print(message, file=err)
-                else:
-                    print(message)
-                if not ignore:
-                    if not errorF:
-                        # No log file: dump the buffered text for debugging.
-                        tf.seek(0)
-                        while line2 := tf.readline():
-                            print(line2, end='')
-                        print(f"{i}: {line}")
-                    tf.close()
-                    sys.exit(-1)
-            else:
-                tf = tmp.NamedTemporaryFile(mode="w+t")
-        elif not hasForbidden and "== {{langue|" in line:
-            isFr = False
-        if not hasForbidden and isFr and tf:
-            # Repeat the line's first token after each '<br>' so that the
-            # continuation keeps the same list marker.
-            words = line.split()
-            start = words[0] if words else ""
-            tLine = clearHTML.sub('', line.replace('&lt;br&gt;', '\n' + start + ' '))
-            ind = tLine.find('</text>')
-            if ind == -1:
-                tf.write(tLine)
-            else:
-                tf.write(tLine[:ind])
-                isEnd = True
-
-    # The dump ended in the middle of a page: release the temporary file.
-    if tf:
-        tf.close()
-
-    return dict_
-
-
-if __name__ == '__main__':
-    # Command-line entry point: parse the dump and write the msgpack file.
-    parser = argparse.ArgumentParser(description='wiktionary dump to msgpack')
-    # The option must store its value: with 'store_const' the output filename
-    # could never differ from the default. '--output' is accepted as well
-    # because the module docstring advertises it.
-    parser.add_argument('-o', '--out', '--output', dest='outputF',
-                        action='store', default=DEFAULT_OUTPUT,
-                        help='the output filename')
-    parser.add_argument('-i', '--input', dest='inputF', action='store',
-                        help='the input filename, a dump of wiktionary')
-    parser.add_argument('-e', '--error', dest='errorF', action='store',
-                        help='the filename to log errors')
-    parser.add_argument('--ignore', dest='ignoreError', action='store_true',
-                        help='log parsing errors and continue instead of '
-                             'stopping')
-
-    arg = parser.parse_args()
-
-    if arg.inputF is None:
-        print('A wiktionary dump is needed', file=sys.stderr)
-        sys.exit(-1)
-
-    with open(arg.inputF, 'r') as dump:
-        res = extractAll(dump, arg.errorF, arg.ignoreError)
-
-    # The input is closed before the output is written.
-    with open(arg.outputF, 'wb') as out:
-        out.write(msgpack.packb(res))
author	ache <ache@ache.one>	2021-10-03 02:31:32 +0200
committer	ache <ache@ache.one>	2021-10-03 02:31:54 +0200
commit	7e1d9e251b517153db8b639133c9e3bee266ce1b (patch)
tree	c1e02297807107096cf5a8962ed51342a69f0626 /download/dump2msgp.py
parent	Command every scripts (diff)