#!/usr/bin/env python3
# Provenance: dfr/dump2msgp.py as created by commit
# 7e1d9e251b517153db8b639133c9e3bee266ce1b ("Rename files", ache,
# Sun, 3 Oct 2021 02:31:32 +0200).

"""dfr - Dump to msgpack

Extract words from the Wiktionary archive. All the parsing is done here.
The product of this script is a MessagePack file that stores every piece of
information in an easily editable and developer-friendly format.

More information on MessagePack (msgpack): https://msgpack.org/

Command line options this script understands:

    -o, --out, --output
        The filename of the msgpack file to write.

    -i, --input
        The filename of the decompressed Wiktionary dump.

    -e, --error
        The filename that will log errors related to parsing.
        Wiktionary is a community-edited platform, so there are a lot of
        formatting mistakes. This script reports everything it does not
        understand in that file.

    --ignore
        By default, this script stops on the first error. But as said
        earlier, there are a lot of mistakes in the Wiktionary archive dump,
        so this option is intended to ignore errors and just continue.
        Errors are still logged though.
"""


import tempfile as tmp
import re
import sys
import msgpack
import argparse

if __name__ == '__main__':
    from sectionList import listInfoSection
    from template import template
else:
    from dfr.sectionList import listInfoSection
    from dfr.template import template


# Maps a section name (the 'match' field of each listInfoSection entry) to
# its index in listInfoSection.
dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)}


DEFAULT_OUTPUT = 'dfr.msgpk'


# Templates that are replaced by their first argument (or by the word itself
# when they have no argument).
template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws',
                   'in', 'siècle2', 'fchim', 'nobr', 'wp', 'r',
                   'clé de tri', 'contexte', 'emploi', 'l', 'polytonique',
                   'pron-API', 'registre', 'scmp', 'siècle', 'x',
                   ]

# Set view of template_second for constant-time membership tests.
_TEMPLATE_SECOND = frozenset(template_second)


def _wrap(prefix, default, suffix):
    """Return a formatter producing ``prefix + (arg or default) + suffix``."""
    return lambda x: prefix + (x if x else default) + suffix


# Templates rendered from their SECOND argument (s[2]).
template_second_lambda_trd = {
    'refnec': (lambda x: '(Référence nécessaire : ' + x + ')'),
    'refnéc': (lambda x: '(Référence nécessaire : ' + x + ')'),
}

# Templates rendered from their FIRST argument (s[1]).
# NOTE(review): the original listed 'variante de' and 'variante ortho de'
# several times; presumably those were spacing variants (e.g. non-breaking
# spaces) that cannot be told apart any more -- TODO confirm.
template_second_lambda_snd = {
    'term': (lambda x: '(' + x.title() + ')'),
    'terme': (lambda x: '(' + x.title() + ')'),
    'ex': _wrap('^{', 'e', '}'),
    'exp': _wrap('^{', 'e', '}'),
    'e': _wrap('^{', 'e', '}'),
    'er': _wrap('^{', 'er', '}'),
    'ère': _wrap('^{', 'ère', '}'),
    'ème': _wrap('^{', 'ème', '}'),
    'Ier': _wrap('^{', 'Ier', '}'),
    'III': _wrap('^{', 'III', '}'),
    'small': _wrap('_{', '', '}'),
    'indice': _wrap('_{', '', '}'),
    'graphie': _wrap('«', '', '»'),
    'petites capitales': (lambda x: x.upper()),
    'isbn': (lambda x: 'cf. ISBN ' + x),
    'OCLC': (lambda x: 'cf. OCLC ' + x),
    'variante de': (lambda x: 'Variante de ' + x),
    'variante ortho de': (lambda x: 'Variante orthographique de ' + x),
    'variante orthographique de': (lambda x: 'Variante orthographique de ' + x),
    'sic !': (lambda x: '^{sic ' + x + '}'),
    'sic': (lambda x: '^{sic ' + x + '}'),
    'incise': (lambda x: '_' + x + '_'),
    'n°': (lambda x: 'n°' + x),
    'superlatif de': (lambda x: 'Superlatif de ' + x),
    'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
}


# Characters that disqualify a page title (a space or a colon).
interdit = " :"

# Header of a level-3 section: "=== {{S|<name>|...". Group 1 is <name>.
_SECTION_RE = re.compile(r'^=== *\{\{ *S\|([^|]+)')


def _log_error(errorF, *messages):
    """Append each message on its own line to errorF; no-op without a file."""
    if not errorF:
        return
    with open(errorF, 'a', encoding='utf-8') as err:
        for message in messages:
            print(message, file=err)


def _lookup(table, name):
    """Return table[name], trying the exact spelling then its lower case.

    Returns None when neither spelling is a key of table.
    """
    if name in table:
        return table[name]
    return table.get(name.lower())


def transclusion(trans, info, errorF):
    """Render one wiki template call as plain text.

    trans  -- the template call, including its surrounding '{{' and '}}'.
    info   -- the entry being built; only info['mot'] is read here.
    errorF -- filename of the error log, or a falsy value to disable logging.

    Returns the replacement text, or '' when the template is unknown (the
    unknown template is then reported in errorF).
    """
    trans = trans[2:-2]

    # Expand nested templates, innermost first.
    while '{{' in trans:
        start = trans.rfind('{{')
        end = trans.find('}}', start)
        if end == -1:
            break
        end += 2
        inner = transclusion(trans[start:end], info, errorF)
        trans = trans[:start] + inner + trans[end:]

    s = [part.strip() for part in trans.split('|')]
    name = s[0]
    lowered = name.lower()

    if name in template:
        return template[name]

    if lowered in _TEMPLATE_SECOND:
        return s[1] if len(s) > 1 else info['mot']

    if lowered.startswith('citation'):
        cit = name.split('/')
        if len(cit) <= 2:
            return ''
        if len(cit) == 3:
            return 'Par ' + cit[1] + ', ' + cit[2]
        if len(cit) == 4:
            return 'Par ' + cit[1] + ', ' + cit[2] + ', ' + cit[3]
        if len(cit) == 5:
            return cit[1] + '/' + cit[2] + '/' + cit[3] + ', ' + cit[4]
        return '/'.join(cit[1:])

    # Exact spelling first so that mixed-case keys ('Ier', 'OCLC', ...) can
    # be reached; lower case second, as before.
    render = _lookup(template_second_lambda_snd, name)
    if render is not None:
        return render(s[1] if len(s) > 1 else '')

    render = _lookup(template_second_lambda_trd, name)
    if render is not None:
        return render(s[2] if len(s) > 2 else '')

    _log_error(
        errorF,
        name,
        "Incompréhension de la transclusion {} du mot {}".format(
            trans, info['mot']),
    )
    return ''


def _parse_entry_line(line, info, errorF):
    """Update info in place from one line of a grammatical section body."""
    if line.startswith('{{fr-'):
        # Inflection box: "{{fr-xxx|<API>|..." up to the first '}}'.
        end = line.find('}}')
        if end != -1:
            infos = [part.strip() for part in line[:end].split('|')]
            info['infos'] = infos
            info['accord'] = infos[0]
            if len(infos) > 1:
                info['API'] = infos[1]
        return

    if line.startswith("'''"):
        # Headword line: may carry the pronunciation and the gender.
        p0 = line.find('{{pron')
        if p0 != -1:
            p1 = line.find('}}', p0)
            if p1 != -1:
                parts = line[p0:p1 + 2].split('|')
                # '{{pron}}' without argument carries no pronunciation.
                if len(parts) > 1:
                    info['API'] = parts[1]
        if '{{m}}' in line:
            info['genre'] = 'mas'
        elif '{{f}}' in line:
            info['genre'] = 'fem'
        return

    if line.startswith('# '):
        info['def'].append({'def': wikiToMd(line[2:], info, errorF)})
    elif line.startswith('#* '):
        if not info['def']:
            _log_error(errorF, "Exemple sans définition pour le mot {}".format(
                info['mot']))
        else:
            example = wikiToMd(line[3:], info, errorF)
            info['def'][-1].setdefault('ex', []).append(example)
    elif line.startswith('#') and not line.startswith('##'):
        info['def'].append({'def': wikiToMd(line[1:], info, errorF)})


def extract(f, w, errorF):
    """Parse the French wikitext of one page into a list of entries.

    f      -- text file object positioned at the start of the wikitext; only
              its readline() method is used.
    w      -- the word (page title).
    errorF -- filename of the error log, or a falsy value to disable logging.

    Returns a list with one dict per recognised grammatical section, with
    the keys 'mot', 'cat-gram', 'def', 'API', 'infos', 'genre', 'accord'.
    """
    infoFin = []
    # Section header consumed while reading a section body; it has to be
    # examined again as the potential start of the next section. Pushing the
    # line back avoids seek arithmetic on a text file, which is unreliable.
    pending = None

    def read_line():
        nonlocal pending
        if pending is not None:
            line, pending = pending, None
            return line
        return f.readline()

    while True:
        info = {'mot': w,
                'cat-gram': None,
                'def': [],
                'API': None,
                'infos': [],
                'genre': '',
                'accord': None}

        # State 0: look for the next known grammatical section header.
        found = False
        while line := read_line():
            if not line.startswith(('=== ', '==={')):
                continue
            header = _SECTION_RE.match(line)
            if header is None:
                continue
            nat = header.group(1).strip()
            if nat in dictMatch:
                info['cat-gram'] = nat
                found = True
                break

        if not found:
            break

        # State 1: read the section body up to the next '===' header.
        while line := read_line():
            if line.startswith('==='):
                pending = line
                break
            _parse_entry_line(line, info, errorF)

        infoFin.append(info)

    return infoFin


def wikiToMd(line, info, errorF):
    """Convert one line of wikitext to lightweight Markdown-like text.

    Three steps, in this order:
      - templates / transclusions: {{info}} -> (Informatique)
      - links: [[target|label]] -> label, [[target]] -> target
      - style: '''bold''' -> *bold*, ''italic'' -> italic

    info and errorF are forwarded to transclusion().
    """
    line = line.strip()

    # Templates, innermost first.
    while '{{' in line:
        start = line.rfind('{{')
        end = line.find('}}', start)
        if end == -1:
            break
        end += 2
        rendered = transclusion(line[start:end], info, errorF)
        line = line[:start] + rendered + line[end:]

    # Links.
    while '[[' in line:
        start = line.rfind('[[')
        end = line.find(']]', start)
        if end == -1:
            break
        end += 2
        link = line[start + 2:end - 2].split('|')
        label = link[1] if len(link) > 1 else link[0]
        line = line[:start] + label + line[end:]

    # Style.
    return line.replace("'''", '*').replace("''", '')


def extractAll(f, errorF, ignore):
    """Extract every French entry of a Wiktionary XML dump.

    f      -- iterable of text lines (the decompressed dump).
    errorF -- filename of the error log, or a falsy value to disable logging.
    ignore -- when true, keep going on errors instead of exiting.

    Returns a dict mapping each page title to the list returned by extract().
    Exits the process with status -1 on the first structural error unless
    ignore is set.
    """
    title = ""
    isFr = False
    hasForbidden = False
    hasText = False
    isEnd = False
    tf = None  # temporary file collecting the French wikitext of the page

    dict_ = {}

    # NOTE(review): the dump is XML, so HTML embedded in the wikitext is
    # presumably entity-escaped ('&lt;br&gt;'); confirm that this pattern and
    # the '<br>' literal below are the intended spellings.
    clearHTML = re.compile('<.*>', re.IGNORECASE)

    for i, line in enumerate(f, start=1):
        closes_entry = "</page>" in line and tf is not None
        if closes_entry or "<page>" in line:
            if closes_entry:
                tf.seek(0)
                dict_[title] = extract(tf, title, errorF)
            if tf is not None:
                tf.close()
            tf = None
            hasForbidden = False
            hasText = False
            isFr = False
            title = ""
            isEnd = False

        if isEnd:
            continue

        if "<title>" in line:
            title = line[line.find('>') + 1:]
            title = title[:title.find('<')]
            if any(c in title for c in interdit):
                hasForbidden = True

        if hasForbidden:
            continue

        if "<text bytes=\"" in line and "\" xml:space=\"preserve\">" in line:
            hasText = True

        if "== {{langue|fr}}" in line and hasText:
            isFr = True
            if tf is not None:
                # A second French section in the same page: always report
                # it, and stop unless errors are ignored.
                message = f"{title}: Erreur tf encore ouvert !"
                if errorF:
                    _log_error(errorF, message)
                else:
                    print(message)
                if not ignore:
                    tf.seek(0)
                    while line2 := tf.readline():
                        print(line2, end='')
                    print(f"{i}: {line}")
                    tf.close()
                    sys.exit(-1)
            else:
                tf = tmp.NamedTemporaryFile(mode="w+t", encoding='utf-8')
        elif "== {{langue|" in line:
            isFr = False

        if isFr and tf is not None:
            words = line.split()
            # A '<br>' continues the current item: repeat its list marker.
            start = words[0] if words else ""
            # Look for the end of the text BEFORE stripping tags, otherwise
            # '</text>' itself is removed and never detected.
            end = line.find('</text>')
            if end != -1:
                line = line[:end]
                isEnd = True
            tf.write(clearHTML.sub(
                '', line.replace('<br>', '\n' + start + ' ')))

    if tf is not None:
        tf.close()

    return dict_


def _main(argv=None):
    """Command line entry point: convert a dump to a msgpack file."""
    parser = argparse.ArgumentParser(description='wiktionary dump to msgpack')
    parser.add_argument('-o', '--out', '--output', dest='outputF',
                        action='store', default=DEFAULT_OUTPUT,
                        help='the output filename')
    parser.add_argument('-i', '--input', dest='inputF', action='store',
                        help='the input filename, a dump of wiktionary')
    parser.add_argument('-e', '--error', dest='errorF', action='store',
                        help='the filename to log errors')
    parser.add_argument('--ignore', dest='ignoreError', action='store_true',
                        help='continue on errors instead of stopping')

    arg = parser.parse_args(argv)

    if arg.inputF is None:
        print('A wiktionary dump is needed', file=sys.stderr)
        sys.exit(-1)

    with open(arg.inputF, 'r', encoding='utf-8') as f:
        res = extractAll(f, arg.errorF, arg.ignoreError)

    with open(arg.outputF, 'wb') as f:
        f.write(msgpack.packb(res))


if __name__ == '__main__':
    _main()