diff options
author | ache <ache@ache.one> | 2021-10-03 02:31:32 +0200 |
---|---|---|
committer | ache <ache@ache.one> | 2021-10-03 02:31:54 +0200 |
commit | 7e1d9e251b517153db8b639133c9e3bee266ce1b (patch) | |
tree | c1e02297807107096cf5a8962ed51342a69f0626 /download/dump2msgp.py | |
parent | Command every scripts (diff) |
Rename files
Diffstat (limited to 'download/dump2msgp.py')
-rw-r--r-- | download/dump2msgp.py | 390 |
1 files changed, 0 insertions, 390 deletions
#!/bin/env python

"""dfr - Dump to msgpack

Extract words from the Wiktionary archive. All the parsing is done here.
The product of this script is a MessagePack file that stores every piece of
information in an easily editable and developer-friendly format.

More information on MessagePack (msgpack):
<https://msgpack.org/>

Command line options understood by this script:

  + --out / --output
        The filename of the msgpack file to write.

  + --input
        The filename of the decompressed wiktionary dump.

  + --error
        The filename that will log errors related to parsing.
        Wiktionary is a community edited platform so there are a lot of
        formatting mistakes. This script reports everything that it
        doesn't understand in that file.

  + --ignore
        By default, this script stops on the first error. But as said
        earlier, there are a lot of mistakes in the wiktionary archive dump,
        so this option is intended to ignore errors and just continue.
        Errors are still logged though.
"""


import argparse
import re
import sys
import tempfile as tmp

import msgpack

from sectionList import listInfoSection
from template import template


DEFAULT_OUTPUT = 'dfr.msgpk'


def _wrap(prefix, suffix, default=''):
    """Build a formatter returning ``prefix + (arg or default) + suffix``."""
    return lambda x: prefix + (x or default) + suffix


# Templates that simply expand to their first argument (or to the current
# word when they have no argument).
template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws',
                   'in', 'siècle2', 'fchim', 'nobr', 'wp', 'r',
                   'clé de tri', 'contexte', 'emploi', 'l', 'polytonique',
                   'pron-API', 'registre', 'scmp', 'siècle', 'x',
                   ]

# Templates formatted from their *second* argument.
template_second_lambda_trd = {
    'refnec': (lambda x: '(Référence nécessaire : ' + x + ')'),
    'refnéc': (lambda x: '(Référence nécessaire : ' + x + ')'),
}

# Templates formatted from their *first* argument.
template_second_lambda_snd = {
    'term': (lambda x: '(' + x.title() + ')'),
    'terme': (lambda x: '(' + x.title() + ')'),
    'ex': _wrap('^{', '}', 'e'),
    'exp': _wrap('^{', '}', 'e'),
    'e': _wrap('^{', '}', 'e'),
    'er': _wrap('^{', '}', 'er'),
    'ère': _wrap('^{', '}', 'ère'),
    'ème': _wrap('^{', '}', 'ème'),
    'Ier': _wrap('^{', '}', 'Ier'),
    'III': _wrap('^{', '}', 'III'),
    'small': _wrap('_{', '}'),
    'indice': _wrap('_{', '}'),
    'graphie': _wrap('«', '»'),
    'petites capitales': (lambda x: x.upper()),
    'isbn': (lambda x: 'cf. ISBN ' + x),
    'OCLC': (lambda x: 'cf. OCLC ' + x),
    'variante de': (lambda x: 'Variante de ' + x),
    'variante ortho de': (lambda x: 'Variante orthographique de ' + x),
    'variante orthographique de': (lambda x: 'Variante orthographique de ' + x),
    'sic !': (lambda x: '^{sic ' + x + '}'),
    'sic': (lambda x: '^{sic ' + x + '}'),
    'incise': (lambda x: '_' + x + '_'),
    'n°': (lambda x: 'n°' + x),
    'superlatif de': (lambda x: 'Superlatif de ' + x),
    'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
}

dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)}

# Characters that disqualify a page title (namespaces, multi-word titles).
interdit = " :"

# Lookup tables keyed by the normalised (lower-cased) template name, so that
# entries declared with capitals ('Ier', 'III', 'OCLC') are reachable.
_TEMPLATE_SECOND = frozenset(name.lower() for name in template_second)
_LAMBDA_SND = {k.lower(): v for k, v in template_second_lambda_snd.items()}
_LAMBDA_TRD = {k.lower(): v for k, v in template_second_lambda_trd.items()}

# '=== {{S|<category>|...' section header; group 1 is the category.
_SECTION_RE = re.compile(r'^=== *\{\{ *S\|([^|]+)\|')
# A single markup tag. Deliberately not greedy across several tags.
_TAG_RE = re.compile(r'<[^>]*>')


def _log_error(errorF, *messages):
    """Append each message on its own line to *errorF*; no-op without a file."""
    if not errorF:
        return
    with open(errorF, 'a', encoding='utf-8') as err:
        for message in messages:
            print(message, file=err)


def _normalise_name(name):
    """Lower-case a template name and collapse any Unicode whitespace run
    (including non-breaking spaces) into a single regular space."""
    return ' '.join(name.split()).lower()


def _format_citation(name):
    """Render a 'citation/author/title/...' template name as plain text."""
    cit = name.split('/')
    if len(cit) <= 2:
        return ''
    if len(cit) == 3:
        return 'Par ' + cit[1] + ', ' + cit[2]
    if len(cit) == 4:
        return 'Par ' + cit[1] + ', ' + cit[2] + ', ' + cit[3]
    if len(cit) == 5:
        return cit[1] + '/' + cit[2] + '/' + cit[3] + ', ' + cit[4]
    return '/'.join(cit[1:])


def _expand_templates(text, info, errorF):
    """Replace every closed '{{...}}' in *text*, innermost/rightmost first."""
    while '{{' in text:
        start = text.rfind('{{')
        end = text.find('}}', start)
        if end == -1:
            # Unbalanced markup: leave the remainder untouched.
            break
        end += 2
        expanded = transclusion(text[start:end], info, errorF)
        text = text[:start] + expanded + text[end:]
    return text


def transclusion(trans, info, errorF):
    """Expand one '{{template|arg|...}}' transclusion to plain text.

    trans  -- the transclusion, including its surrounding braces.
    info   -- the entry being built; info['mot'] is used as default text.
    errorF -- filename of the error log, or None to disable logging.

    Returns the replacement text, or '' for an unknown template (which is
    reported in the error log).
    """
    trans = _expand_templates(trans[2:-2], info, errorF)

    s = [part.strip() for part in trans.split('|')]
    name = s[0]
    if name in template:
        return template[name]

    key = _normalise_name(name)
    if key in _TEMPLATE_SECOND:
        return s[1] if len(s) > 1 else info['mot']

    if key.startswith('citation'):
        return _format_citation(name)

    if key in _LAMBDA_SND:
        return _LAMBDA_SND[key](s[1] if len(s) > 1 else '')

    if key in _LAMBDA_TRD:
        return _LAMBDA_TRD[key](s[2] if len(s) > 2 else '')

    _log_error(errorF, name,
               "Incompréhension de la transclusion {} du mot {}".format(
                   trans, info['mot']))
    return ''


def _section_category(line):
    """Return the grammatical category opened by *line*, or None when the
    line is not a '=== {{S|cat|...' header for a category in dictMatch."""
    found = _SECTION_RE.match(line)
    if not found:
        return None
    nat = found.group(1).strip()
    return nat if nat in dictMatch else None


def _pronunciation(line):
    """Return the first argument of the '{{pron|...}}' template in *line*,
    or None when there is no such template or it has no argument."""
    start = line.find('{{pron')
    if start == -1:
        return None
    end = line.find('}}', start)
    inner = line[start + 2:end] if end != -1 else line[start + 2:].rstrip()
    fields = inner.split('|')
    return fields[1] if len(fields) > 1 else None


def _parse_entry_line(line, info, errorF):
    """Update *info* with whatever *line* (inside a word section) provides."""
    if line.startswith('{{fr-'):
        # Inflection box, e.g. '{{fr-rég|pron}}'.
        end = line.find('}}')
        if end == -1:
            return
        infos = [part.strip() for part in line[:end].split('|')]
        info['infos'] = infos
        # The template name keeps its leading '{{', as consumers may rely
        # on that historical format.
        info['accord'] = infos[0]
        if len(infos) > 1:
            info['API'] = infos[1]

    if line.rstrip().startswith("'''"):
        # Headword line: pronunciation and gender.
        api = _pronunciation(line)
        if api is not None:
            info['API'] = api
        if '{{m}}' in line:
            info['genre'] = 'mas'
        elif '{{f}}' in line:
            info['genre'] = 'fem'

    if line.startswith('# '):
        info['def'].append({'def': wikiToMd(line[2:], info, errorF)})
    elif line.startswith('#* '):
        if not info['def']:
            _log_error(errorF, "Exemple sans définition pour le mot {}".format(
                info['mot']))
        else:
            example = wikiToMd(line[3:], info, errorF)
            info['def'][-1].setdefault('ex', []).append(example)
    elif line.startswith('#') and not line.startswith('##'):
        info['def'].append({'def': wikiToMd(line[1:], info, errorF)})


def extract(f, w, errorF):
    """Parse the French wikitext of one page into a list of entries.

    f      -- text file object positioned at the start of the wikitext.
    w      -- the page title (the word).
    errorF -- filename of the error log, or None to disable logging.

    Returns one dict per recognised grammatical section, with the keys
    'mot', 'cat-gram', 'def', 'API', 'infos', 'genre' and 'accord'.
    """
    entries = []
    info = None  # entry being filled; None while looking for a section

    while line := f.readline():
        if info is not None and line.startswith('==='):
            # Any (sub)section header closes the current entry. The same
            # line is then examined below as a potential new section, which
            # avoids seeking backwards in a text-mode file.
            entries.append(info)
            info = None

        if info is None:
            nat = _section_category(line)
            if nat is not None:
                info = {'mot': w,
                        'cat-gram': nat,
                        'def': [],
                        'API': None,
                        'infos': [],
                        'genre': '',
                        'accord': None}
            continue

        _parse_entry_line(line, info, errorF)

    if info is not None:
        entries.append(info)

    return entries


def wikiToMd(line, info, errorF):
    """Convert one line of wikitext to lightweight Markdown-like text.

    Three steps, in order:
      - templates / transclusions: {{info}} -> (Informatique)
      - links: [[target|label]] -> label, [[target]] -> target
      - style: '''bold''' -> *bold*, ''italic'' -> italic
    """
    line = _expand_templates(line.strip(), info, errorF)

    while '[[' in line:
        start = line.rfind('[[')
        end = line.find(']]', start)
        if end == -1:
            break
        end += 2
        link = line[start + 2:end - 2].split('|')
        label = link[1] if len(link) > 1 else link[0]
        line = line[:start] + label + line[end:]

    return line.replace("'''", '*').replace("''", '')


def extractAll(f, errorF, ignore):
    """Scan a whole wiktionary XML dump and parse every French entry.

    f      -- iterable of lines of the decompressed dump.
    errorF -- filename of the error log, or None to disable logging.
    ignore -- when false, the process exits with status -1 on the first
              structural error (a French section opened twice in a page).

    Returns a dict mapping each page title to the list built by extract().
    """
    title = ""
    isFr = False
    hasForbidden = False
    hasText = False
    isEnd = False
    tf = None  # temporary file holding the French wikitext of the page

    dict_ = {}

    try:
        for i, line in enumerate(f, start=1):
            if "</page>" in line:
                if tf:
                    tf.seek(0)
                    dict_[title] = extract(tf, title, errorF)
                    tf.close()
                tf = None
                hasForbidden = False
                hasText = False
                isFr = False
                title = ""
                isEnd = False

            if isEnd:
                # Past </text>: nothing of interest until the next page.
                continue

            if "<title>" in line:
                title = line[line.find('>') + 1:]
                title = title[:title.find('<')]
                if any(c in title for c in interdit):
                    hasForbidden = True

            if (not hasForbidden and "<text bytes=\"" in line
                    and "\" xml:space=\"preserve\">" in line):
                hasText = True

            if not hasForbidden and "== {{langue|fr}}" in line and hasText:
                isFr = True
                if tf:
                    if not ignore:
                        message = f"{title}: Erreur tf encore ouvert !"
                        if errorF:
                            _log_error(errorF, message)
                        else:
                            print(message)
                        tf.seek(0)
                        for line2 in tf:
                            print(line2, end='')
                        print(f"{i}: {line}")

                        sys.exit(-1)
                else:
                    tf = tmp.NamedTemporaryFile(mode="w+t", encoding="utf-8")
            elif not hasForbidden and "== {{langue|" in line:
                # Section of another language: stop collecting.
                isFr = False

            if not hasForbidden and isFr and tf:
                words = line.split()
                start = words[0] if words else ""
                # Look for </text> *before* stripping tags, otherwise the
                # marker is removed and the end of the text is never seen.
                end = line.find('</text>')
                if end != -1:
                    line = line[:end]
                    isEnd = True
                tf.write(_TAG_RE.sub('',
                                     line.replace('<br>', '\n' + start + ' ')))
    finally:
        # A truncated dump can leave the last page open.
        if tf:
            tf.close()

    return dict_


def main(argv=None):
    """Command line entry point: convert a wiktionary dump to msgpack."""
    parser = argparse.ArgumentParser(description='wiktionary dump to msgpack')
    parser.add_argument('-o', '--out', '--output', dest='outputF',
                        action='store', default=DEFAULT_OUTPUT,
                        help='the output filename')
    parser.add_argument('-i', '--input', dest='inputF', action='store',
                        help='the input filename, a dump of wiktionary')
    parser.add_argument('-e', '--error', dest='errorF', action='store',
                        help='the filename to log errors')
    parser.add_argument('--ignore', dest='ignoreError', action='store_true',
                        help='log parsing errors and continue instead of '
                             'stopping on the first one')

    arg = parser.parse_args(argv)

    if arg.inputF is None:
        print('A wiktionary dump is needed', file=sys.stderr)
        sys.exit(-1)

    with open(arg.inputF, 'r', encoding='utf-8') as f:
        res = extractAll(f, arg.errorF, arg.ignoreError)

    with open(arg.outputF, 'wb') as f:
        f.write(msgpack.packb(res))


if __name__ == '__main__':
    main()