"""Extract French word entries from a Wiktionary XML dump into MessagePack.

The script streams ``./fr_wiktionary_all.xml`` line by line, copies the French
section of every page whose title contains no forbidden character into a
temporary file, parses that section with :func:`extract`, and finally writes
the mapping ``{title: [entry, ...]}`` to ``result_all.pack``.

Templates that cannot be interpreted are appended to ``wiki_err.log``.
"""
# Origin: dump2msgp.py as added by commit
# ac7c2e5d071151f69872f8e97dac414e41976168 (ache, 2020-06-16).

import tempfile as tmp
import re
import sys
import msgpack

from listSection import listInfoSection
from template import template


# Templates that are replaced by their first argument (or by the current
# word when they have no argument).
template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws',
                   'in', 'siècle2', 'fchim', 'nobr', 'wp', 'r',
                   'clé de tri', 'contexte', 'emploi', 'l', 'polytonique',
                   'pron-API', 'registre', 'scmp', 'siècle', 'x',
                   ]


def _wrap(prefix, suffix, default=''):
    """Return a formatter giving ``prefix + (argument or default) + suffix``."""
    return lambda x: prefix + (x or default) + suffix


# Templates rendered from their SECOND argument.
template_second_lambda_trd = {
    'refnec': (lambda x: '(Référence nécessaire : ' + x + ')'),
    'refnéc': (lambda x: '(Référence nécessaire : ' + x + ')'),
}

# Templates rendered from their FIRST argument.
# Keys must be lowercase: the lookup is done on the lowercased template name.
template_second_lambda_snd = {
    'term': (lambda x: '(' + x.title() + ')'),
    'terme': (lambda x: '(' + x.title() + ')'),
    'ex': _wrap('^{', '}', 'e'),
    'exp': _wrap('^{', '}', 'e'),
    'e': _wrap('^{', '}', 'e'),
    'er': _wrap('^{', '}', 'er'),
    'ère': _wrap('^{', '}', 'ère'),
    'ème': _wrap('^{', '}', 'ème'),
    'ier': _wrap('^{', '}', 'Ier'),
    'iii': _wrap('^{', '}', 'III'),
    'small': _wrap('_{', '}'),
    'indice': _wrap('_{', '}'),
    'graphie': _wrap('«', '»'),
    'petites capitales': (lambda x: x.upper()),
    'isbn': (lambda x: 'cf. ISBN ' + x),
    'oclc': (lambda x: 'cf. OCLC ' + x),
    'variante de': (lambda x: 'Variante de ' + x),
    'variante ortho de': (lambda x: 'Variante orthographique de ' + x),
    'variante orthographique de': (lambda x: 'Variante orthographique de ' + x),
    'sic !': (lambda x: '^{sic ' + x + '}'),
    'sic': (lambda x: '^{sic ' + x + '}'),
    'incise': (lambda x: '_' + x + '_'),
    'n°': (lambda x: 'n°' + x),
    'superlatif de': (lambda x: 'Superlatif de ' + x),
    'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
}

# Section name -> index in listInfoSection; only its keys are used here, to
# recognise the section headers worth extracting.
dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)}

# A page whose title contains any of these characters is skipped.
interdit = " :"

# Captures the first argument of a level-3 ``{{S|...}}`` section header.
_SECTION_RE = re.compile(r'^=== *\{\{ *S\|([^|]+)')


def _log_error(message):
    """Append one line to the error log."""
    with open('wiki_err.log', 'a', encoding='utf-8') as err:
        print(message, file=err)


def _expand_templates(text, info):
    """Replace every closed ``{{...}}`` in *text*, innermost/rightmost first.

    Stops as soon as the rightmost ``{{`` has no matching ``}}``.
    """
    while '{{' in text:
        start = text.rfind('{{')
        end = text.find('}}', start)
        if end == -1:
            break
        end += 2
        text = text[:start] + transclusion(text[start:end], info) + text[end:]
    return text


def transclusion(trans, info):
    """Render one wiki template call as plain text.

    :param trans: the template call including its surrounding ``{{`` ``}}``.
    :param info: the entry being built; ``info['mot']`` is the fallback text
        for argument-less templates listed in ``template_second``.
    :return: the replacement text, or ``''`` when the template is unknown
        (its name is then appended to ``wiki_err.log``).
    """
    trans = _expand_templates(trans[2:-2], info)

    s = [part.strip() for part in trans.split('|')]
    name = s[0]
    lowered = name.lower()

    if name in template:
        return template[name]

    if lowered in template_second:
        return s[1] if len(s) > 1 else info['mot']

    if lowered.startswith('citation'):
        # The citation fields are encoded in the name itself, '/'-separated.
        cit = name.split('/')
        if len(cit) <= 2:
            return ''
        if len(cit) == 3:
            return 'Par ' + cit[1] + ', ' + cit[2]
        if len(cit) == 4:
            return 'Par ' + cit[1] + ', ' + cit[2] + ', ' + cit[3]
        if len(cit) == 5:
            return cit[1] + '/' + cit[2] + '/' + cit[3] + ', ' + cit[4]
        return '/'.join(cit[1:])

    if lowered in template_second_lambda_snd:
        return template_second_lambda_snd[lowered](s[1] if len(s) > 1 else '')

    if lowered in template_second_lambda_trd:
        return template_second_lambda_trd[lowered](s[2] if len(s) > 2 else '')

    _log_error(name)
    return ''


def extract(f, w):
    """Parse the French section of one page into a list of entries.

    :param f: text stream positioned at the start of the section; only
        ``readline()`` is used.
    :param w: the page title, stored as ``'mot'`` in every entry.
    :return: one dict per recognised ``=== {{S|<name>|...`` section whose name
        is a key of ``dictMatch``, with keys ``mot``, ``cat-gram``, ``def``,
        ``API``, ``infos``, ``genre`` and ``accord``.
    """
    infoFin = []
    # Header line that ended the previous entry; it is re-examined as a
    # possible start of the next one instead of seeking back in the stream.
    pending = None

    while True:
        info = {'mot': w,
                'cat-gram': None,
                'def': [],
                'API': None,
                'infos': [],
                'genre': '',
                'accord': None}

        # State 0: skip lines until a recognised section header.
        found = False
        while True:
            line = pending if pending is not None else f.readline()
            pending = None
            if not line:
                break
            if line.startswith(('=== ', '==={')):
                header = _SECTION_RE.match(line)
                if header:
                    nat = header.group(1).strip()
                    if nat in dictMatch:
                        info['cat-gram'] = nat
                        found = True
                        break

        if not found:
            break

        # State 1: collect the entry body until the next '===' header.
        while line := f.readline():
            if line.startswith('{{fr-'):
                end = line.find('}}')
                if end == -1:
                    continue
                infos = [part.strip() for part in line[:end].split('|')]
                info['infos'] = infos
                info['accord'] = infos[0]
                if len(infos) > 1:
                    info['API'] = infos[1]

            if line.startswith("'''"):
                # Headword line: '''word''' {{pron|...|fr}} {{m}}
                p0 = line.find('{{pron')
                if p0 != -1:
                    p1 = line.find('}}', p0)
                    if p1 != -1:
                        inner = line[p0 + 2:p1]
                    else:
                        inner = line[p0 + 2:].rstrip()
                    fields = inner.split('|')
                    if len(fields) > 1:
                        info['API'] = fields[1]
                if '{{m}}' in line:
                    info['genre'] = 'mas'
                elif '{{f}}' in line:
                    info['genre'] = 'fem'

            if line.startswith('# '):
                info['def'].append({'def': wikiToMd(line[2:], info)})
            elif line.startswith('#* '):
                # An example belongs to the most recent definition.
                if not info['def']:
                    _log_error("Exemple sans définition pour le mot {}".format(
                        info['mot']))
                elif 'ex' in info['def'][-1]:
                    info['def'][-1]['ex'].append(wikiToMd(line[3:], info))
                else:
                    info['def'][-1]['ex'] = [wikiToMd(line[3:], info)]
            elif line.startswith('#') and not line.startswith('##'):
                info['def'].append({'def': wikiToMd(line[1:], info)})

            if line.startswith('==='):
                pending = line
                break

        infoFin.append(info)

    return infoFin


def wikiToMd(line, info):
    """Convert one line of wikitext to lightly marked-up plain text.

    Three steps, in this order:
      - templates ``{{...}}`` are rendered through :func:`transclusion`;
      - links ``[[target|label]]`` become ``label`` (or ``target``);
      - ``'''`` becomes ``*`` and ``''`` is removed.

    :param line: the wikitext to convert.
    :param info: the entry being built, forwarded to :func:`transclusion`.
    :return: the converted text, stripped of surrounding whitespace.
    """
    line = _expand_templates(line.strip(), info)

    # Links, rightmost first; stop at the first unclosed '[['.
    while '[[' in line:
        link0 = line.rfind('[[')
        link1 = line.find(']]', link0)
        if link1 == -1:
            break
        link1 += 2
        link = line[link0 + 2:link1 - 2].split('|')
        line = line[:link0] + (link[1] if len(link) > 1 else link[0]) + line[link1:]

    # Style
    return line.replace("'''", '*').replace("''", '')


def main():
    """Convert ``./fr_wiktionary_all.xml`` into ``result_all.pack``."""
    title = ""
    isFr = False
    hasForbidden = False
    hasText = False
    tf = None

    dict_ = dict()

    with open("./fr_wiktionary_all.xml", 'r', encoding='utf-8') as f:
        for line in f:
            # NOTE(review): the tag literals were lost in the rendered patch
            # (they appeared as empty strings); "</page>" and "<title>" are
            # reconstructed from the surrounding logic -- confirm against the
            # repository.
            if "</page>" in line:
                if tf:
                    tf.seek(0)
                    i = extract(tf, title)

                    for w in i:
                        if w['mot'] == 'président':
                            print("What we exstract from it:")
                            print(i)

                    dict_[title] = i
                    tf.close()

                tf = None
                hasForbidden = False
                hasText = False
                isFr = False
                title = ""

            if "<title>" in line:
                title = line.partition('<title>')[2].partition('<')[0]
                if any(c in title for c in interdit):
                    hasForbidden = True

            if not hasForbidden and "<text xml:space=\"preserve\">" in line:
                hasText = True

            if not hasForbidden and "== {{langue|fr}}" in line and hasText:
                isFr = True
                if tf:
                    print("Erreur tf encore ouvert !")
                    sys.exit(-1)
                tf = tmp.NamedTemporaryFile(mode="w+t", encoding='utf-8')
            elif not hasForbidden and "== {{langue|" in line:
                # Another language section starts: stop copying.
                isFr = False

            if not hasForbidden and isFr and tf:
                tf.write(line)

    print("Will save the result")

    with open('result_all.pack', 'wb') as out:
        out.write(msgpack.packb(dict_))


if __name__ == "__main__":
    main()