"""Extract words from the Wiktionary archive.

Reads a Wiktionary XML dump line by line, keeps the French sections of every
page whose title contains no forbidden character, converts the wiki markup of
definitions and examples to a light Markdown-like text and serialises the
result with msgpack.
"""
import argparse
import re
import sys
import tempfile as tmp

import msgpack

from sectionList import listInfoSection
from template import template

DEFAULT_OUTPUT = 'dfr.msgpk'

# Templates rendered as their first argument (or the headword when they have
# none). Names are lower-case because lookups are done on ``name.lower()``.
template_second = [
    'link',
    'bd',
    'pc',
    'nom w pc',
    'w',
    'smcp',
    'lien',
    'ws',
    'in',
    'siècle2',
    'fchim',
    'nobr',
    'wp',
    'r',
    'clé de tri',
    'contexte',
    'emploi',
    'l',
    'polytonique',
    'pron-api',
    'registre',
    'scmp',
    'siècle',
    'x',
]


def _superscript(default):
    """Return a renderer producing ``^{arg}``, using *default* when arg is empty."""
    return lambda x: '^{' + (x or default) + '}'


def _subscript(x):
    """Render *x* as ``_{x}``."""
    return '_{' + x + '}'


def _reference_needed(x):
    """Render a "reference needed" note around *x*."""
    return '(Référence nécessaire : ' + x + ')'


def _parenthesized_title(x):
    """Render *x* title-cased and wrapped in parentheses."""
    return '(' + x.title() + ')'


def _sic(x):
    """Render a ``sic`` marker followed by *x*."""
    return '^{sic ' + x + '}'


def _variant_of(x):
    """Render an "orthographic variant of" note for *x*."""
    return 'Variante orthographique de ' + x


# Templates rendered from their SECOND argument.
template_second_lambda_trd = {
    'refnec': _reference_needed,
    'refnéc': _reference_needed,
}

# Templates rendered from their FIRST argument ('' when absent).
# Keys must be lower-case: lookups are done on ``name.lower()``.
template_second_lambda_snd = {
    'term': _parenthesized_title,
    'terme': _parenthesized_title,
    'ex': _superscript('e'),
    'exp': _superscript('e'),
    'e': _superscript('e'),
    'er': _superscript('er'),
    'ère': _superscript('ère'),
    'ème': _superscript('ème'),
    'ier': _superscript('Ier'),
    'iii': _superscript('III'),
    'small': _subscript,
    'indice': _subscript,
    'graphie': (lambda x: '«' + x + '»'),
    'petites capitales': (lambda x: x.upper()),
    'isbn': (lambda x: 'cf. ISBN ' + x),
    'oclc': (lambda x: 'cf. OCLC ' + x),
    'variante de': (lambda x: 'Variante de ' + x),
    'variante ortho de': _variant_of,
    'variante orthographique de': _variant_of,
    'sic !': _sic,
    'sic': _sic,
    'incise': (lambda x: '_' + x + '_'),
    'n°': (lambda x: 'n°' + x),
    'superlatif de': (lambda x: 'Superlatif de ' + x),
    'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
}

# Section name -> index in listInfoSection; only membership is used here.
dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)}

# Characters that disqualify a page title.
interdit = " :"

# Level-3 section header such as ``=== {{S|nom|fr}} ===``; group 1 is the
# section name (first template argument).
_SECTION_RE = re.compile(r'^=== *\{\{ *S\|([^|]+)\|.*$')


def _log_error(errorF, *messages):
    """Append each message as one line to the error log.

    Does nothing when *errorF* is empty or None.
    """
    if not errorF:
        return
    with open(errorF, 'a', encoding='utf-8') as err:
        for message in messages:
            print(message, file=err)


def transclusion(trans, info, errorF):
    """Render one ``{{...}}`` template call as plain text.

    Args:
        trans: the template call, including its surrounding braces.
        info: the entry being built; ``info['mot']`` is used as a fallback
            text and in error messages.
        errorF: path of the error log, or None to disable logging.

    Returns:
        The replacement text, or '' when the template is not understood.
    """
    inner = trans[2:-2]
    # Resolve nested templates first, innermost (right-most opener) first.
    while '{{' in inner:
        start = inner.rfind('{{')
        end = inner.find('}}', start)
        if end == -1:
            break
        end += 2
        nested = transclusion(inner[start:end], info, errorF)
        inner = inner[:start] + nested + inner[end:]

    parts = [part.strip() for part in inner.split('|')]
    name = parts[0]
    lowered = name.lower()

    if name in template:
        return template[name]
    if lowered in template_second:
        return parts[1] if len(parts) > 1 else info['mot']
    if lowered.startswith('citation'):
        # The citation data is carried in the template name itself,
        # separated by slashes.
        cit = name.split('/')
        count = len(cit)
        if count <= 2:
            return ''
        if count == 3:
            return 'Par ' + cit[1] + ', ' + cit[2]
        if count == 4:
            return 'Par ' + cit[1] + ', ' + cit[2] + ', ' + cit[3]
        if count == 5:
            return cit[1] + '/' + cit[2] + '/' + cit[3] + ', ' + cit[4]
        return '/'.join(cit[1:])
    if lowered in template_second_lambda_snd:
        render = template_second_lambda_snd[lowered]
        return render(parts[1] if len(parts) > 1 else '')
    if lowered in template_second_lambda_trd:
        render = template_second_lambda_trd[lowered]
        return render(parts[2] if len(parts) > 2 else '')

    _log_error(
        errorF,
        name,
        "Incompréhension de la transclusion {} du mot {}".format(
            inner, info['mot']),
    )
    return ''


def extract(f, w, errorF):
    """Parse the French wikitext of one page into a list of entries.

    Args:
        f: text file-like object positioned at the start of the wikitext;
            only ``readline()`` is used.
        w: the page title (the word).
        errorF: path of the error log, or None to disable logging.

    Returns:
        A list with one dict per recognised grammatical section, with keys
        'mot', 'cat-gram', 'def', 'API', 'infos', 'genre' and 'accord'.
    """
    entries = []
    # Header line that ended the previous section. It is pushed back here
    # instead of seeking, because byte arithmetic on text-mode positions is
    # undefined and breaks on headers containing non-ASCII characters.
    pending = None

    while True:
        info = {'mot': w, 'cat-gram': None, 'def': [], 'API': None,
                'infos': [], 'genre': '', 'accord': None}

        # State 0: look for the next known grammatical section header.
        found = False
        while line := (pending or f.readline()):
            pending = None
            if not line.startswith(('=== ', '==={')):
                continue
            header = _SECTION_RE.match(line)
            if header is None:
                continue
            nat = header.group(1).strip()
            if nat in dictMatch:
                info['cat-gram'] = nat
                found = True
                break
        if not found:
            break

        # State 1: read the section body until the next '===' header.
        while line := f.readline():
            if line.startswith('{{fr-'):
                end = line.find('}}')
                if end == -1:
                    continue
                # NOTE(review): the first field keeps its leading '{{'
                # (e.g. '{{fr-rég'); left unchanged in case consumers of
                # the output rely on it.
                infos = [part.strip() for part in line[:end].split('|')]
                info['infos'] = infos
                info['accord'] = infos[0]
                if len(infos) > 1:
                    info['API'] = infos[1]

            if line.rstrip().startswith("'''"):
                # Headword line: pronunciation and gender markers.
                pron_start = line.find('{{pron')
                if pron_start != -1:
                    pron_end = line.find('}}', pron_start)
                    if pron_end != -1:
                        fields = line[pron_start + 2:pron_end].split('|')
                        if len(fields) > 1:
                            info['API'] = fields[1]
                if '{{m}}' in line:
                    info['genre'] = 'mas'
                elif '{{f}}' in line:
                    info['genre'] = 'fem'

            if line.startswith('# '):
                info['def'].append({'def': wikiToMd(line[2:], info, errorF)})
            elif line.startswith('#* '):
                if not info['def']:
                    _log_error(
                        errorF,
                        "Exemple sans définition pour le mot {}".format(
                            info['mot']),
                    )
                else:
                    example = wikiToMd(line[3:], info, errorF)
                    info['def'][-1].setdefault('ex', []).append(example)
            elif line.startswith('#') and not line.startswith('##'):
                info['def'].append({'def': wikiToMd(line[1:], info, errorF)})

            if line.startswith('==='):
                # Let State 0 examine this header on the next pass.
                pending = line
                break

        entries.append(info)

    return entries


def wikiToMd(line, info, errorF):
    """Convert one line of wiki markup to light Markdown-like text.

    Three steps, in this order: templates ``{{...}}`` are rendered through
    ``transclusion``, links ``[[target|label]]`` are replaced by their label
    (or target), then bold quotes become ``*`` and italic quotes are dropped.

    Args:
        line: the raw wiki text.
        info: the entry being built (forwarded to ``transclusion``).
        errorF: path of the error log, or None to disable logging.

    Returns:
        The converted text.
    """
    text = line.strip()

    # Templates: right-most opener first, so nested calls resolve inside-out.
    while '{{' in text:
        start = text.rfind('{{')
        end = text.find('}}', start)
        if end == -1:
            break
        end += 2
        rendered = transclusion(text[start:end], info, errorF)
        text = text[:start] + rendered + text[end:]

    # Links.
    while '[[' in text:
        start = text.rfind('[[')
        end = text.find(']]', start)
        if end == -1:
            break
        end += 2
        link = text[start + 2:end - 2].split('|')
        label = link[1] if len(link) > 1 else link[0]
        text = text[:start] + label + text[end:]

    # Style.
    return text.replace("'''", '*').replace("''", '')


def extractAll(f, errorF, ignore):
    """Extract every French entry from a Wiktionary XML dump.

    Args:
        f: iterable of text lines of the dump.
        errorF: path of the error log, or None to disable logging.
        ignore: when true, a second French section found while one is
            already being collected is ignored instead of aborting.

    Returns:
        A dict mapping each page title to the list returned by ``extract``.
    """
    title = ""
    isFr = False
    hasForbidden = False
    hasText = False
    tf = None  # temporary file collecting the French wikitext of the page
    isEnd = False
    dict_ = dict()
    cleanr = re.compile('<.*?>')

    for line in f:
        if "</page>" in line and tf:
            tf.seek(0)
            dict_[title] = extract(tf, title, errorF)
            tf.close()
            tf = None
            hasForbidden = False
            hasText = False
            isFr = False
            title = ""
            isEnd = False
        elif "<page>" in line:
            if tf:
                # Unterminated previous page: drop its buffer explicitly.
                tf.close()
            tf = None
            hasForbidden = False
            hasText = False
            isFr = False
            title = ""
            isEnd = False

        if isEnd:
            # The page text is over; wait for the page boundary.
            continue

        if "<title>" in line:
            title = line[line.find('>') + 1:]
            title = title[:title.find('<')]
            if any(c in title for c in interdit):
                hasForbidden = True

        if (not hasForbidden and "<text bytes=\"" in line
                and "\" xml:space=\"preserve\">" in line):
            hasText = True

        if not hasForbidden and "== {{langue|fr}}" in line and hasText:
            isFr = True
            if tf:
                if not ignore:
                    if errorF:
                        _log_error(
                            errorF, f"{title}: Erreur tf encore ouvert !")
                    else:
                        print(f"{title}: Erreur tf encore ouvert !")
                    tf.seek(0)
                    while line2 := tf.readline():
                        print(line2, end='')
                    print(line)
                    sys.exit(-1)
            else:
                tf = tmp.NamedTemporaryFile(mode="w+t", encoding="utf-8")
        elif not hasForbidden and "== {{langue|" in line:
            # Another language section starts: stop collecting.
            isFr = False

        if not hasForbidden and isFr and tf:
            end = line.find('</text>')
            if end != -1:
                tf.write(cleanr.sub('', line[:end].replace('<br>', '\n')))
                isEnd = True
            else:
                tf.write(cleanr.sub('', line.replace('<br>', '\n')))

    if tf:
        # Dump ended in the middle of a page.
        tf.close()

    return dict_


def main():
    """Parse the command line, run the extraction and write the msgpack file."""
    parser = argparse.ArgumentParser(description='wiktionary dump to msgpack')
    # nargs='?' keeps a bare ``-o`` valid (it selects the default name) while
    # letting ``-o NAME`` actually choose the output file.
    parser.add_argument('-o', '--out', dest='outputF', nargs='?',
                        const=DEFAULT_OUTPUT, default=DEFAULT_OUTPUT,
                        help='the output filename')
    parser.add_argument('-i', '--input', dest='inputF', action='store',
                        help='the input filename, a dump of wiktionary')
    parser.add_argument('-e', '--error', dest='errorF', action='store',
                        help='the filename to log errors')
    parser.add_argument('--ignore', dest='ignoreError', action='store_true',
                        help='do not abort when a page contains a second '
                             'French section')
    arg = parser.parse_args()

    if arg.inputF is None:
        print('A wiktionary dump is needed', file=sys.stderr)
        sys.exit(-1)

    with open(arg.inputF, 'r', encoding='utf-8') as f:
        res = extractAll(f, arg.errorF, arg.ignoreError)

    with open(arg.outputF, 'wb') as f:
        f.write(msgpack.packb(res))


if __name__ == '__main__':
    main()