aboutsummaryrefslogtreecommitdiff
path: root/download/dump2msgp.py
diff options
context:
space:
mode:
Diffstat (limited to 'download/dump2msgp.py')
-rw-r--r--download/dump2msgp.py339
1 files changed, 339 insertions, 0 deletions
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
new file mode 100644
index 0000000..70b483c
--- /dev/null
+++ b/download/dump2msgp.py
@@ -0,0 +1,339 @@
+import tempfile as tmp
+import re
+import sys
+import msgpack
+import argparse
+
+from sectionList import listInfoSection
+from template import template
+
+
+"""
+
+Extract words from the Wiktionary archive
+
+"""
+
# File name used for the msgpack output when none is given on the command line.
DEFAULT_OUTPUT = 'dicofr.msgpk'


# Templates that are rendered as their first argument (or as the current word
# when they have no argument).  `transclusion` lower-cases the template name
# before looking it up here, so every name needs a lower-case spelling:
# 'pron-api' is the reachable form of 'pron-API', which is kept for any code
# that relies on the original entry.
template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws',
                   'in', 'siècle2', 'fchim', 'nobr', 'wp', 'r',
                   'clé de tri', 'contexte', 'emploi', 'l', 'polytonique',
                   'pron-API', 'pron-api', 'registre', 'scmp', 'siècle', 'x',
                   ]
+
+
def _reference_needed(text):
    """Wrap *text* in the "(Référence nécessaire : ...)" marker."""
    return '(Référence nécessaire : ' + text + ')'


# Templates rendered from their SECOND argument; both spellings of the
# template name share the same formatter.
template_second_lambda_trd = dict.fromkeys(
    ('refnec', 'refnéc'),
    _reference_needed,
)
+
def _superscript(default):
    """Return a formatter rendering its argument as '^{...}'.

    When the argument is empty, *default* is rendered instead.  The
    parentheses around the conditional matter: without them the expression
    parses as ``('^{' + x) if x else (default + '}')`` and one of the two
    braces is lost.
    """
    def render(x):
        return '^{' + (x if x else default) + '}'
    return render


def _subscript(x):
    """Render *x* as '_{x}' (an empty argument gives '_{}')."""
    return '_{' + x + '}'


# Templates rendered from their FIRST argument (an empty string is passed when
# the template has no argument).  `transclusion` looks names up lower-cased,
# so the mixed-case names also get a lower-case alias; the original spellings
# are kept for any code that relies on them.
#
# NOTE(review): the original literal listed 'variante de' twice and
# 'variante ortho de' three times with keys that look byte-identical here.
# If those were meant to be typographic-space variants (e.g. no-break space),
# re-add them with explicit escapes such as 'variante\u00a0de'.
template_second_lambda_snd = {
    'term': (lambda x: '(' + x.title() + ')'),
    'terme': (lambda x: '(' + x.title() + ')'),
    'ex': _superscript('e'),
    'exp': _superscript('e'),
    'e': _superscript('e'),
    'er': _superscript('er'),
    'ère': _superscript('ère'),
    'ème': _superscript('ème'),
    'Ier': _superscript('Ier'),
    'ier': _superscript('Ier'),
    'III': _superscript('III'),
    'iii': _superscript('III'),
    'small': _subscript,
    'indice': _subscript,
    'graphie': (lambda x: '«' + x + '»'),
    'petites capitales': (lambda x: x.upper()),
    'isbn': (lambda x: 'cf. ISBN ' + x),
    'OCLC': (lambda x: 'cf. OCLC ' + x),
    'oclc': (lambda x: 'cf. OCLC ' + x),
    'variante de': (lambda x: 'Variante de ' + x),
    'variante ortho de': (lambda x: 'Variante orthographique de ' + x),
    'variante orthographique de': (lambda x: 'Variante orthographique de ' + x),
    'sic !': (lambda x: '^{sic ' + x + '}'),
    'sic': (lambda x: '^{sic ' + x + '}'),
    'incise': (lambda x: '_' + x + '_'),
    'n°': (lambda x: 'n°' + x),
    # A space was missing after "de" (compare with "Variante de " above).
    'superlatif de': (lambda x: 'Superlatif de ' + x),
    'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
}
+
# Maps each section's 'match' value to its position in listInfoSection
# (when a value occurs more than once, the last position wins).
dictMatch = dict(
    (section['match'], position)
    for position, section in enumerate(listInfoSection)
)

# Characters that disqualify a page title from extraction.
interdit = " :"
+
+
def transclusion(trans, info, errorF):
    """Render one wiki template call as plain text.

    Parameters:
        trans: the template call including its surrounding braces,
            e.g. ``{{name|arg1|arg2}}``.
        info: the entry being built; ``info['mot']`` is used as a fallback
            value and in log messages.
        errorF: path of a log file appended to when the template is not
            understood; nothing is logged when falsy.

    Returns:
        The replacement text, or ``''`` when the template is unknown.
    """
    body = trans[2:-2]

    # Expand nested templates first, innermost (right-most opening) first.
    while True:
        opening = body.rfind('{{')
        if opening == -1:
            break
        closing = body.find('}}', opening)
        if closing == -1:
            # Unbalanced braces: leave the remainder untouched.
            break
        after = closing + 2
        expanded = transclusion(body[opening:after], info, errorF)
        body = body[:opening] + expanded + body[after:]

    fields = [field.strip() for field in body.split('|')]
    name = fields[0]
    lowered = name.lower()

    # 1. Templates with a fixed replacement (exact, case-sensitive name).
    if name in template:
        return template[name]

    # 2. Templates rendered as their first argument, or as the word itself.
    if lowered in template_second:
        if len(fields) > 1:
            return fields[1]
        return info['mot']

    # 3. "citation/..." templates: the data is carried in the name itself,
    #    separated by slashes.
    if lowered.startswith('citation'):
        pieces = name.split('/')
        count = len(pieces)
        if count <= 2:
            return ''
        if count == 3:
            return 'Par ' + pieces[1] + ', ' + pieces[2]
        if count == 4:
            return 'Par ' + pieces[1] + ', ' + pieces[2] + ', ' + pieces[3]
        if count == 5:
            return (pieces[1] + '/' + pieces[2] + '/' + pieces[3]
                    + ', ' + pieces[4])
        return '/'.join(pieces[1:])

    # 4. Templates formatted from their first argument.
    if lowered in template_second_lambda_snd:
        first = fields[1] if len(fields) > 1 else ''
        return template_second_lambda_snd[lowered](first)

    # 5. Templates formatted from their second argument.
    if lowered in template_second_lambda_trd:
        second = fields[2] if len(fields) > 2 else ''
        return template_second_lambda_trd[lowered](second)

    # Unknown template: log it (when a log file was given) and drop it.
    if errorF:
        with open(errorF, 'a') as log:
            print(name, file=log)
            print("Incompréhension de la transclusion {} du mot {}".format(
                body, info['mot']), file=log)
    return ''
+
+
# Level-3 section header such as "=== {{S|nom|fr}} ===".  The group captures
# everything after "S|" up to the next pipe.  The previous pattern ended with
# an unescaped "|.*$", which made it an alternation that matched every line.
_SECTION_HEADER = re.compile(r'^=== *\{\{ *S\|([^|]+)')


def _log_error(errorF, message, fallback=None):
    """Append *message* to the log file *errorF* (or *fallback* if falsy)."""
    path = errorF or fallback
    if path:
        with open(path, 'a') as err:
            print(message, file=err)


def _section_category(line, w, errorF):
    """Return the section type of *line* if it is a known one, else None."""
    if not line.startswith(('=== ', '==={')):
        return None
    header = _SECTION_HEADER.match(line)
    if header is None:
        _log_error(errorF,
                   f"Problème à l'initialisation du mot {w}: "
                   f"en-tête de section non reconnu [{line.rstrip()}]")
        return None
    nat = header.group(1).strip()
    return nat if nat in dictMatch else None


def _read_headword(line, info):
    """Read pronunciation and gender from a headword line (starts with ''')."""
    start = line.find('{{pron')
    if start != -1:
        end = line.find('}}', start)
        if end != -1:
            # The closing braces are excluded so that "{{pron|x}}" yields "x"
            # and not "x}}"; a template without argument is ignored instead
            # of raising IndexError.
            parts = line[start:end].split('|')
            if len(parts) > 1:
                info['API'] = parts[1]
    if '{{m}}' in line:
        info['genre'] = 'mas'
    elif '{{f}}' in line:
        info['genre'] = 'fem'


def _read_section(f, info, errorF):
    """Fill *info* from the lines of one section.

    Returns the line starting with '===' that ended the section, or '' when
    the end of the file was reached.
    """
    while line := f.readline():
        if line.startswith('==='):
            return line

        if line.startswith('{{fr-'):
            end = line.find('}}')
            if end == -1:
                continue
            infos = [part.strip() for part in line[:end].split('|')]
            info['infos'] = infos
            info['accord'] = infos[0]
            if len(infos) > 1:
                info['API'] = infos[1]

        if line.startswith("'''"):
            _read_headword(line, info)

        if line.startswith('# '):
            info['def'].append({'def': wikiToMd(line[2:], info, errorF)})
        elif line.startswith('#* '):
            if not info['def']:
                # Same log file as every other message when one was given;
                # the historical 'wiki_err.log' is kept as the fallback.
                _log_error(errorF,
                           "Exemple sans définition pour le mot {}".format(
                               info['mot']),
                           fallback='wiki_err.log')
            else:
                example = wikiToMd(line[3:], info, errorF)
                info['def'][-1].setdefault('ex', []).append(example)
        elif line.startswith('#') and not line.startswith('##'):
            info['def'].append({'def': wikiToMd(line[1:], info, errorF)})
    return ''


def extract(f, w, errorF):
    """Extract every known grammatical section of one word.

    Parameters:
        f: text file object positioned at the start of the word's wikitext;
            it is only read with ``readline()``.
        w: the word (page title), stored under ``'mot'`` in each entry.
        errorF: path of a log file for problems; may be falsy.

    Returns:
        A list with one dict per recognised section, with the keys 'mot',
        'cat-gram', 'def', 'API', 'infos', 'genre' and 'accord'.
    """
    infoFin = []

    # The header that ends a section is handed back by _read_section and
    # examined again here.  This replaces seeking backwards by len(line),
    # which is not valid on text files (offsets are not character counts).
    line = f.readline()
    while line:
        category = _section_category(line, w, errorF)
        if category is None:
            line = f.readline()
            continue

        info = {'mot': w,
                'cat-gram': category,
                'def': [],
                'API': None,
                'infos': [],
                'genre': '',
                'accord': None}
        line = _read_section(f, info, errorF)
        infoFin.append(info)

    return infoFin
+
+
def wikiToMd(line, info, errorF):
    """Turn one line of wikitext into lightly marked-up plain text.

    Three passes, in this order:
      1. templates ``{{...}}`` are replaced through `transclusion`;
      2. links ``[[target|label]]`` / ``[[target]]`` are replaced by their
         label or target;
      3. ``'''`` becomes ``*`` and ``''`` is removed.

    `info` and `errorF` are only forwarded to `transclusion`.
    """
    text = line.strip()

    # Pass 1: templates, right-most opening first.
    while (opening := text.rfind('{{')) != -1:
        closing = text.find('}}', opening)
        if closing == -1:
            break
        after = closing + 2
        rendered = transclusion(text[opening:after], info, errorF)
        text = text[:opening] + rendered + text[after:]

    # Pass 2: links, right-most opening first.
    while (opening := text.rfind('[[')) != -1:
        closing = text.find(']]', opening)
        if closing == -1:
            break
        pieces = text[opening + 2:closing].split('|')
        # Second piece when there is one (the label), otherwise the target.
        shown = pieces[:2][-1]
        text = text[:opening] + shown + text[closing + 2:]

    # Pass 3: inline style markers (triple quotes first, then double).
    text = text.replace("'''", '*')
    text = text.replace("''", '')
    return text
+
+
def extractAll(f, errorF, ignore):
    """Extract the French entries of every page of a Wiktionary XML dump.

    Parameters:
        f: iterable of text lines of the dump.
        errorF: path of a log file for problems; may be falsy.
        ignore: when true, a French language header found while the previous
            French section of the page is still being collected is tolerated;
            otherwise the problem is reported and the program exits with -1.

    Returns:
        A dict mapping each page title to the list returned by `extract`.
        Pages whose title contains a character of `interdit` are skipped.
    """
    # Local import so the function does not depend on the file's import block.
    import io

    title = ""
    isFr = False
    hasForbidden = False
    hasText = False
    # In-memory buffer holding the French part of the current page.  It
    # replaces one on-disk temporary file per page: nothing is left open on
    # the abort path and offsets are plain character positions.
    page = None

    dict_ = dict()

    for line in f:
        if "</page>" in line:
            if page is not None:
                page.seek(0)
                dict_[title] = extract(page, title, errorF)
                page.close()
            # End of page: reset the per-page state in every case.
            page = None
            hasForbidden = False
            hasText = False
            isFr = False
            title = ""

        if "<title>" in line:
            title = line[line.find('>') + 1:]
            title = title[:title.find('<')]
            if any(c in title for c in interdit):
                hasForbidden = True

        if hasForbidden:
            continue

        if "<text bytes=\"" in line and "\" xml:space=\"preserve\">" in line:
            hasText = True

        if "== {{langue|fr}}" in line and hasText:
            isFr = True
            if page is None:
                page = io.StringIO()
            elif not ignore:
                message = f"{title}: Erreur tf encore ouvert !"
                if errorF:
                    with open(errorF, 'a') as err:
                        print(message, file=err)
                else:
                    # No log file: show what was collected so far and the
                    # offending line on standard output.
                    print(message)
                    print(page.getvalue(), end='')
                    print(line)
                # sys.exit rather than the interactive helper exit().
                sys.exit(-1)
        elif "== {{langue|" in line:
            # Header of another language: stop collecting.
            isFr = False

        if isFr and page is not None:
            page.write(line)

    return dict_
+
+
if __name__ == '__main__':
    # Command-line entry point: read a Wiktionary dump, extract the French
    # entries and write them as one msgpack document.
    parser = argparse.ArgumentParser(description='wiktionary dump to msgpack')
    # nargs='?' lets -o take a filename (the previous 'store_const' action
    # accepted no value, so the output name could never be changed) while a
    # bare -o still selects the default name as before.
    parser.add_argument('-o', '--out', dest='outputF', nargs='?',
                        const=DEFAULT_OUTPUT, default=DEFAULT_OUTPUT,
                        help='the output filename')
    parser.add_argument('-i', '--input', dest='inputF', action='store',
                        help='the input filename, a dump of wiktionary')
    parser.add_argument('-e', '--error', dest='errorF', action='store',
                        help='the filename to log errors')
    parser.add_argument('--ignore', dest='ignoreError', action='store_true',
                        help='do not abort when a French section starts '
                             'while the previous one is still open')

    arg = parser.parse_args()

    if arg.inputF is None:
        print('A wiktionary dump is needed', file=sys.stderr)
        # sys.exit rather than exit(): the latter is a helper installed by
        # the site module for interactive use.
        sys.exit(-1)

    with open(arg.inputF, 'r') as f:
        res = extractAll(f, arg.errorF, arg.ignoreError)

    with open(arg.outputF, 'wb') as f:
        to_w = msgpack.packb(res)
        f.write(to_w)