From ac7c2e5d071151f69872f8e97dac414e41976168 Mon Sep 17 00:00:00 2001 From: ache Date: Tue, 16 Jun 2020 17:37:32 +0200 Subject: Documentation README --- README.md | 102 +++++++++++++++ dicofr.py | 17 ++- dump2msgp.py | 309 ++++++++++++++++++++++++++++++++++++++++++++++ main.py | 309 ---------------------------------------------- msgPack2sqlite_msgPack.py | 73 ++++++----- 5 files changed, 467 insertions(+), 343 deletions(-) create mode 100644 README.md create mode 100644 dump2msgp.py delete mode 100644 main.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d9a041 --- /dev/null +++ b/README.md @@ -0,0 +1,102 @@ +Dicofr +====== + +A utility to create and query a French dictionary based on a [Wiktionary archive dump](https://dumps.wikimedia.org/frwiktionary/20200601/). + + +Technically +---------- + +A bunch of **Python** scripts to transform a Wiktionary archive dump into an SQLite database file. + + - With a simple CLI. + - With a simple WUI, Flask-based. + - Regex support. + - + +How to create the database +-------------------------- + +First you have to download a Wiktionary archive file. +For example the file `frwiktionary-20200601-pages-articles.xml.bz2`, which is a full dump of the current version of every page. + +For now you have to decompress it completely before you can process it. +The use of [bz2](https://docs.python.org/3/library/bz2.html) may be considered in the future to make this step optional and thus reduce disk usage. + +~~~shell +$ bunzip2 frwiktionary-20200601-pages-articles.xml.bz2 +~~~ + +Then, you will create an intermediary file, a msgpack file, containing all the Wiktionary data. +This file is of interest to developers, not to end users. +It's a serialization of the dictionary used internally (a Python dictionary). + +~~~shell +$ python dump2msgp.py -i frwiktionary-20200601-pages-articles.xml.bz2 +~~~ + +Then, you can create the SQLite database file. 
+~~~shell +$ python msgPack2sqlite_msgPack.py -i dicofr.msgpk -o dicofr.db +~~~ + +You can then use `dicofr.py` to search for a word from the CLI or use the WUI with the command: +~~~shell +$ python web.py +~~~ + +How to use it +------------- + +You can use the CLI. + +~~~shell +$ dicofr -h +usage: dicofr [-h] [--sql] [--matching] PATTERN + +Get a french word's definition. + +positional arguments: + PATTERN the word or the pattern to match + +optional arguments: + -h, --help show this help message and exit + --sql search a definition using SQL regex, _ to match a letter, % to match a group of letters + --matching search the french words that match the regex +~~~ + +For example: + +~~~shell +$ dicofr julien + julien + /ʒy.ljɛ̃/, adjectif + (Chronologie) Qui est lié à Jules César et à sa décision d’instaurer l’alternance entre trois années de trois cent soixante-cinq jours et une année bissextile de trois cent soixante-six jours. + * Calendrier *julien*. + * Année *julienne*. + * Correction *julienne*. +~~~ + +~~~shell +$ dicofr -m /julien/ +julienois +juliennette +juliennoises +juliennes +julien +julienne +julienoises +juliennettes +juliennoise +julienoise +juliennois +juliens +~~~ + +How to contribute? +------------------- + +This project is free; you are free to send me a PR to improve this software. +Respecting each other is the only rule. + +License: MIT-like. 
diff --git a/dicofr.py b/dicofr.py index ea5741c..9bef7ac 100755 --- a/dicofr.py +++ b/dicofr.py @@ -12,7 +12,7 @@ sys.path.insert(-1, DIR_PATH) import ui -dico = 'wiktfr.sql' +dico = 'dicofr.db' def get_def_sql(word): @@ -56,7 +56,11 @@ def matching(word): matchingWord = [] - with open('list_word.msgpk', 'rb') as f: + if not exists(arg.wordList): + print(f'Error: Word list {arg.wordList} not found', file=sys.stderr) + return + + with open(arg.wordList, 'rb') as f: msgpackList = f.read() listWord = msgpack.unpackb(msgpackList, raw=False) @@ -81,8 +85,9 @@ def matching(word): if __name__ == '__main__': if len(sys.argv) < 2: - print("Erreur: Rechercher un mot", file=sys.stderr) - exit() + print("Erreur: Rechercher un mot\nUtilisez l'option -h pour avoir de l'aide", + file=sys.stderr) + exit(-1) # Si on n'arrive pas à trouver le dictionnaire if not exists(dico): @@ -97,6 +102,8 @@ if __name__ == '__main__': const=get_def_sql_reg, default=get_def_sql, help='search a definition using SQL regex, ' '_ to match a letter, %% to match a group of letters') + parser.add_argument('-w', '--wordlist', dest='wordList', + action='store_const', default='list_word.msgpack') parser.add_argument('-m', '--matching', dest='matching', action='store_true', help='search the french words that match the regex') parser.add_argument('word', metavar='PATTERN', type=str, @@ -113,6 +120,4 @@ if __name__ == '__main__': else: for w in arg.action(arg.word): ui.show_terminal(w) - if not ret: - exit(1) diff --git a/dump2msgp.py b/dump2msgp.py new file mode 100644 index 0000000..d4fb050 --- /dev/null +++ b/dump2msgp.py @@ -0,0 +1,309 @@ +import tempfile as tmp +import re +import sys +import msgpack + +from listSection import listInfoSection +from template import template + + +""" + +Extract words from the Wiktionnary archive + +""" + +template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws', + 'in', 'siècle2', 'fchim', 'nobr', 'wp', 'r', + 'clé de tri', 'contexte', 'emploi', 'l', 
'polytonique', + 'pron-API', 'registre', 'scmp', 'siècle', 'x', + ] + + +template_second_lambda_trd = { + 'refnec': (lambda x: '(Référence nécessaire : ' + x + ')'), + 'refnéc': (lambda x: '(Référence nécessaire : ' + x + ')'), +} + +template_second_lambda_snd = { + 'term': (lambda x: '(' + x.title() + ')'), + 'terme': (lambda x: '(' + x.title() + ')'), + 'ex': (lambda x: '^{' + x if x else 'e' + '}'), + 'exp': (lambda x: '^{' + x if x else 'e' + '}'), + 'e': (lambda x: '^{' + x if x else 'e' + '}'), + 'er': (lambda x: '^{' + x if x else 'er' + '}'), + 'ère': (lambda x: '^{' + x if x else 'ère' + '}'), + 'ème': (lambda x: '^{' + x if x else 'ème' + '}'), + 'Ier': (lambda x: '^{' + x if x else 'Ier' + '}'), + 'III': (lambda x: '^{' + x if x else 'III' + '}'), + 'III': (lambda x: '^{' + x if x else 'III' + '}'), + 'small': (lambda x: '_{' + x if x else '' + '}'), + 'indice': (lambda x: '_{' + x if x else '' + '}'), + 'graphie': (lambda x: '«' + x if x else '»'), + 'petites capitales': (lambda x: x.upper()), + 'isbn': (lambda x: 'cf. ISBN ' + x), + 'OCLC': (lambda x: 'cf. 
OCLC ' + x), + 'variante de': (lambda x: 'Variante de ' + x), + 'variante de': (lambda x: 'Variante de ' + x), + 'variante ortho de': (lambda x: 'Variante orthographique de ' + x), + 'variante ortho de': (lambda x: 'Variante orthographique de ' + x), + 'variante ortho de': (lambda x: 'Variante orthographique de ' + x), + 'variante orthographique de': (lambda x: 'Variante orthographique de ' + x), + 'sic !': (lambda x: '^{sic ' + x + '}'), + 'sic': (lambda x: '^{sic ' + x + '}'), + 'incise': (lambda x: '_' + x + '_'), + 'n°': (lambda x: 'n°' + x), + 'superlatif de': (lambda x: 'Superlatif de' + x), + 'vérifier': (lambda x: '(À vérifier : ' + x + ')'), +} + +dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)} + +interdit = " :" + + +def transclusion(trans, info): + trans = trans[2:-2] + + while '{{' in trans: + l0 = trans.rfind('{{') + l1 = trans.find('}}', l0) + if l1 == -1: + break + else: + l1 += 2 + t = trans[l0:l1] + t = transclusion(t, info) + trans = trans[:l0] + t + trans[l1:] + + s = list(map(lambda x: x.strip(), trans.split('|'))) + if s[0] in template: + return template[s[0]] + + if s[0].lower() in template_second: + + return s[1] if len(s) > 1 else title + + if s[0].lower().startswith('citation'): + cit = s[0].split('/') + if len(cit) == 4: + return 'Par ' + cit[1] + ', ' + cit[2] + ', ' + cit[3] + if len(cit) == 3: + return 'Par ' + cit[1] + ', ' + cit[2] + if len(cit) == 5: + return cit[1] + '/' + cit[2] + '/' + cit[3] + ', ' + cit[4] + if len(cit) <= 2: + return '' + else: + return '/'.join(c[1:]) + + if s[0].lower() in template_second_lambda_snd: + return template_second_lambda_snd[s[0].lower()](s[1] if len(s) > 1 else '') + + if s[0].lower() in template_second_lambda_trd: + return template_second_lambda_trd[s[0].lower()](s[2] if len(s) > 2 else '') + + with open('wiki_err.log', 'a') as err: + print(s[0], file=err) +# print("Incompréhension de la transclusion {} du mot {}".format(trans, +# info['mot']), file=err) + return '' + + +def 
extract(f, w): + infoFin = [] + + toRead = True + goBack = 0 + + while toRead: + toRead = False + + info = {'mot': w, + 'cat-gram': None, + 'def': [], + 'API': None, + 'infos': [], + 'genre': '', + 'accord': None} + + # State 0 // Initialisation ! + while line := f.readline(): + + if line.startswith('=== ') or line.startswith('==={'): + if re.match('^=== *{{ *S\\|([^|]+)|.*$', line): + try: + r = re.match('^=== *{{ *S\\|([^|]+)|.*$', line) + r = r.groups() + nat = r[0].strip() + if nat in dictMatch.keys(): + info['cat-gram'] = nat + toRead = True + break + except e: + with open('wiki_err.log', 'a') as err: + print("^[1] Problème à l'initialisation du mot {}:" + " {}".format(info['mot'], e), file=err) + print('line: [{}]'.format(line, e), file=err) + e = sys.exc_info()[0] + print("Erreur :", e, file=err) + + if not toRead: + break + + # State 1 + while line := f.readline(): + if line.startswith('{{fr-'): + e = line.find('}}') + if e == -1: + continue + ex = line[:e] + try: + infos = list(map(lambda x: x.strip(), ex.split('|'))) + info['infos'] = infos + info['accord'] = infos[0] + if len(infos) > 1: + info['API'] = infos[1] + except e: + err = sys.exc_info()[0] + print(ex) + print("Erreur :", e) + print("Erreur :", err) + if line.rstrip().startswith("'''"): + if '{{pron' in line: + p0 = line.find('{{pron') + p1 = line.find('}}', p0) + if p1 > 0: + p1 += 2 + p = line[p0:p1] + p = p.split('|') + info['API'] = p[1] + if '{{m}}' in line: + info['genre'] = 'mas' + elif '{{f}}' in line: + info['genre'] = 'fem' + + if line.startswith('# '): + info['def'].append({'def': wikiToMd(line[2:], info)}) + elif line.startswith('#* '): + if not info['def']: + with open('wiki_err.log', 'a') as err: + print("Exemple sans définition pour le mot {}".format( + info['mot']), file=err) + elif 'ex' in info['def'][-1]: + info['def'][-1]['ex'].append(wikiToMd(line[3:], info)) + else: + info['def'][-1]['ex'] = [wikiToMd(line[3:], info)] + elif line.startswith('#') and not 
line.startswith('##'): + info['def'].append({'def': wikiToMd(line[1:], info)}) + if line.startswith('==='): + goBack = len(line) + break + if goBack: + tf.seek(tf.tell() - goBack) + goBack = 0 + toRead = True + infoFin.append(info) + + return infoFin + + +def wikiToMd(line, info): + line = line.strip() + # 3 Étapes: + # - Links [...] + # - Style ''ita'' / '''bold''' + # - Template / Transclusion {{info}} = (Informatique) + + # Template + while '{{' in line: + l0 = line.rfind('{{') + l1 = line.find('}}', l0) + if l1 == -1: + break + else: + l1 += 2 + trans = line[l0:l1] + trans = transclusion(trans, info) + line = line[:l0] + trans + line[l1:] + + # Links ! + while '[[' in line: + link0 = line.rfind('[[') + link1 = line.find(']]', link0) + if link1 == -1: + break + else: + link1 += 2 + + link = line[link0:link1] + link = link[2:-2].split('|') + line = line[:link0] + (link[1] if len(link) > 1 else link[0]) + line[link1:] + + # Style + line = line.replace("'''", '*').replace("''", '') + + return line + + +with open("./fr_wiktionary_all.xml", 'r') as f: + title = "" + isFr = False + hasForbidden = False + hasText = False + tf = None + + dict_ = dict() + + for line in f: + if "" in line and tf: + tf.seek(0) + i = extract(tf, title) + + for w in i: + if w['mot'] == 'président': + print("What we exstract from it:") + print(i) + + dict_[title] = i + tf.close() + + tf = None + hasForbidden = False + hasText = False + isFr = False + title = "" + elif "" in line: + tf = None + hasForbidden = False + hasText = False + isFr = False + title = "" + + if "" in line: + title = line[line.find('>') + 1:] + title = title[:title.find('<')] + + for c in interdit: + if c in title: + hasForbidden = True + if not hasForbidden and "<text xml:space=\"preserve\">" in line: + hasText = True + if not hasForbidden and "== {{langue|fr}}" in line and hasText: + isFr = True + if tf: + print("Erreur tf encore ouvert !") + exit(-1) + tf = tmp.NamedTemporaryFile(mode="w+t") + # print(title) + elif not 
hasForbidden and "== {{langue|" in line: + isFr = False + if not hasForbidden and isFr and tf: + tf.write(line) + + print("Will save the result") + + with open('result_all.pack', 'wb') as f: + to_w = msgpack.packb(dict_) + f.write(to_w) + diff --git a/main.py b/main.py deleted file mode 100644 index d4fb050..0000000 --- a/main.py +++ /dev/null @@ -1,309 +0,0 @@ -import tempfile as tmp -import re -import sys -import msgpack - -from listSection import listInfoSection -from template import template - - -""" - -Extract words from the Wiktionnary archive - -""" - -template_second = ['link', 'bd', 'pc', 'nom w pc', 'w', 'smcp', 'lien', 'ws', - 'in', 'siècle2', 'fchim', 'nobr', 'wp', 'r', - 'clé de tri', 'contexte', 'emploi', 'l', 'polytonique', - 'pron-API', 'registre', 'scmp', 'siècle', 'x', - ] - - -template_second_lambda_trd = { - 'refnec': (lambda x: '(Référence nécessaire : ' + x + ')'), - 'refnéc': (lambda x: '(Référence nécessaire : ' + x + ')'), -} - -template_second_lambda_snd = { - 'term': (lambda x: '(' + x.title() + ')'), - 'terme': (lambda x: '(' + x.title() + ')'), - 'ex': (lambda x: '^{' + x if x else 'e' + '}'), - 'exp': (lambda x: '^{' + x if x else 'e' + '}'), - 'e': (lambda x: '^{' + x if x else 'e' + '}'), - 'er': (lambda x: '^{' + x if x else 'er' + '}'), - 'ère': (lambda x: '^{' + x if x else 'ère' + '}'), - 'ème': (lambda x: '^{' + x if x else 'ème' + '}'), - 'Ier': (lambda x: '^{' + x if x else 'Ier' + '}'), - 'III': (lambda x: '^{' + x if x else 'III' + '}'), - 'III': (lambda x: '^{' + x if x else 'III' + '}'), - 'small': (lambda x: '_{' + x if x else '' + '}'), - 'indice': (lambda x: '_{' + x if x else '' + '}'), - 'graphie': (lambda x: '«' + x if x else '»'), - 'petites capitales': (lambda x: x.upper()), - 'isbn': (lambda x: 'cf. ISBN ' + x), - 'OCLC': (lambda x: 'cf. 
OCLC ' + x), - 'variante de': (lambda x: 'Variante de ' + x), - 'variante de': (lambda x: 'Variante de ' + x), - 'variante ortho de': (lambda x: 'Variante orthographique de ' + x), - 'variante ortho de': (lambda x: 'Variante orthographique de ' + x), - 'variante ortho de': (lambda x: 'Variante orthographique de ' + x), - 'variante orthographique de': (lambda x: 'Variante orthographique de ' + x), - 'sic !': (lambda x: '^{sic ' + x + '}'), - 'sic': (lambda x: '^{sic ' + x + '}'), - 'incise': (lambda x: '_' + x + '_'), - 'n°': (lambda x: 'n°' + x), - 'superlatif de': (lambda x: 'Superlatif de' + x), - 'vérifier': (lambda x: '(À vérifier : ' + x + ')'), -} - -dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)} - -interdit = " :" - - -def transclusion(trans, info): - trans = trans[2:-2] - - while '{{' in trans: - l0 = trans.rfind('{{') - l1 = trans.find('}}', l0) - if l1 == -1: - break - else: - l1 += 2 - t = trans[l0:l1] - t = transclusion(t, info) - trans = trans[:l0] + t + trans[l1:] - - s = list(map(lambda x: x.strip(), trans.split('|'))) - if s[0] in template: - return template[s[0]] - - if s[0].lower() in template_second: - - return s[1] if len(s) > 1 else title - - if s[0].lower().startswith('citation'): - cit = s[0].split('/') - if len(cit) == 4: - return 'Par ' + cit[1] + ', ' + cit[2] + ', ' + cit[3] - if len(cit) == 3: - return 'Par ' + cit[1] + ', ' + cit[2] - if len(cit) == 5: - return cit[1] + '/' + cit[2] + '/' + cit[3] + ', ' + cit[4] - if len(cit) <= 2: - return '' - else: - return '/'.join(c[1:]) - - if s[0].lower() in template_second_lambda_snd: - return template_second_lambda_snd[s[0].lower()](s[1] if len(s) > 1 else '') - - if s[0].lower() in template_second_lambda_trd: - return template_second_lambda_trd[s[0].lower()](s[2] if len(s) > 2 else '') - - with open('wiki_err.log', 'a') as err: - print(s[0], file=err) -# print("Incompréhension de la transclusion {} du mot {}".format(trans, -# info['mot']), file=err) - return '' - - -def 
extract(f, w): - infoFin = [] - - toRead = True - goBack = 0 - - while toRead: - toRead = False - - info = {'mot': w, - 'cat-gram': None, - 'def': [], - 'API': None, - 'infos': [], - 'genre': '', - 'accord': None} - - # State 0 // Initialisation ! - while line := f.readline(): - - if line.startswith('=== ') or line.startswith('==={'): - if re.match('^=== *{{ *S\\|([^|]+)|.*$', line): - try: - r = re.match('^=== *{{ *S\\|([^|]+)|.*$', line) - r = r.groups() - nat = r[0].strip() - if nat in dictMatch.keys(): - info['cat-gram'] = nat - toRead = True - break - except e: - with open('wiki_err.log', 'a') as err: - print("^[1] Problème à l'initialisation du mot {}:" - " {}".format(info['mot'], e), file=err) - print('line: [{}]'.format(line, e), file=err) - e = sys.exc_info()[0] - print("Erreur :", e, file=err) - - if not toRead: - break - - # State 1 - while line := f.readline(): - if line.startswith('{{fr-'): - e = line.find('}}') - if e == -1: - continue - ex = line[:e] - try: - infos = list(map(lambda x: x.strip(), ex.split('|'))) - info['infos'] = infos - info['accord'] = infos[0] - if len(infos) > 1: - info['API'] = infos[1] - except e: - err = sys.exc_info()[0] - print(ex) - print("Erreur :", e) - print("Erreur :", err) - if line.rstrip().startswith("'''"): - if '{{pron' in line: - p0 = line.find('{{pron') - p1 = line.find('}}', p0) - if p1 > 0: - p1 += 2 - p = line[p0:p1] - p = p.split('|') - info['API'] = p[1] - if '{{m}}' in line: - info['genre'] = 'mas' - elif '{{f}}' in line: - info['genre'] = 'fem' - - if line.startswith('# '): - info['def'].append({'def': wikiToMd(line[2:], info)}) - elif line.startswith('#* '): - if not info['def']: - with open('wiki_err.log', 'a') as err: - print("Exemple sans définition pour le mot {}".format( - info['mot']), file=err) - elif 'ex' in info['def'][-1]: - info['def'][-1]['ex'].append(wikiToMd(line[3:], info)) - else: - info['def'][-1]['ex'] = [wikiToMd(line[3:], info)] - elif line.startswith('#') and not 
line.startswith('##'): - info['def'].append({'def': wikiToMd(line[1:], info)}) - if line.startswith('==='): - goBack = len(line) - break - if goBack: - tf.seek(tf.tell() - goBack) - goBack = 0 - toRead = True - infoFin.append(info) - - return infoFin - - -def wikiToMd(line, info): - line = line.strip() - # 3 Étapes: - # - Links [...] - # - Style ''ita'' / '''bold''' - # - Template / Transclusion {{info}} = (Informatique) - - # Template - while '{{' in line: - l0 = line.rfind('{{') - l1 = line.find('}}', l0) - if l1 == -1: - break - else: - l1 += 2 - trans = line[l0:l1] - trans = transclusion(trans, info) - line = line[:l0] + trans + line[l1:] - - # Links ! - while '[[' in line: - link0 = line.rfind('[[') - link1 = line.find(']]', link0) - if link1 == -1: - break - else: - link1 += 2 - - link = line[link0:link1] - link = link[2:-2].split('|') - line = line[:link0] + (link[1] if len(link) > 1 else link[0]) + line[link1:] - - # Style - line = line.replace("'''", '*').replace("''", '') - - return line - - -with open("./fr_wiktionary_all.xml", 'r') as f: - title = "" - isFr = False - hasForbidden = False - hasText = False - tf = None - - dict_ = dict() - - for line in f: - if "</page>" in line and tf: - tf.seek(0) - i = extract(tf, title) - - for w in i: - if w['mot'] == 'président': - print("What we exstract from it:") - print(i) - - dict_[title] = i - tf.close() - - tf = None - hasForbidden = False - hasText = False - isFr = False - title = "" - elif "</page>" in line: - tf = None - hasForbidden = False - hasText = False - isFr = False - title = "" - - if "<title>" in line: - title = line[line.find('>') + 1:] - title = title[:title.find('<')] - - for c in interdit: - if c in title: - hasForbidden = True - if not hasForbidden and "<text xml:space=\"preserve\">" in line: - hasText = True - if not hasForbidden and "== {{langue|fr}}" in line and hasText: - isFr = True - if tf: - print("Erreur tf encore ouvert !") - exit(-1) - tf = tmp.NamedTemporaryFile(mode="w+t") - # 
print(title) - elif not hasForbidden and "== {{langue|" in line: - isFr = False - if not hasForbidden and isFr and tf: - tf.write(line) - - print("Will save the result") - - with open('result_all.pack', 'wb') as f: - to_w = msgpack.packb(dict_) - f.write(to_w) - diff --git a/msgPack2sqlite_msgPack.py b/msgPack2sqlite_msgPack.py index b77dd2e..c251d59 100644 --- a/msgPack2sqlite_msgPack.py +++ b/msgPack2sqlite_msgPack.py @@ -2,36 +2,53 @@ import msgpack import ui +import sys import sqlite3 - -with open('result_all.pack', 'rb') as f: - r = f.read() - -d = p = msgpack.unpackb(r, raw=False) -del r - -with sqlite3.connect("result_all.sql") as con: - cur = con.cursor() - cur.execute('''CREATE TABLE IF NOT EXISTS entry ( - word TEXT, - cat_gram TEXT, - API TEXT, - infos TEXT, - genre TEXT, - accord TEXT, - defs BLOG, - ID INTEGER PRIMARY KEY)''') - con.commit() - - for w, listW in d.items(): - for word in listW: - data = (w, word['cat-gram'], word['API'], "\t".join(word['infos']), - word['genre'], word['accord'], - msgpack.packb(word['def'])) - cur.execute('''INSERT INTO entry (word, cat_gram, API, infos, - genre, accord, defs) VALUES (?, ?, ?, ?, ?, ?, ?)''', data) - con.commit() +import argparse + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='wiktionary dump msgpack ' + 'to SQLite database file') + parser.add_argument('-o', '--out', dest='outputF', action='store', + help='the output filename') + parser.add_argument('-i', '--input', dest='inputF', action='store', + help='the input filename, a dump of witionary') + + arg = parser.parse_args() + + if arg.inputF is None: + print('Error input file needed', file=sys.stderr) + if arg.outputF is None: + print('Error output file needed', file=sys.stderr) + + with open(arg.inputF, 'rb') as f: + r = f.read() + + d = p = msgpack.unpackb(r, raw=False) + del r + + with sqlite3.connect(arg.outputF) as con: + cur = con.cursor() + cur.execute('''CREATE TABLE IF NOT EXISTS entry ( + word TEXT, + cat_gram 
TEXT, + API TEXT, + infos TEXT, + genre TEXT, + accord TEXT, + defs BLOG, + ID INTEGER PRIMARY KEY)''') + con.commit() + + for w, listW in d.items(): + for word in listW: + data = (w, word['cat-gram'], word['API'], "\t".join(word['infos']), + word['genre'], word['accord'], + msgpack.packb(word['def'])) + cur.execute('''INSERT INTO entry (word, cat_gram, API, infos, + genre, accord, defs) VALUES (?, ?, ?, ?, ?, ?, ?)''', data) + con.commit() def give_def(w): -- cgit v1.2.3