diff options
author | ache <ache@ache.one> | 2020-06-16 17:37:32 +0200 |
---|---|---|
committer | ache <ache@ache.one> | 2020-06-16 17:37:32 +0200 |
commit | ac7c2e5d071151f69872f8e97dac414e41976168 (patch) | |
tree | e2f9c51b5f89e61ce4e08125160b46cd5994f9f2 | |
parent | Regex matching (diff) |
Documentation README
-rw-r--r-- | README.md | 102 | ||||
-rwxr-xr-x | dicofr.py | 17 | ||||
-rw-r--r-- | dump2msgp.py (renamed from main.py) | 0 | ||||
-rw-r--r-- | msgPack2sqlite_msgPack.py | 73 |
4 files changed, 158 insertions, 34 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d9a041 --- /dev/null +++ b/README.md @@ -0,0 +1,102 @@ +Dicofr +====== + +A utility to create and query a French dictionary based on a [Wiktionary archive dump](https://dumps.wikimedia.org/frwiktionary/20200601/). + + +Technically +---------- + +A bunch of **Python** scripts to transform a Wiktionary archive dump into an SQLite database file. + + - With a simple CLI. + - With a simple WUI, flask based. + - Regex support. + - + +How to create the database +-------------------------- + +First you have to download a Wiktionary archive file. +For example the file `frwiktionary-20200601-pages-articles.xml.bz2`, which is a full dump of the current version of every page. + +For now you have to decompress it completely before you can process it. +The use of [bz2](https://docs.python.org/3/library/bz2.html) may be considered in the future to make this step optional and thus reduce disk usage. + +~~~shell +$ bunzip2 frwiktionary-20200601-pages-articles.xml.bz2 +~~~ + +Then, you will create an intermediary file, a msgpack file, containing all of the Wiktionary data. +This file is of interest to developers, not to end users. +It's a serialization of the internally used dictionary (a Python dictionary). + +~~~shell +$ python dump2msgp.py -i frwiktionary-20200601-pages-articles.xml.bz2 +~~~ + +Then, you can create the SQLite database file. +~~~shell +$ python msgPack2sqlite_msgPack.py -i dicofr.msgpk -o dicofr.db +~~~ + +You can then use `dicofr.py` to search a word from the CLI or use the WUI with the command: +~~~shell +$ python web.py +~~~ + +How to use it +------------- + +You can use the CLI. + +~~~shell +$ dicofr -h +usage: dicofr [-h] [--sql] [--matching] PATTERN + +Get a french word's definition. 
+ +positional arguments: + PATTERN the word or the pattern to match + +optional arguments: + -h, --help show this help message and exit + --sql search a definition using SQL regex, _ to match a letter, % to match a group of letters + --matching search the french words that match the regex +~~~ + +For example: + +~~~shell +$ dicofr julien + julien + /ʒy.ljɛ̃/, adjectif + (Chronologie) Qui est lié à Jules César et à sa décision d’instaurer l’alternance entre trois années de trois cent soixante-cinq jours et une année bissextile de trois cent soixante-six jours. + * Calendrier *julien*. + * Année *julienne*. + * Correction *julienne*. +~~~ + +~~~shell +$ dicofr -m /julien/ +julienois +juliennette +juliennoises +juliennes +julien +julienne +julienoises +juliennettes +juliennoise +julienoise +juliennois +juliens +~~~ + +How to contribute ? +------------------- + +This project is free; you are free to send me a PR to improve this software. +Respecting each other is the only rule. + +License: MIT-like. 
@@ -12,7 +12,7 @@ sys.path.insert(-1, DIR_PATH) import ui -dico = 'wiktfr.sql' +dico = 'dicofr.db' def get_def_sql(word): @@ -56,7 +56,11 @@ def matching(word): matchingWord = [] - with open('list_word.msgpk', 'rb') as f: + if not exists(arg.wordList): + print(f'Error: Word list {arg.wordList} not found', file=sys.stderr) + return + + with open(arg.wordList, 'rb') as f: msgpackList = f.read() listWord = msgpack.unpackb(msgpackList, raw=False) @@ -81,8 +85,9 @@ def matching(word): if __name__ == '__main__': if len(sys.argv) < 2: - print("Erreur: Rechercher un mot", file=sys.stderr) - exit() + print("Erreur: Rechercher un mot\nUtilisez l'option -h pour avoir de l'aide", + file=sys.stderr) + exit(-1) # Si on n'arrive pas à trouver le dictionnaire if not exists(dico): @@ -97,6 +102,8 @@ if __name__ == '__main__': const=get_def_sql_reg, default=get_def_sql, help='search a definition using SQL regex, ' '_ to match a letter, %% to match a group of letters') + parser.add_argument('-w', '--wordlist', dest='wordList', + action='store_const', default='list_word.msgpack') parser.add_argument('-m', '--matching', dest='matching', action='store_true', help='search the french words that match the regex') parser.add_argument('word', metavar='PATTERN', type=str, @@ -113,6 +120,4 @@ if __name__ == '__main__': else: for w in arg.action(arg.word): ui.show_terminal(w) - if not ret: - exit(1) diff --git a/msgPack2sqlite_msgPack.py b/msgPack2sqlite_msgPack.py index b77dd2e..c251d59 100644 --- a/msgPack2sqlite_msgPack.py +++ b/msgPack2sqlite_msgPack.py @@ -2,36 +2,53 @@ import msgpack import ui +import sys import sqlite3 - -with open('result_all.pack', 'rb') as f: - r = f.read() - -d = p = msgpack.unpackb(r, raw=False) -del r - -with sqlite3.connect("result_all.sql") as con: - cur = con.cursor() - cur.execute('''CREATE TABLE IF NOT EXISTS entry ( - word TEXT, - cat_gram TEXT, - API TEXT, - infos TEXT, - genre TEXT, - accord TEXT, - defs BLOG, - ID INTEGER PRIMARY KEY)''') - con.commit() - 
- for w, listW in d.items(): - for word in listW: - data = (w, word['cat-gram'], word['API'], "\t".join(word['infos']), - word['genre'], word['accord'], - msgpack.packb(word['def'])) - cur.execute('''INSERT INTO entry (word, cat_gram, API, infos, - genre, accord, defs) VALUES (?, ?, ?, ?, ?, ?, ?)''', data) - con.commit() +import argparse + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='wiktionary dump msgpack ' + 'to SQLite database file') + parser.add_argument('-o', '--out', dest='outputF', action='store', + help='the output filename') + parser.add_argument('-i', '--input', dest='inputF', action='store', + help='the input filename, a dump of witionary') + + arg = parser.parse_args() + + if arg.inputF is None: + print('Error input file needed', file=sys.stderr) + if arg.outputF is None: + print('Error output file needed', file=sys.stderr) + + with open(arg.inputF, 'rb') as f: + r = f.read() + + d = p = msgpack.unpackb(r, raw=False) + del r + + with sqlite3.connect(arg.outputF) as con: + cur = con.cursor() + cur.execute('''CREATE TABLE IF NOT EXISTS entry ( + word TEXT, + cat_gram TEXT, + API TEXT, + infos TEXT, + genre TEXT, + accord TEXT, + defs BLOG, + ID INTEGER PRIMARY KEY)''') + con.commit() + + for w, listW in d.items(): + for word in listW: + data = (w, word['cat-gram'], word['API'], "\t".join(word['infos']), + word['genre'], word['accord'], + msgpack.packb(word['def'])) + cur.execute('''INSERT INTO entry (word, cat_gram, API, infos, + genre, accord, defs) VALUES (?, ?, ?, ?, ?, ?, ?)''', data) + con.commit() def give_def(w): |