aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorache <ache@ache.one>2020-06-16 17:37:32 +0200
committerache <ache@ache.one>2020-06-16 17:37:32 +0200
commitac7c2e5d071151f69872f8e97dac414e41976168 (patch)
treee2f9c51b5f89e61ce4e08125160b46cd5994f9f2
parentRegex matching (diff)
Documentation README
-rw-r--r--README.md102
-rwxr-xr-xdicofr.py17
-rw-r--r--dump2msgp.py (renamed from main.py)0
-rw-r--r--msgPack2sqlite_msgPack.py73
4 files changed, 158 insertions, 34 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6d9a041
--- /dev/null
+++ b/README.md
@@ -0,0 +1,102 @@
+Dicofr
+======
+
+A utility to create and query a French dictionary based on a [Wiktionary archive dump](https://dumps.wikimedia.org/frwiktionary/20200601/).
+
+
+Technically
+----------
+
+A bunch of **Python** scripts to transform a Wiktionary archive dump into an SQLite database file.
+
+ - With a simple CLI.
+ - With a simple WUI, flask based.
+ - Regex support.
+ -
+
+How to create the database
+--------------------------
+
+First you have to download a wiktionary archive file.
+For example, the file `frwiktionary-20200601-pages-articles.xml.bz2`, which is a full dump of the current version of every page.
+
+For now, you have to decompress it completely before you can process it.
+The use of [bz2](https://docs.python.org/3/library/bz2.html) may be considered in the future to make this step optional and thus reduce disk usage.
+
+~~~shell
+$ bunzip2 frwiktionary-20200601-pages-articles.xml.bz2
+~~~
+
+Then, you will create an intermediate msgpack file containing all the Wiktionary data.
+This file is of interest to developers, not to end users.
+It's a serialization of the dictionary used internally (a Python dictionary).
+
+~~~shell
+$ python dump2msgp.py -i frwiktionary-20200601-pages-articles.xml
+~~~
+
+Then, you can create the SQLite database file.
+~~~shell
+$ python msgPack2sqlite_msgPack.py -i dicofr.msgpk -o dicofr.db
+~~~
+
+You can then use `dicofr.py` to search a word from the CLI or use the WUI with the command:
+~~~shell
+$ python web.py
+~~~
+
+How to use it
+-------------
+
+You can use the CLI.
+
+~~~shell
+$ dicofr -h
+usage: dicofr [-h] [--sql] [--matching] PATTERN
+
+Get a french word's definition.
+
+positional arguments:
+ PATTERN the word or the pattern to match
+
+optional arguments:
+ -h, --help show this help message and exit
+ --sql search a definition using SQL regex, _ to match a letter, % to match a group of letters
+ --matching search the french words that match the regex
+~~~
+
+For example
+
+~~~shell
+$ dicofr julien
+ julien
+ /ʒy.ljɛ̃/, adjectif
+ (Chronologie) Qui est lié à Jules César et à sa décision d’instaurer l’alternance entre trois années de trois cent soixante-cinq jours et une année bissextile de trois cent soixante-six jours.
+ * Calendrier *julien*.
+ * Année *julienne*.
+ * Correction *julienne*.
+~~~
+
+~~~shell
+$ dicofr -m /julien/
+julienois
+juliennette
+juliennoises
+juliennes
+julien
+julienne
+julienoises
+juliennettes
+juliennoise
+julienoise
+juliennois
+juliens
+~~~
+
+How to contribute ?
+-------------------
+
+This project is free; you are welcome to send me a PR to improve this software.
+Respecting each other is the only rule.
+
+License: MIT like.
diff --git a/dicofr.py b/dicofr.py
index ea5741c..9bef7ac 100755
--- a/dicofr.py
+++ b/dicofr.py
@@ -12,7 +12,7 @@ sys.path.insert(-1, DIR_PATH)
import ui
-dico = 'wiktfr.sql'
+dico = 'dicofr.db'
def get_def_sql(word):
@@ -56,7 +56,11 @@ def matching(word):
matchingWord = []
- with open('list_word.msgpk', 'rb') as f:
+ if not exists(arg.wordList):
+ print(f'Error: Word list {arg.wordList} not found', file=sys.stderr)
+ return
+
+ with open(arg.wordList, 'rb') as f:
msgpackList = f.read()
listWord = msgpack.unpackb(msgpackList, raw=False)
@@ -81,8 +85,9 @@ def matching(word):
if __name__ == '__main__':
if len(sys.argv) < 2:
- print("Erreur: Rechercher un mot", file=sys.stderr)
- exit()
+ print("Erreur: Rechercher un mot\nUtilisez l'option -h pour avoir de l'aide",
+ file=sys.stderr)
+ exit(-1)
# Si on n'arrive pas à trouver le dictionnaire
if not exists(dico):
@@ -97,6 +102,8 @@ if __name__ == '__main__':
const=get_def_sql_reg, default=get_def_sql,
help='search a definition using SQL regex, '
'_ to match a letter, %% to match a group of letters')
+ parser.add_argument('-w', '--wordlist', dest='wordList',
+ action='store_const', default='list_word.msgpack')
parser.add_argument('-m', '--matching', dest='matching', action='store_true',
help='search the french words that match the regex')
parser.add_argument('word', metavar='PATTERN', type=str,
@@ -113,6 +120,4 @@ if __name__ == '__main__':
else:
for w in arg.action(arg.word):
ui.show_terminal(w)
- if not ret:
- exit(1)
diff --git a/main.py b/dump2msgp.py
index d4fb050..d4fb050 100644
--- a/main.py
+++ b/dump2msgp.py
diff --git a/msgPack2sqlite_msgPack.py b/msgPack2sqlite_msgPack.py
index b77dd2e..c251d59 100644
--- a/msgPack2sqlite_msgPack.py
+++ b/msgPack2sqlite_msgPack.py
@@ -2,36 +2,53 @@
import msgpack
import ui
+import sys
import sqlite3
-
-with open('result_all.pack', 'rb') as f:
- r = f.read()
-
-d = p = msgpack.unpackb(r, raw=False)
-del r
-
-with sqlite3.connect("result_all.sql") as con:
- cur = con.cursor()
- cur.execute('''CREATE TABLE IF NOT EXISTS entry (
- word TEXT,
- cat_gram TEXT,
- API TEXT,
- infos TEXT,
- genre TEXT,
- accord TEXT,
- defs BLOG,
- ID INTEGER PRIMARY KEY)''')
- con.commit()
-
- for w, listW in d.items():
- for word in listW:
- data = (w, word['cat-gram'], word['API'], "\t".join(word['infos']),
- word['genre'], word['accord'],
- msgpack.packb(word['def']))
- cur.execute('''INSERT INTO entry (word, cat_gram, API, infos,
- genre, accord, defs) VALUES (?, ?, ?, ?, ?, ?, ?)''', data)
- con.commit()
+import argparse
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='wiktionary dump msgpack '
+ 'to SQLite database file')
+ parser.add_argument('-o', '--out', dest='outputF', action='store',
+ help='the output filename')
+ parser.add_argument('-i', '--input', dest='inputF', action='store',
+ help='the input filename, a dump of witionary')
+
+ arg = parser.parse_args()
+
+ if arg.inputF is None:
+ print('Error input file needed', file=sys.stderr)
+ if arg.outputF is None:
+ print('Error output file needed', file=sys.stderr)
+
+ with open(arg.inputF, 'rb') as f:
+ r = f.read()
+
+ d = p = msgpack.unpackb(r, raw=False)
+ del r
+
+ with sqlite3.connect(arg.outputF) as con:
+ cur = con.cursor()
+ cur.execute('''CREATE TABLE IF NOT EXISTS entry (
+ word TEXT,
+ cat_gram TEXT,
+ API TEXT,
+ infos TEXT,
+ genre TEXT,
+ accord TEXT,
+ defs BLOG,
+ ID INTEGER PRIMARY KEY)''')
+ con.commit()
+
+ for w, listW in d.items():
+ for word in listW:
+ data = (w, word['cat-gram'], word['API'], "\t".join(word['infos']),
+ word['genre'], word['accord'],
+ msgpack.packb(word['def']))
+ cur.execute('''INSERT INTO entry (word, cat_gram, API, infos,
+ genre, accord, defs) VALUES (?, ?, ?, ?, ?, ?, ?)''', data)
+ con.commit()
def give_def(w):