Documentation README

author: ache <ache@ache.one> 2020-06-16 17:37:32 +0200
committer: ache <ache@ache.one> 2020-06-16 17:37:32 +0200
commit: ac7c2e5d071151f69872f8e97dac414e41976168 (patch)
tree: e2f9c51b5f89e61ce4e08125160b46cd5994f9f2
parent: Regex matching (diff)
4 files changed, 158 insertions, 34 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6d9a041
--- /dev/null
+++ b/README.md
@@ -0,0 +1,102 @@
+Dicofr
+======
+
+A utility to create and query a French dictionary based on a [Wiktionary archive dump](https://dumps.wikimedia.org/frwiktionary/20200601/).
+
+
+Technically
+----------
+
+A bunch of **Python** scripts that transform a Wiktionary archive dump into an SQLite database file.
+
+ - With a simple CLI.
+ - With a simple WUI, flask based.
+ - Regex support.
+ - 
+
+How to create the database
+--------------------------
+
+First you have to download a wiktionary archive file.
+For example, the file `frwiktionary-20200601-pages-articles.xml.bz2`, which is a full dump of the current version of every page.
+
+For now, you have to decompress it completely before you can process it.
+The use of [bz2](https://docs.python.org/3/library/bz2.html) may be considered in the future to make this step optional and thus reduce disk usage.
+
+~~~shell
+$ bunzip2 frwiktionary-20200601-pages-articles.xml.bz2
+~~~
+
+Then, create an intermediate file, a msgpack file, containing all the Wiktionary data.
+This file is of interest to developers, not to end users.
+It is a serialization of the dictionary used internally (a Python dictionary).
+
+~~~shell
+$ python dump2msgp.py -i frwiktionary-20200601-pages-articles.xml
+~~~
+
+Then, you can create the SQLite database file.
+~~~shell
+$ python msgPack2sqlite_msgPack.py -i dicofr.msgpk -o dicofr.db
+~~~
+
+You can then use `dicofr.py` to search for a word from the CLI, or use the WUI with the command:
+~~~shell
+$ python web.py
+~~~
+
+How to use it
+-------------
+
+You can use the CLI.
+
+~~~shell
+$ dicofr -h
+usage: dicofr [-h] [--sql] [--matching] PATTERN
+
+Get a french word's definition.
+
+positional arguments:
+  PATTERN     the word or the pattern to match
+
+optional arguments:
+  -h, --help  show this help message and exit
+  --sql       search a definition using SQL regex, _ to match a letter, % to match a group of letters
+  --matching  search the french words that match the regex
+~~~
+
+For example
+
+~~~shell
+$ dicofr julien
+   julien
+   /ʒy.ljɛ̃/, adjectif
+   	(Chronologie) Qui est lié à Jules César et à sa décision d’instaurer l’alternance entre trois années de trois cent soixante-cinq jours et une année bissextile de trois cent soixante-six jours.
+   		 * Calendrier *julien*.
+   		 * Année *julienne*.
+   		 * Correction *julienne*.
+~~~
+
+~~~shell
+$ dicofr -m /julien/
+julienois
+juliennette
+juliennoises
+juliennes
+julien
+julienne
+julienoises
+juliennettes
+juliennoise
+julienoise
+juliennois
+juliens
+~~~
+
+How to contribute ?
+-------------------
+
+This project is free software; you are free to send me a PR to improve it.
+Respecting each other is the only rule.
+
+License: MIT like.
diff --git a/dicofr.py b/dicofr.py
index ea5741c..9bef7ac 100755
--- a/dicofr.py
+++ b/dicofr.py
@@ -12,7 +12,7 @@ sys.path.insert(-1, DIR_PATH)
 
 import ui
 
-dico = 'wiktfr.sql'
+dico = 'dicofr.db'
 
 
 def get_def_sql(word):
@@ -56,7 +56,11 @@ def matching(word):
 
     matchingWord = []
 
-    with open('list_word.msgpk', 'rb') as f:
+    if not exists(arg.wordList):
+        print(f'Error: Word list {arg.wordList} not found', file=sys.stderr)
+        return
+
+    with open(arg.wordList, 'rb') as f:
         msgpackList = f.read()
         listWord = msgpack.unpackb(msgpackList, raw=False)
 
@@ -81,8 +85,9 @@ def matching(word):
 
 if __name__ == '__main__':
     if len(sys.argv) < 2:
-      print("Erreur: Rechercher un mot", file=sys.stderr)
-      exit()
+      print("Erreur: Rechercher un mot\nUtilisez l'option -h pour avoir de l'aide",
+            file=sys.stderr)
+      exit(-1)
 
     # Si on n'arrive pas à trouver le dictionnaire
     if not exists(dico):
@@ -97,6 +102,8 @@ if __name__ == '__main__':
                         const=get_def_sql_reg, default=get_def_sql,
                         help='search a definition using SQL regex, '
                              '_ to match a letter, %% to match a group of letters')
+    parser.add_argument('-w', '--wordlist', dest='wordList',
+                        action='store_const', default='list_word.msgpack')
     parser.add_argument('-m', '--matching', dest='matching', action='store_true',
                         help='search the french words that match the regex')
     parser.add_argument('word', metavar='PATTERN', type=str,
@@ -113,6 +120,4 @@ if __name__ == '__main__':
     else:
         for w in arg.action(arg.word):
             ui.show_terminal(w)
-        if not ret:
-            exit(1)
 
diff --git a/main.py b/dump2msgp.py
index d4fb050..d4fb050 100644
--- a/main.py
+++ b/dump2msgp.py
diff --git a/msgPack2sqlite_msgPack.py b/msgPack2sqlite_msgPack.py
index b77dd2e..c251d59 100644
--- a/msgPack2sqlite_msgPack.py
+++ b/msgPack2sqlite_msgPack.py
@@ -2,36 +2,53 @@
 
 import msgpack
 import ui
+import sys
 
 import sqlite3
-
-with open('result_all.pack', 'rb') as f:
-    r = f.read()
-
-d = p = msgpack.unpackb(r, raw=False)
-del r
-
-with sqlite3.connect("result_all.sql") as con:
-    cur = con.cursor()
-    cur.execute('''CREATE TABLE IF NOT EXISTS entry (
-        word TEXT,
-        cat_gram TEXT,
-        API TEXT,
-        infos TEXT,
-        genre TEXT,
-        accord TEXT,
-        defs BLOG,
-        ID INTEGER PRIMARY KEY)''')
-    con.commit()
-
-    for w, listW in d.items():
-        for word in listW:
-            data = (w, word['cat-gram'], word['API'], "\t".join(word['infos']),
-                    word['genre'], word['accord'],
-                    msgpack.packb(word['def']))
-            cur.execute('''INSERT INTO entry (word, cat_gram, API, infos,
-            genre, accord, defs) VALUES (?, ?, ?, ?,  ?, ?,  ?)''', data)
-    con.commit()
+import argparse
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='wiktionary dump msgpack '
+                                     'to SQLite database file')
+    parser.add_argument('-o', '--out', dest='outputF', action='store',
+                        help='the output filename')
+    parser.add_argument('-i', '--input', dest='inputF', action='store',
+                        help='the input filename, a dump of witionary')
+
+    arg = parser.parse_args()
+
+    if arg.inputF is None:
+        print('Error input file needed', file=sys.stderr)
+    if arg.outputF is None:
+        print('Error output file needed', file=sys.stderr)
+
+    with open(arg.inputF, 'rb') as f:
+        r = f.read()
+
+    d = p = msgpack.unpackb(r, raw=False)
+    del r
+
+    with sqlite3.connect(arg.outputF) as con:
+        cur = con.cursor()
+        cur.execute('''CREATE TABLE IF NOT EXISTS entry (
+            word TEXT,
+            cat_gram TEXT,
+            API TEXT,
+            infos TEXT,
+            genre TEXT,
+            accord TEXT,
+            defs BLOG,
+            ID INTEGER PRIMARY KEY)''')
+        con.commit()
+
+        for w, listW in d.items():
+            for word in listW:
+                data = (w, word['cat-gram'], word['API'], "\t".join(word['infos']),
+                        word['genre'], word['accord'],
+                        msgpack.packb(word['def']))
+                cur.execute('''INSERT INTO entry (word, cat_gram, API, infos,
+                genre, accord, defs) VALUES (?, ?, ?, ?,  ?, ?,  ?)''', data)
+        con.commit()
 
 
 def give_def(w):
author	ache <ache@ache.one>	2020-06-16 17:37:32 +0200
committer	ache <ache@ache.one>	2020-06-16 17:37:32 +0200
commit	ac7c2e5d071151f69872f8e97dac414e41976168 (patch)
tree	e2f9c51b5f89e61ce4e08125160b46cd5994f9f2
parent	Regex matching (diff)