Rename files

author: ache <ache@ache.one> 2021-10-03 02:31:32 +0200
committer: ache <ache@ache.one> 2021-10-03 02:31:54 +0200
commit: 7e1d9e251b517153db8b639133c9e3bee266ce1b (patch)
tree: c1e02297807107096cf5a8962ed51342a69f0626
parent: Command every scripts (diff)
7 files changed, 29 insertions, 58 deletions
diff --git a/Makefile b/Makefile
index 66da002..b9880ce 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ install:
 
 	# Copy code then assets
 	cp -u *.py ${DIR_INSTALL_PATH}/
-	cp -r download ${DIR_INSTALL_PATH}/download
+	cp -r dfr ${DIR_INSTALL_PATH}/dfr
 	cp -r assets ${DIR_INSTALL_PATH}/assets
 	@[ -f frwiktionary-latest-pages-meta-current.xml.bz2 ] && \
 		cp -u frwiktionary-latest-pages-meta-current.xml.bz2 ${DIR_INSTALL_PATH}/ \
@@ -23,7 +23,7 @@ install:
 	@ echo ''
 
 	# Get the external assets if needed
-	cd ${DIR_INSTALL_PATH}; python ${DIR_INSTALL_PATH}/download/download.py -d -o "${DIR_INSTALL_PATH}/assets/dfr.db"
+	cd ${DIR_INSTALL_PATH}; python ${DIR_INSTALL_PATH}/dfr/createDB.py -d -o "${DIR_INSTALL_PATH}/assets/dfr.db"
 	@ echo ''
 
 	# Set permission and install command
diff --git a/download/download.py b/dfr/createDB.py
index 21137f0..acf443d 100755
--- a/download/download.py
+++ b/dfr/createDB.py
@@ -11,11 +11,11 @@ will be used instead of the last dump from wikimedia. It's the goal of the
 command line option `--download`.
 
 ###########################
-⚠️ WARNING: Beaware that a wiktionary cmpressed archive dump is big.
+❗ WARNING: Beware that a wiktionary compressed archive dump is big.
 Then a decompressed one is bigger again and so a lot of disk space is
 needed to store these files.
 
-⚠️ WARNING: Beaware that wiktionary dump are really big and than a lot of memory
+❗ WARNING: Beware that wiktionary dumps are really big and that a lot of memory
 space are needed to process them. It's NOT recommended to try to create a
 database file on a computer with less than 2Gio memory, even using the on
 the fly decompression method.
@@ -42,11 +42,11 @@ They is only a few other command line options that can use with that script.
     The "download" and "input" options are incompatibles.
 
 + --word-list
-    Not usefull for the moment. dfr will have a functionality to list every
-    words in the dictionnary. You will be able to filter words based on regex,
+    Not useful for the moment. dfr will have a feature to list every
+    word in the dictionary. You will be able to filter words based on regex,
     optionally, there will be a option to auto correct a word but nothing
     is implemented here.
-    In all cases, you can specify the filename of the file that will store
+    In any case, you can specify the name of the file that will store
     every words.
 
 Note: This script have many option to decompress the wiktionary archive dump.
@@ -69,17 +69,6 @@ In every case, a lot of memory (RAM) is necessary to process the last wiktionary
 
 # TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could reduce a lot the memory (RAM) usage and possibly allow creation of the database on low memory computer (less than 2Gio).
 
-import argparse
-import sys
-import urllib.request
-import dump2msgp
-import msgPack2sqlite_msgPack
-import subprocess
-import os
-
-from os.path import exists
-
-
 URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'
 
 
@@ -101,6 +90,16 @@ def unbz2(file):
 
 
 if __name__ == '__main__':
+    import argparse
+    import sys
+    import urllib.request
+    import dump2msgp
+    import msgp2sqlite
+    import subprocess
+    import os
+
+    from os.path import exists
+
     parser = argparse.ArgumentParser(description='Download and create the database')
     parser.add_argument('-o', '--output', dest='outputF', action='store',
                         help='the output, the database filename',
@@ -203,7 +202,7 @@ if __name__ == '__main__':
                 with open(arg.wordList, 'wb'):
                     f.write('\n'.join(a.keys()))
 
-                msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
+                msgp2sqlite.writeDB(arg.outputF, res)
             print(f'Word list { arg.wordList } created ! 👏 🎉')
             print(f'Database { arg.outputF } created ! 👏 🎉')
         except:
@@ -218,7 +217,7 @@ if __name__ == '__main__':
         with open(output_fn, 'r') as f:
             print('Create the database')
             res = dump2msgp.extractAll(f, 'error.log', False)
-            msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
+            msgp2sqlite.writeDB(arg.outputF, res)
         print(f'Database { arg.outputF } created ! 👏 🎉')
 
         print('Removing temporary files')
diff --git a/download/dump2msgp.py b/dfr/dump2msgp.py
index e76115c..1d3e1a2 100644
--- a/download/dump2msgp.py
+++ b/dfr/dump2msgp.py
@@ -40,8 +40,15 @@ import sys
 import msgpack
 import argparse
 
-from sectionList import listInfoSection
-from template import template
+if __name__ == '__main__':
+    from sectionList import listInfoSection
+    from template import template
+else:
+    from dfr.sectionList import listInfoSection
+    from dfr.template import template
+
+
+dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)}
 
 
 DEFAULT_OUTPUT = 'dfr.msgpk'
@@ -90,7 +97,6 @@ template_second_lambda_snd = {
     'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
 }
 
-dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)}
 
 interdit = " :"
 
@@ -365,6 +371,7 @@ def extractAll(f, errorF, ignore):
 
 
 if __name__ == '__main__':
+
     parser = argparse.ArgumentParser(description='wiktionary dump to msgpack')
     parser.add_argument('-o', '--out', dest='outputF', action='store_const',
                         const=DEFAULT_OUTPUT, default=DEFAULT_OUTPUT,
diff --git a/download/msgPack2sqlite_msgPack.py b/dfr/msgp2sqlite.py
index c08efdb..c08efdb 100644
--- a/download/msgPack2sqlite_msgPack.py
+++ b/dfr/msgp2sqlite.py
diff --git a/download/sectionList.py b/dfr/sectionList.py
index 68dd657..68dd657 100644
--- a/download/sectionList.py
+++ b/dfr/sectionList.py
diff --git a/download/template.py b/dfr/template.py
index fa0394a..fa0394a 100644
--- a/download/template.py
+++ b/dfr/template.py
diff --git a/download/bz2toDB.py b/download/bz2toDB.py
deleted file mode 100644
index a0c2cd3..0000000
--- a/download/bz2toDB.py
+++ /dev/null
@@ -1,35 +0,0 @@
-""" Not a script
-
-Don't use that script
-
-
-This python file store function related to bz2 python module and then the
-on the fly method of decompression.
-
-"""
-
-import bz2
-import sys
-
-
-def unbz2(file):
-    decomp = bz2.BZ2Decompressor()
-    buf = b''
-    for c in file:
-        buf += decomp.decompress(c)
-
-        while b'\n' in buf:
-            i = buf.index(b'\n')
-            if i + 1 < len(buf):
-                ret = buf[:i + 1]
-                buf = buf[i + 1:]
-                yield ret.decode("utf-8")
-            else:
-                yield buf
-                buf = b''
-
-
-with open('./wiktionary_dump.xml.bz2', 'rb') as f:
-    it = iter(lambda: f.read(32768), b'')
-    for a in unbz2(it):
-        print(a, end='')
author	ache <ache@ache.one>	2021-10-03 02:31:32 +0200
committer	ache <ache@ache.one>	2021-10-03 02:31:54 +0200
commit	7e1d9e251b517153db8b639133c9e3bee266ce1b (patch)
tree	c1e02297807107096cf5a8962ed51342a69f0626
parent	Command every scripts (diff)