diff options
author | ache <ache@ache.one> | 2021-10-03 02:31:32 +0200 |
---|---|---|
committer | ache <ache@ache.one> | 2021-10-03 02:31:54 +0200 |
commit | 7e1d9e251b517153db8b639133c9e3bee266ce1b (patch) | |
tree | c1e02297807107096cf5a8962ed51342a69f0626 | |
parent | Command every scripts (diff) |
Rename files
-rw-r--r-- | Makefile | 4 | ||||
-rwxr-xr-x | dfr/createDB.py (renamed from download/download.py) | 35 | ||||
-rw-r--r-- | dfr/dump2msgp.py (renamed from download/dump2msgp.py) | 13 | ||||
-rw-r--r-- | dfr/msgp2sqlite.py (renamed from download/msgPack2sqlite_msgPack.py) | 0 | ||||
-rw-r--r-- | dfr/sectionList.py (renamed from download/sectionList.py) | 0 | ||||
-rw-r--r-- | dfr/template.py (renamed from download/template.py) | 0 | ||||
-rw-r--r-- | download/bz2toDB.py | 35 |
7 files changed, 29 insertions, 58 deletions
@@ -12,7 +12,7 @@ install: # Copy code then assets cp -u *.py ${DIR_INSTALL_PATH}/ - cp -r download ${DIR_INSTALL_PATH}/download + cp -r dfr ${DIR_INSTALL_PATH}/dfr cp -r assets ${DIR_INSTALL_PATH}/assets @[ -f frwiktionary-latest-pages-meta-current.xml.bz2 ] && \ cp -u frwiktionary-latest-pages-meta-current.xml.bz2 ${DIR_INSTALL_PATH}/ \ @@ -23,7 +23,7 @@ install: @ echo '' # Get the external assets if needed - cd ${DIR_INSTALL_PATH}; python ${DIR_INSTALL_PATH}/download/download.py -d -o "${DIR_INSTALL_PATH}/assets/dfr.db" + cd ${DIR_INSTALL_PATH}; python ${DIR_INSTALL_PATH}/dfr/createDB.py -d -o "${DIR_INSTALL_PATH}/assets/dfr.db" @ echo '' # Set permission and install command diff --git a/download/download.py b/dfr/createDB.py index 21137f0..acf443d 100755 --- a/download/download.py +++ b/dfr/createDB.py @@ -11,11 +11,11 @@ will be used instead of the last dump from wikimedia. It's the goal of the command line option `--download`. ########################### -⚠️ WARNING: Beaware that a wiktionary cmpressed archive dump is big. +⚠ WARNING: Beware that a wiktionary compressed archive dump is big. Then a decompressed one is bigger again and so a lot of disk space is needed to store these files. -⚠️ WARNING: Beaware that wiktionary dump are really big and than a lot of memory +⚠ WARNING: Beware that wiktionary dump are really big and than a lot of memory space are needed to process them. It's NOT recommended to try to create a database file on a computer with less than 2Gio memory, even using the on the fly decompression method. @@ -42,11 +42,11 @@ They is only a few other command line options that can use with that script. The "download" and "input" options are incompatibles. + --word-list - Not usefull for the moment. dfr will have a functionality to list every - words in the dictionnary. You will be able to filter words based on regex, + Not useful for the moment. dfr will have a functionality to list every + words in the dictionary. 
You will be able to filter words based on regex, optionally, there will be a option to auto correct a word but nothing is implemented here. - In all cases, you can specify the filename of the file that will store + In any cases, you can specify the file name of the file that will store every words. Note: This script have many option to decompress the wiktionary archive dump. @@ -69,17 +69,6 @@ In every case, a lot of memory (RAM) is necessary to process the last wiktionary # TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could reduce a lot the memory (RAM) usage and possibly allow creation of the database on low memory computer (less than 2Gio). -import argparse -import sys -import urllib.request -import dump2msgp -import msgPack2sqlite_msgPack -import subprocess -import os - -from os.path import exists - - URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2' @@ -101,6 +90,16 @@ def unbz2(file): if __name__ == '__main__': + import argparse + import sys + import urllib.request + import dump2msgp + import msgp2sqlite + import subprocess + import os + + from os.path import exists + parser = argparse.ArgumentParser(description='Download and create the database') parser.add_argument('-o', '--output', dest='outputF', action='store', help='the output, the database filename', @@ -203,7 +202,7 @@ if __name__ == '__main__': with open(arg.wordList, 'wb'): f.write('\n'.join(a.keys())) - msgPack2sqlite_msgPack.writeDB(arg.outputF, res) + msgp2sqlite.writeDB(arg.outputF, res) print(f'Word list { arg.wordList } created ! π π') print(f'Database { arg.outputF } created ! 
π π') except: @@ -218,7 +217,7 @@ if __name__ == '__main__': with open(output_fn, 'r') as f: print('Create the database') res = dump2msgp.extractAll(f, 'error.log', False) - msgPack2sqlite_msgPack.writeDB(arg.outputF, res) + msgp2sqlite.writeDB(arg.outputF, res) print(f'Database { arg.outputF } created ! π π') print('Removing temporary files') diff --git a/download/dump2msgp.py b/dfr/dump2msgp.py index e76115c..1d3e1a2 100644 --- a/download/dump2msgp.py +++ b/dfr/dump2msgp.py @@ -40,8 +40,15 @@ import sys import msgpack import argparse -from sectionList import listInfoSection -from template import template +if __name__ == '__main__': + from sectionList import listInfoSection + from template import template +else: + from dfr.sectionList import listInfoSection + from dfr.template import template + + +dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)} DEFAULT_OUTPUT = 'dfr.msgpk' @@ -90,7 +97,6 @@ template_second_lambda_snd = { 'vérifier': (lambda x: '(à vérifier : ' + x + ')'), } -dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)} interdit = " :" @@ -365,6 +371,7 @@ def extractAll(f, errorF, ignore): if __name__ == '__main__': + parser = argparse.ArgumentParser(description='wiktionary dump to msgpack') parser.add_argument('-o', '--out', dest='outputF', action='store_const', const=DEFAULT_OUTPUT, default=DEFAULT_OUTPUT, diff --git a/download/msgPack2sqlite_msgPack.py b/dfr/msgp2sqlite.py index c08efdb..c08efdb 100644 --- a/download/msgPack2sqlite_msgPack.py +++ b/dfr/msgp2sqlite.py diff --git a/download/sectionList.py b/dfr/sectionList.py index 68dd657..68dd657 100644 --- a/download/sectionList.py +++ b/dfr/sectionList.py diff --git a/download/template.py b/dfr/template.py index fa0394a..fa0394a 100644 --- a/download/template.py +++ b/dfr/template.py diff --git a/download/bz2toDB.py b/download/bz2toDB.py deleted file mode 100644 index a0c2cd3..0000000 --- a/download/bz2toDB.py +++ /dev/null @@ -1,35 +0,0 @@ -""" Not a script - 
-Don't use that script - - -This python file store function related to bz2 python module and then the -on the fly method of decompression. - -""" - -import bz2 -import sys - - -def unbz2(file): - decomp = bz2.BZ2Decompressor() - buf = b'' - for c in file: - buf += decomp.decompress(c) - - while b'\n' in buf: - i = buf.index(b'\n') - if i + 1 < len(buf): - ret = buf[:i + 1] - buf = buf[i + 1:] - yield ret.decode("utf-8") - else: - yield buf - buf = b'' - - -with open('./wiktionary_dump.xml.bz2', 'rb') as f: - it = iter(lambda: f.read(32768), b'') - for a in unbz2(it): - print(a, end='') |