aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: ache <ache@ache.one> 2021-10-03 02:31:32 +0200
committer: ache <ache@ache.one> 2021-10-03 02:31:54 +0200
commit: 7e1d9e251b517153db8b639133c9e3bee266ce1b (patch)
tree: c1e02297807107096cf5a8962ed51342a69f0626
parent: Command every scripts (diff)
Rename files
-rw-r--r-- Makefile 4
-rwxr-xr-x dfr/createDB.py (renamed from download/download.py) 35
-rw-r--r-- dfr/dump2msgp.py (renamed from download/dump2msgp.py) 13
-rw-r--r-- dfr/msgp2sqlite.py (renamed from download/msgPack2sqlite_msgPack.py) 0
-rw-r--r-- dfr/sectionList.py (renamed from download/sectionList.py) 0
-rw-r--r-- dfr/template.py (renamed from download/template.py) 0
-rw-r--r-- download/bz2toDB.py 35
7 files changed, 29 insertions, 58 deletions
diff --git a/Makefile b/Makefile
index 66da002..b9880ce 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ install:
# Copy code then assets
cp -u *.py ${DIR_INSTALL_PATH}/
- cp -r download ${DIR_INSTALL_PATH}/download
+ cp -r dfr ${DIR_INSTALL_PATH}/dfr
cp -r assets ${DIR_INSTALL_PATH}/assets
@[ -f frwiktionary-latest-pages-meta-current.xml.bz2 ] && \
cp -u frwiktionary-latest-pages-meta-current.xml.bz2 ${DIR_INSTALL_PATH}/ \
@@ -23,7 +23,7 @@ install:
@ echo ''
# Get the external assets if needed
- cd ${DIR_INSTALL_PATH}; python ${DIR_INSTALL_PATH}/download/download.py -d -o "${DIR_INSTALL_PATH}/assets/dfr.db"
+ cd ${DIR_INSTALL_PATH}; python ${DIR_INSTALL_PATH}/dfr/createDB.py -d -o "${DIR_INSTALL_PATH}/assets/dfr.db"
@ echo ''
# Set permission and install command
diff --git a/download/download.py b/dfr/createDB.py
index 21137f0..acf443d 100755
--- a/download/download.py
+++ b/dfr/createDB.py
@@ -11,11 +11,11 @@ will be used instead of the last dump from wikimedia. It's the goal of the
command line option `--download`.
###########################
-⚠️ WARNING: Beaware that a wiktionary cmpressed archive dump is big.
+❗ WARNING: Beware that a wiktionary compressed archive dump is big.
Then a decompressed one is bigger again and so a lot of disk space is
needed to store these files.
-⚠️ WARNING: Beaware that wiktionary dump are really big and than a lot of memory
+❗ WARNING: Beware that wiktionary dump are really big and than a lot of memory
space are needed to process them. It's NOT recommended to try to create a
database file on a computer with less than 2Gio memory, even using the on
the fly decompression method.
@@ -42,11 +42,11 @@ They is only a few other command line options that can use with that script.
The "download" and "input" options are incompatibles.
+ --word-list
- Not usefull for the moment. dfr will have a functionality to list every
- words in the dictionnary. You will be able to filter words based on regex,
+ Not useful for the moment. dfr will have a functionality to list every
+ words in the dictionary. You will be able to filter words based on regex,
optionally, there will be a option to auto correct a word but nothing
is implemented here.
- In all cases, you can specify the filename of the file that will store
+ In any cases, you can specify the file name of the file that will store
every words.
Note: This script have many option to decompress the wiktionary archive dump.
@@ -69,17 +69,6 @@ In every case, a lot of memory (RAM) is necessary to process the last wiktionary
# TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could reduce a lot the memory (RAM) usage and possibly allow creation of the database on low memory computer (less than 2Gio).
-import argparse
-import sys
-import urllib.request
-import dump2msgp
-import msgPack2sqlite_msgPack
-import subprocess
-import os
-
-from os.path import exists
-
-
URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'
@@ -101,6 +90,16 @@ def unbz2(file):
if __name__ == '__main__':
+ import argparse
+ import sys
+ import urllib.request
+ import dump2msgp
+ import msgp2sqlite
+ import subprocess
+ import os
+
+ from os.path import exists
+
parser = argparse.ArgumentParser(description='Download and create the database')
parser.add_argument('-o', '--output', dest='outputF', action='store',
help='the output, the database filename',
@@ -203,7 +202,7 @@ if __name__ == '__main__':
with open(arg.wordList, 'wb'):
f.write('\n'.join(a.keys()))
- msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
+ msgp2sqlite.writeDB(arg.outputF, res)
print(f'Word list { arg.wordList } created ! 👍 🎉')
print(f'Database { arg.outputF } created ! 👍 🎉')
except:
@@ -218,7 +217,7 @@ if __name__ == '__main__':
with open(output_fn, 'r') as f:
print('Create the database')
res = dump2msgp.extractAll(f, 'error.log', False)
- msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
+ msgp2sqlite.writeDB(arg.outputF, res)
print(f'Database { arg.outputF } created ! 👍 🎉')
print('Removing temporary files')
diff --git a/download/dump2msgp.py b/dfr/dump2msgp.py
index e76115c..1d3e1a2 100644
--- a/download/dump2msgp.py
+++ b/dfr/dump2msgp.py
@@ -40,8 +40,15 @@ import sys
import msgpack
import argparse
-from sectionList import listInfoSection
-from template import template
+if __name__ == '__main__':
+ from sectionList import listInfoSection
+ from template import template
+else:
+ from dfr.sectionList import listInfoSection
+ from dfr.template import template
+
+
+dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)}
DEFAULT_OUTPUT = 'dfr.msgpk'
@@ -90,7 +97,6 @@ template_second_lambda_snd = {
'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
}
-dictMatch = {x['match']: i for (i, x) in enumerate(listInfoSection)}
interdit = " :"
@@ -365,6 +371,7 @@ def extractAll(f, errorF, ignore):
if __name__ == '__main__':
+
parser = argparse.ArgumentParser(description='wiktionary dump to msgpack')
parser.add_argument('-o', '--out', dest='outputF', action='store_const',
const=DEFAULT_OUTPUT, default=DEFAULT_OUTPUT,
diff --git a/download/msgPack2sqlite_msgPack.py b/dfr/msgp2sqlite.py
index c08efdb..c08efdb 100644
--- a/download/msgPack2sqlite_msgPack.py
+++ b/dfr/msgp2sqlite.py
diff --git a/download/sectionList.py b/dfr/sectionList.py
index 68dd657..68dd657 100644
--- a/download/sectionList.py
+++ b/dfr/sectionList.py
diff --git a/download/template.py b/dfr/template.py
index fa0394a..fa0394a 100644
--- a/download/template.py
+++ b/dfr/template.py
diff --git a/download/bz2toDB.py b/download/bz2toDB.py
deleted file mode 100644
index a0c2cd3..0000000
--- a/download/bz2toDB.py
+++ /dev/null
@@ -1,35 +0,0 @@
-""" Not a script
-
-Don't use that script
-
-
-This python file store function related to bz2 python module and then the
-on the fly method of decompression.
-
-"""
-
-import bz2
-import sys
-
-
-def unbz2(file):
- decomp = bz2.BZ2Decompressor()
- buf = b''
- for c in file:
- buf += decomp.decompress(c)
-
- while b'\n' in buf:
- i = buf.index(b'\n')
- if i + 1 < len(buf):
- ret = buf[:i + 1]
- buf = buf[i + 1:]
- yield ret.decode("utf-8")
- else:
- yield buf
- buf = b''
-
-
-with open('./wiktionary_dump.xml.bz2', 'rb') as f:
- it = iter(lambda: f.read(32768), b'')
- for a in unbz2(it):
- print(a, end='')