aboutsummaryrefslogtreecommitdiff
path: root/download/download.py
diff options
context:
space:
mode:
Diffstat (limited to 'download/download.py')
-rwxr-xr-xdownload/download.py90
1 files changed, 77 insertions, 13 deletions
diff --git a/download/download.py b/download/download.py
index b97bb8f..21137f0 100755
--- a/download/download.py
+++ b/download/download.py
@@ -1,5 +1,74 @@
#!/bin/env python
+"""dfr - Prepare database
+
+This script downloads the latest Wiktionary dump from Wikimedia, extracts it,
+processes it and then writes the result into a database.
+The output of this script is an SQLite database file usable with dfr.py.
+
+Because downloading the dump can be challenging, you can specify an input file
+that will be used instead of the latest dump from Wikimedia. That is the purpose
+of the command line option `--input`.
+
+###########################
+⚠️ WARNING: Be aware that a compressed Wiktionary archive dump is big, and
+the decompressed dump is bigger still, so a lot of disk space is
+needed to store these files.
+
+⚠️ WARNING: Be aware that Wiktionary dumps are really big and that a lot of
+memory is needed to process them. It is NOT recommended to try to create a
+database file on a computer with less than 2 GiB of memory, even when using
+the on-the-fly decompression method.
+###########################
+
+There are only a few other command line options that can be used with this script.
+
+ + --output
+ The name of the final SQLite database. By default it is `dfr.db`. dfr.py
+ expects the database to be at the root of the project and to be
+ named `dfr.db`, so with the default you don't have to modify anything here.
+
++ --input
+ As described earlier, if you have already downloaded the Wiktionary dump,
+ you can set the location of the file that will be used here.
+ The purpose is to avoid re-downloading the dump each time.
+
+ The "download" and "input" options are incompatible.
+
++ --download
+ Force the download of the file even if the file is already downloaded.
+ It is used to force an update of the database.
+
+ The "download" and "input" options are incompatible.
+
++ --word-list
+ Not useful for the moment. dfr will have a feature to list every
+ word in the dictionary. You will be able to filter words based on a regex;
+ optionally, there will be an option to auto-correct a word, but nothing
+ is implemented here yet.
+ In all cases, you can specify the name of the file that will store
+ every word.
+
+Note: This script has several ways to decompress the Wiktionary archive dump.
+
+First, it will try the `bzip2` command; if that fails, this script will try
+to extract the bzip2 file on the fly using the `bz2` Python module.
+
+ - The `bzip2` command is very fast but uses a lot of disk space and consumes a lot
+of memory (RAM) to quickly decompress the file.
+ - The `bz2` Python module isn't as fast but uses less memory, and the parsing
+is also done on the fly.
+
+In every case, a lot of memory (RAM) is necessary to process the latest Wiktionary dump.
+
+"""
+
+# TODO: Add an option to set the URL of the dump, to cache a file other than the latest one or a dump for a language other than fr.
+
+# TODO: Add an option to choose the extract method.
+
+# TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could greatly reduce memory (RAM) usage and possibly allow the database to be created on low-memory computers (less than 2 GiB).
+
import argparse
import sys
import urllib.request
@@ -145,17 +214,12 @@ if __name__ == '__main__':
print(f'Removing temporary files')
os.remove(arg.dumpF)
else:
- try:
- output_fn = arg.dumpF[:-4]
- with open(output_fn, 'r') as f:
- print('Create the database')
- res = dump2msgp.extractAll(f, 'error.log', False)
- msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
- print(f'Database { arg.outputF } created ! 👏 🎉')
- except:
- print('Failed to extract database')
- print('Exiting (-3)')
- exit(-3)
-
- print(f'Removing temporary files')
+ output_fn = arg.dumpF[:-4]
+ with open(output_fn, 'r') as f:
+ print('Create the database')
+ res = dump2msgp.extractAll(f, 'error.log', False)
+ msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
+ print(f'Database { arg.outputF } created ! 👏 🎉')
+
+ print('Removing temporary files')
os.remove(output_fn)