diff options
author | ache <ache@ache.one> | 2021-10-03 02:31:32 +0200 |
---|---|---|
committer | ache <ache@ache.one> | 2021-10-03 02:31:54 +0200 |
commit | 7e1d9e251b517153db8b639133c9e3bee266ce1b (patch) | |
tree | c1e02297807107096cf5a8962ed51342a69f0626 /download/download.py | |
parent | Command every scripts (diff) |
Rename files
Diffstat (limited to 'download/download.py')
-rwxr-xr-x | download/download.py | 225 |
1 files changed, 0 insertions, 225 deletions
diff --git a/download/download.py b/download/download.py deleted file mode 100755 index 21137f0..0000000 --- a/download/download.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/bin/env python - -"""dfr - Prepare database - -This script will download the latest wiktionary dump from wikimedia, extract it, -process it and then store the result in a database. -The result of this script is a sqlite database file usable with dfr.py. - -As downloading the dump can be challenging, you can specify an input file that -will be used instead of the latest dump from wikimedia. That is the goal of the -command line option `--input`. - -########################### -⚠️ WARNING: Be aware that a compressed wiktionary dump archive is big, -and a decompressed one is even bigger, so a lot of disk space is -needed to store these files. - -⚠️ WARNING: Be aware that wiktionary dumps are really big and that a lot of memory -is needed to process them. It's NOT recommended to try to create a -database file on a computer with less than 2 GiB of memory, even using the -on-the-fly decompression method. -########################### - -There are only a few command line options that can be used with this script. - - + --output - The name of the final sqlite database. By default it's `dfr.db`. dfr.py - will expect the database to be at the root of the project and to be - named `dfr.db` so that you don't have to modify anything here. - -+ --input - As described earlier, if you have already downloaded the wiktionary dump, - you can set the location of the file that will be used here. - The purpose is to avoid re-downloading the dump each time. - - The "download" and "input" options are incompatible. - -+ --download - Force the download of the file even if the file is already downloaded. - It is used to force an update of the database. - - The "download" and "input" options are incompatible. - -+ --word-list - Not useful for the moment. dfr will have a functionality to list every - word in the dictionary. 
You will be able to filter words based on a regex; - optionally, there will be an option to auto-correct a word, but nothing - is implemented here. - In all cases, you can specify the filename of the file that will store - every word. - -Note: This script has several ways to decompress the wiktionary archive dump. - -First, it will try the `bzip2` command; if that fails, this script will try -to extract the bzip2 file on the fly using the bz2 Python module. - - - The `bzip2` command is very fast but uses a lot of disk space and consumes a lot -of memory (RAM) to quickly decompress the file. - - The `bz2` Python module isn't that fast but uses less memory, and the parsing -is also done on the fly. - -In every case, a lot of memory (RAM) is necessary to process the latest wiktionary dump. - -""" - -# TODO: Add an option to set the URL of the dump, to fetch a file other than the latest or for a language other than fr. - -# TODO: Add an option to choose the extraction method. - -# TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could greatly reduce memory (RAM) usage and possibly allow creation of the database on low-memory computers (less than 2 GiB). 
import argparse
import bz2
import os
import subprocess
import sys
import urllib.error
import urllib.request

from os.path import exists


URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'

# Number of bytes read from the compressed dump per chunk (64 KiB).
_CHUNK_SIZE = 2**16


def unbz2(file):
    """Decompress a bz2 stream lazily and yield its content line by line.

    Args:
        file: An iterable of ``bytes`` chunks holding bz2-compressed data.

    Yields:
        Each line decoded as UTF-8, including its trailing newline. If the
        stream does not end with a newline, the last partial line is yielded
        as well instead of being silently dropped.
    """
    decomp = bz2.BZ2Decompressor()
    pending = b''
    for chunk in file:
        pending += decomp.decompress(chunk)
        if b'\n' not in pending:
            continue
        # Everything before the last newline is a complete line; the tail
        # (possibly empty) waits for the next chunk.
        *lines, pending = pending.split(b'\n')
        for line in lines:
            yield (line + b'\n').decode('utf-8')

    if pending:
        yield pending.decode('utf-8')


def _read_chunks(stream):
    """Return an iterator over successive ``_CHUNK_SIZE`` reads of *stream*."""
    return iter(lambda: stream.read(_CHUNK_SIZE), b'')


def _parse_args(argv=None):
    """Parse the command line (``sys.argv`` when *argv* is None)."""
    parser = argparse.ArgumentParser(description='Download and create the database')
    parser.add_argument('-o', '--output', dest='outputF', action='store',
                        help='the output, the database filename',
                        default='dfr.db')
    parser.add_argument('-i', '--input', dest='dumpF', action='store',
                        help='the input dump file\'s filename',
                        default='')
    parser.add_argument('-l', '--word-list', dest='wordList', action='store',
                        help='the alternative output, filename of the word list',
                        default=None)
    parser.add_argument('-d', '--download', dest='download', action='store_true',
                        help='to download the lastest dump')
    return parser.parse_args(argv)


def main(argv=None):
    """Download (or reuse) a wiktionary dump and build the sqlite database.

    The dump is decompressed with the first method that works: the ``bzip2``
    command, then the ``bz2`` module writing a decompressed file, then
    on-the-fly decompression through ``unbz2``.

    Args:
        argv: Optional list of command line arguments; defaults to
            ``sys.argv[1:]``.

    Exits the process with a non-zero status on invalid options, download
    failure, missing dump file or extraction failure.
    """
    # Project-local modules are imported here so that importing this module
    # (e.g. to reuse ``unbz2``) does not require them.
    import dump2msgp
    import msgPack2sqlite_msgPack

    arg = _parse_args(argv)
    download = True

    if not arg.wordList:
        arg.wordList = arg.outputF + '.wordlist'

    if arg.download and arg.dumpF:
        print('''Incompatible options '-i' and '-d'.''')
        sys.exit(1)
    elif arg.download:
        # Local filename is the last path component of the dump URL.
        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
    elif arg.dumpF:
        download = False

    # The decompressed filename is derived below by stripping the 4-character
    # '.bz2' suffix, so require exactly that suffix.
    if not arg.dumpF or not arg.dumpF.endswith('.bz2'):
        print('A bz2 dump file filename needed', file=sys.stderr)
        sys.exit(-1)

    if exists(arg.dumpF) and download:
        print(f'{arg.dumpF} exists. Force downloading ? (y/N)')
        answer = input('> ')
        # startswith() keeps an empty answer (plain Enter) on the default "No"
        # instead of raising IndexError.
        if not answer.lower().startswith('y'):
            download = False

    if download:
        print(f'Downloading the dump ({arg.dumpF})\nIt should take some time')
        try:
            urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
        except urllib.error.URLError:
            print('Error: Unable to download from internet')
            print(f'Check connection and source URL : ({ URL_DUMP })')
            print('Exiting')
            sys.exit(-10)
        except Exception:
            print('Download failed.')
            print('Exiting')
            sys.exit(-1)

    if not exists(arg.dumpF):
        if download:
            print('Download failed.\nExiting.', file=sys.stderr)
        else:
            print(f'Fichier { arg.dumpF } introuvable.\nArrêt.')
        sys.exit(-2)

    # Name of the decompressed dump: the input name without '.bz2'.
    output_fn = arg.dumpF[:-4]

    # 1st attempt: the external bzip2 command (replaces the .bz2 file with
    # the decompressed one).
    decompress = False
    print('Trying the bzip2 command')
    try:
        # Explicit comparison rather than ``assert``, which is stripped
        # when Python runs with -O.
        decompress = subprocess.call(['bzip2', '-d', arg.dumpF]) == 0
    except OSError:
        # bzip2 is not installed or cannot be executed.
        pass
    if not decompress:
        print('''The command "bzip" doesn't exists, or doesn't work as intended''')
        print('Fallback to Python bz2 module decompressor')

    # 2nd attempt: decompress to a file with the bz2 module.
    if not decompress:
        try:
            with open(arg.dumpF, 'rb') as f, open(output_fn, 'wb') as fout:
                dcomp = bz2.BZ2Decompressor()
                for chunk in _read_chunks(f):
                    fout.write(dcomp.decompress(chunk))
            decompress = True
        except Exception:
            # Deliberate best-effort: any failure moves on to the next method.
            print('''Python bz2 module decompressor failed, maybe you don't have any space available''')
            print('Fallback to on the fly decompressor (RAM will be needed)')

    # NOTE(review): the trailing characters of the "created !" messages below
    # look like mis-encoded emoji; the original glyphs are not recoverable
    # from here, so the strings are kept as found.
    if not decompress:
        # 3rd attempt: decompress and parse on the fly, no intermediate file.
        try:
            with open(arg.dumpF, 'rb') as f:
                print('Data extraction on the fly')
                res = dump2msgp.extractAll(unbz2(_read_chunks(f)), 'error.log', False)
                # NOTE(review): assumes extractAll returns a mapping keyed by
                # word (str) - confirm against dump2msgp.
                with open(arg.wordList, 'w', encoding='utf-8') as word_file:
                    word_file.write('\n'.join(res.keys()))

                msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
                print(f'Word list { arg.wordList } created ! π π')
                print(f'Database { arg.outputF } created ! π π')
        except Exception:
            print('''Error: Can't extract the dump file''')
            print('Exiting (-1)')
            sys.exit(-1)

        print('Removing temporary files')
        os.remove(arg.dumpF)
    else:
        # The dump is decoded as UTF-8, matching what unbz2 does on the fly.
        with open(output_fn, 'r', encoding='utf-8') as f:
            print('Create the database')
            res = dump2msgp.extractAll(f, 'error.log', False)
            msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
            print(f'Database { arg.outputF } created ! π π')

        print('Removing temporary files')
        os.remove(output_fn)


if __name__ == '__main__':
    main()