From 7e1d9e251b517153db8b639133c9e3bee266ce1b Mon Sep 17 00:00:00 2001 From: ache Date: Sun, 3 Oct 2021 02:31:32 +0200 Subject: Rename files --- dfr/createDB.py | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100755 dfr/createDB.py (limited to 'dfr/createDB.py') diff --git a/dfr/createDB.py b/dfr/createDB.py new file mode 100755 index 0000000..acf443d --- /dev/null +++ b/dfr/createDB.py @@ -0,0 +1,224 @@ +#!/bin/env python + +"""dfr - Prepare database + +This script will download the latest wiktionary dump from wikimedia, extract it, +process it and then build the database. +The result of that script is a sqlite database file usable with dfr.py. + +As downloading the dump can be challenging, you can specify an input file that +will be used instead of the latest dump from wikimedia. It's the goal of the +command line option `--input`. + +########################### +❗ WARNING: Beware that a wiktionary compressed archive dump is big. +A decompressed one is bigger still, so a lot of disk space is +needed to store these files. + +❗ WARNING: Beware that wiktionary dumps are really big and that a lot of memory +is needed to process them. It's NOT recommended to try to create a +database file on a computer with less than 2Gio memory, even using the on +the fly decompression method. +########################### + +There are only a few other command line options that can be used with this script. + + + --output + The name of the final sqlite database. By default it's `dfr.db`. dfr.py + will expect the database to be at the root of the project and to be + named `dfr.db` so that you don't have to modify anything here. + ++ --input + As described earlier, if you already have downloaded the wiktionary dump, + you can set the location of the file that will be used here. + The purpose is to not re-download the dump each time. + + The "download" and "input" options are incompatible. 
+ ++ --download + Force the download of the file even if the file is already downloaded. + It is used to force an update of the database. + + The "download" and "input" options are incompatible. + ++ --word-list + Not useful for the moment. dfr will have a functionality to list every + word in the dictionary. You will be able to filter words based on a regex; + optionally, there will be an option to auto-correct a word, but nothing + is implemented here. + In any case, you can specify the name of the file that will store + every word. + +Note: This script has several ways to decompress the wiktionary archive dump. + +First, it will try the `bzip2` command; if it fails, this script will try +to extract the bzip2 file on the fly using the bz2 python module. + + - The `bzip2` command is very fast but uses a lot of disk space and consumes a lot +of memory (RAM) to quickly decompress the file. + - The `bz2` python module isn't that fast but uses less memory and the parsing +is also done on the fly. + +In every case, a lot of memory (RAM) is necessary to process the latest wiktionary dump. + +""" + +# TODO: Add an option to set the URL of the dump. To cache another file than the latest or for another language than fr. + +# TODO: Add an option to choose the extraction method. + +# TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could greatly reduce the memory (RAM) usage and possibly allow creation of the database on low-memory computers (less than 2Gio). 
import argparse
import bz2
import os
import subprocess
import sys
import urllib.error
import urllib.request

URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'

# Size (in bytes) of the binary chunks read from the compressed dump.
_CHUNK_SIZE = 2 ** 16

# Extension every dump file name must carry; it is stripped to obtain the
# name of the decompressed file.
_BZ2_SUFFIX = '.bz2'


def unbz2(file):
    """Decompress a bz2 stream on the fly and yield it line by line.

    Args:
        file: an iterable of ``bytes`` chunks holding bz2-compressed,
            UTF-8 encoded text.

    Yields:
        Each decoded line as ``str``, with its trailing ``'\\n'`` kept. A
        final line that is not newline-terminated is yielded as well.

    Raises:
        OSError: if the data is not a valid bz2 stream.
        EOFError: if data is fed after the end of the bz2 stream.
        UnicodeDecodeError: if a line is not valid UTF-8.
    """
    decomp = bz2.BZ2Decompressor()
    pending = b''
    for chunk in file:
        pending += decomp.decompress(chunk)
        # Everything before the last b'\n' is made of complete lines; the
        # tail (possibly empty) waits for the next chunk. Splitting on the
        # newline byte is safe: it never occurs inside a multi-byte UTF-8
        # sequence.
        *lines, pending = pending.split(b'\n')
        for line in lines:
            yield (line + b'\n').decode('utf-8')

    # Do not lose the last line when the stream has no trailing newline.
    if pending:
        yield pending.decode('utf-8')


def _read_chunks(fobj):
    """Return an iterator over ``_CHUNK_SIZE``-byte chunks of a binary file."""
    return iter(lambda: fobj.read(_CHUNK_SIZE), b'')


def _parse_args(argv=None):
    """Parse the command line (``argv`` defaults to ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser(description='Download and create the database')
    parser.add_argument('-o', '--output', dest='outputF', action='store',
                        help='the output, the database filename',
                        default='dfr.db')
    parser.add_argument('-i', '--input', dest='dumpF', action='store',
                        help='the input dump file\'s filename',
                        default='')
    parser.add_argument('-l', '--word-list', dest='wordList', action='store',
                        help='the alternative output, filename of the word list',
                        default=None)
    parser.add_argument('-d', '--download', dest='download', action='store_true',
                        help='to download the lastest dump')
    return parser.parse_args(argv)


def _confirm_redownload(path):
    """Ask whether an already present dump must be downloaded again.

    Returns True only for an answer starting with 'y'/'Y'. An empty answer
    (the advertised default, "N") or a closed stdin means "no".
    """
    print(f'{path} exists. Force downloading ? (y/N)')
    try:
        answer = input('> ')
    except EOFError:
        return False
    return answer.strip().lower().startswith('y')


def _download_dump(dest):
    """Download ``URL_DUMP`` into ``dest``; exit the process on failure."""
    print(f'Downloading the dump ({dest})\nIt should take some time')
    try:
        urllib.request.urlretrieve(URL_DUMP, dest)
    except urllib.error.URLError as err:
        print('Error: Unable to download from internet')
        print(f'Check connection and source URL : ({ URL_DUMP })')
        print(f'Reason: {err}', file=sys.stderr)
        print('Exiting')
        sys.exit(-10)
    except Exception as err:
        # Script boundary: report any other failure (e.g. disk error) and
        # stop, while still letting Ctrl-C propagate.
        print('Download failed.')
        print(f'Reason: {err}', file=sys.stderr)
        print('Exiting')
        sys.exit(-1)


def _decompress_to_disk(dump_path, output_path):
    """Decompress ``dump_path`` into ``output_path``.

    The external ``bzip2`` command is tried first (on success it removes
    ``dump_path`` itself), then the Python ``bz2`` module.

    Returns:
        True if ``output_path`` now holds the decompressed dump, False if
        both methods failed.
    """
    print('Trying the bzip2 command')
    try:
        # Explicit status check instead of `assert`, which is stripped when
        # Python runs with -O.
        bzip2_ok = subprocess.call(['bzip2', '-d', dump_path]) == 0
    except OSError:
        # bzip2 is not installed or cannot be executed.
        bzip2_ok = False
    if bzip2_ok:
        return True

    print('''The command "bzip" doesn't exists, or doesn't work as intended''')
    print('Fallback to Python bz2 module decompressor')
    try:
        decomp = bz2.BZ2Decompressor()
        with open(dump_path, 'rb') as src, open(output_path, 'wb') as dst:
            for chunk in _read_chunks(src):
                dst.write(decomp.decompress(chunk))
    except Exception:
        # Best effort: any failure here just triggers the next fallback.
        print('''Python bz2 module decompressor failed, maybe you don't have any space available''')
        print('Fallback to on the fly decompressor (RAM will be needed)')
        try:
            # Free the space taken by the partial output.
            os.remove(output_path)
        except OSError:
            pass  # nothing was written, or it cannot be removed
        return False
    return True


def main(argv=None):
    """Download (if needed) a wiktionary dump and build the sqlite database.

    Behaviour of the dump selection:
      * ``-i FILE``: use FILE, never download.
      * ``-d``: download the latest dump, even if it is already present.
      * neither: use the default dump file name; download it if missing,
        and ask before re-downloading if it is already present.

    Exits the process with a non-zero status on error (same codes as the
    historical script: 1, -1, -2, -10).
    """
    # Project-local modules, imported here so that importing this file
    # (e.g. to reuse `unbz2`) does not require them.
    import dump2msgp
    import msgp2sqlite

    arg = _parse_args(argv)

    if not arg.wordList:
        arg.wordList = arg.outputF + '.wordlist'

    if arg.download and arg.dumpF:
        print('''Incompatible options '-i' and '-d'.''')
        sys.exit(1)

    # Without an explicit input file, fall back to the name of the latest
    # dump (last component of the URL).
    download = not arg.dumpF
    if download:
        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]

    if not arg.dumpF.endswith(_BZ2_SUFFIX):
        print('A bz2 dump file filename needed', file=sys.stderr)
        sys.exit(-1)

    # `-d` forces the download; otherwise an existing dump is reused unless
    # the user asks for a fresh one.
    if download and not arg.download and os.path.exists(arg.dumpF):
        download = _confirm_redownload(arg.dumpF)

    if download:
        _download_dump(arg.dumpF)

    if not os.path.exists(arg.dumpF):
        if download:
            print('Download failed.\nExiting.', file=sys.stderr)
        else:
            print(f'Fichier { arg.dumpF } introuvable.\nArrêt.')
        sys.exit(-2)

    output_fn = arg.dumpF[:-len(_BZ2_SUFFIX)]

    if _decompress_to_disk(arg.dumpF, output_fn):
        # The dump is UTF-8 (see `unbz2`); do not depend on the locale.
        with open(output_fn, 'r', encoding='utf-8') as f:
            print('Create the database')
            res = dump2msgp.extractAll(f, 'error.log', False)
            msgp2sqlite.writeDB(arg.outputF, res)
            print(f'Database { arg.outputF } created ! 👍 🎉')

        print('Removing temporary files')
        os.remove(output_fn)
        return

    # Last resort: decompress and parse on the fly, without writing the
    # decompressed dump to disk.
    try:
        with open(arg.dumpF, 'rb') as f:
            print('Data extraction on the fly')
            res = dump2msgp.extractAll(unbz2(_read_chunks(f)), 'error.log', False)
            # NOTE(review): assumes extractAll returns a mapping keyed by
            # word (the original code called `.keys()` on an undefined
            # name) -- confirm against dump2msgp.
            with open(arg.wordList, 'w', encoding='utf-8') as word_file:
                word_file.write('\n'.join(res.keys()))

            msgp2sqlite.writeDB(arg.outputF, res)
            print(f'Word list { arg.wordList } created ! 👍 🎉')
            print(f'Database { arg.outputF } created ! 👍 🎉')
    except Exception as err:
        print('''Error: Can't extract the dump file''')
        print(f'Reason: {err}', file=sys.stderr)
        print('Exiting (-1)')
        sys.exit(-1)

    print('Removing temporary files')
    os.remove(arg.dumpF)


if __name__ == '__main__':
    main()