#!/usr/bin/env python
"""dfr - Prepare database.

This script downloads the latest Wiktionary dump from Wikimedia, decompresses
it, processes it and then stores the result. The result of the script is a
SQLite database file usable with dfr.py.

As downloading the dump can be challenging, you can specify an input file that
will be used instead of the latest dump from Wikimedia. That is the purpose of
the command line option `--input`.

###########################
WARNING: Be aware that a compressed Wiktionary dump archive is big, and that
         the decompressed one is bigger still, so a lot of disk space is
         needed to store these files.

WARNING: Be aware that Wiktionary dumps are really big and that a lot of
         memory is needed to process them. It is NOT recommended to try to
         create a database file on a computer with less than 2 GiB of memory,
         even when using the on-the-fly decompression method.
###########################

There are only a few other command line options that can be used with this
script. One of `--input` or `--download` must be given.

    + --output
        The name of the final SQLite database. By default it is `dfr.db`.
        dfr.py expects the database to be at the root of the project and to be
        named `dfr.db`, so by default you don't have to modify anything here.

    + --input
        As described earlier, if you have already downloaded the Wiktionary
        dump, you can set the location of the file that will be used here.
        The purpose is to not re-download the dump each time.
        The "download" and "input" options are incompatible.

    + --download
        Force the download of the file even if the file is already
        downloaded (a confirmation is asked before an existing file is
        downloaded again). It is used to force an update of the database.
        The "download" and "input" options are incompatible.

    + --word-list
        Not useful for the moment. dfr will have a feature to list every word
        in the dictionary. You will be able to filter words based on a regex;
        optionally, there will be an option to auto-correct a word, but
        nothing is implemented here. In all cases, you can specify the name
        of the file that will store every word. By default it is the output
        filename followed by `.wordlist`.

Note: This script has several ways to decompress the Wiktionary dump archive.
First, it tries the `bzip2` command. If that fails, it decompresses the
archive to disk with the Python `bz2` module. If that fails too, it
decompresses the archive on the fly with the Python `bz2` module.

    - The `bzip2` command is very fast but uses a lot of disk space and
      consumes a lot of memory (RAM) to quickly decompress the file.
    - The `bz2` Python module isn't that fast but uses less memory, and with
      the on-the-fly method the parsing is also done on the fly.

In every case, a lot of memory (RAM) is necessary to process the latest
Wiktionary dump.
"""

# TODO: Add an option to set the URL of the dump, to cache another file than
#       the latest one or a dump for another language than fr.
# TODO: Add an option to choose the extraction method.
# TODO: Optimize the bz2 module process to write the msgpack file on the fly.
#       The goal is to never store a lot of information in memory. This
#       optimization could greatly reduce the memory (RAM) usage and possibly
#       allow creation of the database on low memory computers (less than
#       2 GiB).

import argparse
import bz2
import os
import shutil
import subprocess
import sys
import urllib.error
import urllib.request
from os.path import exists

import dump2msgp
import msgPack2sqlite_msgPack

URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'

# Size, in bytes, of the chunks read from the compressed archive.
CHUNK_SIZE = 2 ** 16

# Suffix an input archive must carry; it is stripped to name the
# decompressed file.
BZ2_SUFFIX = '.bz2'


def unbz2(file):
    """Decompress a bz2 stream on the fly and yield it line by line.

    Args:
        file: An iterable of ``bytes`` chunks holding bz2-compressed data.

    Yields:
        The decompressed content decoded as UTF-8, one line at a time, each
        line keeping its trailing ``'\\n'``. If the stream does not end with
        a newline, the last (unterminated) line is yielded as well.
    """
    decomp = bz2.BZ2Decompressor()
    pending = b''
    for chunk in file:
        # Everything before the last newline is made of complete lines; what
        # follows it is kept until the next chunk completes it. When there is
        # no newline at all, rpartition returns the whole buffer as `pending`.
        head, sep, pending = (pending + decomp.decompress(chunk)).rpartition(b'\n')
        if sep:
            for raw_line in head.split(b'\n'):
                yield raw_line.decode('utf-8') + '\n'
    if pending:
        # Final line without a trailing newline.
        yield pending.decode('utf-8')


def _parse_args(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser(description='Download and create the database')
    parser.add_argument('-o', '--output', dest='outputF', action='store',
                        help='the output, the database filename', default='dfr.db')
    parser.add_argument('-i', '--input', dest='dumpF', action='store',
                        help='the input dump file\'s filename', default='')
    parser.add_argument('-l', '--word-list', dest='wordList', action='store',
                        help='the alternative output, filename of the word list',
                        default=None)
    parser.add_argument('-d', '--download', dest='download', action='store_true',
                        help='to download the lastest dump')
    return parser.parse_args(argv)


def _ask_force_download(path):
    """Ask whether an existing *path* must be downloaded again.

    Returns True only when the answer starts with 'y' or 'Y'. An empty answer
    or a closed standard input counts as the default "no".
    """
    print(f'{path} exists. Force downloading ? (y/N)')
    try:
        answer = input('> ')
    except EOFError:
        return False
    return answer.strip().lower().startswith('y')


def _download_dump(target):
    """Download URL_DUMP into *target*; return 0 on success, an exit code otherwise."""
    print(f'Downloading the dump ({target})\nIt should take some time')
    try:
        urllib.request.urlretrieve(URL_DUMP, target)
    except urllib.error.URLError:
        print('Error: Unable to download from internet', file=sys.stderr)
        print(f'Check connection and source URL : ({ URL_DUMP })', file=sys.stderr)
        print('Exiting', file=sys.stderr)
        return -10
    except Exception:
        # Top-level boundary: any other failure aborts the run, but
        # KeyboardInterrupt and SystemExit are left to propagate.
        print('Download failed.', file=sys.stderr)
        print('Exiting', file=sys.stderr)
        return -1
    return 0


def _decompress_with_bzip2(archive):
    """Decompress *archive* in place with the ``bzip2`` command.

    ``bzip2 -d`` replaces the archive with the decompressed file (same name
    without the ``.bz2`` suffix) when it succeeds.

    Returns True on success, False when the command is missing or fails.
    """
    print('Trying the bzip2 command')
    try:
        status = subprocess.call(['bzip2', '-d', archive])
    except OSError:
        # The command is not installed or cannot be executed.
        status = None
    if status == 0:
        return True
    print('''The command "bzip2" doesn't exists, or doesn't work as intended''')
    print('Fallback to Python bz2 module decompressor')
    return False


def _decompress_with_module(archive, output_fn):
    """Decompress *archive* into *output_fn* with the Python ``bz2`` module.

    The archive itself is left untouched. A partially written output file is
    removed on failure so that it does not keep using disk space.

    Returns True on success, False otherwise.
    """
    try:
        with bz2.open(archive, 'rb') as fin, open(output_fn, 'wb') as fout:
            shutil.copyfileobj(fin, fout, CHUNK_SIZE)
    except (OSError, EOFError):
        try:
            os.remove(output_fn)
        except OSError:
            # Best effort: the output may never have been created.
            pass
        print('''Python bz2 module decompressor failed, maybe you don't have any space available''')
        print('Fallback to on the fly decompressor (RAM will be needed)')
        return False
    return True


def _build_on_the_fly(arg):
    """Build the word list and the database while decompressing the archive.

    Returns 0 on success and -1 when the extraction fails.
    """
    try:
        with open(arg.dumpF, 'rb') as f:
            chunks = iter(lambda: f.read(CHUNK_SIZE), b'')
            print('Data extraction on the fly')
            res = dump2msgp.extractAll(unbz2(chunks), 'error.log', False)
            # NOTE(review): assumes extractAll returns a mapping whose keys
            # are the words -- confirm against dump2msgp.
            with open(arg.wordList, 'w', encoding='utf-8') as word_file:
                word_file.write('\n'.join(res.keys()))
            msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
        print(f'Word list { arg.wordList } created ! 👏 🎉')
        print(f'Database { arg.outputF } created ! 👏 🎉')
    except Exception as err:
        # Top-level boundary: report the cause instead of hiding it.
        print('''Error: Can't extract the dump file''', file=sys.stderr)
        print(f'{type(err).__name__}: {err}', file=sys.stderr)
        print('Exiting (-1)', file=sys.stderr)
        return -1
    print('Removing temporary files')
    # NOTE(review): this deletes the compressed archive itself, as the
    # original script did; the dump has to be downloaded again afterwards.
    os.remove(arg.dumpF)
    return 0


def _build_from_file(arg, output_fn):
    """Build the database from the decompressed dump *output_fn*, then delete it."""
    # The dump is UTF-8 XML; do not depend on the platform default encoding.
    with open(output_fn, 'r', encoding='utf-8') as f:
        print('Create the database')
        res = dump2msgp.extractAll(f, 'error.log', False)
        msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
    print(f'Database { arg.outputF } created ! 👏 🎉')
    print('Removing temporary files')
    os.remove(output_fn)
    return 0


def main(argv=None):
    """Run the script and return its exit code.

    Args:
        argv: Command line arguments; ``sys.argv[1:]`` when None.

    Returns:
        0 on success, 1 for incompatible options, -1 for a missing or invalid
        dump filename or a failed download/extraction, -10 for a network
        error, -2 when the dump file cannot be found.
    """
    arg = _parse_args(argv)
    if not arg.wordList:
        arg.wordList = arg.outputF + '.wordlist'

    download = True
    if arg.download and arg.dumpF:
        print('''Incompatible options '-i' and '-d'.''', file=sys.stderr)
        return 1
    if arg.download:
        # Store the dump under the last path component of the URL.
        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
    elif arg.dumpF:
        download = False

    # Neither -i nor -d leaves dumpF empty, which is rejected here.
    if not arg.dumpF or not arg.dumpF.endswith(BZ2_SUFFIX):
        print('A bz2 dump file filename needed', file=sys.stderr)
        return -1

    if download and exists(arg.dumpF) and not _ask_force_download(arg.dumpF):
        download = False

    if download:
        status = _download_dump(arg.dumpF)
        if status != 0:
            return status

    if not exists(arg.dumpF):
        if download:
            print('Download failed.\nExiting.', file=sys.stderr)
        else:
            print(f'Fichier { arg.dumpF } introuvable.\nArrêt.', file=sys.stderr)
        return -2

    output_fn = arg.dumpF[:-len(BZ2_SUFFIX)]
    # Try the fastest method first, then the ones needing fewer resources.
    decompressed = (_decompress_with_bzip2(arg.dumpF)
                    or _decompress_with_module(arg.dumpF, output_fn))
    if not decompressed:
        return _build_on_the_fly(arg)
    return _build_from_file(arg, output_fn)


if __name__ == '__main__':
    sys.exit(main())