#!/bin/env python
"""Download the French Wiktionary dump and build the dictionary database.

The script takes a bz2-compressed dump (given with ``-i`` or downloaded with
``-d``), decompresses it, passes the text to ``dump2msgp.extractAll`` and
stores the result with ``msgPack2sqlite_msgPack.writeDB``.

Decompression is attempted three ways, in order:

1. the external ``bzip2 -d`` command,
2. the Python ``bz2`` module, writing the decompressed file to disk,
3. on-the-fly decompression straight into the extractor (no temporary file).
"""
import argparse
import bz2
import os
import shutil
import subprocess
import sys
import urllib.error
import urllib.request
from os.path import exists

import dump2msgp
import msgPack2sqlite_msgPack

URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'

# Size of the blocks read from the compressed dump.
_CHUNK_SIZE = 2**16
_BZ2_SUFFIX = '.bz2'


def unbz2(file):
    """Yield decoded text lines from an iterable of bz2-compressed chunks.

    Args:
        file: iterable of ``bytes`` objects that, concatenated, form one or
            more bz2 streams.

    Yields:
        Each line of the decompressed data as ``str`` (UTF-8 decoded), with
        its trailing ``'\\n'`` kept.  A final line that has no terminating
        newline is yielded as well.
    """
    decomp = bz2.BZ2Decompressor()
    pending = b''
    for chunk in file:
        while chunk:
            if decomp.eof:
                # A BZ2Decompressor handles exactly one stream; bytes that
                # follow the end-of-stream marker belong to a new stream.
                decomp = bz2.BZ2Decompressor()
            pending += decomp.decompress(chunk)
            # unused_data is b'' until the end of the stream is reached.
            chunk = decomp.unused_data
        # Splitting on b'\n' never cuts a UTF-8 sequence, so every complete
        # line can be decoded independently; the last piece is incomplete.
        *lines, pending = pending.split(b'\n')
        for line in lines:
            yield (line + b'\n').decode('utf-8')
    if pending:
        yield pending.decode('utf-8')


def _parse_args():
    """Parse the command line and fill in the default word-list filename."""
    parser = argparse.ArgumentParser(description='Download and create the database')
    parser.add_argument('-o', '--output', dest='outputF', action='store',
                        help='the output, the database filename',
                        default='dfr.db')
    parser.add_argument('-i', '--input', dest='dumpF', action='store',
                        help='the input dump file\'s filename', default='')
    parser.add_argument('-l', '--word-list', dest='wordList', action='store',
                        help='the alternative output, filename of the word list',
                        default=None)
    parser.add_argument('-d', '--download', dest='download',
                        action='store_true',
                        help='to download the lastest dump')
    arg = parser.parse_args()
    if not arg.wordList:
        arg.wordList = arg.outputF + '.wordlist'
    return arg


def _resolve_dump(arg):
    """Validate the input options and decide whether a download is needed.

    Sets ``arg.dumpF`` to the dump's basename when ``-d`` is used.  Exits with
    status 1 when ``-i`` and ``-d`` are combined and with -1 when no ``.bz2``
    filename is available.

    Returns:
        True when the dump must be downloaded, False otherwise.
    """
    if arg.download and arg.dumpF:
        print('''Incompatible options '-i' and '-d'.''')
        sys.exit(1)
    if arg.download:
        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
    # The decompressed filename is obtained by stripping this exact suffix.
    if not arg.dumpF or not arg.dumpF.endswith(_BZ2_SUFFIX):
        print('A bz2 dump file filename needed', file=sys.stderr)
        sys.exit(-1)

    download = arg.download
    if download and exists(arg.dumpF):
        print(f'{arg.dumpF} exists. Force downloading ? (y/N)')
        try:
            answer = input('> ')
        except EOFError:
            answer = ''
        # An empty answer means "No", as advertised by the prompt.
        download = answer.strip().lower().startswith('y')
    return download


def _download_dump(dest):
    """Download URL_DUMP into *dest*; exit with -10 or -1 on failure."""
    print(f'Downloading the dump ({dest})\nIt should take some time')
    try:
        urllib.request.urlretrieve(URL_DUMP, dest)
    except urllib.error.URLError as err:
        print('Error: Unable to download from internet')
        print(f'Check connection and source URL : ({ URL_DUMP })')
        print(f'Cause: {err!r}', file=sys.stderr)
        print('Exiting')
        sys.exit(-10)
    except Exception as err:  # top-level boundary: report and stop
        print('Download failed.')
        print(f'Cause: {err!r}', file=sys.stderr)
        print('Exiting')
        sys.exit(-1)


def _decompress_with_bzip2(dump_path):
    """Run ``bzip2 -d`` on *dump_path*; return True when it succeeded.

    On success bzip2 replaces the compressed file by the decompressed one.
    """
    print('Trying the bzip2 command')
    try:
        return subprocess.call(['bzip2', '-d', dump_path]) == 0
    except OSError:
        # The command is missing or cannot be executed.
        return False


def _decompress_with_python(dump_path, output_path):
    """Decompress *dump_path* into *output_path* with the ``bz2`` module.

    Returns True on success.  On failure the partially written output file is
    removed so that it does not keep using disk space.
    """
    try:
        with bz2.open(dump_path, 'rb') as src, open(output_path, 'wb') as dst:
            shutil.copyfileobj(src, dst, _CHUNK_SIZE)
    except (OSError, EOFError) as err:
        print(f'Cause: {err!r}', file=sys.stderr)
        try:
            os.remove(output_path)
        except OSError:
            pass
        return False
    return True


def _build_from_file(arg, xml_path):
    """Build the database from the decompressed dump, then delete that file.

    Exits with status -3 when the extraction or the database write fails.
    """
    # NOTE(review): the word list (-l) is only written by the on-the-fly
    # path, as in the original script -- confirm whether that is intended.
    try:
        with open(xml_path, 'r', encoding='utf-8') as f:
            print('Create the database')
            res = dump2msgp.extractAll(f, 'error.log', False)
            msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
    except Exception as err:  # top-level boundary: report and stop
        print('Failed to extract database')
        print(f'Cause: {err!r}', file=sys.stderr)
        print('Exiting (-3)')
        sys.exit(-3)
    else:
        print(f'Database { arg.outputF } created ! 👍 🎉')
    print('Removing temporary files')
    os.remove(xml_path)


def _build_on_the_fly(arg):
    """Build the word list and database while decompressing the dump.

    Nothing is written to disk for the decompressed data.  Exits with status
    -1 on failure; on success the compressed dump is deleted.
    """
    try:
        with open(arg.dumpF, 'rb') as f:
            chunks = iter(lambda: f.read(_CHUNK_SIZE), b'')
            print('Data extraction on the fly')
            res = dump2msgp.extractAll(unbz2(chunks), 'error.log', False)
            # Assumes extractAll returns a mapping keyed by word -- TODO
            # confirm against dump2msgp.
            with open(arg.wordList, 'w', encoding='utf-8') as word_list:
                word_list.write('\n'.join(res.keys()))
            msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
    except Exception as err:  # top-level boundary: report and stop
        print('''Error: Can't extract the dump file''')
        print(f'Cause: {err!r}', file=sys.stderr)
        print('Exiting (-1)')
        sys.exit(-1)
    else:
        print(f'Word list { arg.wordList } created ! 👍 🎉')
        print(f'Database { arg.outputF } created ! 👍 🎉')
    print('Removing temporary files')
    os.remove(arg.dumpF)


def main():
    """Entry point: obtain the dump, decompress it and create the database."""
    arg = _parse_args()
    download = _resolve_dump(arg)
    if download:
        _download_dump(arg.dumpF)

    if not exists(arg.dumpF):
        if download:
            print('Download failed.\nExiting.', file=sys.stderr)
        else:
            print(f'Fichier { arg.dumpF } introuvable.\nArrêt.')
        sys.exit(-2)

    output_fn = arg.dumpF[:-len(_BZ2_SUFFIX)]

    decompressed = _decompress_with_bzip2(arg.dumpF)
    if not decompressed:
        print('''The command "bzip" doesn't exists, or doesn't work as intended''')
        print('Fallback to Python bz2 module decompressor')
        decompressed = _decompress_with_python(arg.dumpF, output_fn)
        if not decompressed:
            print('''Python bz2 module decompressor failed, maybe you don't have any space available''')
            print('Fallback to on the fly decompressor (RAM will be needed)')

    if decompressed:
        _build_from_file(arg, output_fn)
    else:
        _build_on_the_fly(arg)


if __name__ == '__main__':
    main()