"""Download the frwiktionary dump and build the dictionary database from it."""
import argparse
import bz2
import os
import shutil
import subprocess
import sys
import urllib.request
from os.path import exists

import dump2msgp
import msgPack2sqlite_msgPack

URL_DUMP = "https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2"

# Size of the chunks read from the compressed dump.
_CHUNK_SIZE = 2**16
_BZ2_SUFFIX = '.bz2'


def unbz2(file):
    """Lazily decompress bz2 chunks and yield the text line by line.

    Args:
        file: an iterable of ``bytes`` chunks forming one bz2 stream.

    Yields:
        Each decompressed line decoded as UTF-8, with its trailing ``'\\n'``
        kept. A final line that has no trailing newline is yielded as well.
    """
    decomp = bz2.BZ2Decompressor()
    buf = b''
    for c in file:
        buf += decomp.decompress(c)
        # Everything before the last '\n' is a complete line; the remainder
        # (possibly empty) waits for the next chunk.
        *lines, buf = buf.split(b'\n')
        for line in lines:
            yield (line + b'\n').decode("utf-8")
    if buf:
        # The stream did not end with a newline: do not lose the last line.
        yield buf.decode("utf-8")


def _parse_args(argv=None):
    """Parse the command line (``sys.argv`` when *argv* is None)."""
    parser = argparse.ArgumentParser(description='Download and create the database')
    parser.add_argument('-o', '--out', dest='outputF', action='store',
                        help='the output, the database file', default='dicofr.db')
    parser.add_argument('-i', '--in', dest='dumpF', action='store',
                        help='the input dump file\'s filename', default='')
    parser.add_argument('-d', '--download', dest='download', action='store_true',
                        help='to download the lastest dump')
    return parser.parse_args(argv)


def _confirm_redownload(path):
    """Ask whether an existing dump must be downloaded again (default: no)."""
    print(f"{path} exists. Force downloading ? \n(y/N)")
    try:
        answer = input('> ')
    except EOFError:
        # No interactive input available: keep the advertised default.
        return False
    return answer.strip().lower().startswith('y')


def _decompress_with_bzip2(path):
    """Run ``bzip2 -d`` on *path*; return True when the command succeeded."""
    try:
        return subprocess.call(['bzip2', '-d', path]) == 0
    except OSError:
        # Raised when the bzip2 executable cannot be started.
        return False


def _decompress_with_python(src, dst):
    """Decompress *src* into *dst* with the bz2 module; return True on success."""
    try:
        with bz2.open(src, 'rb') as fin, open(dst, 'wb') as fout:
            shutil.copyfileobj(fin, fout, _CHUNK_SIZE)
    except (OSError, EOFError) as err:
        print(err, file=sys.stderr)
        # Do not leave a truncated file behind (it may be what filled the disk).
        try:
            os.remove(dst)
        except OSError:
            pass
        return False
    return True


def _build_database(lines, output):
    """Extract the entries from *lines* and write them to the *output* database."""
    res = dump2msgp.extractAll(lines, "error.log", False)
    msgPack2sqlite_msgPack.writeDB(output, res)
    print(f"Database { output } created ! 👏 🎉")


def main(argv=None):
    """Download (if requested), decompress and convert the dump.

    Exit codes: 1 for incompatible options, -1 for a bad dump filename or a
    failed on-the-fly extraction, -2 for a failed download, -3 for a failed
    extraction from the decompressed file.
    """
    arg = _parse_args(argv)

    # Download when asked to, or when no input file was given at all.
    download = arg.download or not arg.dumpF
    if download and arg.dumpF:
        print("Incompatible options '-i' and '-d'.", file=sys.stderr)
        sys.exit(1)
    if download:
        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
    if not arg.dumpF.endswith(_BZ2_SUFFIX):
        print('A bz2 dump file filename needed', file=sys.stderr)
        sys.exit(-1)

    if download and exists(arg.dumpF):
        download = _confirm_redownload(arg.dumpF)
    if download:
        print(f"Downloading the dump ({arg.dumpF})\nIt should take some time")
        try:
            urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
        except OSError as err:  # URLError is a subclass of OSError
            print(err, file=sys.stderr)
            print('Download failed.\nExiting.', file=sys.stderr)
            sys.exit(-2)
    if not exists(arg.dumpF):
        print('Download failed.\nExiting.', file=sys.stderr)
        sys.exit(-2)

    output_fn = arg.dumpF[:-len(_BZ2_SUFFIX)]

    # First choice: the bzip2 command (it replaces the dump by the plain file).
    print("Trying the bzip2 command")
    decompressed = _decompress_with_bzip2(arg.dumpF)
    if not decompressed:
        print("The command “bzip” doesn't exists, or doesn't work as intended")
        print("Fallback to Python bz2 module decompressor")
        decompressed = _decompress_with_python(arg.dumpF, output_fn)
        if not decompressed:
            print("Python bz2 module decompressor failed, maybe you don't have any space available")
            print("Fallback to on the fly decompressor (RAM will be needed)")

    if decompressed:
        try:
            with open(output_fn, 'r', encoding='utf-8') as f:
                print("Create the database")
                _build_database(f, arg.outputF)
        except Exception as err:
            print(err, file=sys.stderr)
            print("Failed to extract database")
            print("Exiting (-3)")
            sys.exit(-3)
        leftover = output_fn
    else:
        try:
            # On the fly decompression: nothing is written to disk.
            with open(arg.dumpF, 'rb') as f:
                chunks = iter(lambda: f.read(_CHUNK_SIZE), b'')
                print("Data extraction on the fly")
                _build_database(unbz2(chunks), arg.outputF)
        except Exception as err:
            print(err, file=sys.stderr)
            print("Error: Can't extract the dump file")
            print("Exiting (-1)")
            sys.exit(-1)
        leftover = arg.dumpF

    print("Removing temporary files")
    os.remove(leftover)


if __name__ == '__main__':
    main()