diff options
Diffstat (limited to 'download/download.py')
-rwxr-xr-x | download/download.py | 130 |
1 files changed, 130 insertions, 0 deletions
"""Download a frwiktionary dump and build the dictionary database from it.

The script obtains a ``.bz2`` dump (downloaded from ``URL_DUMP`` with ``-d``
or supplied with ``-i``), decompresses it, and feeds the resulting text to
``dump2msgp.extractAll`` / ``msgPack2sqlite_msgPack.writeDB``.

Decompression is attempted in three ways, from cheapest to most expensive:

1. the external ``bzip2`` command,
2. Python's ``bz2`` module writing a decompressed copy to disk,
3. on-the-fly decompression streamed straight into the extractor.
"""
import argparse
import bz2
import os
import shutil
import subprocess
import sys
import urllib.request
from os.path import exists

import dump2msgp
import msgPack2sqlite_msgPack


URL_DUMP = "https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2"

# Number of bytes read from disk per iteration when streaming a dump.
_CHUNK_SIZE = 2**16


def unbz2(file):
    """Yield UTF-8 decoded lines from an iterable of bz2-compressed chunks.

    Args:
        file: iterable of ``bytes`` chunks that together form one bz2 stream.

    Yields:
        Each line as ``str``, including its trailing ``'\\n'``. A final line
        that is not newline-terminated is yielded as well (without ``'\\n'``).
    """
    decomp = bz2.BZ2Decompressor()
    pending = b''
    for chunk in file:
        pending += decomp.decompress(chunk)
        # Everything before the last b'\n' is a sequence of complete lines;
        # the tail (possibly empty) waits for the next chunk. Splitting on
        # the newline byte is safe for UTF-8: 0x0A never occurs inside a
        # multi-byte sequence, so each complete line decodes on its own.
        *lines, pending = pending.split(b'\n')
        for line in lines:
            yield (line + b'\n').decode("utf-8")

    # Do not lose the last line when the data does not end with a newline.
    if pending:
        yield pending.decode("utf-8")


def _read_chunks(stream):
    """Return an iterator over ``_CHUNK_SIZE``-byte chunks of a binary file."""
    return iter(lambda: stream.read(_CHUNK_SIZE), b'')


def _decompress_with_command(path):
    """Decompress *path* with the external ``bzip2`` command.

    Returns:
        True when the command ran and exited with status 0, False when it is
        missing, not executable, or reported a failure.
    """
    try:
        return subprocess.call(['bzip2', '-d', path]) == 0
    except OSError:
        # Raised when the executable cannot be found or started.
        return False


def _decompress_with_python(src, dst):
    """Decompress the bz2 file *src* into *dst* using the ``bz2`` module.

    Returns:
        True on success, False when reading, decompressing or writing failed
        (the partial output file is removed in that case).
    """
    try:
        with bz2.open(src, 'rb') as fin, open(dst, 'wb') as fout:
            shutil.copyfileobj(fin, fout, _CHUNK_SIZE)
    except (OSError, EOFError) as err:
        # OSError: I/O problem or invalid data; EOFError: truncated archive.
        print(f"Decompression error: {err}", file=sys.stderr)
        try:
            os.remove(dst)
        except OSError:
            pass
        return False
    return True


def _build_database(lines, db_path):
    """Extract the dump content from *lines* and write it to *db_path*.

    Args:
        lines: iterable of text lines of the decompressed dump.
        db_path: filename of the database to create.
    """
    # NOTE(review): "error.log" and False are passed positionally to
    # extractAll; their meaning is defined in dump2msgp — confirm there.
    res = dump2msgp.extractAll(lines, "error.log", False)
    msgPack2sqlite_msgPack.writeDB(db_path, res)
    print(f"Database {arg_repr(db_path)} created ! 👏 🎉")


def arg_repr(value):
    """Return *value* formatted exactly as an f-string placeholder would."""
    return format(value)


def main():
    """Parse the command line, fetch/decompress the dump, build the database.

    Exit codes:
        1   ``-i`` and ``-d`` were both given.
        -1  no usable ``.bz2`` filename, or on-the-fly extraction failed.
        -2  the dump file is not available (download failed or file missing).
        -3  building the database from the decompressed dump failed.
    """
    parser = argparse.ArgumentParser(description='Download and create the database')
    parser.add_argument('-o', '--out', dest='outputF', action='store',
                        help='the output, the database file',
                        default='dicofr.db')
    parser.add_argument('-i', '--in', dest='dumpF', action='store',
                        help='the input dump file\'s filename',
                        default='')
    parser.add_argument('-d', '--download', dest='download', action='store_true',
                        help='to download the lastest dump')

    arg = parser.parse_args()

    # Honour the command-line flag instead of always downloading.
    download = arg.download

    if download and arg.dumpF:
        print("Incompatible options '-i' and '-d'.")
        sys.exit(1)
    elif download:
        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]

    # The decompressed filename is derived by stripping the 4-character
    # ".bz2" suffix, so require exactly that suffix.
    if not arg.dumpF or not arg.dumpF.endswith('.bz2'):
        print('A bz2 dump file filename needed', file=sys.stderr)
        sys.exit(-1)

    output_fn = arg.dumpF[:-4]

    if exists(arg.dumpF) and download:
        print(f"{arg.dumpF} exists. Force downloading ? (y/N)")
        answer = input('> ')
        # An empty answer means the default, "No".
        if not answer.strip().lower().startswith('y'):
            download = False

    if download:
        print(f"Downloading the dump ({arg.dumpF})\nIt should take some time")
        try:
            urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
        except OSError as err:
            # urllib's URLError/HTTPError/ContentTooShortError are OSErrors.
            print(f'Download failed ({err}).\nExiting.', file=sys.stderr)
            sys.exit(-2)

    if not exists(arg.dumpF):
        reason = 'Download failed.' if download else f'{arg.dumpF} not found.'
        print(f'{reason}\nExiting.', file=sys.stderr)
        sys.exit(-2)

    print("Trying the bzip2 command")
    # On success `bzip2 -d` leaves the decompressed file at output_fn
    # (and normally removes the archive itself).
    decompress = _decompress_with_command(arg.dumpF)

    if not decompress:
        print("The command “bzip2” doesn't exist, or doesn't work as intended")
        print("Fallback to Python bz2 module decompressor")
        decompress = _decompress_with_python(arg.dumpF, output_fn)
        if not decompress:
            print("Python bz2 module decompressor failed, maybe you don't have any space available")
            print("Fallback to on the fly decompressor (RAM will be needed)")

    if not decompress:
        # No decompressed copy on disk: stream the archive into the extractor.
        try:
            with open(arg.dumpF, 'rb') as f:
                print("Data extraction on the fly")
                _build_database(unbz2(_read_chunks(f)), arg.outputF)
        except Exception as err:
            print(f"Reason: {err}", file=sys.stderr)
            print("Error: Can't extract the dump file")
            print("Exiting (-1)")
            sys.exit(-1)
        leftover = arg.dumpF
    else:
        try:
            # The dump is UTF-8 (see unbz2); do not depend on the locale.
            with open(output_fn, 'r', encoding='utf-8') as f:
                print("Create the database")
                _build_database(f, arg.outputF)
        except Exception as err:
            print(f"Reason: {err}", file=sys.stderr)
            print("Failed to extract database")
            print("Exiting (-3)")
            sys.exit(-3)
        leftover = output_fn

    print("Removing temporary files")
    os.remove(leftover)


if __name__ == '__main__':
    main()