about | summary | refs | log | tree | commit | diff
path: root/download/download.py
diff options
context:
space:
mode:
Diffstat (limited to 'download/download.py')
-rwxr-xr-x download/download.py 130
1 files changed, 130 insertions, 0 deletions
diff --git a/download/download.py b/download/download.py
new file mode 100755
index 0000000..a941b90
--- /dev/null
+++ b/download/download.py
@@ -0,0 +1,130 @@
+import argparse
+import bz2
+import os
+import subprocess
+import sys
+import urllib.request
+from os.path import exists
+
+import dump2msgp
+import msgPack2sqlite_msgPack
+
+
+# URL of the latest French Wiktionary "pages-meta-current" XML dump (bz2).
+# Its last path component is reused as the local dump filename when the
+# script downloads the dump itself.
+URL_DUMP = "https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2"
+
+
+def unbz2(file):
+    """Lazily decompress a bz2 stream and yield its content line by line.
+
+    Args:
+        file: an iterable of ``bytes`` chunks of bz2-compressed data
+            (for instance ``iter(lambda: f.read(2**16), b'')``).
+
+    Yields:
+        The decompressed text, decoded as UTF-8, one line at a time.  Each
+        line keeps its trailing ``'\n'``; only the very last line may lack
+        one, when the stream does not end with a newline.
+    """
+    decomp = bz2.BZ2Decompressor()
+    buf = b''
+    for chunk in file:
+        buf += decomp.decompress(chunk)
+        # Split once per chunk: everything before the last newline is made of
+        # complete lines, the remainder is kept for the next chunk.  Cutting
+        # on b'\n' is safe for UTF-8 because that byte never appears inside a
+        # multi-byte sequence.
+        *lines, buf = buf.split(b'\n')
+        for line in lines:
+            yield (line + b'\n').decode("utf-8")
+
+    # Flush the final line when the stream does not end with a newline.
+    if buf:
+        yield buf.decode("utf-8")
+
+
+def _parse_args():
+    """Build the command-line parser and return the parsed arguments."""
+    parser = argparse.ArgumentParser(description='Download and create the database')
+    parser.add_argument('-o', '--out', dest='outputF', action='store',
+                        help='the output, the database file',
+                        default='dicofr.db')
+    parser.add_argument('-i', '--in', dest='dumpF', action='store',
+                        help='the input dump file\'s filename',
+                        default='')
+    parser.add_argument('-d', '--download', dest='download', action='store_true',
+                        help='to download the lastest dump')
+    return parser.parse_args()
+
+
+def _decompress_with_bzip2(dump_fn):
+    """Run ``bzip2 -d`` on *dump_fn*; return True when the command succeeded."""
+    print("Trying the bzip2 command")
+    try:
+        return subprocess.call(['bzip2', '-d', dump_fn]) == 0
+    except OSError:
+        # bzip2 is not installed or cannot be executed.
+        return False
+
+
+def _decompress_with_python(dump_fn, output_fn):
+    """Decompress *dump_fn* into *output_fn* with the bz2 module.
+
+    Returns True on success.  On failure the partially written output file
+    is removed and False is returned.
+    """
+    try:
+        decomp = bz2.BZ2Decompressor()
+        with open(dump_fn, 'rb') as fin, open(output_fn, 'wb') as fout:
+            for chunk in iter(lambda: fin.read(2**16), b''):
+                fout.write(decomp.decompress(chunk))
+    except (OSError, EOFError):
+        # OSError covers I/O problems (e.g. disk full) and invalid bz2 data;
+        # EOFError is raised when data follows the end of the bz2 stream.
+        try:
+            os.remove(output_fn)
+        except OSError:
+            pass  # nothing was written, or it cannot be removed: best effort
+        return False
+    return True
+
+
+def _build_database(lines, db_fn):
+    """Extract the entries from *lines* and write them to the *db_fn* database."""
+    res = dump2msgp.extractAll(lines, "error.log", False)
+    msgPack2sqlite_msgPack.writeDB(db_fn, res)
+    print(f"Database { db_fn } created ! 👏 🎉")
+
+
+def main():
+    """Download (or reuse) a bz2 dump, decompress it and build the database.
+
+    Exit codes: 1 for incompatible options, -1 for a missing/invalid dump
+    filename or a failed on-the-fly extraction, -2 for a failed download,
+    -3 for a failed extraction from the decompressed file.
+    """
+    arg = _parse_args()
+
+    # Honour the '-d' flag; a hard-coded True made '-i' unusable.
+    download = arg.download
+
+    if download and arg.dumpF:
+        print("Incompatible options '-i' and '-d'.")
+        sys.exit(1)
+    if download:
+        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
+
+    # The extension is stripped below, so require the full '.bz2' suffix.
+    if not arg.dumpF or not arg.dumpF.endswith('.bz2'):
+        print('A bz2 dump file filename needed', file=sys.stderr)
+        sys.exit(-1)
+
+    if download and exists(arg.dumpF):
+        print(f"{arg.dumpF} exists. Force downloading ? (y/N)")
+        answer = input('> ')
+        # An empty answer means the default, "N".
+        if not answer.strip().lower().startswith('y'):
+            download = False
+
+    if download:
+        print(f"Downloading the dump ({arg.dumpF})\nIt should take some time")
+        try:
+            urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
+        except OSError as err:  # URLError and ContentTooShortError included
+            print(err, file=sys.stderr)
+            print('Download failed.\nExiting.', file=sys.stderr)
+            sys.exit(-2)
+
+    if not exists(arg.dumpF):
+        print('Download failed.\nExiting.', file=sys.stderr)
+        sys.exit(-2)
+
+    # Name of the decompressed dump: the dump filename without '.bz2'.
+    output_fn = arg.dumpF[:-len('.bz2')]
+
+    decompressed = _decompress_with_bzip2(arg.dumpF)
+    if not decompressed:
+        print("The command “bzip” doesn't exists, or doesn't work as intended")
+        print("Fallback to Python bz2 module decompressor")
+        decompressed = _decompress_with_python(arg.dumpF, output_fn)
+        if not decompressed:
+            print("Python bz2 module decompressor failed, maybe you don't have any space available")
+            print("Fallback to on the fly decompressor (RAM will be needed)")
+
+    if decompressed:
+        try:
+            # unbz2() decodes the dump as UTF-8; read the file the same way.
+            with open(output_fn, 'r', encoding='utf-8') as f:
+                print("Create the database")
+                _build_database(f, arg.outputF)
+        except Exception as err:  # top-level boundary: report, then exit
+            print(err, file=sys.stderr)
+            print("Failed to extract database")
+            print("Exiting (-3)")
+            sys.exit(-3)
+        temporary_fn = output_fn
+    else:
+        try:
+            # On the fly decompression
+            with open(arg.dumpF, 'rb') as f:
+                print("Data extraction on the fly")
+                chunks = iter(lambda: f.read(2**16), b'')
+                _build_database(unbz2(chunks), arg.outputF)
+        except Exception as err:  # top-level boundary: report, then exit
+            print(err, file=sys.stderr)
+            print("Error: Can't extract the dump file")
+            print("Exiting (-1)")
+            sys.exit(-1)
+        temporary_fn = arg.dumpF
+
+    print("Removing temporary files")
+    os.remove(temporary_fn)
+
+
+if __name__ == '__main__':
+    main()