about | summary | refs | log | tree | commit | diff
path: root/download/download.py
diff options
context:
space:
mode:
Diffstat (limited to 'download/download.py')
-rwxr-xr-x download/download.py 67
1 files changed, 36 insertions, 31 deletions
diff --git a/download/download.py b/download/download.py
index 04c337c..25fae89 100755
--- a/download/download.py
+++ b/download/download.py
@@ -1,3 +1,5 @@
+#!/bin/env python
+
import argparse
import sys
import urllib.request
@@ -9,7 +11,7 @@ import os
from os.path import exists
-URL_DUMP = "https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2"
+URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'
def unbz2(file):
@@ -23,17 +25,17 @@ def unbz2(file):
if i + 1 < len(buf):
ret = buf[:i + 1]
buf = buf[i + 1:]
- yield ret.decode("utf-8")
+ yield ret.decode('utf-8')
else:
- yield buf.decode("utf-8")
+ yield buf.decode('utf-8')
buf = b''
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Download and create the database')
parser.add_argument('-o', '--out', dest='outputF', action='store',
- help='the output, the database file',
- default='dicofr.db')
+ help='the output, the database filename',
+ default='dfr.db')
parser.add_argument('-i', '--in', dest='dumpF', action='store',
help='the input dump file\'s filename',
default='')
@@ -45,7 +47,7 @@ if __name__ == '__main__':
download = True
if download and arg.dumpF:
- print("Incompatible options '-i' and '-d'.")
+ print('''Incompatible options '-i' and '-d'.''')
exit(1)
elif download:
arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
@@ -55,26 +57,25 @@ if __name__ == '__main__':
exit(-1)
if exists(arg.dumpF) and download:
- print(f"{arg.dumpF} exists. Force downloading ? (y/N)")
+ print(f'{arg.dumpF} exists. Force downloading ? (y/N)')
answer = input('> ')
if answer.lower()[0] != 'y':
download = False
if download:
- print(f"Downloading the dump ({arg.dumpF})\nIt should take some time")
+ print(f'Downloading the dump ({arg.dumpF})\nIt should take some time')
try:
urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
except urllib.error.URLError:
- print("Error: Unable to download from internet")
- print(f"Check connection and source URL : ({ URL_DUMP })")
- print("Exiting")
+ print('Error: Unable to download from internet')
+ print(f'Check connection and source URL : ({ URL_DUMP })')
+ print('Exiting')
exit(-10)
except:
- print("Download failed.")
- print("Exiting")
+ print('Download failed.')
+ print('Exiting')
exit(-1)
-
if not exists(arg.dumpF):
print('Download failed.\nExiting.', file=sys.stderr)
exit(-2)
@@ -82,12 +83,12 @@ if __name__ == '__main__':
decompress = False
try:
- print("Trying the bzip2 command")
+ print('Trying the bzip2 command')
assert(subprocess.call(['bzip2', '-d', arg.dumpF]) == 0)
decompress = True
except:
- print("The command “bzip” doesn't exists, or doesn't work as intended")
- print("Fallback to Python bz2 module decompressor")
+ print('''The command "bzip" doesn't exists, or doesn't work as intended''')
+ print('Fallback to Python bz2 module decompressor')
# Decompression using bzip2
if not decompress:
@@ -106,37 +107,41 @@ if __name__ == '__main__':
fout.write(data)
decompress = True
except:
- print("Python bz2 module decompressor failed, maybe you don't have any space available")
- print("Fallback to on the fly decompressor (RAM will be needed)")
+ print('''Python bz2 module decompressor failed, maybe you don't have any space available''')
+ print('Fallback to on the fly decompressor (RAM will be needed)')
if not decompress:
try:
# On the fly Decompression
with open(arg.dumpF, 'rb') as f:
it = iter(lambda: f.read(2**16), b'')
- print("Data extraction on the fly")
- res = dump2msgp.extractAll(unbz2(it), "error.log", False)
+ print('Data extraction on the fly')
+ res = dump2msgp.extractAll(unbz2(it), 'error.log', False)
+ with open(arg.wordList, 'wb'):
+ f.write('\n'.join(a.keys()))
+
msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
- print(f"Database { arg.outputF } created ! 👏 🎉")
+ print(f'Word list { arg.wordList } created ! 👏 🎉')
+ print(f'Database { arg.outputF } created ! 👏 🎉')
except:
- print("Error: Can't extract the dump file")
- print("Exiting (-1)")
+ print('''Error: Can't extract the dump file''')
+ print('Exiting (-1)')
exit(-1)
- print(f"Removing temporary files")
+ print(f'Removing temporary files')
os.remove(arg.dumpF)
else:
try:
output_fn = arg.dumpF[:-4]
with open(output_fn, 'r') as f:
- print("Create the database")
- res = dump2msgp.extractAll(f, "error.log", False)
+ print('Create the database')
+ res = dump2msgp.extractAll(f, 'error.log', False)
msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
- print(f"Database { arg.outputF } created ! 👏 🎉")
+ print(f'Database { arg.outputF } created ! 👏 🎉')
except:
- print("Failed to extract database")
- print("Exiting (-3)")
+ print('Failed to extract database')
+ print('Exiting (-3)')
exit(-3)
- print(f"Removing temporary files")
+ print(f'Removing temporary files')
os.remove(output_fn)