aboutsummaryrefslogtreecommitdiff
path: root/download/download.py
diff options
context:
space:
mode:
authorache <ache@ache.one>2021-10-03 02:31:32 +0200
committerache <ache@ache.one>2021-10-03 02:31:54 +0200
commit7e1d9e251b517153db8b639133c9e3bee266ce1b (patch)
treec1e02297807107096cf5a8962ed51342a69f0626 /download/download.py
parentCommand every scripts (diff)
Rename files
Diffstat (limited to 'download/download.py')
-rwxr-xr-xdownload/download.py225
1 files changed, 0 insertions, 225 deletions
diff --git a/download/download.py b/download/download.py
deleted file mode 100755
index 21137f0..0000000
--- a/download/download.py
+++ /dev/null
@@ -1,225 +0,0 @@
-#!/bin/env python
-
-"""dfr - Prepare database
-
-This script will download the latest wiktionary dump from wikimedia, extract it,
-process it and then store the result in a database.
-The result of this script is a sqlite database file usable with dfr.py.
-
-As downloading the dump can be challenging, you can specify an input file that
-will be used instead of the latest dump from wikimedia. That is the goal of the
-command line option `--input`.
-
-###########################
-⚠️ WARNING: Be aware that a compressed wiktionary archive dump is big.
-A decompressed one is bigger still, so a lot of disk space is
-needed to store these files.
-
-⚠️ WARNING: Be aware that wiktionary dumps are really big and that a lot of memory
-is needed to process them. It's NOT recommended to try to create a
-database file on a computer with less than 2Gio of memory, even using the
-on-the-fly decompression method.
-###########################
-
-There are only a few command line options that can be used with this script.
-
- + --output
- The name of the final sqlite database. By default it's `dfr.db`. dfr.py
- will expect the database to be at the root of the project and to be
- named `dfr.db`, so by default you don't have to modify anything here.
-
-+ --input
- As described earlier, if you have already downloaded the wiktionary dump,
- you can set the location of the file that will be used here.
- The purpose is to not re-download the dump each time.
-
- The "download" and "input" options are incompatible.
-
-+ --download
- Force the download of the file even if the file is already downloaded.
- It is used to force an update of the database.
-
- The "download" and "input" options are incompatible.
-
-+ --word-list
- Not useful for the moment. dfr will have a feature to list every
- word in the dictionary. You will be able to filter words based on a regex;
- optionally, there will be an option to auto-correct a word, but nothing
- is implemented here.
- In all cases, you can specify the name of the file that will store
- every word.
-
-Note: This script has several ways to decompress the wiktionary archive dump.
-
-First, it will try the `bzip2` command, if it fails, this script will try
-to extract the bzip file on the fly using bz2 python module.
-
- - The `bzip2` command is very fast but uses a lot of disk space and consumes a lot
-of memory (RAM) to quickly decompress the file.
- - The `bz2` python module isn't that fast but uses less memory and the parsing
-is also done on the fly.
-
-In every case, a lot of memory (RAM) is necessary to process the last wiktionary dump.
-
-"""
-
-# TODO: Add an option to set URL of the DUMP. To cache another file than the latest or for another language than fr.
-
-# TODO: Add an option to choose the extract method.
-
-# TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could reduce a lot the memory (RAM) usage and possibly allow creation of the database on low memory computer (less than 2Gio).
-
-import argparse
-import sys
-import urllib.request
-import dump2msgp
-import msgPack2sqlite_msgPack
-import subprocess
-import os
-
-from os.path import exists
-
-
-URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'
-
-
-def unbz2(file):
-    """Decompress bz2 chunks on the fly and yield the text line by line.
-
-    Args:
-        file: an iterable of ``bytes`` chunks that together form one bz2
-            stream (for example successive ``f.read(n)`` results).
-
-    Yields:
-        str: every line decoded as UTF-8, trailing newline included. Data
-        remaining after the last newline is yielded as a final line
-        without a newline instead of being dropped.
-    """
-    # Imported locally so the generator does not depend on another part of
-    # the script having imported bz2 first.
-    import bz2
-
-    decomp = bz2.BZ2Decompressor()
-    buf = b''
-    for c in file:
-        buf += decomp.decompress(c)
-
-        # Splitting on the byte 0x0A is safe before decoding: that byte never
-        # occurs inside a multi-byte UTF-8 sequence. The last element is the
-        # (possibly empty) incomplete line kept for the next chunk.
-        *lines, buf = buf.split(b'\n')
-        for line in lines:
-            yield (line + b'\n').decode('utf-8')
-
-    # Flush a final line that has no terminating newline.
-    if buf:
-        yield buf.decode('utf-8')
-
-
-if __name__ == '__main__':
-    # Command line entry point: get the bz2 dump (download it or reuse a local
-    # file), decompress it with the first method that works (bzip2 command,
-    # then Python bz2 to disk, then on-the-fly), and build the database.
-    parser = argparse.ArgumentParser(description='Download and create the database')
-    parser.add_argument('-o', '--output', dest='outputF', action='store',
-                        help='the output, the database filename',
-                        default='dfr.db')
-    parser.add_argument('-i', '--input', dest='dumpF', action='store',
-                        help='the input dump file\'s filename',
-                        default='')
-    parser.add_argument('-l', '--word-list', dest='wordList', action='store',
-                        help='the alternative output, filename of the word list',
-                        default=None)
-    parser.add_argument('-d', '--download', dest='download', action='store_true',
-                        help='to download the lastest dump')
-
-    arg = parser.parse_args()
-
-    if not arg.wordList:
-        arg.wordList = arg.outputF + '.wordlist'
-
-    if arg.download and arg.dumpF:
-        print('''Incompatible options '-i' and '-d'.''')
-        sys.exit(1)
-
-    # An explicit input file disables the download. Otherwise the dump is
-    # downloaded and stored under the file name at the end of URL_DUMP, so the
-    # script also works when neither -i nor -d is given.
-    download = not arg.dumpF
-    if download:
-        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
-
-    if not arg.dumpF.endswith('bz2'):
-        print('A bz2 dump file filename needed', file=sys.stderr)
-        sys.exit(-1)
-
-    if exists(arg.dumpF) and download:
-        print(f'{arg.dumpF} exists. Force downloading ? (y/N)')
-        answer = input('> ')
-        # An empty answer selects the default "N"; startswith() avoids
-        # indexing into an empty string.
-        if not answer.strip().lower().startswith('y'):
-            download = False
-
-    if download:
-        # Explicit import: the name urllib.error is otherwise only available
-        # because urllib.request happens to import it.
-        import urllib.error
-
-        print(f'Downloading the dump ({arg.dumpF})\nIt should take some time')
-        try:
-            urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
-        except urllib.error.URLError:
-            print('Error: Unable to download from internet')
-            print(f'Check connection and source URL : ({ URL_DUMP })')
-            print('Exiting')
-            sys.exit(-10)
-        except Exception:
-            # Exception (not a bare except) so Ctrl-C still interrupts.
-            print('Download failed.')
-            print('Exiting')
-            sys.exit(-1)
-
-    if not exists(arg.dumpF):
-        if download:
-            print('Download failed.\nExiting.', file=sys.stderr)
-        else:
-            print(f'Fichier { arg.dumpF } introuvable.\nArrêt.')
-        sys.exit(-2)
-
-    # Name of the decompressed dump: the input name without its last four
-    # characters (assumes the name ends with ".bz2").
-    output_fn = arg.dumpF[:-4]
-
-    # First choice: the external bzip2 command. On success it replaces the
-    # compressed file with the decompressed one.
-    print('Trying the bzip2 command')
-    try:
-        decompress = subprocess.call(['bzip2', '-d', arg.dumpF]) == 0
-    except OSError:
-        # The command is missing or cannot be executed.
-        decompress = False
-    if not decompress:
-        print('''The command "bzip2" doesn't exists, or doesn't work as intended''')
-        print('Fallback to Python bz2 module decompressor')
-
-    # Second choice: decompress to disk with the Python bz2 module.
-    if not decompress:
-        import bz2
-        import shutil
-
-        try:
-            with bz2.open(arg.dumpF, 'rb') as fin, open(output_fn, 'wb') as fout:
-                shutil.copyfileobj(fin, fout, 2**16)
-            decompress = True
-        except (OSError, EOFError):
-            # Corrupted/truncated archive or no space left: do not leave a
-            # partial decompressed file behind.
-            if exists(output_fn):
-                os.remove(output_fn)
-            print('''Python bz2 module decompressor failed, maybe you don't have any space available''')
-            print('Fallback to on the fly decompressor (RAM will be needed)')
-
-    if not decompress:
-        # Last choice: decompress and parse in one pass, nothing written to
-        # disk except the results.
-        try:
-            with open(arg.dumpF, 'rb') as f:
-                it = iter(lambda: f.read(2**16), b'')
-                print('Data extraction on the fly')
-                res = dump2msgp.extractAll(unbz2(it), 'error.log', False)
-                # NOTE(review): assumes extractAll returns a mapping whose
-                # keys are the words -- confirm against dump2msgp.
-                with open(arg.wordList, 'w', encoding='utf-8') as wl:
-                    wl.write('\n'.join(res.keys()))
-
-                msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
-                print(f'Word list { arg.wordList } created ! 👍 🎉')
-                print(f'Database { arg.outputF } created ! 👍 🎉')
-        except Exception as err:
-            print('''Error: Can't extract the dump file''')
-            print(f'Reason: {err}', file=sys.stderr)
-            print('Exiting (-1)')
-            sys.exit(-1)
-
-        print(f'Removing temporary files')
-        # Removes the compressed dump itself, as `bzip2 -d` would have done.
-        os.remove(arg.dumpF)
-    else:
-        # The dump was decompressed to disk: parse the plain XML file. UTF-8
-        # is given explicitly to match what unbz2() decodes.
-        with open(output_fn, 'r', encoding='utf-8') as f:
-            print('Create the database')
-            res = dump2msgp.extractAll(f, 'error.log', False)
-            msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
-            print(f'Database { arg.outputF } created ! 👍 🎉')
-
-        print('Removing temporary files')
-        os.remove(output_fn)