Rename files

author: ache <ache@ache.one> 2021-10-03 02:31:32 +0200
committer: ache <ache@ache.one> 2021-10-03 02:31:54 +0200
commit: 7e1d9e251b517153db8b639133c9e3bee266ce1b (patch)
tree: c1e02297807107096cf5a8962ed51342a69f0626 /download/download.py
parent: Command every scripts (diff)
1 files changed, 0 insertions, 225 deletions
diff --git a/download/download.py b/download/download.py
deleted file mode 100755
index 21137f0..0000000
--- a/download/download.py
+++ /dev/null
@@ -1,225 +0,0 @@
-#!/bin/env python
-
-"""dfr - Prepare database
-
-This script downloads the latest Wiktionary dump from Wikimedia, extracts it,
-processes it and then writes the result to a database.
-The result of this script is a SQLite database file usable with dfr.py.
-
-As downloading the dump can be challenging, you can specify an input file that
-will be used instead of the latest dump from Wikimedia. This is the purpose of
-the command line option `--input`.
-
-###########################
-⚠️ WARNING: Be aware that a compressed Wiktionary archive dump is big.
-A decompressed one is even bigger, so a lot of disk space is
-needed to store these files.
-
-⚠️ WARNING: Be aware that Wiktionary dumps are really big and that a lot of
-memory is needed to process them. It's NOT recommended to try to create a
-database file on a computer with less than 2 GiB of memory, even using the
-on-the-fly decompression method.
-###########################
-
-There are only a few command line options that can be used with this script.
-
- + --output
-    The name of the final SQLite database. By default it's `dfr.db`. dfr.py
-    expects the database to be at the root of the project and to be
-    named `dfr.db`, so by default you don't have to modify anything here.
-
-+ --input
-    As described earlier, if you have already downloaded the Wiktionary dump,
-    you can set the location of the file that will be used here.
-    The purpose is to avoid re-downloading the dump each time.
-
-    The "download" and "input" options are incompatible.
-
-+ --download
-    Force the download of the file even if the file is already downloaded.
-    It is used to force an update of the database.
-
-    The "download" and "input" options are incompatible.
-
-+ --word-list
-    Not useful for the moment. dfr will have a feature to list every
-    word in the dictionary. You will be able to filter words based on a regex;
-    optionally, there will be an option to auto-correct a word, but nothing
-    is implemented here.
-    In all cases, you can specify the name of the file that will store
-    every word.
-
-Note: This script has several ways to decompress the Wiktionary archive dump.
-
-First, it will try the `bzip2` command; if that fails, this script will try
-to extract the bzip2 file on the fly using the bz2 Python module.
-
- - The `bzip2` command is very fast but uses a lot of disk space and consumes a
-lot of memory (RAM) to quickly decompress the file.
- - The `bz2` Python module isn't as fast but uses less memory, and the parsing
-is also done on the fly.
-
-In every case, a lot of memory (RAM) is necessary to process the latest Wiktionary dump.
-
-"""
-
-# TODO: Add an option to set the URL of the dump, to use a file other than the latest one or a language other than fr.
-
-# TODO: Add an option to choose the extract method.
-
-# TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could greatly reduce the memory (RAM) usage and possibly allow creation of the database on low-memory computers (less than 2 GiB).
-
-import argparse
-import sys
-import urllib.request
-import dump2msgp
-import msgPack2sqlite_msgPack
-import subprocess
-import os
-
-from os.path import exists
-
-
-URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'
-
-
-def unbz2(file):
-    decomp = bz2.BZ2Decompressor()
-    buf = b''
-    for c in file:
-        buf += decomp.decompress(c)
-
-        while b'\n' in buf:
-            i = buf.index(b'\n')
-            if i + 1 < len(buf):
-                ret = buf[:i + 1]
-                buf = buf[i + 1:]
-                yield ret.decode('utf-8')
-            else:
-                yield buf.decode('utf-8')
-                buf = b''
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Download and create the database')
-    parser.add_argument('-o', '--output', dest='outputF', action='store',
-                        help='the output, the database filename',
-                        default='dfr.db')
-    parser.add_argument('-i', '--input', dest='dumpF', action='store',
-                        help='the input dump file\'s filename',
-                        default='')
-    parser.add_argument('-l', '--word-list', dest='wordList', action='store',
-                        help='the alternative output, filename of the word list',
-                        default=None)
-    parser.add_argument('-d', '--download', dest='download', action='store_true',
-                        help='to download the lastest dump')
-
-    download = True
-
-    arg = parser.parse_args()
-
-
-    if not arg.wordList:
-        arg.wordList = arg.outputF + '.wordlist'
-
-    if arg.download and arg.dumpF:
-        print('''Incompatible options '-i' and '-d'.''')
-        exit(1)
-    elif arg.download:
-        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
-    elif arg.dumpF:
-        download = False
-
-
-    if not arg.dumpF or not arg.dumpF.endswith('bz2'):
-        print('A bz2 dump file filename needed', file=sys.stderr)
-        exit(-1)
-
-    if exists(arg.dumpF) and download:
-        print(f'{arg.dumpF} exists. Force downloading ? (y/N)')
-        answer = input('> ')
-        if answer.lower()[0] != 'y':
-            download = False
-
-    if download:
-        print(download);
-        print(f'Downloading the dump ({arg.dumpF})\nIt should take some time')
-        try:
-            urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
-        except urllib.error.URLError:
-            print('Error: Unable to download from internet')
-            print(f'Check connection and source URL : ({ URL_DUMP })')
-            print('Exiting')
-            exit(-10)
-        except:
-            print('Download failed.')
-            print('Exiting')
-            exit(-1)
-
-    if not exists(arg.dumpF):
-        if download:
-            print('Download failed.\nExiting.', file=sys.stderr)
-        else:
-            print(f'Fichier { arg.dumpF } introuvable.\nArrêt.')
-        exit(-2)
-
-    decompress = False
-
-    try:
-        print('Trying the bzip2 command')
-        assert(subprocess.call(['bzip2', '-d', arg.dumpF]) == 0)
-        decompress = True
-    except:
-        print('''The command "bzip" doesn't exists, or doesn't work as intended''')
-        print('Fallback to Python bz2 module decompressor')
-
-    # Decompression to disk using the Python bz2 module
-    if not decompress:
-        try:
-            import bz2
-            with open(arg.dumpF, 'rb') as f:
-                it = iter(lambda: f.read(2**16), b'')
-
-                output_fn = arg.dumpF[:-4]
-
-                with open(output_fn, 'wb') as fout:
-                    dcomp = bz2.BZ2Decompressor()
-                    for chunk in it:
-                        datal = len(chunk)
-                        data = dcomp.decompress(chunk)
-                        fout.write(data)
-            decompress = True
-        except:
-            print('''Python bz2 module decompressor failed, maybe you don't have any space available''')
-            print('Fallback to on the fly decompressor (RAM will be needed)')
-
-    if not decompress:
-        try:
-            # On the fly Decompression
-            with open(arg.dumpF, 'rb') as f:
-                it = iter(lambda: f.read(2**16), b'')
-                print('Data extraction on the fly')
-                res = dump2msgp.extractAll(unbz2(it), 'error.log', False)
-                with open(arg.wordList, 'wb'):
-                    f.write('\n'.join(a.keys()))
-
-                msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
-            print(f'Word list { arg.wordList } created ! 👏 🎉')
-            print(f'Database { arg.outputF } created ! 👏 🎉')
-        except:
-            print('''Error: Can't extract the dump file''')
-            print('Exiting (-1)')
-            exit(-1)
-
-        print(f'Removing temporary files')
-        os.remove(arg.dumpF)
-    else:
-        output_fn = arg.dumpF[:-4]
-        with open(output_fn, 'r') as f:
-            print('Create the database')
-            res = dump2msgp.extractAll(f, 'error.log', False)
-            msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
-        print(f'Database { arg.outputF } created ! 👏 🎉')
-
-        print('Removing temporary files')
-        os.remove(output_fn)
author	ache <ache@ache.one>	2021-10-03 02:31:32 +0200
committer	ache <ache@ache.one>	2021-10-03 02:31:54 +0200
commit	7e1d9e251b517153db8b639133c9e3bee266ce1b (patch)
tree	c1e02297807107096cf5a8962ed51342a69f0626 /download/download.py
parent	Command every scripts (diff)