From 7e1d9e251b517153db8b639133c9e3bee266ce1b Mon Sep 17 00:00:00 2001
From: ache <ache@ache.one>
Date: Sun, 3 Oct 2021 02:31:32 +0200
Subject: Rename files

---
 dfr/createDB.py | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 224 insertions(+)
 create mode 100755 dfr/createDB.py

(limited to 'dfr/createDB.py')

diff --git a/dfr/createDB.py b/dfr/createDB.py
new file mode 100755
index 0000000..acf443d
--- /dev/null
+++ b/dfr/createDB.py
@@ -0,0 +1,224 @@
+#!/bin/env python
+
+"""dfr - Prepare database
+
+This script downloads the latest wiktionary dump from wikimedia, extracts it
+and then processes it.
+The result of this script is a sqlite database file usable with dfr.py.
+
+As downloading the dump can be challenging, you can specify an input file that
+will be used instead of the latest dump from wikimedia. That is the purpose of
+the command line option `--input`.
+
+###########################
+❗ WARNING: Beware that a compressed wiktionary dump archive is big, and the
+decompressed dump is bigger still, so a lot of disk space is needed to store
+these files.
+
+❗ WARNING: Beware that wiktionary dumps are really big and that a lot of
+memory is needed to process them. It's NOT recommended to try to create a
+database file on a computer with less than 2 GiB of memory, even using the
+on-the-fly decompression method.
+###########################
+
+There are only a few other command line options that can be used with this script.
+
++ --output
+    The name of the final sqlite database. By default it's `dfr.db`. dfr.py
+    will expect the database to be at the root of the project and to be
+    named `dfr.db`, so by default you don't have to modify anything here.
+
++ --input
+    As described earlier, if you have already downloaded the wiktionary dump,
+    you can set the location of the file that will be used here.
+    The purpose is to not re-download the dump each time.
+
+    The "download" and "input" options are incompatible.
+
++ --download
+    Force the download of the file even if the file is already downloaded.
+    It is used to force an update of the database.
+
+    The "download" and "input" options are incompatible.
+
++ --word-list
+    Not useful for the moment. dfr will have a feature to list every
+    word in the dictionary. You will be able to filter words based on a regex;
+    optionally, there will be an option to auto-correct a word, but nothing
+    is implemented here.
+    In any case, you can specify the name of the file that will store
+    every word.
+
+Note: This script has several ways to decompress the wiktionary dump archive.
+
+First, it will try the `bzip2` command; if it fails, this script will try
+to extract the bzip2 file on the fly using the bz2 python module.
+
+ - The `bzip2` command is very fast but uses a lot of disk space and consumes a
+lot of memory (RAM) to quickly decompress the file.
+ - The `bz2` python module isn't that fast but uses less memory and the parsing
+is also done on the fly.
+
+In every case, a lot of memory (RAM) is necessary to process the latest wiktionary dump.
+
+"""
+
+# TODO: Add an option to set the URL of the dump, to cache a file other than
+#       the latest one or a dump for a language other than French.
+# TODO: Add an option to choose the extraction method.
+# TODO: Make the bz2 module path write the msgpack file on the fly so little
+#       data is held in memory (could allow machines with less than 2 GiB).
+# Source of the dump: latest French wiktionary pages, bz2-compressed XML.
+URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'
+
+
+def unbz2(file):
+    """Yield UTF-8 text lines from an iterable of bz2-compressed byte chunks.
+
+    Lines keep their trailing newline; an unterminated last line is yielded too.
+    """
+    import bz2  # not imported at module level, only under __main__
+    decomp = bz2.BZ2Decompressor()
+    buf = b''
+    for c in file:
+        # The last split piece is the still-incomplete line; carry it over.
+        *lines, buf = (buf + decomp.decompress(c)).split(b'\n')
+        for line in lines:
+            yield (line + b'\n').decode('utf-8')
+    if buf:  # trailing data without a final newline must not be dropped
+        yield buf.decode('utf-8')
+
+
+if __name__ == '__main__':
+    # Script entry point: pick the dump file, download it when requested,
+    # decompress it and build the sqlite database (and word list).
+    import argparse
+    import bz2  # also used by unbz2(), so bind it at module scope
+    import os
+    import subprocess
+    import sys
+    import urllib.error
+    import urllib.request
+    from os.path import exists
+
+    import dump2msgp
+    import msgp2sqlite
+
+    parser = argparse.ArgumentParser(description='Download and create the database')
+    parser.add_argument('-o', '--output', dest='outputF', action='store',
+                        help='the output, the database filename',
+                        default='dfr.db')
+    parser.add_argument('-i', '--input', dest='dumpF', action='store',
+                        help='the input dump file\'s filename',
+                        default='')
+    parser.add_argument('-l', '--word-list', dest='wordList', action='store',
+                        help='the alternative output, filename of the word list',
+                        default=None)
+    parser.add_argument('-d', '--download', dest='download', action='store_true',
+                        help='to download the lastest dump')
+    arg = parser.parse_args()
+
+    if not arg.wordList:
+        arg.wordList = arg.outputF + '.wordlist'
+
+    # Only '-i' disables the download; '-d' derives the file name from the URL.
+    download = True
+    if arg.download and arg.dumpF:
+        print('''Incompatible options '-i' and '-d'.''')
+        sys.exit(1)
+    elif arg.download:
+        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
+    elif arg.dumpF:
+        download = False
+
+    # With neither '-i' nor '-d' the file name is empty and rejected here.
+    if not arg.dumpF or not arg.dumpF.endswith('bz2'):
+        print('A bz2 dump file filename needed', file=sys.stderr)
+        sys.exit(-1)
+
+    if exists(arg.dumpF) and download:
+        print(f'{arg.dumpF} exists. Force downloading ? (y/N)')
+        # An empty answer (just Enter) means the default "N"; never index it.
+        if not input('> ').strip().lower().startswith('y'):
+            download = False
+
+    if download:
+        print(f'Downloading the dump ({arg.dumpF})\nIt should take some time')
+        try:
+            urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
+        except urllib.error.URLError:
+            print('Error: Unable to download from internet')
+            print(f'Check connection and source URL : ({ URL_DUMP })')
+            print('Exiting')
+            sys.exit(-10)
+        except Exception:
+            print('Download failed.')
+            print('Exiting')
+            sys.exit(-1)
+
+    if not exists(arg.dumpF):
+        if download:
+            print('Download failed.\nExiting.', file=sys.stderr)
+        else:
+            print(f'Fichier { arg.dumpF } introuvable.\nArrêt.')
+        sys.exit(-2)
+
+    output_fn = arg.dumpF[:-4]  # dump file name without its '.bz2' suffix
+    decompress = False
+
+    # Method 1: the external bzip2 command.
+    print('Trying the bzip2 command')
+    try:
+        # Explicit status check: an `assert` would be stripped by `python -O`.
+        decompress = subprocess.call(['bzip2', '-d', arg.dumpF]) == 0
+    except OSError:
+        pass  # bzip2 is not installed or cannot be executed
+    if not decompress:
+        print('''The command "bzip" doesn't exists, or doesn't work as intended''')
+        print('Fallback to Python bz2 module decompressor')
+
+    # Method 2: decompress to disk with the bz2 module, 64 KiB at a time.
+    if not decompress:
+        try:
+            dcomp = bz2.BZ2Decompressor()
+            with open(arg.dumpF, 'rb') as f, open(output_fn, 'wb') as fout:
+                for chunk in iter(lambda: f.read(2**16), b''):
+                    fout.write(dcomp.decompress(chunk))
+            decompress = True
+        except Exception:
+            if exists(output_fn):
+                os.remove(output_fn)  # drop the partial file to free disk space
+            print('''Python bz2 module decompressor failed, maybe you don't have any space available''')
+            print('Fallback to on the fly decompressor (RAM will be needed)')
+
+    if not decompress:
+        # Method 3: parse while decompressing; no decompressed file on disk.
+        try:
+            with open(arg.dumpF, 'rb') as f:
+                print('Data extraction on the fly')
+                chunks = iter(lambda: f.read(2**16), b'')
+                res = dump2msgp.extractAll(unbz2(chunks), 'error.log', False)
+                # The word list needs its own handle: `f` is the read-only dump.
+                # NOTE(review): assumes `res` is a mapping keyed by word - confirm.
+                with open(arg.wordList, 'w', encoding='utf-8') as word_list:
+                    word_list.write('\n'.join(res.keys()))
+                msgp2sqlite.writeDB(arg.outputF, res)
+            print(f'Word list { arg.wordList } created ! 👏 🎉')
+            print(f'Database { arg.outputF } created ! 👏 🎉')
+        except Exception:
+            print('''Error: Can't extract the dump file''')
+            print('Exiting (-1)')
+            sys.exit(-1)
+
+        print('Removing temporary files')
+        os.remove(arg.dumpF)
+    else:
+        # The decompressed XML is on disk: parse it, then delete it.
+        with open(output_fn, 'r', encoding='utf-8') as f:
+            print('Create the database')
+            res = dump2msgp.extractAll(f, 'error.log', False)
+            msgp2sqlite.writeDB(arg.outputF, res)
+        print(f'Database { arg.outputF } created ! 👏 🎉')
+
+        print('Removing temporary files')
+        os.remove(output_fn)
-- 
cgit v1.2.3