path: root/dfr/createDB.py
#!/usr/bin/env python3

"""dfr - Prepare database

This script downloads the latest wiktionary dump from Wikimedia, then
extracts and processes it.
The result of this script is a sqlite database file usable with dfr.py.

As downloading the dump can be challenging, you can specify an input file that
will be used instead of the latest dump from Wikimedia. That is the purpose of
the `--input` command line option.

###########################
❗ WARNING: Beware that a compressed wiktionary dump archive is big.
A decompressed one is bigger still, so a lot of disk space is needed to
store these files.

❗ WARNING: Beware that wiktionary dumps are really big and a lot of memory
is needed to process them. It is NOT recommended to try to create a
database file on a computer with less than 2 GiB of memory, even when using
the on-the-fly decompression method.
###########################

There are only a few other command line options that can be used with this
script; example invocations follow the list.

+ --output
    The name of the final sqlite database. By default it is `dfr.db`. dfr.py
    expects the database to be at the root of the project and to be named
    `dfr.db`, so you don't have to modify anything here.

+ --input
    As described earlier, if you have already downloaded the wiktionary dump,
    you can set the location of the file to use here.
    The purpose is to avoid re-downloading the dump each time.

    The "--download" and "--input" options are incompatible.

+ --download
    Force the download of the file even if it has already been downloaded.
    It is used to force an update of the database.

    The "--download" and "--input" options are incompatible.

+ --word-list
    Not useful for the moment. dfr will have a feature to list every word
    in the dictionary. You will be able to filter words with a regex and,
    optionally, there will be an option to auto-correct a word, but nothing
    is implemented here yet.
    In any case, you can specify the name of the file that will store
    every word.
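
Example invocations (a minimal sketch; the script is assumed to be run from
the dfr/ directory, and the dump file name is the one `--download` would
produce):

    python createDB.py --download
    python createDB.py --input frwiktionary-latest-pages-meta-current.xml.bz2 --output dfr.db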

Note: This script has several methods to decompress the wiktionary archive dump.

First, it tries the `bzip2` command; if that fails, it decompresses the file
to disk with the Python bz2 module; if that fails too, it extracts the bz2
file on the fly with the same module.

 - The `bzip2` command is very fast but uses a lot of disk space and consumes
a lot of memory (RAM) to decompress the file quickly.
 - The Python `bz2` module isn't as fast but uses less memory, and with the
on-the-fly method the parsing is done while decompressing.

In every case, a lot of memory (RAM) is necessary to process the latest
wiktionary dump.

"""

# TODO: Add an option to set the URL of the dump, to fetch a file other than the latest one or a dump for a language other than fr.

# TODO: Add an option to choose the extraction method.

# TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never keep a lot of information in memory. This optimization could greatly reduce memory (RAM) usage and possibly allow creating the database on a low-memory computer (less than 2 GiB).
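
# A rough sketch of the streaming idea behind the TODO above, assuming the
# parser could yield (word, entry) pairs one at a time (the `iter_entries`
# helper is hypothetical, it does not exist in dump2msgp today):
#
#     import msgpack
#     packer = msgpack.Packer()
#     with open('dump.msgpack', 'wb') as out:
#         for word, entry in dump2msgp.iter_entries(lines):  # hypothetical
#             out.write(packer.pack((word, entry)))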

import bz2

URL_DUMP = 'https://dumps.wikimedia.org/frwiktionary/latest/frwiktionary-latest-pages-meta-current.xml.bz2'


def unbz2(file):
    """Decompress a bz2 stream chunk by chunk and yield decoded lines.

    `file` is an iterable of compressed byte chunks; each yielded value is a
    utf-8 decoded str line, including its trailing newline.
    """
    decomp = bz2.BZ2Decompressor()
    buf = b''
    for c in file:
        buf += decomp.decompress(c)

        # Yield every complete line currently in the buffer.
        while b'\n' in buf:
            i = buf.index(b'\n')
            ret = buf[:i + 1]
            buf = buf[i + 1:]
            yield ret.decode('utf-8')

    # Flush whatever remains if the stream doesn't end with a newline.
    if buf:
        yield buf.decode('utf-8')
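
# A minimal usage sketch for unbz2(), assuming a local .bz2 dump file (the
# file name is illustrative); this is the on-the-fly method mentioned in the
# module docstring:
#
#     with open('frwiktionary-latest-pages-meta-current.xml.bz2', 'rb') as f:
#         chunks = iter(lambda: f.read(2**16), b'')
#         for line in unbz2(chunks):
#             pass  # each `line` is a decoded str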


if __name__ == '__main__':
    import argparse
    import sys
    import urllib.request
    import urllib.error
    import dump2msgp
    import msgp2sqlite
    import subprocess
    import os

    from os.path import exists

    parser = argparse.ArgumentParser(description='Download and create the database')
    parser.add_argument('-o', '--output', dest='outputF', action='store',
                        help='the output, the database filename',
                        default='dfr.db')
    parser.add_argument('-i', '--input', dest='dumpF', action='store',
                        help='the input dump file\'s filename',
                        default='')
    parser.add_argument('-l', '--word-list', dest='wordList', action='store',
                        help='the alternative output, filename of the word list',
                        default=None)
    parser.add_argument('-d', '--download', dest='download', action='store_true',
                        help='to download the latest dump')

    download = True

    arg = parser.parse_args()


    if not arg.wordList:
        arg.wordList = arg.outputF + '.wordlist'

    if arg.download and arg.dumpF:
        print('''Incompatible options '-i' and '-d'.''')
        exit(1)
    elif arg.download:
        arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
    elif arg.dumpF:
        download = False


    if not arg.dumpF or not arg.dumpF.endswith('bz2'):
        print('A bz2 dump file name is needed (use -d or -i)', file=sys.stderr)
        exit(-1)

    if exists(arg.dumpF) and download:
        print(f'{arg.dumpF} exists. Force downloading? (y/N)')
        answer = input('> ')
        if not answer.lower().startswith('y'):
            download = False

    if download:
        print(f'Downloading the dump ({arg.dumpF})\nIt should take some time')
        try:
            urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
        except urllib.error.URLError:
            print('Error: Unable to download from the internet')
            print(f'Check the connection and the source URL ({URL_DUMP})')
            print('Exiting')
            exit(-10)
        except:
            print('Download failed.')
            print('Exiting')
            exit(-1)

    if not exists(arg.dumpF):
        if download:
            print('Download failed.\nExiting.', file=sys.stderr)
        else:
            print(f'File {arg.dumpF} not found.\nExiting.', file=sys.stderr)
        exit(-2)

    decompress = False

    try:
        print('Trying the bzip2 command')
        # bzip2 -d removes the .bz2 archive and leaves the decompressed file.
        assert subprocess.call(['bzip2', '-d', arg.dumpF]) == 0
        decompress = True
    except (OSError, AssertionError):
        print('''The "bzip2" command doesn't exist, or didn't work as intended''')
        print('Falling back to the Python bz2 module decompressor')

    # Decompression to disk using the Python bz2 module
    if not decompress:
        try:
            with open(arg.dumpF, 'rb') as f:
                it = iter(lambda: f.read(2**16), b'')

                output_fn = arg.dumpF[:-4]

                with open(output_fn, 'wb') as fout:
                    dcomp = bz2.BZ2Decompressor()
                    for chunk in it:
                        fout.write(dcomp.decompress(chunk))
            decompress = True
        except:
            print('''The Python bz2 module decompressor failed, maybe there is no disk space available''')
            print('Falling back to the on-the-fly decompressor (a lot of RAM will be needed)')

    if not decompress:
        try:
            # On-the-fly decompression: the dump is parsed while it is decompressed
            with open(arg.dumpF, 'rb') as f:
                it = iter(lambda: f.read(2**16), b'')
                print('Data extraction on the fly')
                res = dump2msgp.extractAll(unbz2(it), 'error.log', False)

            # Assumption: extractAll() returns a mapping keyed by word
            with open(arg.wordList, 'w') as fwords:
                fwords.write('\n'.join(res.keys()))

            msgp2sqlite.writeDB(arg.outputF, res)
            print(f'Word list {arg.wordList} created! 👍 🎉')
            print(f'Database {arg.outputF} created! 👍 🎉')
        except:
            print('''Error: Can't extract the dump file''')
            print('Exiting (-1)')
            exit(-1)

        print('Removing temporary files')
        os.remove(arg.dumpF)
    else:
        output_fn = arg.dumpF[:-4]
        with open(output_fn, 'r', encoding='utf-8') as f:
            print('Creating the database')
            res = dump2msgp.extractAll(f, 'error.log', False)
            msgp2sqlite.writeDB(arg.outputF, res)
        print(f'Database {arg.outputF} created! 👍 🎉')

        print('Removing temporary files')
        os.remove(output_fn)