diff options
Diffstat (limited to 'download')
-rwxr-xr-x | download/download.py | 16 | ||||
-rw-r--r-- | download/dump2msgp.py | 12 | ||||
-rw-r--r-- | download/msgPack2sqlite_msgPack.py | 7 |
3 files changed, 30 insertions, 5 deletions
diff --git a/download/download.py b/download/download.py index 18a60fe..b97bb8f 100755 --- a/download/download.py +++ b/download/download.py @@ -45,18 +45,22 @@ if __name__ == '__main__': parser.add_argument('-d', '--download', dest='download', action='store_true', help='to download the lastest dump') + download = True + arg = parser.parse_args() - download = True if not arg.wordList: arg.wordList = arg.outputF + '.wordlist' - if download and arg.dumpF: + if arg.download and arg.dumpF: print('''Incompatible options '-i' and '-d'.''') exit(1) - elif download: + elif arg.download: arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:] + elif arg.dumpF: + download = False + if not arg.dumpF or not arg.dumpF.endswith('bz2'): print('A bz2 dump file filename needed', file=sys.stderr) @@ -69,6 +73,7 @@ if __name__ == '__main__': download = False if download: + print(download); print(f'Downloading the dump ({arg.dumpF})\nIt should take some time') try: urllib.request.urlretrieve(URL_DUMP, arg.dumpF) @@ -83,7 +88,10 @@ if __name__ == '__main__': exit(-1) if not exists(arg.dumpF): - print('Download failed.\nExiting.', file=sys.stderr) + if download: + print('Download failed.\nExiting.', file=sys.stderr) + else: + print(f'Fichier { arg.dumpF } introuvable.\nArrêt.') exit(-2) decompress = False diff --git a/download/dump2msgp.py b/download/dump2msgp.py index 70b483c..c0186af 100644 --- a/download/dump2msgp.py +++ b/download/dump2msgp.py @@ -255,6 +255,7 @@ def extractAll(f, errorF, ignore): hasForbidden = False hasText = False tf = None + isEnd = False dict_ = dict() @@ -271,12 +272,16 @@ def extractAll(f, errorF, ignore): hasText = False isFr = False title = "" + isEnd = False elif "</page>" in line: tf = None hasForbidden = False hasText = False isFr = False title = "" + isEnd = False + if isEnd: + continue if "<title>" in line: title = line[line.find('>') + 1:] @@ -308,7 +313,12 @@ def extractAll(f, errorF, ignore): elif not hasForbidden and "== {{langue|" in line: isFr = False if not hasForbidden and isFr and tf: - tf.write(line) + try: + ind = line.index('</text>') + tf.write(line[:ind]) + isEnd = True + except: + tf.write(line) return dict_ diff --git a/download/msgPack2sqlite_msgPack.py b/download/msgPack2sqlite_msgPack.py index 38d34cd..c08efdb 100644 --- a/download/msgPack2sqlite_msgPack.py +++ b/download/msgPack2sqlite_msgPack.py @@ -1,4 +1,5 @@ import msgpack +import os import sys import sqlite3 @@ -6,6 +7,12 @@ import argparse def writeDB(outputF, data): + # Delete if exists + try: + os.remove(outputF) + except OSError: + pass + with sqlite3.connect(outputF) as con: cur = con.cursor() cur.execute('''CREATE TABLE IF NOT EXISTS entry ( |