about | summary | refs | log | tree | commit | diff
path: root/download
diff options
context:
space:
mode:
Diffstat (limited to 'download')
-rwxr-xr-x  download/download.py  16
-rw-r--r--  download/dump2msgp.py  12
-rw-r--r--  download/msgPack2sqlite_msgPack.py  7
3 files changed, 30 insertions, 5 deletions
diff --git a/download/download.py b/download/download.py
index 18a60fe..b97bb8f 100755
--- a/download/download.py
+++ b/download/download.py
@@ -45,18 +45,22 @@ if __name__ == '__main__':
parser.add_argument('-d', '--download', dest='download', action='store_true',
help='to download the lastest dump')
+ download = True
+
arg = parser.parse_args()
- download = True
if not arg.wordList:
arg.wordList = arg.outputF + '.wordlist'
- if download and arg.dumpF:
+ if arg.download and arg.dumpF:
print('''Incompatible options '-i' and '-d'.''')
exit(1)
- elif download:
+ elif arg.download:
arg.dumpF = URL_DUMP[URL_DUMP.rindex('/') + 1:]
+ elif arg.dumpF:
+ download = False
+
if not arg.dumpF or not arg.dumpF.endswith('bz2'):
print('A bz2 dump file filename needed', file=sys.stderr)
@@ -69,6 +73,7 @@ if __name__ == '__main__':
download = False
if download:
+ print(download);
print(f'Downloading the dump ({arg.dumpF})\nIt should take some time')
try:
urllib.request.urlretrieve(URL_DUMP, arg.dumpF)
@@ -83,7 +88,10 @@ if __name__ == '__main__':
exit(-1)
if not exists(arg.dumpF):
- print('Download failed.\nExiting.', file=sys.stderr)
+ if download:
+ print('Download failed.\nExiting.', file=sys.stderr)
+ else:
+ print(f'Fichier { arg.dumpF } introuvable.\nArrêt.')
exit(-2)
decompress = False
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
index 70b483c..c0186af 100644
--- a/download/dump2msgp.py
+++ b/download/dump2msgp.py
@@ -255,6 +255,7 @@ def extractAll(f, errorF, ignore):
hasForbidden = False
hasText = False
tf = None
+ isEnd = False
dict_ = dict()
@@ -271,12 +272,16 @@ def extractAll(f, errorF, ignore):
hasText = False
isFr = False
title = ""
+ isEnd = False
elif "</page>" in line:
tf = None
hasForbidden = False
hasText = False
isFr = False
title = ""
+ isEnd = False
+ if isEnd:
+ continue
if "<title>" in line:
title = line[line.find('>') + 1:]
@@ -308,7 +313,12 @@ def extractAll(f, errorF, ignore):
elif not hasForbidden and "== {{langue|" in line:
isFr = False
if not hasForbidden and isFr and tf:
- tf.write(line)
+ try:
+ ind = line.index('</text>')
+ tf.write(line[:ind])
+ isEnd = True
+ except:
+ tf.write(line)
return dict_
diff --git a/download/msgPack2sqlite_msgPack.py b/download/msgPack2sqlite_msgPack.py
index 38d34cd..c08efdb 100644
--- a/download/msgPack2sqlite_msgPack.py
+++ b/download/msgPack2sqlite_msgPack.py
@@ -1,4 +1,5 @@
import msgpack
+import os
import sys
import sqlite3
@@ -6,6 +7,12 @@ import argparse
def writeDB(outputF, data):
+ # Delete if exists
+ try:
+ os.remove(outputF)
+ except OSError:
+ pass
+
with sqlite3.connect(outputF) as con:
cur = con.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS entry (