From 3f90a60d0f793084eebbbd15a26b7af2acdeac48 Mon Sep 17 00:00:00 2001 From: ache Date: Thu, 16 Sep 2021 03:42:56 +0200 Subject: Command every scripts --- download/bz2toDB.py | 10 ++++++ download/download.py | 90 +++++++++++++++++++++++++++++++++++++++++++-------- download/dump2msgp.py | 65 +++++++++++++++++++++++++++++-------- 3 files changed, 139 insertions(+), 26 deletions(-) diff --git a/download/bz2toDB.py b/download/bz2toDB.py index 1fddd85..a0c2cd3 100644 --- a/download/bz2toDB.py +++ b/download/bz2toDB.py @@ -1,3 +1,13 @@ +""" Not a script + +Don't use this file as a script + + +This Python file stores functions related to the bz2 Python module and thus to the +on-the-fly method of decompression. + +""" + import bz2 import sys diff --git a/download/download.py b/download/download.py index b97bb8f..21137f0 100755 --- a/download/download.py +++ b/download/download.py @@ -1,5 +1,74 @@ #!/bin/env python +"""dfr - Prepare database + +This script will download the latest Wiktionary dump from Wikimedia, extract it, +process it and then build the database. +The result of this script is a sqlite database file usable with dfr.py. + +As downloading the dump can be challenging, you can specify an input file that +will be used instead of the latest dump from Wikimedia. That is the goal of the +command line option `--input`. + +########################### +⚠️ WARNING: Be aware that a compressed Wiktionary archive dump is big. +A decompressed one is even bigger, so a lot of disk space is +needed to store these files. + +⚠️ WARNING: Be aware that Wiktionary dumps are really big and that a lot of memory +space is needed to process them. It's NOT recommended to try to create a +database file on a computer with less than 2Gio memory, even using the on +the fly decompression method. +########################### + +There are only a few other command line options that can be used with this script. + + + --output + The name of the final sqlite database. By default it's `dfr.db`. 
dfr.py + will expect the database to be at the root of the project and to be + named `dfr.db`, so you don't have to modify anything here. + ++ --input + As described earlier, if you have already downloaded the Wiktionary dump, + you can set the location of the file that will be used here. + The purpose is to avoid re-downloading the dump each time. + + The "download" and "input" options are incompatible. + ++ --download + Force the download of the file even if the file is already downloaded. + It is used to force an update of the database. + + The "download" and "input" options are incompatible. + ++ --word-list + Not useful for the moment. dfr will have a feature to list every + word in the dictionary. You will be able to filter words based on a regex; + optionally, there will be an option to auto-correct a word, but nothing + is implemented here. + In all cases, you can specify the filename of the file that will store + every word. + +Note: This script has several ways to decompress the Wiktionary archive dump. + +First, it will try the `bzip2` command; if it fails, this script will try +to extract the bzip file on the fly using the bz2 Python module. + + - The `bzip2` command is very fast but uses a lot of disk space and consumes a lot +of memory (RAM) to quickly decompress the file. + - The `bz2` Python module isn't that fast but uses less memory, and the parsing +is also done on the fly. + +In every case, a lot of memory (RAM) is necessary to process the latest Wiktionary dump. + +""" + +# TODO: Add an option to set the URL of the dump, to cache a file other than the latest or for a language other than fr. + +# TODO: Add an option to choose the extraction method. + +# TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could greatly reduce the memory (RAM) usage and possibly allow creation of the database on low-memory computers (less than 2Gio). 
+ import argparse import sys import urllib.request @@ -145,17 +214,12 @@ if __name__ == '__main__': print(f'Removing temporary files') os.remove(arg.dumpF) else: - try: - output_fn = arg.dumpF[:-4] - with open(output_fn, 'r') as f: - print('Create the database') - res = dump2msgp.extractAll(f, 'error.log', False) - msgPack2sqlite_msgPack.writeDB(arg.outputF, res) - print(f'Database { arg.outputF } created ! 👏 🎉') - except: - print('Failed to extract database') - print('Exiting (-3)') - exit(-3) - - print(f'Removing temporary files') + output_fn = arg.dumpF[:-4] + with open(output_fn, 'r') as f: + print('Create the database') + res = dump2msgp.extractAll(f, 'error.log', False) + msgPack2sqlite_msgPack.writeDB(arg.outputF, res) + print(f'Database { arg.outputF } created ! 👏 🎉') + + print('Removing temporary files') os.remove(output_fn) diff --git a/download/dump2msgp.py b/download/dump2msgp.py index 50372cf..e76115c 100644 --- a/download/dump2msgp.py +++ b/download/dump2msgp.py @@ -1,3 +1,39 @@ +#!/bin/env python + +"""dfr - Dump to msgpack + +Extract words from the Wiktionary archive. All the parsing is done here. +The product of this script is a MessagePack file that stores all the information in +an easily editable and dev-friendly format. + +More information on MessagePack (msgpack): https://msgpack.org/ + + +This script accepts the following command line options. + + + + --output + The filename of the msgpack file to write. + + + --input + The filename of the decompressed Wiktionary dump. + + + --error + The filename that will log errors related to parsing. + Wiktionary is a community-edited platform, so there are a lot of + formatting mistakes. This script will report everything that it + doesn't understand in that file. + + + --ignore + By default, this script stops on the first error. But as said earlier, + there are a lot of mistakes in the Wiktionary archive dump, so this option + is intended to ignore errors and just continue. + Errors are still logged though. 
+ + +""" + + import tempfile as tmp import re import sys @@ -8,12 +44,6 @@ from sectionList import listInfoSection from template import template -""" - -Extract words from the Wiktionnary archive - -""" - DEFAULT_OUTPUT = 'dfr.msgpk' @@ -259,14 +289,17 @@ def extractAll(f, errorF, ignore): dict_ = dict() - cleanr = re.compile('<.*?>') + i = 0 + + clearHTML = re.compile('<.*>', re.IGNORECASE) for line in f: + i += 1 if "" in line and tf: tf.seek(0) - i = extract(tf, title, errorF) + tmpInfo = extract(tf, title, errorF) - dict_[title] = i + dict_[title] = tmpInfo tf.close() tf = None @@ -307,7 +340,7 @@ def extractAll(f, errorF, ignore): tf.seek(0) while line2 := tf.readline(): print(line2, end='') - print(line) + print(f"{i}: {line}") exit(-1) else: @@ -315,12 +348,18 @@ def extractAll(f, errorF, ignore): elif not hasForbidden and "== {{langue|" in line: isFr = False if not hasForbidden and isFr and tf: + start = "" + try: + start = line.split()[0] + except: + pass + tLine = clearHTML.sub('', line.replace('<br>', '\n' + start + ' ')) try: - ind = line.index('') - tf.write(cleanr.sub('', line[:ind].replace('<br>', '\n'))) + ind = tLine.index('') + tf.write(tLine[:ind]) isEnd = True except: - tf.write(cleanr.sub('', line.replace('<br>', '\n'))) + tf.write(tLine) return dict_ -- cgit v1.2.3