Command every scripts

author: ache <ache@ache.one> 2021-09-16 03:42:56 +0200
committer: ache <ache@ache.one> 2021-09-16 03:42:56 +0200
commit: 3f90a60d0f793084eebbbd15a26b7af2acdeac48 (patch)
tree: e99e04522ee1144b428161c4723b1d76c2470354
parent: Add the dfrf command (diff)
3 files changed, 138 insertions, 25 deletions
diff --git a/download/bz2toDB.py b/download/bz2toDB.py
index 1fddd85..a0c2cd3 100644
--- a/download/bz2toDB.py
+++ b/download/bz2toDB.py
@@ -1,3 +1,13 @@
+""" Not a script
+
+Don't use that script
+
+
+This Python file stores functions related to the bz2 Python module, and thus
+to the on-the-fly method of decompression.
+
+"""
+
 import bz2
 import sys
 
diff --git a/download/download.py b/download/download.py
index b97bb8f..21137f0 100755
--- a/download/download.py
+++ b/download/download.py
@@ -1,5 +1,74 @@
 #!/bin/env python
 
+"""dfr - Prepare database
+
+This script will download the latest Wiktionary dump from Wikimedia, extract it,
+process it and then write the result to a database.
+The result of this script is an SQLite database file usable with dfr.py.
+
+As downloading the dump can be challenging, you can specify an input file that
+will be used instead of the latest dump from Wikimedia. That is the purpose of
+the command line option `--input`.
+
+###########################
+⚠️ WARNING: Be aware that a compressed Wiktionary archive dump is big.
+A decompressed one is bigger still, so a lot of disk space is
+needed to store these files.
+
+⚠️ WARNING: Be aware that Wiktionary dumps are really big and that a lot of
+memory is needed to process them. It's NOT recommended to try to create a
+database file on a computer with less than 2 GiB of memory, even using the
+on-the-fly decompression method.
+###########################
+
+There are only a few other command line options that can be used with this script.
+
++ --output
+    The name of the final SQLite database. By default it's `dfr.db`. dfr.py
+    will expect the database to be at the root of the project and to be
+    named `dfr.db`, so by default you don't have to modify anything here.
+
++ --input
+    As described earlier, if you have already downloaded the Wiktionary dump,
+    you can set the location of the file that will be used here.
+    The purpose is to avoid re-downloading the dump each time.
+
+    The "download" and "input" options are incompatible.
+
++ --download
+    Force the download of the file even if the file is already downloaded.
+    It is used to force an update of the database.
+
+    The "download" and "input" options are incompatible.
+
++ --word-list
+    Not useful for the moment. dfr will have a functionality to list every
+    word in the dictionary. You will be able to filter words based on a regex
+    and, optionally, there will be an option to auto-correct a word, but nothing
+    is implemented here yet.
+    In all cases, you can specify the name of the file that will store
+    every word.
+
+Note: This script has several ways to decompress the Wiktionary archive dump.
+
+First, it will try the `bzip2` command; if that fails, this script will try
+to extract the bzip2 file on the fly using the bz2 Python module.
+
+ - The `bzip2` command is very fast but uses a lot of disk space and consumes a
+lot of memory (RAM) to quickly decompress the file.
+ - The `bz2` Python module isn't that fast but uses less memory, and the parsing
+is also done on the fly.
+
+In every case, a lot of memory (RAM) is necessary to process the last wiktionary dump.
+
+"""
+
+# TODO: Add an option to set URL of the DUMP. To cache another file than the latest or for another language than fr.
+
+# TODO: Add an option to choose the extract method.
+
+# TODO: Optimize the bz2 module process to write the msgpack file on the fly. The goal is to never store a lot of information in memory. This optimization could greatly reduce memory (RAM) usage and possibly allow creation of the database on low-memory computers (less than 2 GiB).
+
 import argparse
 import sys
 import urllib.request
@@ -145,17 +214,12 @@ if __name__ == '__main__':
         print(f'Removing temporary files')
         os.remove(arg.dumpF)
     else:
-        try:
-            output_fn = arg.dumpF[:-4]
-            with open(output_fn, 'r') as f:
-                print('Create the database')
-                res = dump2msgp.extractAll(f, 'error.log', False)
-                msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
-            print(f'Database { arg.outputF } created ! 👏 🎉')
-        except:
-            print('Failed to extract database')
-            print('Exiting (-3)')
-            exit(-3)
+        output_fn = arg.dumpF[:-4]
+        with open(output_fn, 'r') as f:
+            print('Create the database')
+            res = dump2msgp.extractAll(f, 'error.log', False)
+            msgPack2sqlite_msgPack.writeDB(arg.outputF, res)
+        print(f'Database { arg.outputF } created ! 👏 🎉')
 
-        print(f'Removing temporary files')
+        print('Removing temporary files')
         os.remove(output_fn)
diff --git a/download/dump2msgp.py b/download/dump2msgp.py
index 50372cf..e76115c 100644
--- a/download/dump2msgp.py
+++ b/download/dump2msgp.py
@@ -1,3 +1,39 @@
+#!/bin/env python
+
+"""dfr - Dump to msgpack
+
+Extract words from the Wiktionary archive. All the parsing is done here.
+The product of this script is a MessagePack file that stores all the information
+in an easily editable and developer-friendly format.
+
+More information on MessagePack (msgpack):
+<https://msgpack.org/>
+
+Here are the command line options that this script can deal with.
+
+
+ + --output
+    The filename of the msgpack file to write.
+
+ + --input
+    The filename of the decompressed wiktionary dump.
+
+ + --error
+    The name of the file in which parsing errors are logged.
+    Wiktionary is a community-edited platform, so there are a lot of
+    formatting mistakes. This script will report everything that it
+    doesn't understand in that file.
+
+ + --ignore
+    By default, this script stops on the first error. But as said earlier,
+    there are a lot of mistakes in the Wiktionary archive dump, so this option
+    is intended to ignore errors and just continue.
+    Errors are still logged though.
+
+
+"""
+
+
 import tempfile as tmp
 import re
 import sys
@@ -8,12 +44,6 @@ from sectionList import listInfoSection
 from template import template
 
 
-"""
-
-Extract words from the Wiktionnary archive
-
-"""
-
 DEFAULT_OUTPUT = 'dfr.msgpk'
 
 
@@ -259,14 +289,17 @@ def extractAll(f, errorF, ignore):
 
     dict_ = dict()
 
-    cleanr = re.compile('&lt;.*?&gt;')
+    i = 0
+
+    clearHTML = re.compile('&lt;.*&gt;', re.IGNORECASE)
 
     for line in f:
+        i += 1
         if "</page>" in line and tf:
             tf.seek(0)
-            i = extract(tf, title, errorF)
+            tmpInfo = extract(tf, title, errorF)
 
-            dict_[title] = i
+            dict_[title] = tmpInfo
             tf.close()
 
             tf = None
@@ -307,7 +340,7 @@ def extractAll(f, errorF, ignore):
                         tf.seek(0)
                         while line2 := tf.readline():
                             print(line2, end='')
-                        print(line)
+                        print(f"{i}: {line}")
 
                         exit(-1)
             else:
@@ -315,12 +348,18 @@ def extractAll(f, errorF, ignore):
         elif not hasForbidden and "== {{langue|" in line:
             isFr = False
         if not hasForbidden and isFr and tf:
+            start = ""
+            try:
+                start = line.split()[0]
+            except:
+                pass
+            tLine = clearHTML.sub('', line.replace('&lt;br&gt;', '\n' + start + ' '))
             try:
-                ind = line.index('</text>')
-                tf.write(cleanr.sub('', line[:ind].replace('&lt;br&gt;', '\n')))
+                ind = tLine.index('</text>')
+                tf.write(tLine[:ind])
                 isEnd = True
             except:
-                tf.write(cleanr.sub('', line.replace('&lt;br&gt;', '\n')))
+                tf.write(tLine)
 
     return dict_
author	ache <ache@ache.one>	2021-09-16 03:42:56 +0200
committer	ache <ache@ache.one>	2021-09-16 03:42:56 +0200
commit	3f90a60d0f793084eebbbd15a26b7af2acdeac48 (patch)
tree	e99e04522ee1144b428161c4723b1d76c2470354
parent	Add the dfrf command (diff)