From 3f90a60d0f793084eebbbd15a26b7af2acdeac48 Mon Sep 17 00:00:00 2001 From: ache Date: Thu, 16 Sep 2021 03:42:56 +0200 Subject: Command every scripts --- download/dump2msgp.py | 65 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 13 deletions(-) (limited to 'download/dump2msgp.py') diff --git a/download/dump2msgp.py b/download/dump2msgp.py index 50372cf..e76115c 100644 --- a/download/dump2msgp.py +++ b/download/dump2msgp.py @@ -1,3 +1,39 @@ +#!/bin/env python + +"""dfr - Dump to msgpack + +Extract words from the Wiktionary archive. All the parsing is done here. +The product of this script is a MessagePack file that stores all the information in +an easily editable and developer-friendly format. + +More information on MessagePack (msgpack) : + + +This script accepts the following command-line options. + + + + --output + The filename of the msgpack file to write. + + + --input + The filename of the decompressed Wiktionary dump. + + + --error + The name of the file in which parsing errors are logged. + Wiktionary is a community-edited platform, so there are a lot of + formatting mistakes. This script will report everything that it + doesn't understand in that file. + + + --ignore + By default, this script stops on the first error. But as noted above, + there are a lot of mistakes in the Wiktionary archive dump, so this option + is intended to ignore errors and just continue. + Errors are still logged though. 
+ + +""" + + import tempfile as tmp import re import sys @@ -8,12 +44,6 @@ from sectionList import listInfoSection from template import template -""" - -Extract words from the Wiktionnary archive - -""" - DEFAULT_OUTPUT = 'dfr.msgpk' @@ -259,14 +289,17 @@ def extractAll(f, errorF, ignore): dict_ = dict() - cleanr = re.compile('<.*?>') + i = 0 + + clearHTML = re.compile('<.*>', re.IGNORECASE) for line in f: + i += 1 if "" in line and tf: tf.seek(0) - i = extract(tf, title, errorF) + tmpInfo = extract(tf, title, errorF) - dict_[title] = i + dict_[title] = tmpInfo tf.close() tf = None @@ -307,7 +340,7 @@ def extractAll(f, errorF, ignore): tf.seek(0) while line2 := tf.readline(): print(line2, end='') - print(line) + print(f"{i}: {line}") exit(-1) else: @@ -315,12 +348,18 @@ def extractAll(f, errorF, ignore): elif not hasForbidden and "== {{langue|" in line: isFr = False if not hasForbidden and isFr and tf: + start = "" + try: + start = line.split()[0] + except: + pass + tLine = clearHTML.sub('', line.replace('<br>', '\n' + start + ' ')) try: - ind = line.index('') - tf.write(cleanr.sub('', line[:ind].replace('<br>', '\n'))) + ind = tLine.index('') + tf.write(tLine[:ind]) isEnd = True except: - tf.write(cleanr.sub('', line.replace('<br>', '\n'))) + tf.write(tLine) return dict_ -- cgit v1.2.3