From f6586d1862e6fe1acb617b6219901357e8ea6345 Mon Sep 17 00:00:00 2001
From: ache
Date: Thu, 6 Jan 2022 03:54:18 +0100
Subject: Ajout d'une correction orthographique

---
Note: this copy was rebuilt from a version of the patch in which every run
of whitespace had been collapsed. Indentation, blank lines and the wrapping
of context lines are a best-effort reconstruction and may have to be
adjusted before the patch applies. The added code was also revised (see the
docstrings), so the blob id on the "index" line no longer describes the
result.

 dfr.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 103 insertions(+), 6 deletions(-)

diff --git a/dfr.py b/dfr.py
index e664264..1ed5d40 100755
--- a/dfr.py
+++ b/dfr.py
@@ -11,6 +11,7 @@ Maybe extended to other languages later.
 import sys
 import argparse
 import msgpack
+import gzip
 import sqlite3
 from os.path import exists
 import os
@@ -27,6 +28,72 @@ else:
     dico = 'dfr.db'
 
 
+def initWordList():
+    """Make sure ``arg.wordList`` points at a usable word list.
+
+    The default list is generated from the dictionary database when it does
+    not exist yet. A list given with ``--word-list`` is left untouched.
+    """
+    default = f'{DIR_PATH}/assets/wordList'
+    if not arg.wordList or (arg.wordList == default and not exists(default)):
+        create_wordlist()
+        arg.wordList = default
+
+
+# Letters tried when building spelling variants: a-z plus the accented
+# letters and ligatures used in French.
+_ALPHABET = "abcdefghijklmnopqrstuvwxyzàâäçéèêëîïôöùûüÿœæ"
+
+
+def _edits(word):
+    """Yield the strings that are one edit away from ``word``.
+
+    An edit inserts one letter (anywhere, including after the last letter),
+    replaces one letter or deletes one letter. The same string may be
+    yielded several times.
+    """
+    for i in range(len(word) + 1):
+        head, tail = word[:i], word[i:]
+        for c in _ALPHABET:
+            yield head + c + tail  # insertion
+            if tail and c != tail[0]:
+                yield head + c + tail[1:]  # replacement
+        if tail:
+            yield head + tail[1:]  # deletion
+
+
+def didYouMean(word):
+    """Return the words of the word list that look like ``word``.
+
+    The result holds ``word`` itself (when it is listed) and the listed
+    words one edit away from it. When there are none, the listed words two
+    edits away are returned instead. The result is empty when nothing
+    close is listed or when the word list cannot be found.
+    """
+    if not exists(arg.wordList):
+        print(f'Error: Word list {arg.wordList} not found', file=sys.stderr)
+        return []
+
+    with gzip.open(arg.wordList, 'rb') as f:
+        content = f.read().decode('utf-8')
+    known = {w for w in content.split('\n') if w}
+
+    seen = {word}
+    found = [word] if word in known else []
+    frontier = [word]
+    for _ in range(2):
+        reached = []
+        for base in frontier:
+            for candidate in _edits(base):
+                if candidate not in seen:
+                    seen.add(candidate)
+                    reached.append(candidate)
+        found += [w for w in reached if w in known]
+        if found:
+            break
+        frontier = reached
+    return found
+
 
 def get_def_sql(word):
     with sqlite3.connect(dico) as con:
@@ -45,6 +112,22 @@ def get_def_sql(word):
         }, res))
 
 
+def create_wordlist():
+    """Write the distinct words of the dictionary to the default word list.
+
+    The list goes to ``{DIR_PATH}/assets/wordList`` as gzip-compressed
+    UTF-8 text with one word per line.
+    """
+    with sqlite3.connect(dico) as con:
+        cur = con.cursor()
+        cur.execute('''SELECT word FROM entry''')
+        # Sorted so that the same database always gives the same file.
+        words = sorted({row[0] for row in cur.fetchall()})
+
+    with gzip.open(f"{DIR_PATH}/assets/wordList", 'wb') as f:
+        f.write("\n".join(words).encode('utf-8'))
+
+
 def get_def_sql_reg(word):
     with sqlite3.connect(dico) as con:
         cur = con.cursor()
@@ -73,9 +156,9 @@ def matching(word):
         print(f'Error: Word list {arg.wordList} not found', file=sys.stderr)
         return
 
-    with open(arg.wordList, 'rb') as f:
-        msgpackList = f.read()
-        listWord = msgpack.unpackb(msgpackList, raw=False)
+    with gzip.open(arg.wordList, 'r') as f:
+        msgpackList = f.read().decode()
+        listWord = msgpackList.split('\n')
 
     if word[0] != '/':
         for w in listWord:
@@ -113,7 +196,7 @@ if __name__ == '__main__':
                         help='the word or the pattern to match')
     parser.add_argument('-l', '--word-list', dest='wordList', action='store',
                         help='the filename of the word list',
-                        default=None)
+                        default=f'{DIR_PATH}/assets/wordList')
     parser.add_argument('-d', '--dico', dest='dico', action='store',
                         help='the filename of the dictionnary',
                         default='dfr.db')
@@ -149,6 +232,8 @@ if __name__ == '__main__':
         dico = f'{DIR_PATH}/assets/{dico}'
 
     if arg.matching:
+        initWordList()
+
         ret = matching(arg.word)
         for word in ret:
             print(word)
@@ -157,11 +242,23 @@ if __name__ == '__main__':
     else:
         if arg.first:
             a = arg.action(arg.word)
-            if a :
+            if a:
                 if a[0]['def']:
                     a[0]['def'] = [a[0]['def'][0]]
                     a[0]['def'][0]['ex'] = []
                 ui.show_terminal(a[0])
         else:
-            for w in arg.action(arg.word):
+            res = arg.action(arg.word)
+            for w in res:
                 ui.show_terminal(w)
+            if not res:
+                print("Word not found")
+
+                initWordList()
+                listVar = didYouMean(arg.word)
+
+                if len(listVar) == 1:
+                    print(f"Did you mean {listVar[0]}?")
+                elif len(listVar) > 1:
+                    others = ", ".join(listVar[:-1])
+                    print(f"Did you mean {others} or {listVar[-1]}?")
-- 
cgit v1.2.3