import tempfile as tmp
import re
import sys
import msgpack
import argparse

from sectionList import listInfoSection
from template import template


"""

Extract words from the Wiktionary archive

"""

# Output filename used by the command-line entry point by default.
DEFAULT_OUTPUT = "dfr.msgpk"


# Template names that transclusion() replaces with their first argument
# (or with the headword when no argument is given).
template_second = [
    'link', 'bd', 'pc', 'nom w pc', 'w', 'smcp',
    'lien', 'ws', 'in', 'siècle2', 'fchim', 'nobr',
    'wp', 'r', 'clé de tri', 'contexte', 'emploi', 'l',
    'polytonique', 'pron-API', 'registre', 'scmp', 'siècle', 'x',
]


def _reference_needed(x):
    """Format *x* as a parenthesised "reference needed" note."""
    return '(Référence nécessaire : ' + x + ')'


# Templates rendered from their *second* argument (see transclusion()).
# Both spellings of the template name share the same formatter.
template_second_lambda_trd = dict.fromkeys(('refnec', 'refnéc'),
                                           _reference_needed)

def _superscript(default):
    """Return a formatter wrapping its argument in ``^{...}``.

    When the argument is empty, *default* is used as the content instead.
    """
    return lambda x: '^{' + (x if x else default) + '}'


def _subscript(x):
    """Wrap *x* in ``_{...}``."""
    return '_{' + x + '}'


def _parenthesised_title(x):
    """Return *x* title-cased and wrapped in parentheses."""
    return '(' + x.title() + ')'


def _variante(x):
    """Render a "variante de" template."""
    return 'Variante de ' + x


def _variante_ortho(x):
    """Render a "variante orthographique de" template."""
    return 'Variante orthographique de ' + x


def _sic(x):
    """Render a "sic" template as a superscript note."""
    return '^{sic ' + x + '}'


# Templates rendered from their *first* argument (see transclusion()).
#
# The conditional expressions are parenthesised explicitly: without the
# parentheses, ``'^{' + x if x else 'e' + '}'`` parses as
# ``('^{' + x) if x else ('e' + '}')`` and drops either the closing brace
# or the opening prefix.
template_second_lambda_snd = {
    'term': _parenthesised_title,
    'terme': _parenthesised_title,
    'ex': _superscript('e'),
    'exp': _superscript('e'),
    'e': _superscript('e'),
    'er': _superscript('er'),
    'ère': _superscript('ère'),
    'ème': _superscript('ème'),
    'Ier': _superscript('Ier'),
    'III': _superscript('III'),
    'small': _subscript,
    'indice': _subscript,
    'graphie': (lambda x: '«' + x + '»'),
    'petites capitales': (lambda x: x.upper()),
    'isbn': (lambda x: 'cf. ISBN ' + x),
    'OCLC': (lambda x: 'cf. OCLC ' + x),
    'variante de': _variante,
    'variante  de': _variante,
    'variante ortho de': _variante_ortho,
    'variante  ortho de': _variante_ortho,
    'variante ortho  de': _variante_ortho,
    'variante orthographique de': _variante_ortho,
    'sic !': _sic,
    'sic': _sic,
    'incise': (lambda x: '_' + x + '_'),
    'n°': (lambda x: 'n°' + x),
    'superlatif de': (lambda x: 'Superlatif de ' + x),
    'vérifier': (lambda x: '(À vérifier : ' + x + ')'),
}

# transclusion() looks template names up in lowercase, so mixed-case keys
# ('Ier', 'III', 'OCLC') also need a lowercase alias to be reachable.
template_second_lambda_snd.update(
    {name.lower(): formatter
     for name, formatter in list(template_second_lambda_snd.items())})

# Map each section's 'match' name to its position in listInfoSection.
dictMatch = {
    section['match']: position
    for position, section in enumerate(listInfoSection)
}

# Characters that disqualify a page title (checked in extractAll()).
interdit = ' :'


def transclusion(trans, info, errorF):
    """Render one wiki template call as plain text.

    Args:
        trans: The template call including its surrounding braces,
            e.g. ``'{{lien|mot|fr}}'``.
        info: Entry being built; ``info['mot']`` is used as a fallback
            value and in error messages.
        errorF: Path of a file to which unknown templates are appended,
            or a falsy value to disable logging.

    Returns:
        The replacement text, or ``''`` when the template is not recognised.
    """
    trans = trans[2:-2]

    # Expand nested templates first, starting from the last opening marker.
    while '{{' in trans:
        l0 = trans.rfind('{{')
        l1 = trans.find('}}', l0)
        if l1 == -1:
            break
        l1 += 2
        trans = trans[:l0] + transclusion(trans[l0:l1], info, errorF) + trans[l1:]

    s = [part.strip() for part in trans.split('|')]
    name = s[0]
    lowered = name.lower()

    if name in template:
        return template[name]

    if lowered in template_second:
        return s[1] if len(s) > 1 else info['mot']

    if lowered.startswith('citation'):
        cit = name.split('/')
        if len(cit) == 4:
            return 'Par ' + cit[1] + ', ' + cit[2] + ', ' + cit[3]
        if len(cit) == 3:
            return 'Par ' + cit[1] + ', ' + cit[2]
        if len(cit) == 5:
            return cit[1] + '/' + cit[2] + '/' + cit[3] + ', ' + cit[4]
        if len(cit) <= 2:
            return ''
        return '/'.join(cit[1:])

    # Try the name as written before its lowercase form: some table keys
    # are mixed-case ('Ier', 'III', 'OCLC') and a lowercase-only lookup
    # could never reach them.
    for key in (name, lowered):
        if key in template_second_lambda_snd:
            return template_second_lambda_snd[key](s[1] if len(s) > 1 else '')

    for key in (name, lowered):
        if key in template_second_lambda_trd:
            return template_second_lambda_trd[key](s[2] if len(s) > 2 else '')

    if errorF:
        # Explicit UTF-8: the message contains accented text and arbitrary
        # headwords that a locale-dependent default encoding may reject.
        with open(errorF, 'a', encoding='utf-8') as err:
            print(name, file=err)
            print("Incompréhension de la transclusion {} du mot {}".format(
                  trans, info['mot']), file=err)
    return ''


# Level-3 section heading such as "=== {{S|nom|fr}} ==="; captures the
# section name (the first template argument).
_SECTION_HEADING = re.compile(r'^=== *\{\{ *S\|([^|]+)')


def extract(f, w, errorF):
    """Parse the wikitext of one page into a list of entries.

    Args:
        f: Text stream positioned at the start of the page's wikitext;
            only ``readline()`` is used.
        w: The page title, stored under ``'mot'`` in every entry.
        errorF: Path of the error log handed to ``wikiToMd``, or a falsy
            value.  "Example without definition" messages go to this file
            when given and to ``wiki_err.log`` otherwise.

    Returns:
        A list with one dict per recognised section (a heading whose name
        is a key of ``dictMatch``).  Each dict has the keys ``'mot'``,
        ``'cat-gram'``, ``'def'`` (list of ``{'def': ..., 'ex': [...]}``),
        ``'API'``, ``'infos'``, ``'genre'`` and ``'accord'``.
    """
    infoFin = []

    # Heading line that ended the previous section.  It is handed back to
    # the heading scanner instead of rewinding the stream: arithmetic on
    # text-mode tell() values is not reliable, and a character count is not
    # a byte count as soon as the heading contains non-ASCII characters.
    pending = None

    while True:
        info = {'mot': w,
                'cat-gram': None,
                'def': [],
                'API': None,
                'infos': [],
                'genre': '',
                'accord': None}

        # State 0: skip ahead to the next recognised section heading.
        found = False
        while True:
            if pending is not None:
                line, pending = pending, None
            else:
                line = f.readline()
            if not line:
                break
            if not line.startswith(('=== ', '==={')):
                continue
            heading = _SECTION_HEADING.match(line)
            if heading is None:
                continue
            nat = heading.group(1).strip()
            if nat in dictMatch:
                info['cat-gram'] = nat
                found = True
                break

        if not found:
            break

        # State 1: collect the content of the section.
        while line := f.readline():
            if line.startswith('{{fr-'):
                end = line.find('}}')
                if end == -1:
                    continue
                infos = [part.strip() for part in line[:end].split('|')]
                info['infos'] = infos
                info['accord'] = infos[0]
                if len(infos) > 1:
                    info['API'] = infos[1]

            if line.rstrip().startswith("'''"):
                p0 = line.find('{{pron')
                if p0 != -1:
                    p1 = line.find('}}', p0)
                    if p1 != -1:
                        p = line[p0:p1 + 2].split('|')
                        # A template without arguments has no field to read.
                        if len(p) > 1:
                            info['API'] = p[1]
                if '{{m}}' in line:
                    info['genre'] = 'mas'
                elif '{{f}}' in line:
                    info['genre'] = 'fem'

            if line.startswith('# '):
                info['def'].append({'def': wikiToMd(line[2:], info, errorF)})
            elif line.startswith('#* '):
                if not info['def']:
                    with open(errorF or 'wiki_err.log', 'a',
                              encoding='utf-8') as err:
                        print("Exemple sans définition pour le mot {}".format(
                              info['mot']), file=err)
                elif 'ex' in info['def'][-1]:
                    info['def'][-1]['ex'].append(wikiToMd(line[3:], info, errorF))
                else:
                    info['def'][-1]['ex'] = [wikiToMd(line[3:], info, errorF)]
            elif line.startswith('#') and not line.startswith('##'):
                info['def'].append({'def': wikiToMd(line[1:], info, errorF)})

            if line.startswith('==='):
                # The next heading closes this section; let state 0 see it.
                pending = line
                break

        infoFin.append(info)

    return infoFin


def wikiToMd(line, info, errorF):
    """Convert one line of wikitext to its plain/Markdown-like form.

    Three passes are applied in order:
      1. templates ``{{...}}`` are replaced through ``transclusion``;
      2. links ``[[target|label]]`` are replaced by their label (or by the
         target when there is no label);
      3. ``'''`` becomes ``*`` and ``''`` is removed.

    ``info`` and ``errorF`` are only forwarded to ``transclusion``.
    """
    text = line.strip()

    # Pass 1: templates, always handling the last opening marker first.
    while True:
        start = text.rfind('{{')
        if start < 0:
            break
        close = text.find('}}', start)
        if close < 0:
            # Unterminated template: leave the remaining text untouched.
            break
        end = close + 2
        expanded = transclusion(text[start:end], info, errorF)
        text = text[:start] + expanded + text[end:]

    # Pass 2: links, same right-to-left strategy.
    while (start := text.rfind('[[')) != -1:
        close = text.find(']]', start)
        if close == -1:
            break
        target, *labels = text[start + 2:close].split('|')
        shown = labels[0] if labels else target
        text = text[:start] + shown + text[close + 2:]

    # Pass 3: inline style markers (bold first, since it contains "''").
    text = text.replace("'''", '*')
    return text.replace("''", '')


def extractAll(f, errorF, ignore):
    """Extract the French entries of every page of a Wiktionary XML dump.

    The dump is read line by line.  For each page whose title contains none
    of the characters in ``interdit``, the wikitext of the French section is
    copied to a temporary file, which is parsed with ``extract`` when the
    closing ``</page>`` tag is reached.

    Args:
        f: Iterable of text lines (typically the opened dump file).
        errorF: Path of the error log, or a falsy value.
        ignore: When true, a second French section in the same page is
            accepted silently.  Otherwise it is logged to ``errorF`` or,
            without a log file, printed before the program exits.

    Returns:
        A dict mapping each page title to the list returned by ``extract``.
    """
    title = ""
    isFr = False
    hasForbidden = False
    hasText = False
    isEnd = False
    tf = None

    dict_ = {}

    # Escaped HTML/XML tags ("&lt;...&gt;") are stripped from the wikitext.
    cleanr = re.compile('&lt;.*?&gt;')

    try:
        for line in f:
            if "</page>" in line:
                if tf is not None:
                    tf.seek(0)
                    dict_[title] = extract(tf, title, errorF)
                    tf.close()
                # Reset the per-page state whether or not the page had
                # French content.
                tf = None
                hasForbidden = False
                hasText = False
                isFr = False
                isEnd = False
                title = ""
            if isEnd:
                continue

            if "<title>" in line:
                title = line[line.find('>') + 1:]
                title = title[:title.find('<')]
                if any(c in title for c in interdit):
                    hasForbidden = True

            # Every remaining step only applies to pages with a valid title.
            if hasForbidden:
                continue

            if '<text bytes="' in line and '" xml:space="preserve">' in line:
                hasText = True

            if "== {{langue|fr}}" in line and hasText:
                isFr = True
                if tf is None:
                    # Explicit UTF-8 so any headword can be stored whatever
                    # the locale is.
                    tf = tmp.NamedTemporaryFile(mode="w+t", encoding="utf-8")
                elif not ignore:
                    if errorF:
                        with open(errorF, 'a', encoding='utf-8') as err:
                            print(f"{title}: Erreur tf encore ouvert !",
                                  file=err)
                    else:
                        print(f"{title}: Erreur tf encore ouvert !")
                        tf.seek(0)
                        while line2 := tf.readline():
                            print(line2, end='')
                        print(line)

                        sys.exit(-1)
            elif "== {{langue|" in line:
                isFr = False

            if isFr and tf is not None:
                end = line.find('</text>')
                if end != -1:
                    # Last line of the page text: keep what precedes the tag
                    # and ignore the rest of the page.
                    tf.write(cleanr.sub('', line[:end].replace('&lt;br&gt;', '\n')))
                    isEnd = True
                else:
                    tf.write(cleanr.sub('', line.replace('&lt;br&gt;', '\n')))
    finally:
        # Do not leak the temporary file when the input stops in the middle
        # of a page or when an error interrupts the loop.
        if tf is not None:
            tf.close()

    return dict_


if __name__ == '__main__':
    # Command-line entry point: read a Wiktionary dump, extract the French
    # entries and write them to a msgpack file.
    parser = argparse.ArgumentParser(description='wiktionary dump to msgpack')
    # nargs='?' lets -o take a filename while a bare "-o" still selects the
    # default; the former 'store_const' action could never accept a filename.
    parser.add_argument('-o', '--out', dest='outputF', nargs='?',
                        const=DEFAULT_OUTPUT, default=DEFAULT_OUTPUT,
                        help='the output filename')
    parser.add_argument('-i', '--input', dest='inputF', action='store',
                        help='the input filename, a dump of wiktionary')
    parser.add_argument('-e', '--error', dest='errorF', action='store',
                        help='the filename to log errors')
    parser.add_argument('--ignore', dest='ignoreError', action='store_true',
                        help='do not report or abort on a repeated French '
                             'section within a page')

    arg = parser.parse_args()

    if arg.inputF is None:
        print('A wiktionary dump is needed', file=sys.stderr)
        sys.exit(-1)

    # Read the dump as UTF-8 explicitly instead of relying on the locale.
    with open(arg.inputF, 'r', encoding='utf-8') as dump:
        res = extractAll(dump, arg.errorF, arg.ignoreError)

    # The input is closed before the output is written.
    with open(arg.outputF, 'wb') as out:
        out.write(msgpack.packb(res))