aboutsummaryrefslogtreecommitdiff
path: root/download/bz2toDB.py
blob: a0c2cd3a0322a3a71d0decf8ee697a8592befe07 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
""" Not a script

Don't run this file as a script.


This Python file stores functions related to the bz2 Python module and its
on-the-fly (streaming) method of decompression.

"""

import bz2
import sys


def _bz2_decompressed(chunks):
    """Yield the decompressed bytes of an iterable of bz2 chunks.

    Concatenated bz2 streams are decompressed one after another. Bytes that
    follow a complete stream but are not themselves bz2 data are ignored,
    which is also how ``bz2.BZ2File`` treats trailing junk.
    """
    decomp = bz2.BZ2Decompressor()
    for data in chunks:
        while data:
            if decomp.eof:
                # A decompressor refuses input once its stream has ended,
                # so every further stream needs a fresh one.
                decomp = bz2.BZ2Decompressor()
                try:
                    out = decomp.decompress(data)
                except OSError:
                    # Not the start of another stream: trailing junk.
                    return
            else:
                out = decomp.decompress(data)
            if out:
                yield out
            # When a stream ends inside this chunk, the bytes left over
            # belong to the next stream and must be fed again.
            data = decomp.unused_data if decomp.eof else b''


def unbz2(file):
    """Decompress bz2 data on the fly and yield it line by line.

    Args:
        file: Iterable of ``bytes`` chunks of bz2-compressed data, in order
            (for example ``iter(lambda: f.read(32768), b'')``). Chunk
            boundaries may fall anywhere in the compressed data.

    Yields:
        str: Every decompressed line decoded as UTF-8, with its trailing
        newline. If the data does not end with a newline, the last partial
        line is yielded too, without one.

    Raises:
        OSError: If the input does not start with valid bz2 data.
        UnicodeDecodeError: If a decompressed line is not valid UTF-8.
    """
    buf = b''
    for block in _bz2_decompressed(file):
        buf += block
        # Everything before the last newline is a run of complete lines; the
        # remainder (possibly empty) waits for more data. Only whole lines
        # are decoded, so a multi-byte UTF-8 sequence is never cut in half
        # (the byte 0x0A cannot occur inside one).
        *lines, buf = buf.split(b'\n')
        for line in lines:
            yield line.decode("utf-8") + '\n'
    if buf:
        # Data that does not end with a newline: flush the final line.
        yield buf.decode("utf-8")


# Demo driver: read the compressed dump in 32768-byte blocks and echo every
# item produced by unbz2() to stdout without adding extra newlines.
# NOTE(review): this runs at import time, not only when executed as a script.
with open('./wiktionary_dump.xml.bz2', 'rb') as dump:
    def _next_block():
        return dump.read(32768)

    # iter(callable, sentinel) stops as soon as read() returns b'' (EOF).
    blocks = iter(_next_block, b'')
    for line in unbz2(blocks):
        print(line, end='')