# path: root/download/bz2toDB.py
# blob: 1fddd85f64685c562c62af4b61ff4989762b217c
import bz2
import sys


def unbz2(file):
    """Incrementally decompress bz2 data and yield it line by line as text.

    Args:
        file: An iterable of ``bytes`` chunks holding bz2-compressed data, in
            order. Chunk boundaries are arbitrary; empty chunks are skipped.
            Several bz2 streams concatenated back to back are decompressed
            as one continuous text.

    Yields:
        str: Each line decoded as UTF-8, including its trailing newline.
        If the decompressed data does not end with a newline, the final
        partial line is yielded too (without a terminator).

    Raises:
        OSError: If the input is not valid bz2 data.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    decomp = bz2.BZ2Decompressor()
    buf = b''
    for c in file:
        while c:
            if decomp.eof:
                # A BZ2Decompressor handles exactly one stream; anything
                # after its end belongs to the next stream and needs a
                # fresh decompressor.
                decomp = bz2.BZ2Decompressor()
            buf += decomp.decompress(c)
            # Bytes past the end of a finished stream are handed back via
            # unused_data; feed them to the next decompressor.
            c = decomp.unused_data if decomp.eof else b''

        if b'\n' not in buf:
            continue

        # Split only on b'\n'. The last piece is the (possibly empty)
        # unterminated tail, kept for the next chunk. Decoding happens per
        # complete line, so a multi-byte character cut by a chunk boundary
        # is never decoded in halves.
        *lines, buf = buf.split(b'\n')
        for line in lines:
            yield (line + b'\n').decode("utf-8")

    # Do not drop a final line that lacks a trailing newline.
    if buf:
        yield buf.decode("utf-8")


# Script body: stream the compressed dump from disk and echo it to stdout.
with open('./wiktionary_dump.xml.bz2', 'rb') as dump:
    # Two-argument iter(): keep calling dump.read(32768) until it returns
    # b'' (end of file), so the archive is read in fixed-size chunks.
    chunks = iter(lambda: dump.read(32768), b'')
    lines = unbz2(chunks)
    for line in lines:
        # unbz2 keeps the line terminators in what it yields, so print
        # must not append another one.
        print(line, end='')