blob: 1fddd85f64685c562c62af4b61ff4989762b217c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
import bz2
import sys
def _bz2_blocks(chunks):
    """Yield decompressed byte blocks from an iterable of bz2-compressed chunks.

    Concatenated bz2 streams (multistream archives) are decompressed back to
    back. Bytes following the end of a stream that do not start another valid
    bz2 stream are ignored and end the iteration.
    """
    decomp = bz2.BZ2Decompressor()
    for chunk in chunks:
        while chunk:
            if decomp.eof:
                # A finished decompressor raises EOFError when fed again, so
                # the remaining bytes must go to a fresh one (next stream).
                decomp = bz2.BZ2Decompressor()
                try:
                    block = decomp.decompress(chunk)
                except OSError:
                    # Trailing data is not a bz2 stream: stop quietly.
                    return
            else:
                block = decomp.decompress(chunk)
            # Without max_length the whole chunk is consumed unless the stream
            # ended inside it; the leftover is then exposed as unused_data.
            chunk = decomp.unused_data if decomp.eof else b''
            if block:
                yield block


def unbz2(file):
    """Decompress bz2 data incrementally and yield it line by line as text.

    Args:
        file: Iterable of ``bytes`` chunks holding bz2-compressed data.

    Yields:
        str: Each line decoded as UTF-8, including its trailing ``"\\n"``.
        If the decompressed data does not end with a newline, the final
        unterminated fragment is yielded as well.

    Raises:
        OSError: If the first stream is not valid bz2 data.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    tail = b''  # bytes of the current line that has not seen its '\n' yet
    for block in _bz2_blocks(file):
        # Splitting once per block avoids re-scanning and re-slicing the
        # remaining buffer for every single line.
        *lines, tail = (tail + block).split(b'\n')
        for line in lines:
            # b'\n' never occurs inside a multi-byte UTF-8 sequence, so each
            # complete line can be decoded on its own.
            yield (line + b'\n').decode("utf-8")
    if tail:
        yield tail.decode("utf-8")
# Read the compressed dump in 32768-byte blocks and print every item that
# unbz2 yields, without appending an extra newline.
with open('./wiktionary_dump.xml.bz2', 'rb') as dump:
    def read_block():
        return dump.read(32768)

    # iter(callable, sentinel) keeps calling read_block until it returns b''.
    for line in unbz2(iter(read_block, b'')):
        print(line, end='')
|