blob: a0c2cd3a0322a3a71d0decf8ee697a8592befe07 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
""" Not a script
Do not run this file as a script.
This Python file stores functions related to the bz2 Python module, in
particular the on-the-fly (streaming) method of decompression.
"""
import bz2
import sys
def unbz2(file):
    """Decompress a bz2 byte stream on the fly and yield it line by line.

    Args:
        file: An iterable of ``bytes`` chunks containing bz2-compressed
            data, e.g. ``iter(lambda: f.read(32768), b'')``.

    Yields:
        str: Each line of the decompressed data decoded as UTF-8, with its
        trailing newline kept. If the data does not end with a newline, the
        final partial line is yielded too (without a newline).

    Raises:
        OSError: If the input is not valid bz2 data.
        UnicodeDecodeError: If a decompressed line is not valid UTF-8.
    """
    decomp = bz2.BZ2Decompressor()
    buf = b''
    for chunk in file:
        while chunk:
            if decomp.eof:
                # One BZ2Decompressor handles exactly one stream; start a
                # fresh one so concatenated (multi-stream) archives work.
                decomp = bz2.BZ2Decompressor()
            buf += decomp.decompress(chunk)
            # Bytes past the end of a stream belong to the next stream.
            chunk = decomp.unused_data if decomp.eof else b''
        if b'\n' not in buf:
            continue
        # Split once per chunk; the last piece is the (possibly empty)
        # incomplete line, carried over to the next chunk. Splitting on the
        # newline byte is safe for UTF-8: 0x0A never occurs inside a
        # multi-byte sequence.
        *lines, buf = buf.split(b'\n')
        for line in lines:
            yield (line + b'\n').decode("utf-8")
    if buf:
        # Flush a final line that has no terminating newline.
        yield buf.decode("utf-8")
# Demo driver: stream the local Wiktionary dump through unbz2 and echo it.
# NOTE(review): this runs at import time; consider moving it under an
# ``if __name__ == "__main__":`` guard if this module is meant to be imported.
with open('./wiktionary_dump.xml.bz2', 'rb') as dump:
    # Two-argument iter(): call the reader until it returns b'' (end of file),
    # producing the compressed data in 32 KiB chunks.
    compressed_chunks = iter(lambda: dump.read(32768), b'')
    for item in unbz2(compressed_chunks):
        # Items carry their own line endings, so suppress print's newline.
        print(item, end='')
|