diff options
Diffstat (limited to 'download/bz2toDB.py')
-rw-r--r-- | download/bz2toDB.py | 25 |
1 files changed, 25 insertions, 0 deletions
"""Stream a bz2-compressed Wiktionary dump to stdout, one line at a time."""
import bz2
import sys


def unbz2(file):
    """Incrementally decompress bz2 data and yield it as text lines.

    Args:
        file: Iterable of ``bytes`` chunks of bz2-compressed data. The
            chunks may be cut at arbitrary byte offsets and may contain
            several concatenated bz2 streams.

    Yields:
        str: Each line decoded as UTF-8, with its trailing newline kept.
        If the data does not end with a newline, the final partial line
        is yielded as-is once the input is exhausted.

    Raises:
        OSError: If the compressed data is invalid.
        UnicodeDecodeError: If a complete line is not valid UTF-8.
    """
    decomp = bz2.BZ2Decompressor()
    buf = b''
    for c in file:
        while c:
            # A BZ2Decompressor handles exactly one stream; start a new one
            # for whatever follows the end-of-stream marker.
            if decomp.eof:
                decomp = bz2.BZ2Decompressor()
            buf += decomp.decompress(c)
            # Bytes after the end of a stream belong to the next stream.
            c = decomp.unused_data if decomp.eof else b''

        if b'\n' not in buf:
            continue
        # Splitting on the newline byte is safe before decoding: 0x0A never
        # occurs inside a multi-byte UTF-8 sequence. The last piece is the
        # (possibly empty) incomplete line and stays buffered.
        *lines, buf = buf.split(b'\n')
        for line in lines:
            yield (line + b'\n').decode("utf-8")

    # Flush a final line that has no trailing newline.
    if buf:
        yield buf.decode("utf-8")


def main():
    """Decompress ./wiktionary_dump.xml.bz2 and print its lines to stdout."""
    with open('./wiktionary_dump.xml.bz2', 'rb') as f:
        # Read fixed-size chunks until read() returns b'' at end of file.
        it = iter(lambda: f.read(32768), b'')
        for a in unbz2(it):
            print(a, end='')


if __name__ == '__main__':
    main()