"""Stream a bz2-compressed text file to stdout, one decoded line at a time."""

import bz2
import sys
from typing import Iterable, Iterator

# Path used when the script is run without a command-line argument.
DEFAULT_DUMP_PATH = './wiktionary_dump.xml.bz2'
# Number of compressed bytes read from disk per chunk.
READ_SIZE = 32768


def unbz2(file: Iterable[bytes]) -> Iterator[str]:
    """Incrementally decompress bz2 data and yield it as UTF-8 text lines.

    Args:
        file: An iterable of ``bytes`` chunks of bz2-compressed data. Chunk
            boundaries are arbitrary; they need not align with lines or
            with bz2 stream boundaries.

    Yields:
        Each decompressed line as ``str``, including its trailing ``"\\n"``.
        If the data does not end with a newline, the final partial line is
        yielded as well (without a newline).

    Raises:
        OSError: If the input is not valid bz2 data.
        UnicodeDecodeError: If a decompressed line is not valid UTF-8.
    """
    decomp = bz2.BZ2Decompressor()
    buf = b''
    for chunk in file:
        while chunk:
            if decomp.eof:
                # A decompressor handles exactly one stream; concatenated
                # (multi-stream) archives need a fresh one per stream.
                decomp = bz2.BZ2Decompressor()
            buf += decomp.decompress(chunk)
            # Bytes following the end of a stream belong to the next one.
            chunk = decomp.unused_data if decomp.eof else b''

        # Split once per chunk; the last element is the (possibly empty)
        # unterminated remainder, which stays buffered for the next chunk.
        # Decoding per line is safe: byte 0x0A never occurs inside a
        # multi-byte UTF-8 sequence.
        lines = buf.split(b'\n')
        buf = lines.pop()
        for line in lines:
            yield line.decode("utf-8") + '\n'

    if buf:
        # Final line without a trailing newline.
        yield buf.decode("utf-8")


def main(path: str = DEFAULT_DUMP_PATH) -> None:
    """Decompress the bz2 file at *path* and print its text to stdout."""
    with open(path, 'rb') as f:
        chunks = iter(lambda: f.read(READ_SIZE), b'')
        for line in unbz2(chunks):
            print(line, end='')


if __name__ == '__main__':
    # Optional first argument overrides the default dump path.
    main(*sys.argv[1:2])