about summary refs log tree commit diff
path: root/download/bz2toDB.py
diff options
context:
space:
mode:
Diffstat (limited to 'download/bz2toDB.py')
-rw-r--r-- download/bz2toDB.py 25
1 files changed, 25 insertions, 0 deletions
diff --git a/download/bz2toDB.py b/download/bz2toDB.py
new file mode 100644
index 0000000..1fddd85
--- /dev/null
+++ b/download/bz2toDB.py
@@ -0,0 +1,25 @@
+import bz2
+import sys
+
+
def unbz2(file):
    """Lazily decompress bz2 data and yield the text one line at a time.

    Args:
        file: Iterable of ``bytes`` chunks that together form bz2-compressed
            data, in order (for example successive fixed-size reads of a
            file). Several bz2 streams concatenated back to back are
            decoded as one continuous text.

    Yields:
        str: Each UTF-8 decoded line, including its terminating newline.
        If the data does not end with a newline, the final partial line is
        yielded as well (without a newline) instead of being dropped.

    Raises:
        OSError: If the compressed data is invalid (raised by the bz2
            decompressor).
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    decomp = bz2.BZ2Decompressor()
    tail = b''  # bytes after the last newline seen so far (incomplete line)
    for chunk in file:
        while chunk:
            if decomp.eof:
                # A finished decompressor raises EOFError on further input;
                # whatever follows must be the start of another stream.
                decomp = bz2.BZ2Decompressor()
            data = decomp.decompress(chunk)
            # If the stream ended inside this chunk, the unread remainder
            # is handed to a fresh decompressor on the next iteration.
            chunk = decomp.unused_data if decomp.eof else b''
            if not data:
                continue
            # Splitting on the newline *byte* is safe before decoding:
            # 0x0A never occurs inside a multi-byte UTF-8 sequence.
            pieces = (tail + data).split(b'\n')
            tail = pieces.pop()  # b'' when data ended exactly on a newline
            for piece in pieces:
                yield (piece + b'\n').decode("utf-8")
    if tail:
        yield tail.decode("utf-8")
+
+
# Stream the compressed dump through unbz2 in 32768-byte reads and echo
# every item it yields to stdout; end='' because print must not add a
# second newline to what unbz2 already produced.
with open('./wiktionary_dump.xml.bz2', 'rb') as dump:
    # iter(callable, sentinel) keeps reading until read() returns b'' (EOF).
    for line in unbz2(iter(lambda: dump.read(32768), b'')):
        print(line, end='')