#!/usr/bin/env python3
"""Download every image linked from a 4chan page into a local directory.

The page is fetched with ``wget``, rewritten with ``sed`` so that every anchor
ends its own line, then scanned line by line for image links. Each image is
downloaded with ``wget`` into the current directory and moved into the output
directory.
"""

import os
import re
import shutil
import subprocess
import sys
import tempfile

usage = """Usage:
    $ bot4chan "URL_BOARD_4CHAN" ["DIRECTORY"]

DIRECTORY is optional and by default is "dump\""""

# Use http instead of https to avoid the overhead of encryption
PROTOCOL = 'http'

# Group 1 must capture a protocol-relative URL ("//host/path/file.ext"),
# because the download URL is built as f"{PROTOCOL}:{image}".
# NOTE(review): the pattern found in the file was the empty string, which
# matches every line and has no group 1, so `group(1)` raised IndexError on
# the first line. This pattern is a reconstruction from how the capture is
# used -- confirm it against the markup the site currently serves.
REGEX = r'href="(//[^"\s<>]+\.(?:jpe?g|png|gif|webm))"'
IMAGE_LINK = re.compile(REGEX, re.IGNORECASE)

DEFAULT_DUMP_DIR = "dump"


def run(*command):
    """Run *command* without a shell and return its exit status.

    Arguments are passed as a list, so URLs and paths taken from the command
    line or scraped from the page are never interpreted by a shell.
    Returns 127 when the program cannot be started at all.
    """
    try:
        return subprocess.run(command, check=False).returncode
    except OSError:
        return 127


def find_image(line):
    """Return the protocol-relative image URL linked on *line*, or None."""
    match = IMAGE_LINK.search(line)
    return match.group(1) if match else None


def fetch_image(image, dump_dir):
    """Download *image* and move it into *dump_dir*.

    Returns True on success, False (after printing an error) otherwise.
    """
    print(f"Téléchargement de : {image}")
    if run("wget", f"{PROTOCOL}:{image}", "-N", "-q") != 0:
        print(f"💀 Error downloading imgage : {image}")
        return False

    # wget stores the file under the last path component of the URL.
    name = image[image.rfind('/') + 1:]
    try:
        # Give the full destination path so an existing file is replaced,
        # as `mv FILE DIR` would do.
        shutil.move(name, os.path.join(dump_dir, name))
    except OSError:
        print(f"💀 Error moving imgage to {dump_dir}")
        return False
    return True


def main(argv=None):
    """Run the downloader and return the process exit status.

    argv: full argument vector including the program name; defaults to
        ``sys.argv``. ``argv[1]`` is the page URL, optional ``argv[2]`` is
        the output directory.
    Returns 0 after printing the usage text or after processing the page
    (individual image failures are reported but do not change the status),
    and 1 when the page itself cannot be prepared.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) <= 1 or argv[1] in ('-h', '--help'):
        print(usage)
        return 0

    board_url = argv[1]
    dump_dir = argv[2] if len(argv) >= 3 else DEFAULT_DUMP_DIR

    # mkstemp returns an open descriptor; only the path is needed, so close
    # it immediately instead of leaking it.
    descriptor, filename = tempfile.mkstemp()
    os.close(descriptor)
    try:
        # Download the board
        if run("wget", "-O", filename, "-N", "-q", "--", board_url) != 0:
            print(f"💀 Error downloading board : {board_url}", file=sys.stderr)
            return 1

        # The result is on a single line, so put a line feed after every
        # link to space the markup out. sed is used instead of Python to
        # avoid reading the whole file into memory.
        if run("sed", "-i", r"s/<\/a>/<\/a>\n/g", filename) != 0:
            print(f"💀 Error splitting markup in {filename}", file=sys.stderr)
            return 1

        # Create the output directory
        try:
            os.makedirs(dump_dir, exist_ok=True)
        except OSError as error:
            print(f"💀 Error creating {dump_dir} : {error}", file=sys.stderr)
            return 1

        # Look for an image link on each line, since after the split one
        # line should hold a single link. Skip URLs already handled so the
        # same file is not fetched twice.
        seen = set()
        with open(filename, 'r', encoding='utf-8', errors='replace') as page:
            for line in page:
                image = find_image(line)
                if image is None or image in seen:
                    continue
                seen.add(image)
                fetch_image(image, dump_dir)
    finally:
        # Always remove the temporary page, even when a step above failed.
        os.remove(filename)
    return 0


if __name__ == "__main__":
    sys.exit(main())