aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorache <ache@ache.one>2024-04-03 11:24:52 +0200
committerache <ache@ache.one>2024-04-03 11:25:17 +0200
commita9b5e74e6f529ca2b451fb30bbc5b66b6baf37b3 (patch)
treeb4ac6f1cc351e05a62fc5e696e19a0cd904b4d58
parentAdd the clip2file tool (diff)
Update size of downloadHEADmaster
-rwxr-xr-xdlr34.py302
1 file changed, 302 insertions, 0 deletions
diff --git a/dlr34.py b/dlr34.py
new file mode 100755
index 0000000..5045274
--- /dev/null
+++ b/dlr34.py
@@ -0,0 +1,302 @@
+#!/bin/env python
+
+import os
+import re
+import requests
+import sys
+import csv
+import lxml
+import json
+from urllib.parse import urlparse
+from pathlib import Path
+import lxml.html
+import argparse
+
+
+# Print extra diagnostics (full URLs, exception details) when enabled.
+DEBUG_MODE = True
+# Default output directory; can be overridden by the -d/--directory option.
+dirOut = Path("r34")
+
+
+class Spinner(object):
+    """Terminal spinner: each call returns the next braille animation frame."""
+    def __init__(self):
+        # Frame glyphs and the current index into them.
+        self.SPINNER = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏']
+        self.SPINNER_status = 1
+
+    def __call__(self):
+        # Advance the frame index (wrapping around) and return the new frame.
+        self.SPINNER_status = (self.SPINNER_status + 1) % len(self.SPINNER)
+        return self.SPINNER[self.SPINNER_status]
+
+
+# Shared module-level spinner used in download progress output.
+quickSpinner = Spinner()
+
+
+def pretty_print_Request_header(req):
+    # Debug helper: dump a request's method, URL and headers to stdout.
+    print(f"{req.method} {req.url}")
+
+    for k, v in req.headers.items():
+        print(f"{k}: {v}")
+
+
+def outputFileFromURL(url):
+    """Map a URL to a destination path inside dirOut.
+
+    Returns (alreadyExists, path): alreadyExists is True when a file with
+    the URL's basename is already present; in that case path carries a
+    numeric suffix (name_1.ext, name_2.ext, ...) that does not exist yet.
+    """
+    u = urlparse(url)
+    name = Path(u.path).name
+    alreadyExists = False
+
+    finalDest = dirOut.joinpath(Path(name))
+
+    i = 1
+
+    # Probe name, name_1, name_2, ... until an unused filename is found.
+    while finalDest.exists():
+        alreadyExists = True
+
+        s_name = Path(u.path).stem
+        s_suffix = Path(u.path).suffix
+
+        next_name = f"{s_name}_{i}{s_suffix}"
+        i += 1
+
+        finalDest = dirOut.joinpath(Path(next_name))
+
+    return alreadyExists, str(finalDest)
+
+
+def downloadURL(s, url, idLink, output, headers={}):
+ r = s.get(url, stream=True, headers=headers)
+
+ u = urlparse(url)
+
+ if r.ok:
+ if DEBUG_MODE:
+ print(f"\n{url} >> {output}")
+
+ print(f"\r⬇️ {Path(u.path).name}", end='')
+
+ try:
+ with open(output, "wb") as f:
+ print(f"\r⬇️ {quickSpinner()} {Path(u.path).name}", end='')
+ chunk_size = 4 * 1024
+
+ for chunk in r.iter_content(chunk_size=chunk_size):
+ f.write(chunk)
+ print(f"\rDownloaded {Path(u.path).name} - {idLink}")
+ except Exception as e:
+ print(f"\r☠️ error {Path(u.path).name} - {idLink}")
+ if DEBUG_MODE:
+ print(e)
+
+def downloadFromItems(items, s, u):
+    """Download the first suitable media file of each playlist item.
+
+    items: decoded JSON objects from the playlist-item API; s: requests
+    session; u: parsed URL of the playlist page (used for the Referer).
+    """
+    # Map size-suffixed filename variants back to the original quality.
+    replace = {
+        'mov480.': 'mov.',
+        'mov720.': 'mov.',
+        'mov256.': 'mov.',
+        'pic256.': 'pic.',
+        'pic480.': 'pic.',
+        'pic720.': 'pic.',
+        'picsmall.': 'pic.'
+    }
+
+    for item in items:
+        isDownloaded = False
+        # Items carrying a duration are videos; only their .mp4 links count.
+        isVideo = item['duration'] is not None
+        for file in item['imageLinks']:
+            url = file['url']
+            if isVideo and not url.endswith('.mp4'):
+                continue
+
+            for r,t in replace.items():
+                if r in url:
+                    url = url.replace(r, t)
+            e, outputFile = outputFileFromURL(url)
+            if e:
+                # NOTE(review): aborts the WHOLE item list at the first
+                # already-present file — presumably "everything past this
+                # point was fetched by an earlier run"; confirm intent.
+                print("Already downloaded")
+                return
+
+            headers = {"Referer": f"{u.scheme}://{u.hostname}/"}
+            headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+            headers['Accept-encoding'] = "gzip, deflate, br"
+            downloadURL(s, url, "???", outputFile, headers)
+            isDownloaded = True
+            break
+        # NOTE(review): stops after the first downloaded item, so at most
+        # one file is fetched per call — verify this is intentional.
+        if isDownloaded:
+            break
+
+
+def downloadImagesFromPlaylist(index, link):
+ s = requests.Session()
+
+ u = urlparse(link)
+ p = Path(u.path)
+ playlistId = p.stem
+
+ print(f"Link n°{index} - {playlistId}")
+
+ nbPage = 0
+ while True:
+ urlPlaylistFormat = f"https://rule34.world/api/playlist-item?playlistId={playlistId}&Skip={nbPage*60}&Take=60&DisableTotal=true"
+ resp = s.get(urlPlaylistFormat)
+ if resp.ok and resp.status_code == 200:
+ data = json.loads(resp.content)
+ if len(data['items']) > 0:
+ print(f"Ok ! {len(data['items'])}")
+ downloadFromItems(data['items'], s, u)
+ else:
+ break
+
+ nbPage += 1
+
+ return
+ try:
+ html = lxml.html.fromstring(resp.content)
+ boxes = html.cssselect("div.box a.boxInner")
+ for box in boxes:
+ url = box.attrib['href']
+ url = f"{u.scheme}://{u.hostname}{url}"
+ downloadImageFromPage(index, url)
+ except Exception as e:
+ print(f"\r☠️ error {Path(u.path).name} - {idLink}")
+ if DEBUG_MODE:
+ print(e)
+
+
+def downloadImagesFromUserboard(index, link):
+ s = requests.Session()
+
+ u = urlparse(link)
+ p = Path(u.path)
+ boardName = p.stem
+
+ print(f"Link n°{index} - {boardName}", end='')
+ resp = s.get(link)
+
+ findImagesRegex = re.compile('<img class="image" src="/images/t.png" data-src="(.+)\?width=300"')
+ if resp.ok and resp.status_code == 200:
+ for subIndex, urlImg in enumerate(findImagesRegex.findall(resp.content.decode('utf8'))):
+ match = re.search(r'(\d{7,})', urlImg)
+ if not match:
+ return
+
+ e, outputFile = outputFileFromURL(urlImg)
+ if e:
+ print("Already downloaded")
+ return
+
+ if DEBUG_MODE:
+ print(f"\n{urlImg} >> {outputFile}")
+
+ idLink = match.group(1)
+ print(f"Link n°{index}.{subIndex} - {boardName}>{idLink}", end='')
+
+ headers = {"Referer": f"{u.scheme}://{u.hostname}/"}
+ headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+ headers['Accept-encoding'] = "gzip, deflate, br"
+ downloadURL(s, urlImg, idLink, outputFile, headers)
+
+
+def downloadVideoFromSource(index, urlBase, idVid, urlVideo):
+    """Download a video given its <source> URL.
+
+    urlBase is the post page URL (provides scheme/host for the Referer);
+    urlVideo is rewritten from the 480p variant to the original quality.
+    """
+    s = requests.Session()
+
+    print(f"Link n°{index} - Vidéo {idVid}", end='')
+    print(f"Link : {urlVideo} !", end='\n')
+
+
+    u = urlparse(urlBase)
+    # Request the full-quality file instead of the 480p transcode.
+    urlVideo = urlVideo.replace('mov480', 'mov')
+
+    e, outputFile = outputFileFromURL(urlVideo)
+    if e:
+        print("Already downloaded")
+        return
+
+    headers = {"Referer": f"{u.scheme}://{u.hostname}/"}
+    headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+    headers['Accept-encoding'] = "gzip, deflate, br"
+
+    print(f"Download to {outputFile}")
+    downloadURL(s, urlVideo, idVid, outputFile, headers)
+
+
+def downloadImageFromPage(index, link):
+ s = requests.Session()
+ match = re.search(r'(\d{6,})', link)
+
+ if not match:
+ return
+
+ idLink = match.group(1)
+ print(f"Link n°{index} - {idLink}\n", end='')
+ resp = s.get(link)
+
+ if resp.ok and resp.status_code == 200:
+ try:
+ html = lxml.html.fromstring(resp.content)
+ imgs = html.cssselect("img.img.shadow-base")
+
+ if len(imgs) == 0:
+ vids = html.cssselect("video.video.shadow-base source")
+ if len(vids) > 0:
+ url = vids[0].attrib['src']
+
+ u = urlparse(link)
+ if url[0] == '/':
+ url = f"{u.scheme}://{u.hostname}{url}"
+
+ downloadVideoFromSource(index, link, idLink, url)
+ else:
+ img = imgs[0]
+
+ if 'src' in img.attrib:
+ url = img.attrib['src']
+ u = urlparse(link)
+ if url[0] == '/':
+ url = f"{u.scheme}://{u.hostname}{url}"
+
+ url = url.replace('picsmall', 'pic')
+
+ e, outputFile = outputFileFromURL(url)
+ if e:
+ print("Already downloaded")
+ return
+
+ headers = {"Referer": f"{u.scheme}://{u.hostname}/"}
+ headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+ headers['Accept-encoding'] = "gzip, deflate, br"
+ downloadURL(s, url, idLink, outputFile, headers)
+
+ except:
+ print(f"\r☠️ error {Path(u.path).name} - {idLink}")
+ if DEBUG_MODE:
+ print(e)
+ return
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-d", "--directory", help="Output directory")
+ parser.add_argument('files', nargs=argparse.REMAINDER)
+ args = parser.parse_args()
+
+ if args.directory:
+ global dirOut
+ dirOut = Path(args.directory)
+
+ if not dirOut.exists():
+ print(f"Output dir \"{dirOut}\" doesn't exists")
+ return
+
+ if not dirOut.is_dir():
+ print(f"Output dir \"{dirOut}\" isn't a directory")
+ return
+
+ for file in args.files:
+ with open(file) as f_listLink:
+ for index, link in enumerate(f_listLink.read().split()):
+ if '/post/' in link:
+ downloadImageFromPage(index, link)
+ if '/user/' in link:
+ downloadImagesFromUserboard(index, link)
+ if '/playlists/' in link:
+ downloadImagesFromPlaylist(index, link)
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()