# -*- encoding: utf-8 -*-
import logging
import tempfile
import urllib.request
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import cast

from bs4 import BeautifulSoup
from PIL import Image, ImageFile, UnidentifiedImageError

from .scans import Chapter, Manga, ScanFetcher

_BASE_URL = "https://lelscan.net"
_REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    )
}

# Some scans are truncated: let Pillow load them anyway instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

LOGGER = logging.getLogger(__package__)


@dataclass(frozen=True)
class LelScansManga(Manga):
    url: str


@dataclass(frozen=True)
class LelScansChapter(Chapter):
    url: str


class LelScansFetcher(ScanFetcher[LelScansManga, LelScansChapter]):
    def _request(self, url: str) -> bytes | str:
        request = urllib.request.Request(url=url, headers=_REQUEST_HEADERS)
        return cast(bytes | str, urllib.request.urlopen(request).read())

    def list_mangas(self) -> list[LelScansManga]:
        soup = BeautifulSoup(self._request(_BASE_URL), "html.parser")
        assert soup.body is not None

        # the manga list is the second <select> of the header form
        select = soup.body.select("#header-image > h2 > form > select")[1]

        return sorted(
            (
                LelScansManga(name=option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            key=lambda m: m.name,
        )

    def list_chapters(self, manga: LelScansManga) -> list[LelScansChapter]:
        soup = BeautifulSoup(self._request(manga.url), "html.parser")
        assert soup.body is not None

        # the chapter list is the first <select> of the header form
        select = soup.body.select("#header-image > h2 > form > select")[0]

        return sorted(
            (
                LelScansChapter(manga, option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            key=lambda c: float(c.number),
        )

    def _fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path,
    ) -> list[Path]:
        LOGGER.info("Retrieving scan {}...".format(chapter.number))

        folder.mkdir(exist_ok=True)

        # retrieve the main page
        soup = BeautifulSoup(self._request(chapter.url), "html.parser")

        # collect the (page number, page URL) pairs from the navigation block
        anchors = soup.select("#navigation a")
        pages: list[tuple[int, str]] = []
        for anchor in anchors:
            try:
                pages.append((int(anchor.text), anchor.attrs["href"]))
            except ValueError:
                # skip anchors whose text is not a page number
                pass
        pages = sorted(pages)

        # Download each page of the scan
        def retrieve_page(page: tuple[int, str]) -> Path | None:
            number, url = page
            LOGGER.info("  Retrieving page {:02d}/{:02d}".format(number, len(pages)))
            soup = BeautifulSoup(self._request(url), "html.parser")
            url = (
                _BASE_URL
                + soup.select("#image > table > tr > td > a > img")[0]
                .attrs["src"]
                .strip()
            )
            data = self._request(url)

            filepath = folder.joinpath(url.split("/")[-1].split("?")[0])
            with open(filepath, "wb") as fp:
                fp.write(data)  # type: ignore

            # Remove alpha channel, if any:
            try:
                Image.open(filepath).convert("RGB").save(filepath)
            except UnidentifiedImageError:
                LOGGER.warning(
                    "  Failed to convert page {:02d}/{:02d}, removing page.".format(
                        number, len(pages)
                    )
                )
                filepath.unlink()
                return None
            except (OSError, KeyError):
                LOGGER.warning(
                    "  Failed to convert page {:02d}/{:02d}".format(number, len(pages))
                )
                return None
            return filepath

        with ThreadPool() as pool:
            return [image for image in pool.map(retrieve_page, pages) if image]

    def fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path | None = None,
        pdf: Path | None = None,
    ) -> None:
        # The temporary directory (used when no target folder is given) must
        # stay alive until the PDF has been merged; otherwise the downloaded
        # images would be deleted before _merge_scan() can read them.
        with tempfile.TemporaryDirectory() as tmp:
            images = self._fetch_chapter(
                chapter, folder if folder is not None else Path(tmp)
            )
            if pdf is not None:
                LOGGER.info("Merging scan {}...".format(chapter.number))
                self._merge_scan(pdf, images)


if __name__ == "__main__":
    for manga in LelScansFetcher().list_mangas():
        print(manga)