From 1273fefe12980508a0d79e20b41dde04a8bd7d44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mika=C3=ABl=20Capelle?=
Date: Sat, 24 Jun 2023 11:07:39 +0200
Subject: [PATCH] Initial commit.

---
 .gitignore            |   8 +++
 pyproject.toml        |  60 ++++++++++++++++++
 src/scans/__init__.py |   0
 src/scans/lelscans.py | 144 ++++++++++++++++++++++++++++++++++++++++++
 src/scans/scans.py    |  74 ++++++++++++++++++++++
 5 files changed, 286 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 pyproject.toml
 create mode 100644 src/scans/__init__.py
 create mode 100644 src/scans/lelscans.py
 create mode 100644 src/scans/scans.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1dfc7e4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+# python
+*.egg-info
+__pycache__
+venv
+build
+
+# others
+/scans
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..a19973f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,60 @@
+[build-system]
+requires = ["setuptools", "setuptools-scm"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "manga-scan-fetcher"
+version = "0.0.1"
+authors = [
+    { name = "Mikaël Capelle", email = "capelle.mikael@gmail.com" },
+]
+description = ""
+requires-python = ">=3.10"
+license = { text = "MIT" }
+classifiers = [
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "License :: OSI Approved :: MIT License",
+]
+dependencies = [
+    "img2pdf",
+    "Pillow",
+    "bs4"
+]
+
+[project.optional-dependencies]
+dev = [
+    "black",
+    "flake8",
+    "flake8-black",
+    "flake8-pyproject",
+    "mypy",
+    "pytest",
+    "isort",
+    "types-beautifulsoup4",
+    "types-Pillow"
+]
+
+[tool.flake8]
+max-line-length = 88
+# See https://github.com/PyCQA/pycodestyle/issues/373
+extend-ignore = ['E203', 'E231']
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
+
+[tool.mypy]
+warn_return_any = true
+warn_unused_configs = true
+
+[[tool.mypy.overrides]]
+module = "img2pdf.*"
+ignore_missing_imports = true
+
+[tool.pyright]
+# reportUnknownVariableType = false
+# reportMissingTypeStubs = false
+# reportUnknownMemberType = false
+# reportUnknownArgumentType = false
diff --git a/src/scans/__init__.py b/src/scans/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/scans/lelscans.py b/src/scans/lelscans.py
new file mode 100644
index 0000000..844a5f8
--- /dev/null
+++ b/src/scans/lelscans.py
@@ -0,0 +1,144 @@
+# -*- encoding: utf-8 -*-
+
+import tempfile
+import urllib.request
+from dataclasses import dataclass
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from typing import cast
+
+from bs4 import BeautifulSoup
+from PIL import Image
+
+from .scans import Chapter, Manga, ScanFetcher
+
+_BASE_URL = "https://lelscan.net"
+
+_REQUEST_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
+    )
+}
+
+
+@dataclass(frozen=True)
+class LelScansManga(Manga):
+    url: str
+
+
+@dataclass(frozen=True)
+class LelScansChapter(Chapter):
+    url: str
+
+
+class LelScansFetcher(ScanFetcher[LelScansManga, LelScansChapter]):
+    def _request(self, url: str) -> bytes | str:
+        request = urllib.request.Request(url=url, headers=_REQUEST_HEADERS)
+
+        return cast(bytes | str, urllib.request.urlopen(request).read())
+
+    def list_mangas(self) -> list[LelScansManga]:
+        soup = BeautifulSoup(self._request(_BASE_URL), "html.parser")
+        assert soup.body is not None
+
+        # find the select listing the mangas
+        select = soup.body.select("#header-image > h2 > form > select")[1]
+        return sorted(
+            (
+                LelScansManga(name=option.text, url=option.attrs["value"])
+                for option in select.select("option")
+            ),
+            key=lambda m: m.name,
+        )
+
+    def list_chapters(self, manga: LelScansManga) -> list[LelScansChapter]:
+        soup = BeautifulSoup(self._request(manga.url), "html.parser")
+        assert soup.body is not None
+
+        # find the select listing the chapters
+        select = soup.body.select("#header-image > h2 > form > select")[0]
+
+        return sorted(
+            (
+                LelScansChapter(manga, option.text, url=option.attrs["value"])
+                for option in select.select("option")
+            ),
+            key=lambda c: float(c.number),
+        )
+
+    def _fetch_chapter(
+        self,
+        chapter: LelScansChapter,
+        folder: Path,
+    ) -> list[Path]:
+        print("Retrieving scan {}... ".format(chapter.number))
+        folder.mkdir(parents=True, exist_ok=True)
+
+        # retrieve the main page
+        soup = BeautifulSoup(self._request(chapter.url), "html.parser")
+
+        # retrieve the pages
+        anchors = soup.select("#navigation a")
+        pages: list[tuple[int, str]] = []
+        for anchor in anchors:
+            try:
+                # skip anchors that are not page numbers
+                pages.append((int(anchor.text), anchor.attrs["href"]))
+            except ValueError:
+                pass
+        pages = sorted(pages)
+
+        # Download each page of the scan
+        def retrieve_page(page: tuple[int, str]) -> Path | None:
+            number, url = page
+            print(" Retrieving page {:02}/{:02d}".format(number, len(pages)))
+            soup = BeautifulSoup(
+                self._request(url),
+                "html.parser",
+            )
+
+            url = (
+                _BASE_URL
+                + soup.select("#image > table > tr > td > a > img")[0]
+                .attrs["src"]
+                .strip()
+            )
+            data = self._request(url)
+
+            filepath = folder.joinpath(url.split("/")[-1].split("?")[0])
+            with open(filepath, "wb") as fp:
+                fp.write(data)  # type: ignore
+
+            # Remove alpha channel, if any:
+            try:
+                Image.open(filepath).convert("RGB").save(filepath)
+            except (OSError, KeyError):
+                print(
+                    " Failed to convert page {:02}/{:02d}".format(number, len(pages))
+                )
+
+            return filepath
+
+        with ThreadPool() as pool:
+            return [image for image in pool.map(retrieve_page, pages) if image]
+
+    def fetch_chapter(
+        self,
+        chapter: LelScansChapter,
+        folder: Path | None = None,
+        pdf: Path | None = None,
+    ):
+        # When no folder is given, pages are downloaded to a temporary
+        # directory, which must stay alive until the PDF has been written,
+        # so the merge happens inside the context manager.
+        with tempfile.TemporaryDirectory() as tmp:
+            images = self._fetch_chapter(chapter, folder or Path(tmp))
+
+            if pdf is not None:
+                self._merge_scan(pdf, images)
+
+
+if __name__ == "__main__":
+    for manga in LelScansFetcher().list_mangas():
+        print(manga)
diff --git a/src/scans/scans.py b/src/scans/scans.py
new file mode 100644
index 0000000..20e250c
--- /dev/null
+++ b/src/scans/scans.py
@@ -0,0 +1,74 @@
+# -*- encoding: utf-8 -*-
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Generic, TypeVar
+
+import img2pdf
+
+
+@dataclass(frozen=True)
+class Manga:
+    name: str
+    """ Name of the manga. """
+
+
+@dataclass(frozen=True)
+class Chapter:
+    manga: Manga
+    number: str
+
+
+_MangaT = TypeVar("_MangaT", bound=Manga)
+_ChapterT = TypeVar("_ChapterT", bound=Chapter)
+
+
+class ScanFetcher(ABC, Generic[_MangaT, _ChapterT]):
+    @abstractmethod
+    def list_mangas(self) -> list[_MangaT]:
+        """
+        Retrieve the list of mangas available from this fetcher.
+        The exact type of the items in the returned list depends on the
+        fetcher, but it must inherit from the Manga class.
+        """
+        pass
+
+    @abstractmethod
+    def list_chapters(self, manga: _MangaT) -> list[_ChapterT]:
+        """
+        Return the list of chapters available for the given manga.
+
+        Args:
+            manga: An object of type Manga corresponding to the manga
+                for which chapters should be retrieved.
+
+        Returns:
+            A list of manga chapters.
+        """
+        pass
+
+    def _merge_scan(self, pdf: Path, images: list[Path]):
+        """
+        Create a PDF using the given images.
+        """
+        try:
+            with open(pdf, "wb") as fp:
+                data = img2pdf.convert([image.as_posix() for image in images])
+                assert data is not None
+                fp.write(data)
+        except Exception:
+            pdf.unlink(missing_ok=True)
+            raise
+
+    @abstractmethod
+    def fetch_chapter(
+        self,
+        chapter: _ChapterT,
+        folder: Path | None = None,
+        pdf: Path | None = None,
+    ):
+        """
+        Retrieve the given chapter into the given folder and/or merge it into a PDF.
+        """
+        pass
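
Usage sketch (outside the patch): a minimal example of how the ScanFetcher API
introduced by this commit could be driven, assuming the package is installed so
that scans.lelscans is importable; the manga name "One Piece" and the output
file name are illustrative placeholders only.

    from pathlib import Path

    from scans.lelscans import LelScansFetcher

    fetcher = LelScansFetcher()

    # pick a manga by name (raises StopIteration if the placeholder name
    # is not listed on the site)
    manga = next(m for m in fetcher.list_mangas() if m.name == "One Piece")

    # download the latest chapter into a temporary folder and merge its
    # pages into a single PDF
    chapter = fetcher.list_chapters(manga)[-1]
    fetcher.fetch_chapter(chapter, pdf=Path("chapter.pdf"))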