Initial commit.

2023-06-24 11:07:39 +02:00 · 2023-06-24 11:07:39 +02:00 · 1273fefe12
commit 1273fefe12
5 changed files with 286 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,8 @@
+# python
+*.egg-info
+__pycache__
+venv
+build
+
+# others
+/scans
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,60 @@
+[build-system]
+requires = ["setuptools", "setuptools-scm"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "manga-scan-fetcher"
+version = "0.0.1"
+authors = [
+    { name = "Mikaël Capelle", email = "capelle.mikael@gmail.com" },
+]
+description = ""
+requires-python = ">=3.10"
+license = { text = "MIT" }
+classifiers = [
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    # "License :: MIT" is not a registered trove classifier.
+    "License :: OSI Approved :: MIT License",
+]
+dependencies = [
+    "img2pdf",
+    "Pillow",
+    # Official distribution name; "bs4" is only a placeholder package for it.
+    "beautifulsoup4"
+]
+
+[project.optional-dependencies]
+dev = [
+    "black",
+    "flake8",
+    "flake8-black",
+    "flake8-pyproject",
+    "mypy",
+    "pytest",
+    "isort",
+    "types-beautifulsoup4",
+    "types-Pillow"
+]
+
+[tool.flake8]
+max-line-length = 88
+# See https://github.com/PyCQA/pycodestyle/issues/373
+extend-ignore = ['E203', 'E231']
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
+
+[tool.mypy]
+warn_return_any = true
+warn_unused_configs = true
+
+[[tool.mypy.overrides]]
+module = "img2pdf.*"
+ignore_missing_imports = true
+
+[tool.pyright]
+# reportUnknownVariableType = false
+# reportMissingTypeStubs = false
+# reportUnknownMemberType = false
+# reportUnknownArgumentType = false
--- a/src/scans/__init__.py
+++ b/src/scans/__init__.py
--- a/src/scans/lelscans.py
+++ b/src/scans/lelscans.py
@ -0,0 +1,144 @@
+# -*- encoding: utf-8 -*-
+
+import tempfile
+import urllib.request
+from dataclasses import dataclass
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from typing import cast
+
+from bs4 import BeautifulSoup
+from PIL import Image
+
+from .scans import Chapter, Manga, ScanFetcher
+
+# Root URL of the website: requested to list the mangas and used as prefix of
+# the image paths found in the chapter pages.
+_BASE_URL = "https://lelscan.net"
+
+# HTTP headers sent with every request made through LelScansFetcher._request.
+# NOTE(review): presumably a browser-like User-Agent is needed for the site to
+# answer - confirm.
+_REQUEST_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
+    )
+}
+
+
+@dataclass(frozen=True)
+class LelScansManga(Manga):
+    """Manga entry as built by LelScansFetcher.list_mangas."""
+
+    # Value of the manga's <option> element; requested as-is to list chapters.
+    url: str
+
+
+@dataclass(frozen=True)
+class LelScansChapter(Chapter):
+    """Chapter entry as built by LelScansFetcher.list_chapters."""
+
+    # Value of the chapter's <option> element; requested as-is to fetch pages.
+    url: str
+
+
+class LelScansFetcher(ScanFetcher[LelScansManga, LelScansChapter]):
+    """
+    Fetcher scraping mangas, chapters and page images from lelscan.net.
+    """
+
+    def _request(self, url: str) -> bytes | str:
+        """
+        Request the given URL with the module headers and return the body.
+
+        Args:
+            url: URL to request.
+
+        Returns:
+            The content read from the response.
+        """
+        request = urllib.request.Request(url=url, headers=_REQUEST_HEADERS)
+
+        # close the response as soon as its body has been read
+        with urllib.request.urlopen(request) as response:
+            return cast(bytes | str, response.read())
+
+    def list_mangas(self) -> list[LelScansManga]:
+        """
+        Retrieve the mangas listed on the home page, sorted by name.
+        """
+        soup = BeautifulSoup(self._request(_BASE_URL), "html.parser")
+        assert soup.body is not None
+
+        # the mangas are the options of the second select of the header form
+        select = soup.body.select("#header-image > h2 > form > select")[1]
+        return sorted(
+            (
+                LelScansManga(name=option.text, url=option.attrs["value"])
+                for option in select.select("option")
+            ),
+            key=lambda m: m.name,
+        )
+
+    def list_chapters(self, manga: LelScansManga) -> list[LelScansChapter]:
+        """
+        Retrieve the chapters listed on the page of the given manga, sorted by
+        chapter number (compared as floats).
+
+        Args:
+            manga: Manga for which chapters should be listed.
+
+        Returns:
+            The chapters found on the manga page.
+        """
+        soup = BeautifulSoup(self._request(manga.url), "html.parser")
+        assert soup.body is not None
+
+        # the chapters are the options of the first select of the header form
+        select = soup.body.select("#header-image > h2 > form > select")[0]
+
+        return sorted(
+            (
+                LelScansChapter(manga, option.text, url=option.attrs["value"])
+                for option in select.select("option")
+            ),
+            key=lambda c: float(c.number),
+        )
+
+    def _fetch_chapter(
+        self,
+        chapter: LelScansChapter,
+        folder: Path,
+    ) -> list[Path]:
+        """
+        Download every page of the given chapter into the given folder.
+
+        Args:
+            chapter: Chapter to download.
+            folder: Folder in which images are stored, created if missing.
+
+        Returns:
+            The paths of the downloaded images, in page order.
+        """
+        print(f"Retrieving scan {chapter.number}... ")
+        # also create missing parents so that nested output folders work
+        folder.mkdir(parents=True, exist_ok=True)
+
+        # retrieve the main page
+        soup = BeautifulSoup(self._request(chapter.url), "html.parser")
+
+        # retrieve the pages
+        pages: list[tuple[int, str]] = []
+        for anchor in soup.select("#navigation a"):
+            try:
+                pages.append((int(anchor.text), anchor.attrs["href"]))
+            except ValueError:
+                # anchors whose text is not a page number are not pages
+                continue
+        pages.sort()
+
+        # Download a single page of the scan
+        def retrieve_page(page: tuple[int, str]) -> Path:
+            number, page_url = page
+            print(f"  Retrieving page {number:02}/{len(pages):02d}")
+            page_soup = BeautifulSoup(self._request(page_url), "html.parser")
+
+            # the src attribute of the image is relative to the base URL
+            image_url = (
+                _BASE_URL
+                + page_soup.select("#image > table > tr > td > a > img")[0]
+                .attrs["src"]
+                .strip()
+            )
+            data = self._request(image_url)
+
+            # name the file after the last URL component, without the query
+            filepath = folder.joinpath(image_url.split("/")[-1].split("?")[0])
+            with open(filepath, "wb") as fp:
+                fp.write(data)  # type: ignore
+
+            # Remove alpha channel, if any. The source image is closed before
+            # the file is overwritten, so no handle is left open on it.
+            try:
+                with Image.open(filepath) as image:
+                    converted = image.convert("RGB")
+                converted.save(filepath)
+            except (OSError, KeyError):
+                # best effort: keep the image as downloaded
+                print(f"    Failed to convert page {number:02}/{len(pages):02d}")
+
+            return filepath
+
+        # pages are fetched concurrently, map() keeps them in page order
+        with ThreadPool() as pool:
+            return pool.map(retrieve_page, pages)
+
+    def fetch_chapter(
+        self,
+        chapter: LelScansChapter,
+        folder: Path | None = None,
+        pdf: Path | None = None,
+    ) -> None:
+        """
+        Retrieve the given chapter and optionally merge its pages into a PDF.
+
+        Args:
+            chapter: Chapter to retrieve.
+            folder: Folder in which images are stored. If None, a temporary
+              folder is used and deleted once the PDF has been created.
+            pdf: Path of the PDF to create from the pages, or None to only
+              download the images.
+        """
+        if folder is None:
+            # the PDF must be created while the temporary folder still
+            # exists, the images are deleted when leaving the with-block
+            with tempfile.TemporaryDirectory() as tmp:
+                images = self._fetch_chapter(chapter, Path(tmp))
+                if pdf is not None:
+                    self._merge_scan(pdf, images)
+            return
+
+        images = self._fetch_chapter(chapter, folder)
+        if pdf is not None:
+            self._merge_scan(pdf, images)
+
+
+if __name__ == "__main__":
+    # Manual check: print every manga returned by list_mangas(). The module
+    # uses a relative import, so it has to be run as part of its package.
+    for manga in LelScansFetcher().list_mangas():
+        print(manga)
--- a/src/scans/scans.py
+++ b/src/scans/scans.py
@ -0,0 +1,74 @@
+# -*- encoding: utf-8 -*-
+
+from abc import abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Generic, TypeVar
+
+import img2pdf
+
+
+@dataclass(frozen=True)
+class Manga:
+    """Immutable description of a manga, used as bound of the fetcher types."""
+
+    name: str
+    """ Name of the manga. """
+
+
+@dataclass(frozen=True)
+class Chapter:
+    """Immutable description of a single chapter of a manga."""
+
+    # Manga this chapter belongs to.
+    manga: Manga
+    # Chapter number, stored as a string.
+    number: str
+
+
+# Type parameters of ScanFetcher, bound to the Manga and Chapter base classes.
+_MangaT = TypeVar("_MangaT", bound=Manga)
+_ChapterT = TypeVar("_ChapterT", bound=Chapter)
+
+
+class ScanFetcher(Generic[_MangaT, _ChapterT]):
+    """
+    Common interface of the scan fetchers, parametrized by the manga and
+    chapter types they handle.
+
+    Note that this class does not inherit from abc.ABC, so the abstractmethod
+    markers below are not enforced when a subclass is instantiated.
+    """
+
+    @abstractmethod
+    def list_mangas(self) -> list[_MangaT]:
+        """
+        Retrieve the list of mangas available from this fetcher.
+        The exact type of the items in the returned list is not defined
+        but the type must inherit from the Manga class.
+        """
+        pass
+
+    @abstractmethod
+    def list_chapters(self, manga: _MangaT) -> list[_ChapterT]:
+        """
+        Return the list of chapters available for the given manga.
+
+        Args:
+            manga: An object of type Manga corresponding to the manga
+              for which chapters should be retrieved.
+
+        Returns:
+            A list of manga chapters.
+        """
+        pass
+
+    def _merge_scan(self, pdf: Path, images: list[Path]):
+        """
+        Create a PDF using the given images.
+
+        Args:
+            pdf: Path of the PDF file to create.
+            images: Paths of the images to put in the PDF, in order.
+
+        Raises:
+            Exception: Any error raised while converting or writing is
+              propagated after the PDF file has been removed.
+        """
+        try:
+            with open(pdf, "wb") as fp:
+                data = img2pdf.convert([image.as_posix() for image in images])
+                assert data is not None
+                fp.write(data)
+        except Exception:
+            # do not leave a partial PDF behind; the file may not exist if
+            # open() itself failed, which must not hide the original error
+            pdf.unlink(missing_ok=True)
+            raise
+
+    @abstractmethod
+    def fetch_chapter(
+        self,
+        chapter: _ChapterT,
+        folder: Path | None = None,
+        pdf: Path | None = None,
+    ):
+        """
+        Retrieve the given chapter and store it in the specified folder.
+
+        Args:
+            chapter: Chapter to retrieve.
+            folder: Folder in which the chapter should be stored, or None.
+            pdf: Optional path of a PDF file; presumably the PDF to create
+              from the retrieved pages, handling is left to implementations.
+        """
+        pass