manga-scans-fetcher/src/scans/lelscans.py

152 lines
4.4 KiB
Python
Raw Normal View History

2023-06-24 09:07:39 +00:00
# -*- encoding: utf-8 -*-
2023-07-19 17:11:42 +00:00
import logging
2023-06-24 09:07:39 +00:00
import tempfile
import urllib.request
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import cast
from bs4 import BeautifulSoup
2023-07-19 17:11:42 +00:00
from PIL import Image, ImageFile
2023-06-24 09:07:39 +00:00
from .scans import Chapter, Manga, ScanFetcher
_BASE_URL = "https://lelscan.net"
_REQUEST_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
)
}
2023-07-19 17:11:42 +00:00
# fix for some images
ImageFile.LOAD_TRUNCATED_IMAGES = True
LOGGER = logging.getLogger(__package__)
2023-06-24 09:07:39 +00:00
@dataclass(frozen=True)
class LelScansManga(Manga):
    """A manga as advertised on lelscan.net, extending the base ``Manga``."""

    # Absolute URL of the manga's page on lelscan.net (the <option> value
    # scraped from the site's manga <select>).
    url: str
@dataclass(frozen=True)
class LelScansChapter(Chapter):
    """A single chapter of a manga on lelscan.net, extending the base ``Chapter``."""

    # Absolute URL of the chapter's first page (the <option> value scraped
    # from the site's chapter <select>).
    url: str
class LelScansFetcher(ScanFetcher[LelScansManga, LelScansChapter]):
    """Scan fetcher backed by https://lelscan.net.

    Pages are discovered by scraping the site's HTML (BeautifulSoup) and
    downloaded concurrently with a thread pool.
    """

    def _request(self, url: str) -> bytes | str:
        """Fetch ``url`` and return the raw response body.

        A browser-like User-Agent header is sent (see ``_REQUEST_HEADERS``).
        The HTTP response is closed deterministically via a context manager
        instead of being leaked until garbage collection.
        """
        request = urllib.request.Request(url=url, headers=_REQUEST_HEADERS)
        with urllib.request.urlopen(request) as response:
            return cast(bytes | str, response.read())

    def list_mangas(self) -> list[LelScansManga]:
        """Return every manga listed on the site, sorted by name."""
        soup = BeautifulSoup(self._request(_BASE_URL), "html.parser")
        assert soup.body is not None
        # The header form holds two <select>s; the second one lists mangas.
        select = soup.body.select("#header-image > h2 > form > select")[1]
        return sorted(
            (
                LelScansManga(name=option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            key=lambda m: m.name,
        )

    def list_chapters(self, manga: LelScansManga) -> list[LelScansChapter]:
        """Return the chapters of ``manga``, sorted by numeric chapter number."""
        soup = BeautifulSoup(self._request(manga.url), "html.parser")
        assert soup.body is not None
        # The header form holds two <select>s; the first one lists chapters.
        select = soup.body.select("#header-image > h2 > form > select")[0]
        return sorted(
            (
                LelScansChapter(manga, option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            # Chapter "numbers" may be fractional (e.g. "10.5"), so compare
            # as floats rather than lexicographically.
            key=lambda c: float(c.number),
        )

    def _fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path,
    ) -> list[Path]:
        """Download every page of ``chapter`` into ``folder``.

        Returns the paths of the downloaded images, in page order (pages
        whose download returned nothing are filtered out).
        """
        LOGGER.info("Retrieving scan %s... ", chapter.number)
        folder.mkdir(exist_ok=True)
        # Retrieve the chapter page to discover the per-page links.
        soup = BeautifulSoup(self._request(chapter.url), "html.parser")
        anchors = soup.select("#navigation a")
        pages: list[tuple[int, str]] = []
        for anchor in anchors:
            try:
                pages.append((int(anchor.text), anchor.attrs["href"]))
            except ValueError:
                # Non-numeric anchors (e.g. links to other chapters) are skipped.
                pass
        pages.sort()

        # Download each page of the scan
        def retrieve_page(page: tuple[int, str]) -> Path | None:
            """Download one page image and strip its alpha channel, if any."""
            number, url = page
            LOGGER.info(" Retrieving page %02d/%02d", number, len(pages))
            soup = BeautifulSoup(
                self._request(url),
                "html.parser",
            )
            url = (
                _BASE_URL
                + soup.select("#image > table > tr > td > a > img")[0]
                .attrs["src"]
                .strip()
            )
            data = self._request(url)
            # File name comes from the URL path, minus any query string.
            filepath = folder.joinpath(url.split("/")[-1].split("?")[0])
            with open(filepath, "wb") as fp:
                fp.write(data)  # type: ignore
            # Remove alpha channel, if any: re-encode in place as RGB so the
            # image can later be embedded in a PDF.
            try:
                Image.open(filepath).convert("RGB").save(filepath)
            except (OSError, KeyError):
                LOGGER.warning(
                    " Failed to convert page %02d/%02d", number, len(pages)
                )
            return filepath

        # Downloads are network-bound, so a thread pool parallelizes them well.
        with ThreadPool() as pool:
            return [image for image in pool.map(retrieve_page, pages) if image]

    def fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path | None = None,
        pdf: Path | None = None,
    ) -> None:
        """Download ``chapter`` into ``folder`` and optionally merge it into ``pdf``.

        When ``folder`` is None the images are downloaded into a temporary
        directory that lives only as long as needed to build the PDF.
        """
        if folder is None:
            with tempfile.TemporaryDirectory() as tmp:
                images = self._fetch_chapter(chapter, Path(tmp))
                # BUGFIX: merge *inside* the context manager — previously the
                # temporary directory (and every downloaded image) was deleted
                # before _merge_scan ran, so the PDF was built from missing files.
                self._merge_to_pdf(chapter, images, pdf)
        else:
            images = self._fetch_chapter(chapter, folder)
            self._merge_to_pdf(chapter, images, pdf)

    def _merge_to_pdf(
        self,
        chapter: LelScansChapter,
        images: list[Path],
        pdf: Path | None,
    ) -> None:
        """Merge ``images`` into ``pdf`` when a PDF path was requested."""
        if pdf is not None:
            LOGGER.info("Merging scan %s... ", chapter.number)
            self._merge_scan(pdf, images)
if __name__ == "__main__":
    # Smoke test: enumerate and print every manga the site advertises.
    fetcher = LelScansFetcher()
    for entry in fetcher.list_mangas():
        print(entry)