Initial commit.

This commit is contained in:
Mikaël Capelle 2023-06-24 11:07:39 +02:00
commit 1273fefe12
5 changed files with 286 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# python
*.egg-info
__pycache__
venv
build
# others
/scans

60
pyproject.toml Normal file
View File

@ -0,0 +1,60 @@
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"
[project]
name = "manga-scan-fetcher"
version = "0.0.1"
authors = [
    { name = "Mikaël Capelle", email = "capelle.mikael@gmail.com" },
]
description = ""
requires-python = ">=3.10"
license = { text = "MIT" }
classifiers = [
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    # Registered trove classifier for the MIT license ("License :: MIT" is
    # not part of the official classifier list).
    "License :: OSI Approved :: MIT License",
]
dependencies = [
    "img2pdf",
    "Pillow",
    # Real distribution providing the `bs4` module; the "bs4" distribution on
    # PyPI is only a placeholder that depends on this one.
    "beautifulsoup4",
]
[project.optional-dependencies]
dev = [
"black",
"flake8",
"flake8-black",
"flake8-pyproject",
"mypy",
"pytest",
"isort",
"types-beautifulsoup4",
"types-Pillow"
]
[tool.flake8]
max-line-length = 88
# See https://github.com/PyCQA/pycodestyle/issues/373
extend-ignore = ['E203', 'E231']
[tool.isort]
profile = "black"
multi_line_output = 3
[tool.mypy]
warn_return_any = true
warn_unused_configs = true
[[tool.mypy.overrides]]
module = "img2pdf.*"
ignore_missing_imports = true
[tool.pyright]
# reportUnknownVariableType = false
# reportMissingTypeStubs = false
# reportUnknownMemberType = false
# reportUnknownArgumentType = false

0
src/scans/__init__.py Normal file
View File

144
src/scans/lelscans.py Normal file
View File

@ -0,0 +1,144 @@
# -*- encoding: utf-8 -*-
import tempfile
import urllib.request
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import cast
from bs4 import BeautifulSoup
from PIL import Image
from .scans import Chapter, Manga, ScanFetcher
# Root URL of the website: requested as-is to list the available mangas and
# used as prefix of the image sources found in the scan pages.
_BASE_URL = "https://lelscan.net"
# Headers sent with every request made by the fetcher. The User-Agent is the
# one of a desktop browser.
# NOTE(review): presumably the website rejects the default urllib
# User-Agent - confirm.
_REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    )
}
@dataclass(frozen=True)
class LelScansManga(Manga):
    """
    Manga as listed by the LelScans fetcher.
    """

    # URL of the manga page, requested to list the chapters of the manga.
    url: str
@dataclass(frozen=True)
class LelScansChapter(Chapter):
    """
    Chapter as listed by the LelScans fetcher.
    """

    # URL of the main page of the chapter, requested to discover its pages.
    url: str
class LelScansFetcher(ScanFetcher[LelScansManga, LelScansChapter]):
    """
    Scan fetcher for the website at _BASE_URL.

    Mangas and chapters are read from the <select> elements found in the
    header of the pages, and the pages of a chapter from the links of its
    navigation bar.
    """

    def _request(self, url: str) -> bytes | str:
        """
        Retrieve the content at the given URL, using _REQUEST_HEADERS.

        Args:
            url: The URL to retrieve.

        Returns:
            The body of the response.
        """
        request = urllib.request.Request(url=url, headers=_REQUEST_HEADERS)
        # Close the response explicitly rather than leaving the connection
        # open until the object is garbage-collected.
        with urllib.request.urlopen(request) as response:
            return cast(bytes | str, response.read())

    def list_mangas(self) -> list[LelScansManga]:
        """
        Retrieve the list of mangas available on the website.

        Returns:
            The mangas found on the main page, sorted by name.
        """
        soup = BeautifulSoup(self._request(_BASE_URL), "html.parser")
        assert soup.body is not None

        # The second select of the header is the one used for the mangas.
        select = soup.body.select("#header-image > h2 > form > select")[1]

        return sorted(
            (
                LelScansManga(name=option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            key=lambda m: m.name,
        )

    def list_chapters(self, manga: LelScansManga) -> list[LelScansChapter]:
        """
        Retrieve the list of chapters available for the given manga.

        Args:
            manga: The manga for which chapters should be retrieved.

        Returns:
            The chapters found on the page of the manga, sorted by number
            (chapter numbers are compared as floats).
        """
        soup = BeautifulSoup(self._request(manga.url), "html.parser")
        assert soup.body is not None

        # The first select of the header is the one used for the chapters.
        select = soup.body.select("#header-image > h2 > form > select")[0]

        return sorted(
            (
                LelScansChapter(manga, option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            key=lambda c: float(c.number),
        )

    def _fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path,
    ) -> list[Path]:
        """
        Download all the pages of the given chapter into the given folder.

        Args:
            chapter: The chapter to download.
            folder: The folder in which images are stored, created (with its
                parents) if it does not exist.

        Returns:
            The paths of the downloaded images, in page order.
        """
        print("Retrieving scan {}... ".format(chapter.number))
        folder.mkdir(parents=True, exist_ok=True)

        # retrieve the main page
        soup = BeautifulSoup(self._request(chapter.url), "html.parser")

        # retrieve the pages
        pages: list[tuple[int, str]] = []
        for anchor in soup.select("#navigation a"):
            try:
                pages.append((int(anchor.text), anchor.attrs["href"]))
            except ValueError:
                # links whose text is not a page number are not pages
                pass
        pages = sorted(pages)

        def retrieve_page(page: tuple[int, str]) -> Path:
            """
            Download the image of a single page and return its path.
            """
            number, page_url = page
            print(" Retrieving page {:02}/{:02d}".format(number, len(pages)))

            page_soup = BeautifulSoup(self._request(page_url), "html.parser")
            image_url = (
                _BASE_URL
                + page_soup.select("#image > table > tr > td > a > img")[0]
                .attrs["src"]
                .strip()
            )

            # name of the file is the last part of the URL, without the query
            filepath = folder.joinpath(image_url.split("/")[-1].split("?")[0])
            with open(filepath, "wb") as fp:
                fp.write(self._request(image_url))  # type: ignore

            # Remove alpha channel, if any. The source image is closed before
            # the file is overwritten with the converted copy.
            try:
                with Image.open(filepath) as image:
                    converted = image.convert("RGB")
                converted.save(filepath)
            except (OSError, KeyError):
                # best-effort: the original image is kept as-is
                print(
                    " Failed to convert page {:02}/{:02d}".format(number, len(pages))
                )

            return filepath

        # Download each page of the scan
        with ThreadPool() as pool:
            return pool.map(retrieve_page, pages)

    def _fetch_and_merge(
        self,
        chapter: LelScansChapter,
        folder: Path,
        pdf: Path | None,
    ) -> None:
        """
        Download the chapter into folder and, if requested, create the PDF.
        """
        images = self._fetch_chapter(chapter, folder)
        if pdf is not None:
            self._merge_scan(pdf, images)

    def fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path | None = None,
        pdf: Path | None = None,
    ) -> None:
        """
        Retrieve the given chapter and optionally merge its pages into a PDF.

        Args:
            chapter: The chapter to retrieve.
            folder: The folder in which images are stored. If None, a
                temporary directory is used and deleted afterwards.
            pdf: Path of the PDF to create from the images, or None to not
                create any PDF.
        """
        if folder is not None:
            self._fetch_and_merge(chapter, folder, pdf)
            return

        # The images only exist as long as the temporary directory does, so
        # the PDF must be created before leaving the with block.
        with tempfile.TemporaryDirectory() as tmp:
            self._fetch_and_merge(chapter, Path(tmp), pdf)
if __name__ == "__main__":
    # Manual check: print every manga available from the website.
    fetcher = LelScansFetcher()
    for manga in fetcher.list_mangas():
        print(manga)

74
src/scans/scans.py Normal file
View File

@ -0,0 +1,74 @@
# -*- encoding: utf-8 -*-
from abc import abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Generic, TypeVar
import img2pdf
@dataclass(frozen=True)
class Manga:
    """
    Immutable description of a manga.
    """

    # Name of the manga.
    name: str
@dataclass(frozen=True)
class Chapter:
    """
    Immutable description of a chapter of a manga.
    """

    # Manga the chapter belongs to.
    manga: Manga

    # Number of the chapter, stored as text.
    number: str
# Type variables used to parametrize ScanFetcher with the manga and chapter
# types a fetcher works with; they must derive from Manga and Chapter.
_MangaT = TypeVar("_MangaT", bound=Manga)
_ChapterT = TypeVar("_ChapterT", bound=Chapter)
class ScanFetcher(Generic[_MangaT, _ChapterT]):
    """
    Base class for scan fetchers, parametrized by the types of manga and
    chapter the fetcher works with.

    Note: this class does not derive from abc.ABC, so the @abstractmethod
    markers below document the interface but are not enforced when the class
    is instantiated.
    """

    @abstractmethod
    def list_mangas(self) -> list[_MangaT]:
        """
        Retrieve the list of mangas available from this fetcher.

        The exact type of the items in the returned list is not defined
        but the type must inherit from the Manga class.
        """

    @abstractmethod
    def list_chapters(self, manga: _MangaT) -> list[_ChapterT]:
        """
        Return the list of chapters available for the given manga.

        Args:
            manga: An object of type Manga corresponding to the manga
                for which chapters should be retrieved.

        Returns:
            A list of manga chapters.
        """

    def _merge_scan(self, pdf: Path, images: list[Path]) -> None:
        """
        Create a PDF using the given images.

        Args:
            pdf: Path of the PDF file to create.
            images: Paths of the images to put in the PDF, in order.

        Raises:
            Exception: Any error raised while creating the PDF is propagated
                after the (possibly incomplete) PDF file has been removed.
        """
        try:
            with open(pdf, "wb") as fp:
                data = img2pdf.convert([image.as_posix() for image in images])
                assert data is not None
                fp.write(data)
        except Exception:
            # Do not leave an incomplete PDF behind. The file does not exist
            # if open() itself failed, in which case unlink() must not raise
            # and hide the original error.
            pdf.unlink(missing_ok=True)
            raise

    @abstractmethod
    def fetch_chapter(
        self,
        chapter: _ChapterT,
        folder: Path | None = None,
        pdf: Path | None = None,
    ) -> None:
        """
        Retrieve the given chapter and store it in the specified folder.

        Args:
            chapter: The chapter to retrieve.
            folder: The folder in which the chapter should be stored, if any.
            pdf: Path of a PDF file, if any.
        """