# manga-scans-fetcher/src/scans/lelscans.py
# -*- encoding: utf-8 -*-
import tempfile
import urllib.request
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from pathlib import Path

from bs4 import BeautifulSoup
from PIL import Image

from .scans import Chapter, Manga, ScanFetcher

_BASE_URL = "https://lelscan.net"

# Browser-like User-Agent, presumably so requests are not rejected as coming
# from a bot (urllib's default User-Agent is commonly blocked; assumption).
_REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    )
}


@dataclass(frozen=True)
class LelScansManga(Manga):
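    """A manga listed on lelscan.net; ``url`` is the page listing its chapters."""
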
    url: str


@dataclass(frozen=True)
class LelScansChapter(Chapter):
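    """A chapter of a manga; ``url`` is the chapter's reader page."""
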
    url: str


class LelScansFetcher(ScanFetcher[LelScansManga, LelScansChapter]):
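    """Fetcher for scans hosted on lelscan.net.

    Mangas and chapters are both discovered by parsing the <select>
    drop-downs in the site's page header; pages are then scraped from
    each chapter's navigation bar.
    """
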
    def _request(self, url: str) -> bytes:
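        """Fetch ``url`` and return the raw response body."""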
        request = urllib.request.Request(url=url, headers=_REQUEST_HEADERS)
        # Use a context manager so the connection is closed promptly;
        # read() always returns bytes, so no cast is needed.
        with urllib.request.urlopen(request) as response:
            return response.read()

    def list_mangas(self) -> list[LelScansManga]:
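        """List every manga available on the site, sorted by name."""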
        soup = BeautifulSoup(self._request(_BASE_URL), "html.parser")
        assert soup.body is not None
        # The second <select> in the page header lists every manga.
        select = soup.body.select("#header-image > h2 > form > select")[1]
        return sorted(
            (
                LelScansManga(name=option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            key=lambda m: m.name,
        )

    def list_chapters(self, manga: LelScansManga) -> list[LelScansChapter]:
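        """List the chapters of ``manga``, sorted by chapter number."""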
        soup = BeautifulSoup(self._request(manga.url), "html.parser")
        assert soup.body is not None
        # The first <select> in the page header lists the manga's chapters.
        select = soup.body.select("#header-image > h2 > form > select")[0]
        return sorted(
            (
                LelScansChapter(manga, option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            key=lambda c: float(c.number),
        )

    def _fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path,
    ) -> list[Path]:
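        """Download every page of ``chapter`` into ``folder``.

        Returns the paths of the downloaded images, in page order.
        """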
print("Retrieving scan {}... ".format(chapter.number))
folder.mkdir(exist_ok=True)
# retrieve the main page
soup = BeautifulSoup(self._request(chapter.url), "html.parser")
# retrieve the pages
anchors = soup.select("#navigation a")
pages: list[tuple[int, str]] = []
for anchor in anchors:
try:
# skip non-page chapter
pages.append((int(anchor.text), anchor.attrs["href"]))
except ValueError:
pass
pages = sorted(pages)
        # Download each page of the scan
        def retrieve_page(page: tuple[int, str]) -> Path | None:
            number, url = page
            print(" Retrieving page {:02d}/{:02d}".format(number, len(pages)))
            soup = BeautifulSoup(
                self._request(url),
                "html.parser",
            )
            # The page only embeds the image path; prepend the site root.
            url = (
                _BASE_URL
                + soup.select("#image > table > tr > td > a > img")[0]
                .attrs["src"]
                .strip()
            )
            data = self._request(url)
            # Name the file after the image, dropping any query string.
            filepath = folder.joinpath(url.split("/")[-1].split("?")[0])
            with open(filepath, "wb") as fp:
                fp.write(data)
            # Remove alpha channel, if any:
            try:
                Image.open(filepath).convert("RGB").save(filepath)
            except (OSError, KeyError):
                print(
                    " Failed to convert page {:02d}/{:02d}".format(
                        number, len(pages)
                    )
                )
            return filepath

        with ThreadPool() as pool:
            return [image for image in pool.map(retrieve_page, pages) if image]

    def fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path | None = None,
        pdf: Path | None = None,
    ):
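        """Download ``chapter`` into ``folder``, optionally merging the pages
        into a single PDF at ``pdf``.

        When ``folder`` is None, images are written to a temporary directory
        that is deleted once the PDF (if any) has been produced.
        """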
        if folder is None:
            # Merge inside the with block: the temporary directory (and the
            # images in it) would otherwise be deleted before _merge_scan runs.
            with tempfile.TemporaryDirectory() as t:
                images = self._fetch_chapter(chapter, Path(t))
                if pdf is not None:
                    self._merge_scan(pdf, images)
        else:
            images = self._fetch_chapter(chapter, folder)
            if pdf is not None:
                self._merge_scan(pdf, images)


if __name__ == "__main__":
    for manga in LelScansFetcher().list_mangas():
        print(manga)
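
    # A minimal usage sketch, left commented out because it downloads a full
    # chapter over the network; the output path "chapter.pdf" is illustrative.
    #
    #     fetcher = LelScansFetcher()
    #     manga = fetcher.list_mangas()[0]
    #     chapter = fetcher.list_chapters(manga)[-1]
    #     fetcher.fetch_chapter(chapter, pdf=Path("chapter.pdf"))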