manga-scans-fetcher/src/scans/lelscans.py

152 lines
4.4 KiB
Python
Raw Normal View History

2023-06-24 09:07:39 +00:00
# -*- encoding: utf-8 -*-
2023-07-19 17:11:42 +00:00
import logging
2023-06-24 09:07:39 +00:00
import tempfile
import urllib.request
from dataclasses import dataclass
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import cast
from bs4 import BeautifulSoup
2023-07-19 17:11:42 +00:00
from PIL import Image, ImageFile
2023-06-24 09:07:39 +00:00
from .scans import Chapter, Manga, ScanFetcher
_BASE_URL = "https://lelscan.net"
_REQUEST_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
)
}
2023-07-19 17:11:42 +00:00
# fix for some images
ImageFile.LOAD_TRUNCATED_IMAGES = True
LOGGER = logging.getLogger(__package__)
2023-06-24 09:07:39 +00:00
@dataclass(frozen=True)
class LelScansManga(Manga):
    """A manga as advertised on lelscan.net, extending the base ``Manga``."""

    # Absolute URL of the manga's page on lelscan.net (the <option> value
    # scraped from the site's manga <select>).
    url: str
@dataclass(frozen=True)
class LelScansChapter(Chapter):
    """A single chapter of a manga on lelscan.net, extending the base ``Chapter``."""

    # Absolute URL of the chapter's first page (the <option> value scraped
    # from the site's chapter <select>).
    url: str
class LelScansFetcher(ScanFetcher[LelScansManga, LelScansChapter]):
    """Scan fetcher backed by https://lelscan.net.

    Pages are discovered by scraping the site's HTML (BeautifulSoup) and
    downloaded concurrently with a thread pool.
    """

    def _request(self, url: str) -> bytes | str:
        """Fetch ``url`` and return the raw response body.

        A browser-like User-Agent header is sent (see ``_REQUEST_HEADERS``).
        The HTTP response is closed deterministically via a context manager
        instead of being leaked until garbage collection.
        """
        request = urllib.request.Request(url=url, headers=_REQUEST_HEADERS)
        with urllib.request.urlopen(request) as response:
            return cast(bytes | str, response.read())

    def list_mangas(self) -> list[LelScansManga]:
        """Return every manga listed on the site, sorted by name."""
        soup = BeautifulSoup(self._request(_BASE_URL), "html.parser")
        assert soup.body is not None
        # The header form holds two <select>s; the second one lists mangas.
        select = soup.body.select("#header-image > h2 > form > select")[1]
        return sorted(
            (
                LelScansManga(name=option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            key=lambda m: m.name,
        )

    def list_chapters(self, manga: LelScansManga) -> list[LelScansChapter]:
        """Return the chapters of ``manga``, sorted by numeric chapter number."""
        soup = BeautifulSoup(self._request(manga.url), "html.parser")
        assert soup.body is not None
        # The header form holds two <select>s; the first one lists chapters.
        select = soup.body.select("#header-image > h2 > form > select")[0]
        return sorted(
            (
                LelScansChapter(manga, option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            # Chapter "numbers" may be fractional (e.g. "10.5"), so compare
            # as floats rather than lexicographically.
            key=lambda c: float(c.number),
        )

    def _fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path,
    ) -> list[Path]:
        """Download every page of ``chapter`` into ``folder``.

        Returns the paths of the downloaded images, in page order (pages
        whose download returned nothing are filtered out).
        """
        LOGGER.info("Retrieving scan %s... ", chapter.number)
        folder.mkdir(exist_ok=True)
        # Retrieve the chapter page to discover the per-page links.
        soup = BeautifulSoup(self._request(chapter.url), "html.parser")
        anchors = soup.select("#navigation a")
        pages: list[tuple[int, str]] = []
        for anchor in anchors:
            try:
                pages.append((int(anchor.text), anchor.attrs["href"]))
            except ValueError:
                # Non-numeric anchors (e.g. links to other chapters) are skipped.
                pass
        pages.sort()

        # Download each page of the scan
        def retrieve_page(page: tuple[int, str]) -> Path | None:
            """Download one page image and strip its alpha channel, if any."""
            number, url = page
            LOGGER.info(" Retrieving page %02d/%02d", number, len(pages))
            soup = BeautifulSoup(
                self._request(url),
                "html.parser",
            )
            url = (
                _BASE_URL
                + soup.select("#image > table > tr > td > a > img")[0]
                .attrs["src"]
                .strip()
            )
            data = self._request(url)
            # File name comes from the URL path, minus any query string.
            filepath = folder.joinpath(url.split("/")[-1].split("?")[0])
            with open(filepath, "wb") as fp:
                fp.write(data)  # type: ignore
            # Remove alpha channel, if any: re-encode in place as RGB so the
            # image can later be embedded in a PDF.
            try:
                Image.open(filepath).convert("RGB").save(filepath)
            except (OSError, KeyError):
                LOGGER.warning(
                    " Failed to convert page %02d/%02d", number, len(pages)
                )
            return filepath

        # Downloads are network-bound, so a thread pool parallelizes them well.
        with ThreadPool() as pool:
            return [image for image in pool.map(retrieve_page, pages) if image]

    def fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path | None = None,
        pdf: Path | None = None,
    ) -> None:
        """Download ``chapter`` into ``folder`` and optionally merge it into ``pdf``.

        When ``folder`` is None the images are downloaded into a temporary
        directory that lives only as long as needed to build the PDF.
        """
        if folder is None:
            with tempfile.TemporaryDirectory() as tmp:
                images = self._fetch_chapter(chapter, Path(tmp))
                # BUGFIX: merge *inside* the context manager — previously the
                # temporary directory (and every downloaded image) was deleted
                # before _merge_scan ran, so the PDF was built from missing files.
                self._merge_to_pdf(chapter, images, pdf)
        else:
            images = self._fetch_chapter(chapter, folder)
            self._merge_to_pdf(chapter, images, pdf)

    def _merge_to_pdf(
        self,
        chapter: LelScansChapter,
        images: list[Path],
        pdf: Path | None,
    ) -> None:
        """Merge ``images`` into ``pdf`` when a PDF path was requested."""
        if pdf is not None:
            LOGGER.info("Merging scan %s... ", chapter.number)
            self._merge_scan(pdf, images)
if __name__ == "__main__":
    # Smoke test: enumerate and print every manga the site advertises.
    fetcher = LelScansFetcher()
    for entry in fetcher.list_mangas():
        print(entry)