import logging
|
|
import tempfile
|
|
import urllib.error
|
|
import urllib.request
|
|
from dataclasses import dataclass
|
|
from multiprocessing.pool import ThreadPool
|
|
from pathlib import Path
|
|
from typing import cast
|
|
|
|
from bs4 import BeautifulSoup
|
|
from PIL import Image, ImageFile, UnidentifiedImageError
|
|
|
|
from .scans import Chapter, Manga, ScanFetcher
|
|
|
|
_BASE_URL = "https://lelscan.net"
|
|
|
|
_REQUEST_HEADERS = {
|
|
"User-Agent": (
|
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
|
|
"(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
|
|
)
|
|
}
|
|
|
|
# fix for some images
|
|
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
|
|
|
LOGGER = logging.getLogger(__package__)
|
|
|
|
|
|
@dataclass(frozen=True)
class LelScansManga(Manga):
    """A manga listed on lelscan.net, extending the base ``Manga`` record."""

    # Absolute URL of the manga's page on lelscan.net; fetching it yields
    # the chapter <select> scraped by ``LelScansFetcher.list_chapters``.
    url: str
|
|
|
|
|
|
@dataclass(frozen=True)
class LelScansChapter(Chapter):
    """A chapter of a lelscan.net manga, extending the base ``Chapter`` record."""

    # Absolute URL of the chapter page; fetching it yields the page-number
    # navigation anchors scraped by ``LelScansFetcher._fetch_chapter``.
    url: str
|
|
|
|
|
|
class LelScansFetcher(ScanFetcher[LelScansManga, LelScansChapter]):
    """Scan fetcher for https://lelscan.net.

    Scrapes the site's HTML with BeautifulSoup to enumerate mangas and
    chapters, and downloads the pages of a chapter concurrently with a
    thread pool.
    """

    def _request(self, url: str) -> bytes | str:
        """Fetch *url* with a browser-like User-Agent and return the raw body.

        Raises:
            urllib.error.HTTPError: on HTTP-level failures (callers that
                tolerate missing pages catch this).
        """
        request = urllib.request.Request(url=url, headers=_REQUEST_HEADERS)
        # Close the HTTP response deterministically instead of leaking the
        # socket until garbage collection (the original read from an
        # unclosed response object).
        with urllib.request.urlopen(request) as response:
            return cast(bytes | str, response.read())

    def list_mangas(self) -> list[LelScansManga]:
        """Return every manga listed on the site, sorted by name."""
        soup = BeautifulSoup(self._request(_BASE_URL), "html.parser")
        assert soup.body is not None

        # The second <select> of the header form lists the available mangas;
        # each <option> carries the manga page URL in its "value" attribute.
        select = soup.body.select("#header-image > h2 > form > select")[1]
        return sorted(
            (
                LelScansManga(name=option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            key=lambda m: m.name,
        )

    def list_chapters(self, manga: LelScansManga) -> list[LelScansChapter]:
        """Return every chapter of *manga*, sorted by numeric chapter number."""
        soup = BeautifulSoup(self._request(manga.url), "html.parser")
        assert soup.body is not None

        # The first <select> of the header form lists the manga's chapters.
        select = soup.body.select("#header-image > h2 > form > select")[0]

        return sorted(
            (
                LelScansChapter(manga, option.text, url=option.attrs["value"])
                for option in select.select("option")
            ),
            # Chapter "numbers" may be non-integer (e.g. "10.5"), hence float.
            key=lambda c: float(c.number),
        )

    def _fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path,
    ) -> list[Path]:
        """Download every page of *chapter* into *folder*.

        Returns the paths of the successfully downloaded (and RGB-converted)
        images, in page order; pages that fail to download or convert are
        logged and skipped.
        """
        LOGGER.info("Retrieving scan %s... ", chapter.number)
        # parents=True so a nested target folder does not make mkdir fail.
        folder.mkdir(parents=True, exist_ok=True)

        # retrieve the main page
        soup = BeautifulSoup(self._request(chapter.url), "html.parser")

        # retrieve the pages: the navigation anchors whose text is a page
        # number; non-numeric anchors (prev/next links) are skipped.
        anchors = soup.select("#navigation a")
        pages: list[tuple[int, str]] = []
        for anchor in anchors:
            try:
                pages.append((int(anchor.text), anchor.attrs["href"]))
            except ValueError:
                pass
        pages = sorted(pages)

        # Download each page of the scan
        def retrieve_page(page: tuple[int, str]) -> Path | None:
            number, url = page
            LOGGER.info(" Retrieving page %02d/%02d", number, len(pages))
            soup = BeautifulSoup(
                self._request(url),
                "html.parser",
            )

            # The page embeds the actual image as a site-relative src.
            url = (
                _BASE_URL
                + soup.select("#image > table > tr > td > a > img")[0]
                .attrs["src"]
                .strip()
            )
            try:
                data = self._request(url)
            except urllib.error.HTTPError:
                LOGGER.warning(
                    " Failed to retrieve page %02d/%02d from %s.",
                    number,
                    len(pages),
                    url,
                )
                return None

            # Name the file after the URL basename, without query string.
            filepath = folder.joinpath(url.split("/")[-1].split("?")[0])
            with open(filepath, "wb") as fp:
                fp.write(data)  # type: ignore

            # Remove alpha channel, if any:
            try:
                Image.open(filepath).convert("RGB").save(filepath)
            except UnidentifiedImageError:
                # Corrupt download — drop the file entirely.
                LOGGER.warning(
                    " Failed to convert page %02d/%02d, removing page.",
                    number,
                    len(pages),
                )
                filepath.unlink()
                return None
            except (OSError, KeyError):
                LOGGER.warning(
                    " Failed to convert page %02d/%02d", number, len(pages)
                )
                return None

            return filepath

        # Pages are independent, so fetch them concurrently; map preserves
        # page order and failed pages (None) are filtered out.
        with ThreadPool() as pool:
            return [image for image in pool.map(retrieve_page, pages) if image]

    def fetch_chapter(
        self,
        chapter: LelScansChapter,
        folder: Path | None = None,
        pdf: Path | None = None,
    ) -> None:
        """Download *chapter* into *folder* and optionally merge it into *pdf*.

        When *folder* is None the images are downloaded to a temporary
        directory that is deleted afterwards (only useful together with
        *pdf*).
        """

        def merge(images: list[Path]) -> None:
            if pdf is not None:
                LOGGER.info("Merging scan %s... ", chapter.number)
                self._merge_scan(pdf, images)

        if folder is None:
            # BUG FIX: merge while the temporary directory still exists —
            # the original merged after the "with" block had exited, i.e.
            # after every downloaded image had already been deleted.
            with tempfile.TemporaryDirectory() as tmp:
                merge(self._fetch_chapter(chapter, Path(tmp)))
        else:
            merge(self._fetch_chapter(chapter, folder))
|
|
|
|
|
|
if __name__ == "__main__":
    # Quick smoke test: enumerate and print every manga the site lists.
    fetcher = LelScansFetcher()
    for available_manga in fetcher.list_mangas():
        print(available_manga)
|