diff --git a/src/main/java/com/magamochi/common/ContentProviders.java b/src/main/java/com/magamochi/common/ContentProviders.java
index eb7f9ac..59a4b3b 100644
--- a/src/main/java/com/magamochi/common/ContentProviders.java
+++ b/src/main/java/com/magamochi/common/ContentProviders.java
@@ -4,6 +4,7 @@ public class ContentProviders {
   public static final String MANGA_LIVRE_BLOG = "Manga Livre Blog";
   public static final String MANGA_LIVRE_TO = "Manga Livre.to";
   public static final String PINK_ROSA_SCAN = "Pink Rosa Scan";
+  public static final String TAIMU = "Taimu";
   public static final String MANGA_DEX = "MangaDex";
   public static final String MANUAL_IMPORT = "Manual Import";
 }
diff --git a/src/main/java/com/magamochi/ingestion/client/ScrollableScrapperClient.java b/src/main/java/com/magamochi/ingestion/client/ScrollableScrapperClient.java
new file mode 100644
index 0000000..26c5e0e
--- /dev/null
+++ b/src/main/java/com/magamochi/ingestion/client/ScrollableScrapperClient.java
@@ -0,0 +1,33 @@
+package com.magamochi.ingestion.client;
+
+import lombok.Builder;
+import lombok.Getter;
+import org.springframework.cloud.openfeign.FeignClient;
+import org.springframework.http.MediaType;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestBody;
+
+/** Feign client for the service located at {@code scrollable-scrapper.endpoint}. */
+@FeignClient(name = "scrollable-scrapper", url = "${scrollable-scrapper.endpoint}")
+public interface ScrollableScrapperClient {
+  /**
+   * Posts the request as JSON and returns the service's JSON reply.
+   *
+   * @param request the URL of the page to retrieve
+   * @return the page source reported by the service
+   */
+  @PostMapping(
+      consumes = MediaType.APPLICATION_JSON_VALUE,
+      produces = MediaType.APPLICATION_JSON_VALUE)
+  GetResponse get(@RequestBody GetRequest request);
+
+  /** Request body carrying the URL to retrieve. */
+  @Getter
+  @Builder
+  class GetRequest {
+    private final String url;
+  }
+
+  /** Response body holding the retrieved page source. */
+  record GetResponse(String pageSource) {}
+}
diff --git a/src/main/java/com/magamochi/ingestion/providers/impl/TaimuProvider.java b/src/main/java/com/magamochi/ingestion/providers/impl/TaimuProvider.java
new file mode 100644
index 0000000..2b7e2f6
--- /dev/null
+++ b/src/main/java/com/magamochi/ingestion/providers/impl/TaimuProvider.java
@@ -0,0 +1,218 @@
+package com.magamochi.ingestion.providers.impl;
+
+import static java.util.Objects.isNull;
+import static java.util.Objects.nonNull;
+
+import com.magamochi.catalog.model.entity.MangaContentProvider;
+import com.magamochi.common.ContentProviders;
+import com.magamochi.common.exception.UnprocessableException;
+import com.magamochi.ingestion.model.dto.ContentImageInfoDTO;
+import com.magamochi.ingestion.model.dto.ContentInfoDTO;
+import com.magamochi.ingestion.model.dto.MangaInfoDTO;
+import com.magamochi.ingestion.providers.ContentProvider;
+import com.magamochi.ingestion.providers.PagedContentProvider;
+import com.magamochi.ingestion.service.FlareService;
+import com.magamochi.ingestion.service.ScrollableScrapperService;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.stream.IntStream;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.log4j.Log4j2;
+import org.jsoup.nodes.Document;
+import org.springframework.stereotype.Service;
+
+/**
+ * Content provider backed by the Taimu website.
+ *
+ * <p>Listing and chapter-index pages are fetched through {@link FlareService}; chapter pages are
+ * fetched through {@link ScrollableScrapperService}. Parsing depends on CSS selectors that match
+ * the site's markup, so a markup change surfaces as empty results and logged errors.
+ */
+@Log4j2
+@Service(ContentProviders.TAIMU)
+@RequiredArgsConstructor
+public class TaimuProvider implements ContentProvider, PagedContentProvider {
+  private static final String BASE_URL = "https://taimumangas.rzword.xyz";
+
+  private final FlareService flareService;
+  private final ScrollableScrapperService scrollableScrapperService;
+
+  /**
+   * Collects the chapters listed for a manga, walking every page of its chapter index.
+   *
+   * @param provider association between a manga and its URL on this provider
+   * @return the chapters found; empty when a {@link NoSuchElementException} interrupts the walk
+   */
+  @Override
+  public List<ContentInfoDTO> getAvailableChapters(MangaContentProvider provider) {
+    log.info(
+        "Getting available chapters from {}, manga {}",
+        ContentProviders.TAIMU,
+        provider.getManga().getTitle());
+
+    try {
+      var firstPage = fetchChapterListPage(provider, 1);
+
+      var totalPages = extractContentPagesFromDocument(firstPage);
+      var contentInfoList = new ArrayList<>(extractContentInfoFromDocument(firstPage));
+
+      for (int page = 2; page <= totalPages; page++) {
+        var pageDocument = fetchChapterListPage(provider, page);
+        contentInfoList.addAll(extractContentInfoFromDocument(pageDocument));
+      }
+
+      return contentInfoList;
+    } catch (NoSuchElementException e) {
+      log.error("Error parsing chapters from " + ContentProviders.TAIMU, e);
+      return List.of();
+    }
+  }
+
+  /** Fetches one page of the manga's chapter index, requested with {@code order=desc}. */
+  private Document fetchChapterListPage(MangaContentProvider provider, int page) {
+    return flareService.getContentAsJsoupDocument(
+        provider.getUrl() + "?page=" + page + "&order=desc", ContentProviders.TAIMU);
+  }
+
+  /**
+   * Reads the number of chapter-index pages from the pagination controls.
+   *
+   * @param document a chapter-index page
+   * @return the number on the button in second-to-last child position, or 1 when unreadable
+   */
+  private int extractContentPagesFromDocument(Document document) {
+    try {
+      var buttonsContainer =
+          document.selectFirst(
+              "div.flex.items-center.justify-between.gap-3.mt-4.pt-4.border-t div.flex.items-center.gap-1");
+      var highNumberButton = buttonsContainer.selectFirst("button:nth-last-child(2)");
+      return Integer.parseInt(highNumberButton.text());
+    } catch (Exception e) {
+      // In case of any error during parsing, we assume there is only one page of content
+      return 1;
+    }
+  }
+
+  /**
+   * Extracts the chapter entries from one chapter-index page.
+   *
+   * @param document a chapter-index page
+   * @return one entry per chapter link; empty when the page cannot be parsed
+   */
+  private List<ContentInfoDTO> extractContentInfoFromDocument(Document document) {
+    try {
+      var grid = document.selectFirst("div.grid.grid-cols-1.gap-1");
+      var chapters = grid.select("a");
+
+      return chapters.stream()
+          .map(
+              chapter -> {
+                var chapterUrl = BASE_URL + chapter.attr("href").trim();
+                var title =
+                    chapter.selectFirst("div.flex-1.min-w-0 div.flex.items-center.gap-2 p").text();
+
+                return new ContentInfoDTO(title, chapterUrl, "pt-BR");
+              })
+          .toList();
+    } catch (Exception e) {
+      log.error("Error parsing content info from " + ContentProviders.TAIMU, e);
+      return List.of();
+    }
+  }
+
+  /**
+   * Lists the images found on a chapter page.
+   *
+   * @param chapterUrl URL of the chapter page
+   * @return the image sources paired with their zero-based position; empty when a {@link
+   *     NoSuchElementException} is raised
+   */
+  @Override
+  public List<ContentImageInfoDTO> getContentImages(String chapterUrl) {
+    log.info("Getting images from {}, url {}", ContentProviders.TAIMU, chapterUrl);
+
+    try {
+      var document = scrollableScrapperService.getContentAsJsoupDocument(chapterUrl);
+
+      var chapterImages = document.select("img.w-full.h-auto.object-contain.cursor-pointer");
+
+      var imageUrls = chapterImages.stream().map(image -> image.attr("src")).toList();
+
+      return IntStream.range(0, imageUrls.size())
+          .mapToObj(position -> new ContentImageInfoDTO(position, imageUrls.get(position)))
+          .toList();
+    } catch (NoSuchElementException e) {
+      log.error("Error parsing manga images from " + ContentProviders.TAIMU, e);
+      return List.of();
+    }
+  }
+
+  /**
+   * Lists the mangas shown on one page of the site's library.
+   *
+   * @param page library page number to fetch
+   * @return the title and absolute URL of each manga; empty when a {@link
+   *     NoSuchElementException} is raised
+   */
+  @Override
+  public List<MangaInfoDTO> getMangasFromPage(int page) {
+    log.info("Getting mangas from {}, page {}", ContentProviders.TAIMU, page);
+
+    try {
+      var document =
+          flareService.getContentAsJsoupDocument(
+              BASE_URL + "/biblioteca?page=" + page, ContentProviders.TAIMU);
+
+      var mangas = document.select("a.group");
+
+      return mangas.stream()
+          // selectFirst returns null on no match; skip such links rather than fail the page
+          .filter(element -> nonNull(element.selectFirst("div h3")))
+          .map(
+              element -> {
+                var mangaUrl = element.attr("href");
+                var title = element.selectFirst("div h3").text();
+
+                return new MangaInfoDTO(title.trim(), BASE_URL + mangaUrl.trim());
+              })
+          .toList();
+    } catch (NoSuchElementException e) {
+      log.error("Error parsing mangas from " + ContentProviders.TAIMU, e);
+      return List.of();
+    }
+  }
+
+  /**
+   * Reads the number of library pages from the pagination controls.
+   *
+   * @return the number shown on the last pagination button, or 0 when it cannot be determined
+   */
+  @Override
+  public int getTotalPages() {
+    log.info("Getting total pages for {}", ContentProviders.TAIMU);
+
+    try {
+      var document =
+          flareService.getContentAsJsoupDocument(BASE_URL + "/biblioteca", ContentProviders.TAIMU);
+
+      var container = document.selectFirst("div.flex.flex-col.items-center.space-y-3.mt-6");
+      var pagination = container.selectFirst("div.flex.items-center.gap-2");
+      var buttonsContainer = pagination.selectFirst("div.flex.items-center.gap-1");
+
+      var lastButton = buttonsContainer.select("button").last();
+
+      if (isNull(lastButton)) {
+        throw new UnprocessableException(
+            "Pagination buttons not found in " + ContentProviders.TAIMU);
+      }
+
+      var buttonText = lastButton.text();
+      return Integer.parseInt(buttonText);
+    } catch (Exception e) {
+      // Log the cause: this also catches fetch failures and non-numeric button labels
+      log.error("Error parsing total pages from " + ContentProviders.TAIMU, e);
+      return 0;
+    }
+  }
+}
diff --git a/src/main/java/com/magamochi/ingestion/service/ScrollableScrapperService.java b/src/main/java/com/magamochi/ingestion/service/ScrollableScrapperService.java
new file mode 100644
index 0000000..0c2c797
--- /dev/null
+++ b/src/main/java/com/magamochi/ingestion/service/ScrollableScrapperService.java
@@ -0,0 +1,29 @@
+package com.magamochi.ingestion.service;
+
+import com.magamochi.ingestion.client.ScrollableScrapperClient;
+import lombok.RequiredArgsConstructor;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.springframework.stereotype.Service;
+
+/** Retrieves page sources through {@link ScrollableScrapperClient} and parses them with Jsoup. */
+@Service
+@RequiredArgsConstructor
+public class ScrollableScrapperService {
+  private final ScrollableScrapperClient client;
+
+  /**
+   * Requests the page source for {@code url} and parses it.
+   *
+   * @param url address of the page to retrieve
+   * @return the returned page source parsed into a Jsoup document
+   */
+  public Document getContentAsJsoupDocument(String url) {
+    return Jsoup.parse(getContent(url));
+  }
+
+  private String getContent(String url) {
+    var request = ScrollableScrapperClient.GetRequest.builder().url(url).build();
+    return client.get(request).pageSource();
+  }
+}
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index f9ee347..bc65589 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -22,9 +22,9 @@ spring:
     openfeign:
       client:
         config:
-          web-scrapper:
-            connect-timeout: 240000
-            read-timeout: 240000
+          scrollable-scrapper:
+            connect-timeout: 480000
+            read-timeout: 480000
   rabbitmq:
     host: ${RABBITMQ_HOST}
     port: ${RABBITMQ_PORT}
@@ -41,6 +41,9 @@ springdoc:
 flare-solverr:
   endpoint: ${FLARESOLVERR_ENDPOINT}
 
+scrollable-scrapper:
+  endpoint: ${SCROLLABLE_SCRAPPER_ENDPOINT}
+
 minio:
   endpoint: ${MINIO_ENDPOINT}
   accessKey: ${MINIO_USER}
diff --git a/src/main/resources/db/migration/V0009__TAIMU_PROVIDER.sql b/src/main/resources/db/migration/V0009__TAIMU_PROVIDER.sql
new file mode 100644
index 0000000..3b11f35
--- /dev/null
+++ b/src/main/resources/db/migration/V0009__TAIMU_PROVIDER.sql
@@ -0,0 +1,2 @@
+INSERT INTO content_providers(name, url, active, supports_content_fetch, manual_import)
+VALUES ('Taimu', 'https://taimumangas.rzword.xyz', TRUE, TRUE, FALSE);
\ No newline at end of file