From 2f39749698f57b0e5edbc5d30fbed60f225d2539 Mon Sep 17 00:00:00 2001 From: CodingPF Date: Wed, 8 Nov 2023 22:05:34 +0100 Subject: [PATCH 1/2] add date to title for regular topics --- .../mserver/crawler/dw/DwConstants.java | 5 +++++ .../crawler/dw/parser/DwFilmDetailDeserializer.java | 6 +++++- .../mserver/crawler/dw/tasks/DWOverviewTask.java | 2 +- .../mserver/crawler/dw/tasks/DwFilmDetailTask.java | 12 ++++++++++-- .../crawler/dw/tasks/DWDetailDeserializerTest.java | 2 +- 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/dw/DwConstants.java b/src/main/java/de/mediathekview/mserver/crawler/dw/DwConstants.java index c1415eeb4..f2ecb9eed 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/dw/DwConstants.java +++ b/src/main/java/de/mediathekview/mserver/crawler/dw/DwConstants.java @@ -1,5 +1,8 @@ package de.mediathekview.mserver.crawler.dw; +import java.util.Arrays; +import java.util.List; + public class DwConstants { private DwConstants() {} @@ -7,4 +10,6 @@ private DwConstants() {} public static final String URL_OVERVIEW = "/list/mediacenter/1?pageIndex=1"; + public static final List REGULAR_TOPICS = Arrays.asList("Euromaxx", "Shift", "Fokus Europa", "Projekt Zukunft", "Global Us"); + } diff --git a/src/main/java/de/mediathekview/mserver/crawler/dw/parser/DwFilmDetailDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/dw/parser/DwFilmDetailDeserializer.java index 4573d57d4..8c7018479 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/dw/parser/DwFilmDetailDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/dw/parser/DwFilmDetailDeserializer.java @@ -7,6 +7,7 @@ import de.mediathekview.mlib.daten.Sender; import de.mediathekview.mserver.base.utils.JsonUtils; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; +import de.mediathekview.mserver.crawler.dw.DwConstants; import de.mediathekview.mserver.crawler.dw.DwVideoDto; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -127,7 +128,10 @@ public Optional deserialize( final JsonArray jsonObjectMainContentSources = jsonObjectMainContent.get(ELEMENT_MAINCONTENT_SOURCES).getAsJsonArray(); getVideos(title.get(), jsonObjectMainContentSources).ifPresent(film::addAllUrls); - // + // Euromaxx always has the same title and we do not get the subtitle + if (DwConstants.REGULAR_TOPICS.contains(film.getThema())) { + film.setTitel(film.getTitel() + " " + film.getTime().format(DateTimeFormatter.ofPattern("yyyy-MM-dd"))); + } return Optional.of(film); } diff --git a/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DWOverviewTask.java b/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DWOverviewTask.java index 62c1c80fa..164f20cdd 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DWOverviewTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DWOverviewTask.java @@ -15,7 +15,7 @@ import java.util.concurrent.ConcurrentLinkedQueue; public class DWOverviewTask extends DWTaskBase { - + private static final long serialVersionUID = 4050423702709695861L; private static final Type OPTIONAL_OVERVIEW_DTO_TYPE_TOKEN = new TypeToken>>() {}.getType(); private final int subpage; diff --git a/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DwFilmDetailTask.java b/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DwFilmDetailTask.java index 6014b075e..db681ad1e 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DwFilmDetailTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DwFilmDetailTask.java @@ -2,6 +2,7 @@ import com.google.gson.reflect.TypeToken; import de.mediathekview.mlib.daten.Film; +import de.mediathekview.mserver.base.utils.FilmlistDebugHelper; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; @@ -44,14 +45,21 @@ protected void processRestTarget(final CrawlerUrlDTO aDTO, final WebTarget aTarg filmDetailDtoOptional = deserializeOptional(aTarget, OPTIONAL_FILM_DETAIL_DTO_TYPE_TOKEN); } catch (Exception e) { LOG.error("error processing {} ", aDTO.getUrl(), e); + crawler.incrementAndGetErrorCount(); + crawler.updateProgress(); } if (filmDetailDtoOptional.isEmpty()) { crawler.incrementAndGetErrorCount(); crawler.updateProgress(); return; } - this.taskResults.add(filmDetailDtoOptional.get()); - crawler.incrementAndGetActualCount(); + if (!this.taskResults.add(filmDetailDtoOptional.get())) { + crawler.incrementAndGetErrorCount(); + Film dup = FilmlistDebugHelper.getFilmFromSet(this.taskResults, filmDetailDtoOptional.get()); + LOG.warn("Entry was rejected {} \nBecause exists as {}", filmDetailDtoOptional.get(), dup); + } else { + crawler.incrementAndGetActualCount(); + } crawler.updateProgress(); } } diff --git a/src/test/java/de/mediathekview/mserver/crawler/dw/tasks/DWDetailDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/dw/tasks/DWDetailDeserializerTest.java index 9a55ef67c..9b537c027 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/dw/tasks/DWDetailDeserializerTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/dw/tasks/DWDetailDeserializerTest.java @@ -95,7 +95,7 @@ public static Collection data() { }, { "/dw/dw_film_detail_five_video_urls.json", - "Energiezukunft? Schwimmende Windkraftanlagen", + "Energiezukunft? Schwimmende Windkraftanlagen 2022-11-12", "Projekt Zukunft", "https://p.dw.com/p/4JNwb", Duration.ofSeconds(385), From 150b3975c0d3aac8bd8f71d2e847792345d61794 Mon Sep 17 00:00:00 2001 From: CodingPF Date: Wed, 8 Nov 2023 22:15:02 +0100 Subject: [PATCH 2/2] remove reference to debug --- .../mserver/crawler/dw/tasks/DwFilmDetailTask.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DwFilmDetailTask.java b/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DwFilmDetailTask.java index db681ad1e..66e88fb37 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DwFilmDetailTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/dw/tasks/DwFilmDetailTask.java @@ -2,7 +2,6 @@ import com.google.gson.reflect.TypeToken; import de.mediathekview.mlib.daten.Film; -import de.mediathekview.mserver.base.utils.FilmlistDebugHelper; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; @@ -55,8 +54,7 @@ protected void processRestTarget(final CrawlerUrlDTO aDTO, final WebTarget aTarg } if (!this.taskResults.add(filmDetailDtoOptional.get())) { crawler.incrementAndGetErrorCount(); - Film dup = FilmlistDebugHelper.getFilmFromSet(this.taskResults, filmDetailDtoOptional.get()); - LOG.warn("Entry was rejected {} \nBecause exists as {}", filmDetailDtoOptional.get(), dup); + LOG.warn("Entry was rejected because existing {}", filmDetailDtoOptional.get()); } else { crawler.incrementAndGetActualCount(); }