From 3532127071b11631d6f226f5171ce25f1e99d61c Mon Sep 17 00:00:00 2001 From: vtempest <1274452+vtempest@users.noreply.github.com> Date: Sat, 7 Sep 2024 17:01:25 -0700 Subject: [PATCH] added UMAP dimensionality reduction, HNSW vector earch, with test demos. exractors modularized (readabilty, psotlight) --- .github/workflows/static.yml | 14 +- docs/assets/navigation.js | 2 +- docs/assets/search.js | 2 +- .../src_extract_content_core.Document.html | 23 - .../src_extract_content_core.Extractor.html | 25 - ...act_content_courlan_urlstore.UrlStore.html | 42 - ...xtract_content_deduplication.LRUCache.html | 10 - ...extract_content_deduplication.Simhash.html | 14 - ...rc_extract_content_downloads.Response.html | 13 - ...src_extract_content_settings.Document.html | 30 - ...rc_extract_content_settings.Extractor.html | 27 - ...xtract_content_spider.CrawlParameters.html | 19 - ...c_trafilatura_extractor_core.Document.html | 23 + ..._trafilatura_extractor_core.Extractor.html | 25 + ...a_extractor_courlan_urlstore.UrlStore.html | 42 + ...tura_extractor_deduplication.LRUCache.html | 10 + ...atura_extractor_deduplication.Simhash.html | 14 + ...filatura_extractor_downloads.Response.html | 13 + ...ra_extractor_readability.Readability.html} | 325 +- ...afilatura_extractor_settings.Document.html | 30 + ...filatura_extractor_settings.Extractor.html | 27 + ...tura_extractor_spider.CrawlParameters.html | 19 + ...tocomplete.suggestNextWordCompletions.html | 4 +- ...n_allow_cors.modifyChromeAPIAllowCORS.html | 4 +- ...compile_topic_model.compileTopicModel.html | 4 +- ...topic_model.weightWikiWordSpecificity.html | 2 +- ...press_json_jz64.compressBase64ZipText.html | 7 +- ...rt_dictionary_import.importDictionary.html | 2 +- ...t_human_names_import.importHumanNames.html | 2 +- ...ed_typos_import.importMisspelledTypos.html | 2 +- ...port_quora_import.importCommonQueries.html | 2 +- ..._frequency_import.importTermFrequency.html | 2 +- ...ge_titles_import.importWikiPageTitles.html | 2 +- 
...src_extract_content_baseline.baseline.html | 4 - ...xtract_content_baseline.basicCleaning.html | 4 - ...src_extract_content_baseline.loadHtml.html | 4 - .../src_extract_content_baseline.trim.html | 4 - ...c_extract_content_core.bareExtraction.html | 4 - .../src_extract_content_core.extract.html | 4 - ...ract_content_courlan_clean.cleanQuery.html | 11 - ...xtract_content_courlan_clean.cleanUrl.html | 10 - ..._content_courlan_clean.decodePunycode.html | 9 - ...ntent_courlan_clean.normalizeFragment.html | 10 - ...t_content_courlan_clean.normalizePart.html | 9 - ...ct_content_courlan_clean.normalizeUrl.html | 12 - ...xtract_content_courlan_clean.scrubUrl.html | 9 - ...t_content_courlan_filters.basicFilter.html | 9 - ..._content_courlan_filters.domainFilter.html | 9 - ...ntent_courlan_filters.extensionFilter.html | 9 - ...tent_courlan_filters.isNavigationPage.html | 9 - ...ontent_courlan_filters.isNotCrawlable.html | 9 - ...ct_content_courlan_filters.isValidUrl.html | 9 - ...ct_content_courlan_filters.langFilter.html | 12 - ...ct_content_courlan_filters.pathFilter.html | 10 - ...ct_content_courlan_filters.typeFilter.html | 11 - ...t_content_courlan_filters.validateUrl.html | 9 - ...extract_content_courlan_meta.addCache.html | 6 - ...ract_content_courlan_meta.clearCaches.html | 6 - ...ntent_courlan_network.redirectionTest.html | 12 - ...ontent_courlan_urlutils.extractDomain.html | 11 - ...t_content_courlan_urlutils.filterUrls.html | 10 - ...t_content_courlan_urlutils.getBaseUrl.html | 9 - ...ntent_courlan_urlutils.getHostAndPath.html | 9 - ..._content_courlan_urlutils.getHostinfo.html | 9 - ...t_content_courlan_urlutils.getTldinfo.html | 10 - ...t_content_courlan_urlutils.isExternal.html | 11 - ..._content_courlan_urlutils.isKnownLink.html | 10 - ...tent_deduplication.contentFingerprint.html | 4 - ...content_deduplication.isSimilarDomain.html | 4 - ...content_downloads.addToCompressedDict.html | 4 - ...t_content_downloads.bufferedDownloads.html | 4 - 
...tract_content_downloads.fetchResponse.html | 4 - ...rc_extract_content_downloads.fetchUrl.html | 4 - ..._content_downloads.loadDownloadBuffer.html | 4 - ...ct_content_external.compareExtraction.html | 4 - ...xtract_content_external.justextRescue.html | 4 - ...extract_content_external.sanitizeTree.html | 4 - ...rc_extract_content_feeds.findFeedUrls.html | 4 - ...src_extract_content_feeds.tryHomepage.html | 4 - ...ontent_htmlprocessing.buildHtmlOutput.html | 4 - ...ontent_htmlprocessing.collectLinkInfo.html | 4 - ...ntent_htmlprocessing.convertDeletions.html | 4 - ...content_htmlprocessing.convertDetails.html | 4 - ...t_content_htmlprocessing.convertLists.html | 4 - ..._content_htmlprocessing.convertQuotes.html | 4 - ..._content_htmlprocessing.convertToHtml.html | 4 - ...nt_htmlprocessing.deleteByLinkDensity.html | 4 - ...ontent_htmlprocessing.linkDensityTest.html | 4 - ..._htmlprocessing.linkDensityTestTables.html | 4 - ...ract_content_htmlprocessing.pruneHtml.html | 4 - ...ent_htmlprocessing.pruneUnwantedNodes.html | 4 - ...t_content_htmlprocessing.treeCleaning.html | 4 - ...act_content_json_metadata.extractJson.html | 4 - ...t_json_metadata.extractJsonParseError.html | 4 - ...ontent_json_metadata.normalizeAuthors.html | 4 - ...ontent_main_extractor.extractComments.html | 4 - ...content_main_extractor.extractContent.html | 4 - .../src_extract_content_meta.resetCaches.html | 4 - ...ract_content_metadata.extractMetadata.html | 4 - ...ract_content_settings.argsToExtractor.html | 4 - ...ontent_sitemaps.extractRobotsSitemaps.html | 4 - ...t_content_sitemaps.findRobotsSitemaps.html | 4 - ...t_content_sitemaps.isPlausibleSitemap.html | 4 - ...xtract_content_sitemaps.sitemapSearch.html | 4 - ...extract_content_spider.focusedCrawler.html | 4 - ...xtract_content_utils.controlXmlOutput.html | 4 - ...c_extract_content_utils.deleteElement.html | 4 - ...extract_content_utils.mergeWithParent.html | 4 - ...act_content_utils.removeEmptyElements.html | 4 - 
...extract_content_utils.stripDoubleTags.html | 4 - .../src_extract_content_utils.textfilter.html | 4 - .../src_extract_content_utils.trim.html | 4 - .../src_extract_content_utils.xmlToTxt.html | 4 - ...c_extract_content_xml.buildJsonOutput.html | 4 - ...rc_extract_content_xml.buildTeiOutput.html | 4 - .../src_extract_content_xml.checkTei.html | 4 - ...src_extract_content_xml.deleteElement.html | 4 - ...tract_content_xml.removeEmptyElements.html | 4 - ..._to_cite_extract_author.extractAuthor.html | 2 +- ...html_to_cite_extract_cite.extractCite.html | 2 +- ...html_to_cite_extract_date.extractDate.html | 2 +- ..._to_cite_extract_source.extractSource.html | 2 +- ...ml_to_cite_extract_title.extractTitle.html | 2 +- ...an_names_recognize.extractNamedEntity.html | 2 +- ...adata_to_cite.extractCiteFromMetadata.html | 2 +- ...cite_url_to_domain.convertURLToDomain.html | 2 +- ...to_cite_url_to_favicon.extractFavicon.html | 4 +- ...nt_extract_content.extractMainContent.html | 77 - ...extractor1_content.extractContentHTML.html | 77 + ...xtractor2_content.extractContentHTML2.html | 41 + ...special_chars.convertHTMLSpecialChars.html | 6 +- ...nt_html_to_basic_html.addDOMFunctions.html | 2 +- ..._to_basic_html.convertHTMLToBasicHTML.html | 10 +- ...ontent_html_to_content.extractContent.html | 6 - ...html_to_content.extractContentAndCite.html | 14 + ...ntent_readability2.extractContentHTML.html | 24 - ...tent_pdf_to_content.convertPDFToHTML.html} | 9 +- ...rl_to_content_pdf_to_content.isUrlPDF.html | 2 +- ...r_url_to_content_scrape_url.scrapeURL.html | 10 +- ...url_to_content_url_to_content.extract.html | 7 +- ...tent_youtube_embed.embedYoutubePlayer.html | 2 +- ...nt_youtube_to_text.extractYoutubeText.html | 7 +- ...ings_to_graph.convertEmbeddingsToHNSW.html | 15 + ...ings_to_graph.convertEmbeddingsToUMAP.html | 13 + ...to_graph.convertTextToEmbeddingVector.html | 17 + ..._embeddings_to_graph.cosineSimilarity.html | 4 + ..._embeddings_to_graph.getAllEmbeddings.html | 8 + 
...eddings_to_graph.getEmbeddingPipeline.html | 6 + ...h_embeddings_to_graph.searchWithQuery.html | 4 + ...e_hnsw.convertEmbeddingsIndexToBase64.html | 10 + ...etters.calculateSimilarityByCharacter.html | 2 +- .../src_match_match_quasar.matchQUASAR.html | 8 +- ...h_softmax.calculateProbabilitySoftmax.html | 5 +- ..._frequency.calculatePhraseSpecificity.html | 2 +- ...frequency.weighRelevanceTermFrequency.html | 4 +- ...search_web_search_stream.searchSTREAM.html | 16 +- .../src_search_web_search_web.searchWeb.html | 7 +- ..._web_search_wikipedia.searchWikipedia.html | 4 +- ...larity_concept.vectorizeTextAsConcept.html | 16 - ...y_concept.weighRelevanceConceptVector.html | 7 +- ...oncept.weighRelevanceConceptVectorAPI.html | 7 +- ...src_tokenize_sentences.splitSentences.html | 7 +- ...okenize_stopwords.isWordCommonIgnored.html | 2 +- ...text_to_chunks.splitTextSemanticChars.html | 2 +- ...kenize_tokenize_topics.tokenizeTopics.html | 7 +- ...rd_to_root_stem.convertWordToRootStem.html | 2 +- ...ution.weighTopicDirichletDistribution.html | 4 +- ...rc_topics_ngrams.extractNounEdgeGrams.html | 2 +- ...rases.rankSentencesCentralToKeyphrase.html | 2 +- ...seektopic_keyphrases.extractSEEKTOPIC.html | 4 +- ...afilatura_extractor_baseline.baseline.html | 4 + ...tura_extractor_baseline.basicCleaning.html | 4 + ...afilatura_extractor_baseline.html2txt.html | 4 + ...afilatura_extractor_baseline.loadHtml.html | 4 + ...c_trafilatura_extractor_baseline.trim.html | 4 + ...latura_extractor_core.bareExtraction.html} | 2 +- ...rc_trafilatura_extractor_core.extract.html | 4 + ...ra_extractor_courlan_clean.cleanQuery.html | 11 + ...tura_extractor_courlan_clean.cleanUrl.html | 10 + ...xtractor_courlan_clean.decodePunycode.html | 9 + ...actor_courlan_clean.normalizeFragment.html | 10 + ...extractor_courlan_clean.normalizePart.html | 9 + ..._extractor_courlan_clean.normalizeUrl.html | 12 + ...tura_extractor_courlan_clean.scrubUrl.html | 9 + ...tura_extractor_courlan_core.checkUrl.html} | 4 
+- ..._extractor_courlan_core.extractLinks.html} | 4 +- ...a_extractor_courlan_core.filterLinks.html} | 4 +- ...extractor_courlan_filters.basicFilter.html | 9 + ...xtractor_courlan_filters.domainFilter.html | 9 + ...actor_courlan_filters.extensionFilter.html | 9 + ...ctor_courlan_filters.isNavigationPage.html | 9 + ...ractor_courlan_filters.isNotCrawlable.html | 9 + ...extractor_courlan_filters.isValidUrl.html} | 7 +- ..._extractor_courlan_filters.langFilter.html | 12 + ..._extractor_courlan_filters.pathFilter.html | 10 + ..._extractor_courlan_filters.typeFilter.html | 11 + ...extractor_courlan_filters.validateUrl.html | 9 + ...tura_extractor_courlan_meta.addCache.html} | 4 +- ...ra_extractor_courlan_meta.clearCaches.html | 6 + ...actor_courlan_network.redirectionTest.html | 12 + ...xtractor_courlan_sampling.sampleUrls.html} | 4 +- ...actor_courlan_urlutils.extractDomain.html} | 9 +- ...extractor_courlan_urlutils.filterUrls.html | 10 + ...ctor_courlan_urlutils.fixRelativeUrls.html | 10 + ...extractor_courlan_urlutils.getBaseUrl.html | 9 + ...actor_courlan_urlutils.getHostAndPath.html | 9 + ...xtractor_courlan_urlutils.getHostinfo.html | 9 + ...extractor_courlan_urlutils.getTldinfo.html | 10 + ...extractor_courlan_urlutils.isExternal.html | 11 + ...xtractor_courlan_urlutils.isKnownLink.html | 10 + ...ctor_deduplication.contentFingerprint.html | 4 + ...extractor_deduplication.duplicateTest.html | 4 + ...tractor_deduplication.generateBowHash.html | 4 + ...tractor_deduplication.isSimilarDomain.html | 4 + ...tractor_downloads.addToCompressedDict.html | 4 + ...extractor_downloads.bufferedDownloads.html | 4 + ...ura_extractor_downloads.fetchResponse.html | 4 + ...filatura_extractor_downloads.fetchUrl.html | 4 + ...xtractor_downloads.loadDownloadBuffer.html | 4 + ..._extractor_external.compareExtraction.html | 4 + ...tura_extractor_external.justextRescue.html | 4 + ...atura_extractor_external.sanitizeTree.html | 4 + ...ilatura_extractor_feeds.findFeedUrls.html} | 2 +- 
...filatura_extractor_feeds.tryHomepage.html} | 2 +- ...special_chars.convertHTMLSpecialChars.html | 13 + ...ractor_htmlprocessing.buildHtmlOutput.html | 4 + ...actor_htmlprocessing.collectLinkInfo.html} | 2 +- ...actor_htmlprocessing.convertDeletions.html | 4 + ...tractor_htmlprocessing.convertDetails.html | 4 + ...actor_htmlprocessing.convertHeadings.html} | 8 +- ...ctor_htmlprocessing.convertLineBreaks.html | 4 + ...xtractor_htmlprocessing.convertLists.html} | 2 +- ...xtractor_htmlprocessing.convertQuotes.html | 4 + ..._extractor_htmlprocessing.convertTags.html | 4 + ...xtractor_htmlprocessing.convertToHtml.html | 4 + ...or_htmlprocessing.deleteByLinkDensity.html | 4 + ...ractor_htmlprocessing.handleTextnode.html} | 2 +- ...ractor_htmlprocessing.linkDensityTest.html | 4 + ..._htmlprocessing.linkDensityTestTables.html | 4 + ..._extractor_htmlprocessing.processNode.html | 4 + ...ra_extractor_htmlprocessing.pruneHtml.html | 4 + ...or_htmlprocessing.pruneUnwantedNodes.html} | 2 +- ...extractor_htmlprocessing.treeCleaning.html | 4 + ..._extractor_json_metadata.extractJson.html} | 2 +- ...r_json_metadata.extractJsonParseError.html | 4 + ...ractor_json_metadata.normalizeAuthors.html | 4 + ...extractor_json_metadata.normalizeJson.html | 4 + ...actor_main_extractor.extractComments.html} | 2 +- ...tractor_main_extractor.extractContent.html | 4 + ...rafilatura_extractor_meta.resetCaches.html | 4 + ...ra_extractor_metadata.extractMetadata.html | 4 + ...r_readability_lxml.extractContentHTML.html | 4 + ...a_extractor_settings.argsToExtractor.html} | 2 +- ...tura_extractor_settings.setDateParams.html | 4 + ...ractor_sitemaps.extractRobotsSitemaps.html | 4 + ...extractor_sitemaps.findRobotsSitemaps.html | 4 + ...extractor_sitemaps.isPlausibleSitemap.html | 4 + ...ura_extractor_sitemaps.sitemapSearch.html} | 2 +- ...atura_extractor_spider.focusedCrawler.html | 4 + ...tura_extractor_utils.buildJsonOutput.html} | 2 +- ...atura_extractor_utils.buildTeiOutput.html} | 2 +- 
...latura_extractor_utils.buildXmlOutput.html | 4 + ..._trafilatura_extractor_utils.checkTei.html | 4 + ...tura_extractor_utils.controlXmlOutput.html | 4 + ...latura_extractor_utils.deleteElement.html} | 2 +- ...tura_extractor_utils.mergeWithParent.html} | 2 +- ...ura_extractor_utils.normalizeUnicode.html} | 2 +- ...a_extractor_utils.removeEmptyElements.html | 4 + ...tura_extractor_utils.stripDoubleTags.html} | 2 +- ...ilatura_extractor_utils.textCharsTest.html | 4 + ...rafilatura_extractor_utils.textfilter.html | 4 + .../src_trafilatura_extractor_utils.trim.html | 4 + ...atura_extractor_utils.writeFullheader.html | 4 + ...filatura_extractor_utils.writeTeitree.html | 4 + ...trafilatura_extractor_utils.xmlToTxt.html} | 2 +- ...ilatura_extractor_xml.buildJsonOutput.html | 4 + ...ilatura_extractor_xml.buildTeiOutput.html} | 2 +- ...filatura_extractor_xml.buildXmlOutput.html | 4 + ...rc_trafilatura_extractor_xml.checkTei.html | 4 + ...latura_extractor_xml.controlXmlOutput.html | 4 + ...afilatura_extractor_xml.deleteElement.html | 4 + ...ilatura_extractor_xml.mergeWithParent.html | 4 + ...ura_extractor_xml.removeEmptyElements.html | 4 + ...ilatura_extractor_xml.stripDoubleTags.html | 4 + docs/index.html | 48 +- ...te_human_names_recognize.AuthorObject.html | 8 +- ...tent_html_to_content.ExtractedContent.html | 18 + ...url_to_content_url_to_content.Article.html | 24 +- ...o_content_youtube_embed.YouTubePlayer.html | 74 +- ...aph_embeddings_to_graph.PlotDataPoint.html | 10 + .../src_tokenize_tokenize_topics.Token.html | 8 +- docs/modules/index.html | 14 +- .../src_autocomplete_autocomplete.html | 2 +- .../src_chrome_extension_allow_cors.html | 2 +- ...rc_dataset_import_compile_topic_model.html | 2 +- ...src_dataset_import_compress_json_jz64.html | 2 +- .../src_dataset_import_dictionary_import.html | 2 +- ...src_dataset_import_human_names_import.html | 2 +- .../src_dataset_import_metadata_stats.html | 2 +- ...ataset_import_misspelled_typos_import.html | 2 +- 
.../src_dataset_import_quora_import.html | 2 +- ..._dataset_import_term_frequency_import.html | 2 +- ...dataset_import_wikipage_titles_import.html | 2 +- .../modules/src_extract_content_baseline.html | 9 - docs/modules/src_extract_content_core.html | 8 - .../src_extract_content_courlan-1.html | 31 - docs/modules/src_extract_content_courlan.html | 31 - .../src_extract_content_courlan_clean.html | 11 - .../src_extract_content_courlan_filters.html | 14 - .../src_extract_content_courlan_settings.html | 8 - .../src_extract_content_courlan_urlutils.html | 13 - .../src_extract_content_deduplication.html | 10 - .../src_extract_content_htmldate-1.html | 5 - .../src_extract_content_htmldate.export_.html | 4 - .../src_extract_content_htmldate_core.html | 15 - ...c_extract_content_htmldate_extractors.html | 25 - .../src_extract_content_htmldate_meta.html | 5 - ...src_extract_content_htmldate_settings.html | 9 - .../src_extract_content_htmldate_utils.html | 16 - ...c_extract_content_htmldate_validators.html | 14 - .../src_extract_content_json_metadata.html | 8 - .../src_extract_content_readability_lxml.html | 5 - .../modules/src_extract_content_settings.html | 23 - .../modules/src_extract_content_sitemaps.html | 8 - docs/modules/src_extract_content_spider.html | 7 - docs/modules/src_extract_content_utils.html | 18 - docs/modules/src_extract_content_xml.html | 13 - docs/modules/src_extract_content_xpaths.html | 17 - ...extractor_html_to_cite_extract_author.html | 2 +- ...c_extractor_html_to_cite_extract_cite.html | 2 +- ...c_extractor_html_to_cite_extract_date.html | 2 +- ...extractor_html_to_cite_extract_source.html | 2 +- ..._extractor_html_to_cite_extract_title.html | 2 +- ...or_html_to_cite_human_names_recognize.html | 2 +- ...tractor_html_to_cite_metadata_to_cite.html | 2 +- ..._extractor_html_to_cite_url_to_domain.html | 2 +- ...extractor_html_to_cite_url_to_favicon.html | 2 +- ...actor_html_to_content_extract_content.html | 5 - 
...t_extract_content_extractor1_content.html} | 3 +- ...nt_extract_content_extractor2_content.html | 5 + ...act_content_extractor2_content_utils.html} | 2 +- ...or_html_to_content_html_special_chars.html | 2 +- ...or_html_to_content_html_to_basic_html.html | 2 +- ...actor_html_to_content_html_to_content.html | 3 +- ...tractor_url_to_content_pdf_to_content.html | 2 +- ...c_extractor_url_to_content_scrape_url.html | 2 +- ...tractor_url_to_content_url_to_content.html | 2 +- ...xtractor_url_to_content_youtube_embed.html | 2 +- ...ractor_url_to_content_youtube_to_text.html | 2 +- .../src_graph_embeddings_to_graph.html | 12 + ...ent_meta.html => src_graph_save_hnsw.html} | 2 +- docs/modules/src_match_compare_letters.html | 2 +- docs/modules/src_match_match_quasar.html | 2 +- docs/modules/src_match_softmax.html | 2 +- .../src_match_weigh_relevance_frequency.html | 2 +- .../modules/src_search_web_search_stream.html | 2 +- docs/modules/src_search_web_search_web.html | 2 +- .../src_search_web_search_wikipedia.html | 2 +- .../src_similarity_similarity_concept.html | 3 +- docs/modules/src_tokenize_sentences.html | 2 +- docs/modules/src_tokenize_stopwords.html | 2 +- docs/modules/src_tokenize_text_to_chunks.html | 2 +- .../modules/src_tokenize_tokenize_topics.html | 2 +- .../src_tokenize_word_to_root_stem.html | 2 +- ...topic_distribution_topic_distribution.html | 2 +- docs/modules/src_topics_ngrams.html | 2 +- .../src_topics_rank_sentences_keyphrases.html | 2 +- .../src_topics_seektopic_keyphrases.html | 2 +- .../src_trafilatura_extractor_baseline.html | 9 + .../src_trafilatura_extractor_core.html | 8 + .../src_trafilatura_extractor_courlan-1.html | 31 + .../src_trafilatura_extractor_courlan.html | 31 + ...c_trafilatura_extractor_courlan_clean.html | 11 + ...rc_trafilatura_extractor_courlan_core.html | 7 + ...trafilatura_extractor_courlan_filters.html | 14 + ...rc_trafilatura_extractor_courlan_meta.html | 6 + ...rafilatura_extractor_courlan_network.html} | 2 +- 
...afilatura_extractor_courlan_sampling.html} | 3 +- ...rafilatura_extractor_courlan_settings.html | 8 + ...rafilatura_extractor_courlan_urlstore.html | 5 + ...rafilatura_extractor_courlan_urlutils.html | 13 + ...c_trafilatura_extractor_deduplication.html | 10 + ... src_trafilatura_extractor_downloads.html} | 7 +- ...> src_trafilatura_extractor_external.html} | 21 +- ...l => src_trafilatura_extractor_feeds.html} | 5 +- ...ilatura_extractor_html_special_chars.html} | 2 +- .../src_trafilatura_extractor_htmldate-1.html | 5 + ...rafilatura_extractor_htmldate.export_.html | 4 + .../src_trafilatura_extractor_htmldate.html | 6 + ...c_trafilatura_extractor_htmldate_core.html | 15 + ...ilatura_extractor_htmldate_extractors.html | 25 + ...c_trafilatura_extractor_htmldate_meta.html | 5 + ...afilatura_extractor_htmldate_settings.html | 9 + ..._trafilatura_extractor_htmldate_utils.html | 16 + ...ilatura_extractor_htmldate_validators.html | 14 + ..._trafilatura_extractor_htmlprocessing.html | 22 + ...c_trafilatura_extractor_json_metadata.html | 8 + ..._trafilatura_extractor_main_extractor.html | 6 + ...ml => src_trafilatura_extractor_meta.html} | 2 +- ...> src_trafilatura_extractor_metadata.html} | 3 +- ...rc_trafilatura_extractor_readability.html} | 2 +- ...afilatura_extractor_readability_lxml.html} | 2 +- .../src_trafilatura_extractor_settings.html | 23 + .../src_trafilatura_extractor_sitemaps.html | 8 + ... => src_trafilatura_extractor_spider.html} | 6 +- ...afilatura_extractor_test_extract_test.html | 4 + .../src_trafilatura_extractor_utils.html | 20 + ...tml => src_trafilatura_extractor_xml.html} | 15 +- ... 
=> src_trafilatura_extractor_xpaths.html} | 15 +- ...ct_content_courlan_settings.BLACKLIST.html | 4 - ..._content_courlan_settings.LANG_PARAMS.html | 4 - ...rc_extract_content_htmldate.export_-1.html | 4 - ...tract_content_settings.DEFAULT_CONFIG.html | 4 - ...act_content_settings.MANUALLY_CLEANED.html | 4 - ...tent_settings.MAX_FILES_PER_DIRECTORY.html | 4 - ...rc_extract_content_settings.MAX_LINKS.html | 4 - ...ct_content_settings.MAX_SITEMAPS_SEEN.html | 4 - ...ct_content_settings.SUPPORTED_FORMATS.html | 4 - ..._extract_content_settings.TAG_CATALOG.html | 4 - .../src_extract_content_spider.URL_STORE.html | 4 - ..._content_xpaths.AUTHOR_DISCARD_XPATHS.html | 4 - ..._extract_content_xpaths.AUTHOR_XPATHS.html | 4 - ...ract_content_xpaths.CATEGORIES_XPATHS.html | 4 - ..._content_xpaths.OVERALL_DISCARD_XPATH.html | 4 - ...rc_extract_content_xpaths.TAGS_XPATHS.html | 4 - ...t_content_xpaths.TEASER_DISCARD_XPATH.html | 4 - ...c_extract_content_xpaths.TITLE_XPATHS.html | 4 - ...actor_courlan_settings.ALLOWED_PARAMS.html | 4 + ..._extractor_courlan_settings.BLACKLIST.html | 4 + ...xtractor_courlan_settings.LANG_PARAMS.html | 4 + ...tractor_courlan_settings.TARGET_LANGS.html | 4 + ...filatura_extractor_htmldate.export_-1.html | 4 + ...extractor_settings.BASIC_CLEAN_XPATH.html} | 2 +- ...a_extractor_settings.CUT_EMPTY_ELEMS.html} | 2 +- ...ura_extractor_settings.DEFAULT_CONFIG.html | 4 + ...tura_extractor_settings.FILENAME_LEN.html} | 2 +- ...extractor_settings.JUSTEXT_LANGUAGES.html} | 2 +- ...filatura_extractor_settings.LRU_SIZE.html} | 2 +- ..._extractor_settings.MANUALLY_CLEANED.html} | 2 +- ..._extractor_settings.MANUALLY_STRIPPED.html | 4 + ...tor_settings.MAX_FILES_PER_DIRECTORY.html} | 2 +- ...ilatura_extractor_settings.MAX_LINKS.html} | 2 +- ...extractor_settings.MAX_SITEMAPS_SEEN.html} | 2 +- ...ra_extractor_settings.PARALLEL_CORES.html} | 2 +- ...extractor_settings.SUPPORTED_FMT_CLI.html} | 2 +- ..._extractor_settings.SUPPORTED_FORMATS.html | 4 + 
...latura_extractor_settings.TAG_CATALOG.html | 4 + ...afilatura_extractor_spider.URL_STORE.html} | 2 +- ...xtractor_xpaths.AUTHOR_DISCARD_XPATHS.html | 4 + ...latura_extractor_xpaths.AUTHOR_XPATHS.html | 4 + ...filatura_extractor_xpaths.BODY_XPATH.html} | 2 +- ...ra_extractor_xpaths.CATEGORIES_XPATHS.html | 4 + ...tractor_xpaths.COMMENTS_DISCARD_XPATH.html | 4 + ...atura_extractor_xpaths.COMMENTS_XPATH.html | 4 + ...tractor_xpaths.DISCARD_IMAGE_ELEMENTS.html | 4 + ...xtractor_xpaths.OVERALL_DISCARD_XPATH.html | 4 + ...ractor_xpaths.PRECISION_DISCARD_XPATH.html | 4 + ...xtractor_xpaths.REMOVE_COMMENTS_XPATH.html | 4 + ...ilatura_extractor_xpaths.TAGS_XPATHS.html} | 2 +- ...extractor_xpaths.TEASER_DISCARD_XPATH.html | 4 + ...ilatura_extractor_xpaths.TITLE_XPATHS.html | 4 + index.js | 40 +- package.json | 31 +- pnpm-lock.yaml | 2821 +++++++++++++++++ readme.md | 47 +- src/autocomplete/autocomplete.js | 2 +- src/dataset-import/compile-topic-model.js | 2 +- src/dataset-import/compress-json-jz64.js | 4 +- src/extract-content/python/baseline.py | 121 - src/extract-content/python/clean.py | 207 -- src/extract-content/python/core.py | 364 --- src/extract-content/python/courlan-core.py | 256 -- src/extract-content/python/deduplication.py | 253 -- src/extract-content/python/downloads.py | 474 --- src/extract-content/python/external.py | 189 -- src/extract-content/python/extractors.py | 515 --- src/extract-content/python/feeds.py | 312 -- src/extract-content/python/filters.py | 286 -- src/extract-content/python/htmldate-core.py | 982 ------ src/extract-content/python/htmlprocessing.py | 412 --- src/extract-content/python/json_metadata.py | 268 -- src/extract-content/python/main_extractor.py | 666 ---- src/extract-content/python/meta (1).py | 14 - src/extract-content/python/meta (2).py | 40 - src/extract-content/python/meta.py | 32 - src/extract-content/python/metadata.py | 496 --- src/extract-content/python/network.py | 71 - .../python/readability_lxml.py | 520 --- 
src/extract-content/python/sampling.py | 73 - src/extract-content/python/settings (1).py | 109 - src/extract-content/python/settings (2).py | 41 - src/extract-content/python/settings.py | 282 -- src/extract-content/python/sitemaps.py | 301 -- src/extract-content/python/spider.py | 351 -- src/extract-content/python/urlstore.py | 575 ---- src/extract-content/python/urlutils.py | 173 - src/extract-content/python/utils (1).py | 260 -- src/extract-content/python/utils.py | 445 --- src/extract-content/python/validators.py | 200 -- src/extract-content/python/xml.py | 611 ---- src/extract-content/python/xpaths.py | 267 -- src/extractor/html-to-cite/url-to-favicon.js | 2 +- .../html-to-content/extract-content.js | 222 -- .../extract-selectors-per-domain.json} | 0 .../extract-content/extractor1-content.js | 391 +++ .../extractor2-content-utils.js | 743 +++++ .../extract-content/extractor2-content.js | 820 +++++ .../html-to-content/html-special-chars.js | 7 +- .../html-to-content/html-to-basic-html.js | 13 +- .../html-to-content/html-to-content.js | 73 +- src/extractor/html-to-content/readability2.js | 303 -- .../url-to-content/pdf-to-content.js | 10 +- src/extractor/url-to-content/scrape-url.js | 26 +- .../url-to-content/url-to-content.js | 18 +- .../url-to-content/youtube-to-text.js | 23 +- src/graph/embeddings-to-graph.js | 172 + src/graph/save-hnsw.js | 32 + src/match/match-quasar.js | 12 +- src/match/softmax.js | 6 +- src/match/weigh-relevance-frequency.js | 2 +- src/search-web/search-stream.js | 15 +- src/search-web/search-web.js | 4 +- src/search-web/search-wikipedia.js | 2 +- src/similarity/similarity-concept.js | 64 +- src/tokenize/sentences.js | 4 +- src/tokenize/tokenize-topics.js | 4 +- src/topic-distribution/topic-distribution.js | 2 +- src/topics/seektopic-keyphrases.js | 2 +- src/trafilatura-extractor/.gitignore | 175 + .../baseline.js | 0 .../core.js | 0 .../courlan/clean.js | 0 .../courlan/core.js | 0 .../courlan/filters.js | 0 .../courlan/index.js | 0 
.../courlan/meta.js | 0 .../courlan/network.js | 0 .../courlan/sampling.js | 0 .../courlan/settings.js | 0 .../courlan/urlstore.js | 0 .../courlan/urlutils.js | 0 .../deduplication.js | 0 .../downloads.js | 0 .../external.js | 0 .../feeds.js | 0 .../html-special-chars.js | 74 + .../htmldate/core.js | 13 - .../htmldate/extractors.js | 0 .../htmldate/index.js | 0 .../htmldate/meta.js | 0 .../htmldate/settings.js | 0 .../htmldate/utils.js | 0 .../htmldate/validators.js | 0 .../htmlprocessing.js | 0 src/trafilatura-extractor/jsconfig.json | 27 + .../json_metadata.js | 0 .../main_extractor.js | 0 .../meta.js | 0 .../metadata.js | 0 src/trafilatura-extractor/package.json | 19 + .../readability.js | 31 +- .../readability_lxml.js | 0 .../settings.js | 2 +- .../sitemaps.js | 0 .../spider.js | 0 .../test/extract.test.js | 28 + .../utils.js | 0 .../xml.js | 0 .../xpaths.js | 0 test/data/extract-article.test.js | 2 +- test/embedding.test.js | 89 + test/extractor-test/add-score.test.js | 23 + test/extractor-test/add-to-parent.test.js | 17 + test/extractor-test/extractor.test.js | 26 + .../extractor-test/find-top-candidate.test.js | 70 + test/extractor-test/get-or-init-score.test.js | 55 + test/extractor-test/get-score.test.js | 20 + test/extractor-test/get-weight.test.js | 92 + test/extractor-test/score-commas.test.js | 18 + test/extractor-test/score-content.test.js | 90 + test/extractor-test/score-length.test.js | 21 + test/extractor-test/score-node.test.js | 101 + test/extractor-test/score-parag.test.js | 39 + test/extractor-test/set-score.test.js | 19 + test/foo.dat | Bin 1856 -> 0 bytes test/graph.test.js | 32 + test/hnsw.js | 45 - test/readability.test.js | 37 +- test/relevace-concept-vector.test.js | 6 +- test/topic-distribution.test.js | 2 +- test/vectors.dat | Bin 8496 -> 0 bytes tsconfig.json | 2 +- web-app/package.json | 7 +- web-app/src/components/Home/Graph.svelte | 86 + web-app/src/components/Home/Home.svelte | 144 +- web-app/src/components/Home/ReadView.svelte | 2 
+- 596 files changed, 8373 insertions(+), 12754 deletions(-) delete mode 100644 docs/classes/src_extract_content_core.Document.html delete mode 100644 docs/classes/src_extract_content_core.Extractor.html delete mode 100644 docs/classes/src_extract_content_courlan_urlstore.UrlStore.html delete mode 100644 docs/classes/src_extract_content_deduplication.LRUCache.html delete mode 100644 docs/classes/src_extract_content_deduplication.Simhash.html delete mode 100644 docs/classes/src_extract_content_downloads.Response.html delete mode 100644 docs/classes/src_extract_content_settings.Document.html delete mode 100644 docs/classes/src_extract_content_settings.Extractor.html delete mode 100644 docs/classes/src_extract_content_spider.CrawlParameters.html create mode 100644 docs/classes/src_trafilatura_extractor_core.Document.html create mode 100644 docs/classes/src_trafilatura_extractor_core.Extractor.html create mode 100644 docs/classes/src_trafilatura_extractor_courlan_urlstore.UrlStore.html create mode 100644 docs/classes/src_trafilatura_extractor_deduplication.LRUCache.html create mode 100644 docs/classes/src_trafilatura_extractor_deduplication.Simhash.html create mode 100644 docs/classes/src_trafilatura_extractor_downloads.Response.html rename docs/classes/{src_extractor_html_to_content_readability.Readability.html => src_trafilatura_extractor_readability.Readability.html} (55%) create mode 100644 docs/classes/src_trafilatura_extractor_settings.Document.html create mode 100644 docs/classes/src_trafilatura_extractor_settings.Extractor.html create mode 100644 docs/classes/src_trafilatura_extractor_spider.CrawlParameters.html delete mode 100644 docs/functions/src_extract_content_baseline.baseline.html delete mode 100644 docs/functions/src_extract_content_baseline.basicCleaning.html delete mode 100644 docs/functions/src_extract_content_baseline.loadHtml.html delete mode 100644 docs/functions/src_extract_content_baseline.trim.html delete mode 100644 
docs/functions/src_extract_content_core.bareExtraction.html delete mode 100644 docs/functions/src_extract_content_core.extract.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.cleanQuery.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.cleanUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.decodePunycode.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.normalizeFragment.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.normalizePart.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.normalizeUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.scrubUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.basicFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.domainFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.extensionFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.isNavigationPage.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.isNotCrawlable.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.isValidUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.langFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.pathFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.typeFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.validateUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_meta.addCache.html delete mode 100644 docs/functions/src_extract_content_courlan_meta.clearCaches.html delete mode 100644 docs/functions/src_extract_content_courlan_network.redirectionTest.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.extractDomain.html delete mode 100644 
docs/functions/src_extract_content_courlan_urlutils.filterUrls.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.getBaseUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.getHostAndPath.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.getHostinfo.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.getTldinfo.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.isExternal.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.isKnownLink.html delete mode 100644 docs/functions/src_extract_content_deduplication.contentFingerprint.html delete mode 100644 docs/functions/src_extract_content_deduplication.isSimilarDomain.html delete mode 100644 docs/functions/src_extract_content_downloads.addToCompressedDict.html delete mode 100644 docs/functions/src_extract_content_downloads.bufferedDownloads.html delete mode 100644 docs/functions/src_extract_content_downloads.fetchResponse.html delete mode 100644 docs/functions/src_extract_content_downloads.fetchUrl.html delete mode 100644 docs/functions/src_extract_content_downloads.loadDownloadBuffer.html delete mode 100644 docs/functions/src_extract_content_external.compareExtraction.html delete mode 100644 docs/functions/src_extract_content_external.justextRescue.html delete mode 100644 docs/functions/src_extract_content_external.sanitizeTree.html delete mode 100644 docs/functions/src_extract_content_feeds.findFeedUrls.html delete mode 100644 docs/functions/src_extract_content_feeds.tryHomepage.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.buildHtmlOutput.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.collectLinkInfo.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.convertDeletions.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.convertDetails.html delete mode 100644 
docs/functions/src_extract_content_htmlprocessing.convertLists.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.convertQuotes.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.convertToHtml.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.deleteByLinkDensity.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.linkDensityTest.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.linkDensityTestTables.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.pruneHtml.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.pruneUnwantedNodes.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.treeCleaning.html delete mode 100644 docs/functions/src_extract_content_json_metadata.extractJson.html delete mode 100644 docs/functions/src_extract_content_json_metadata.extractJsonParseError.html delete mode 100644 docs/functions/src_extract_content_json_metadata.normalizeAuthors.html delete mode 100644 docs/functions/src_extract_content_main_extractor.extractComments.html delete mode 100644 docs/functions/src_extract_content_main_extractor.extractContent.html delete mode 100644 docs/functions/src_extract_content_meta.resetCaches.html delete mode 100644 docs/functions/src_extract_content_metadata.extractMetadata.html delete mode 100644 docs/functions/src_extract_content_settings.argsToExtractor.html delete mode 100644 docs/functions/src_extract_content_sitemaps.extractRobotsSitemaps.html delete mode 100644 docs/functions/src_extract_content_sitemaps.findRobotsSitemaps.html delete mode 100644 docs/functions/src_extract_content_sitemaps.isPlausibleSitemap.html delete mode 100644 docs/functions/src_extract_content_sitemaps.sitemapSearch.html delete mode 100644 docs/functions/src_extract_content_spider.focusedCrawler.html delete mode 100644 docs/functions/src_extract_content_utils.controlXmlOutput.html delete 
mode 100644 docs/functions/src_extract_content_utils.deleteElement.html delete mode 100644 docs/functions/src_extract_content_utils.mergeWithParent.html delete mode 100644 docs/functions/src_extract_content_utils.removeEmptyElements.html delete mode 100644 docs/functions/src_extract_content_utils.stripDoubleTags.html delete mode 100644 docs/functions/src_extract_content_utils.textfilter.html delete mode 100644 docs/functions/src_extract_content_utils.trim.html delete mode 100644 docs/functions/src_extract_content_utils.xmlToTxt.html delete mode 100644 docs/functions/src_extract_content_xml.buildJsonOutput.html delete mode 100644 docs/functions/src_extract_content_xml.buildTeiOutput.html delete mode 100644 docs/functions/src_extract_content_xml.checkTei.html delete mode 100644 docs/functions/src_extract_content_xml.deleteElement.html delete mode 100644 docs/functions/src_extract_content_xml.removeEmptyElements.html delete mode 100644 docs/functions/src_extractor_html_to_content_extract_content.extractMainContent.html create mode 100644 docs/functions/src_extractor_html_to_content_extract_content_extractor1_content.extractContentHTML.html create mode 100644 docs/functions/src_extractor_html_to_content_extract_content_extractor2_content.extractContentHTML2.html delete mode 100644 docs/functions/src_extractor_html_to_content_html_to_content.extractContent.html create mode 100644 docs/functions/src_extractor_html_to_content_html_to_content.extractContentAndCite.html delete mode 100644 docs/functions/src_extractor_html_to_content_readability2.extractContentHTML.html rename docs/functions/{src_extractor_url_to_content_pdf_to_content.extractPDF.html => src_extractor_url_to_content_pdf_to_content.convertPDFToHTML.html} (55%) create mode 100644 docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToHNSW.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToUMAP.html create mode 100644 
docs/functions/src_graph_embeddings_to_graph.convertTextToEmbeddingVector.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.cosineSimilarity.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.getAllEmbeddings.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.getEmbeddingPipeline.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.searchWithQuery.html create mode 100644 docs/functions/src_graph_save_hnsw.convertEmbeddingsIndexToBase64.html delete mode 100644 docs/functions/src_similarity_similarity_concept.vectorizeTextAsConcept.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.baseline.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.basicCleaning.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.html2txt.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.loadHtml.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.trim.html rename docs/{variables/src_extract_content_xpaths.COMMENTS_XPATH.html => functions/src_trafilatura_extractor_core.bareExtraction.html} (75%) create mode 100644 docs/functions/src_trafilatura_extractor_core.extract.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.cleanQuery.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.cleanUrl.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.decodePunycode.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.normalizeFragment.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.normalizePart.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.normalizeUrl.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.scrubUrl.html rename docs/functions/{src_extract_content_courlan_core.checkUrl.html => 
src_trafilatura_extractor_courlan_core.checkUrl.html} (58%) rename docs/functions/{src_extract_content_courlan_core.extractLinks.html => src_trafilatura_extractor_courlan_core.extractLinks.html} (67%) rename docs/functions/{src_extract_content_courlan_core.filterLinks.html => src_trafilatura_extractor_courlan_core.filterLinks.html} (60%) create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.basicFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.domainFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.extensionFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.isNavigationPage.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.isNotCrawlable.html rename docs/{variables/src_extract_content_xpaths.BODY_XPATH.html => functions/src_trafilatura_extractor_courlan_filters.isValidUrl.html} (75%) create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.langFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.pathFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.typeFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.validateUrl.html rename docs/functions/{src_extract_content_baseline.html2txt.html => src_trafilatura_extractor_courlan_meta.addCache.html} (77%) create mode 100644 docs/functions/src_trafilatura_extractor_courlan_meta.clearCaches.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_network.redirectionTest.html rename docs/functions/{src_extract_content_courlan_sampling.sampleUrls.html => src_trafilatura_extractor_courlan_sampling.sampleUrls.html} (57%) rename docs/functions/{src_extract_content_deduplication.duplicateTest.html => src_trafilatura_extractor_courlan_urlutils.extractDomain.html} (66%) create mode 100644 
docs/functions/src_trafilatura_extractor_courlan_urlutils.filterUrls.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.fixRelativeUrls.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.getBaseUrl.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.getHostAndPath.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.getHostinfo.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.getTldinfo.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.isExternal.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.isKnownLink.html create mode 100644 docs/functions/src_trafilatura_extractor_deduplication.contentFingerprint.html create mode 100644 docs/functions/src_trafilatura_extractor_deduplication.duplicateTest.html create mode 100644 docs/functions/src_trafilatura_extractor_deduplication.generateBowHash.html create mode 100644 docs/functions/src_trafilatura_extractor_deduplication.isSimilarDomain.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.addToCompressedDict.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.bufferedDownloads.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.fetchResponse.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.fetchUrl.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.loadDownloadBuffer.html create mode 100644 docs/functions/src_trafilatura_extractor_external.compareExtraction.html create mode 100644 docs/functions/src_trafilatura_extractor_external.justextRescue.html create mode 100644 docs/functions/src_trafilatura_extractor_external.sanitizeTree.html rename docs/{variables/src_extract_content_courlan_settings.TARGET_LANGS.html => functions/src_trafilatura_extractor_feeds.findFeedUrls.html} (71%) rename 
docs/functions/{src_extract_content_htmlprocessing.processNode.html => src_trafilatura_extractor_feeds.tryHomepage.html} (78%) create mode 100644 docs/functions/src_trafilatura_extractor_html_special_chars.convertHTMLSpecialChars.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.buildHtmlOutput.html rename docs/functions/{src_extract_content_htmlprocessing.convertHeadings.html => src_trafilatura_extractor_htmlprocessing.collectLinkInfo.html} (78%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertDeletions.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertDetails.html rename docs/functions/{src_extract_content_courlan_urlutils.fixRelativeUrls.html => src_trafilatura_extractor_htmlprocessing.convertHeadings.html} (73%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertLineBreaks.html rename docs/{variables/src_extract_content_settings.FILENAME_LEN.html => functions/src_trafilatura_extractor_htmlprocessing.convertLists.html} (79%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertQuotes.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertTags.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertToHtml.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.deleteByLinkDensity.html rename docs/{variables/src_extract_content_settings.PARALLEL_CORES.html => functions/src_trafilatura_extractor_htmlprocessing.handleTextnode.html} (71%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.linkDensityTest.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.linkDensityTestTables.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.processNode.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.pruneHtml.html rename 
docs/functions/{src_extract_content_readability_lxml.extractContentHTML.html => src_trafilatura_extractor_htmlprocessing.pruneUnwantedNodes.html} (74%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.treeCleaning.html rename docs/functions/{src_extract_content_utils.buildTeiOutput.html => src_trafilatura_extractor_json_metadata.extractJson.html} (50%) create mode 100644 docs/functions/src_trafilatura_extractor_json_metadata.extractJsonParseError.html create mode 100644 docs/functions/src_trafilatura_extractor_json_metadata.normalizeAuthors.html create mode 100644 docs/functions/src_trafilatura_extractor_json_metadata.normalizeJson.html rename docs/functions/{src_extract_content_deduplication.generateBowHash.html => src_trafilatura_extractor_main_extractor.extractComments.html} (78%) create mode 100644 docs/functions/src_trafilatura_extractor_main_extractor.extractContent.html create mode 100644 docs/functions/src_trafilatura_extractor_meta.resetCaches.html create mode 100644 docs/functions/src_trafilatura_extractor_metadata.extractMetadata.html create mode 100644 docs/functions/src_trafilatura_extractor_readability_lxml.extractContentHTML.html rename docs/{variables/src_extract_content_settings.CUT_EMPTY_ELEMS.html => functions/src_trafilatura_extractor_settings.argsToExtractor.html} (75%) create mode 100644 docs/functions/src_trafilatura_extractor_settings.setDateParams.html create mode 100644 docs/functions/src_trafilatura_extractor_sitemaps.extractRobotsSitemaps.html create mode 100644 docs/functions/src_trafilatura_extractor_sitemaps.findRobotsSitemaps.html create mode 100644 docs/functions/src_trafilatura_extractor_sitemaps.isPlausibleSitemap.html rename docs/functions/{src_extract_content_settings.setDateParams.html => src_trafilatura_extractor_sitemaps.sitemapSearch.html} (61%) create mode 100644 docs/functions/src_trafilatura_extractor_spider.focusedCrawler.html rename docs/functions/{src_extract_content_utils.writeFullheader.html 
=> src_trafilatura_extractor_utils.buildJsonOutput.html} (77%) rename docs/functions/{src_extract_content_htmlprocessing.handleTextnode.html => src_trafilatura_extractor_utils.buildTeiOutput.html} (72%) create mode 100644 docs/functions/src_trafilatura_extractor_utils.buildXmlOutput.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.checkTei.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.controlXmlOutput.html rename docs/functions/{src_extract_content_json_metadata.normalizeJson.html => src_trafilatura_extractor_utils.deleteElement.html} (77%) rename docs/functions/{src_extract_content_xml.stripDoubleTags.html => src_trafilatura_extractor_utils.mergeWithParent.html} (76%) rename docs/functions/{src_extract_content_xml.controlXmlOutput.html => src_trafilatura_extractor_utils.normalizeUnicode.html} (77%) create mode 100644 docs/functions/src_trafilatura_extractor_utils.removeEmptyElements.html rename docs/functions/{src_extract_content_xml.mergeWithParent.html => src_trafilatura_extractor_utils.stripDoubleTags.html} (77%) create mode 100644 docs/functions/src_trafilatura_extractor_utils.textCharsTest.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.textfilter.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.trim.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.writeFullheader.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.writeTeitree.html rename docs/functions/{src_extract_content_utils.checkTei.html => src_trafilatura_extractor_utils.xmlToTxt.html} (78%) create mode 100644 docs/functions/src_trafilatura_extractor_xml.buildJsonOutput.html rename docs/functions/{src_extract_content_utils.buildXmlOutput.html => src_trafilatura_extractor_xml.buildTeiOutput.html} (80%) create mode 100644 docs/functions/src_trafilatura_extractor_xml.buildXmlOutput.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.checkTei.html create 
mode 100644 docs/functions/src_trafilatura_extractor_xml.controlXmlOutput.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.deleteElement.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.mergeWithParent.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.removeEmptyElements.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.stripDoubleTags.html create mode 100644 docs/interfaces/src_extractor_html_to_content_html_to_content.ExtractedContent.html create mode 100644 docs/interfaces/src_graph_embeddings_to_graph.PlotDataPoint.html delete mode 100644 docs/modules/src_extract_content_baseline.html delete mode 100644 docs/modules/src_extract_content_core.html delete mode 100644 docs/modules/src_extract_content_courlan-1.html delete mode 100644 docs/modules/src_extract_content_courlan.html delete mode 100644 docs/modules/src_extract_content_courlan_clean.html delete mode 100644 docs/modules/src_extract_content_courlan_filters.html delete mode 100644 docs/modules/src_extract_content_courlan_settings.html delete mode 100644 docs/modules/src_extract_content_courlan_urlutils.html delete mode 100644 docs/modules/src_extract_content_deduplication.html delete mode 100644 docs/modules/src_extract_content_htmldate-1.html delete mode 100644 docs/modules/src_extract_content_htmldate.export_.html delete mode 100644 docs/modules/src_extract_content_htmldate_core.html delete mode 100644 docs/modules/src_extract_content_htmldate_extractors.html delete mode 100644 docs/modules/src_extract_content_htmldate_meta.html delete mode 100644 docs/modules/src_extract_content_htmldate_settings.html delete mode 100644 docs/modules/src_extract_content_htmldate_utils.html delete mode 100644 docs/modules/src_extract_content_htmldate_validators.html delete mode 100644 docs/modules/src_extract_content_json_metadata.html delete mode 100644 docs/modules/src_extract_content_readability_lxml.html delete mode 100644 
docs/modules/src_extract_content_settings.html delete mode 100644 docs/modules/src_extract_content_sitemaps.html delete mode 100644 docs/modules/src_extract_content_spider.html delete mode 100644 docs/modules/src_extract_content_utils.html delete mode 100644 docs/modules/src_extract_content_xml.html delete mode 100644 docs/modules/src_extract_content_xpaths.html delete mode 100644 docs/modules/src_extractor_html_to_content_extract_content.html rename docs/modules/{src_extract_content_feeds.html => src_extractor_html_to_content_extract_content_extractor1_content.html} (51%) create mode 100644 docs/modules/src_extractor_html_to_content_extract_content_extractor2_content.html rename docs/{variables/src_extract_content_settings.LRU_SIZE.html => modules/src_extractor_html_to_content_extract_content_extractor2_content_utils.html} (56%) create mode 100644 docs/modules/src_graph_embeddings_to_graph.html rename docs/modules/{src_extract_content_meta.html => src_graph_save_hnsw.html} (53%) create mode 100644 docs/modules/src_trafilatura_extractor_baseline.html create mode 100644 docs/modules/src_trafilatura_extractor_core.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan-1.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_clean.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_core.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_filters.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_meta.html rename docs/modules/{src_extractor_html_to_content_readability.html => src_trafilatura_extractor_courlan_network.html} (89%) rename docs/modules/{src_extract_content_htmldate.html => src_trafilatura_extractor_courlan_sampling.html} (53%) create mode 100644 docs/modules/src_trafilatura_extractor_courlan_settings.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_urlstore.html create mode 
100644 docs/modules/src_trafilatura_extractor_courlan_urlutils.html create mode 100644 docs/modules/src_trafilatura_extractor_deduplication.html rename docs/modules/{src_extract_content_courlan_network.html => src_trafilatura_extractor_downloads.html} (73%) rename docs/modules/{src_extract_content_htmlprocessing.html => src_trafilatura_extractor_external.html} (55%) rename docs/modules/{src_extract_content_external.html => src_trafilatura_extractor_feeds.html} (51%) rename docs/modules/{src_extract_content_courlan_sampling.html => src_trafilatura_extractor_html_special_chars.html} (52%) create mode 100644 docs/modules/src_trafilatura_extractor_htmldate-1.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate.export_.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_core.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_extractors.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_meta.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_settings.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_utils.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_validators.html create mode 100644 docs/modules/src_trafilatura_extractor_htmlprocessing.html create mode 100644 docs/modules/src_trafilatura_extractor_json_metadata.html create mode 100644 docs/modules/src_trafilatura_extractor_main_extractor.html rename docs/modules/{src_extract_content_metadata.html => src_trafilatura_extractor_meta.html} (53%) rename docs/modules/{src_extract_content_main_extractor.html => src_trafilatura_extractor_metadata.html} (88%) rename docs/modules/{src_extract_content_courlan_urlstore.html => src_trafilatura_extractor_readability.html} (52%) rename docs/modules/{src_extractor_html_to_content_readability2.html => src_trafilatura_extractor_readability_lxml.html} (92%) create mode 
100644 docs/modules/src_trafilatura_extractor_settings.html create mode 100644 docs/modules/src_trafilatura_extractor_sitemaps.html rename docs/modules/{src_extract_content_courlan_core.html => src_trafilatura_extractor_spider.html} (80%) create mode 100644 docs/modules/src_trafilatura_extractor_test_extract_test.html create mode 100644 docs/modules/src_trafilatura_extractor_utils.html rename docs/modules/{src_extract_content_downloads.html => src_trafilatura_extractor_xml.html} (72%) rename docs/modules/{src_extract_content_courlan_meta.html => src_trafilatura_extractor_xpaths.html} (61%) delete mode 100644 docs/variables/src_extract_content_courlan_settings.BLACKLIST.html delete mode 100644 docs/variables/src_extract_content_courlan_settings.LANG_PARAMS.html delete mode 100644 docs/variables/src_extract_content_htmldate.export_-1.html delete mode 100644 docs/variables/src_extract_content_settings.DEFAULT_CONFIG.html delete mode 100644 docs/variables/src_extract_content_settings.MANUALLY_CLEANED.html delete mode 100644 docs/variables/src_extract_content_settings.MAX_FILES_PER_DIRECTORY.html delete mode 100644 docs/variables/src_extract_content_settings.MAX_LINKS.html delete mode 100644 docs/variables/src_extract_content_settings.MAX_SITEMAPS_SEEN.html delete mode 100644 docs/variables/src_extract_content_settings.SUPPORTED_FORMATS.html delete mode 100644 docs/variables/src_extract_content_settings.TAG_CATALOG.html delete mode 100644 docs/variables/src_extract_content_spider.URL_STORE.html delete mode 100644 docs/variables/src_extract_content_xpaths.AUTHOR_DISCARD_XPATHS.html delete mode 100644 docs/variables/src_extract_content_xpaths.AUTHOR_XPATHS.html delete mode 100644 docs/variables/src_extract_content_xpaths.CATEGORIES_XPATHS.html delete mode 100644 docs/variables/src_extract_content_xpaths.OVERALL_DISCARD_XPATH.html delete mode 100644 docs/variables/src_extract_content_xpaths.TAGS_XPATHS.html delete mode 100644 
docs/variables/src_extract_content_xpaths.TEASER_DISCARD_XPATH.html delete mode 100644 docs/variables/src_extract_content_xpaths.TITLE_XPATHS.html create mode 100644 docs/variables/src_trafilatura_extractor_courlan_settings.ALLOWED_PARAMS.html create mode 100644 docs/variables/src_trafilatura_extractor_courlan_settings.BLACKLIST.html create mode 100644 docs/variables/src_trafilatura_extractor_courlan_settings.LANG_PARAMS.html create mode 100644 docs/variables/src_trafilatura_extractor_courlan_settings.TARGET_LANGS.html create mode 100644 docs/variables/src_trafilatura_extractor_htmldate.export_-1.html rename docs/{functions/src_extract_content_htmlprocessing.convertLineBreaks.html => variables/src_trafilatura_extractor_settings.BASIC_CLEAN_XPATH.html} (79%) rename docs/{functions/src_extract_content_utils.buildJsonOutput.html => variables/src_trafilatura_extractor_settings.CUT_EMPTY_ELEMS.html} (76%) create mode 100644 docs/variables/src_trafilatura_extractor_settings.DEFAULT_CONFIG.html rename docs/{functions/src_extract_content_utils.writeTeitree.html => variables/src_trafilatura_extractor_settings.FILENAME_LEN.html} (79%) rename docs/variables/{src_extract_content_settings.JUSTEXT_LANGUAGES.html => src_trafilatura_extractor_settings.JUSTEXT_LANGUAGES.html} (51%) rename docs/variables/{src_extract_content_settings.MANUALLY_STRIPPED.html => src_trafilatura_extractor_settings.LRU_SIZE.html} (53%) rename docs/variables/{src_extract_content_xpaths.DISCARD_IMAGE_ELEMENTS.html => src_trafilatura_extractor_settings.MANUALLY_CLEANED.html} (53%) create mode 100644 docs/variables/src_trafilatura_extractor_settings.MANUALLY_STRIPPED.html rename docs/variables/{src_extract_content_xpaths.COMMENTS_DISCARD_XPATH.html => src_trafilatura_extractor_settings.MAX_FILES_PER_DIRECTORY.html} (53%) rename docs/variables/{src_extract_content_settings.SUPPORTED_FMT_CLI.html => src_trafilatura_extractor_settings.MAX_LINKS.html} (53%) rename 
docs/variables/{src_extract_content_settings.BASIC_CLEAN_XPATH.html => src_trafilatura_extractor_settings.MAX_SITEMAPS_SEEN.html} (88%) rename docs/{functions/src_extract_content_xml.buildXmlOutput.html => variables/src_trafilatura_extractor_settings.PARALLEL_CORES.html} (80%) rename docs/variables/{src_extract_content_xpaths.PRECISION_DISCARD_XPATH.html => src_trafilatura_extractor_settings.SUPPORTED_FMT_CLI.html} (53%) create mode 100644 docs/variables/src_trafilatura_extractor_settings.SUPPORTED_FORMATS.html create mode 100644 docs/variables/src_trafilatura_extractor_settings.TAG_CATALOG.html rename docs/variables/{src_extract_content_xpaths.REMOVE_COMMENTS_XPATH.html => src_trafilatura_extractor_spider.URL_STORE.html} (53%) create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.AUTHOR_DISCARD_XPATHS.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.AUTHOR_XPATHS.html rename docs/variables/{src_extract_content_courlan_settings.ALLOWED_PARAMS.html => src_trafilatura_extractor_xpaths.BODY_XPATH.html} (53%) create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.CATEGORIES_XPATHS.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.COMMENTS_DISCARD_XPATH.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.COMMENTS_XPATH.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.DISCARD_IMAGE_ELEMENTS.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.OVERALL_DISCARD_XPATH.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.PRECISION_DISCARD_XPATH.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.REMOVE_COMMENTS_XPATH.html rename docs/{functions/src_extract_content_htmlprocessing.convertTags.html => variables/src_trafilatura_extractor_xpaths.TAGS_XPATHS.html} (74%) create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.TEASER_DISCARD_XPATH.html create mode 100644 
docs/variables/src_trafilatura_extractor_xpaths.TITLE_XPATHS.html create mode 100644 pnpm-lock.yaml delete mode 100644 src/extract-content/python/baseline.py delete mode 100644 src/extract-content/python/clean.py delete mode 100644 src/extract-content/python/core.py delete mode 100644 src/extract-content/python/courlan-core.py delete mode 100644 src/extract-content/python/deduplication.py delete mode 100644 src/extract-content/python/downloads.py delete mode 100644 src/extract-content/python/external.py delete mode 100644 src/extract-content/python/extractors.py delete mode 100644 src/extract-content/python/feeds.py delete mode 100644 src/extract-content/python/filters.py delete mode 100644 src/extract-content/python/htmldate-core.py delete mode 100644 src/extract-content/python/htmlprocessing.py delete mode 100644 src/extract-content/python/json_metadata.py delete mode 100644 src/extract-content/python/main_extractor.py delete mode 100644 src/extract-content/python/meta (1).py delete mode 100644 src/extract-content/python/meta (2).py delete mode 100644 src/extract-content/python/meta.py delete mode 100644 src/extract-content/python/metadata.py delete mode 100644 src/extract-content/python/network.py delete mode 100644 src/extract-content/python/readability_lxml.py delete mode 100644 src/extract-content/python/sampling.py delete mode 100644 src/extract-content/python/settings (1).py delete mode 100644 src/extract-content/python/settings (2).py delete mode 100644 src/extract-content/python/settings.py delete mode 100644 src/extract-content/python/sitemaps.py delete mode 100644 src/extract-content/python/spider.py delete mode 100644 src/extract-content/python/urlstore.py delete mode 100644 src/extract-content/python/urlutils.py delete mode 100644 src/extract-content/python/utils (1).py delete mode 100644 src/extract-content/python/utils.py delete mode 100644 src/extract-content/python/validators.py delete mode 100644 src/extract-content/python/xml.py delete mode 
100644 src/extract-content/python/xpaths.py delete mode 100644 src/extractor/html-to-content/extract-content.js rename src/extractor/html-to-content/{specific-domain-extractors.json => extract-content/extract-selectors-per-domain.json} (100%) create mode 100644 src/extractor/html-to-content/extract-content/extractor1-content.js create mode 100644 src/extractor/html-to-content/extract-content/extractor2-content-utils.js create mode 100644 src/extractor/html-to-content/extract-content/extractor2-content.js delete mode 100644 src/extractor/html-to-content/readability2.js create mode 100644 src/graph/embeddings-to-graph.js create mode 100644 src/graph/save-hnsw.js create mode 100644 src/trafilatura-extractor/.gitignore rename src/{extract-content => trafilatura-extractor}/baseline.js (100%) rename src/{extract-content => trafilatura-extractor}/core.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/clean.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/core.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/filters.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/index.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/meta.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/network.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/sampling.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/settings.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/urlstore.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/urlutils.js (100%) rename src/{extract-content => trafilatura-extractor}/deduplication.js (100%) rename src/{extract-content => trafilatura-extractor}/downloads.js (100%) rename src/{extract-content => trafilatura-extractor}/external.js (100%) rename src/{extract-content => trafilatura-extractor}/feeds.js (100%) create mode 100644 
src/trafilatura-extractor/html-special-chars.js rename src/{extract-content => trafilatura-extractor}/htmldate/core.js (98%) rename src/{extract-content => trafilatura-extractor}/htmldate/extractors.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/index.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/meta.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/settings.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/utils.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/validators.js (100%) rename src/{extract-content => trafilatura-extractor}/htmlprocessing.js (100%) create mode 100644 src/trafilatura-extractor/jsconfig.json rename src/{extract-content => trafilatura-extractor}/json_metadata.js (100%) rename src/{extract-content => trafilatura-extractor}/main_extractor.js (100%) rename src/{extract-content => trafilatura-extractor}/meta.js (100%) rename src/{extract-content => trafilatura-extractor}/metadata.js (100%) create mode 100644 src/trafilatura-extractor/package.json rename src/{extractor/html-to-content => trafilatura-extractor}/readability.js (98%) rename src/{extract-content => trafilatura-extractor}/readability_lxml.js (100%) rename src/{extract-content => trafilatura-extractor}/settings.js (99%) rename src/{extract-content => trafilatura-extractor}/sitemaps.js (100%) rename src/{extract-content => trafilatura-extractor}/spider.js (100%) create mode 100644 src/trafilatura-extractor/test/extract.test.js rename src/{extract-content => trafilatura-extractor}/utils.js (100%) rename src/{extract-content => trafilatura-extractor}/xml.js (100%) rename src/{extract-content => trafilatura-extractor}/xpaths.js (100%) create mode 100644 test/embedding.test.js create mode 100644 test/extractor-test/add-score.test.js create mode 100644 test/extractor-test/add-to-parent.test.js create mode 100644 test/extractor-test/extractor.test.js create mode 100644 
test/extractor-test/find-top-candidate.test.js create mode 100644 test/extractor-test/get-or-init-score.test.js create mode 100644 test/extractor-test/get-score.test.js create mode 100644 test/extractor-test/get-weight.test.js create mode 100644 test/extractor-test/score-commas.test.js create mode 100644 test/extractor-test/score-content.test.js create mode 100644 test/extractor-test/score-length.test.js create mode 100644 test/extractor-test/score-node.test.js create mode 100644 test/extractor-test/score-parag.test.js create mode 100644 test/extractor-test/set-score.test.js delete mode 100644 test/foo.dat create mode 100644 test/graph.test.js delete mode 100644 test/hnsw.js delete mode 100644 test/vectors.dat create mode 100644 web-app/src/components/Home/Graph.svelte diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index eb709a4..284a97a 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -38,6 +38,18 @@ jobs: with: # Upload entire repository path: './docs/' - - name: Deploy to GitHub Pages + - name: Deploy Docs to GitHub Pages id: deployment uses: actions/deploy-pages@v4 + + test: + name: Run Unit Test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 # checkout the repo + - name: Download dependencies # execute tests in tests/ + run: | + npm install + - name: Execute tests # execute tests in tests/ + run: | + npm run test \ No newline at end of file diff --git a/docs/assets/navigation.js b/docs/assets/navigation.js index 576cf71..6eb2050 100644 --- a/docs/assets/navigation.js +++ b/docs/assets/navigation.js @@ -1 +1 @@ -window.navigationData = 
"data:application/octet-stream;base64,H4sIAAAAAAAAA72dXXPjNpaG/4v7NtpsMtnUbKr2gpbotqZlW5Ho7vRMTbEgEraQJgkFBG0rW/vftwB+iACBQ4CycpPYbrzvwwOCIIADkv/63yuO3/jVL1ekSPHb1XdXB8T3V79c5TStMlx+L//8H3ueZ1ffXX0jRXr1y4/fXSV7kqUMF1e//KszSFCWVBnieEtykiFG+PH6ON8jhhKOGeT8YVRag3/64b9/+tt//vR/352YtHjBjN9Gd6vtAScEZUJVwjCrxkbBb1wcCeh6KjPisl7cuBjVxUa8tmH4KXpYL+cujv3CI75facWrHY7EHx2c1eI27xzxZP/rY7ANNqCpWs7mViYMHfDjZgV69UtZnTBiyX4bbcLgDjZTC8J+X/DOwawuNeJEvpEDTgly8euVtboeMsK3uOC4SDB8qQyKWj2r52dc8nv8xr9Qls5pfsgwJ7QY8YdkNhan33BB/sQRPZAE9h8UtXm+4IRTJkriNx6Uc1ok+AC3favExnjF5Hm/wRl+QUWCm/KfpQsIgnUTaMF6ORVYS92YEWb5DcN/VLhIjh5AXQfS5JldEEaSfYb5gpSckV0lmtA4EdSq1H/32zpLrsx3QFRxmtRtGH/f/2V4LCVL4n4J5Ren263LJfdUFYn80wjP7qUeys9qTSR7RnM8w28cFyWhxfcoy+jrLKGmO7A4gloQd4JYCmIhcIo5pyl5Os6lS7BeBkI9f9hsbRFDPJsXGHGKOCoxn5H8QBm3NANRrSTDMy7a1yynKc7M1dG4xbVb3OhiqYulzm3gVetkc75TaWp1OPAGXoPq0C5ALm45otHIsdQTSQg/nsG3esINkeYHhsty9ntJi9nvf/78k3OFC1ksZLGQOde3kF2jEv/80z/JQR0njcasIY12cCskEoDY8dQQx6M9qZq/OAVbF110Wsc4hzDdCIxwX+WomBUox6VPiFIWS5l/jLdCfC+0jjEaaLoTGGSOORKWs5IjbukxNWQriaVEC65vTcrygLMMpzN+PFCvSjxpY6n1r8m7ziESBo7VaeMaPcGK/aOiDPmELAX+cc5pntPi1woz4txoFJTBBgyMY5bPntqxkU+EQhl3Sv9QLYM5MFQz0+AHxvwqpjToGc844Zlfd9BK41rqH7W4D63RM46k3jFsC9XkaAj838PZ+CyhYgZmG27sUIkzUlgGmY1H3HjEbWmnWhhaq5Fbzc0UZfSwQyVJ5hlGBSmepwBOaoAi/uVHbr89g9UjhIB3RlF6W/+rn3crBLw5I7m3rxCNjJOYYzMRJZ2ayIImVV43zsY2yVBZQratRPX/4ce/9440rIX9qfGob6eBjHeI4aagMkmEq1j6q1Lg5A3WCx3Mmz+OnL6KZcgwszW7ysJOJ/GRZVvu0TZOzh96UttEXV6rNyQzrgKPAlS1dSF4j5Nvj8wy0QIBPanVXXQ0E91PUpt7SnNEiskVpMmBpd16EjwZNHQYWUZeyCObRlL0I5wVKb5Zxs8umFZuozzJcKdCVDXMeGTZdEQjthPeNjhDnLzgyRjdwcZ6xlzMZqddMYoYINzSkgdFuhb2kyiqwQiJFE90OqZRA4woS6cjTmIbgZThG8esQFPOiCK2Ez4V9LUQDX0Soq+2M+7RC3lG4l4qRtGTQAMLgEb5nKHXDO2yiSzVwE76jDKSTrteFLGNkKHieXLHr4hthIKyHGXkz4lXvSa3UYTt5DgUsTWVhcTi98QuUhEDqctqN62WelJreux4wJNrSBFbk2WiuSE+8USranDE5eUeS4nbAq4oKdZbrCsZMOOkByYBwzGjN+KRQVPEFCc0xeuqOIr/T8OoHgCsuzxvGHpW53s+vIGNC3KN2Lk4YeGCmny++g4AaHjp+0Ba9bvN8Rtz17n+cJLlePQC0IrH583aSNuD0TcAOMYBvQempwdPRV3O7z4SNyLX9bnhzNotkBbTs4D6GuMU1Y/U94AbgX
me6kfTbACgfVDpR9R9YKRxZOkN7LmAuOHw0hfVOkDroYYxph/m5ABgTENAP8zJAVqBNYyj/DAnBwBjHEz5cXoWo/lGv75IKJw6IpSmc5TsvZuyBLTikcEUk4W8e2yJ6OnBKiowf6XMbwIbNyKnimI4JQzLw45w6T2saVGaDRiTnJMoORaXoFqV214kw5zJLaAOc7KAg8Gck+LZ857aqpyCCVarhy/hIl4Hm+Cut7XoBTEiOlpHlGqjkv/Wz9Vfr4L5p9VyG01GdQ4AZRXcfzw3pJ4HQIqCzccwikXh6ai+yYDVbxAVy0rv1EXcqiZmR+BUlIZo5cOslBZHxYnnokPcqpzisKzKu12oHUlxGR1kT+kTOtTJAuRY1rW9YYoPQDStbnvCThYwx7jG7c/q2Yzz1GXoaTDhAZMG693+oMYCHPgOV709OScLkGNY+/YGdR4j20KmpIFnP1wuEdx4XzIV3CIumAxuEZdKB3f+F0wIt4y/ICXcov6KpHCPddm0sAa6VGK4xVw0NaxCLpgcPoEunB5uQZdOEPc4l00Ra6DLJIl7kIuliVvGRRPFJ8jFU8Un1F+RLO7RLpouPnEulzBuGZdNGbeUyyaNW8ol08Yt43KJ445wsdRxS7ho8riFQOlj5RkZnFaHjCTI/CygiaRInIbLq82jtuwJrQyo/q0W2qy6Jfkelfsp9o0Ucm+EN6R4xuzAiHtSV0UNfaBMUiPEPsufKk+xAGeZBWaI42v6eqvUog9MMwEngc3bC/wWWFScZgKuhab0tRBbyR17nK64U9Pe4PJAi9K1aXferQ5qeChNIzpvHnvDqXgmzLm2OpDBBHraoHp6wgyni2GduQIHFtCiFObJfliFrihFPobxWIfSCCPpPIq6WK9l7P6YoQfYorHX4BMbF2aAhzYnbf3vKAMHoO5+r0rx0waXSeV8/juSooZ2k6CCcPEGAob9IX0xvIsBY9cuRhZ1Oh1PpEhvME59lmxr974SSt+y4y3N8cEjoV/794RgtYh/EwMRt5ppSzsu1ItHxf7H07lWxSpBXRbRbOGcjG6sLyEqSR/3XUetrce2o/rCixkW/W+RTAZ9MDnZF5FQTgoco92OxTjDYrea42VgINvcxujS5t3outsYfY9R6joZALCdzRhP/nourTEZZZH8HWtWd7MvtBVp7N9t9Ih9h9GJdSxfpjSZNfSxTk3l64fig/NKi4GmeoySuLiRnQ9rbey8DAsrVKTkrBNnMLJPXxs/65tUrLyT0KlvvV3GNw+buyCKlvcfz0N90L1sNXoX/BZHwfUqjL8sF9HtmdShm5UbRkEcRNFmef0YhdtzuQM3G/c+3Ioaib/cLqNQ3egxiWzys7O/rJb3YRyuwrtzI9a9bMztOpwvg9X7NSyjoY0ehct4sfwcb5fXq6WyBWUS22AHkTfh3cPnMI6CpeHFdN5g1Q3ifg5Wy4Vsju8RsOo2zo2Cd6lnxcxG/bKMbt/vKh66WZO+VclpHh8QK33vATpUs7KmgUmZIJbGVfGKCo7TM6kGOygxLCaJ9YBRHqfvKFCnWyxHUrhie8SU4ZKBrttZkyspoeWxSBgqCS7jepxwJt7maT2G/PmdwH0jG02+iupdcKoTkI4RbeF9kAMzG5XhZ/z2Lpev6mRNprBjM996O5x78ehe9nGj+x7vDua8yZvhEnN9E7Yf4oPqYY/Db59vB/La6DsP5rdhvF3+MzyH8kGxsbWG+SoM7sWgbcL4T8NpTtAo+2a5eocAdSeIuH7YipFSGM+D+8VyEfjfoYdss6f1KJb3sShzHvZkYm+jHvt1O4z7bl3DG3A8CR/6FuBetnivvMbIl6N4WEcz8lFT8dCKb//bI6kmIyg2yL9MxPWM7EguJve4SGjq/KyFGakbWZeSRApHjGimw/oW9s0mcVrtCK3KM1vJ0AhAvjJaPMfp4OVSE6ADKzu24k9/P4fV6K17aihKz6zEvoV9pHNAhMVPqMr48Uye0cs+5CH5lDXcHq
9vYe95m00g/qtjJ6Fb5kHsBa5HXqQ4VL5xabQPBjtwC3PrjNPJuQ/zIZh9rcfSZE1eUFZ5D/0GB6Cb2anyMw1TZn9DpmIFb7SNj3k6ebVXJ1ssgS2ecY7e3iNmzQokkuLdiCcroHOXyvdA6l6jzCeR0Dj3Mh66WWe6GapKsstwfMSINQ/snkm3eeq95fm5asdnRiZlq0877WVuGeztRcEDowkuS6+h1UnjFMeuIpl8LedDxZW+Ht41oJE0F2CHQkIzkRAS+5iXHk9CaTzNBeTJXnCBRz4TMApUbVyIHBH3bR42njQZp91ilKorB1Nwrcs4b0UKfM0wcn+XiJl48nFhlvxsXKm/QNxE+rWi3P2ZezOq9hhnRejcsyYcHDjU5629FhIdeYFvKq4PfH0UF+ZCPCFl/yzCCNHgBHD3qEgz+U2awuOtUHp3qZhAewRPx+Szn1fDaS7uvEjuXnofau0FsBvp/fRq7TmAnKrAZ7TQTj/GeGwSQeJ4plbi0Ajck4ex7xu/NV7fAtyeJ1MR7ecS3MYMisRx6CMd/lG6bydVIT0DoN56pdYi6xAyRp0341qBJysA3e0ACiq+V+bbPlTdxQU4vVIVC7CRiC32p2SLWytRNT7NRHznQd1sBkelkTSX8fYyr33Oxcl/havROel0Xq5p5PiFd083esjuPcOUTuFuQBg//P4lemdkamEwjFK0Ixnhxzh7c13o01V+jVh6iK9nukY2wA2twCD9coFeKUC/rxd01hf4gkHn7fQVg+tgu5zHMhUY/7YO+pvo4E3dp3cG6Q7A5u75YxSHd+voq76NzJGl6QHSIrwJHldRPH+4v1l+9AapcoAj0pn3wV0Yr8J7b0pfDDD+8biNwt/qdxg9Bh9D/3obOAC01eZRy/Q6Qloh4H0X3D8Gq9XXurmEC2+GbuDC2kab5Xp9Dqx1AGl1ansbr8NNvFhuwnn0sPk6gWn0GSGvlvef/FtFpxxx3y6j8C5Yb+NtOKGRDxwAmngl2GoVruL5w2ZCM1flAGf7uF4/bKJwEd/cRfF8tfRGDRzcaHLfqX9gAweAFgUf43kQBasH/26vpwUIiD2XETXcluC7d0fR9NBzaZgvEBcvXUa58/Ct4yhqeGBAOM7RwXVg0JT2Ge1s6I7ycjvgjITSkowuQMWJZzvORA4tAB4p1202oSntzRtaQA2jLrHVNv45ohQ13DAOxPm5pbqs24Yx8W4O2S6x+l5ncGBX+2taaHD3uFnF2+hh434vrxGdDugEnmhSlTiVR+P+QG8DUMXgGfDYG+W+JUrmVsSk3i9DUwM0MdBKZckIk8mUTjsG+c032dSD/OaSYhLZ/ggTP/tWBa+kc0azicevq0eX0cP6wTo/iiIFEDlmz/gL4fs1Yt4QTQxgGM7pCw7zAz+Gg6cOXVAGA6ir5YwcFrTaZdgnr1KjNDG0pIvfuJ7WdiGcdOB6sftHEBtb0xcQT4avjHB8U2WZ/mCri7cmHsNEmHCP5/J7jEYJAN7yLKKR+1cta/NWBXbczotIrutGEzttYX/pLrtjXK7D7hCX6K6F+QU7a2l/4a5aMC7cUQvEX9JNC9Bf0kkL0FgXrVzTwtVxNFaXdbqyg8fo9kGsdmznwWZRryI6T5IbjtEDGL825c9hjTKuHxZf/VZVG8BJCC2mBlH48WGzDLfTohjoIdbD3V14H23V+vUFGk1cqGfRxijt0Szvgo/1M8lC5kkzmwDUh8+hWKw6q0KNHtDy2iacL7fLh/uzqBYXgNs8fXzW6TR6wMtgE6+LnhLyD4NtuDmrIk0WEHEZrcKJIfWkA4LxU/FyWc/UVwv1jNNZQuQuUGDJa4Zkjh68W1Am92XEnMbCsDv+WuqzwBZoMOONbwSnWIE3wjbEphb8AxS/eGVLFZBXcBLVs3EKbHSTr43m8W4qKVigyYE175TqbJwCK2nFRh5+sPFqqU9wWw3mFV6DU6ycQuSEjzwaZ0NKpU+AkYryiq+G9Y3A6PZVjopZgX
JczhhO6HNB/vSLUjrE0iHuHNwGqLJHeNj9jvsvmCQFx+wJJVOofUftEP7r5+E5vUc5TsOCj2/+dMMPbZ02ufQ6fvdqb8XtH3y7vRtGc8c9MKNoiy28As3kDS8d/9yAjhcvieA0Tg1vYLW+oU5uCn7crCLq9PZXGDm0c4n0Cb2QZORdxxZuI/U5xTc6zSfOlqd6GYLUHwORLbnb1gaNYk6l3GqjGXlpIzGvDV+IFG5b7hy4Q0+4lxV1Ux5wQlA2S/Zo5MFIwwHI3xuHWDr4tHyxh2tbi+cq3bUCDHyL93hFcDqTH4qZ7ZVd1B4VwWksHeRvThWB0nTxcHfTRjuxAlSu5unw7Et0t4rotXBw2KDndhBma6eTMPEa1H6fsDtxeuDa9edy7fW2N/pG2pM6RbkxoQzJ5xHUxobVP0rXE/14Rmw/XmaD6Rh0aDhyh2lupPAN5pA+ebTt5obXHt8hfZrYsteLm/HqgGknJ3BLyCPL3gHW+sD7NBKGDng29g6LIawWir861WFd/HHj0KLsoM7EZTA2rXmov7rNcRgnSX8yZ53ewKzGx2FS412HGqkpB9bikVa82uEZzncj770b4hptLLVOdfiVVlG1w+sMHftJYdeaVIGKGVidovjXWqujHWtWJQ8NneqY05n8y7Ra5lS+rsOnK2sOMcKjCexx7NDS2MerD/3UrzIGXjU+yzBXd3j160QaxO2LLJqibqNllCVVhjhuPvJB+PH6KAa14mUc1tNv5sFeI4898WQ/+6NCJbKs89bI+r91Oaf4pODXx2AbbOBgFOeeCr5b0CeeozfoiJsifidjzeiuGTRsdYTp4FsIYADG8YrJ837GcIZfUJHg2RPDf1S4SCzDxxoqRXEnijuRZ6x7hkosp1NPJAHWpcaodkdoY4yw27RuEWb5zTB2v+MALEdGe/UbI2eveGfpDJoCJWcY5eZz07xF/BXv2h/r0m6jEqnYRpswuLOFbwX0xSNPT/XCdArhFe88jv9L33js4IV1J3M6bPKNHHBKLA/qGQitwCeEIWQ0kA6jWYw1uq6z/v70oxgtJvhguQWfyvV/bCROUb5gcSsV33rBbzwo5zpNCxbkmb2cr/mm/GcMPYIAHwFgOO0wgvXy3Y8kWC/B5s3pN9xkQsztUgx3Etvbtlp13JVza+2HjPDt0FqN2WCuCuHrltPDK2W2LwadzNtyTkdOyi+UpeLRZ1osnwvK+rMC2+F3BIMaPjn4jctJ3L6yftq3w4jCcoQqC7ufB3H9bHGOCk4ScM3URjK7OLW6GacHkoxGdvpBlHYKLRIa6xTKai1l0GSpFUTagduqSvNX5fDojLJUnHxGKZ+VHFvu/B1BlBdnRZSPRXmfdXPRLiO6oZRvFZIlrCHL6DNyD5J1MkuJ2EK4qwTm++GfbFEfSBL3yxn+5FQBsu+Up2NBGEn2GeYLI16vCpcDGDEfuUaaFmY66uJZfaRtUDdlXBfxmQ/f06oI02f8EXpcTnU3KeFFalR8m3Ud+uwbPh7kuB2ORahOt4H4pHIKT6i7m8YcF5yhLKKfWpORSO3sEd+RYSX+Vjd2xyroBL7RtxtAwvBT9LBezkfCNXJ0D8vaxr//H6myZCnGtQAA" \ No newline at end of file +window.navigationData = 
"data:application/octet-stream;base64,H4sIAAAAAAAAA72db3PjNpKHv4vnbbRzM5tL7aVqX9ASPdZGthWJnsnsVooFk7DEDEUoIDi2c3Xf/Qr8DxBoACTtSlViO+jfgwZBAEQ3yP/87wXDz+zi54ski/HzxQ8XZ8SOFz9fnEhcpDh/X/75b0d2Si9+uPiWZPHFzx9/uIiOSRpTnF38/J9WIEJpVKSI4X1ySlJEE/Zy+bI8Iooihimk/M5oWoF//PA/P/79v378vx86Jsm+Y8r80wOO4yQ75GsuGpBLlOOffoSZJlNrZkCub/df3GCtjQPl/sbbulJqGwPlOrjZ7M84SlDKWz23oShsDJTt6iog3MxGvl/YoBvgZxaQ1uvPOGLE0N9gQx0PPzPeI0HproxBZe/7vwR32/XSRq5f2KD7lRSseMDcNxtlsbhO+4CZl6ZdxwKVFYUB3bbcNjnjNMmwSVtloNM/IRYdf7339t4OlBXL6dTyiKIzvt/BHbhfSquEEY2O+2DnezewmFgQ1vuCHyzEqlIGpeRbcsZxgmz0emVNquz4a4Hpi5VqV1arek4TtscZw1mE4T45KKrVLA4HnLNb/My+EBovyemcYpaQzKAPmelYjHzDWfIXDsg5iWD9QVGd5hNODscdTvF3lEV4SbIIn5nFkAjbjaB52/VYYGVqxwwwPV1R/GeBswjuV7AdSCtbfZXQJDqmmK2SnNHkoeCX10wEbUXq7/1+SKML9ToLFYxEVf/C7/u/DOuS0yjslxB+sVrU2dwOj0UWlX8y8PRaYlV+ElsiOlJywgv8zHCWJyR7j9KUPC0iolqn8BpUBmFrEJYGITew8vlE4uTxZVmqeNu1x62Xd7u9zmOIp9MCPY4RQzlmi+R0JpRpugFv1iTFC8b71+JEYpyqm6NWCyu1sLYLS7uwtLNb3ld2ZXe+EWlic1jwBlqD5pBuQMYnGd5pyhXnYxIl7GUCX6sJd0RyOlOc54s/cpIt/vhL9YChqQA3C7lZyM2s25ubVc8j/07O4orO6LOEVMrBvTApAYi+dB3R7G1nVf/Fytmq6Kq1tfRzCJOFQA+PxQlliwydcO7iYmkWlmbuPl5z41tua+mjgiYrgU6eMENccpEzxDQjpoRsTMLSRHKuL53k+RmnKY4X7OVMnBqxsw1LW/eWvGkVAi5g2Zw6rlITbNg/C0KRi8ulgbufS3I6kYyvhRPrTiOgFDKgYwzT0+KxWRu5eMgtw9bS3VXNYg50Vc1U6IE+P/GHGHTAC5aw1G04aEzDytTdaz4PbdEBB6W9pdsaqkpR4fjvw32Dcp2vqiS3XjCyiJJyiakqUkssUMGOqucMXv8WE3LBkJGQCzZ/DitTq1arTTwJJjaXHU6QAjtI42LdCu4O8l9c3FsKICfnSlRPxsqxGI10jBu6OLZCox0rUT0ZK8dyUtBonGuVqYtzewnm5F6NE6SsXCzHgFEelpYuDgYiysm/CtYXsl6eURyRA98AcfKyv3ZqFay8rUaEu4c/cH+/N8kYpo8oGkPtK0pV+O+fhteUL+xiP2PAg44Tfihrt2rsBn77Zm/Xj/UfXIe9K0pON7WGo/MDtEYWdL6g5YQXkxNKFDs8AL6g5c+VpeVjXhmKuN9tArKSgFYei8ihnI2nj+h7Eqk2s8zc2tTlEl/JNBc/G56oZVjitEsYwjeAdTsq7RRvU4rQD72Cds1WFe+mafF3Qj80f3K6YSoTMa5mbFH7qgwxNlMSoR9nb56PE5vn44zt8xFon4+ODbQoWJJqnsynVC0sdeXH98F9kVdB3EWkjvzCtSh/rxXCUsFl1NPHnW0vkIKv0YaXGvUA8YDyJFrUJd0bgpGwVCh/s2oIFMeru5urxtuRDSByJU1gb7XXVGXiQxKNG0YUlVBLW12EkeOG9LtV8/uVGo6XMtO81NNgZUmLtV5d0stix4c+TRWUsoYpsl4JwHPfOX50uED1jN3U8hw/ul4efeKIrmlgpq
wH3BpJfk/T7epqKrLRAXt+lTKwKKhx2JFglSH/q100b5jAYOlVD9SK2Kwsx3UV8Ve7BzbKkqj/ZKq9gWFWrWNx1zq3oUSqy4Gt+FJl5ywwz3lxbMTaNixtrdrwKymC4gFvU/TST9izbUkRKIiBzcmL12lIMtqyZUXyUNCqjRlZMGXSlBWakZANImrw0lSZqeXocYMdSirH+77fB4rOR91DTpu8xZulKalol/J/hV1xXqHyb1YNsU0J38dDW5IAc6+eIdhDfcyYsyk2u56oETKvsfSJnBPQXMiMNuRHOvKVamAl8iTrJfaOAYsKAEyfp2gLkxVgGJC86AAcqABQbSadLU8SgNcl6DteHLP8Cbr5eaGQF3JZ0hkztlXudCRYxbDULZM9gawaRPEixYxh3eNwKRDWRcO66Kw58qL3ah6sBe/ucsHFnwXKkSZaVyGrf1fl7FKnVPm2KmcE5Z4V3B3JIzshxSGFTrcu4nYxtpQ8oIckTdjLXkaoKt9AAAE42MzTjxa0SUXsYu2QZ6VR2Bp1IW9HX48U5dgijcpE1SuakrlMuZtu9QAkDeNANRIunsp8aOXjUlUgZxSjk/raVEXCJ/zQ/FiVtnscU2aBi+5rAX1j+K7pu2nlwhN+cKi/kHVuqjyXbs2sqq3PRNcQGgMXF4YQoyMtRpIwdbp2sH7f/cgfk3n6s8bHtlz/x9rEykurpHDJYxAKCFrf/fps8Zlq4m3XcJ5TnVOvvfu1Bwt4tRrrsC1n1+E0ZxZEnxXioiF86zByfiI0Nta8KWeXrJTXWdsnkq0PGaH9HQld9VuCwtqQhPbMyg2kY5F9MznCC5dPx2Vh++vAn2X2+IQylkRgAEJHUqtY9boqcdvoWfcDL23lWsBttE/TWunSDHqI1h1X0TWVpC+awwskQmN+8SkhbJEzrJl8WwIvz68KLx/y8i6PIrxfBmRHCNsLJI1bQ5ZSxzANVFn7ce80yPvhn3Re80T2fjnFn+ynBZuzLXJT2FTAIG64R+oepqp1dqDopL1vuGFYFXHZi7slRebHB/xJ1FY43qqrLEGvKMq+LdoBffENv5zLpTPsC7fqpoGws7Jyj1u3k8YSZ4yiNCC/NCIGT/Vsg65hZYe/VZ3dsglaA1fv9Udtle4qObKG6cam6DFJESsoWpgyax9QrjnxWlaqU+rtAzc2Vv4PAZLfMELNEpZ0ZcR3mWKUJdlhPKbTAFj8/3xk+s1yiwbj5gAhJSi+FoL/ToTGHCAwmuinGFidmxrOJ1GnrsTLW3WjFYmKkxDFi1KU52bxxlCkfPj4j16t/d5N4qbeWkLyD4jiuiA0pekpogBwaQ1hQQBhEwmMSEFTpFsSaLRLE6tLfE/TPXPuP53+u56AeKhVGiuuklT5NhBLjKihI0VHHH2710XTLTA9AS2DD1iTGJ2AjlEldU5sMklEx2qPr07EDXUAIheQs17deYKKgbZJ9I+T1rBGRMd6LF2fhhI1YNI91eUPWoNqCT3neYdTxJLveCJM1tERD5jxaM6UO0yQADjXJOdZUlsOmcASZQy8JHskU2G1BkAK0ngqqJPQcZLcf2aYZmj8lRIk9JxfMvKU8VtiAqivoSfdou/JAfHZm5+jm4AbCAFMwpYUPaXoQXeYx44oyuh5n1GaxFPuL0FCx0lRdpg4pQgSOk5G6AmlyV+TRgxJRMfi4hN9EiR0nBzxV2RMGnAFCS0nosXDlHbrCegY7OWMJ7aZIKHjfOddErFJ3UDUAFeAIxhhaWi3RclLgokeNqROBcrTGaxnR4Lu5WRUARPjiMR4W2Qv/L9TYKISgGxv6iuKDuJzrDt1IGYD3iI6D5QL2QAnXse+DpSLNBg23FGNxsw7GzXCdodj+MDo5AnHNBLmHQLpucCZ1JcBaMqHEGdYTwW8RFW5MfNUWJva7mgO9xBcnGpgPSFotFI+gI/h9ZXgLqJ+Ch/DlMTA0w66Be8YrqwGg5Wr3pHYnhYIHS59xwEbHWhPWbH+HQPrdA
CYamE6BtbpQHvZihXdGFinA8CUy7oxtJ4QnCOJmSbnCOZwO6uhDMXxEkXHkZ2+xDQShmUdLQuNnAVKUE8FbLQMsydCxzyeh7WpXVQTxwnFpQsBzkcurRqgJGZIiD6dUyHSZe9gY2uXnqJ48nNxroV1QoaYLGPqF/tasGpbu3Nam83dF38Vbr2dd9N7peJ3RBM+bDsBRTGR//f+m8ouN97yl816H0wEtjoAa+PdfprHvZ4SwAu83Sc/CHnhqcC+1IAone7LR4aLwsZ2ZFzKJjgogRqRYZxQ8gk4iG9EqQ7bw+9BAt/2YckTtIwPA+PHkxbYCYE0TbRgJFJQg0/kDGIGo5CdEExTRg7GEntiZqq4rT8FyZVg3iCKMBZXC4FL82EsYRStEwJpiojCSFyrZEgQGh/IX3x47VB+TXj9YH4DevVwfgN63YB+S3n1kH5DerOgfgN8u7B+j/gWgX0J97qh/Qb2BsF9EfXq4f0O9yYB/gb3NiH+Hu0tgvwS7jXD/D3UKwf6G9IbhPo71BsF+zvg24X7e8w3CPh3tNcO+Tektwj6N6y3CPs3rNcP/Dek1w79t5xXDv43nDcI/zcoKAFA+EYCjotzmkQIOK6j5AmGVsv6ze5e2i4274SIlEYBSpfeJ6cjyo/jIbUAxKjfYXOVZAdMzzRxDciLwKEaFM+rDbH7trFIFYTAp+cMU8TwJXm6FtrVHSlJgY+19SshxmwxiVBJCtxDjslTxo9AOI1ZrZHVTbDD+ZlkudtN0BIaa6hzojgOyLL+WAqO+ZdEHNuvxSmkoJM0xeMjpjheDVvRDTsQgjbqMIuOw0Z1AwoiJpjz3pzEMQRbCWr9vizbYSxsqAT2fTxiYYyVG1TA54AmHGVpWQMdoDX/KHL+0w7nUeHYO1qeoAHlEqEsYfx0MMVjUX0JOFcFY7dBqjSwukyPSRZfYRy7b3tXjL49FHqnL9fkhM/OqRoVpWcONtTR8v3AatbxTd4NbM0e+15g/Ycx9GiHb2Lw77X8c5R+ZRuKHHFnTBK3CQrK8vKusxB7dE3Ma8QdMvPqt15RzCe1TPchD1vcO5WefmcRnZIMh+jhgYY4xTzd0/kOkPg6TVMdSpmZ6yBrmupwxCh2e4ID4K2YiVr+Og+zljISk9PsbS1r6ndiszgcO+T0uH0d4+5J2LwSbwpxqKbdc6heoXR23GxTMEUlI4+/OM/pMR1CNmJ6aor5+/lRFiczXFCFnH5folUa1307c6sx+nodXt3tbrwgWN9+mgP4TlbUtfGN91sYeJcbP/yyXgXXs7CHmlq6H3ihFwS79eV94O/noQ80dfRbf89bJ/xyvQ58MbdpAl+lqq/Bl8361g/9jX8zj/eyoo683/rLtbeZu9spZXV1CPx1uFp/Dvfry81ayL2aUAOFKMTf+Td3n/0w8NaKb9KPxIuaEP2zt1mvys46n/OippkeeDO2vCCpY39ZB9dz3/VDTW2WQpEzcgrPiObjZhQZLQlq8xaSPEI0DovsCWVM9zJ6R7ZCFMpk4E/51aK1rO24lahcB42wIdugfB/76FldUQdZVBvFixOSv2QRRXmC8/ptlLNUQqesrcnpMCu+L6djlt+znhEq6gFxP95H5gQPJHVsig/4ecbbXdTTxuvoS/10+Hye5zaTFfVrV9eDFy3S+uQFxTlm8pmIMaB3opLepzHp9S3OKb9+6S2v/XC//rc/nfVOENP1leXG9275knH0GlSCSnrQ2v9qvZnNWVkP4m7v9nyF5odL73a1Xnlj1wHDGqiVtXVZ34a8zBzwTkrfj50T5FuYfXq84oVgozjv+kJg0md4FN75No4mKGlXUeWZdX4Wbdxo3uOJUgYgHYT4JkF7cnow49sVOItI7HhgSg2W5bQbaDxKGGo/pWWP7Avpc6rCuHhISJHP0oeGcgD4iZLsEMaDd/ONRg8E9fCCPf5jOrFW0aaRERTP0qx9If0664wSGj6iImUvs1CVivoFV3Iav9Pdo/aF9ON4nd
80dn+wM7eL4fDk+2r1l2TnYpyPEvOdQhQ8P9Bo4nhiLEldEbW6tkZ1FOo7SouRi9BBNWRJPbuMg45/eh2SBUE4mz18OcUT98RlvkYYyJcOT+h5Pv8lQZCbZDNzO0Fgqigt5wPLikbyIw8NzXPbDzW1T+0pKvLkIcXhC0a0Pv8/Sx10yvJ4O1cmgeXBrwm5BN0hmDLmD84dvOCZkgjn+YiFXWdp5dNDkaTlO5XvCibMHLZJID2epAWk10Qk5cE2fnBg7XzwUaJKWiC1HEdXOMXgx6AtsaKYDZehxDVrSUctpczMa4zA7/S5QBstM3WTZPiSYuT6YiQ1t1OzIedsJmjOLHi/FoS5vvhDDayUzMQAzXM1uY4Fjbi/rl3DI4Y3t8f8TsKXL/xGXvGjkPpPp1lxFXoA/YiyOC2/65o5vzZPHnYFKSiZtquZe7q8BJW07KlBma82J7tSBGpQm95ObeieDkgrMjy5F7cqJtJ9HWrjtZrWrEM5MGEV43GfiZCofSEwdbMM7fCd+hi5xRMEQ8uFV2n7r9w1K1tE9WSAluyV2vIojk+p/itujthOEKhAm+flFexIXBNzRbasZYOd2syCENiF+ImXTsalD4mWLp2IfyFNTDu08VDiSVrm3rSsDlHNA60+bw82rGOIb1pkz8oXTuhZG6vvOqqMGVBuBhxbV/o39o2SLH+bCqO4/oKti1c9M8vzWwqM+QhXH7PTIeX3OfWMwvTZbT9XtnW7g8sbgJ8zcLtwA+hQELyGY4LMTrHlMd8BagGv9i2glmD1PaBLb79ehmWMOfxt6/VzRm1OR3RvgpN1gFMSy/sg9G+2wVc5U9KJKKkAvJV/5d1vgnB5d3u1/jQSJ4oANB4zv/Vu/HDj345k9SUA0r/u94H/W/V+unvvkz+2JQc6AHOzu5cSDJxQjTlAuPFu773N5mvVmfzVSJIsY0PcB7v1djsd2eiAzCq7Yh9u/V24Wu/8ZXC3+zqarFQz8Dfr21/G9pnW3sDYrwP/xtvuw70/+nYY6ABM/nrIzcbfhMu73egbQhQBaPv77fZuF/ir8OomCJeb9UjgQMeOWaZlj3VyoAMwA+9TuPQCb3M3dvjsKQAcRA95QBTTns16oWVJKsByP8dshRj/jAD0mVOYJmjAC5KE4RM6uy1IahuXFdeOPBCW7wc0K7canlILaEp+wGoW8FAIoCb5tgk+1aVHUodCULepSuyl/FcnoKABd5tz4njMsLKwy4/kb0oq+y4Wv1dgsdCsKJICtNi8323CfXC3c109VKDWGhhAHklU5Dgu6+T6soMaI0qAV4bhnL2vBf7GhD1h80Xi5Ztfy1+k6zUlzdA+u7AMAvKdnzEBxQojSQD3TVkywMlEVqtgQv02Lk7aQ/1mEx3lCS8BTsZAGls4tMMoSSf5ImsY4zp+dTJ3DEsQAEAnTA/4S8KOW0RHoiQJm93S+yxx/9BSRZM1ABzFJ/Id+6cze/EH56btiQoZaFZiNDmvSPGQYveoYwWUJKAABn5m5fsi3ENhFUoQMIDkXBU3yqPxwx3OH6GuxVVfoO5kn2jC8FWRpvJ7AuwJkoQJFuCEOb8vpkeq7QHM8ykNSOD6zfEK0diCU6rjxqTtXuSkaY5D3maSa0mvPcW1oNeb4Dji1ae3EvImkxsnvcnUxkH2E9vkmYbj3nCe4TjTLCOMB1zbae1bWViNCt59cH3H98v2S2+3qvatHbdTappSCXhOqctPJxpJl3err2N29mtMZw5t6HuB/+lut/b3UzwaqEDEu5sb/zbYiy0+DquUsmHPwDSxmjqtb7xP1WsguNkoploKYN999vk26AxNrFSCNnF3/nK9X9/dzsDWaAH0+oUPM1xmpRK8zTrpDurZQxTf2/u7GZpWJQRx18HGn+ReT2DA+b365/8BE/dPkMHHAAA=" \ No newline at end of file diff --git a/docs/assets/search.js 
b/docs/assets/search.js index 3d7c0be..5707bc5 100644 --- a/docs/assets/search.js +++ b/docs/assets/search.js @@ -1 +1 @@ -window.searchData = "data:application/octet-stream;base64,"; \ No newline at end of file +window.searchData = "data:application/octet-stream;base64,"; \ No newline at end of file diff --git a/docs/classes/src_extract_content_core.Document.html b/docs/classes/src_extract_content_core.Document.html deleted file mode 100644 index b9452e6..0000000 --- a/docs/classes/src_extract_content_core.Document.html +++ /dev/null @@ -1,23 +0,0 @@ -
The domains allowed to initiate CORS request
+The domains allowed to initiate CORS request
The domains allowed to receive the request
include line breaks in JSON output for debugging
+Optional
options: { include line breaks in JSON output for debugging
true to add wiki page titles, false for dictionary only
max synonyms per term
min length of term to include
filetype js instead of json with "export"
sort the first words by first two letters Trie, needd for autocomplete after 2 letters typed
-
Clean a node of all elements of type "tag". +
- Preparing search index...
- The search index is not available
ai-research-agentClass Readability
Mozilla Readability (2015)
++- Define regex patterns for content identification
+- Clean HTML by removing unlikely content
+- Score nodes based on content quality indicators
+- Select the best candidate for main content
+- Extract top candidate
+- Clean the selected content
+
+Param: html
The HTML string to extract content from
+Param: options
Param: options.minContentLength
Minimum length of content to be considered valid
+Param: options.minScore
Minimum score for content to be considered valid
+Param: options.minTextLength
Minimum length of text to be considered valid
+Param: options.retryLength
Length to retry content extraction if initial attempt fails
+Returns
Extracted HTML element of main content such as article body
+Author
Based on
+Example
+ +Index
Constructors
Properties
Constructors
constructor
Mozilla Readability (2015)
++- Define regex patterns for content identification
+- Clean HTML by removing unlikely content
+- Score nodes based on content quality indicators
+- Select the best candidate for main content
+- Extract top candidate
+- Clean the selected content
+
+Parameters
Optional
options: {minContentLength: number;
minScore: number;
minTextLength: number;
retryLength: number;
}
min Content Length: number
Minimum length of content to be considered valid
+min Score: number
Minimum score for content to be considered valid
+min Text Length: number
Minimum length of text to be considered valid
+retry Length: number
Length to retry content extraction if initial attempt fails
+Rest
...args: anyReturns Readability
Extracted HTML element of main content such as article body
+Author
Based on
+Example
+ +Properties
ALTER_ TO_ DIV_ EXCEPTIONS
CLASSES_ TO_ PRESERVE
DEFAULT_ CHAR_ THRESHOLD
DEFAULT_ MAX_ ELEMS_ TO_ PARSE
DEFAULT_ N_ TOP_ CANDIDATES
DEFAULT_ TAGS_ TO_ SCORE
DEPRECATED_ SIZE_ ATTRIBUTE_ ELEMS
DIV_ TO_ P_ ELEMS
ELEMENT_ NODE
FLAG_ CLEAN_ CONDITIONALLY
FLAG_ STRIP_ UNLIKELYS
FLAG_ WEIGHT_ CLASSES
HTML_ ESCAPE_ MAP
amp: string;
apos: string;
gt: string;
lt: string;
quot: string;
} = ...
PHRASING_ ELEMS
PRESENTATIONAL_ ATTRIBUTES
REGEXPS
b64DataUrl: RegExp;
byline: RegExp;
extraneous: RegExp;
hasContent: RegExp;
hashUrl: RegExp;
jsonLdArticleTypes: RegExp;
negative: RegExp;
nextLink: RegExp;
normalize: RegExp;
okMaybeItsACandidate: RegExp;
positive: RegExp;
prevLink: RegExp;
replaceFonts: RegExp;
shareElements: RegExp;
srcsetUrl: RegExp;
tokenize: RegExp;
unlikelyCandidates: RegExp;
videos: RegExp;
whitespace: RegExp;
} = ...
TEXT_ NODE
UNLIKELY_ ROLES
_article Byline
_article Dir
_article Site Name
_article Title
_attempts
_char Threshold
_check Byline
_classes To Preserve
_clean
Clean a node of all elements of type "tag". (Unless it's a youtube/vimeo video. People love movies.)
Type declaration
Parameters
Returns void
void
-_clean Classes
Removes the class="" attribute from every element in the given +
_clean Classes
Removes the class="" attribute from every element in the given subtree, except those that match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object.
Type declaration
Parameters
Returns void
void
-_clean Conditionally
Clean an element of all tags of type "tag" if they look fishy. +
_clean Conditionally
Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
Type declaration
Parameters
Returns void
void
-_clean Headers
Clean out spurious headers from an Element.
+_clean Headers
Clean out spurious headers from an Element.
Type declaration
Parameters
Returns void
void
-_clean Matched Nodes
Clean out elements that match the specified conditions
+_clean Matched Nodes
Clean out elements that match the specified conditions
Type declaration
Parameters
Returns void
void
-_clean Styles
Remove the style attribute on every e and under. +
_clean Styles
Remove the style attribute on every e and under. TODO: Test if getElementsByTagName(*) is faster.
Type declaration
Parameters
Returns void
void
-_concat Node Lists
Concat all nodelists passed as arguments.
-Type declaration
Parameters
Rest
...args: any[]Returns any
...NodeList
-_debug
_disableJSONLD
_doc
_docJSDOMParser
_every Node
Iterate over a NodeList, return true if all of the provided iterate +
_concat Node Lists
Concat all nodelists passed as arguments.
+Type declaration
Parameters
Rest
...args: anyReturns any
...NodeList
+_debug
_disableJSONLD
_doc
_docJSDOMParser
_every Node
Iterate over a NodeList, return true if all of the provided iterate function calls return true, false otherwise.
For convenience, the current object context is applied to the provided iterate function.
Type declaration
Parameters
Returns any
Boolean
-_find Node
Iterate over a NodeList, and return the first node that passes +
_find Node
Iterate over a NodeList, and return the first node that passes the supplied test function
For convenience, the current object context is applied to the provided test function.
Type declaration
Parameters
Returns any
void
-_fix Lazy Images
_fix Relative Uris
Converts each and uri in the given element to an absolute URI, +
_fix Lazy Images
_fix Relative Uris
Converts each and uri in the given element to an absolute URI, ignoring #ref URIs.
Type declaration
Parameters
Returns void
void
-_flag Is Active
_flags
_for Each Node
Iterate over a NodeList, which doesn't natively fully implement the Array +
_flag Is Active
_flags
_for Each Node
Iterate over a NodeList, which doesn't natively fully implement the Array interface.
For convenience, the current object context is applied to the provided iterate function.
Type declaration
Parameters
Returns void
void
-_get All Nodes With Tag
_get Article Metadata
byline: any;
excerpt: any;
siteName: any;
title: any;
}) = ...
Attempts to get excerpt and byline metadata for the article.
+_get All Nodes With Tag
_get Article Metadata
byline: any;
excerpt: any;
siteName: any;
title: any;
}) = ...
Attempts to get excerpt and byline metadata for the article.
Type declaration
byline: any;
excerpt: any;
siteName: any;
title: any;
}
Parameters
— object containing any metadata that could be extracted from JSON-LD object.
Returns {
byline: any;
excerpt: any;
siteName: any;
title: any;
}
Object with optional "excerpt" and "byline" properties
-byline: any
excerpt: any
site Name: any
title: any
_get Article Title
Get the article title as an H1.
+byline: any
excerpt: any
site Name: any
title: any
_get Article Title
Get the article title as an H1.
Type declaration
Returns string
string
-_get Char Count
Get the number of times a string s appears in the node e.
+_get Char Count
Get the number of times a string s appears in the node e.
Type declaration
Parameters
Returns number
number (integer)
-_get Class Weight
Get an elements class/id weight. Uses regular expressions to tell if this +
_get Class Weight
Get an elements class/id weight. Uses regular expressions to tell if this element looks good or bad.
Type declaration
Parameters
Returns number
number (Integer)
-_get Inner Text
Get the inner text of a node - cross browser compatibly. +
_get Inner Text
Get the inner text of a node - cross browser compatibly. This also strips out any excess whitespace to be found.
Type declaration
Parameters
Returns any
string
-_getJSONLD
Try to extract metadata from JSON-LD object. +
_getJSONLD
Try to extract metadata from JSON-LD object. For now, only Schema.org objects of type Article or its subtypes are supported.
Type declaration
Parameters
Returns {}
Object with any metadata that could be extracted (possibly none)
-_get Link Density
Get the density of links as a percentage of the content +
_get Link Density
Get the density of links as a percentage of the content This is the amount of text that is inside a link divided by the total text in the node.
Type declaration
Parameters
Returns number
number (float)
-_get Next Node
Traverse the DOM from node to node, starting at the node passed in. +
_get Next Node
Traverse the DOM from node to node, starting at the node passed in. Pass true for the second parameter to indicate this node itself (and its kids) are going away, and we want the next node over.
Calling this in a loop will traverse the DOM depth-first.
-_get Node Ancestors
_get Row And Column Count
columns: number;
rows: number;
}) = ...
Return an object indicating how many rows and columns this table has.
-_get Text Density
_grab Article
grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is +
_get Node Ancestors
_get Row And Column Count
columns: number;
rows: number;
}) = ...
Return an object indicating how many rows and columns this table has.
+_get Text Density
_grab Article
grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
Param: page
a document to run upon. Needs to be a full document, complete with body.
-Returns
Element
-_has Ancestor Tag
Check if a given node has one of its ancestor tag name matching the +
Returns
Element
+_has Ancestor Tag
Check if a given node has one of its ancestor tag name matching the provided one.
Type declaration
Parameters
Returns boolean
Boolean
-_has Child Block Element
Determine whether element has any children block level elements.
-_has Single Tag Inside Element
Check if this node has only whitespace and a single element with given tag +
_has Child Block Element
Determine whether element has any children block level elements.
+_has Single Tag Inside Element
Check if this node has only whitespace and a single element with given tag Returns false if the DIV node contains non-empty text nodes or if it contains no element with given tag or more than 1 element.
-_header Duplicates Title
Check if this node is an H1 or H2 element whose content is mostly +
_header Duplicates Title
Check if this node is an H1 or H2 element whose content is mostly the same as the article title.
Type declaration
Parameters
Returns boolean
boolean indicating whether this is a title-like header.
-_initialize Node
Initialize a node with the readability object. Also checks the +
_initialize Node
Initialize a node with the readability object. Also checks the className/id for special names to add to its score.
Type declaration
Parameters
Returns void
void
-_is Element Without Content
_is Phrasing Content
Determine if a node qualifies as phrasing content. +
_is Element Without Content
_is Phrasing Content
Determine if a node qualifies as phrasing content. https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
-_is Probably Visible
_is Single Image
Check if node is image, or if node contains exactly only one image +
_is Probably Visible
_is Single Image
Check if node is image, or if node contains exactly only one image whether as a direct child or as its descendants.
-_is Valid Byline
Check whether the input string could be a byline. +
_is Valid Byline
Check whether the input string could be a byline. This verifies that the input is a string, and that the length is less than 100 chars.
Type declaration
Parameters
Returns boolean
Boolean - whether the input string is a byline.
-_is Whitespace
_keep Classes
_mark Data Tables
Look for 'data' (as opposed to 'layout') tables, for which we use +
_is Whitespace
_keep Classes
_mark Data Tables
Look for 'data' (as opposed to 'layout') tables, for which we use similar checks as https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
-_max Elems To Parse
_nb Top Candidates
_next Node
Finds the next node, starting from the given node, and ignoring +
_max Elems To Parse
_nb Top Candidates
_next Node
Finds the next node, starting from the given node, and ignoring whitespace in between. If the given node is an element, the same node is returned.
-_post Process Content
Run any post-process modifications to article content as necessary.
+_post Process Content
Run any post-process modifications to article content as necessary.
Type declaration
Parameters
Returns void
void
-_prep Article
Prepare the article node for display. Clean out any inline styles, +
_prep Article
Prepare the article node for display. Clean out any inline styles, iframes, forms, strip extraneous <p> tags, etc.
Type declaration
Parameters
Returns void
void
-_prep Document
Prepare the HTML document for readability to scrape it. +
_prep Document
Prepare the HTML document for readability to scrape it. This includes things like stripping javascript, CSS, and handling terrible markup.
Type declaration
Returns void
void
-_remove And Get Next
_remove Flag
_remove Nodes
Iterates over a NodeList, calls
filterFn
for each node and removes node +_remove And Get Next
_remove Flag
_remove Nodes
Iterates over a NodeList, calls
filterFn
for each node and removes node if function returned true.
If function is not passed, removes all the nodes in node list.
Type declaration
Parameters
Returns void
void
-_remove Scripts
Removes script tags from the document.
-_replace Brs
Replaces 2 or more successive
elements with a single
. +
_remove Scripts
Removes script tags from the document.
+_replace Brs
Replaces 2 or more successive <br> elements with a single <p>. Whitespace between <br> elements is ignored. For example:
bar
abc
bar
abc
_replace Node Tags
Iterates over a NodeList, and calls _setNodeTag for each node.
+bar
abc
_replace Node Tags
Iterates over a NodeList, and calls _setNodeTag for each node.
Type declaration
Parameters
Returns void
void
-_serializer
_set Node Tag
_simplify Nested Elements
_some Node
Iterate over a NodeList, return true if any of the provided iterate +
_serializer
_set Node Tag
_simplify Nested Elements
_some Node
Iterate over a NodeList, return true if any of the provided iterate function calls returns true, false otherwise.
For convenience, the current object context is applied to the provided iterate function.
Type declaration
Parameters
Returns any
Boolean
-_text Similarity
_unescape Html Entities
Converts some of the common HTML entities in string to their corresponding characters.
+_text Similarity
_unescape Html Entities
Converts some of the common HTML entities in string to their corresponding characters.
Type declaration
Parameters
{string} - a string to unescape.
Returns string
string without HTML entity.
-_unwrap Noscript Images
Find all
_unwrap Noscript Images
Find all
-log
parse
Runs readability.
+log
parse
Runs readability.
Workflow:
ReturnsReplace the current DOM tree with the new one.- Read peacefully.
Returns
void
-Static
prototypeALTER_TO_DIV_EXCEPTIONS: string[];
CLASSES_TO_PRESERVE: string[];
DEFAULT_CHAR_THRESHOLD: number;
DEFAULT_MAX_ELEMS_TO_PARSE: number;
DEFAULT_N_TOP_CANDIDATES: number;
DEFAULT_TAGS_TO_SCORE: string[];
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: string[];
DIV_TO_P_ELEMS: Set<string>;
ELEMENT_NODE: number;
FLAG_CLEAN_CONDITIONALLY: number;
FLAG_STRIP_UNLIKELYS: number;
FLAG_WEIGHT_CLASSES: number;
HTML_ESCAPE_MAP: {
amp: string;
apos: string;
gt: string;
lt: string;
quot: string;
};
PHRASING_ELEMS: string[];
PRESENTATIONAL_ATTRIBUTES: string[];
REGEXPS: {
b64DataUrl: RegExp;
byline: RegExp;
extraneous: RegExp;
hasContent: RegExp;
hashUrl: RegExp;
jsonLdArticleTypes: RegExp;
negative: RegExp;
nextLink: RegExp;
normalize: RegExp;
okMaybeItsACandidate: RegExp;
positive: RegExp;
prevLink: RegExp;
replaceFonts: RegExp;
shareElements: RegExp;
srcsetUrl: RegExp;
tokenize: RegExp;
unlikelyCandidates: RegExp;
videos: RegExp;
whitespace: RegExp;
};
TEXT_NODE: number;
UNLIKELY_ROLES: string[];
_checkByline: typeof __function;
_clean: ((e: any, tag: any) => void);
_cleanClasses: ((node: any) => void);
_cleanConditionally: ((e: any, tag: any) => void);
_cleanHeaders: ((e: any) => void);
_cleanMatchedNodes: ((e: any, filter: any) => void);
_cleanStyles: ((e: any) => void);
_concatNodeLists: ((...args: any[]) => any);
_everyNode: ((nodeList: any, fn: any) => any);
_findNode: ((nodeList: any, fn: any) => any);
_fixLazyImages: ((root: any) => void);
_fixRelativeUris: ((articleContent: any) => void);
_flagIsActive: ((flag: any) => boolean);
_forEachNode: ((nodeList: any, fn: any) => void);
_getAllNodesWithTag: ((node: any, tagNames: any) => any);
_getArticleMetadata: ((jsonld: any) => {
byline: any;
excerpt: any;
siteName: any;
title: any;
});
_getArticleTitle: (() => string);
_getCharCount: ((e: any, s: any) => number);
_getClassWeight: ((e: any) => number);
_getInnerText: ((e: any, normalizeSpaces: any) => any);
_getJSONLD: ((doc: any) => {});
_getLinkDensity: ((element: any) => number);
_getNextNode: ((node: any, ignoreSelfAndKids: any) => any);
_getNodeAncestors: ((node: any, maxDepth: any) => any[]);
_getRowAndColumnCount: ((table: any) => {
columns: number;
rows: number;
});
_getTextDensity: ((e: any, tags: any) => number);
_grabArticle: typeof __function;
_hasAncestorTag: ((node: any, tagName: any, maxDepth: any, filterFn: any) => boolean);
_hasChildBlockElement: ((element: any) => any);
_hasSingleTagInsideElement: ((element: any, tag: any) => boolean);
_headerDuplicatesTitle: ((node: any) => boolean);
_initializeNode: ((node: any) => void);
_isElementWithoutContent: ((node: any) => boolean);
_isPhrasingContent: ((node: any) => any);
_isProbablyVisible: ((node: any) => boolean);
_isSingleImage: ((node: any) => any);
_isValidByline: ((byline: any) => boolean);
_isWhitespace: ((node: any) => boolean);
_markDataTables: ((root: any) => void);
_nextNode: ((node: any) => any);
_postProcessContent: ((articleContent: any) => void);
_prepArticle: ((articleContent: any) => void);
_prepDocument: (() => void);
_removeAndGetNext: ((node: any) => any);
_removeFlag: typeof __function;
_removeNodes: ((nodeList: any, filterFn: any) => void);
_removeScripts: ((doc: any) => void);
_replaceBrs: ((elem: any) => void);
_replaceNodeTags: ((nodeList: any, newTagName: any) => void);
_setNodeTag: ((node: any, tag: any) => any);
_simplifyNestedElements: ((articleContent: any) => void);
_someNode: ((nodeList: any, fn: any) => any);
_textSimilarity: ((textA: any, textB: any) => number);
_unescapeHtmlEntities: ((str: string) => string);
_unwrapNoscriptImages: ((doc: any) => void);
parse: typeof __function;
}
Type declaration
ALTER_ TO_ DIV_ EXCEPTIONS: string[]
CLASSES_ TO_ PRESERVE: string[]
DEFAULT_ CHAR_ THRESHOLD: number
DEFAULT_ MAX_ ELEMS_ TO_ PARSE: number
DEFAULT_ N_ TOP_ CANDIDATES: number
DEFAULT_ TAGS_ TO_ SCORE: string[]
DEPRECATED_ SIZE_ ATTRIBUTE_ ELEMS: string[]
DIV_ TO_ P_ ELEMS: Set<string>
ELEMENT_ NODE: number
FLAG_ CLEAN_ CONDITIONALLY: number
FLAG_ STRIP_ UNLIKELYS: number
FLAG_ WEIGHT_ CLASSES: number
HTML_ ESCAPE_ MAP: {
amp: string;
apos: string;
gt: string;
lt: string;
quot: string;
}
amp: string
apos: string
gt: string
lt: string
quot: string
PHRASING_ ELEMS: string[]
PRESENTATIONAL_ ATTRIBUTES: string[]
REGEXPS: {
b64DataUrl: RegExp;
byline: RegExp;
extraneous: RegExp;
hasContent: RegExp;
hashUrl: RegExp;
jsonLdArticleTypes: RegExp;
negative: RegExp;
nextLink: RegExp;
normalize: RegExp;
okMaybeItsACandidate: RegExp;
positive: RegExp;
prevLink: RegExp;
replaceFonts: RegExp;
shareElements: RegExp;
srcsetUrl: RegExp;
tokenize: RegExp;
unlikelyCandidates: RegExp;
videos: RegExp;
whitespace: RegExp;
}
b64 Data Url: RegExp
byline: RegExp
extraneous: RegExp
has Content: RegExp
hash Url: RegExp
json Ld Article Types: RegExp
negative: RegExp
next Link: RegExp
normalize: RegExp
ok Maybe ItsACandidate: RegExp
positive: RegExp
prev Link: RegExp
replace Fonts: RegExp
share Elements: RegExp
srcset Url: RegExp
tokenize: RegExp
unlikely Candidates: RegExp
videos: RegExp
whitespace: RegExp
TEXT_ NODE: number
UNLIKELY_ ROLES: string[]
_check Byline: typeof __function
_clean: ((e: any, tag: any) => void)
Clean a node of all elements of type "tag". +
Returns
void
+Static
prototypeALTER_TO_DIV_EXCEPTIONS: string[];
CLASSES_TO_PRESERVE: string[];
DEFAULT_CHAR_THRESHOLD: number;
DEFAULT_MAX_ELEMS_TO_PARSE: number;
DEFAULT_N_TOP_CANDIDATES: number;
DEFAULT_TAGS_TO_SCORE: string[];
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: string[];
DIV_TO_P_ELEMS: Set<string>;
ELEMENT_NODE: number;
FLAG_CLEAN_CONDITIONALLY: number;
FLAG_STRIP_UNLIKELYS: number;
FLAG_WEIGHT_CLASSES: number;
HTML_ESCAPE_MAP: {
amp: string;
apos: string;
gt: string;
lt: string;
quot: string;
};
PHRASING_ELEMS: string[];
PRESENTATIONAL_ATTRIBUTES: string[];
REGEXPS: {
b64DataUrl: RegExp;
byline: RegExp;
extraneous: RegExp;
hasContent: RegExp;
hashUrl: RegExp;
jsonLdArticleTypes: RegExp;
negative: RegExp;
nextLink: RegExp;
normalize: RegExp;
okMaybeItsACandidate: RegExp;
positive: RegExp;
prevLink: RegExp;
replaceFonts: RegExp;
shareElements: RegExp;
srcsetUrl: RegExp;
tokenize: RegExp;
unlikelyCandidates: RegExp;
videos: RegExp;
whitespace: RegExp;
};
TEXT_NODE: number;
UNLIKELY_ROLES: string[];
_checkByline: typeof __function;
_clean: ((e: any, tag: any) => void);
_cleanClasses: ((node: any) => void);
_cleanConditionally: ((e: any, tag: any) => void);
_cleanHeaders: ((e: any) => void);
_cleanMatchedNodes: ((e: any, filter: any) => void);
_cleanStyles: ((e: any) => void);
_concatNodeLists: ((...args: any) => any);
_everyNode: ((nodeList: any, fn: any) => any);
_findNode: ((nodeList: any, fn: any) => any);
_fixLazyImages: ((root: any) => void);
_fixRelativeUris: ((articleContent: any) => void);
_flagIsActive: ((flag: any) => boolean);
_forEachNode: ((nodeList: any, fn: any) => void);
_getAllNodesWithTag: ((node: any, tagNames: any) => any);
_getArticleMetadata: ((jsonld: any) => {
byline: any;
excerpt: any;
siteName: any;
title: any;
});
_getArticleTitle: (() => string);
_getCharCount: ((e: any, s: any) => number);
_getClassWeight: ((e: any) => number);
_getInnerText: ((e: any, normalizeSpaces: any) => any);
_getJSONLD: ((doc: any) => {});
_getLinkDensity: ((element: any) => number);
_getNextNode: ((node: any, ignoreSelfAndKids: any) => any);
_getNodeAncestors: ((node: any, maxDepth: any) => any[]);
_getRowAndColumnCount: ((table: any) => {
columns: number;
rows: number;
});
_getTextDensity: ((e: any, tags: any) => number);
_grabArticle: typeof __function;
_hasAncestorTag: ((node: any, tagName: any, maxDepth: any, filterFn: any) => boolean);
_hasChildBlockElement: ((element: any) => any);
_hasSingleTagInsideElement: ((element: any, tag: any) => boolean);
_headerDuplicatesTitle: ((node: any) => boolean);
_initializeNode: ((node: any) => void);
_isElementWithoutContent: ((node: any) => boolean);
_isPhrasingContent: ((node: any) => any);
_isProbablyVisible: ((node: any) => boolean);
_isSingleImage: ((node: any) => any);
_isValidByline: ((byline: any) => boolean);
_isWhitespace: ((node: any) => boolean);
_markDataTables: ((root: any) => void);
_nextNode: ((node: any) => any);
_postProcessContent: ((articleContent: any) => void);
_prepArticle: ((articleContent: any) => void);
_prepDocument: (() => void);
_removeAndGetNext: ((node: any) => any);
_removeFlag: typeof __function;
_removeNodes: ((nodeList: any, filterFn: any) => void);
_removeScripts: ((doc: any) => void);
_replaceBrs: ((elem: any) => void);
_replaceNodeTags: ((nodeList: any, newTagName: any) => void);
_setNodeTag: ((node: any, tag: any) => any);
_simplifyNestedElements: ((articleContent: any) => void);
_someNode: ((nodeList: any, fn: any) => any);
_textSimilarity: ((textA: any, textB: any) => number);
_unescapeHtmlEntities: ((str: string) => string);
_unwrapNoscriptImages: ((doc: any) => void);
parse: typeof __function;
}
Type declaration
ALTER_ TO_ DIV_ EXCEPTIONS: string[]
CLASSES_ TO_ PRESERVE: string[]
DEFAULT_ CHAR_ THRESHOLD: number
DEFAULT_ MAX_ ELEMS_ TO_ PARSE: number
DEFAULT_ N_ TOP_ CANDIDATES: number
DEFAULT_ TAGS_ TO_ SCORE: string[]
DEPRECATED_ SIZE_ ATTRIBUTE_ ELEMS: string[]
DIV_ TO_ P_ ELEMS: Set<string>
ELEMENT_ NODE: number
FLAG_ CLEAN_ CONDITIONALLY: number
FLAG_ STRIP_ UNLIKELYS: number
FLAG_ WEIGHT_ CLASSES: number
HTML_ ESCAPE_ MAP: {
amp: string;
apos: string;
gt: string;
lt: string;
quot: string;
}
amp: string
apos: string
gt: string
lt: string
quot: string
PHRASING_ ELEMS: string[]
PRESENTATIONAL_ ATTRIBUTES: string[]
REGEXPS: {
b64DataUrl: RegExp;
byline: RegExp;
extraneous: RegExp;
hasContent: RegExp;
hashUrl: RegExp;
jsonLdArticleTypes: RegExp;
negative: RegExp;
nextLink: RegExp;
normalize: RegExp;
okMaybeItsACandidate: RegExp;
positive: RegExp;
prevLink: RegExp;
replaceFonts: RegExp;
shareElements: RegExp;
srcsetUrl: RegExp;
tokenize: RegExp;
unlikelyCandidates: RegExp;
videos: RegExp;
whitespace: RegExp;
}
b64 Data Url: RegExp
byline: RegExp
extraneous: RegExp
has Content: RegExp
hash Url: RegExp
json Ld Article Types: RegExp
negative: RegExp
next Link: RegExp
normalize: RegExp
ok Maybe ItsACandidate: RegExp
positive: RegExp
prev Link: RegExp
replace Fonts: RegExp
share Elements: RegExp
srcset Url: RegExp
tokenize: RegExp
unlikely Candidates: RegExp
videos: RegExp
whitespace: RegExp
TEXT_ NODE: number
UNLIKELY_ ROLES: string[]
_check Byline: typeof __function
_clean: ((e: any, tag: any) => void)
Clean a node of all elements of type "tag". (Unless it's a youtube/vimeo video. People love movies.)
Parameters
Returns void
void
_clean Classes: ((node: any) => void)
Removes the class="" attribute from every element in the given @@ -242,8 +279,8 @@
Returns
_clean Styles: ((e: any) => void)
Remove the style attribute on every e and under. TODO: Test if getElementsByTagName(*) is faster.
Parameters
Returns void
void
-_concat Node Lists: ((...args: any[]) => any)
Concat all nodelists passed as arguments.
-Parameters
Rest
...args: any[]Returns any
...NodeList
+_concat Node Lists: ((...args: any) => any)
Concat all nodelists passed as arguments.
+Parameters
Rest
...args: anyReturns any
...NodeList
_every Node: ((nodeList: any, fn: any) => any)
Iterate over a NodeList, return true if all of the provided iterate function calls return true, false otherwise.
For convenience, the current object context is applied to the @@ -290,7 +327,7 @@
Returns- (table): {
- table: any
- (e, tags): number
- e: any
- tags: any
- (node, tagName, maxDepth, filterFn): boolean
- node: any
- tagName: any
- maxDepth: any
- filterFn: any
- Read peacefully.
-
columns: number;
rows: number;
}
Parameters
Returns {
columns: number;
rows: number;
}
columns: number
rows: number
_get Text Density: ((e: any, tags: any) => number)
Parameters
Returns number
_grab Article: typeof __function
grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
Param: page
a document to run upon. Needs to be a full document, complete with body.
-Returns
Element
+Returns
Element
_has Ancestor Tag: ((node: any, tagName: any, maxDepth: any, filterFn: any) => boolean)
Check if a given node has one of its ancestor tag name matching the provided one.
Parameters
Returns boolean
Boolean
@@ -358,8 +395,8 @@ReturnsReplace the current DOM tree with the new one.
Returns
void
-Settings
On This Page
Constructors
Properties
- Preparing search index...
- The search index is not available
ai-research-agentClass Document
Index
Constructors
Properties
Methods
Constructors
constructor
Returns Document
Properties
author
body
categories
comments
commentsbody
date
description
filedate
fingerprint
hostname
id
image
language
license
pagetype
raw Text
sitename
tags
text
title
url
Methods
as Dict
Returns Document
clean And Trim
Returns void
set Attributes
Parameters
Returns void
Static
fromParameters
Returns Document
Settings
On This Page
Constructors
Properties
Methods
- Preparing search index...
- The search index is not available
ai-research-agentClass Extractor
Index
Constructors
Properties
Methods
Constructors
constructor
Parameters
authorBlacklist: any;
comments: boolean;
config: {
COOKIE: string;
DOWNLOAD_TIMEOUT: string;
EXTENSIVE_DATE_SEARCH: string;
EXTERNAL_URLS: string;
EXTRACTION_TIMEOUT: string;
MAX_FILE_SIZE: string;
MAX_REDIRECTS: string;
MAX_REPETITIONS: string;
MIN_DUPLCHECK_SIZE: string;
MIN_EXTRACTED_COMM_SIZE: string;
MIN_EXTRACTED_SIZE: string;
MIN_FILE_SIZE: string;
MIN_OUTPUT_COMM_SIZE: string;
MIN_OUTPUT_SIZE: string;
SLEEP_TIME: string;
USER_AGENTS: string;
};
dateParams: any;
dedup: boolean;
fast: boolean;
formatting: boolean;
images: boolean;
lang: any;
links: boolean;
maxTreeSize: any;
onlyWithMetadata: boolean;
outputFormat: string;
precision: boolean;
recall: boolean;
source: any;
tables: boolean;
teiValidation: boolean;
url: any;
urlBlacklist: any;
withMetadata: boolean;
} = {}
author Blacklist: any
comments: boolean
config: {
COOKIE: string;
DOWNLOAD_TIMEOUT: string;
EXTENSIVE_DATE_SEARCH: string;
EXTERNAL_URLS: string;
EXTRACTION_TIMEOUT: string;
MAX_FILE_SIZE: string;
MAX_REDIRECTS: string;
MAX_REPETITIONS: string;
MIN_DUPLCHECK_SIZE: string;
MIN_EXTRACTED_COMM_SIZE: string;
MIN_EXTRACTED_SIZE: string;
MIN_FILE_SIZE: string;
MIN_OUTPUT_COMM_SIZE: string;
MIN_OUTPUT_SIZE: string;
SLEEP_TIME: string;
USER_AGENTS: string;
}
COOKIE: string
DOWNLOAD_ TIMEOUT: string
EXTENSIVE_ DATE_ SEARCH: string
EXTERNAL_ URLS: string
EXTRACTION_ TIMEOUT: string
MAX_ FILE_ SIZE: string
MAX_ REDIRECTS: string
MAX_ REPETITIONS: string
MIN_ DUPLCHECK_ SIZE: string
MIN_ EXTRACTED_ COMM_ SIZE: string
MIN_ EXTRACTED_ SIZE: string
MIN_ FILE_ SIZE: string
MIN_ OUTPUT_ COMM_ SIZE: string
MIN_ OUTPUT_ SIZE: string
SLEEP_ TIME: string
USER_ AGENTS: string
date Params: any
dedup: boolean
fast: boolean
formatting: boolean
images: boolean
lang: any
links: boolean
max Tree Size: any
only With Metadata: boolean
output Format: string
precision: boolean
recall: boolean
source: any
tables: boolean
tei Validation: boolean
url: any
url Blacklist: any
with Metadata: boolean
Returns Extractor
Properties
author Blacklist
comments
config
date Params
dedup
fast
focus
format
formatting
images
lang
links
max Tree Size
only With Metadata
source
tables
tei Validation
url
url Blacklist
with Metadata
Methods
add Config
Parameters
Returns void
set Format
Parameters
Returns void
Settings
On This Page
Constructors
Properties
Methods
- Preparing search index...
- The search index is not available
ai-research-agentClass CrawlParameters
Index
Constructors
Properties
Methods
Constructors
constructor
Parameters
Returns CrawlParameters
Properties
base
i
is On
known Num
lang
prune Xpath
ref
rules
start
Methods
_get Base Url
Parameters
Returns string
_get Reference
Parameters
Returns any
filter List
Parameters
Returns any
is Valid Link
Parameters
Returns boolean
update Metadata
Parameters
Returns void
Settings
On This Page
Constructors
Properties
Methods
- Preparing search index...
- The search index is not available
ai-research-agentFunction suggestNextWordCompletions
Completes the query with the most likely next words for phrases. If typing 2+ letters of a word, returns all possible words matching those few letters.
Parameters
The input query which can be partial words or phrases.
-limitMaxResults: number;
numberOfLastWordsToCheck: number;
phrasesModel: any;
} = {}
limit Max Results: number
default=10 - The maximum number of autocomplete suggestions to return.
+Optional
options: {limitMaxResults: number;
numberOfLastWordsToCheck: number;
phrasesModel: any;
} = {}
limit Max Results: number
default=10 - The maximum number of autocomplete suggestions to return.
number Of Last Words To Check: number
default=5 - The number of last words in the query to check for phrase completions.
phrases Model: any
A custom phrases model to use for autocomplete suggestions.
Returns Promise<any[]>
An array of autocomplete suggestions, each containing either a 'phrase' or 'word' property.
@@ -11,7 +11,7 @@Example
-Settings