From 3532127071b11631d6f226f5171ce25f1e99d61c Mon Sep 17 00:00:00 2001 From: vtempest <1274452+vtempest@users.noreply.github.com> Date: Sat, 7 Sep 2024 17:01:25 -0700 Subject: [PATCH] added UMAP dimensionality reduction, HNSW vector search, with test demos. extractors modularized (readability, postlight) --- .github/workflows/static.yml | 14 +- docs/assets/navigation.js | 2 +- docs/assets/search.js | 2 +- .../src_extract_content_core.Document.html | 23 - .../src_extract_content_core.Extractor.html | 25 - ...act_content_courlan_urlstore.UrlStore.html | 42 - ...xtract_content_deduplication.LRUCache.html | 10 - ...extract_content_deduplication.Simhash.html | 14 - ...rc_extract_content_downloads.Response.html | 13 - ...src_extract_content_settings.Document.html | 30 - ...rc_extract_content_settings.Extractor.html | 27 - ...xtract_content_spider.CrawlParameters.html | 19 - ...c_trafilatura_extractor_core.Document.html | 23 + ..._trafilatura_extractor_core.Extractor.html | 25 + ...a_extractor_courlan_urlstore.UrlStore.html | 42 + ...tura_extractor_deduplication.LRUCache.html | 10 + ...atura_extractor_deduplication.Simhash.html | 14 + ...filatura_extractor_downloads.Response.html | 13 + ...ra_extractor_readability.Readability.html} | 325 +- ...afilatura_extractor_settings.Document.html | 30 + ...filatura_extractor_settings.Extractor.html | 27 + ...tura_extractor_spider.CrawlParameters.html | 19 + ...tocomplete.suggestNextWordCompletions.html | 4 +- ...n_allow_cors.modifyChromeAPIAllowCORS.html | 4 +- ...compile_topic_model.compileTopicModel.html | 4 +- ...topic_model.weightWikiWordSpecificity.html | 2 +- ...press_json_jz64.compressBase64ZipText.html | 7 +- ...rt_dictionary_import.importDictionary.html | 2 +- ...t_human_names_import.importHumanNames.html | 2 +- ...ed_typos_import.importMisspelledTypos.html | 2 +- ...port_quora_import.importCommonQueries.html | 2 +- ..._frequency_import.importTermFrequency.html | 2 +- ...ge_titles_import.importWikiPageTitles.html | 2 +- 
...src_extract_content_baseline.baseline.html | 4 - ...xtract_content_baseline.basicCleaning.html | 4 - ...src_extract_content_baseline.loadHtml.html | 4 - .../src_extract_content_baseline.trim.html | 4 - ...c_extract_content_core.bareExtraction.html | 4 - .../src_extract_content_core.extract.html | 4 - ...ract_content_courlan_clean.cleanQuery.html | 11 - ...xtract_content_courlan_clean.cleanUrl.html | 10 - ..._content_courlan_clean.decodePunycode.html | 9 - ...ntent_courlan_clean.normalizeFragment.html | 10 - ...t_content_courlan_clean.normalizePart.html | 9 - ...ct_content_courlan_clean.normalizeUrl.html | 12 - ...xtract_content_courlan_clean.scrubUrl.html | 9 - ...t_content_courlan_filters.basicFilter.html | 9 - ..._content_courlan_filters.domainFilter.html | 9 - ...ntent_courlan_filters.extensionFilter.html | 9 - ...tent_courlan_filters.isNavigationPage.html | 9 - ...ontent_courlan_filters.isNotCrawlable.html | 9 - ...ct_content_courlan_filters.isValidUrl.html | 9 - ...ct_content_courlan_filters.langFilter.html | 12 - ...ct_content_courlan_filters.pathFilter.html | 10 - ...ct_content_courlan_filters.typeFilter.html | 11 - ...t_content_courlan_filters.validateUrl.html | 9 - ...extract_content_courlan_meta.addCache.html | 6 - ...ract_content_courlan_meta.clearCaches.html | 6 - ...ntent_courlan_network.redirectionTest.html | 12 - ...ontent_courlan_urlutils.extractDomain.html | 11 - ...t_content_courlan_urlutils.filterUrls.html | 10 - ...t_content_courlan_urlutils.getBaseUrl.html | 9 - ...ntent_courlan_urlutils.getHostAndPath.html | 9 - ..._content_courlan_urlutils.getHostinfo.html | 9 - ...t_content_courlan_urlutils.getTldinfo.html | 10 - ...t_content_courlan_urlutils.isExternal.html | 11 - ..._content_courlan_urlutils.isKnownLink.html | 10 - ...tent_deduplication.contentFingerprint.html | 4 - ...content_deduplication.isSimilarDomain.html | 4 - ...content_downloads.addToCompressedDict.html | 4 - ...t_content_downloads.bufferedDownloads.html | 4 - 
...tract_content_downloads.fetchResponse.html | 4 - ...rc_extract_content_downloads.fetchUrl.html | 4 - ..._content_downloads.loadDownloadBuffer.html | 4 - ...ct_content_external.compareExtraction.html | 4 - ...xtract_content_external.justextRescue.html | 4 - ...extract_content_external.sanitizeTree.html | 4 - ...rc_extract_content_feeds.findFeedUrls.html | 4 - ...src_extract_content_feeds.tryHomepage.html | 4 - ...ontent_htmlprocessing.buildHtmlOutput.html | 4 - ...ontent_htmlprocessing.collectLinkInfo.html | 4 - ...ntent_htmlprocessing.convertDeletions.html | 4 - ...content_htmlprocessing.convertDetails.html | 4 - ...t_content_htmlprocessing.convertLists.html | 4 - ..._content_htmlprocessing.convertQuotes.html | 4 - ..._content_htmlprocessing.convertToHtml.html | 4 - ...nt_htmlprocessing.deleteByLinkDensity.html | 4 - ...ontent_htmlprocessing.linkDensityTest.html | 4 - ..._htmlprocessing.linkDensityTestTables.html | 4 - ...ract_content_htmlprocessing.pruneHtml.html | 4 - ...ent_htmlprocessing.pruneUnwantedNodes.html | 4 - ...t_content_htmlprocessing.treeCleaning.html | 4 - ...act_content_json_metadata.extractJson.html | 4 - ...t_json_metadata.extractJsonParseError.html | 4 - ...ontent_json_metadata.normalizeAuthors.html | 4 - ...ontent_main_extractor.extractComments.html | 4 - ...content_main_extractor.extractContent.html | 4 - .../src_extract_content_meta.resetCaches.html | 4 - ...ract_content_metadata.extractMetadata.html | 4 - ...ract_content_settings.argsToExtractor.html | 4 - ...ontent_sitemaps.extractRobotsSitemaps.html | 4 - ...t_content_sitemaps.findRobotsSitemaps.html | 4 - ...t_content_sitemaps.isPlausibleSitemap.html | 4 - ...xtract_content_sitemaps.sitemapSearch.html | 4 - ...extract_content_spider.focusedCrawler.html | 4 - ...xtract_content_utils.controlXmlOutput.html | 4 - ...c_extract_content_utils.deleteElement.html | 4 - ...extract_content_utils.mergeWithParent.html | 4 - ...act_content_utils.removeEmptyElements.html | 4 - 
...extract_content_utils.stripDoubleTags.html | 4 - .../src_extract_content_utils.textfilter.html | 4 - .../src_extract_content_utils.trim.html | 4 - .../src_extract_content_utils.xmlToTxt.html | 4 - ...c_extract_content_xml.buildJsonOutput.html | 4 - ...rc_extract_content_xml.buildTeiOutput.html | 4 - .../src_extract_content_xml.checkTei.html | 4 - ...src_extract_content_xml.deleteElement.html | 4 - ...tract_content_xml.removeEmptyElements.html | 4 - ..._to_cite_extract_author.extractAuthor.html | 2 +- ...html_to_cite_extract_cite.extractCite.html | 2 +- ...html_to_cite_extract_date.extractDate.html | 2 +- ..._to_cite_extract_source.extractSource.html | 2 +- ...ml_to_cite_extract_title.extractTitle.html | 2 +- ...an_names_recognize.extractNamedEntity.html | 2 +- ...adata_to_cite.extractCiteFromMetadata.html | 2 +- ...cite_url_to_domain.convertURLToDomain.html | 2 +- ...to_cite_url_to_favicon.extractFavicon.html | 4 +- ...nt_extract_content.extractMainContent.html | 77 - ...extractor1_content.extractContentHTML.html | 77 + ...xtractor2_content.extractContentHTML2.html | 41 + ...special_chars.convertHTMLSpecialChars.html | 6 +- ...nt_html_to_basic_html.addDOMFunctions.html | 2 +- ..._to_basic_html.convertHTMLToBasicHTML.html | 10 +- ...ontent_html_to_content.extractContent.html | 6 - ...html_to_content.extractContentAndCite.html | 14 + ...ntent_readability2.extractContentHTML.html | 24 - ...tent_pdf_to_content.convertPDFToHTML.html} | 9 +- ...rl_to_content_pdf_to_content.isUrlPDF.html | 2 +- ...r_url_to_content_scrape_url.scrapeURL.html | 10 +- ...url_to_content_url_to_content.extract.html | 7 +- ...tent_youtube_embed.embedYoutubePlayer.html | 2 +- ...nt_youtube_to_text.extractYoutubeText.html | 7 +- ...ings_to_graph.convertEmbeddingsToHNSW.html | 15 + ...ings_to_graph.convertEmbeddingsToUMAP.html | 13 + ...to_graph.convertTextToEmbeddingVector.html | 17 + ..._embeddings_to_graph.cosineSimilarity.html | 4 + ..._embeddings_to_graph.getAllEmbeddings.html | 8 + 
...eddings_to_graph.getEmbeddingPipeline.html | 6 + ...h_embeddings_to_graph.searchWithQuery.html | 4 + ...e_hnsw.convertEmbeddingsIndexToBase64.html | 10 + ...etters.calculateSimilarityByCharacter.html | 2 +- .../src_match_match_quasar.matchQUASAR.html | 8 +- ...h_softmax.calculateProbabilitySoftmax.html | 5 +- ..._frequency.calculatePhraseSpecificity.html | 2 +- ...frequency.weighRelevanceTermFrequency.html | 4 +- ...search_web_search_stream.searchSTREAM.html | 16 +- .../src_search_web_search_web.searchWeb.html | 7 +- ..._web_search_wikipedia.searchWikipedia.html | 4 +- ...larity_concept.vectorizeTextAsConcept.html | 16 - ...y_concept.weighRelevanceConceptVector.html | 7 +- ...oncept.weighRelevanceConceptVectorAPI.html | 7 +- ...src_tokenize_sentences.splitSentences.html | 7 +- ...okenize_stopwords.isWordCommonIgnored.html | 2 +- ...text_to_chunks.splitTextSemanticChars.html | 2 +- ...kenize_tokenize_topics.tokenizeTopics.html | 7 +- ...rd_to_root_stem.convertWordToRootStem.html | 2 +- ...ution.weighTopicDirichletDistribution.html | 4 +- ...rc_topics_ngrams.extractNounEdgeGrams.html | 2 +- ...rases.rankSentencesCentralToKeyphrase.html | 2 +- ...seektopic_keyphrases.extractSEEKTOPIC.html | 4 +- ...afilatura_extractor_baseline.baseline.html | 4 + ...tura_extractor_baseline.basicCleaning.html | 4 + ...afilatura_extractor_baseline.html2txt.html | 4 + ...afilatura_extractor_baseline.loadHtml.html | 4 + ...c_trafilatura_extractor_baseline.trim.html | 4 + ...latura_extractor_core.bareExtraction.html} | 2 +- ...rc_trafilatura_extractor_core.extract.html | 4 + ...ra_extractor_courlan_clean.cleanQuery.html | 11 + ...tura_extractor_courlan_clean.cleanUrl.html | 10 + ...xtractor_courlan_clean.decodePunycode.html | 9 + ...actor_courlan_clean.normalizeFragment.html | 10 + ...extractor_courlan_clean.normalizePart.html | 9 + ..._extractor_courlan_clean.normalizeUrl.html | 12 + ...tura_extractor_courlan_clean.scrubUrl.html | 9 + ...tura_extractor_courlan_core.checkUrl.html} | 4 
+- ..._extractor_courlan_core.extractLinks.html} | 4 +- ...a_extractor_courlan_core.filterLinks.html} | 4 +- ...extractor_courlan_filters.basicFilter.html | 9 + ...xtractor_courlan_filters.domainFilter.html | 9 + ...actor_courlan_filters.extensionFilter.html | 9 + ...ctor_courlan_filters.isNavigationPage.html | 9 + ...ractor_courlan_filters.isNotCrawlable.html | 9 + ...extractor_courlan_filters.isValidUrl.html} | 7 +- ..._extractor_courlan_filters.langFilter.html | 12 + ..._extractor_courlan_filters.pathFilter.html | 10 + ..._extractor_courlan_filters.typeFilter.html | 11 + ...extractor_courlan_filters.validateUrl.html | 9 + ...tura_extractor_courlan_meta.addCache.html} | 4 +- ...ra_extractor_courlan_meta.clearCaches.html | 6 + ...actor_courlan_network.redirectionTest.html | 12 + ...xtractor_courlan_sampling.sampleUrls.html} | 4 +- ...actor_courlan_urlutils.extractDomain.html} | 9 +- ...extractor_courlan_urlutils.filterUrls.html | 10 + ...ctor_courlan_urlutils.fixRelativeUrls.html | 10 + ...extractor_courlan_urlutils.getBaseUrl.html | 9 + ...actor_courlan_urlutils.getHostAndPath.html | 9 + ...xtractor_courlan_urlutils.getHostinfo.html | 9 + ...extractor_courlan_urlutils.getTldinfo.html | 10 + ...extractor_courlan_urlutils.isExternal.html | 11 + ...xtractor_courlan_urlutils.isKnownLink.html | 10 + ...ctor_deduplication.contentFingerprint.html | 4 + ...extractor_deduplication.duplicateTest.html | 4 + ...tractor_deduplication.generateBowHash.html | 4 + ...tractor_deduplication.isSimilarDomain.html | 4 + ...tractor_downloads.addToCompressedDict.html | 4 + ...extractor_downloads.bufferedDownloads.html | 4 + ...ura_extractor_downloads.fetchResponse.html | 4 + ...filatura_extractor_downloads.fetchUrl.html | 4 + ...xtractor_downloads.loadDownloadBuffer.html | 4 + ..._extractor_external.compareExtraction.html | 4 + ...tura_extractor_external.justextRescue.html | 4 + ...atura_extractor_external.sanitizeTree.html | 4 + ...ilatura_extractor_feeds.findFeedUrls.html} | 2 +- 
...filatura_extractor_feeds.tryHomepage.html} | 2 +- ...special_chars.convertHTMLSpecialChars.html | 13 + ...ractor_htmlprocessing.buildHtmlOutput.html | 4 + ...actor_htmlprocessing.collectLinkInfo.html} | 2 +- ...actor_htmlprocessing.convertDeletions.html | 4 + ...tractor_htmlprocessing.convertDetails.html | 4 + ...actor_htmlprocessing.convertHeadings.html} | 8 +- ...ctor_htmlprocessing.convertLineBreaks.html | 4 + ...xtractor_htmlprocessing.convertLists.html} | 2 +- ...xtractor_htmlprocessing.convertQuotes.html | 4 + ..._extractor_htmlprocessing.convertTags.html | 4 + ...xtractor_htmlprocessing.convertToHtml.html | 4 + ...or_htmlprocessing.deleteByLinkDensity.html | 4 + ...ractor_htmlprocessing.handleTextnode.html} | 2 +- ...ractor_htmlprocessing.linkDensityTest.html | 4 + ..._htmlprocessing.linkDensityTestTables.html | 4 + ..._extractor_htmlprocessing.processNode.html | 4 + ...ra_extractor_htmlprocessing.pruneHtml.html | 4 + ...or_htmlprocessing.pruneUnwantedNodes.html} | 2 +- ...extractor_htmlprocessing.treeCleaning.html | 4 + ..._extractor_json_metadata.extractJson.html} | 2 +- ...r_json_metadata.extractJsonParseError.html | 4 + ...ractor_json_metadata.normalizeAuthors.html | 4 + ...extractor_json_metadata.normalizeJson.html | 4 + ...actor_main_extractor.extractComments.html} | 2 +- ...tractor_main_extractor.extractContent.html | 4 + ...rafilatura_extractor_meta.resetCaches.html | 4 + ...ra_extractor_metadata.extractMetadata.html | 4 + ...r_readability_lxml.extractContentHTML.html | 4 + ...a_extractor_settings.argsToExtractor.html} | 2 +- ...tura_extractor_settings.setDateParams.html | 4 + ...ractor_sitemaps.extractRobotsSitemaps.html | 4 + ...extractor_sitemaps.findRobotsSitemaps.html | 4 + ...extractor_sitemaps.isPlausibleSitemap.html | 4 + ...ura_extractor_sitemaps.sitemapSearch.html} | 2 +- ...atura_extractor_spider.focusedCrawler.html | 4 + ...tura_extractor_utils.buildJsonOutput.html} | 2 +- ...atura_extractor_utils.buildTeiOutput.html} | 2 +- 
...latura_extractor_utils.buildXmlOutput.html | 4 + ..._trafilatura_extractor_utils.checkTei.html | 4 + ...tura_extractor_utils.controlXmlOutput.html | 4 + ...latura_extractor_utils.deleteElement.html} | 2 +- ...tura_extractor_utils.mergeWithParent.html} | 2 +- ...ura_extractor_utils.normalizeUnicode.html} | 2 +- ...a_extractor_utils.removeEmptyElements.html | 4 + ...tura_extractor_utils.stripDoubleTags.html} | 2 +- ...ilatura_extractor_utils.textCharsTest.html | 4 + ...rafilatura_extractor_utils.textfilter.html | 4 + .../src_trafilatura_extractor_utils.trim.html | 4 + ...atura_extractor_utils.writeFullheader.html | 4 + ...filatura_extractor_utils.writeTeitree.html | 4 + ...trafilatura_extractor_utils.xmlToTxt.html} | 2 +- ...ilatura_extractor_xml.buildJsonOutput.html | 4 + ...ilatura_extractor_xml.buildTeiOutput.html} | 2 +- ...filatura_extractor_xml.buildXmlOutput.html | 4 + ...rc_trafilatura_extractor_xml.checkTei.html | 4 + ...latura_extractor_xml.controlXmlOutput.html | 4 + ...afilatura_extractor_xml.deleteElement.html | 4 + ...ilatura_extractor_xml.mergeWithParent.html | 4 + ...ura_extractor_xml.removeEmptyElements.html | 4 + ...ilatura_extractor_xml.stripDoubleTags.html | 4 + docs/index.html | 48 +- ...te_human_names_recognize.AuthorObject.html | 8 +- ...tent_html_to_content.ExtractedContent.html | 18 + ...url_to_content_url_to_content.Article.html | 24 +- ...o_content_youtube_embed.YouTubePlayer.html | 74 +- ...aph_embeddings_to_graph.PlotDataPoint.html | 10 + .../src_tokenize_tokenize_topics.Token.html | 8 +- docs/modules/index.html | 14 +- .../src_autocomplete_autocomplete.html | 2 +- .../src_chrome_extension_allow_cors.html | 2 +- ...rc_dataset_import_compile_topic_model.html | 2 +- ...src_dataset_import_compress_json_jz64.html | 2 +- .../src_dataset_import_dictionary_import.html | 2 +- ...src_dataset_import_human_names_import.html | 2 +- .../src_dataset_import_metadata_stats.html | 2 +- ...ataset_import_misspelled_typos_import.html | 2 +- 
.../src_dataset_import_quora_import.html | 2 +- ..._dataset_import_term_frequency_import.html | 2 +- ...dataset_import_wikipage_titles_import.html | 2 +- .../modules/src_extract_content_baseline.html | 9 - docs/modules/src_extract_content_core.html | 8 - .../src_extract_content_courlan-1.html | 31 - docs/modules/src_extract_content_courlan.html | 31 - .../src_extract_content_courlan_clean.html | 11 - .../src_extract_content_courlan_filters.html | 14 - .../src_extract_content_courlan_settings.html | 8 - .../src_extract_content_courlan_urlutils.html | 13 - .../src_extract_content_deduplication.html | 10 - .../src_extract_content_htmldate-1.html | 5 - .../src_extract_content_htmldate.export_.html | 4 - .../src_extract_content_htmldate_core.html | 15 - ...c_extract_content_htmldate_extractors.html | 25 - .../src_extract_content_htmldate_meta.html | 5 - ...src_extract_content_htmldate_settings.html | 9 - .../src_extract_content_htmldate_utils.html | 16 - ...c_extract_content_htmldate_validators.html | 14 - .../src_extract_content_json_metadata.html | 8 - .../src_extract_content_readability_lxml.html | 5 - .../modules/src_extract_content_settings.html | 23 - .../modules/src_extract_content_sitemaps.html | 8 - docs/modules/src_extract_content_spider.html | 7 - docs/modules/src_extract_content_utils.html | 18 - docs/modules/src_extract_content_xml.html | 13 - docs/modules/src_extract_content_xpaths.html | 17 - ...extractor_html_to_cite_extract_author.html | 2 +- ...c_extractor_html_to_cite_extract_cite.html | 2 +- ...c_extractor_html_to_cite_extract_date.html | 2 +- ...extractor_html_to_cite_extract_source.html | 2 +- ..._extractor_html_to_cite_extract_title.html | 2 +- ...or_html_to_cite_human_names_recognize.html | 2 +- ...tractor_html_to_cite_metadata_to_cite.html | 2 +- ..._extractor_html_to_cite_url_to_domain.html | 2 +- ...extractor_html_to_cite_url_to_favicon.html | 2 +- ...actor_html_to_content_extract_content.html | 5 - 
...t_extract_content_extractor1_content.html} | 3 +- ...nt_extract_content_extractor2_content.html | 5 + ...act_content_extractor2_content_utils.html} | 2 +- ...or_html_to_content_html_special_chars.html | 2 +- ...or_html_to_content_html_to_basic_html.html | 2 +- ...actor_html_to_content_html_to_content.html | 3 +- ...tractor_url_to_content_pdf_to_content.html | 2 +- ...c_extractor_url_to_content_scrape_url.html | 2 +- ...tractor_url_to_content_url_to_content.html | 2 +- ...xtractor_url_to_content_youtube_embed.html | 2 +- ...ractor_url_to_content_youtube_to_text.html | 2 +- .../src_graph_embeddings_to_graph.html | 12 + ...ent_meta.html => src_graph_save_hnsw.html} | 2 +- docs/modules/src_match_compare_letters.html | 2 +- docs/modules/src_match_match_quasar.html | 2 +- docs/modules/src_match_softmax.html | 2 +- .../src_match_weigh_relevance_frequency.html | 2 +- .../modules/src_search_web_search_stream.html | 2 +- docs/modules/src_search_web_search_web.html | 2 +- .../src_search_web_search_wikipedia.html | 2 +- .../src_similarity_similarity_concept.html | 3 +- docs/modules/src_tokenize_sentences.html | 2 +- docs/modules/src_tokenize_stopwords.html | 2 +- docs/modules/src_tokenize_text_to_chunks.html | 2 +- .../modules/src_tokenize_tokenize_topics.html | 2 +- .../src_tokenize_word_to_root_stem.html | 2 +- ...topic_distribution_topic_distribution.html | 2 +- docs/modules/src_topics_ngrams.html | 2 +- .../src_topics_rank_sentences_keyphrases.html | 2 +- .../src_topics_seektopic_keyphrases.html | 2 +- .../src_trafilatura_extractor_baseline.html | 9 + .../src_trafilatura_extractor_core.html | 8 + .../src_trafilatura_extractor_courlan-1.html | 31 + .../src_trafilatura_extractor_courlan.html | 31 + ...c_trafilatura_extractor_courlan_clean.html | 11 + ...rc_trafilatura_extractor_courlan_core.html | 7 + ...trafilatura_extractor_courlan_filters.html | 14 + ...rc_trafilatura_extractor_courlan_meta.html | 6 + ...rafilatura_extractor_courlan_network.html} | 2 +- 
...afilatura_extractor_courlan_sampling.html} | 3 +- ...rafilatura_extractor_courlan_settings.html | 8 + ...rafilatura_extractor_courlan_urlstore.html | 5 + ...rafilatura_extractor_courlan_urlutils.html | 13 + ...c_trafilatura_extractor_deduplication.html | 10 + ... src_trafilatura_extractor_downloads.html} | 7 +- ...> src_trafilatura_extractor_external.html} | 21 +- ...l => src_trafilatura_extractor_feeds.html} | 5 +- ...ilatura_extractor_html_special_chars.html} | 2 +- .../src_trafilatura_extractor_htmldate-1.html | 5 + ...rafilatura_extractor_htmldate.export_.html | 4 + .../src_trafilatura_extractor_htmldate.html | 6 + ...c_trafilatura_extractor_htmldate_core.html | 15 + ...ilatura_extractor_htmldate_extractors.html | 25 + ...c_trafilatura_extractor_htmldate_meta.html | 5 + ...afilatura_extractor_htmldate_settings.html | 9 + ..._trafilatura_extractor_htmldate_utils.html | 16 + ...ilatura_extractor_htmldate_validators.html | 14 + ..._trafilatura_extractor_htmlprocessing.html | 22 + ...c_trafilatura_extractor_json_metadata.html | 8 + ..._trafilatura_extractor_main_extractor.html | 6 + ...ml => src_trafilatura_extractor_meta.html} | 2 +- ...> src_trafilatura_extractor_metadata.html} | 3 +- ...rc_trafilatura_extractor_readability.html} | 2 +- ...afilatura_extractor_readability_lxml.html} | 2 +- .../src_trafilatura_extractor_settings.html | 23 + .../src_trafilatura_extractor_sitemaps.html | 8 + ... => src_trafilatura_extractor_spider.html} | 6 +- ...afilatura_extractor_test_extract_test.html | 4 + .../src_trafilatura_extractor_utils.html | 20 + ...tml => src_trafilatura_extractor_xml.html} | 15 +- ... 
=> src_trafilatura_extractor_xpaths.html} | 15 +- ...ct_content_courlan_settings.BLACKLIST.html | 4 - ..._content_courlan_settings.LANG_PARAMS.html | 4 - ...rc_extract_content_htmldate.export_-1.html | 4 - ...tract_content_settings.DEFAULT_CONFIG.html | 4 - ...act_content_settings.MANUALLY_CLEANED.html | 4 - ...tent_settings.MAX_FILES_PER_DIRECTORY.html | 4 - ...rc_extract_content_settings.MAX_LINKS.html | 4 - ...ct_content_settings.MAX_SITEMAPS_SEEN.html | 4 - ...ct_content_settings.SUPPORTED_FORMATS.html | 4 - ..._extract_content_settings.TAG_CATALOG.html | 4 - .../src_extract_content_spider.URL_STORE.html | 4 - ..._content_xpaths.AUTHOR_DISCARD_XPATHS.html | 4 - ..._extract_content_xpaths.AUTHOR_XPATHS.html | 4 - ...ract_content_xpaths.CATEGORIES_XPATHS.html | 4 - ..._content_xpaths.OVERALL_DISCARD_XPATH.html | 4 - ...rc_extract_content_xpaths.TAGS_XPATHS.html | 4 - ...t_content_xpaths.TEASER_DISCARD_XPATH.html | 4 - ...c_extract_content_xpaths.TITLE_XPATHS.html | 4 - ...actor_courlan_settings.ALLOWED_PARAMS.html | 4 + ..._extractor_courlan_settings.BLACKLIST.html | 4 + ...xtractor_courlan_settings.LANG_PARAMS.html | 4 + ...tractor_courlan_settings.TARGET_LANGS.html | 4 + ...filatura_extractor_htmldate.export_-1.html | 4 + ...extractor_settings.BASIC_CLEAN_XPATH.html} | 2 +- ...a_extractor_settings.CUT_EMPTY_ELEMS.html} | 2 +- ...ura_extractor_settings.DEFAULT_CONFIG.html | 4 + ...tura_extractor_settings.FILENAME_LEN.html} | 2 +- ...extractor_settings.JUSTEXT_LANGUAGES.html} | 2 +- ...filatura_extractor_settings.LRU_SIZE.html} | 2 +- ..._extractor_settings.MANUALLY_CLEANED.html} | 2 +- ..._extractor_settings.MANUALLY_STRIPPED.html | 4 + ...tor_settings.MAX_FILES_PER_DIRECTORY.html} | 2 +- ...ilatura_extractor_settings.MAX_LINKS.html} | 2 +- ...extractor_settings.MAX_SITEMAPS_SEEN.html} | 2 +- ...ra_extractor_settings.PARALLEL_CORES.html} | 2 +- ...extractor_settings.SUPPORTED_FMT_CLI.html} | 2 +- ..._extractor_settings.SUPPORTED_FORMATS.html | 4 + 
...latura_extractor_settings.TAG_CATALOG.html | 4 + ...afilatura_extractor_spider.URL_STORE.html} | 2 +- ...xtractor_xpaths.AUTHOR_DISCARD_XPATHS.html | 4 + ...latura_extractor_xpaths.AUTHOR_XPATHS.html | 4 + ...filatura_extractor_xpaths.BODY_XPATH.html} | 2 +- ...ra_extractor_xpaths.CATEGORIES_XPATHS.html | 4 + ...tractor_xpaths.COMMENTS_DISCARD_XPATH.html | 4 + ...atura_extractor_xpaths.COMMENTS_XPATH.html | 4 + ...tractor_xpaths.DISCARD_IMAGE_ELEMENTS.html | 4 + ...xtractor_xpaths.OVERALL_DISCARD_XPATH.html | 4 + ...ractor_xpaths.PRECISION_DISCARD_XPATH.html | 4 + ...xtractor_xpaths.REMOVE_COMMENTS_XPATH.html | 4 + ...ilatura_extractor_xpaths.TAGS_XPATHS.html} | 2 +- ...extractor_xpaths.TEASER_DISCARD_XPATH.html | 4 + ...ilatura_extractor_xpaths.TITLE_XPATHS.html | 4 + index.js | 40 +- package.json | 31 +- pnpm-lock.yaml | 2821 +++++++++++++++++ readme.md | 47 +- src/autocomplete/autocomplete.js | 2 +- src/dataset-import/compile-topic-model.js | 2 +- src/dataset-import/compress-json-jz64.js | 4 +- src/extract-content/python/baseline.py | 121 - src/extract-content/python/clean.py | 207 -- src/extract-content/python/core.py | 364 --- src/extract-content/python/courlan-core.py | 256 -- src/extract-content/python/deduplication.py | 253 -- src/extract-content/python/downloads.py | 474 --- src/extract-content/python/external.py | 189 -- src/extract-content/python/extractors.py | 515 --- src/extract-content/python/feeds.py | 312 -- src/extract-content/python/filters.py | 286 -- src/extract-content/python/htmldate-core.py | 982 ------ src/extract-content/python/htmlprocessing.py | 412 --- src/extract-content/python/json_metadata.py | 268 -- src/extract-content/python/main_extractor.py | 666 ---- src/extract-content/python/meta (1).py | 14 - src/extract-content/python/meta (2).py | 40 - src/extract-content/python/meta.py | 32 - src/extract-content/python/metadata.py | 496 --- src/extract-content/python/network.py | 71 - .../python/readability_lxml.py | 520 --- 
src/extract-content/python/sampling.py | 73 - src/extract-content/python/settings (1).py | 109 - src/extract-content/python/settings (2).py | 41 - src/extract-content/python/settings.py | 282 -- src/extract-content/python/sitemaps.py | 301 -- src/extract-content/python/spider.py | 351 -- src/extract-content/python/urlstore.py | 575 ---- src/extract-content/python/urlutils.py | 173 - src/extract-content/python/utils (1).py | 260 -- src/extract-content/python/utils.py | 445 --- src/extract-content/python/validators.py | 200 -- src/extract-content/python/xml.py | 611 ---- src/extract-content/python/xpaths.py | 267 -- src/extractor/html-to-cite/url-to-favicon.js | 2 +- .../html-to-content/extract-content.js | 222 -- .../extract-selectors-per-domain.json} | 0 .../extract-content/extractor1-content.js | 391 +++ .../extractor2-content-utils.js | 743 +++++ .../extract-content/extractor2-content.js | 820 +++++ .../html-to-content/html-special-chars.js | 7 +- .../html-to-content/html-to-basic-html.js | 13 +- .../html-to-content/html-to-content.js | 73 +- src/extractor/html-to-content/readability2.js | 303 -- .../url-to-content/pdf-to-content.js | 10 +- src/extractor/url-to-content/scrape-url.js | 26 +- .../url-to-content/url-to-content.js | 18 +- .../url-to-content/youtube-to-text.js | 23 +- src/graph/embeddings-to-graph.js | 172 + src/graph/save-hnsw.js | 32 + src/match/match-quasar.js | 12 +- src/match/softmax.js | 6 +- src/match/weigh-relevance-frequency.js | 2 +- src/search-web/search-stream.js | 15 +- src/search-web/search-web.js | 4 +- src/search-web/search-wikipedia.js | 2 +- src/similarity/similarity-concept.js | 64 +- src/tokenize/sentences.js | 4 +- src/tokenize/tokenize-topics.js | 4 +- src/topic-distribution/topic-distribution.js | 2 +- src/topics/seektopic-keyphrases.js | 2 +- src/trafilatura-extractor/.gitignore | 175 + .../baseline.js | 0 .../core.js | 0 .../courlan/clean.js | 0 .../courlan/core.js | 0 .../courlan/filters.js | 0 .../courlan/index.js | 0 
.../courlan/meta.js | 0 .../courlan/network.js | 0 .../courlan/sampling.js | 0 .../courlan/settings.js | 0 .../courlan/urlstore.js | 0 .../courlan/urlutils.js | 0 .../deduplication.js | 0 .../downloads.js | 0 .../external.js | 0 .../feeds.js | 0 .../html-special-chars.js | 74 + .../htmldate/core.js | 13 - .../htmldate/extractors.js | 0 .../htmldate/index.js | 0 .../htmldate/meta.js | 0 .../htmldate/settings.js | 0 .../htmldate/utils.js | 0 .../htmldate/validators.js | 0 .../htmlprocessing.js | 0 src/trafilatura-extractor/jsconfig.json | 27 + .../json_metadata.js | 0 .../main_extractor.js | 0 .../meta.js | 0 .../metadata.js | 0 src/trafilatura-extractor/package.json | 19 + .../readability.js | 31 +- .../readability_lxml.js | 0 .../settings.js | 2 +- .../sitemaps.js | 0 .../spider.js | 0 .../test/extract.test.js | 28 + .../utils.js | 0 .../xml.js | 0 .../xpaths.js | 0 test/data/extract-article.test.js | 2 +- test/embedding.test.js | 89 + test/extractor-test/add-score.test.js | 23 + test/extractor-test/add-to-parent.test.js | 17 + test/extractor-test/extractor.test.js | 26 + .../extractor-test/find-top-candidate.test.js | 70 + test/extractor-test/get-or-init-score.test.js | 55 + test/extractor-test/get-score.test.js | 20 + test/extractor-test/get-weight.test.js | 92 + test/extractor-test/score-commas.test.js | 18 + test/extractor-test/score-content.test.js | 90 + test/extractor-test/score-length.test.js | 21 + test/extractor-test/score-node.test.js | 101 + test/extractor-test/score-parag.test.js | 39 + test/extractor-test/set-score.test.js | 19 + test/foo.dat | Bin 1856 -> 0 bytes test/graph.test.js | 32 + test/hnsw.js | 45 - test/readability.test.js | 37 +- test/relevace-concept-vector.test.js | 6 +- test/topic-distribution.test.js | 2 +- test/vectors.dat | Bin 8496 -> 0 bytes tsconfig.json | 2 +- web-app/package.json | 7 +- web-app/src/components/Home/Graph.svelte | 86 + web-app/src/components/Home/Home.svelte | 144 +- web-app/src/components/Home/ReadView.svelte | 2 
+- 596 files changed, 8373 insertions(+), 12754 deletions(-) delete mode 100644 docs/classes/src_extract_content_core.Document.html delete mode 100644 docs/classes/src_extract_content_core.Extractor.html delete mode 100644 docs/classes/src_extract_content_courlan_urlstore.UrlStore.html delete mode 100644 docs/classes/src_extract_content_deduplication.LRUCache.html delete mode 100644 docs/classes/src_extract_content_deduplication.Simhash.html delete mode 100644 docs/classes/src_extract_content_downloads.Response.html delete mode 100644 docs/classes/src_extract_content_settings.Document.html delete mode 100644 docs/classes/src_extract_content_settings.Extractor.html delete mode 100644 docs/classes/src_extract_content_spider.CrawlParameters.html create mode 100644 docs/classes/src_trafilatura_extractor_core.Document.html create mode 100644 docs/classes/src_trafilatura_extractor_core.Extractor.html create mode 100644 docs/classes/src_trafilatura_extractor_courlan_urlstore.UrlStore.html create mode 100644 docs/classes/src_trafilatura_extractor_deduplication.LRUCache.html create mode 100644 docs/classes/src_trafilatura_extractor_deduplication.Simhash.html create mode 100644 docs/classes/src_trafilatura_extractor_downloads.Response.html rename docs/classes/{src_extractor_html_to_content_readability.Readability.html => src_trafilatura_extractor_readability.Readability.html} (55%) create mode 100644 docs/classes/src_trafilatura_extractor_settings.Document.html create mode 100644 docs/classes/src_trafilatura_extractor_settings.Extractor.html create mode 100644 docs/classes/src_trafilatura_extractor_spider.CrawlParameters.html delete mode 100644 docs/functions/src_extract_content_baseline.baseline.html delete mode 100644 docs/functions/src_extract_content_baseline.basicCleaning.html delete mode 100644 docs/functions/src_extract_content_baseline.loadHtml.html delete mode 100644 docs/functions/src_extract_content_baseline.trim.html delete mode 100644 
docs/functions/src_extract_content_core.bareExtraction.html delete mode 100644 docs/functions/src_extract_content_core.extract.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.cleanQuery.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.cleanUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.decodePunycode.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.normalizeFragment.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.normalizePart.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.normalizeUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_clean.scrubUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.basicFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.domainFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.extensionFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.isNavigationPage.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.isNotCrawlable.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.isValidUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.langFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.pathFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.typeFilter.html delete mode 100644 docs/functions/src_extract_content_courlan_filters.validateUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_meta.addCache.html delete mode 100644 docs/functions/src_extract_content_courlan_meta.clearCaches.html delete mode 100644 docs/functions/src_extract_content_courlan_network.redirectionTest.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.extractDomain.html delete mode 100644 
docs/functions/src_extract_content_courlan_urlutils.filterUrls.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.getBaseUrl.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.getHostAndPath.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.getHostinfo.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.getTldinfo.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.isExternal.html delete mode 100644 docs/functions/src_extract_content_courlan_urlutils.isKnownLink.html delete mode 100644 docs/functions/src_extract_content_deduplication.contentFingerprint.html delete mode 100644 docs/functions/src_extract_content_deduplication.isSimilarDomain.html delete mode 100644 docs/functions/src_extract_content_downloads.addToCompressedDict.html delete mode 100644 docs/functions/src_extract_content_downloads.bufferedDownloads.html delete mode 100644 docs/functions/src_extract_content_downloads.fetchResponse.html delete mode 100644 docs/functions/src_extract_content_downloads.fetchUrl.html delete mode 100644 docs/functions/src_extract_content_downloads.loadDownloadBuffer.html delete mode 100644 docs/functions/src_extract_content_external.compareExtraction.html delete mode 100644 docs/functions/src_extract_content_external.justextRescue.html delete mode 100644 docs/functions/src_extract_content_external.sanitizeTree.html delete mode 100644 docs/functions/src_extract_content_feeds.findFeedUrls.html delete mode 100644 docs/functions/src_extract_content_feeds.tryHomepage.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.buildHtmlOutput.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.collectLinkInfo.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.convertDeletions.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.convertDetails.html delete mode 100644 
docs/functions/src_extract_content_htmlprocessing.convertLists.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.convertQuotes.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.convertToHtml.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.deleteByLinkDensity.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.linkDensityTest.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.linkDensityTestTables.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.pruneHtml.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.pruneUnwantedNodes.html delete mode 100644 docs/functions/src_extract_content_htmlprocessing.treeCleaning.html delete mode 100644 docs/functions/src_extract_content_json_metadata.extractJson.html delete mode 100644 docs/functions/src_extract_content_json_metadata.extractJsonParseError.html delete mode 100644 docs/functions/src_extract_content_json_metadata.normalizeAuthors.html delete mode 100644 docs/functions/src_extract_content_main_extractor.extractComments.html delete mode 100644 docs/functions/src_extract_content_main_extractor.extractContent.html delete mode 100644 docs/functions/src_extract_content_meta.resetCaches.html delete mode 100644 docs/functions/src_extract_content_metadata.extractMetadata.html delete mode 100644 docs/functions/src_extract_content_settings.argsToExtractor.html delete mode 100644 docs/functions/src_extract_content_sitemaps.extractRobotsSitemaps.html delete mode 100644 docs/functions/src_extract_content_sitemaps.findRobotsSitemaps.html delete mode 100644 docs/functions/src_extract_content_sitemaps.isPlausibleSitemap.html delete mode 100644 docs/functions/src_extract_content_sitemaps.sitemapSearch.html delete mode 100644 docs/functions/src_extract_content_spider.focusedCrawler.html delete mode 100644 docs/functions/src_extract_content_utils.controlXmlOutput.html delete 
mode 100644 docs/functions/src_extract_content_utils.deleteElement.html delete mode 100644 docs/functions/src_extract_content_utils.mergeWithParent.html delete mode 100644 docs/functions/src_extract_content_utils.removeEmptyElements.html delete mode 100644 docs/functions/src_extract_content_utils.stripDoubleTags.html delete mode 100644 docs/functions/src_extract_content_utils.textfilter.html delete mode 100644 docs/functions/src_extract_content_utils.trim.html delete mode 100644 docs/functions/src_extract_content_utils.xmlToTxt.html delete mode 100644 docs/functions/src_extract_content_xml.buildJsonOutput.html delete mode 100644 docs/functions/src_extract_content_xml.buildTeiOutput.html delete mode 100644 docs/functions/src_extract_content_xml.checkTei.html delete mode 100644 docs/functions/src_extract_content_xml.deleteElement.html delete mode 100644 docs/functions/src_extract_content_xml.removeEmptyElements.html delete mode 100644 docs/functions/src_extractor_html_to_content_extract_content.extractMainContent.html create mode 100644 docs/functions/src_extractor_html_to_content_extract_content_extractor1_content.extractContentHTML.html create mode 100644 docs/functions/src_extractor_html_to_content_extract_content_extractor2_content.extractContentHTML2.html delete mode 100644 docs/functions/src_extractor_html_to_content_html_to_content.extractContent.html create mode 100644 docs/functions/src_extractor_html_to_content_html_to_content.extractContentAndCite.html delete mode 100644 docs/functions/src_extractor_html_to_content_readability2.extractContentHTML.html rename docs/functions/{src_extractor_url_to_content_pdf_to_content.extractPDF.html => src_extractor_url_to_content_pdf_to_content.convertPDFToHTML.html} (55%) create mode 100644 docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToHNSW.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToUMAP.html create mode 100644 
docs/functions/src_graph_embeddings_to_graph.convertTextToEmbeddingVector.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.cosineSimilarity.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.getAllEmbeddings.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.getEmbeddingPipeline.html create mode 100644 docs/functions/src_graph_embeddings_to_graph.searchWithQuery.html create mode 100644 docs/functions/src_graph_save_hnsw.convertEmbeddingsIndexToBase64.html delete mode 100644 docs/functions/src_similarity_similarity_concept.vectorizeTextAsConcept.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.baseline.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.basicCleaning.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.html2txt.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.loadHtml.html create mode 100644 docs/functions/src_trafilatura_extractor_baseline.trim.html rename docs/{variables/src_extract_content_xpaths.COMMENTS_XPATH.html => functions/src_trafilatura_extractor_core.bareExtraction.html} (75%) create mode 100644 docs/functions/src_trafilatura_extractor_core.extract.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.cleanQuery.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.cleanUrl.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.decodePunycode.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.normalizeFragment.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.normalizePart.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.normalizeUrl.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_clean.scrubUrl.html rename docs/functions/{src_extract_content_courlan_core.checkUrl.html => 
src_trafilatura_extractor_courlan_core.checkUrl.html} (58%) rename docs/functions/{src_extract_content_courlan_core.extractLinks.html => src_trafilatura_extractor_courlan_core.extractLinks.html} (67%) rename docs/functions/{src_extract_content_courlan_core.filterLinks.html => src_trafilatura_extractor_courlan_core.filterLinks.html} (60%) create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.basicFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.domainFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.extensionFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.isNavigationPage.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.isNotCrawlable.html rename docs/{variables/src_extract_content_xpaths.BODY_XPATH.html => functions/src_trafilatura_extractor_courlan_filters.isValidUrl.html} (75%) create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.langFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.pathFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.typeFilter.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_filters.validateUrl.html rename docs/functions/{src_extract_content_baseline.html2txt.html => src_trafilatura_extractor_courlan_meta.addCache.html} (77%) create mode 100644 docs/functions/src_trafilatura_extractor_courlan_meta.clearCaches.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_network.redirectionTest.html rename docs/functions/{src_extract_content_courlan_sampling.sampleUrls.html => src_trafilatura_extractor_courlan_sampling.sampleUrls.html} (57%) rename docs/functions/{src_extract_content_deduplication.duplicateTest.html => src_trafilatura_extractor_courlan_urlutils.extractDomain.html} (66%) create mode 100644 
docs/functions/src_trafilatura_extractor_courlan_urlutils.filterUrls.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.fixRelativeUrls.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.getBaseUrl.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.getHostAndPath.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.getHostinfo.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.getTldinfo.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.isExternal.html create mode 100644 docs/functions/src_trafilatura_extractor_courlan_urlutils.isKnownLink.html create mode 100644 docs/functions/src_trafilatura_extractor_deduplication.contentFingerprint.html create mode 100644 docs/functions/src_trafilatura_extractor_deduplication.duplicateTest.html create mode 100644 docs/functions/src_trafilatura_extractor_deduplication.generateBowHash.html create mode 100644 docs/functions/src_trafilatura_extractor_deduplication.isSimilarDomain.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.addToCompressedDict.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.bufferedDownloads.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.fetchResponse.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.fetchUrl.html create mode 100644 docs/functions/src_trafilatura_extractor_downloads.loadDownloadBuffer.html create mode 100644 docs/functions/src_trafilatura_extractor_external.compareExtraction.html create mode 100644 docs/functions/src_trafilatura_extractor_external.justextRescue.html create mode 100644 docs/functions/src_trafilatura_extractor_external.sanitizeTree.html rename docs/{variables/src_extract_content_courlan_settings.TARGET_LANGS.html => functions/src_trafilatura_extractor_feeds.findFeedUrls.html} (71%) rename 
docs/functions/{src_extract_content_htmlprocessing.processNode.html => src_trafilatura_extractor_feeds.tryHomepage.html} (78%) create mode 100644 docs/functions/src_trafilatura_extractor_html_special_chars.convertHTMLSpecialChars.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.buildHtmlOutput.html rename docs/functions/{src_extract_content_htmlprocessing.convertHeadings.html => src_trafilatura_extractor_htmlprocessing.collectLinkInfo.html} (78%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertDeletions.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertDetails.html rename docs/functions/{src_extract_content_courlan_urlutils.fixRelativeUrls.html => src_trafilatura_extractor_htmlprocessing.convertHeadings.html} (73%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertLineBreaks.html rename docs/{variables/src_extract_content_settings.FILENAME_LEN.html => functions/src_trafilatura_extractor_htmlprocessing.convertLists.html} (79%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertQuotes.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertTags.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.convertToHtml.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.deleteByLinkDensity.html rename docs/{variables/src_extract_content_settings.PARALLEL_CORES.html => functions/src_trafilatura_extractor_htmlprocessing.handleTextnode.html} (71%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.linkDensityTest.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.linkDensityTestTables.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.processNode.html create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.pruneHtml.html rename 
docs/functions/{src_extract_content_readability_lxml.extractContentHTML.html => src_trafilatura_extractor_htmlprocessing.pruneUnwantedNodes.html} (74%) create mode 100644 docs/functions/src_trafilatura_extractor_htmlprocessing.treeCleaning.html rename docs/functions/{src_extract_content_utils.buildTeiOutput.html => src_trafilatura_extractor_json_metadata.extractJson.html} (50%) create mode 100644 docs/functions/src_trafilatura_extractor_json_metadata.extractJsonParseError.html create mode 100644 docs/functions/src_trafilatura_extractor_json_metadata.normalizeAuthors.html create mode 100644 docs/functions/src_trafilatura_extractor_json_metadata.normalizeJson.html rename docs/functions/{src_extract_content_deduplication.generateBowHash.html => src_trafilatura_extractor_main_extractor.extractComments.html} (78%) create mode 100644 docs/functions/src_trafilatura_extractor_main_extractor.extractContent.html create mode 100644 docs/functions/src_trafilatura_extractor_meta.resetCaches.html create mode 100644 docs/functions/src_trafilatura_extractor_metadata.extractMetadata.html create mode 100644 docs/functions/src_trafilatura_extractor_readability_lxml.extractContentHTML.html rename docs/{variables/src_extract_content_settings.CUT_EMPTY_ELEMS.html => functions/src_trafilatura_extractor_settings.argsToExtractor.html} (75%) create mode 100644 docs/functions/src_trafilatura_extractor_settings.setDateParams.html create mode 100644 docs/functions/src_trafilatura_extractor_sitemaps.extractRobotsSitemaps.html create mode 100644 docs/functions/src_trafilatura_extractor_sitemaps.findRobotsSitemaps.html create mode 100644 docs/functions/src_trafilatura_extractor_sitemaps.isPlausibleSitemap.html rename docs/functions/{src_extract_content_settings.setDateParams.html => src_trafilatura_extractor_sitemaps.sitemapSearch.html} (61%) create mode 100644 docs/functions/src_trafilatura_extractor_spider.focusedCrawler.html rename docs/functions/{src_extract_content_utils.writeFullheader.html 
=> src_trafilatura_extractor_utils.buildJsonOutput.html} (77%) rename docs/functions/{src_extract_content_htmlprocessing.handleTextnode.html => src_trafilatura_extractor_utils.buildTeiOutput.html} (72%) create mode 100644 docs/functions/src_trafilatura_extractor_utils.buildXmlOutput.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.checkTei.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.controlXmlOutput.html rename docs/functions/{src_extract_content_json_metadata.normalizeJson.html => src_trafilatura_extractor_utils.deleteElement.html} (77%) rename docs/functions/{src_extract_content_xml.stripDoubleTags.html => src_trafilatura_extractor_utils.mergeWithParent.html} (76%) rename docs/functions/{src_extract_content_xml.controlXmlOutput.html => src_trafilatura_extractor_utils.normalizeUnicode.html} (77%) create mode 100644 docs/functions/src_trafilatura_extractor_utils.removeEmptyElements.html rename docs/functions/{src_extract_content_xml.mergeWithParent.html => src_trafilatura_extractor_utils.stripDoubleTags.html} (77%) create mode 100644 docs/functions/src_trafilatura_extractor_utils.textCharsTest.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.textfilter.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.trim.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.writeFullheader.html create mode 100644 docs/functions/src_trafilatura_extractor_utils.writeTeitree.html rename docs/functions/{src_extract_content_utils.checkTei.html => src_trafilatura_extractor_utils.xmlToTxt.html} (78%) create mode 100644 docs/functions/src_trafilatura_extractor_xml.buildJsonOutput.html rename docs/functions/{src_extract_content_utils.buildXmlOutput.html => src_trafilatura_extractor_xml.buildTeiOutput.html} (80%) create mode 100644 docs/functions/src_trafilatura_extractor_xml.buildXmlOutput.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.checkTei.html create 
mode 100644 docs/functions/src_trafilatura_extractor_xml.controlXmlOutput.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.deleteElement.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.mergeWithParent.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.removeEmptyElements.html create mode 100644 docs/functions/src_trafilatura_extractor_xml.stripDoubleTags.html create mode 100644 docs/interfaces/src_extractor_html_to_content_html_to_content.ExtractedContent.html create mode 100644 docs/interfaces/src_graph_embeddings_to_graph.PlotDataPoint.html delete mode 100644 docs/modules/src_extract_content_baseline.html delete mode 100644 docs/modules/src_extract_content_core.html delete mode 100644 docs/modules/src_extract_content_courlan-1.html delete mode 100644 docs/modules/src_extract_content_courlan.html delete mode 100644 docs/modules/src_extract_content_courlan_clean.html delete mode 100644 docs/modules/src_extract_content_courlan_filters.html delete mode 100644 docs/modules/src_extract_content_courlan_settings.html delete mode 100644 docs/modules/src_extract_content_courlan_urlutils.html delete mode 100644 docs/modules/src_extract_content_deduplication.html delete mode 100644 docs/modules/src_extract_content_htmldate-1.html delete mode 100644 docs/modules/src_extract_content_htmldate.export_.html delete mode 100644 docs/modules/src_extract_content_htmldate_core.html delete mode 100644 docs/modules/src_extract_content_htmldate_extractors.html delete mode 100644 docs/modules/src_extract_content_htmldate_meta.html delete mode 100644 docs/modules/src_extract_content_htmldate_settings.html delete mode 100644 docs/modules/src_extract_content_htmldate_utils.html delete mode 100644 docs/modules/src_extract_content_htmldate_validators.html delete mode 100644 docs/modules/src_extract_content_json_metadata.html delete mode 100644 docs/modules/src_extract_content_readability_lxml.html delete mode 100644 
docs/modules/src_extract_content_settings.html delete mode 100644 docs/modules/src_extract_content_sitemaps.html delete mode 100644 docs/modules/src_extract_content_spider.html delete mode 100644 docs/modules/src_extract_content_utils.html delete mode 100644 docs/modules/src_extract_content_xml.html delete mode 100644 docs/modules/src_extract_content_xpaths.html delete mode 100644 docs/modules/src_extractor_html_to_content_extract_content.html rename docs/modules/{src_extract_content_feeds.html => src_extractor_html_to_content_extract_content_extractor1_content.html} (51%) create mode 100644 docs/modules/src_extractor_html_to_content_extract_content_extractor2_content.html rename docs/{variables/src_extract_content_settings.LRU_SIZE.html => modules/src_extractor_html_to_content_extract_content_extractor2_content_utils.html} (56%) create mode 100644 docs/modules/src_graph_embeddings_to_graph.html rename docs/modules/{src_extract_content_meta.html => src_graph_save_hnsw.html} (53%) create mode 100644 docs/modules/src_trafilatura_extractor_baseline.html create mode 100644 docs/modules/src_trafilatura_extractor_core.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan-1.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_clean.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_core.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_filters.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_meta.html rename docs/modules/{src_extractor_html_to_content_readability.html => src_trafilatura_extractor_courlan_network.html} (89%) rename docs/modules/{src_extract_content_htmldate.html => src_trafilatura_extractor_courlan_sampling.html} (53%) create mode 100644 docs/modules/src_trafilatura_extractor_courlan_settings.html create mode 100644 docs/modules/src_trafilatura_extractor_courlan_urlstore.html create mode 
100644 docs/modules/src_trafilatura_extractor_courlan_urlutils.html create mode 100644 docs/modules/src_trafilatura_extractor_deduplication.html rename docs/modules/{src_extract_content_courlan_network.html => src_trafilatura_extractor_downloads.html} (73%) rename docs/modules/{src_extract_content_htmlprocessing.html => src_trafilatura_extractor_external.html} (55%) rename docs/modules/{src_extract_content_external.html => src_trafilatura_extractor_feeds.html} (51%) rename docs/modules/{src_extract_content_courlan_sampling.html => src_trafilatura_extractor_html_special_chars.html} (52%) create mode 100644 docs/modules/src_trafilatura_extractor_htmldate-1.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate.export_.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_core.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_extractors.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_meta.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_settings.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_utils.html create mode 100644 docs/modules/src_trafilatura_extractor_htmldate_validators.html create mode 100644 docs/modules/src_trafilatura_extractor_htmlprocessing.html create mode 100644 docs/modules/src_trafilatura_extractor_json_metadata.html create mode 100644 docs/modules/src_trafilatura_extractor_main_extractor.html rename docs/modules/{src_extract_content_metadata.html => src_trafilatura_extractor_meta.html} (53%) rename docs/modules/{src_extract_content_main_extractor.html => src_trafilatura_extractor_metadata.html} (88%) rename docs/modules/{src_extract_content_courlan_urlstore.html => src_trafilatura_extractor_readability.html} (52%) rename docs/modules/{src_extractor_html_to_content_readability2.html => src_trafilatura_extractor_readability_lxml.html} (92%) create mode 
100644 docs/modules/src_trafilatura_extractor_settings.html create mode 100644 docs/modules/src_trafilatura_extractor_sitemaps.html rename docs/modules/{src_extract_content_courlan_core.html => src_trafilatura_extractor_spider.html} (80%) create mode 100644 docs/modules/src_trafilatura_extractor_test_extract_test.html create mode 100644 docs/modules/src_trafilatura_extractor_utils.html rename docs/modules/{src_extract_content_downloads.html => src_trafilatura_extractor_xml.html} (72%) rename docs/modules/{src_extract_content_courlan_meta.html => src_trafilatura_extractor_xpaths.html} (61%) delete mode 100644 docs/variables/src_extract_content_courlan_settings.BLACKLIST.html delete mode 100644 docs/variables/src_extract_content_courlan_settings.LANG_PARAMS.html delete mode 100644 docs/variables/src_extract_content_htmldate.export_-1.html delete mode 100644 docs/variables/src_extract_content_settings.DEFAULT_CONFIG.html delete mode 100644 docs/variables/src_extract_content_settings.MANUALLY_CLEANED.html delete mode 100644 docs/variables/src_extract_content_settings.MAX_FILES_PER_DIRECTORY.html delete mode 100644 docs/variables/src_extract_content_settings.MAX_LINKS.html delete mode 100644 docs/variables/src_extract_content_settings.MAX_SITEMAPS_SEEN.html delete mode 100644 docs/variables/src_extract_content_settings.SUPPORTED_FORMATS.html delete mode 100644 docs/variables/src_extract_content_settings.TAG_CATALOG.html delete mode 100644 docs/variables/src_extract_content_spider.URL_STORE.html delete mode 100644 docs/variables/src_extract_content_xpaths.AUTHOR_DISCARD_XPATHS.html delete mode 100644 docs/variables/src_extract_content_xpaths.AUTHOR_XPATHS.html delete mode 100644 docs/variables/src_extract_content_xpaths.CATEGORIES_XPATHS.html delete mode 100644 docs/variables/src_extract_content_xpaths.OVERALL_DISCARD_XPATH.html delete mode 100644 docs/variables/src_extract_content_xpaths.TAGS_XPATHS.html delete mode 100644 
docs/variables/src_extract_content_xpaths.TEASER_DISCARD_XPATH.html delete mode 100644 docs/variables/src_extract_content_xpaths.TITLE_XPATHS.html create mode 100644 docs/variables/src_trafilatura_extractor_courlan_settings.ALLOWED_PARAMS.html create mode 100644 docs/variables/src_trafilatura_extractor_courlan_settings.BLACKLIST.html create mode 100644 docs/variables/src_trafilatura_extractor_courlan_settings.LANG_PARAMS.html create mode 100644 docs/variables/src_trafilatura_extractor_courlan_settings.TARGET_LANGS.html create mode 100644 docs/variables/src_trafilatura_extractor_htmldate.export_-1.html rename docs/{functions/src_extract_content_htmlprocessing.convertLineBreaks.html => variables/src_trafilatura_extractor_settings.BASIC_CLEAN_XPATH.html} (79%) rename docs/{functions/src_extract_content_utils.buildJsonOutput.html => variables/src_trafilatura_extractor_settings.CUT_EMPTY_ELEMS.html} (76%) create mode 100644 docs/variables/src_trafilatura_extractor_settings.DEFAULT_CONFIG.html rename docs/{functions/src_extract_content_utils.writeTeitree.html => variables/src_trafilatura_extractor_settings.FILENAME_LEN.html} (79%) rename docs/variables/{src_extract_content_settings.JUSTEXT_LANGUAGES.html => src_trafilatura_extractor_settings.JUSTEXT_LANGUAGES.html} (51%) rename docs/variables/{src_extract_content_settings.MANUALLY_STRIPPED.html => src_trafilatura_extractor_settings.LRU_SIZE.html} (53%) rename docs/variables/{src_extract_content_xpaths.DISCARD_IMAGE_ELEMENTS.html => src_trafilatura_extractor_settings.MANUALLY_CLEANED.html} (53%) create mode 100644 docs/variables/src_trafilatura_extractor_settings.MANUALLY_STRIPPED.html rename docs/variables/{src_extract_content_xpaths.COMMENTS_DISCARD_XPATH.html => src_trafilatura_extractor_settings.MAX_FILES_PER_DIRECTORY.html} (53%) rename docs/variables/{src_extract_content_settings.SUPPORTED_FMT_CLI.html => src_trafilatura_extractor_settings.MAX_LINKS.html} (53%) rename 
docs/variables/{src_extract_content_settings.BASIC_CLEAN_XPATH.html => src_trafilatura_extractor_settings.MAX_SITEMAPS_SEEN.html} (88%) rename docs/{functions/src_extract_content_xml.buildXmlOutput.html => variables/src_trafilatura_extractor_settings.PARALLEL_CORES.html} (80%) rename docs/variables/{src_extract_content_xpaths.PRECISION_DISCARD_XPATH.html => src_trafilatura_extractor_settings.SUPPORTED_FMT_CLI.html} (53%) create mode 100644 docs/variables/src_trafilatura_extractor_settings.SUPPORTED_FORMATS.html create mode 100644 docs/variables/src_trafilatura_extractor_settings.TAG_CATALOG.html rename docs/variables/{src_extract_content_xpaths.REMOVE_COMMENTS_XPATH.html => src_trafilatura_extractor_spider.URL_STORE.html} (53%) create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.AUTHOR_DISCARD_XPATHS.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.AUTHOR_XPATHS.html rename docs/variables/{src_extract_content_courlan_settings.ALLOWED_PARAMS.html => src_trafilatura_extractor_xpaths.BODY_XPATH.html} (53%) create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.CATEGORIES_XPATHS.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.COMMENTS_DISCARD_XPATH.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.COMMENTS_XPATH.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.DISCARD_IMAGE_ELEMENTS.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.OVERALL_DISCARD_XPATH.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.PRECISION_DISCARD_XPATH.html create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.REMOVE_COMMENTS_XPATH.html rename docs/{functions/src_extract_content_htmlprocessing.convertTags.html => variables/src_trafilatura_extractor_xpaths.TAGS_XPATHS.html} (74%) create mode 100644 docs/variables/src_trafilatura_extractor_xpaths.TEASER_DISCARD_XPATH.html create mode 100644 
docs/variables/src_trafilatura_extractor_xpaths.TITLE_XPATHS.html create mode 100644 pnpm-lock.yaml delete mode 100644 src/extract-content/python/baseline.py delete mode 100644 src/extract-content/python/clean.py delete mode 100644 src/extract-content/python/core.py delete mode 100644 src/extract-content/python/courlan-core.py delete mode 100644 src/extract-content/python/deduplication.py delete mode 100644 src/extract-content/python/downloads.py delete mode 100644 src/extract-content/python/external.py delete mode 100644 src/extract-content/python/extractors.py delete mode 100644 src/extract-content/python/feeds.py delete mode 100644 src/extract-content/python/filters.py delete mode 100644 src/extract-content/python/htmldate-core.py delete mode 100644 src/extract-content/python/htmlprocessing.py delete mode 100644 src/extract-content/python/json_metadata.py delete mode 100644 src/extract-content/python/main_extractor.py delete mode 100644 src/extract-content/python/meta (1).py delete mode 100644 src/extract-content/python/meta (2).py delete mode 100644 src/extract-content/python/meta.py delete mode 100644 src/extract-content/python/metadata.py delete mode 100644 src/extract-content/python/network.py delete mode 100644 src/extract-content/python/readability_lxml.py delete mode 100644 src/extract-content/python/sampling.py delete mode 100644 src/extract-content/python/settings (1).py delete mode 100644 src/extract-content/python/settings (2).py delete mode 100644 src/extract-content/python/settings.py delete mode 100644 src/extract-content/python/sitemaps.py delete mode 100644 src/extract-content/python/spider.py delete mode 100644 src/extract-content/python/urlstore.py delete mode 100644 src/extract-content/python/urlutils.py delete mode 100644 src/extract-content/python/utils (1).py delete mode 100644 src/extract-content/python/utils.py delete mode 100644 src/extract-content/python/validators.py delete mode 100644 src/extract-content/python/xml.py delete mode 
100644 src/extract-content/python/xpaths.py delete mode 100644 src/extractor/html-to-content/extract-content.js rename src/extractor/html-to-content/{specific-domain-extractors.json => extract-content/extract-selectors-per-domain.json} (100%) create mode 100644 src/extractor/html-to-content/extract-content/extractor1-content.js create mode 100644 src/extractor/html-to-content/extract-content/extractor2-content-utils.js create mode 100644 src/extractor/html-to-content/extract-content/extractor2-content.js delete mode 100644 src/extractor/html-to-content/readability2.js create mode 100644 src/graph/embeddings-to-graph.js create mode 100644 src/graph/save-hnsw.js create mode 100644 src/trafilatura-extractor/.gitignore rename src/{extract-content => trafilatura-extractor}/baseline.js (100%) rename src/{extract-content => trafilatura-extractor}/core.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/clean.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/core.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/filters.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/index.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/meta.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/network.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/sampling.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/settings.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/urlstore.js (100%) rename src/{extract-content => trafilatura-extractor}/courlan/urlutils.js (100%) rename src/{extract-content => trafilatura-extractor}/deduplication.js (100%) rename src/{extract-content => trafilatura-extractor}/downloads.js (100%) rename src/{extract-content => trafilatura-extractor}/external.js (100%) rename src/{extract-content => trafilatura-extractor}/feeds.js (100%) create mode 100644 
src/trafilatura-extractor/html-special-chars.js rename src/{extract-content => trafilatura-extractor}/htmldate/core.js (98%) rename src/{extract-content => trafilatura-extractor}/htmldate/extractors.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/index.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/meta.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/settings.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/utils.js (100%) rename src/{extract-content => trafilatura-extractor}/htmldate/validators.js (100%) rename src/{extract-content => trafilatura-extractor}/htmlprocessing.js (100%) create mode 100644 src/trafilatura-extractor/jsconfig.json rename src/{extract-content => trafilatura-extractor}/json_metadata.js (100%) rename src/{extract-content => trafilatura-extractor}/main_extractor.js (100%) rename src/{extract-content => trafilatura-extractor}/meta.js (100%) rename src/{extract-content => trafilatura-extractor}/metadata.js (100%) create mode 100644 src/trafilatura-extractor/package.json rename src/{extractor/html-to-content => trafilatura-extractor}/readability.js (98%) rename src/{extract-content => trafilatura-extractor}/readability_lxml.js (100%) rename src/{extract-content => trafilatura-extractor}/settings.js (99%) rename src/{extract-content => trafilatura-extractor}/sitemaps.js (100%) rename src/{extract-content => trafilatura-extractor}/spider.js (100%) create mode 100644 src/trafilatura-extractor/test/extract.test.js rename src/{extract-content => trafilatura-extractor}/utils.js (100%) rename src/{extract-content => trafilatura-extractor}/xml.js (100%) rename src/{extract-content => trafilatura-extractor}/xpaths.js (100%) create mode 100644 test/embedding.test.js create mode 100644 test/extractor-test/add-score.test.js create mode 100644 test/extractor-test/add-to-parent.test.js create mode 100644 test/extractor-test/extractor.test.js create mode 100644 
test/extractor-test/find-top-candidate.test.js create mode 100644 test/extractor-test/get-or-init-score.test.js create mode 100644 test/extractor-test/get-score.test.js create mode 100644 test/extractor-test/get-weight.test.js create mode 100644 test/extractor-test/score-commas.test.js create mode 100644 test/extractor-test/score-content.test.js create mode 100644 test/extractor-test/score-length.test.js create mode 100644 test/extractor-test/score-node.test.js create mode 100644 test/extractor-test/score-parag.test.js create mode 100644 test/extractor-test/set-score.test.js delete mode 100644 test/foo.dat create mode 100644 test/graph.test.js delete mode 100644 test/hnsw.js delete mode 100644 test/vectors.dat create mode 100644 web-app/src/components/Home/Graph.svelte diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index eb709a4..284a97a 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -38,6 +38,18 @@ jobs: with: # Upload entire repository path: './docs/' - - name: Deploy to GitHub Pages + - name: Deploy Docs to GitHub Pages id: deployment uses: actions/deploy-pages@v4 + + test: + name: Run Unit Test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 # checkout the repo + - name: Download dependencies # execute tests in tests/ + run: | + npm install + - name: Execute tests # execute tests in tests/ + run: | + npm run test \ No newline at end of file diff --git a/docs/assets/navigation.js b/docs/assets/navigation.js index 576cf71..6eb2050 100644 --- a/docs/assets/navigation.js +++ b/docs/assets/navigation.js @@ -1 +1 @@ -window.navigationData = 
"data:application/octet-stream;base64,H4sIAAAAAAAAA72dXXPjNpaG/4v7NtpsMtnUbKr2gpbotqZlW5Ho7vRMTbEgEraQJgkFBG0rW/vftwB+iACBQ4CycpPYbrzvwwOCIIADkv/63yuO3/jVL1ekSPHb1XdXB8T3V79c5TStMlx+L//8H3ueZ1ffXX0jRXr1y4/fXSV7kqUMF1e//KszSFCWVBnieEtykiFG+PH6ON8jhhKOGeT8YVRag3/64b9/+tt//vR/352YtHjBjN9Gd6vtAScEZUJVwjCrxkbBb1wcCeh6KjPisl7cuBjVxUa8tmH4KXpYL+cujv3CI75facWrHY7EHx2c1eI27xzxZP/rY7ANNqCpWs7mViYMHfDjZgV69UtZnTBiyX4bbcLgDjZTC8J+X/DOwawuNeJEvpEDTgly8euVtboeMsK3uOC4SDB8qQyKWj2r52dc8nv8xr9Qls5pfsgwJ7QY8YdkNhan33BB/sQRPZAE9h8UtXm+4IRTJkriNx6Uc1ok+AC3favExnjF5Hm/wRl+QUWCm/KfpQsIgnUTaMF6ORVYS92YEWb5DcN/VLhIjh5AXQfS5JldEEaSfYb5gpSckV0lmtA4EdSq1H/32zpLrsx3QFRxmtRtGH/f/2V4LCVL4n4J5Ren263LJfdUFYn80wjP7qUeys9qTSR7RnM8w28cFyWhxfcoy+jrLKGmO7A4gloQd4JYCmIhcIo5pyl5Os6lS7BeBkI9f9hsbRFDPJsXGHGKOCoxn5H8QBm3NANRrSTDMy7a1yynKc7M1dG4xbVb3OhiqYulzm3gVetkc75TaWp1OPAGXoPq0C5ALm45otHIsdQTSQg/nsG3esINkeYHhsty9ntJi9nvf/78k3OFC1ksZLGQOde3kF2jEv/80z/JQR0njcasIY12cCskEoDY8dQQx6M9qZq/OAVbF110Wsc4hzDdCIxwX+WomBUox6VPiFIWS5l/jLdCfC+0jjEaaLoTGGSOORKWs5IjbukxNWQriaVEC65vTcrygLMMpzN+PFCvSjxpY6n1r8m7ziESBo7VaeMaPcGK/aOiDPmELAX+cc5pntPi1woz4txoFJTBBgyMY5bPntqxkU+EQhl3Sv9QLYM5MFQz0+AHxvwqpjToGc844Zlfd9BK41rqH7W4D63RM46k3jFsC9XkaAj838PZ+CyhYgZmG27sUIkzUlgGmY1H3HjEbWmnWhhaq5Fbzc0UZfSwQyVJ5hlGBSmepwBOaoAi/uVHbr89g9UjhIB3RlF6W/+rn3crBLw5I7m3rxCNjJOYYzMRJZ2ayIImVV43zsY2yVBZQratRPX/4ce/9440rIX9qfGob6eBjHeI4aagMkmEq1j6q1Lg5A3WCx3Mmz+OnL6KZcgwszW7ysJOJ/GRZVvu0TZOzh96UttEXV6rNyQzrgKPAlS1dSF4j5Nvj8wy0QIBPanVXXQ0E91PUpt7SnNEiskVpMmBpd16EjwZNHQYWUZeyCObRlL0I5wVKb5Zxs8umFZuozzJcKdCVDXMeGTZdEQjthPeNjhDnLzgyRjdwcZ6xlzMZqddMYoYINzSkgdFuhb2kyiqwQiJFE90OqZRA4woS6cjTmIbgZThG8esQFPOiCK2Ez4V9LUQDX0Soq+2M+7RC3lG4l4qRtGTQAMLgEb5nKHXDO2yiSzVwE76jDKSTrteFLGNkKHieXLHr4hthIKyHGXkz4lXvSa3UYTt5DgUsTWVhcTi98QuUhEDqctqN62WelJreux4wJNrSBFbk2WiuSE+8USranDE5eUeS4nbAq4oKdZbrCsZMOOkByYBwzGjN+KRQVPEFCc0xeuqOIr/T8OoHgCsuzxvGHpW53s+vIGNC3KN2Lk4YeGCmny++g4AaHjp+0Ba9bvN8Rtz17n+cJLlePQC0IrH583aSNuD0TcAOMYBvQempwdPRV3O7z4SNyLX9bnhzNotkBbTs4D6GuMU1Y/U94AbgX
me6kfTbACgfVDpR9R9YKRxZOkN7LmAuOHw0hfVOkDroYYxph/m5ABgTENAP8zJAVqBNYyj/DAnBwBjHEz5cXoWo/lGv75IKJw6IpSmc5TsvZuyBLTikcEUk4W8e2yJ6OnBKiowf6XMbwIbNyKnimI4JQzLw45w6T2saVGaDRiTnJMoORaXoFqV214kw5zJLaAOc7KAg8Gck+LZ857aqpyCCVarhy/hIl4Hm+Cut7XoBTEiOlpHlGqjkv/Wz9Vfr4L5p9VyG01GdQ4AZRXcfzw3pJ4HQIqCzccwikXh6ai+yYDVbxAVy0rv1EXcqiZmR+BUlIZo5cOslBZHxYnnokPcqpzisKzKu12oHUlxGR1kT+kTOtTJAuRY1rW9YYoPQDStbnvCThYwx7jG7c/q2Yzz1GXoaTDhAZMG693+oMYCHPgOV709OScLkGNY+/YGdR4j20KmpIFnP1wuEdx4XzIV3CIumAxuEZdKB3f+F0wIt4y/ICXcov6KpHCPddm0sAa6VGK4xVw0NaxCLpgcPoEunB5uQZdOEPc4l00Ra6DLJIl7kIuliVvGRRPFJ8jFU8Un1F+RLO7RLpouPnEulzBuGZdNGbeUyyaNW8ol08Yt43KJ445wsdRxS7ho8riFQOlj5RkZnFaHjCTI/CygiaRInIbLq82jtuwJrQyo/q0W2qy6Jfkelfsp9o0Ucm+EN6R4xuzAiHtSV0UNfaBMUiPEPsufKk+xAGeZBWaI42v6eqvUog9MMwEngc3bC/wWWFScZgKuhab0tRBbyR17nK64U9Pe4PJAi9K1aXferQ5qeChNIzpvHnvDqXgmzLm2OpDBBHraoHp6wgyni2GduQIHFtCiFObJfliFrihFPobxWIfSCCPpPIq6WK9l7P6YoQfYorHX4BMbF2aAhzYnbf3vKAMHoO5+r0rx0waXSeV8/juSooZ2k6CCcPEGAob9IX0xvIsBY9cuRhZ1Oh1PpEhvME59lmxr974SSt+y4y3N8cEjoV/794RgtYh/EwMRt5ppSzsu1ItHxf7H07lWxSpBXRbRbOGcjG6sLyEqSR/3XUetrce2o/rCixkW/W+RTAZ9MDnZF5FQTgoco92OxTjDYrea42VgINvcxujS5t3outsYfY9R6joZALCdzRhP/nourTEZZZH8HWtWd7MvtBVp7N9t9Ih9h9GJdSxfpjSZNfSxTk3l64fig/NKi4GmeoySuLiRnQ9rbey8DAsrVKTkrBNnMLJPXxs/65tUrLyT0KlvvV3GNw+buyCKlvcfz0N90L1sNXoX/BZHwfUqjL8sF9HtmdShm5UbRkEcRNFmef0YhdtzuQM3G/c+3Ioaib/cLqNQ3egxiWzys7O/rJb3YRyuwrtzI9a9bMztOpwvg9X7NSyjoY0ehct4sfwcb5fXq6WyBWUS22AHkTfh3cPnMI6CpeHFdN5g1Q3ifg5Wy4Vsju8RsOo2zo2Cd6lnxcxG/bKMbt/vKh66WZO+VclpHh8QK33vATpUs7KmgUmZIJbGVfGKCo7TM6kGOygxLCaJ9YBRHqfvKFCnWyxHUrhie8SU4ZKBrttZkyspoeWxSBgqCS7jepxwJt7maT2G/PmdwH0jG02+iupdcKoTkI4RbeF9kAMzG5XhZ/z2Lpev6mRNprBjM996O5x78ehe9nGj+x7vDua8yZvhEnN9E7Yf4oPqYY/Db59vB/La6DsP5rdhvF3+MzyH8kGxsbWG+SoM7sWgbcL4T8NpTtAo+2a5eocAdSeIuH7YipFSGM+D+8VyEfjfoYdss6f1KJb3sShzHvZkYm+jHvt1O4z7bl3DG3A8CR/6FuBetnivvMbIl6N4WEcz8lFT8dCKb//bI6kmIyg2yL9MxPWM7EguJve4SGjq/KyFGakbWZeSRApHjGimw/oW9s0mcVrtCK3KM1vJ0AhAvjJaPMfp4OVSE6ADKzu24k9/P4fV6K17aihKz6zEvoV9pHNAhMVPqMr48Uye0cs+5CH5lDXcHq
9vYe95m00g/qtjJ6Fb5kHsBa5HXqQ4VL5xabQPBjtwC3PrjNPJuQ/zIZh9rcfSZE1eUFZ5D/0GB6Cb2anyMw1TZn9DpmIFb7SNj3k6ebVXJ1ssgS2ecY7e3iNmzQokkuLdiCcroHOXyvdA6l6jzCeR0Dj3Mh66WWe6GapKsstwfMSINQ/snkm3eeq95fm5asdnRiZlq0877WVuGeztRcEDowkuS6+h1UnjFMeuIpl8LedDxZW+Ht41oJE0F2CHQkIzkRAS+5iXHk9CaTzNBeTJXnCBRz4TMApUbVyIHBH3bR42njQZp91ilKorB1Nwrcs4b0UKfM0wcn+XiJl48nFhlvxsXKm/QNxE+rWi3P2ZezOq9hhnRejcsyYcHDjU5629FhIdeYFvKq4PfH0UF+ZCPCFl/yzCCNHgBHD3qEgz+U2awuOtUHp3qZhAewRPx+Szn1fDaS7uvEjuXnofau0FsBvp/fRq7TmAnKrAZ7TQTj/GeGwSQeJ4plbi0Ajck4ex7xu/NV7fAtyeJ1MR7ecS3MYMisRx6CMd/lG6bydVIT0DoN56pdYi6xAyRp0341qBJysA3e0ACiq+V+bbPlTdxQU4vVIVC7CRiC32p2SLWytRNT7NRHznQd1sBkelkTSX8fYyr33Oxcl/havROel0Xq5p5PiFd083esjuPcOUTuFuQBg//P4lemdkamEwjFK0Ixnhxzh7c13o01V+jVh6iK9nukY2wA2twCD9coFeKUC/rxd01hf4gkHn7fQVg+tgu5zHMhUY/7YO+pvo4E3dp3cG6Q7A5u75YxSHd+voq76NzJGl6QHSIrwJHldRPH+4v1l+9AapcoAj0pn3wV0Yr8J7b0pfDDD+8biNwt/qdxg9Bh9D/3obOAC01eZRy/Q6Qloh4H0X3D8Gq9XXurmEC2+GbuDC2kab5Xp9Dqx1AGl1ansbr8NNvFhuwnn0sPk6gWn0GSGvlvef/FtFpxxx3y6j8C5Yb+NtOKGRDxwAmngl2GoVruL5w2ZCM1flAGf7uF4/bKJwEd/cRfF8tfRGDRzcaHLfqX9gAweAFgUf43kQBasH/26vpwUIiD2XETXcluC7d0fR9NBzaZgvEBcvXUa58/Ct4yhqeGBAOM7RwXVg0JT2Ge1s6I7ycjvgjITSkowuQMWJZzvORA4tAB4p1202oSntzRtaQA2jLrHVNv45ohQ13DAOxPm5pbqs24Yx8W4O2S6x+l5ncGBX+2taaHD3uFnF2+hh434vrxGdDugEnmhSlTiVR+P+QG8DUMXgGfDYG+W+JUrmVsSk3i9DUwM0MdBKZckIk8mUTjsG+c032dSD/OaSYhLZ/ggTP/tWBa+kc0azicevq0eX0cP6wTo/iiIFEDlmz/gL4fs1Yt4QTQxgGM7pCw7zAz+Gg6cOXVAGA6ir5YwcFrTaZdgnr1KjNDG0pIvfuJ7WdiGcdOB6sftHEBtb0xcQT4avjHB8U2WZ/mCri7cmHsNEmHCP5/J7jEYJAN7yLKKR+1cta/NWBXbczotIrutGEzttYX/pLrtjXK7D7hCX6K6F+QU7a2l/4a5aMC7cUQvEX9JNC9Bf0kkL0FgXrVzTwtVxNFaXdbqyg8fo9kGsdmznwWZRryI6T5IbjtEDGL825c9hjTKuHxZf/VZVG8BJCC2mBlH48WGzDLfTohjoIdbD3V14H23V+vUFGk1cqGfRxijt0Szvgo/1M8lC5kkzmwDUh8+hWKw6q0KNHtDy2iacL7fLh/uzqBYXgNs8fXzW6TR6wMtgE6+LnhLyD4NtuDmrIk0WEHEZrcKJIfWkA4LxU/FyWc/UVwv1jNNZQuQuUGDJa4Zkjh68W1Am92XEnMbCsDv+WuqzwBZoMOONbwSnWIE3wjbEphb8AxS/eGVLFZBXcBLVs3EKbHSTr43m8W4qKVigyYE175TqbJwCK2nFRh5+sPFqqU9wWw3mFV6DU6ycQuSEjzwaZ0NKpU+AkYryiq+G9Y3A6PZVjopZgX
JczhhO6HNB/vSLUjrE0iHuHNwGqLJHeNj9jvsvmCQFx+wJJVOofUftEP7r5+E5vUc5TsOCj2/+dMMPbZ02ufQ6fvdqb8XtH3y7vRtGc8c9MKNoiy28As3kDS8d/9yAjhcvieA0Tg1vYLW+oU5uCn7crCLq9PZXGDm0c4n0Cb2QZORdxxZuI/U5xTc6zSfOlqd6GYLUHwORLbnb1gaNYk6l3GqjGXlpIzGvDV+IFG5b7hy4Q0+4lxV1Ux5wQlA2S/Zo5MFIwwHI3xuHWDr4tHyxh2tbi+cq3bUCDHyL93hFcDqTH4qZ7ZVd1B4VwWksHeRvThWB0nTxcHfTRjuxAlSu5unw7Et0t4rotXBw2KDndhBma6eTMPEa1H6fsDtxeuDa9edy7fW2N/pG2pM6RbkxoQzJ5xHUxobVP0rXE/14Rmw/XmaD6Rh0aDhyh2lupPAN5pA+ebTt5obXHt8hfZrYsteLm/HqgGknJ3BLyCPL3gHW+sD7NBKGDng29g6LIawWir861WFd/HHj0KLsoM7EZTA2rXmov7rNcRgnSX8yZ53ewKzGx2FS412HGqkpB9bikVa82uEZzncj770b4hptLLVOdfiVVlG1w+sMHftJYdeaVIGKGVidovjXWqujHWtWJQ8NneqY05n8y7Ra5lS+rsOnK2sOMcKjCexx7NDS2MerD/3UrzIGXjU+yzBXd3j160QaxO2LLJqibqNllCVVhjhuPvJB+PH6KAa14mUc1tNv5sFeI4898WQ/+6NCJbKs89bI+r91Oaf4pODXx2AbbOBgFOeeCr5b0CeeozfoiJsifidjzeiuGTRsdYTp4FsIYADG8YrJ837GcIZfUJHg2RPDf1S4SCzDxxoqRXEnijuRZ6x7hkosp1NPJAHWpcaodkdoY4yw27RuEWb5zTB2v+MALEdGe/UbI2eveGfpDJoCJWcY5eZz07xF/BXv2h/r0m6jEqnYRpswuLOFbwX0xSNPT/XCdArhFe88jv9L33js4IV1J3M6bPKNHHBKLA/qGQitwCeEIWQ0kA6jWYw1uq6z/v70oxgtJvhguQWfyvV/bCROUb5gcSsV33rBbzwo5zpNCxbkmb2cr/mm/GcMPYIAHwFgOO0wgvXy3Y8kWC/B5s3pN9xkQsztUgx3Etvbtlp13JVza+2HjPDt0FqN2WCuCuHrltPDK2W2LwadzNtyTkdOyi+UpeLRZ1osnwvK+rMC2+F3BIMaPjn4jctJ3L6yftq3w4jCcoQqC7ufB3H9bHGOCk4ScM3URjK7OLW6GacHkoxGdvpBlHYKLRIa6xTKai1l0GSpFUTagduqSvNX5fDojLJUnHxGKZ+VHFvu/B1BlBdnRZSPRXmfdXPRLiO6oZRvFZIlrCHL6DNyD5J1MkuJ2EK4qwTm++GfbFEfSBL3yxn+5FQBsu+Up2NBGEn2GeYLI16vCpcDGDEfuUaaFmY66uJZfaRtUDdlXBfxmQ/f06oI02f8EXpcTnU3KeFFalR8m3Ud+uwbPh7kuB2ORahOt4H4pHIKT6i7m8YcF5yhLKKfWpORSO3sEd+RYSX+Vjd2xyroBL7RtxtAwvBT9LBezkfCNXJ0D8vaxr//H6myZCnGtQAA" \ No newline at end of file +window.navigationData = 
"data:application/octet-stream;base64,H4sIAAAAAAAAA72db3PjNpKHv4vnbbRzM5tL7aVqX9ASPdZGthWJnsnsVooFk7DEDEUoIDi2c3Xf/Qr8DxBoACTtSlViO+jfgwZBAEQ3yP/87wXDz+zi54ski/HzxQ8XZ8SOFz9fnEhcpDh/X/75b0d2Si9+uPiWZPHFzx9/uIiOSRpTnF38/J9WIEJpVKSI4X1ySlJEE/Zy+bI8Iooihimk/M5oWoF//PA/P/79v378vx86Jsm+Y8r80wOO4yQ75GsuGpBLlOOffoSZJlNrZkCub/df3GCtjQPl/sbbulJqGwPlOrjZ7M84SlDKWz23oShsDJTt6iog3MxGvl/YoBvgZxaQ1uvPOGLE0N9gQx0PPzPeI0HproxBZe/7vwR32/XSRq5f2KD7lRSseMDcNxtlsbhO+4CZl6ZdxwKVFYUB3bbcNjnjNMmwSVtloNM/IRYdf7339t4OlBXL6dTyiKIzvt/BHbhfSquEEY2O+2DnezewmFgQ1vuCHyzEqlIGpeRbcsZxgmz0emVNquz4a4Hpi5VqV1arek4TtscZw1mE4T45KKrVLA4HnLNb/My+EBovyemcYpaQzKAPmelYjHzDWfIXDsg5iWD9QVGd5hNODscdTvF3lEV4SbIIn5nFkAjbjaB52/VYYGVqxwwwPV1R/GeBswjuV7AdSCtbfZXQJDqmmK2SnNHkoeCX10wEbUXq7/1+SKML9ToLFYxEVf/C7/u/DOuS0yjslxB+sVrU2dwOj0UWlX8y8PRaYlV+ElsiOlJywgv8zHCWJyR7j9KUPC0iolqn8BpUBmFrEJYGITew8vlE4uTxZVmqeNu1x62Xd7u9zmOIp9MCPY4RQzlmi+R0JpRpugFv1iTFC8b71+JEYpyqm6NWCyu1sLYLS7uwtLNb3ld2ZXe+EWlic1jwBlqD5pBuQMYnGd5pyhXnYxIl7GUCX6sJd0RyOlOc54s/cpIt/vhL9YChqQA3C7lZyM2s25ubVc8j/07O4orO6LOEVMrBvTApAYi+dB3R7G1nVf/Fytmq6Kq1tfRzCJOFQA+PxQlliwydcO7iYmkWlmbuPl5z41tua+mjgiYrgU6eMENccpEzxDQjpoRsTMLSRHKuL53k+RmnKY4X7OVMnBqxsw1LW/eWvGkVAi5g2Zw6rlITbNg/C0KRi8ulgbufS3I6kYyvhRPrTiOgFDKgYwzT0+KxWRu5eMgtw9bS3VXNYg50Vc1U6IE+P/GHGHTAC5aw1G04aEzDytTdaz4PbdEBB6W9pdsaqkpR4fjvw32Dcp2vqiS3XjCyiJJyiakqUkssUMGOqucMXv8WE3LBkJGQCzZ/DitTq1arTTwJJjaXHU6QAjtI42LdCu4O8l9c3FsKICfnSlRPxsqxGI10jBu6OLZCox0rUT0ZK8dyUtBonGuVqYtzewnm5F6NE6SsXCzHgFEelpYuDgYiysm/CtYXsl6eURyRA98AcfKyv3ZqFay8rUaEu4c/cH+/N8kYpo8oGkPtK0pV+O+fhteUL+xiP2PAg44Tfihrt2rsBn77Zm/Xj/UfXIe9K0pON7WGo/MDtEYWdL6g5YQXkxNKFDs8AL6g5c+VpeVjXhmKuN9tArKSgFYei8ihnI2nj+h7Eqk2s8zc2tTlEl/JNBc/G56oZVjitEsYwjeAdTsq7RRvU4rQD72Cds1WFe+mafF3Qj80f3K6YSoTMa5mbFH7qgwxNlMSoR9nb56PE5vn44zt8xFon4+ODbQoWJJqnsynVC0sdeXH98F9kVdB3EWkjvzCtSh/rxXCUsFl1NPHnW0vkIKv0YaXGvUA8YDyJFrUJd0bgpGwVCh/s2oIFMeru5urxtuRDSByJU1gb7XXVGXiQxKNG0YUlVBLW12EkeOG9LtV8/uVGo6XMtO81NNgZUmLtV5d0stix4c+TRWUsoYpsl4JwHPfOX50uED1jN3U8hw/ul4efeKIrmlgpq
wH3BpJfk/T7epqKrLRAXt+lTKwKKhx2JFglSH/q100b5jAYOlVD9SK2Kwsx3UV8Ve7BzbKkqj/ZKq9gWFWrWNx1zq3oUSqy4Gt+FJl5ywwz3lxbMTaNixtrdrwKymC4gFvU/TST9izbUkRKIiBzcmL12lIMtqyZUXyUNCqjRlZMGXSlBWakZANImrw0lSZqeXocYMdSirH+77fB4rOR91DTpu8xZulKalol/J/hV1xXqHyb1YNsU0J38dDW5IAc6+eIdhDfcyYsyk2u56oETKvsfSJnBPQXMiMNuRHOvKVamAl8iTrJfaOAYsKAEyfp2gLkxVgGJC86AAcqABQbSadLU8SgNcl6DteHLP8Cbr5eaGQF3JZ0hkztlXudCRYxbDULZM9gawaRPEixYxh3eNwKRDWRcO66Kw58qL3ah6sBe/ucsHFnwXKkSZaVyGrf1fl7FKnVPm2KmcE5Z4V3B3JIzshxSGFTrcu4nYxtpQ8oIckTdjLXkaoKt9AAAE42MzTjxa0SUXsYu2QZ6VR2Bp1IW9HX48U5dgijcpE1SuakrlMuZtu9QAkDeNANRIunsp8aOXjUlUgZxSjk/raVEXCJ/zQ/FiVtnscU2aBi+5rAX1j+K7pu2nlwhN+cKi/kHVuqjyXbs2sqq3PRNcQGgMXF4YQoyMtRpIwdbp2sH7f/cgfk3n6s8bHtlz/x9rEykurpHDJYxAKCFrf/fps8Zlq4m3XcJ5TnVOvvfu1Bwt4tRrrsC1n1+E0ZxZEnxXioiF86zByfiI0Nta8KWeXrJTXWdsnkq0PGaH9HQld9VuCwtqQhPbMyg2kY5F9MznCC5dPx2Vh++vAn2X2+IQylkRgAEJHUqtY9boqcdvoWfcDL23lWsBttE/TWunSDHqI1h1X0TWVpC+awwskQmN+8SkhbJEzrJl8WwIvz68KLx/y8i6PIrxfBmRHCNsLJI1bQ5ZSxzANVFn7ce80yPvhn3Re80T2fjnFn+ynBZuzLXJT2FTAIG64R+oepqp1dqDopL1vuGFYFXHZi7slRebHB/xJ1FY43qqrLEGvKMq+LdoBffENv5zLpTPsC7fqpoGws7Jyj1u3k8YSZ4yiNCC/NCIGT/Vsg65hZYe/VZ3dsglaA1fv9Udtle4qObKG6cam6DFJESsoWpgyax9QrjnxWlaqU+rtAzc2Vv4PAZLfMELNEpZ0ZcR3mWKUJdlhPKbTAFj8/3xk+s1yiwbj5gAhJSi+FoL/ToTGHCAwmuinGFidmxrOJ1GnrsTLW3WjFYmKkxDFi1KU52bxxlCkfPj4j16t/d5N4qbeWkLyD4jiuiA0pekpogBwaQ1hQQBhEwmMSEFTpFsSaLRLE6tLfE/TPXPuP53+u56AeKhVGiuuklT5NhBLjKihI0VHHH2710XTLTA9AS2DD1iTGJ2AjlEldU5sMklEx2qPr07EDXUAIheQs17deYKKgbZJ9I+T1rBGRMd6LF2fhhI1YNI91eUPWoNqCT3neYdTxJLveCJM1tERD5jxaM6UO0yQADjXJOdZUlsOmcASZQy8JHskU2G1BkAK0ngqqJPQcZLcf2aYZmj8lRIk9JxfMvKU8VtiAqivoSfdou/JAfHZm5+jm4AbCAFMwpYUPaXoQXeYx44oyuh5n1GaxFPuL0FCx0lRdpg4pQgSOk5G6AmlyV+TRgxJRMfi4hN9EiR0nBzxV2RMGnAFCS0nosXDlHbrCegY7OWMJ7aZIKHjfOddErFJ3UDUAFeAIxhhaWi3RclLgokeNqROBcrTGaxnR4Lu5WRUARPjiMR4W2Qv/L9TYKISgGxv6iuKDuJzrDt1IGYD3iI6D5QL2QAnXse+DpSLNBg23FGNxsw7GzXCdodj+MDo5AnHNBLmHQLpucCZ1JcBaMqHEGdYTwW8RFW5MfNUWJva7mgO9xBcnGpgPSFotFI+gI/h9ZXgLqJ+Ch/DlMTA0w66Be8YrqwGg5Wr3pHYnhYIHS59xwEbHWhPWbH+HQPrdA
CYamE6BtbpQHvZihXdGFinA8CUy7oxtJ4QnCOJmSbnCOZwO6uhDMXxEkXHkZ2+xDQShmUdLQuNnAVKUE8FbLQMsydCxzyeh7WpXVQTxwnFpQsBzkcurRqgJGZIiD6dUyHSZe9gY2uXnqJ48nNxroV1QoaYLGPqF/tasGpbu3Nam83dF38Vbr2dd9N7peJ3RBM+bDsBRTGR//f+m8ouN97yl816H0wEtjoAa+PdfprHvZ4SwAu83Sc/CHnhqcC+1IAone7LR4aLwsZ2ZFzKJjgogRqRYZxQ8gk4iG9EqQ7bw+9BAt/2YckTtIwPA+PHkxbYCYE0TbRgJFJQg0/kDGIGo5CdEExTRg7GEntiZqq4rT8FyZVg3iCKMBZXC4FL82EsYRStEwJpiojCSFyrZEgQGh/IX3x47VB+TXj9YH4DevVwfgN63YB+S3n1kH5DerOgfgN8u7B+j/gWgX0J97qh/Qb2BsF9EfXq4f0O9yYB/gb3NiH+Hu0tgvwS7jXD/D3UKwf6G9IbhPo71BsF+zvg24X7e8w3CPh3tNcO+Tektwj6N6y3CPs3rNcP/Dek1w79t5xXDv43nDcI/zcoKAFA+EYCjotzmkQIOK6j5AmGVsv6ze5e2i4274SIlEYBSpfeJ6cjyo/jIbUAxKjfYXOVZAdMzzRxDciLwKEaFM+rDbH7trFIFYTAp+cMU8TwJXm6FtrVHSlJgY+19SshxmwxiVBJCtxDjslTxo9AOI1ZrZHVTbDD+ZlkudtN0BIaa6hzojgOyLL+WAqO+ZdEHNuvxSmkoJM0xeMjpjheDVvRDTsQgjbqMIuOw0Z1AwoiJpjz3pzEMQRbCWr9vizbYSxsqAT2fTxiYYyVG1TA54AmHGVpWQMdoDX/KHL+0w7nUeHYO1qeoAHlEqEsYfx0MMVjUX0JOFcFY7dBqjSwukyPSRZfYRy7b3tXjL49FHqnL9fkhM/OqRoVpWcONtTR8v3AatbxTd4NbM0e+15g/Ycx9GiHb2Lw77X8c5R+ZRuKHHFnTBK3CQrK8vKusxB7dE3Ma8QdMvPqt15RzCe1TPchD1vcO5WefmcRnZIMh+jhgYY4xTzd0/kOkPg6TVMdSpmZ6yBrmupwxCh2e4ID4K2YiVr+Og+zljISk9PsbS1r6ndiszgcO+T0uH0d4+5J2LwSbwpxqKbdc6heoXR23GxTMEUlI4+/OM/pMR1CNmJ6aor5+/lRFiczXFCFnH5folUa1307c6sx+nodXt3tbrwgWN9+mgP4TlbUtfGN91sYeJcbP/yyXgXXs7CHmlq6H3ihFwS79eV94O/noQ80dfRbf89bJ/xyvQ58MbdpAl+lqq/Bl8361g/9jX8zj/eyoo683/rLtbeZu9spZXV1CPx1uFp/Dvfry81ayL2aUAOFKMTf+Td3n/0w8NaKb9KPxIuaEP2zt1mvys46n/OippkeeDO2vCCpY39ZB9dz3/VDTW2WQpEzcgrPiObjZhQZLQlq8xaSPEI0DovsCWVM9zJ6R7ZCFMpk4E/51aK1rO24lahcB42wIdugfB/76FldUQdZVBvFixOSv2QRRXmC8/ptlLNUQqesrcnpMCu+L6djlt+znhEq6gFxP95H5gQPJHVsig/4ecbbXdTTxuvoS/10+Hye5zaTFfVrV9eDFy3S+uQFxTlm8pmIMaB3opLepzHp9S3OKb9+6S2v/XC//rc/nfVOENP1leXG9275knH0GlSCSnrQ2v9qvZnNWVkP4m7v9nyF5odL73a1Xnlj1wHDGqiVtXVZ34a8zBzwTkrfj50T5FuYfXq84oVgozjv+kJg0md4FN75No4mKGlXUeWZdX4Wbdxo3uOJUgYgHYT4JkF7cnow49sVOItI7HhgSg2W5bQbaDxKGGo/pWWP7Avpc6rCuHhISJHP0oeGcgD4iZLsEMaDd/ONRg8E9fCCPf5jOrFW0aaRERTP0qx9If0664wSGj6iImUvs1CVivoFV3Iav9Pdo/aF9ON4nd
80dn+wM7eL4fDk+2r1l2TnYpyPEvOdQhQ8P9Bo4nhiLEldEbW6tkZ1FOo7SouRi9BBNWRJPbuMg45/eh2SBUE4mz18OcUT98RlvkYYyJcOT+h5Pv8lQZCbZDNzO0Fgqigt5wPLikbyIw8NzXPbDzW1T+0pKvLkIcXhC0a0Pv8/Sx10yvJ4O1cmgeXBrwm5BN0hmDLmD84dvOCZkgjn+YiFXWdp5dNDkaTlO5XvCibMHLZJID2epAWk10Qk5cE2fnBg7XzwUaJKWiC1HEdXOMXgx6AtsaKYDZehxDVrSUctpczMa4zA7/S5QBstM3WTZPiSYuT6YiQ1t1OzIedsJmjOLHi/FoS5vvhDDayUzMQAzXM1uY4Fjbi/rl3DI4Y3t8f8TsKXL/xGXvGjkPpPp1lxFXoA/YiyOC2/65o5vzZPHnYFKSiZtquZe7q8BJW07KlBma82J7tSBGpQm95ObeieDkgrMjy5F7cqJtJ9HWrjtZrWrEM5MGEV43GfiZCofSEwdbMM7fCd+hi5xRMEQ8uFV2n7r9w1K1tE9WSAluyV2vIojk+p/itujthOEKhAm+flFexIXBNzRbasZYOd2syCENiF+ImXTsalD4mWLp2IfyFNTDu08VDiSVrm3rSsDlHNA60+bw82rGOIb1pkz8oXTuhZG6vvOqqMGVBuBhxbV/o39o2SLH+bCqO4/oKti1c9M8vzWwqM+QhXH7PTIeX3OfWMwvTZbT9XtnW7g8sbgJ8zcLtwA+hQELyGY4LMTrHlMd8BagGv9i2glmD1PaBLb79ehmWMOfxt6/VzRm1OR3RvgpN1gFMSy/sg9G+2wVc5U9KJKKkAvJV/5d1vgnB5d3u1/jQSJ4oANB4zv/Vu/HDj345k9SUA0r/u94H/W/V+unvvkz+2JQc6AHOzu5cSDJxQjTlAuPFu773N5mvVmfzVSJIsY0PcB7v1djsd2eiAzCq7Yh9u/V24Wu/8ZXC3+zqarFQz8Dfr21/G9pnW3sDYrwP/xtvuw70/+nYY6ABM/nrIzcbfhMu73egbQhQBaPv77fZuF/ir8OomCJeb9UjgQMeOWaZlj3VyoAMwA+9TuPQCb3M3dvjsKQAcRA95QBTTns16oWVJKsByP8dshRj/jAD0mVOYJmjAC5KE4RM6uy1IahuXFdeOPBCW7wc0K7canlILaEp+wGoW8FAIoCb5tgk+1aVHUodCULepSuyl/FcnoKABd5tz4njMsLKwy4/kb0oq+y4Wv1dgsdCsKJICtNi8323CfXC3c109VKDWGhhAHklU5Dgu6+T6soMaI0qAV4bhnL2vBf7GhD1h80Xi5Ztfy1+k6zUlzdA+u7AMAvKdnzEBxQojSQD3TVkywMlEVqtgQv02Lk7aQ/1mEx3lCS8BTsZAGls4tMMoSSf5ImsY4zp+dTJ3DEsQAEAnTA/4S8KOW0RHoiQJm93S+yxx/9BSRZM1ABzFJ/Id+6cze/EH56btiQoZaFZiNDmvSPGQYveoYwWUJKAABn5m5fsi3ENhFUoQMIDkXBU3yqPxwx3OH6GuxVVfoO5kn2jC8FWRpvJ7AuwJkoQJFuCEOb8vpkeq7QHM8ykNSOD6zfEK0diCU6rjxqTtXuSkaY5D3maSa0mvPcW1oNeb4Dji1ae3EvImkxsnvcnUxkH2E9vkmYbj3nCe4TjTLCOMB1zbae1bWViNCt59cH3H98v2S2+3qvatHbdTappSCXhOqctPJxpJl3err2N29mtMZw5t6HuB/+lut/b3UzwaqEDEu5sb/zbYiy0+DquUsmHPwDSxmjqtb7xP1WsguNkoploKYN999vk26AxNrFSCNnF3/nK9X9/dzsDWaAH0+oUPM1xmpRK8zTrpDurZQxTf2/u7GZpWJQRx18HGn+ReT2DA+b365/8BE/dPkMHHAAA=" \ No newline at end of file diff --git a/docs/assets/search.js 
b/docs/assets/search.js index 3d7c0be..5707bc5 100644 --- a/docs/assets/search.js +++ b/docs/assets/search.js @@ -1 +1 @@ -window.searchData = "data:application/octet-stream;base64,"; \ No newline at end of file +window.searchData = "data:application/octet-stream;base64,"; \ No newline at end of file diff --git a/docs/classes/src_extract_content_core.Document.html b/docs/classes/src_extract_content_core.Document.html deleted file mode 100644 index b9452e6..0000000 --- a/docs/classes/src_extract_content_core.Document.html +++ /dev/null @@ -1,23 +0,0 @@ -Document | ai-research-agent

Constructors

  • Returns Document

Properties

author: string
body: any
categories: any[]
comments: string
commentsbody: any
date: string
description: string
fingerprint: string
hostname: string
id: string
license: string
raw_text: string
sitename: string
tags: any[]
text: string
title: string
url: string

Methods

diff --git a/docs/classes/src_extract_content_core.Extractor.html b/docs/classes/src_extract_content_core.Extractor.html deleted file mode 100644 index 26f7ad4..0000000 --- a/docs/classes/src_extract_content_core.Extractor.html +++ /dev/null @@ -1,25 +0,0 @@ -Extractor | ai-research-agent

Constructors

  • Parameters

    • options: {} = {}

      Returns Extractor

    Properties

    authorBlacklist: any
    comments: any
    config: any
    dateParams: any
    dedup: any
    fast: any
    formatting: any
    images: any
    lang: any
    links: any
    maxTreeSize: any
    onlyWithMetadata: any
    outputFormat: any
    precision: any
    recall: any
    tables: any
    teiValidation: any
    url: any
    urlBlacklist: any
    withMetadata: any
    diff --git a/docs/classes/src_extract_content_courlan_urlstore.UrlStore.html b/docs/classes/src_extract_content_courlan_urlstore.UrlStore.html deleted file mode 100644 index ff99fd3..0000000 --- a/docs/classes/src_extract_content_courlan_urlstore.UrlStore.html +++ /dev/null @@ -1,42 +0,0 @@ -UrlStore | ai-research-agent

    Constructors

    • Parameters

      • __namedParameters: {
            compressed: boolean;
            language: any;
            strict: boolean;
            trailing: boolean;
            verbose: boolean;
        } = {}
        • compressed: boolean
        • language: any
        • strict: boolean
        • trailing: boolean
        • verbose: boolean

      Returns UrlStore

    Properties

    compressed: boolean
    done: boolean
    language: any
    strict: boolean
    trailingSlash: boolean
    urldict: Map<any, any>

    Methods

    • Parameters

      • data: any
      • visited: boolean = false

      Returns Map<any, any>

    • Parameters

      • domain: any

      Returns any

    • Parameters

      • urls: any
      • switch_: any = null

      Returns any[]

    • Returns void

    • Parameters

      • domain: any
      • toRight: any = null
      • timestamp: any = null
      • toLeft: any = null

      Returns void

    • Parameters

      • htmlstring: any
      • url: any
      • external: boolean = false
      • lang: any = null
      • withNav: boolean = true

      Returns void

    • Parameters

      • urls: any = null
      • appendleft: any = null
      • visited: boolean = false

      Returns void

    • Parameters

      • domains: any

      Returns void

    • Parameters

      • threshold: any

      Returns boolean

    • Returns any[]

    • Parameters

      • maxUrls: number = 100
      • timeLimit: number = 10

      Returns any[][]

    • Parameters

      • urls: any

      Returns any[]

    • Parameters

      • urls: any

      Returns any[]

    • Parameters

      • domain: any

      Returns any

    • Parameters

      • domain: any

      Returns any

    • Returns any[]

    • Parameters

      • website: any
      • defaultDelay: number = 5

      Returns any

    • Parameters

      • timeLimit: number = 10
      • maxUrls: number = 10000

      Returns any[]

    • Returns any[]

    • Parameters

      • website: any

      Returns any

    • Returns any[]

    • Parameters

      • domain: any
      • asVisited: boolean = true

      Returns any

    • Parameters

      • url: any

      Returns boolean

    • Parameters

      • domain: any

      Returns boolean

    • Parameters

      • url: any

      Returns any

    • Returns void

    • Returns void

    • Returns void

    • Parameters

      • website: any
      • rules: any

      Returns void

    • Returns any

    • Returns number

    diff --git a/docs/classes/src_extract_content_deduplication.LRUCache.html b/docs/classes/src_extract_content_deduplication.LRUCache.html deleted file mode 100644 index 1958545..0000000 --- a/docs/classes/src_extract_content_deduplication.LRUCache.html +++ /dev/null @@ -1,10 +0,0 @@ -LRUCache | ai-research-agent

    Constructors

    Properties

    Methods

    Constructors

    • Parameters

      • maxsize: number = 128

      Returns LRUCache

    Properties

    cache: Map<any, any>
    maxsize: number

    Methods

    • Returns void

    • Parameters

      • key: any

      Returns any

    • Parameters

      • key: any
      • value: any

      Returns void

    diff --git a/docs/classes/src_extract_content_deduplication.Simhash.html b/docs/classes/src_extract_content_deduplication.Simhash.html deleted file mode 100644 index a599edc..0000000 --- a/docs/classes/src_extract_content_deduplication.Simhash.html +++ /dev/null @@ -1,14 +0,0 @@ -Simhash | ai-research-agent

    Constructors

    • Parameters

      • inputString: string = ''
      • length: number = 64
      • existingHash: any = null

      Returns Simhash

    Properties

    hash: bigint
    length: number

    Methods

    • Parameters

      • inputString: any

      Returns bigint

    • Parameters

      • token: any

      Returns (-1 | 1)[]

    • Parameters

      • inputString: any

      Returns bigint

    • Parameters

      • otherHash: any

      Returns number

    • Parameters

      • otherHash: any

      Returns number

    • Returns string

    • Parameters

      • inputHash: any

      Returns bigint

    diff --git a/docs/classes/src_extract_content_downloads.Response.html b/docs/classes/src_extract_content_downloads.Response.html deleted file mode 100644 index 4fef032..0000000 --- a/docs/classes/src_extract_content_downloads.Response.html +++ /dev/null @@ -1,13 +0,0 @@ -Response | ai-research-agent

    Constructors

    Properties

    Methods

    Constructors

    • Parameters

      • data: any
      • status: any
      • url: any

      Returns Response

    Properties

    data: any
    headers: {
        [k: string]: T;
    }
    html: any
    status: any
    url: any

    Methods

    • Returns {
          data: any;
          headers: {
              [k: string]: T;
          };
          html: any;
          status: any;
          url: any;
      }

      • data: any
      • headers: {
            [k: string]: T;
        }
        • [k: string]: T
      • html: any
      • status: any
      • url: any
    • Parameters

      • decode: any

      Returns void

    • Parameters

      • headerDict: any

      Returns void

    diff --git a/docs/classes/src_extract_content_settings.Document.html b/docs/classes/src_extract_content_settings.Document.html deleted file mode 100644 index 53fc6e8..0000000 --- a/docs/classes/src_extract_content_settings.Document.html +++ /dev/null @@ -1,30 +0,0 @@ -Document | ai-research-agent

    Constructors

    • Returns Document

    Properties

    author: any
    body: any
    categories: any
    comments: any
    commentsbody: any
    date: any
    description: any
    filedate: any
    fingerprint: any
    hostname: any
    id: any
    image: any
    language: any
    license: any
    pagetype: any
    rawText: any
    sitename: any
    tags: any
    text: any
    title: any
    url: any

    Methods

    • Returns Document

    • Returns void

    • Parameters

      • attributes: any

      Returns void

    • Parameters

      • data: any

      Returns Document

    diff --git a/docs/classes/src_extract_content_settings.Extractor.html b/docs/classes/src_extract_content_settings.Extractor.html deleted file mode 100644 index ff452d2..0000000 --- a/docs/classes/src_extract_content_settings.Extractor.html +++ /dev/null @@ -1,27 +0,0 @@ -Extractor | ai-research-agent

    Constructors

    • Parameters

      • __namedParameters: {
            authorBlacklist: any;
            comments: boolean;
            config: {
                COOKIE: string;
                DOWNLOAD_TIMEOUT: string;
                EXTENSIVE_DATE_SEARCH: string;
                EXTERNAL_URLS: string;
                EXTRACTION_TIMEOUT: string;
                MAX_FILE_SIZE: string;
                MAX_REDIRECTS: string;
                MAX_REPETITIONS: string;
                MIN_DUPLCHECK_SIZE: string;
                MIN_EXTRACTED_COMM_SIZE: string;
                MIN_EXTRACTED_SIZE: string;
                MIN_FILE_SIZE: string;
                MIN_OUTPUT_COMM_SIZE: string;
                MIN_OUTPUT_SIZE: string;
                SLEEP_TIME: string;
                USER_AGENTS: string;
            };
            dateParams: any;
            dedup: boolean;
            fast: boolean;
            formatting: boolean;
            images: boolean;
            lang: any;
            links: boolean;
            maxTreeSize: any;
            onlyWithMetadata: boolean;
            outputFormat: string;
            precision: boolean;
            recall: boolean;
            source: any;
            tables: boolean;
            teiValidation: boolean;
            url: any;
            urlBlacklist: any;
            withMetadata: boolean;
        } = {}
        • authorBlacklist: any
        • comments: boolean
        • config: {
              COOKIE: string;
              DOWNLOAD_TIMEOUT: string;
              EXTENSIVE_DATE_SEARCH: string;
              EXTERNAL_URLS: string;
              EXTRACTION_TIMEOUT: string;
              MAX_FILE_SIZE: string;
              MAX_REDIRECTS: string;
              MAX_REPETITIONS: string;
              MIN_DUPLCHECK_SIZE: string;
              MIN_EXTRACTED_COMM_SIZE: string;
              MIN_EXTRACTED_SIZE: string;
              MIN_FILE_SIZE: string;
              MIN_OUTPUT_COMM_SIZE: string;
              MIN_OUTPUT_SIZE: string;
              SLEEP_TIME: string;
              USER_AGENTS: string;
          }
          • COOKIE: string
          • DOWNLOAD_TIMEOUT: string
          • EXTENSIVE_DATE_SEARCH: string
          • EXTERNAL_URLS: string
          • EXTRACTION_TIMEOUT: string
          • MAX_FILE_SIZE: string
          • MAX_REDIRECTS: string
          • MAX_REPETITIONS: string
          • MIN_DUPLCHECK_SIZE: string
          • MIN_EXTRACTED_COMM_SIZE: string
          • MIN_EXTRACTED_SIZE: string
          • MIN_FILE_SIZE: string
          • MIN_OUTPUT_COMM_SIZE: string
          • MIN_OUTPUT_SIZE: string
          • SLEEP_TIME: string
          • USER_AGENTS: string
        • dateParams: any
        • dedup: boolean
        • fast: boolean
        • formatting: boolean
        • images: boolean
        • lang: any
        • links: boolean
        • maxTreeSize: any
        • onlyWithMetadata: boolean
        • outputFormat: string
        • precision: boolean
        • recall: boolean
        • source: any
        • tables: boolean
        • teiValidation: boolean
        • url: any
        • urlBlacklist: any
        • withMetadata: boolean

      Returns Extractor

    Properties

    authorBlacklist: any
    comments: boolean
    config: any
    dateParams: any
    dedup: boolean
    fast: boolean
    focus: string
    format: any
    formatting: boolean
    images: boolean
    lang: any
    links: boolean
    maxTreeSize: any
    onlyWithMetadata: boolean
    source: any
    tables: boolean
    teiValidation: boolean
    url: any
    urlBlacklist: any
    withMetadata: any

    Methods

    • Parameters

      • config: any

      Returns void

    • Parameters

      • chosenFormat: any

      Returns void

    diff --git a/docs/classes/src_extract_content_spider.CrawlParameters.html b/docs/classes/src_extract_content_spider.CrawlParameters.html deleted file mode 100644 index 371f45e..0000000 --- a/docs/classes/src_extract_content_spider.CrawlParameters.html +++ /dev/null @@ -1,19 +0,0 @@ -CrawlParameters | ai-research-agent

    Constructors

    • Parameters

      • start: any
      • lang: any = null
      • rules: any = null
      • pruneXpath: any = null

      Returns CrawlParameters

    Properties

    base: string
    i: number
    isOn: boolean
    knownNum: number
    lang: any
    pruneXpath: any
    ref: any
    rules: any
    start: any

    Methods

    • Parameters

      • start: any

      Returns string

    • Parameters

      • start: any

      Returns any

    • Parameters

      • todo: any

      Returns any

    • Parameters

      • link: any

      Returns boolean

    • Parameters

      • urlStore: any

      Returns void

    diff --git a/docs/classes/src_trafilatura_extractor_core.Document.html b/docs/classes/src_trafilatura_extractor_core.Document.html new file mode 100644 index 0000000..6c7eb73 --- /dev/null +++ b/docs/classes/src_trafilatura_extractor_core.Document.html @@ -0,0 +1,23 @@ +Document | ai-research-agent

    Constructors

    • Returns Document

    Properties

    author: string
    body: any
    categories: any[]
    comments: string
    commentsbody: any
    date: string
    description: string
    fingerprint: string
    hostname: string
    id: string
    license: string
    raw_text: string
    sitename: string
    tags: any[]
    text: string
    title: string
    url: string

    Methods

    • Returns Document

    diff --git a/docs/classes/src_trafilatura_extractor_core.Extractor.html b/docs/classes/src_trafilatura_extractor_core.Extractor.html new file mode 100644 index 0000000..2d2315a --- /dev/null +++ b/docs/classes/src_trafilatura_extractor_core.Extractor.html @@ -0,0 +1,25 @@ +Extractor | ai-research-agent

    Constructors

    • Parameters

      • options: {} = {}

        Returns Extractor

      Properties

      authorBlacklist: any
      comments: any
      config: any
      dateParams: any
      dedup: any
      fast: any
      formatting: any
      images: any
      lang: any
      links: any
      maxTreeSize: any
      onlyWithMetadata: any
      outputFormat: any
      precision: any
      recall: any
      tables: any
      teiValidation: any
      url: any
      urlBlacklist: any
      withMetadata: any
      diff --git a/docs/classes/src_trafilatura_extractor_courlan_urlstore.UrlStore.html b/docs/classes/src_trafilatura_extractor_courlan_urlstore.UrlStore.html new file mode 100644 index 0000000..baf01d5 --- /dev/null +++ b/docs/classes/src_trafilatura_extractor_courlan_urlstore.UrlStore.html @@ -0,0 +1,42 @@ +UrlStore | ai-research-agent

      Constructors

      • Parameters

        • __namedParameters: {
              compressed: boolean;
              language: any;
              strict: boolean;
              trailing: boolean;
              verbose: boolean;
          } = {}
          • compressed: boolean
          • language: any
          • strict: boolean
          • trailing: boolean
          • verbose: boolean

        Returns UrlStore

      Properties

      compressed: boolean
      done: boolean
      language: any
      strict: boolean
      trailingSlash: boolean
      urldict: Map<any, any>

      Methods

      • Parameters

        • data: any
        • visited: boolean = false

        Returns Map<any, any>

      • Parameters

        • domain: any

        Returns any

      • Parameters

        • urls: any
        • switch_: any = null

        Returns any[]

      • Returns void

      • Parameters

        • domain: any
        • toRight: any = null
        • timestamp: any = null
        • toLeft: any = null

        Returns void

      • Parameters

        • htmlstring: any
        • url: any
        • external: boolean = false
        • lang: any = null
        • withNav: boolean = true

        Returns void

      • Parameters

        • urls: any = null
        • appendleft: any = null
        • visited: boolean = false

        Returns void

      • Parameters

        • domains: any

        Returns void

      • Parameters

        • threshold: any

        Returns boolean

      • Returns any[]

      • Parameters

        • maxUrls: number = 100
        • timeLimit: number = 10

        Returns any[][]

      • Parameters

        • urls: any

        Returns any[]

      • Parameters

        • urls: any

        Returns any[]

      • Parameters

        • domain: any

        Returns any

      • Parameters

        • domain: any

        Returns any

      • Returns any[]

      • Parameters

        • website: any
        • defaultDelay: number = 5

        Returns any

      • Parameters

        • timeLimit: number = 10
        • maxUrls: number = 10000

        Returns any[]

      • Returns any[]

      • Parameters

        • website: any

        Returns any

      • Returns any[]

      • Parameters

        • domain: any
        • asVisited: boolean = true

        Returns any

      • Parameters

        • url: any

        Returns boolean

      • Parameters

        • domain: any

        Returns boolean

      • Parameters

        • url: any

        Returns any

      • Returns void

      • Returns void

      • Returns void

      • Parameters

        • website: any
        • rules: any

        Returns void

      • Returns any

      • Returns number

      diff --git a/docs/classes/src_trafilatura_extractor_deduplication.LRUCache.html b/docs/classes/src_trafilatura_extractor_deduplication.LRUCache.html new file mode 100644 index 0000000..6094868 --- /dev/null +++ b/docs/classes/src_trafilatura_extractor_deduplication.LRUCache.html @@ -0,0 +1,10 @@ +LRUCache | ai-research-agent

      Constructors

      Properties

      Methods

      Constructors

      • Parameters

        • maxsize: number = 128

        Returns LRUCache

      Properties

      cache: Map<any, any>
      maxsize: number

      Methods

      • Returns void

      • Parameters

        • key: any

        Returns any

      • Parameters

        • key: any
        • value: any

        Returns void

      diff --git a/docs/classes/src_trafilatura_extractor_deduplication.Simhash.html b/docs/classes/src_trafilatura_extractor_deduplication.Simhash.html new file mode 100644 index 0000000..884841c --- /dev/null +++ b/docs/classes/src_trafilatura_extractor_deduplication.Simhash.html @@ -0,0 +1,14 @@ +Simhash | ai-research-agent

      Constructors

      • Parameters

        • inputString: string = ''
        • length: number = 64
        • existingHash: any = null

        Returns Simhash

      Properties

      hash: bigint
      length: number

      Methods

      • Parameters

        • inputString: any

        Returns bigint

      • Parameters

        • token: any

        Returns (-1 | 1)[]

      • Parameters

        • inputString: any

        Returns bigint

      • Parameters

        • otherHash: any

        Returns number

      • Parameters

        • otherHash: any

        Returns number

      • Returns string

      • Parameters

        • inputHash: any

        Returns bigint

      diff --git a/docs/classes/src_trafilatura_extractor_downloads.Response.html b/docs/classes/src_trafilatura_extractor_downloads.Response.html new file mode 100644 index 0000000..6cd45e0 --- /dev/null +++ b/docs/classes/src_trafilatura_extractor_downloads.Response.html @@ -0,0 +1,13 @@ +Response | ai-research-agent

      Constructors

      Properties

      Methods

      Constructors

      • Parameters

        • data: any
        • status: any
        • url: any

        Returns Response

      Properties

      data: any
      headers: {
          [k: string]: T;
      }
      html: any
      status: any
      url: any

      Methods

      • Returns {
            data: any;
            headers: {
                [k: string]: T;
            };
            html: any;
            status: any;
            url: any;
        }

        • data: any
        • headers: {
              [k: string]: T;
          }
          • [k: string]: T
        • html: any
        • status: any
        • url: any
      • Parameters

        • decode: any

        Returns void

      • Parameters

        • headerDict: any

        Returns void

      diff --git a/docs/classes/src_extractor_html_to_content_readability.Readability.html b/docs/classes/src_trafilatura_extractor_readability.Readability.html similarity index 55% rename from docs/classes/src_extractor_html_to_content_readability.Readability.html rename to docs/classes/src_trafilatura_extractor_readability.Readability.html index 96026a6..d613161 100644 --- a/docs/classes/src_extractor_html_to_content_readability.Readability.html +++ b/docs/classes/src_trafilatura_extractor_readability.Readability.html @@ -1,221 +1,258 @@ -Readability | ai-research-agent

      Constructors

      Properties

      ALTER_TO_DIV_EXCEPTIONS -CLASSES_TO_PRESERVE -DEFAULT_CHAR_THRESHOLD -DEFAULT_MAX_ELEMS_TO_PARSE -DEFAULT_N_TOP_CANDIDATES -DEFAULT_TAGS_TO_SCORE -DEPRECATED_SIZE_ATTRIBUTE_ELEMS -DIV_TO_P_ELEMS -ELEMENT_NODE -FLAG_CLEAN_CONDITIONALLY -FLAG_STRIP_UNLIKELYS -FLAG_WEIGHT_CLASSES -HTML_ESCAPE_MAP -PHRASING_ELEMS -PRESENTATIONAL_ATTRIBUTES -REGEXPS -TEXT_NODE -UNLIKELY_ROLES -_articleByline -_articleDir -_articleSiteName -_articleTitle -_attempts -_charThreshold -_checkByline -_classesToPreserve -_clean -_cleanClasses -_cleanConditionally -_cleanHeaders -_cleanMatchedNodes -_cleanStyles -_concatNodeLists -_debug -_disableJSONLD -_doc -_docJSDOMParser -_everyNode -_findNode -_fixLazyImages -_fixRelativeUris -_flagIsActive -_flags -_forEachNode -_getAllNodesWithTag -_getArticleMetadata -_getArticleTitle -_getCharCount -_getClassWeight -_getInnerText -_getJSONLD -_getLinkDensity -_getNextNode -_getNodeAncestors -_getRowAndColumnCount -_getTextDensity -_grabArticle -_hasAncestorTag -_hasChildBlockElement -_hasSingleTagInsideElement -_headerDuplicatesTitle -_initializeNode -_isElementWithoutContent -_isPhrasingContent -_isProbablyVisible -_isSingleImage -_isValidByline -_isWhitespace -_keepClasses -_markDataTables -_maxElemsToParse -_nbTopCandidates -_nextNode -_postProcessContent -_prepArticle -_prepDocument -_removeAndGetNext -_removeFlag -_removeNodes -_removeScripts -_replaceBrs -_replaceNodeTags -_serializer -_setNodeTag -_simplifyNestedElements -_someNode -_textSimilarity -_unescapeHtmlEntities -_unwrapNoscriptImages -log -parse -prototype -

      Constructors

      • Parameters

        • doc: HTMLDocument
        • options: any
        • Rest...args: any

        Returns Readability

      Properties

      ALTER_TO_DIV_EXCEPTIONS: string[] = ...
      CLASSES_TO_PRESERVE: string[] = ...
      DEFAULT_CHAR_THRESHOLD: number = 500
      DEFAULT_MAX_ELEMS_TO_PARSE: number = 0
      DEFAULT_N_TOP_CANDIDATES: number = 5
      DEFAULT_TAGS_TO_SCORE: string[] = ...
      DEPRECATED_SIZE_ATTRIBUTE_ELEMS: string[] = ...
      DIV_TO_P_ELEMS: Set<string> = ...
      ELEMENT_NODE: number = 1
      FLAG_CLEAN_CONDITIONALLY: number = 0x4
      FLAG_STRIP_UNLIKELYS: number = 0x1
      FLAG_WEIGHT_CLASSES: number = 0x2
      HTML_ESCAPE_MAP: {
          amp: string;
          apos: string;
          gt: string;
          lt: string;
          quot: string;
      } = ...
      PHRASING_ELEMS: string[] = ...
      PRESENTATIONAL_ATTRIBUTES: string[] = ...
      REGEXPS: {
          b64DataUrl: RegExp;
          byline: RegExp;
          extraneous: RegExp;
          hasContent: RegExp;
          hashUrl: RegExp;
          jsonLdArticleTypes: RegExp;
          negative: RegExp;
          nextLink: RegExp;
          normalize: RegExp;
          okMaybeItsACandidate: RegExp;
          positive: RegExp;
          prevLink: RegExp;
          replaceFonts: RegExp;
          shareElements: RegExp;
          srcsetUrl: RegExp;
          tokenize: RegExp;
          unlikelyCandidates: RegExp;
          videos: RegExp;
          whitespace: RegExp;
      } = ...
      TEXT_NODE: number = 3
      UNLIKELY_ROLES: string[] = ...
      _articleByline: any
      _articleDir: any
      _articleSiteName: any
      _articleTitle: any
      _attempts: any[]
      _charThreshold: any
      _checkByline: typeof __function = ...
      _classesToPreserve: string[]
      _clean: ((e: any, tag: any) => void) = ...

      Clean a node of all elements of type "tag". +Readability | ai-research-agent

      Mozilla Readability (2015)

      +
        +
      1. Define regex patterns for content identification
      2. +
      3. Clean HTML by removing unlikely content
      4. +
      5. Score nodes based on content quality indicators
      6. +
      7. Select the best candidate for main content
      8. +
      9. Extract top candidate
      10. +
      11. Clean the selected content
      12. +
      +

      The HTML string to extract content from

      +

      Minimum length of content to be considered valid

      +

      Minimum score for content to be considered valid

      +

      Minimum length of text to be considered valid

      +

      Length to retry content extraction if initial attempt fails

      +

      Extracted HTML element of main content such as article body

      +

      Based on

      +
      var url = "https://en.wikipedia.org/wiki/David_Hilbert";
      var html = await (await fetch(url)).text();
      var content = extractContentHTML(html); +
      + +

      Constructors

      Properties

      ALTER_TO_DIV_EXCEPTIONS +CLASSES_TO_PRESERVE +DEFAULT_CHAR_THRESHOLD +DEFAULT_MAX_ELEMS_TO_PARSE +DEFAULT_N_TOP_CANDIDATES +DEFAULT_TAGS_TO_SCORE +DEPRECATED_SIZE_ATTRIBUTE_ELEMS +DIV_TO_P_ELEMS +ELEMENT_NODE +FLAG_CLEAN_CONDITIONALLY +FLAG_STRIP_UNLIKELYS +FLAG_WEIGHT_CLASSES +HTML_ESCAPE_MAP +PHRASING_ELEMS +PRESENTATIONAL_ATTRIBUTES +REGEXPS +TEXT_NODE +UNLIKELY_ROLES +_articleByline +_articleDir +_articleSiteName +_articleTitle +_attempts +_charThreshold +_checkByline +_classesToPreserve +_clean +_cleanClasses +_cleanConditionally +_cleanHeaders +_cleanMatchedNodes +_cleanStyles +_concatNodeLists +_debug +_disableJSONLD +_doc +_docJSDOMParser +_everyNode +_findNode +_fixLazyImages +_fixRelativeUris +_flagIsActive +_flags +_forEachNode +_getAllNodesWithTag +_getArticleMetadata +_getArticleTitle +_getCharCount +_getClassWeight +_getInnerText +_getJSONLD +_getLinkDensity +_getNextNode +_getNodeAncestors +_getRowAndColumnCount +_getTextDensity +_grabArticle +_hasAncestorTag +_hasChildBlockElement +_hasSingleTagInsideElement +_headerDuplicatesTitle +_initializeNode +_isElementWithoutContent +_isPhrasingContent +_isProbablyVisible +_isSingleImage +_isValidByline +_isWhitespace +_keepClasses +_markDataTables +_maxElemsToParse +_nbTopCandidates +_nextNode +_postProcessContent +_prepArticle +_prepDocument +_removeAndGetNext +_removeFlag +_removeNodes +_removeScripts +_replaceBrs +_replaceNodeTags +_serializer +_setNodeTag +_simplifyNestedElements +_someNode +_textSimilarity +_unescapeHtmlEntities +_unwrapNoscriptImages +log +parse +prototype +

      Constructors

      • Mozilla Readability (2015)

        +
          +
        1. Define regex patterns for content identification
        2. +
        3. Clean HTML by removing unlikely content
        4. +
        5. Score nodes based on content quality indicators
        6. +
        7. Select the best candidate for main content
        8. +
        9. Extract top candidate
        10. +
        11. Clean the selected content
        12. +
        +

        Parameters

        • doc: any
        • Optionaloptions: {
              minContentLength: number;
              minScore: number;
              minTextLength: number;
              retryLength: number;
          }
          • minContentLength: number

            Minimum length of content to be considered valid

            +
          • minScore: number

            Minimum score for content to be considered valid

            +
          • minTextLength: number

            Minimum length of text to be considered valid

            +
          • retryLength: number

            Length to retry content extraction if initial attempt fails

            +
        • Rest...args: any

        Returns Readability

        Extracted HTML element of main content such as article body

        +

        Based on

        +
        var url = "https://en.wikipedia.org/wiki/David_Hilbert";
        var html = await (await fetch(url)).text();
        var content = extractContentHTML(html); +
        + +

      Properties

      ALTER_TO_DIV_EXCEPTIONS: string[] = ...
      CLASSES_TO_PRESERVE: string[] = ...
      DEFAULT_CHAR_THRESHOLD: number = 500
      DEFAULT_MAX_ELEMS_TO_PARSE: number = 0
      DEFAULT_N_TOP_CANDIDATES: number = 5
      DEFAULT_TAGS_TO_SCORE: string[] = ...
      DEPRECATED_SIZE_ATTRIBUTE_ELEMS: string[] = ...
      DIV_TO_P_ELEMS: Set<string> = ...
      ELEMENT_NODE: number = 1
      FLAG_CLEAN_CONDITIONALLY: number = 0x4
      FLAG_STRIP_UNLIKELYS: number = 0x1
      FLAG_WEIGHT_CLASSES: number = 0x2
      HTML_ESCAPE_MAP: {
          amp: string;
          apos: string;
          gt: string;
          lt: string;
          quot: string;
      } = ...
      PHRASING_ELEMS: string[] = ...
      PRESENTATIONAL_ATTRIBUTES: string[] = ...
      REGEXPS: {
          b64DataUrl: RegExp;
          byline: RegExp;
          extraneous: RegExp;
          hasContent: RegExp;
          hashUrl: RegExp;
          jsonLdArticleTypes: RegExp;
          negative: RegExp;
          nextLink: RegExp;
          normalize: RegExp;
          okMaybeItsACandidate: RegExp;
          positive: RegExp;
          prevLink: RegExp;
          replaceFonts: RegExp;
          shareElements: RegExp;
          srcsetUrl: RegExp;
          tokenize: RegExp;
          unlikelyCandidates: RegExp;
          videos: RegExp;
          whitespace: RegExp;
      } = ...
      TEXT_NODE: number = 3
      UNLIKELY_ROLES: string[] = ...
      _articleByline: any
      _articleDir: any
      _articleSiteName: any
      _articleTitle: any
      _attempts: any[]
      _charThreshold: any
      _checkByline: typeof __function = ...
      _classesToPreserve: string[]
      _clean: ((e: any, tag: any) => void) = ...

      Clean a node of all elements of type "tag". (Unless it's a youtube/vimeo video. People love movies.)

      Type declaration

        • (e, tag): void
        • Parameters

          • e: any
          • tag: any

          Returns void

          void

          -
      _cleanClasses: ((node: any) => void) = ...

      Removes the class="" attribute from every element in the given +

      _cleanClasses: ((node: any) => void) = ...

      Removes the class="" attribute from every element in the given subtree, except those that match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object.

      Type declaration

        • (node): void
        • Parameters

          • node: any

          Returns void

          void

          -
      _cleanConditionally: ((e: any, tag: any) => void) = ...

      Clean an element of all tags of type "tag" if they look fishy. +

      _cleanConditionally: ((e: any, tag: any) => void) = ...

      Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.

      Type declaration

        • (e, tag): void
        • Parameters

          • e: any
          • tag: any

          Returns void

          void

          -
      _cleanHeaders: ((e: any) => void) = ...

      Clean out spurious headers from an Element.

      +
      _cleanHeaders: ((e: any) => void) = ...

      Clean out spurious headers from an Element.

      Type declaration

        • (e): void
        • Parameters

          • e: any

          Returns void

          void

          -
      _cleanMatchedNodes: ((e: any, filter: any) => void) = ...

      Clean out elements that match the specified conditions

      +
      _cleanMatchedNodes: ((e: any, filter: any) => void) = ...

      Clean out elements that match the specified conditions

      Type declaration

        • (e, filter): void
        • Parameters

          • e: any
          • filter: any

          Returns void

          void

          -
      _cleanStyles: ((e: any) => void) = ...

      Remove the style attribute on every e and under. +

      _cleanStyles: ((e: any) => void) = ...

      Remove the style attribute on every e and under. TODO: Test if getElementsByTagName(*) is faster.

      Type declaration

        • (e): void
        • Parameters

          • e: any

          Returns void

          void

          -
      _concatNodeLists: ((...args: any[]) => any) = ...

      Concat all nodelists passed as arguments.

      -

      Type declaration

        • (...args): any
        • Parameters

          • Rest...args: any[]

          Returns any

          ...NodeList

          -
      _debug: boolean
      _disableJSONLD: boolean
      _doc: HTMLDocument
      _docJSDOMParser: any
      _everyNode: ((nodeList: any, fn: any) => any) = ...

      Iterate over a NodeList, return true if all of the provided iterate +

      _concatNodeLists: ((...args: any) => any) = ...

      Concat all nodelists passed as arguments.

      +

      Type declaration

        • (...args): any
        • Parameters

          • Rest...args: any

          Returns any

          ...NodeList

          +
      _debug: boolean
      _disableJSONLD: boolean
      _doc: any
      _docJSDOMParser: any
      _everyNode: ((nodeList: any, fn: any) => any) = ...

      Iterate over a NodeList, return true if all of the provided iterate function calls return true, false otherwise.

      For convenience, the current object context is applied to the provided iterate function.

      Type declaration

        • (nodeList, fn): any
        • Parameters

          • nodeList: any
          • fn: any

          Returns any

          Boolean

          -
      _findNode: ((nodeList: any, fn: any) => any) = ...

      Iterate over a NodeList, and return the first node that passes +

      _findNode: ((nodeList: any, fn: any) => any) = ...

      Iterate over a NodeList, and return the first node that passes the supplied test function

      For convenience, the current object context is applied to the provided test function.

      Type declaration

        • (nodeList, fn): any
        • Parameters

          • nodeList: any
          • fn: any

          Returns any

          void

          -
      _fixLazyImages: ((root: any) => void) = ...
      _fixRelativeUris: ((articleContent: any) => void) = ...
      _fixLazyImages: ((root: any) => void) = ...
      _fixRelativeUris: ((articleContent: any) => void) = ...
      _flagIsActive: ((flag: any) => boolean) = ...
      _flags: number
      _forEachNode: ((nodeList: any, fn: any) => void) = ...

      Iterate over a NodeList, which doesn't natively fully implement the Array +

      _flagIsActive: ((flag: any) => boolean) = ...
      _flags: number
      _forEachNode: ((nodeList: any, fn: any) => void) = ...

      Iterate over a NodeList, which doesn't natively fully implement the Array interface.

      For convenience, the current object context is applied to the provided iterate function.

      Type declaration

        • (nodeList, fn): void
        • Parameters

          • nodeList: any
          • fn: any

          Returns void

          void

          -
      _getAllNodesWithTag: ((node: any, tagNames: any) => any) = ...
      _getArticleMetadata: ((jsonld: any) => {
          byline: any;
          excerpt: any;
          siteName: any;
          title: any;
      }) = ...

      Attempts to get excerpt and byline metadata for the article.

      +
      _getAllNodesWithTag: ((node: any, tagNames: any) => any) = ...
      _getArticleMetadata: ((jsonld: any) => {
          byline: any;
          excerpt: any;
          siteName: any;
          title: any;
      }) = ...

      Attempts to get excerpt and byline metadata for the article.

      Type declaration

        • (jsonld): {
              byline: any;
              excerpt: any;
              siteName: any;
              title: any;
          }
        • Parameters

          • jsonld: any

            — object containing any metadata that could be extracted from JSON-LD object.

          Returns {
              byline: any;
              excerpt: any;
              siteName: any;
              title: any;
          }

          Object with optional "excerpt" and "byline" properties

          -
          • byline: any
          • excerpt: any
          • siteName: any
          • title: any
      _getArticleTitle: (() => string) = ...

      Get the article title as an H1.

      +
      • byline: any
      • excerpt: any
      • siteName: any
      • title: any
      _getArticleTitle: (() => string) = ...

      Get the article title as an H1.

      Type declaration

        • (): string
        • Returns string

          string

          -
      _getCharCount: ((e: any, s: any) => number) = ...

      Get the number of times a string s appears in the node e.

      +
      _getCharCount: ((e: any, s: any) => number) = ...

      Get the number of times a string s appears in the node e.

      Type declaration

        • (e, s): number
        • Parameters

          • e: any
          • s: any

          Returns number

          number (integer)

          -
      _getClassWeight: ((e: any) => number) = ...

      Get an elements class/id weight. Uses regular expressions to tell if this +

      _getClassWeight: ((e: any) => number) = ...

      Get an element's class/id weight. Uses regular expressions to tell if this element looks good or bad.

      Type declaration

        • (e): number
        • Parameters

          • e: any

          Returns number

          number (Integer)

          -
      _getInnerText: ((e: any, normalizeSpaces: any) => any) = ...

      Get the inner text of a node - cross browser compatibly. +

      _getInnerText: ((e: any, normalizeSpaces: any) => any) = ...

      Get the inner text of a node in a cross-browser-compatible way. This also strips out any excess whitespace found.

      Type declaration

        • (e, normalizeSpaces): any
        • Parameters

          • e: any
          • normalizeSpaces: any

          Returns any

          string

          -
      _getJSONLD: ((doc: any) => {}) = ...

      Try to extract metadata from JSON-LD object. +

      _getJSONLD: ((doc: any) => {}) = ...

      Try to extract metadata from JSON-LD object. For now, only Schema.org objects of type Article or its subtypes are supported.

      Type declaration

        • (doc): {}
        • Parameters

          • doc: any

          Returns {}

          Object with any metadata that could be extracted (possibly none)

          -
        _getLinkDensity: ((element: any) => number) = ...

        Get the density of links as a percentage of the content +

          _getLinkDensity: ((element: any) => number) = ...

          Get the density of links as a percentage of the content. This is the amount of text that is inside a link divided by the total text in the node.

          Type declaration

            • (element): number
            • Parameters

              • element: any

              Returns number

              number (float)

              -
          _getNextNode: ((node: any, ignoreSelfAndKids: any) => any) = ...

          Traverse the DOM from node to node, starting at the node passed in. +

          _getNextNode: ((node: any, ignoreSelfAndKids: any) => any) = ...

          Traverse the DOM from node to node, starting at the node passed in. Pass true for the second parameter to indicate this node itself (and its kids) are going away, and we want the next node over.

          Calling this in a loop will traverse the DOM depth-first.

          -
          _getNodeAncestors: ((node: any, maxDepth: any) => any[]) = ...
          _getRowAndColumnCount: ((table: any) => {
              columns: number;
              rows: number;
          }) = ...

          Return an object indicating how many rows and columns this table has.

          -
          _getTextDensity: ((e: any, tags: any) => number) = ...
          _grabArticle: typeof __function = ...

          grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is +

          _getNodeAncestors: ((node: any, maxDepth: any) => any[]) = ...
          _getRowAndColumnCount: ((table: any) => {
              columns: number;
              rows: number;
          }) = ...

          Return an object indicating how many rows and columns this table has.

          +
          _getTextDensity: ((e: any, tags: any) => number) = ...
          _grabArticle: typeof __function = ...

          grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff a user wants to read. Then return it wrapped up in a div.

          a document to run upon. Needs to be a full document, complete with body.

          -

          Element

          -
          _hasAncestorTag: ((node: any, tagName: any, maxDepth: any, filterFn: any) => boolean) = ...

          Check if a given node has one of its ancestor tag name matching the +

          Element

          +
          _hasAncestorTag: ((node: any, tagName: any, maxDepth: any, filterFn: any) => boolean) = ...

          Check if a given node has one of its ancestor tag name matching the provided one.

          Type declaration

            • (node, tagName, maxDepth, filterFn): boolean
            • Parameters

              • node: any
              • tagName: any
              • maxDepth: any
              • filterFn: any

              Returns boolean

              Boolean

              -
          _hasChildBlockElement: ((element: any) => any) = ...

          Determine whether element has any children block level elements.

          -
          _hasSingleTagInsideElement: ((element: any, tag: any) => boolean) = ...

          Check if this node has only whitespace and a single element with given tag +

          _hasChildBlockElement: ((element: any) => any) = ...

          Determine whether element has any children block level elements.

          +
          _hasSingleTagInsideElement: ((element: any, tag: any) => boolean) = ...

          Check if this node has only whitespace and a single element with given tag. Returns false if the DIV node contains non-empty text nodes or if it contains no element with given tag or more than 1 element.

          -
          _headerDuplicatesTitle: ((node: any) => boolean) = ...

          Check if this node is an H1 or H2 element whose content is mostly +

          _headerDuplicatesTitle: ((node: any) => boolean) = ...

          Check if this node is an H1 or H2 element whose content is mostly the same as the article title.

          Type declaration

            • (node): boolean
            • Parameters

              • node: any

              Returns boolean

              boolean indicating whether this is a title-like header.

              -
          _initializeNode: ((node: any) => void) = ...

          Initialize a node with the readability object. Also checks the +

          _initializeNode: ((node: any) => void) = ...

          Initialize a node with the readability object. Also checks the className/id for special names to add to its score.

          Type declaration

            • (node): void
            • Parameters

              • node: any

              Returns void

              void

              -
          _isElementWithoutContent: ((node: any) => boolean) = ...
          _isPhrasingContent: ((node: any) => any) = ...

          Determine if a node qualifies as phrasing content. +

          _isElementWithoutContent: ((node: any) => boolean) = ...
          _isPhrasingContent: ((node: any) => any) = ...
          _isProbablyVisible: ((node: any) => boolean) = ...
          _isSingleImage: ((node: any) => any) = ...

          Check if node is image, or if node contains exactly only one image +

          _isProbablyVisible: ((node: any) => boolean) = ...
          _isSingleImage: ((node: any) => any) = ...

          Check if node is an image, or if node contains exactly one image, whether as a direct child or as a descendant.

          -
          _isValidByline: ((byline: any) => boolean) = ...

          Check whether the input string could be a byline. +

          _isValidByline: ((byline: any) => boolean) = ...

          Check whether the input string could be a byline. This verifies that the input is a string, and that the length is less than 100 chars.

          Type declaration

            • (byline): boolean
            • Parameters

              • byline: any

              Returns boolean

              Boolean - whether the input string is a byline.

              -
          _isWhitespace: ((node: any) => boolean) = ...
          _keepClasses: boolean
          _markDataTables: ((root: any) => void) = ...

          Look for 'data' (as opposed to 'layout') tables, for which we use +

          _isWhitespace: ((node: any) => boolean) = ...
          _keepClasses: boolean
          _markDataTables: ((root: any) => void) = ...

          Look for 'data' (as opposed to 'layout') tables, for which we use similar checks as https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19

          -
          _maxElemsToParse: any
          _nbTopCandidates: any
          _nextNode: ((node: any) => any) = ...

          Finds the next node, starting from the given node, and ignoring +

          _maxElemsToParse: any
          _nbTopCandidates: any
          _nextNode: ((node: any) => any) = ...

          Finds the next node, starting from the given node, and ignoring whitespace in between. If the given node is an element, the same node is returned.

          -
          _postProcessContent: ((articleContent: any) => void) = ...

          Run any post-process modifications to article content as necessary.

          +
          _postProcessContent: ((articleContent: any) => void) = ...

          Run any post-process modifications to article content as necessary.

          Type declaration

            • (articleContent): void
            • Parameters

              • articleContent: any

              Returns void

              void

              -
          _prepArticle: ((articleContent: any) => void) = ...

          Prepare the article node for display. Clean out any inline styles, +

          _prepArticle: ((articleContent: any) => void) = ...

          Prepare the article node for display. Clean out any inline styles, iframes, forms, strip extraneous `<p>` tags, etc.

          Type declaration

            • (articleContent): void
            • Parameters

              • articleContent: any

              Returns void

              void

              -
          _prepDocument: (() => void) = ...

          Prepare the HTML document for readability to scrape it. +

          _prepDocument: (() => void) = ...

          Prepare the HTML document for readability to scrape it. This includes things like stripping javascript, CSS, and handling terrible markup.

          Type declaration

            • (): void
            • Returns void

              void

              -
          _removeAndGetNext: ((node: any) => any) = ...
          _removeFlag: typeof __function = ...
          _removeNodes: ((nodeList: any, filterFn: any) => void) = ...

          Iterates over a NodeList, calls filterFn for each node and removes node +

          _removeAndGetNext: ((node: any) => any) = ...
          _removeFlag: typeof __function = ...
          _removeNodes: ((nodeList: any, filterFn: any) => void) = ...

          Iterates over a NodeList, calls filterFn for each node and removes node if function returned true.

          If function is not passed, removes all the nodes in node list.

          Type declaration

            • (nodeList, filterFn): void
            • Parameters

              • nodeList: any
              • filterFn: any

              Returns void

              void

              -
          _removeScripts: ((doc: any) => void) = ...

          Removes script tags from the document.

          -
          _replaceBrs: ((elem: any) => void) = ...

          Replaces 2 or more successive
          elements with a single

          . +

          _removeScripts: ((doc: any) => void) = ...

          Removes script tags from the document.

          +
          _replaceBrs: ((elem: any) => void) = ...

          Replaces 2 or more successive `<br>` elements with a single `<p>`. Whitespace between `<br>` elements is ignored. For example:

          foo
          bar


          abc
          will become: -
          foo
          bar

          abc

          _replaceNodeTags: ((nodeList: any, newTagName: any) => void) = ...

          Iterates over a NodeList, and calls _setNodeTag for each node.

          +
          foo
          bar

          abc

          _replaceNodeTags: ((nodeList: any, newTagName: any) => void) = ...

          Iterates over a NodeList, and calls _setNodeTag for each node.

          Type declaration

            • (nodeList, newTagName): void
            • Parameters

              • nodeList: any
              • newTagName: any

              Returns void

              void

              -
          _serializer: any
          _setNodeTag: ((node: any, tag: any) => any) = ...
          _simplifyNestedElements: ((articleContent: any) => void) = ...
          _someNode: ((nodeList: any, fn: any) => any) = ...

          Iterate over a NodeList, return true if any of the provided iterate +

          _serializer: any
          _setNodeTag: ((node: any, tag: any) => any) = ...
          _simplifyNestedElements: ((articleContent: any) => void) = ...
          _someNode: ((nodeList: any, fn: any) => any) = ...

          Iterate over a NodeList, return true if any of the provided iterate function calls returns true, false otherwise.

          For convenience, the current object context is applied to the provided iterate function.

          Type declaration

            • (nodeList, fn): any
            • Parameters

              • nodeList: any
              • fn: any

              Returns any

              Boolean

              -
          _textSimilarity: ((textA: any, textB: any) => number) = ...
          _unescapeHtmlEntities: ((str: string) => string) = ...

          Converts some of the common HTML entities in string to their corresponding characters.

          +
          _textSimilarity: ((textA: any, textB: any) => number) = ...
          _unescapeHtmlEntities: ((str: string) => string) = ...

          Converts some of the common HTML entities in string to their corresponding characters.

          Type declaration

            • (str): string
            • Parameters

              • str: string

                {string} - a string to unescape.

              Returns string

              string without HTML entity.

              -
          _unwrapNoscriptImages: ((doc: any) => void) = ...

          Find all

          _unwrapNoscriptImages: ((doc: any) => void) = ...

          Find all

          -
          log: ((...args: any[]) => void)
          parse: typeof __function = ...

          Runs readability.

          +
          log: ((...args: any) => void)
          parse: typeof __function = ...

          Runs readability.

          Workflow:

          1. Prep the document by removing script tags, css, etc.
          2. @@ -224,8 +261,8 @@
          -

          void

          -
          prototype: {
              ALTER_TO_DIV_EXCEPTIONS: string[];
              CLASSES_TO_PRESERVE: string[];
              DEFAULT_CHAR_THRESHOLD: number;
              DEFAULT_MAX_ELEMS_TO_PARSE: number;
              DEFAULT_N_TOP_CANDIDATES: number;
              DEFAULT_TAGS_TO_SCORE: string[];
              DEPRECATED_SIZE_ATTRIBUTE_ELEMS: string[];
              DIV_TO_P_ELEMS: Set<string>;
              ELEMENT_NODE: number;
              FLAG_CLEAN_CONDITIONALLY: number;
              FLAG_STRIP_UNLIKELYS: number;
              FLAG_WEIGHT_CLASSES: number;
              HTML_ESCAPE_MAP: {
                  amp: string;
                  apos: string;
                  gt: string;
                  lt: string;
                  quot: string;
              };
              PHRASING_ELEMS: string[];
              PRESENTATIONAL_ATTRIBUTES: string[];
              REGEXPS: {
                  b64DataUrl: RegExp;
                  byline: RegExp;
                  extraneous: RegExp;
                  hasContent: RegExp;
                  hashUrl: RegExp;
                  jsonLdArticleTypes: RegExp;
                  negative: RegExp;
                  nextLink: RegExp;
                  normalize: RegExp;
                  okMaybeItsACandidate: RegExp;
                  positive: RegExp;
                  prevLink: RegExp;
                  replaceFonts: RegExp;
                  shareElements: RegExp;
                  srcsetUrl: RegExp;
                  tokenize: RegExp;
                  unlikelyCandidates: RegExp;
                  videos: RegExp;
                  whitespace: RegExp;
              };
              TEXT_NODE: number;
              UNLIKELY_ROLES: string[];
              _checkByline: typeof __function;
              _clean: ((e: any, tag: any) => void);
              _cleanClasses: ((node: any) => void);
              _cleanConditionally: ((e: any, tag: any) => void);
              _cleanHeaders: ((e: any) => void);
              _cleanMatchedNodes: ((e: any, filter: any) => void);
              _cleanStyles: ((e: any) => void);
              _concatNodeLists: ((...args: any[]) => any);
              _everyNode: ((nodeList: any, fn: any) => any);
              _findNode: ((nodeList: any, fn: any) => any);
              _fixLazyImages: ((root: any) => void);
              _fixRelativeUris: ((articleContent: any) => void);
              _flagIsActive: ((flag: any) => boolean);
              _forEachNode: ((nodeList: any, fn: any) => void);
              _getAllNodesWithTag: ((node: any, tagNames: any) => any);
              _getArticleMetadata: ((jsonld: any) => {
                  byline: any;
                  excerpt: any;
                  siteName: any;
                  title: any;
              });
              _getArticleTitle: (() => string);
              _getCharCount: ((e: any, s: any) => number);
              _getClassWeight: ((e: any) => number);
              _getInnerText: ((e: any, normalizeSpaces: any) => any);
              _getJSONLD: ((doc: any) => {});
              _getLinkDensity: ((element: any) => number);
              _getNextNode: ((node: any, ignoreSelfAndKids: any) => any);
              _getNodeAncestors: ((node: any, maxDepth: any) => any[]);
              _getRowAndColumnCount: ((table: any) => {
                  columns: number;
                  rows: number;
              });
              _getTextDensity: ((e: any, tags: any) => number);
              _grabArticle: typeof __function;
              _hasAncestorTag: ((node: any, tagName: any, maxDepth: any, filterFn: any) => boolean);
              _hasChildBlockElement: ((element: any) => any);
              _hasSingleTagInsideElement: ((element: any, tag: any) => boolean);
              _headerDuplicatesTitle: ((node: any) => boolean);
              _initializeNode: ((node: any) => void);
              _isElementWithoutContent: ((node: any) => boolean);
              _isPhrasingContent: ((node: any) => any);
              _isProbablyVisible: ((node: any) => boolean);
              _isSingleImage: ((node: any) => any);
              _isValidByline: ((byline: any) => boolean);
              _isWhitespace: ((node: any) => boolean);
              _markDataTables: ((root: any) => void);
              _nextNode: ((node: any) => any);
              _postProcessContent: ((articleContent: any) => void);
              _prepArticle: ((articleContent: any) => void);
              _prepDocument: (() => void);
              _removeAndGetNext: ((node: any) => any);
              _removeFlag: typeof __function;
              _removeNodes: ((nodeList: any, filterFn: any) => void);
              _removeScripts: ((doc: any) => void);
              _replaceBrs: ((elem: any) => void);
              _replaceNodeTags: ((nodeList: any, newTagName: any) => void);
              _setNodeTag: ((node: any, tag: any) => any);
              _simplifyNestedElements: ((articleContent: any) => void);
              _someNode: ((nodeList: any, fn: any) => any);
              _textSimilarity: ((textA: any, textB: any) => number);
              _unescapeHtmlEntities: ((str: string) => string);
              _unwrapNoscriptImages: ((doc: any) => void);
              parse: typeof __function;
          }

          Type declaration

          • ALTER_TO_DIV_EXCEPTIONS: string[]
          • CLASSES_TO_PRESERVE: string[]
          • DEFAULT_CHAR_THRESHOLD: number
          • DEFAULT_MAX_ELEMS_TO_PARSE: number
          • DEFAULT_N_TOP_CANDIDATES: number
          • DEFAULT_TAGS_TO_SCORE: string[]
          • DEPRECATED_SIZE_ATTRIBUTE_ELEMS: string[]
          • DIV_TO_P_ELEMS: Set<string>
          • ELEMENT_NODE: number
          • FLAG_CLEAN_CONDITIONALLY: number
          • FLAG_STRIP_UNLIKELYS: number
          • FLAG_WEIGHT_CLASSES: number
          • HTML_ESCAPE_MAP: {
                amp: string;
                apos: string;
                gt: string;
                lt: string;
                quot: string;
            }
            • amp: string
            • apos: string
            • gt: string
            • lt: string
            • quot: string
          • PHRASING_ELEMS: string[]
          • PRESENTATIONAL_ATTRIBUTES: string[]
          • REGEXPS: {
                b64DataUrl: RegExp;
                byline: RegExp;
                extraneous: RegExp;
                hasContent: RegExp;
                hashUrl: RegExp;
                jsonLdArticleTypes: RegExp;
                negative: RegExp;
                nextLink: RegExp;
                normalize: RegExp;
                okMaybeItsACandidate: RegExp;
                positive: RegExp;
                prevLink: RegExp;
                replaceFonts: RegExp;
                shareElements: RegExp;
                srcsetUrl: RegExp;
                tokenize: RegExp;
                unlikelyCandidates: RegExp;
                videos: RegExp;
                whitespace: RegExp;
            }
            • b64DataUrl: RegExp
            • byline: RegExp
            • extraneous: RegExp
            • hasContent: RegExp
            • hashUrl: RegExp
            • jsonLdArticleTypes: RegExp
            • negative: RegExp
            • nextLink: RegExp
            • normalize: RegExp
            • okMaybeItsACandidate: RegExp
            • positive: RegExp
            • prevLink: RegExp
            • replaceFonts: RegExp
            • shareElements: RegExp
            • srcsetUrl: RegExp
            • tokenize: RegExp
            • unlikelyCandidates: RegExp
            • videos: RegExp
            • whitespace: RegExp
          • TEXT_NODE: number
          • UNLIKELY_ROLES: string[]
          • _checkByline: typeof __function
          • _clean: ((e: any, tag: any) => void)

            Clean a node of all elements of type "tag". +

            void

            +
          prototype: {
              ALTER_TO_DIV_EXCEPTIONS: string[];
              CLASSES_TO_PRESERVE: string[];
              DEFAULT_CHAR_THRESHOLD: number;
              DEFAULT_MAX_ELEMS_TO_PARSE: number;
              DEFAULT_N_TOP_CANDIDATES: number;
              DEFAULT_TAGS_TO_SCORE: string[];
              DEPRECATED_SIZE_ATTRIBUTE_ELEMS: string[];
              DIV_TO_P_ELEMS: Set<string>;
              ELEMENT_NODE: number;
              FLAG_CLEAN_CONDITIONALLY: number;
              FLAG_STRIP_UNLIKELYS: number;
              FLAG_WEIGHT_CLASSES: number;
              HTML_ESCAPE_MAP: {
                  amp: string;
                  apos: string;
                  gt: string;
                  lt: string;
                  quot: string;
              };
              PHRASING_ELEMS: string[];
              PRESENTATIONAL_ATTRIBUTES: string[];
              REGEXPS: {
                  b64DataUrl: RegExp;
                  byline: RegExp;
                  extraneous: RegExp;
                  hasContent: RegExp;
                  hashUrl: RegExp;
                  jsonLdArticleTypes: RegExp;
                  negative: RegExp;
                  nextLink: RegExp;
                  normalize: RegExp;
                  okMaybeItsACandidate: RegExp;
                  positive: RegExp;
                  prevLink: RegExp;
                  replaceFonts: RegExp;
                  shareElements: RegExp;
                  srcsetUrl: RegExp;
                  tokenize: RegExp;
                  unlikelyCandidates: RegExp;
                  videos: RegExp;
                  whitespace: RegExp;
              };
              TEXT_NODE: number;
              UNLIKELY_ROLES: string[];
              _checkByline: typeof __function;
              _clean: ((e: any, tag: any) => void);
              _cleanClasses: ((node: any) => void);
              _cleanConditionally: ((e: any, tag: any) => void);
              _cleanHeaders: ((e: any) => void);
              _cleanMatchedNodes: ((e: any, filter: any) => void);
              _cleanStyles: ((e: any) => void);
              _concatNodeLists: ((...args: any) => any);
              _everyNode: ((nodeList: any, fn: any) => any);
              _findNode: ((nodeList: any, fn: any) => any);
              _fixLazyImages: ((root: any) => void);
              _fixRelativeUris: ((articleContent: any) => void);
              _flagIsActive: ((flag: any) => boolean);
              _forEachNode: ((nodeList: any, fn: any) => void);
              _getAllNodesWithTag: ((node: any, tagNames: any) => any);
              _getArticleMetadata: ((jsonld: any) => {
                  byline: any;
                  excerpt: any;
                  siteName: any;
                  title: any;
              });
              _getArticleTitle: (() => string);
              _getCharCount: ((e: any, s: any) => number);
              _getClassWeight: ((e: any) => number);
              _getInnerText: ((e: any, normalizeSpaces: any) => any);
              _getJSONLD: ((doc: any) => {});
              _getLinkDensity: ((element: any) => number);
              _getNextNode: ((node: any, ignoreSelfAndKids: any) => any);
              _getNodeAncestors: ((node: any, maxDepth: any) => any[]);
              _getRowAndColumnCount: ((table: any) => {
                  columns: number;
                  rows: number;
              });
              _getTextDensity: ((e: any, tags: any) => number);
              _grabArticle: typeof __function;
              _hasAncestorTag: ((node: any, tagName: any, maxDepth: any, filterFn: any) => boolean);
              _hasChildBlockElement: ((element: any) => any);
              _hasSingleTagInsideElement: ((element: any, tag: any) => boolean);
              _headerDuplicatesTitle: ((node: any) => boolean);
              _initializeNode: ((node: any) => void);
              _isElementWithoutContent: ((node: any) => boolean);
              _isPhrasingContent: ((node: any) => any);
              _isProbablyVisible: ((node: any) => boolean);
              _isSingleImage: ((node: any) => any);
              _isValidByline: ((byline: any) => boolean);
              _isWhitespace: ((node: any) => boolean);
              _markDataTables: ((root: any) => void);
              _nextNode: ((node: any) => any);
              _postProcessContent: ((articleContent: any) => void);
              _prepArticle: ((articleContent: any) => void);
              _prepDocument: (() => void);
              _removeAndGetNext: ((node: any) => any);
              _removeFlag: typeof __function;
              _removeNodes: ((nodeList: any, filterFn: any) => void);
              _removeScripts: ((doc: any) => void);
              _replaceBrs: ((elem: any) => void);
              _replaceNodeTags: ((nodeList: any, newTagName: any) => void);
              _setNodeTag: ((node: any, tag: any) => any);
              _simplifyNestedElements: ((articleContent: any) => void);
              _someNode: ((nodeList: any, fn: any) => any);
              _textSimilarity: ((textA: any, textB: any) => number);
              _unescapeHtmlEntities: ((str: string) => string);
              _unwrapNoscriptImages: ((doc: any) => void);
              parse: typeof __function;
          }

          Type declaration

          • ALTER_TO_DIV_EXCEPTIONS: string[]
          • CLASSES_TO_PRESERVE: string[]
          • DEFAULT_CHAR_THRESHOLD: number
          • DEFAULT_MAX_ELEMS_TO_PARSE: number
          • DEFAULT_N_TOP_CANDIDATES: number
          • DEFAULT_TAGS_TO_SCORE: string[]
          • DEPRECATED_SIZE_ATTRIBUTE_ELEMS: string[]
          • DIV_TO_P_ELEMS: Set<string>
          • ELEMENT_NODE: number
          • FLAG_CLEAN_CONDITIONALLY: number
          • FLAG_STRIP_UNLIKELYS: number
          • FLAG_WEIGHT_CLASSES: number
          • HTML_ESCAPE_MAP: {
                amp: string;
                apos: string;
                gt: string;
                lt: string;
                quot: string;
            }
            • amp: string
            • apos: string
            • gt: string
            • lt: string
            • quot: string
          • PHRASING_ELEMS: string[]
          • PRESENTATIONAL_ATTRIBUTES: string[]
          • REGEXPS: {
                b64DataUrl: RegExp;
                byline: RegExp;
                extraneous: RegExp;
                hasContent: RegExp;
                hashUrl: RegExp;
                jsonLdArticleTypes: RegExp;
                negative: RegExp;
                nextLink: RegExp;
                normalize: RegExp;
                okMaybeItsACandidate: RegExp;
                positive: RegExp;
                prevLink: RegExp;
                replaceFonts: RegExp;
                shareElements: RegExp;
                srcsetUrl: RegExp;
                tokenize: RegExp;
                unlikelyCandidates: RegExp;
                videos: RegExp;
                whitespace: RegExp;
            }
            • b64DataUrl: RegExp
            • byline: RegExp
            • extraneous: RegExp
            • hasContent: RegExp
            • hashUrl: RegExp
            • jsonLdArticleTypes: RegExp
            • negative: RegExp
            • nextLink: RegExp
            • normalize: RegExp
            • okMaybeItsACandidate: RegExp
            • positive: RegExp
            • prevLink: RegExp
            • replaceFonts: RegExp
            • shareElements: RegExp
            • srcsetUrl: RegExp
            • tokenize: RegExp
            • unlikelyCandidates: RegExp
            • videos: RegExp
            • whitespace: RegExp
          • TEXT_NODE: number
          • UNLIKELY_ROLES: string[]
          • _checkByline: typeof __function
          • _clean: ((e: any, tag: any) => void)

            Clean a node of all elements of type "tag". (Unless it's a youtube/vimeo video. People love movies.)

              • (e, tag): void
              • Parameters

                • e: any
                • tag: any

                Returns void

                void

          • _cleanClasses: ((node: any) => void)

            Removes the class="" attribute from every element in the given @@ -242,8 +279,8 @@

        • _cleanStyles: ((e: any) => void)
        • _concatNodeLists: ((...args: any[]) => any)
        • _concatNodeLists: ((...args: any) => any)
        • _everyNode: ((nodeList: any, fn: any) => any)
        • _getTextDensity: ((e: any, tags: any) => number)
        • _grabArticle: typeof __function

          a document to run upon. Needs to be a full document, complete with body.

          -

          Element

          +

          Element

        • _hasAncestorTag: ((node: any, tagName: any, maxDepth: any, filterFn: any) => boolean)

          Check if a given node has one of its ancestor tag name matching the provided one.

        • void

          -
          diff --git a/docs/classes/src_trafilatura_extractor_settings.Document.html b/docs/classes/src_trafilatura_extractor_settings.Document.html new file mode 100644 index 0000000..fce199c --- /dev/null +++ b/docs/classes/src_trafilatura_extractor_settings.Document.html @@ -0,0 +1,30 @@ +Document | ai-research-agent

          Constructors

          • Returns Document

          Properties

          author: any
          body: any
          categories: any
          comments: any
          commentsbody: any
          date: any
          description: any
          filedate: any
          fingerprint: any
          hostname: any
          id: any
          image: any
          language: any
          license: any
          pagetype: any
          rawText: any
          sitename: any
          tags: any
          text: any
          title: any
          url: any

          Methods

          • Returns Document

          • Returns void

          • Parameters

            • attributes: any

            Returns void

          • Parameters

            • data: any

            Returns Document

          diff --git a/docs/classes/src_trafilatura_extractor_settings.Extractor.html b/docs/classes/src_trafilatura_extractor_settings.Extractor.html new file mode 100644 index 0000000..8815d32 --- /dev/null +++ b/docs/classes/src_trafilatura_extractor_settings.Extractor.html @@ -0,0 +1,27 @@ +Extractor | ai-research-agent

          Constructors

          • Parameters

            • __namedParameters: {
                  authorBlacklist: any;
                  comments: boolean;
                  config: {
                      COOKIE: string;
                      DOWNLOAD_TIMEOUT: string;
                      EXTENSIVE_DATE_SEARCH: string;
                      EXTERNAL_URLS: string;
                      EXTRACTION_TIMEOUT: string;
                      MAX_FILE_SIZE: string;
                      MAX_REDIRECTS: string;
                      MAX_REPETITIONS: string;
                      MIN_DUPLCHECK_SIZE: string;
                      MIN_EXTRACTED_COMM_SIZE: string;
                      MIN_EXTRACTED_SIZE: string;
                      MIN_FILE_SIZE: string;
                      MIN_OUTPUT_COMM_SIZE: string;
                      MIN_OUTPUT_SIZE: string;
                      SLEEP_TIME: string;
                      USER_AGENTS: string;
                  };
                  dateParams: any;
                  dedup: boolean;
                  fast: boolean;
                  formatting: boolean;
                  images: boolean;
                  lang: any;
                  links: boolean;
                  maxTreeSize: any;
                  onlyWithMetadata: boolean;
                  outputFormat: string;
                  precision: boolean;
                  recall: boolean;
                  source: any;
                  tables: boolean;
                  teiValidation: boolean;
                  url: any;
                  urlBlacklist: any;
                  withMetadata: boolean;
              } = {}
              • authorBlacklist: any
              • comments: boolean
              • config: {
                    COOKIE: string;
                    DOWNLOAD_TIMEOUT: string;
                    EXTENSIVE_DATE_SEARCH: string;
                    EXTERNAL_URLS: string;
                    EXTRACTION_TIMEOUT: string;
                    MAX_FILE_SIZE: string;
                    MAX_REDIRECTS: string;
                    MAX_REPETITIONS: string;
                    MIN_DUPLCHECK_SIZE: string;
                    MIN_EXTRACTED_COMM_SIZE: string;
                    MIN_EXTRACTED_SIZE: string;
                    MIN_FILE_SIZE: string;
                    MIN_OUTPUT_COMM_SIZE: string;
                    MIN_OUTPUT_SIZE: string;
                    SLEEP_TIME: string;
                    USER_AGENTS: string;
                }
                • COOKIE: string
                • DOWNLOAD_TIMEOUT: string
                • EXTENSIVE_DATE_SEARCH: string
                • EXTERNAL_URLS: string
                • EXTRACTION_TIMEOUT: string
                • MAX_FILE_SIZE: string
                • MAX_REDIRECTS: string
                • MAX_REPETITIONS: string
                • MIN_DUPLCHECK_SIZE: string
                • MIN_EXTRACTED_COMM_SIZE: string
                • MIN_EXTRACTED_SIZE: string
                • MIN_FILE_SIZE: string
                • MIN_OUTPUT_COMM_SIZE: string
                • MIN_OUTPUT_SIZE: string
                • SLEEP_TIME: string
                • USER_AGENTS: string
              • dateParams: any
              • dedup: boolean
              • fast: boolean
              • formatting: boolean
              • images: boolean
              • lang: any
              • links: boolean
              • maxTreeSize: any
              • onlyWithMetadata: boolean
              • outputFormat: string
              • precision: boolean
              • recall: boolean
              • source: any
              • tables: boolean
              • teiValidation: boolean
              • url: any
              • urlBlacklist: any
              • withMetadata: boolean

            Returns Extractor

          Properties

          authorBlacklist: any
          comments: boolean
          config: any
          dateParams: any
          dedup: boolean
          fast: boolean
          focus: string
          format: any
          formatting: boolean
          images: boolean
          lang: any
          links: boolean
          maxTreeSize: any
          onlyWithMetadata: boolean
          source: any
          tables: boolean
          teiValidation: boolean
          url: any
          urlBlacklist: any
          withMetadata: any

          Methods

          • Parameters

            • config: any

            Returns void

          • Parameters

            • chosenFormat: any

            Returns void

          diff --git a/docs/classes/src_trafilatura_extractor_spider.CrawlParameters.html b/docs/classes/src_trafilatura_extractor_spider.CrawlParameters.html new file mode 100644 index 0000000..637c760 --- /dev/null +++ b/docs/classes/src_trafilatura_extractor_spider.CrawlParameters.html @@ -0,0 +1,19 @@ +CrawlParameters | ai-research-agent

          Constructors

          • Parameters

            • start: any
            • lang: any = null
            • rules: any = null
            • pruneXpath: any = null

            Returns CrawlParameters

          Properties

          base: string
          i: number
          isOn: boolean
          knownNum: number
          lang: any
          pruneXpath: any
          ref: any
          rules: any
          start: any

          Methods

          • Parameters

            • start: any

            Returns string

          • Parameters

            • start: any

            Returns any

          • Parameters

            • todo: any

            Returns any

          • Parameters

            • link: any

            Returns boolean

          • Parameters

            • urlStore: any

            Returns void

          diff --git a/docs/functions/src_autocomplete_autocomplete.suggestNextWordCompletions.html b/docs/functions/src_autocomplete_autocomplete.suggestNextWordCompletions.html index 0fb66f1..c4ecb1f 100644 --- a/docs/functions/src_autocomplete_autocomplete.suggestNextWordCompletions.html +++ b/docs/functions/src_autocomplete_autocomplete.suggestNextWordCompletions.html @@ -1,7 +1,7 @@ suggestNextWordCompletions | ai-research-agent
          • Completes the query with the most likely next words for phrases. If typing 2+ letters of a word, returns all possible words matching those few letters.

            Parameters

            • query: string

              The input query, which can be partial words or phrases.

              -
            • options: {
                  limitMaxResults: number;
                  numberOfLastWordsToCheck: number;
                  phrasesModel: any;
              } = {}
              • limitMaxResults: number

                default=10 - The maximum number of autocomplete suggestions to return.

                +
              • Optionaloptions: {
                    limitMaxResults: number;
                    numberOfLastWordsToCheck: number;
                    phrasesModel: any;
                } = {}
                • limitMaxResults: number

                  default=10 - The maximum number of autocomplete suggestions to return.

                • numberOfLastWordsToCheck: number

                  default=5 - The number of last words in the query to check for phrase completions.

                • phrasesModel: any

                  A custom phrases model to use for autocomplete suggestions.

            Returns Promise<any[]>

            An array of autocomplete suggestions, each containing either a 'phrase' or 'word' property.

            @@ -11,7 +11,7 @@
            // Using options
            const customModel = await import("./custom-phrases-model.json");
            const suggestions = await suggestNextWordCompletions("artificial int", {
            phrasesModel: customModel,
            limitMaxResults: 5,
            numberOfLastWordsToCheck: 3
            });
            // Possible output: [{ phrase: "artificial intelligence" }, { phrase: "artificial interpretation" }]
            -
          diff --git a/docs/functions/src_chrome_extension_allow_cors.modifyChromeAPIAllowCORS.html b/docs/functions/src_chrome_extension_allow_cors.modifyChromeAPIAllowCORS.html index 3d186a2..5b31780 100644 --- a/docs/functions/src_chrome_extension_allow_cors.modifyChromeAPIAllowCORS.html +++ b/docs/functions/src_chrome_extension_allow_cors.modifyChromeAPIAllowCORS.html @@ -10,12 +10,12 @@ -

          Parameters

          diff --git a/docs/functions/src_dataset_import_compile_topic_model.compileTopicModel.html b/docs/functions/src_dataset_import_compile_topic_model.compileTopicModel.html index ae7e7dd..c9b832d 100644 --- a/docs/functions/src_dataset_import_compile_topic_model.compileTopicModel.html +++ b/docs/functions/src_dataset_import_compile_topic_model.compileTopicModel.html @@ -14,13 +14,13 @@ JSON Prefix Trie - arranged by sorting words and phrases for lookup by first word to tokenize by word, then find if it starts a phrase based on entries, for Phrase Extraction from a text.
          There is "unanimous consensus" that Prefix Trie O(1) lookups (instead of having to loop through the index for each lookup) make it the best data type for this task. -

          Parameters

          diff --git a/docs/functions/src_dataset_import_compile_topic_model.weightWikiWordSpecificity.html b/docs/functions/src_dataset_import_compile_topic_model.weightWikiWordSpecificity.html index 7079c10..d8b563a 100644 --- a/docs/functions/src_dataset_import_compile_topic_model.weightWikiWordSpecificity.html +++ b/docs/functions/src_dataset_import_compile_topic_model.weightWikiWordSpecificity.html @@ -11,7 +11,7 @@ Zenodo. https://doi.org/10.5281/zenodo.3631674 https://github.com/SmartDataAnalytics/Wikipedia_TF_IDF_Dataset

          Parameters

          Returns number

          score for term specificity 0-12~

          -
          diff --git a/docs/functions/src_dataset_import_compress_json_jz64.compressBase64ZipText.html b/docs/functions/src_dataset_import_compress_json_jz64.compressBase64ZipText.html index 6246215..5d84537 100644 --- a/docs/functions/src_dataset_import_compress_json_jz64.compressBase64ZipText.html +++ b/docs/functions/src_dataset_import_compress_json_jz64.compressBase64ZipText.html @@ -1,10 +1,13 @@ compressBase64ZipText | ai-research-agent
          • Compress/decompress any data (such as JSON or text) with JSZip then convert zip binary to a Base64Zip text string which is easier to store in db or files.

            Parameters

            • dataOrZip: string

              data to compress, or Base64Zip to decompress

              -
            • options: {
                  compressionLevel: number;
                  decompress: boolean;
              } = {}
              • compressionLevel: number

                default=9 0-9, 9 has smallest size at ~40% but takes longer

                +
              • Optionaloptions: {
                    compressionLevel: number;
                    decompress: boolean;
                } = {}
                  +
                • +
                +
                • compressionLevel: number

                  default=9 0-9, 9 has smallest size at ~40% but takes longer

                • decompress: boolean

                  default=false false to compress, true to decompress

            Returns Promise<string>

            base64-encoded string of the zipped data

            -
          diff --git a/docs/functions/src_dataset_import_dictionary_import.importDictionary.html b/docs/functions/src_dataset_import_dictionary_import.importDictionary.html index 2d2a1f8..d93a0a3 100644 --- a/docs/functions/src_dataset_import_dictionary_import.importDictionary.html +++ b/docs/functions/src_dataset_import_dictionary_import.importDictionary.html @@ -10,7 +10,7 @@

          Returns any

          McCrae, J.P., Rademaker, A., Bond, F., Rudnicka, E., and Fellbaum, C. (2019). English WordNet 2019 – An Open-Source WordNet for English. In Proceedings of the 10th Global WordNet Conference – GWC 2019, Wrocław. https://aclanthology.org/2019.gwc-1.31/

          -
          diff --git a/docs/functions/src_dataset_import_human_names_import.importHumanNames.html b/docs/functions/src_dataset_import_human_names_import.importHumanNames.html index 31488fe..b9f12ce 100644 --- a/docs/functions/src_dataset_import_human_names_import.importHumanNames.html +++ b/docs/functions/src_dataset_import_human_names_import.importHumanNames.html @@ -1,7 +1,7 @@ importHumanNames | ai-research-agent
          • Import 92k first and last human names sorted by popularity

            Returns any

            American Registry for Internet Numbers (ARIN), a nonprofit which administers IP addresses, Github @arineng/arincli

            -
          diff --git a/docs/functions/src_dataset_import_misspelled_typos_import.importMisspelledTypos.html b/docs/functions/src_dataset_import_misspelled_typos_import.importMisspelledTypos.html index 6c0d625..69e3c21 100644 --- a/docs/functions/src_dataset_import_misspelled_typos_import.importMisspelledTypos.html +++ b/docs/functions/src_dataset_import_misspelled_typos_import.importMisspelledTypos.html @@ -3,7 +3,7 @@ Total unique typos collected: 7969

          Returns any

          Crowd-sourced often-updated Wikipedia list of common misspellings https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings

          -
          diff --git a/docs/functions/src_dataset_import_quora_import.importCommonQueries.html b/docs/functions/src_dataset_import_quora_import.importCommonQueries.html index cc9e661..2896478 100644 --- a/docs/functions/src_dataset_import_quora_import.importCommonQueries.html +++ b/docs/functions/src_dataset_import_quora_import.importCommonQueries.html @@ -4,7 +4,7 @@ Quora question pairs dataset, great for training a semantic similarity model or Query-Response model

          Returns any

          Quora https://huggingface.co/datasets/BeIR/quora

          -
          diff --git a/docs/functions/src_dataset_import_term_frequency_import.importTermFrequency.html b/docs/functions/src_dataset_import_term_frequency_import.importTermFrequency.html index 5b055d4..4ca4704 100644 --- a/docs/functions/src_dataset_import_term_frequency_import.importTermFrequency.html +++ b/docs/functions/src_dataset_import_term_frequency_import.importTermFrequency.html @@ -10,7 +10,7 @@ Total Articles (Wiki-en-2020): 5,989,879

          Returns any

          Galkin, M., Malykh, V. (2020). Wikipedia TF-IDF Dataset Release (v1.0). Zenodo. https://doi.org/10.5281/zenodo.3631674 https://github.com/SmartDataAnalytics/Wikipedia_TF_IDF_Dataset

          -
          diff --git a/docs/functions/src_dataset_import_wikipage_titles_import.importWikiPageTitles.html b/docs/functions/src_dataset_import_wikipage_titles_import.importWikiPageTitles.html index b6355de..5ca8483 100644 --- a/docs/functions/src_dataset_import_wikipage_titles_import.importWikiPageTitles.html +++ b/docs/functions/src_dataset_import_wikipage_titles_import.importWikiPageTitles.html @@ -1,6 +1,6 @@ importWikiPageTitles | ai-research-agent
          diff --git a/docs/functions/src_extract_content_baseline.baseline.html b/docs/functions/src_extract_content_baseline.baseline.html deleted file mode 100644 index 7f48393..0000000 --- a/docs/functions/src_extract_content_baseline.baseline.html +++ /dev/null @@ -1,4 +0,0 @@ -baseline | ai-research-agent
          • Parameters

            • filecontent: any

            Returns any[]

          diff --git a/docs/functions/src_extract_content_baseline.basicCleaning.html b/docs/functions/src_extract_content_baseline.basicCleaning.html deleted file mode 100644 index e7fafbb..0000000 --- a/docs/functions/src_extract_content_baseline.basicCleaning.html +++ /dev/null @@ -1,4 +0,0 @@ -basicCleaning | ai-research-agent
          • Parameters

            • tree: any

            Returns any

          diff --git a/docs/functions/src_extract_content_baseline.loadHtml.html b/docs/functions/src_extract_content_baseline.loadHtml.html deleted file mode 100644 index a12e056..0000000 --- a/docs/functions/src_extract_content_baseline.loadHtml.html +++ /dev/null @@ -1,4 +0,0 @@ -loadHtml | ai-research-agent
          • Parameters

            • content: any

            Returns any

          diff --git a/docs/functions/src_extract_content_baseline.trim.html b/docs/functions/src_extract_content_baseline.trim.html deleted file mode 100644 index 6051cae..0000000 --- a/docs/functions/src_extract_content_baseline.trim.html +++ /dev/null @@ -1,4 +0,0 @@ -trim | ai-research-agent
          • Parameters

            • text: any

            Returns any

          diff --git a/docs/functions/src_extract_content_core.bareExtraction.html b/docs/functions/src_extract_content_core.bareExtraction.html deleted file mode 100644 index 039943d..0000000 --- a/docs/functions/src_extract_content_core.bareExtraction.html +++ /dev/null @@ -1,4 +0,0 @@ -bareExtraction | ai-research-agent
          • Parameters

            • filecontent: any
            • options: any

            Returns Document

          diff --git a/docs/functions/src_extract_content_core.extract.html b/docs/functions/src_extract_content_core.extract.html deleted file mode 100644 index 47f633f..0000000 --- a/docs/functions/src_extract_content_core.extract.html +++ /dev/null @@ -1,4 +0,0 @@ -extract | ai-research-agent
          • Parameters

            • htmlOrDom: any
            • options: {} = {}

              Returns any

            diff --git a/docs/functions/src_extract_content_courlan_clean.cleanQuery.html b/docs/functions/src_extract_content_courlan_clean.cleanQuery.html deleted file mode 100644 index b67f3db..0000000 --- a/docs/functions/src_extract_content_courlan_clean.cleanQuery.html +++ /dev/null @@ -1,11 +0,0 @@ -cleanQuery | ai-research-agent
            • Strip unwanted query elements

              -

              Parameters

              • querystring: string

                The query string to clean

                -
              • Optionalstrict: boolean = false

                Whether to use strict cleaning

                -
              • Optionallanguage: string = null

                The language to use for cleaning

                -

              Returns string

                -
              • The cleaned query string
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_clean.cleanUrl.html b/docs/functions/src_extract_content_courlan_clean.cleanUrl.html deleted file mode 100644 index 3791360..0000000 --- a/docs/functions/src_extract_content_courlan_clean.cleanUrl.html +++ /dev/null @@ -1,10 +0,0 @@ -cleanUrl | ai-research-agent
            • Helper function: chained scrubbing and normalization

              -

              Parameters

              • url: string

                The URL to clean

                -
              • Optionallanguage: string = null

                The language to use for cleaning

                -

              Returns string

                -
              • The cleaned URL or null if invalid
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_clean.decodePunycode.html b/docs/functions/src_extract_content_courlan_clean.decodePunycode.html deleted file mode 100644 index 51d7028..0000000 --- a/docs/functions/src_extract_content_courlan_clean.decodePunycode.html +++ /dev/null @@ -1,9 +0,0 @@ -decodePunycode | ai-research-agent
            • Probe for punycode in lower-cased hostname and try to decode it

              -

              Parameters

              • string: string

                The string to decode

                -

              Returns string

                -
              • The decoded string
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_clean.normalizeFragment.html b/docs/functions/src_extract_content_courlan_clean.normalizeFragment.html deleted file mode 100644 index 952515b..0000000 --- a/docs/functions/src_extract_content_courlan_clean.normalizeFragment.html +++ /dev/null @@ -1,10 +0,0 @@ -normalizeFragment | ai-research-agent
            • Look for trackers in URL fragments using query analysis, normalize the output

              -

              Parameters

              • fragment: string

                The fragment to normalize

                -
              • Optionallanguage: string = null

                The language to use for normalization

                -

              Returns string

                -
              • The normalized fragment
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_clean.normalizePart.html b/docs/functions/src_extract_content_courlan_clean.normalizePart.html deleted file mode 100644 index 1a4e9ef..0000000 --- a/docs/functions/src_extract_content_courlan_clean.normalizePart.html +++ /dev/null @@ -1,9 +0,0 @@ -normalizePart | ai-research-agent
            • Normalize URLs parts (specifically path and fragment) while accounting for certain characters

              -

              Parameters

              • urlPart: string

                The URL part to normalize

                -

              Returns string

                -
              • The normalized URL part
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_clean.normalizeUrl.html b/docs/functions/src_extract_content_courlan_clean.normalizeUrl.html deleted file mode 100644 index 79f137e..0000000 --- a/docs/functions/src_extract_content_courlan_clean.normalizeUrl.html +++ /dev/null @@ -1,12 +0,0 @@ -normalizeUrl | ai-research-agent
            • Takes a URL string or a parsed URL and returns a normalized URL string

              -

              Parameters

              • parsedUrl: string | URL

                The URL to normalize

                -
              • Optionalstrict: boolean = false

                Whether to use strict normalization

                -
              • Optionallanguage: string = null

                The language to use for normalization

                -
              • OptionaltrailingSlash: boolean = true

                Whether to keep trailing slashes

                -

              Returns string

                -
              • The normalized URL
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_clean.scrubUrl.html b/docs/functions/src_extract_content_courlan_clean.scrubUrl.html deleted file mode 100644 index 8c27a7a..0000000 --- a/docs/functions/src_extract_content_courlan_clean.scrubUrl.html +++ /dev/null @@ -1,9 +0,0 @@ -scrubUrl | ai-research-agent
            • Strip unnecessary parts and make sure only one URL is considered

              -

              Parameters

              • url: string

                The URL to scrub

                -

              Returns string

                -
              • The scrubbed URL
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.basicFilter.html b/docs/functions/src_extract_content_courlan_filters.basicFilter.html deleted file mode 100644 index 57ccae6..0000000 --- a/docs/functions/src_extract_content_courlan_filters.basicFilter.html +++ /dev/null @@ -1,9 +0,0 @@ -basicFilter | ai-research-agent
            • Filter URLs based on basic formal characteristics.

              -

              Parameters

              • url: string

                The URL to filter

                -

              Returns boolean

                -
              • True if the URL passes the basic filter, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.domainFilter.html b/docs/functions/src_extract_content_courlan_filters.domainFilter.html deleted file mode 100644 index 915cddf..0000000 --- a/docs/functions/src_extract_content_courlan_filters.domainFilter.html +++ /dev/null @@ -1,9 +0,0 @@ -domainFilter | ai-research-agent
            • Find invalid domain/host names.

              -

              Parameters

              • domain: string

                The domain to filter

                -

              Returns boolean

                -
              • True if the domain is valid, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.extensionFilter.html b/docs/functions/src_extract_content_courlan_filters.extensionFilter.html deleted file mode 100644 index ada9960..0000000 --- a/docs/functions/src_extract_content_courlan_filters.extensionFilter.html +++ /dev/null @@ -1,9 +0,0 @@ -extensionFilter | ai-research-agent
            • Filter based on file extension.

              -

              Parameters

              • urlpath: string

                The URL path to filter

                -

              Returns boolean

                -
              • True if the URL path passes the extension filter, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.isNavigationPage.html b/docs/functions/src_extract_content_courlan_filters.isNavigationPage.html deleted file mode 100644 index ca25951..0000000 --- a/docs/functions/src_extract_content_courlan_filters.isNavigationPage.html +++ /dev/null @@ -1,9 +0,0 @@ -isNavigationPage | ai-research-agent
            • Determine if the URL is related to navigation and overview pages rather than content pages.

              -

              Parameters

              • url: string

                The URL to check

                -

              Returns boolean

                -
              • True if the URL is a navigation page, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.isNotCrawlable.html b/docs/functions/src_extract_content_courlan_filters.isNotCrawlable.html deleted file mode 100644 index 365164e..0000000 --- a/docs/functions/src_extract_content_courlan_filters.isNotCrawlable.html +++ /dev/null @@ -1,9 +0,0 @@ -isNotCrawlable | ai-research-agent
            • Run tests to check if the URL may lead to deep web or pages generally not usable in a crawling context.

              -

              Parameters

              • url: string

                The URL to check

                -

              Returns boolean

                -
              • True if the URL is not crawlable, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.isValidUrl.html b/docs/functions/src_extract_content_courlan_filters.isValidUrl.html deleted file mode 100644 index 76375f5..0000000 --- a/docs/functions/src_extract_content_courlan_filters.isValidUrl.html +++ /dev/null @@ -1,9 +0,0 @@ -isValidUrl | ai-research-agent
            • Determine if a given string is a valid URL.

              -

              Parameters

              • url: string

                The URL to validate

                -

              Returns boolean

                -
              • True if the URL is valid, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.langFilter.html b/docs/functions/src_extract_content_courlan_filters.langFilter.html deleted file mode 100644 index 67381d3..0000000 --- a/docs/functions/src_extract_content_courlan_filters.langFilter.html +++ /dev/null @@ -1,12 +0,0 @@ -langFilter | ai-research-agent
            • Heuristics targeting internationalization and linguistic elements based on a score.

              -

              Parameters

              • url: string

                The URL to filter

                -
              • language: string = null

                The target language

                -
              • strict: boolean = false

                Whether to use strict filtering

                -
              • trailingSlash: boolean = true

                Whether to consider trailing slashes

                -

              Returns boolean

                -
              • True if the URL passes the language filter, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.pathFilter.html b/docs/functions/src_extract_content_courlan_filters.pathFilter.html deleted file mode 100644 index 2514d7f..0000000 --- a/docs/functions/src_extract_content_courlan_filters.pathFilter.html +++ /dev/null @@ -1,10 +0,0 @@ -pathFilter | ai-research-agent
            • Filters based on URL path: index page, imprint, etc.

              -

              Parameters

              • urlpath: string

                The URL path to filter

                -
              • query: string

                The URL query string

                -

              Returns boolean

                -
              • True if the URL path passes the filter, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.typeFilter.html b/docs/functions/src_extract_content_courlan_filters.typeFilter.html deleted file mode 100644 index 6257353..0000000 --- a/docs/functions/src_extract_content_courlan_filters.typeFilter.html +++ /dev/null @@ -1,11 +0,0 @@ -typeFilter | ai-research-agent
            • Make sure the target URL is from a suitable type (HTML page with primarily text).

              -

              Parameters

              • url: string

                The URL to filter

                -
              • strict: boolean = false

                Whether to use strict filtering

                -
              • withNav: boolean = false

                Whether to include navigation pages

                -

              Returns boolean

                -
              • True if the URL passes the type filter, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_filters.validateUrl.html b/docs/functions/src_extract_content_courlan_filters.validateUrl.html deleted file mode 100644 index dddb9fb..0000000 --- a/docs/functions/src_extract_content_courlan_filters.validateUrl.html +++ /dev/null @@ -1,9 +0,0 @@ -validateUrl | ai-research-agent
            • Parse and validate the input.

              -

              Parameters

              • url: string

                The URL to validate

                -

              Returns [boolean, URL]

                -
              • A tuple of validation result and parsed URL
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_meta.addCache.html b/docs/functions/src_extract_content_courlan_meta.addCache.html deleted file mode 100644 index 8ffdbf0..0000000 --- a/docs/functions/src_extract_content_courlan_meta.addCache.html +++ /dev/null @@ -1,6 +0,0 @@ -addCache | ai-research-agent
            • Add a cache to the set of caches that can be cleared

              -

              Parameters

              • cache: any

                The cache object to add

                -

              Returns void

            diff --git a/docs/functions/src_extract_content_courlan_meta.clearCaches.html b/docs/functions/src_extract_content_courlan_meta.clearCaches.html deleted file mode 100644 index 597135e..0000000 --- a/docs/functions/src_extract_content_courlan_meta.clearCaches.html +++ /dev/null @@ -1,6 +0,0 @@ -clearCaches | ai-research-agent
            • Reset all known caches used to speed up processing. -This may release some memory.

              -

              Returns void

            diff --git a/docs/functions/src_extract_content_courlan_network.redirectionTest.html b/docs/functions/src_extract_content_courlan_network.redirectionTest.html deleted file mode 100644 index 12149ba..0000000 --- a/docs/functions/src_extract_content_courlan_network.redirectionTest.html +++ /dev/null @@ -1,12 +0,0 @@ -redirectionTest | ai-research-agent
            • Test final URL to handle redirects

              -

              Parameters

              • url: string

                URL to check

                -

              Returns Promise<string>

                -
              • The final URL seen
              • -
              -
                -
              • If the URL cannot be reached
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_urlutils.extractDomain.html b/docs/functions/src_extract_content_courlan_urlutils.extractDomain.html deleted file mode 100644 index 5bf9ad7..0000000 --- a/docs/functions/src_extract_content_courlan_urlutils.extractDomain.html +++ /dev/null @@ -1,11 +0,0 @@ -extractDomain | ai-research-agent
            • Extract domain name information using top-level domain info

              -

              Parameters

              • url: string

                The URL to extract domain from

                -
              • Optionalblacklist: Set<string> = ...

                A set of blacklisted domains

                -
              • Optionalfast: boolean = false

                Whether to use a faster, regex-based method

                -

              Returns string

                -
              • The extracted domain or null if blacklisted or invalid
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_urlutils.filterUrls.html b/docs/functions/src_extract_content_courlan_urlutils.filterUrls.html deleted file mode 100644 index 85474d1..0000000 --- a/docs/functions/src_extract_content_courlan_urlutils.filterUrls.html +++ /dev/null @@ -1,10 +0,0 @@ -filterUrls | ai-research-agent
            • Return a list of links corresponding to the given substring pattern

              -

              Parameters

              • linkList: string[]

                The list of links to filter

                -
              • urlfilter: string

                The substring pattern to filter by

                -

              Returns string[]

                -
              • The filtered list of links
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_urlutils.getBaseUrl.html b/docs/functions/src_extract_content_courlan_urlutils.getBaseUrl.html deleted file mode 100644 index 1d1d4c2..0000000 --- a/docs/functions/src_extract_content_courlan_urlutils.getBaseUrl.html +++ /dev/null @@ -1,9 +0,0 @@ -getBaseUrl | ai-research-agent
            • Strip URL of some of its parts to get base URL

              -

              Parameters

              • url: string | URL

                The URL to get the base from

                -

              Returns string

                -
              • The base URL
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_urlutils.getHostAndPath.html b/docs/functions/src_extract_content_courlan_urlutils.getHostAndPath.html deleted file mode 100644 index 69e5033..0000000 --- a/docs/functions/src_extract_content_courlan_urlutils.getHostAndPath.html +++ /dev/null @@ -1,9 +0,0 @@ -getHostAndPath | ai-research-agent
            • Decompose URL in two parts: protocol + host/domain and path

              -

              Parameters

              • url: string | URL

                The URL to decompose

                -

              Returns [string, string]

                -
              • A tuple of hostname and path
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_urlutils.getHostinfo.html b/docs/functions/src_extract_content_courlan_urlutils.getHostinfo.html deleted file mode 100644 index a51877b..0000000 --- a/docs/functions/src_extract_content_courlan_urlutils.getHostinfo.html +++ /dev/null @@ -1,9 +0,0 @@ -getHostinfo | ai-research-agent
            • Convenience function returning domain and host info (protocol + host/domain) from a URL

              -

              Parameters

              • url: string

                The URL to get info from

                -

              Returns [string, string]

                -
              • A tuple of domain name and base URL
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_urlutils.getTldinfo.html b/docs/functions/src_extract_content_courlan_urlutils.getTldinfo.html deleted file mode 100644 index c40d693..0000000 --- a/docs/functions/src_extract_content_courlan_urlutils.getTldinfo.html +++ /dev/null @@ -1,10 +0,0 @@ -getTldinfo | ai-research-agent
            • Cached function to extract top-level domain info

              -

              Parameters

              • url: string

                The URL to extract domain info from

                -
              • fast: boolean = false

                Whether to use a faster, regex-based method

                -

              Returns [string, string]

                -
              • A tuple of domain and full domain
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_urlutils.isExternal.html b/docs/functions/src_extract_content_courlan_urlutils.isExternal.html deleted file mode 100644 index 6080f00..0000000 --- a/docs/functions/src_extract_content_courlan_urlutils.isExternal.html +++ /dev/null @@ -1,11 +0,0 @@ -isExternal | ai-research-agent
            • Determine if a link leads to another host

              -

              Parameters

              • url: string

                The URL to check

                -
              • reference: string

                The reference URL

                -
              • OptionalignoreSuffix: boolean = true

                Whether to ignore the suffix in comparison

                -

              Returns boolean

                -
              • True if the URL is external, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_courlan_urlutils.isKnownLink.html b/docs/functions/src_extract_content_courlan_urlutils.isKnownLink.html deleted file mode 100644 index e3e41a8..0000000 --- a/docs/functions/src_extract_content_courlan_urlutils.isKnownLink.html +++ /dev/null @@ -1,10 +0,0 @@ -isKnownLink | ai-research-agent
            • Compare the link and its possible variants to the existing URL base

              -

              Parameters

              • link: string

                The link to check

                -
              • knownLinks: Set<string>

                The set of known links

                -

              Returns boolean

                -
              • True if the link is known, false otherwise
              • -
              -
            diff --git a/docs/functions/src_extract_content_deduplication.contentFingerprint.html b/docs/functions/src_extract_content_deduplication.contentFingerprint.html deleted file mode 100644 index a203b5b..0000000 --- a/docs/functions/src_extract_content_deduplication.contentFingerprint.html +++ /dev/null @@ -1,4 +0,0 @@ -contentFingerprint | ai-research-agent
            • Parameters

              • content: any

              Returns string

            diff --git a/docs/functions/src_extract_content_deduplication.isSimilarDomain.html b/docs/functions/src_extract_content_deduplication.isSimilarDomain.html deleted file mode 100644 index 19cd93c..0000000 --- a/docs/functions/src_extract_content_deduplication.isSimilarDomain.html +++ /dev/null @@ -1,4 +0,0 @@ -isSimilarDomain | ai-research-agent
            • Parameters

              • reference: any
              • newString: any
              • threshold: number = 0.5

              Returns boolean

            diff --git a/docs/functions/src_extract_content_downloads.addToCompressedDict.html b/docs/functions/src_extract_content_downloads.addToCompressedDict.html deleted file mode 100644 index e037e66..0000000 --- a/docs/functions/src_extract_content_downloads.addToCompressedDict.html +++ /dev/null @@ -1,4 +0,0 @@ -addToCompressedDict | ai-research-agent
            • Parameters

              • inputList: any
              • blacklist: any = null
              • urlFilter: any = null
              • urlStore: any = null
              • compression: boolean = false
              • verbose: boolean = false

              Returns any

            diff --git a/docs/functions/src_extract_content_downloads.bufferedDownloads.html b/docs/functions/src_extract_content_downloads.bufferedDownloads.html deleted file mode 100644 index ff7a94a..0000000 --- a/docs/functions/src_extract_content_downloads.bufferedDownloads.html +++ /dev/null @@ -1,4 +0,0 @@ -bufferedDownloads | ai-research-agent
            • Parameters

              • bufferList: any
              • downloadThreads: any
              • decode: boolean = true
              • options: any = null

              Returns AsyncGenerator<any[], void, unknown>

            diff --git a/docs/functions/src_extract_content_downloads.fetchResponse.html b/docs/functions/src_extract_content_downloads.fetchResponse.html deleted file mode 100644 index 8dd7ae3..0000000 --- a/docs/functions/src_extract_content_downloads.fetchResponse.html +++ /dev/null @@ -1,4 +0,0 @@ -fetchResponse | ai-research-agent
            • Parameters

              • url: any
              • __namedParameters: {
                    config: {
                        COOKIE: any;
                        DOWNLOAD_TIMEOUT: number;
                        MAX_REDIRECTS: number;
                        USER_AGENTS: string[];
                    };
                    decode: boolean;
                    noSsl: boolean;
                    withHeaders: boolean;
                } = {}
                • config: {
                      COOKIE: any;
                      DOWNLOAD_TIMEOUT: number;
                      MAX_REDIRECTS: number;
                      USER_AGENTS: string[];
                  }
                  • COOKIE: any
                  • DOWNLOAD_TIMEOUT: number
                  • MAX_REDIRECTS: number
                  • USER_AGENTS: string[]
                • decode: boolean
                • noSsl: boolean
                • withHeaders: boolean

              Returns Promise<Response>

            diff --git a/docs/functions/src_extract_content_downloads.fetchUrl.html b/docs/functions/src_extract_content_downloads.fetchUrl.html deleted file mode 100644 index 0b957e8..0000000 --- a/docs/functions/src_extract_content_downloads.fetchUrl.html +++ /dev/null @@ -1,4 +0,0 @@ -fetchUrl | ai-research-agent
            • Parameters

              • url: any
              • decode: boolean = true
              • noSsl: boolean = false
              • config: {
                    COOKIE: any;
                    DOWNLOAD_TIMEOUT: number;
                    MAX_REDIRECTS: number;
                    USER_AGENTS: string[];
                } = DEFAULT_CONFIG
                • COOKIE: any
                • DOWNLOAD_TIMEOUT: number
                • MAX_REDIRECTS: number
                • USER_AGENTS: string[]
              • options: any = null

              Returns Promise<any>

            diff --git a/docs/functions/src_extract_content_downloads.loadDownloadBuffer.html b/docs/functions/src_extract_content_downloads.loadDownloadBuffer.html deleted file mode 100644 index 5cd8340..0000000 --- a/docs/functions/src_extract_content_downloads.loadDownloadBuffer.html +++ /dev/null @@ -1,4 +0,0 @@ -loadDownloadBuffer | ai-research-agent
            • Parameters

              • urlStore: any
              • sleepTime: number = 5000

              Returns Promise<any[]>

            diff --git a/docs/functions/src_extract_content_external.compareExtraction.html b/docs/functions/src_extract_content_external.compareExtraction.html deleted file mode 100644 index 43ab002..0000000 --- a/docs/functions/src_extract_content_external.compareExtraction.html +++ /dev/null @@ -1,4 +0,0 @@ -compareExtraction | ai-research-agent
            • Parameters

              • tree: any
              • backupTree: any
              • body: any
              • text: any
              • lenText: any
              • options: any

              Returns any[]

            diff --git a/docs/functions/src_extract_content_external.justextRescue.html b/docs/functions/src_extract_content_external.justextRescue.html deleted file mode 100644 index 8417ae8..0000000 --- a/docs/functions/src_extract_content_external.justextRescue.html +++ /dev/null @@ -1,4 +0,0 @@ -justextRescue | ai-research-agent
            • Parameters

              • tree: any
              • options: any

              Returns any[]

            diff --git a/docs/functions/src_extract_content_external.sanitizeTree.html b/docs/functions/src_extract_content_external.sanitizeTree.html deleted file mode 100644 index f8639aa..0000000 --- a/docs/functions/src_extract_content_external.sanitizeTree.html +++ /dev/null @@ -1,4 +0,0 @@ -sanitizeTree | ai-research-agent
            • Parameters

              • tree: any
              • options: any

              Returns any[]

            diff --git a/docs/functions/src_extract_content_feeds.findFeedUrls.html b/docs/functions/src_extract_content_feeds.findFeedUrls.html deleted file mode 100644 index 2ad138d..0000000 --- a/docs/functions/src_extract_content_feeds.findFeedUrls.html +++ /dev/null @@ -1,4 +0,0 @@ -findFeedUrls | ai-research-agent
            • Parameters

              • url: any
              • targetLang: any = null
              • external: boolean = false
              • sleepTime: number = 2000

              Returns any

            diff --git a/docs/functions/src_extract_content_feeds.tryHomepage.html b/docs/functions/src_extract_content_feeds.tryHomepage.html deleted file mode 100644 index 670fd09..0000000 --- a/docs/functions/src_extract_content_feeds.tryHomepage.html +++ /dev/null @@ -1,4 +0,0 @@ -tryHomepage | ai-research-agent
            • Parameters

              • baseurl: any
              • targetLang: any

              Returns any

            diff --git a/docs/functions/src_extract_content_htmlprocessing.buildHtmlOutput.html b/docs/functions/src_extract_content_htmlprocessing.buildHtmlOutput.html deleted file mode 100644 index fdb11ea..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.buildHtmlOutput.html +++ /dev/null @@ -1,4 +0,0 @@ -buildHtmlOutput | ai-research-agent
            • Parameters

              • document: any
              • withMetadata: boolean = false

              Returns string

            diff --git a/docs/functions/src_extract_content_htmlprocessing.collectLinkInfo.html b/docs/functions/src_extract_content_htmlprocessing.collectLinkInfo.html deleted file mode 100644 index 245abde..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.collectLinkInfo.html +++ /dev/null @@ -1,4 +0,0 @@ -collectLinkInfo | ai-research-agent
            • Parameters

              • linksXpath: any

              Returns any[]

            diff --git a/docs/functions/src_extract_content_htmlprocessing.convertDeletions.html b/docs/functions/src_extract_content_htmlprocessing.convertDeletions.html deleted file mode 100644 index 68e73bf..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.convertDeletions.html +++ /dev/null @@ -1,4 +0,0 @@ -convertDeletions | ai-research-agent
            • Parameters

              • elem: any

              Returns void

            diff --git a/docs/functions/src_extract_content_htmlprocessing.convertDetails.html b/docs/functions/src_extract_content_htmlprocessing.convertDetails.html deleted file mode 100644 index d566642..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.convertDetails.html +++ /dev/null @@ -1,4 +0,0 @@ -convertDetails | ai-research-agent
            • Parameters

              • elem: any

              Returns void

            diff --git a/docs/functions/src_extract_content_htmlprocessing.convertLists.html b/docs/functions/src_extract_content_htmlprocessing.convertLists.html deleted file mode 100644 index 8a36a66..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.convertLists.html +++ /dev/null @@ -1,4 +0,0 @@ -convertLists | ai-research-agent
            • Parameters

              • elem: any

              Returns void

            diff --git a/docs/functions/src_extract_content_htmlprocessing.convertQuotes.html b/docs/functions/src_extract_content_htmlprocessing.convertQuotes.html deleted file mode 100644 index 5596177..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.convertQuotes.html +++ /dev/null @@ -1,4 +0,0 @@ -convertQuotes | ai-research-agent
            • Parameters

              • elem: any

              Returns void

            diff --git a/docs/functions/src_extract_content_htmlprocessing.convertToHtml.html b/docs/functions/src_extract_content_htmlprocessing.convertToHtml.html deleted file mode 100644 index 81580f7..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.convertToHtml.html +++ /dev/null @@ -1,4 +0,0 @@ -convertToHtml | ai-research-agent
            • Parameters

              • tree: any

              Returns HTMLHtmlElement

            diff --git a/docs/functions/src_extract_content_htmlprocessing.deleteByLinkDensity.html b/docs/functions/src_extract_content_htmlprocessing.deleteByLinkDensity.html deleted file mode 100644 index fd762f3..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.deleteByLinkDensity.html +++ /dev/null @@ -1,4 +0,0 @@ -deleteByLinkDensity | ai-research-agent
            • Parameters

              • subtree: any
              • tagname: any
              • backtracking: boolean = false
              • favorPrecision: boolean = false

              Returns any

            diff --git a/docs/functions/src_extract_content_htmlprocessing.linkDensityTest.html b/docs/functions/src_extract_content_htmlprocessing.linkDensityTest.html deleted file mode 100644 index ddc5fad..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.linkDensityTest.html +++ /dev/null @@ -1,4 +0,0 @@ -linkDensityTest | ai-research-agent
            • Parameters

              • element: any
              • text: any
              • favorPrecision: boolean = false

              Returns any[]

            diff --git a/docs/functions/src_extract_content_htmlprocessing.linkDensityTestTables.html b/docs/functions/src_extract_content_htmlprocessing.linkDensityTestTables.html deleted file mode 100644 index 09ab2a1..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.linkDensityTestTables.html +++ /dev/null @@ -1,4 +0,0 @@ -linkDensityTestTables | ai-research-agent
            • Parameters

              • element: any

              Returns boolean

            diff --git a/docs/functions/src_extract_content_htmlprocessing.pruneHtml.html b/docs/functions/src_extract_content_htmlprocessing.pruneHtml.html deleted file mode 100644 index 79a474e..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.pruneHtml.html +++ /dev/null @@ -1,4 +0,0 @@ -pruneHtml | ai-research-agent
            • Parameters

              • tree: any
              • focus: string = "balanced"

              Returns any

            diff --git a/docs/functions/src_extract_content_htmlprocessing.pruneUnwantedNodes.html b/docs/functions/src_extract_content_htmlprocessing.pruneUnwantedNodes.html deleted file mode 100644 index 41d6106..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.pruneUnwantedNodes.html +++ /dev/null @@ -1,4 +0,0 @@ -pruneUnwantedNodes | ai-research-agent
            • Parameters

              • tree: any
              • nodelist: any
              • withBackup: boolean = false

              Returns any

            diff --git a/docs/functions/src_extract_content_htmlprocessing.treeCleaning.html b/docs/functions/src_extract_content_htmlprocessing.treeCleaning.html deleted file mode 100644 index c20f3ed..0000000 --- a/docs/functions/src_extract_content_htmlprocessing.treeCleaning.html +++ /dev/null @@ -1,4 +0,0 @@ -treeCleaning | ai-research-agent
            • Parameters

              • tree: any
              • options: any

              Returns any

            diff --git a/docs/functions/src_extract_content_json_metadata.extractJson.html b/docs/functions/src_extract_content_json_metadata.extractJson.html deleted file mode 100644 index 13ea521..0000000 --- a/docs/functions/src_extract_content_json_metadata.extractJson.html +++ /dev/null @@ -1,4 +0,0 @@ -extractJson | ai-research-agent
            • Parameters

              • schema: any
              • metadata: any

              Returns any

            diff --git a/docs/functions/src_extract_content_json_metadata.extractJsonParseError.html b/docs/functions/src_extract_content_json_metadata.extractJsonParseError.html deleted file mode 100644 index 1193d0d..0000000 --- a/docs/functions/src_extract_content_json_metadata.extractJsonParseError.html +++ /dev/null @@ -1,4 +0,0 @@ -extractJsonParseError | ai-research-agent
            • Parameters

              • elem: any
              • metadata: any

              Returns any

            diff --git a/docs/functions/src_extract_content_json_metadata.normalizeAuthors.html b/docs/functions/src_extract_content_json_metadata.normalizeAuthors.html deleted file mode 100644 index aa563fa..0000000 --- a/docs/functions/src_extract_content_json_metadata.normalizeAuthors.html +++ /dev/null @@ -1,4 +0,0 @@ -normalizeAuthors | ai-research-agent
            • Parameters

              • currentAuthors: any
              • authorString: any

              Returns any

            diff --git a/docs/functions/src_extract_content_main_extractor.extractComments.html b/docs/functions/src_extract_content_main_extractor.extractComments.html deleted file mode 100644 index dc739b7..0000000 --- a/docs/functions/src_extract_content_main_extractor.extractComments.html +++ /dev/null @@ -1,4 +0,0 @@ -extractComments | ai-research-agent
            • Parameters

              • tree: any
              • options: any

              Returns any[]

            diff --git a/docs/functions/src_extract_content_main_extractor.extractContent.html b/docs/functions/src_extract_content_main_extractor.extractContent.html deleted file mode 100644 index eef1e94..0000000 --- a/docs/functions/src_extract_content_main_extractor.extractContent.html +++ /dev/null @@ -1,4 +0,0 @@ -extractContent | ai-research-agent
            • Parameters

              • dom: any
              • options: any

              Returns any[]

            diff --git a/docs/functions/src_extract_content_meta.resetCaches.html b/docs/functions/src_extract_content_meta.resetCaches.html deleted file mode 100644 index 7242068..0000000 --- a/docs/functions/src_extract_content_meta.resetCaches.html +++ /dev/null @@ -1,4 +0,0 @@ -resetCaches | ai-research-agent
            • Returns void

            diff --git a/docs/functions/src_extract_content_metadata.extractMetadata.html b/docs/functions/src_extract_content_metadata.extractMetadata.html deleted file mode 100644 index 8b3ba75..0000000 --- a/docs/functions/src_extract_content_metadata.extractMetadata.html +++ /dev/null @@ -1,4 +0,0 @@ -extractMetadata | ai-research-agent
            • Parameters

              • filecontent: any
              • defaultUrl: any = null
              • dateConfig: any = null
              • extensive: boolean = true
              • authorBlacklist: any = null

              Returns any

            diff --git a/docs/functions/src_extract_content_settings.argsToExtractor.html b/docs/functions/src_extract_content_settings.argsToExtractor.html deleted file mode 100644 index d812d6a..0000000 --- a/docs/functions/src_extract_content_settings.argsToExtractor.html +++ /dev/null @@ -1,4 +0,0 @@ -argsToExtractor | ai-research-agent
            • Parameters

              • args: any
              • url: any = null

              Returns Extractor

            diff --git a/docs/functions/src_extract_content_sitemaps.extractRobotsSitemaps.html b/docs/functions/src_extract_content_sitemaps.extractRobotsSitemaps.html deleted file mode 100644 index 3b0aa72..0000000 --- a/docs/functions/src_extract_content_sitemaps.extractRobotsSitemaps.html +++ /dev/null @@ -1,4 +0,0 @@ -extractRobotsSitemaps | ai-research-agent
            • Parameters

              • robotstxt: any
              • baseurl: any

              Returns string[]

            diff --git a/docs/functions/src_extract_content_sitemaps.findRobotsSitemaps.html b/docs/functions/src_extract_content_sitemaps.findRobotsSitemaps.html deleted file mode 100644 index 5364282..0000000 --- a/docs/functions/src_extract_content_sitemaps.findRobotsSitemaps.html +++ /dev/null @@ -1,4 +0,0 @@ -findRobotsSitemaps | ai-research-agent
            • Parameters

              • baseurl: any

              Returns Promise<string[]>

            diff --git a/docs/functions/src_extract_content_sitemaps.isPlausibleSitemap.html b/docs/functions/src_extract_content_sitemaps.isPlausibleSitemap.html deleted file mode 100644 index 50e793a..0000000 --- a/docs/functions/src_extract_content_sitemaps.isPlausibleSitemap.html +++ /dev/null @@ -1,4 +0,0 @@ -isPlausibleSitemap | ai-research-agent
            • Parameters

              • url: any
              • contents: any

              Returns boolean

            diff --git a/docs/functions/src_extract_content_sitemaps.sitemapSearch.html b/docs/functions/src_extract_content_sitemaps.sitemapSearch.html deleted file mode 100644 index 7616b65..0000000 --- a/docs/functions/src_extract_content_sitemaps.sitemapSearch.html +++ /dev/null @@ -1,4 +0,0 @@ -sitemapSearch | ai-research-agent
            • Parameters

              • url: any
              • targetLang: any = null
              • external: boolean = false
              • sleepTime: number = 2000

              Returns Promise<any[]>

            diff --git a/docs/functions/src_extract_content_spider.focusedCrawler.html b/docs/functions/src_extract_content_spider.focusedCrawler.html deleted file mode 100644 index 524b0bc..0000000 --- a/docs/functions/src_extract_content_spider.focusedCrawler.html +++ /dev/null @@ -1,4 +0,0 @@ -focusedCrawler | ai-research-agent
            • Parameters

              • homepage: any
              • maxSeenUrls: number = MAX_SEEN_URLS
              • maxKnownUrls: number = MAX_KNOWN_URLS
              • todo: any = null
              • knownLinks: any = null
              • lang: any = null
              • config: {
                    COOKIE: string;
                    DOWNLOAD_TIMEOUT: string;
                    EXTENSIVE_DATE_SEARCH: string;
                    EXTERNAL_URLS: string;
                    EXTRACTION_TIMEOUT: string;
                    MAX_FILE_SIZE: string;
                    MAX_REDIRECTS: string;
                    MAX_REPETITIONS: string;
                    MIN_DUPLCHECK_SIZE: string;
                    MIN_EXTRACTED_COMM_SIZE: string;
                    MIN_EXTRACTED_SIZE: string;
                    MIN_FILE_SIZE: string;
                    MIN_OUTPUT_COMM_SIZE: string;
                    MIN_OUTPUT_SIZE: string;
                    SLEEP_TIME: string;
                    USER_AGENTS: string;
                } = DEFAULT_CONFIG
                • COOKIE: string
                • DOWNLOAD_TIMEOUT: string
                • EXTENSIVE_DATE_SEARCH: string
                • EXTERNAL_URLS: string
                • EXTRACTION_TIMEOUT: string
                • MAX_FILE_SIZE: string
                • MAX_REDIRECTS: string
                • MAX_REPETITIONS: string
                • MIN_DUPLCHECK_SIZE: string
                • MIN_EXTRACTED_COMM_SIZE: string
                • MIN_EXTRACTED_SIZE: string
                • MIN_FILE_SIZE: string
                • MIN_OUTPUT_COMM_SIZE: string
                • MIN_OUTPUT_SIZE: string
                • SLEEP_TIME: string
                • USER_AGENTS: string
              • rules: any = null
              • pruneXpath: any = null

              Returns any[]

            diff --git a/docs/functions/src_extract_content_utils.controlXmlOutput.html b/docs/functions/src_extract_content_utils.controlXmlOutput.html deleted file mode 100644 index 60e4468..0000000 --- a/docs/functions/src_extract_content_utils.controlXmlOutput.html +++ /dev/null @@ -1,4 +0,0 @@ -controlXmlOutput | ai-research-agent
            • Parameters

              • document: any
              • options: any

              Returns string

            diff --git a/docs/functions/src_extract_content_utils.deleteElement.html b/docs/functions/src_extract_content_utils.deleteElement.html deleted file mode 100644 index 8921429..0000000 --- a/docs/functions/src_extract_content_utils.deleteElement.html +++ /dev/null @@ -1,4 +0,0 @@ -deleteElement | ai-research-agent
            • Parameters

              • element: any
              • keepTail: boolean = true

              Returns void

            diff --git a/docs/functions/src_extract_content_utils.mergeWithParent.html b/docs/functions/src_extract_content_utils.mergeWithParent.html deleted file mode 100644 index 3ca1152..0000000 --- a/docs/functions/src_extract_content_utils.mergeWithParent.html +++ /dev/null @@ -1,4 +0,0 @@ -mergeWithParent | ai-research-agent
            • Parameters

              • element: any
              • includeFormatting: boolean = false

              Returns void

            diff --git a/docs/functions/src_extract_content_utils.removeEmptyElements.html b/docs/functions/src_extract_content_utils.removeEmptyElements.html deleted file mode 100644 index 7631fc2..0000000 --- a/docs/functions/src_extract_content_utils.removeEmptyElements.html +++ /dev/null @@ -1,4 +0,0 @@ -removeEmptyElements | ai-research-agent
            • Parameters

              • tree: any

              Returns any

            diff --git a/docs/functions/src_extract_content_utils.stripDoubleTags.html b/docs/functions/src_extract_content_utils.stripDoubleTags.html deleted file mode 100644 index 2c20186..0000000 --- a/docs/functions/src_extract_content_utils.stripDoubleTags.html +++ /dev/null @@ -1,4 +0,0 @@ -stripDoubleTags | ai-research-agent
            • Parameters

              • tree: any

              Returns any

            diff --git a/docs/functions/src_extract_content_utils.textfilter.html b/docs/functions/src_extract_content_utils.textfilter.html deleted file mode 100644 index 67057b6..0000000 --- a/docs/functions/src_extract_content_utils.textfilter.html +++ /dev/null @@ -1,4 +0,0 @@ -textfilter | ai-research-agent
            • Parameters

              • element: any

              Returns any

            diff --git a/docs/functions/src_extract_content_utils.trim.html b/docs/functions/src_extract_content_utils.trim.html deleted file mode 100644 index a9dd719..0000000 --- a/docs/functions/src_extract_content_utils.trim.html +++ /dev/null @@ -1,4 +0,0 @@ -trim | ai-research-agent
            • Parameters

              • str: any

              Returns any

            diff --git a/docs/functions/src_extract_content_utils.xmlToTxt.html b/docs/functions/src_extract_content_utils.xmlToTxt.html deleted file mode 100644 index b8314a1..0000000 --- a/docs/functions/src_extract_content_utils.xmlToTxt.html +++ /dev/null @@ -1,4 +0,0 @@ -xmlToTxt | ai-research-agent
            • Parameters

              • xmlOutput: any
              • includeFormatting: any

              Returns any

            diff --git a/docs/functions/src_extract_content_xml.buildJsonOutput.html b/docs/functions/src_extract_content_xml.buildJsonOutput.html deleted file mode 100644 index b9a43de..0000000 --- a/docs/functions/src_extract_content_xml.buildJsonOutput.html +++ /dev/null @@ -1,4 +0,0 @@ -buildJsonOutput | ai-research-agent
            • Parameters

              • docmeta: any
              • withMetadata: boolean = true

              Returns string

            diff --git a/docs/functions/src_extract_content_xml.buildTeiOutput.html b/docs/functions/src_extract_content_xml.buildTeiOutput.html deleted file mode 100644 index 9269d98..0000000 --- a/docs/functions/src_extract_content_xml.buildTeiOutput.html +++ /dev/null @@ -1,4 +0,0 @@ -buildTeiOutput | ai-research-agent
            • Parameters

              • docmeta: any

              Returns any

            diff --git a/docs/functions/src_extract_content_xml.checkTei.html b/docs/functions/src_extract_content_xml.checkTei.html deleted file mode 100644 index 38aadaf..0000000 --- a/docs/functions/src_extract_content_xml.checkTei.html +++ /dev/null @@ -1,4 +0,0 @@ -checkTei | ai-research-agent
            • Parameters

              • xmldoc: any
              • url: any

              Returns any

            diff --git a/docs/functions/src_extract_content_xml.deleteElement.html b/docs/functions/src_extract_content_xml.deleteElement.html deleted file mode 100644 index 98b9896..0000000 --- a/docs/functions/src_extract_content_xml.deleteElement.html +++ /dev/null @@ -1,4 +0,0 @@ -deleteElement | ai-research-agent
            • Parameters

              • element: any
              • keepTail: boolean = true

              Returns void

            diff --git a/docs/functions/src_extract_content_xml.removeEmptyElements.html b/docs/functions/src_extract_content_xml.removeEmptyElements.html deleted file mode 100644 index 0718c28..0000000 --- a/docs/functions/src_extract_content_xml.removeEmptyElements.html +++ /dev/null @@ -1,4 +0,0 @@ -removeEmptyElements | ai-research-agent
            • Parameters

              • tree: any

              Returns any

            diff --git a/docs/functions/src_extractor_html_to_cite_extract_author.extractAuthor.html b/docs/functions/src_extractor_html_to_cite_extract_author.extractAuthor.html index 84c1abc..877748b 100644 --- a/docs/functions/src_extractor_html_to_cite_extract_author.extractAuthor.html +++ b/docs/functions/src_extractor_html_to_cite_extract_author.extractAuthor.html @@ -1,6 +1,6 @@ extractAuthor | ai-research-agent
            • Extracts the author from the document and validates it as a human name

              Parameters

              • document: Document

              Returns any

              {author_cite, author_short, author_type} or null if no valid author found

              -
            diff --git a/docs/functions/src_extractor_html_to_cite_extract_cite.extractCite.html b/docs/functions/src_extractor_html_to_cite_extract_cite.extractCite.html index c9fb342..6eeb6d8 100644 --- a/docs/functions/src_extractor_html_to_cite_extract_cite.extractCite.html +++ b/docs/functions/src_extractor_html_to_cite_extract_cite.extractCite.html @@ -7,7 +7,7 @@

            Article-extraction-benchmark

            Parameters

            Returns any

            {author, date, title, source}

            -
            diff --git a/docs/functions/src_extractor_html_to_cite_extract_date.extractDate.html b/docs/functions/src_extractor_html_to_cite_extract_date.extractDate.html index 3b75e67..cb3ae8e 100644 --- a/docs/functions/src_extractor_html_to_cite_extract_date.extractDate.html +++ b/docs/functions/src_extractor_html_to_cite_extract_date.extractDate.html @@ -2,7 +2,7 @@

            Parameters

            Returns string

            Extracted date or null if not found

            -
            diff --git a/docs/functions/src_extractor_html_to_cite_extract_source.extractSource.html b/docs/functions/src_extractor_html_to_cite_extract_source.extractSource.html index 6998d04..e2f4b48 100644 --- a/docs/functions/src_extractor_html_to_cite_extract_source.extractSource.html +++ b/docs/functions/src_extractor_html_to_cite_extract_source.extractSource.html @@ -1,7 +1,7 @@ extractSource | ai-research-agent
            • Extract source from document using common class names

              Parameters

              • document: Document

                document or dom object with article content

              Returns any

              source

              -
            diff --git a/docs/functions/src_extractor_html_to_cite_extract_title.extractTitle.html b/docs/functions/src_extractor_html_to_cite_extract_title.extractTitle.html index 85028ea..efee496 100644 --- a/docs/functions/src_extractor_html_to_cite_extract_title.extractTitle.html +++ b/docs/functions/src_extractor_html_to_cite_extract_title.extractTitle.html @@ -1,7 +1,7 @@ extractTitle | ai-research-agent
            • Extract and clean title from document

              Parameters

              • document: Document

                DOM object with article content

              Returns string

              Extracted and cleaned title

              -
            diff --git a/docs/functions/src_extractor_html_to_cite_human_names_recognize.extractNamedEntity.html b/docs/functions/src_extractor_html_to_cite_human_names_recognize.extractNamedEntity.html index be312c2..5732e71 100644 --- a/docs/functions/src_extractor_html_to_cite_human_names_recognize.extractNamedEntity.html +++ b/docs/functions/src_extractor_html_to_cite_human_names_recognize.extractNamedEntity.html @@ -6,7 +6,7 @@ Author type is ["single", "two-author", "more-than-two", "organization"] where organization is a non-human name that is not reversed.

            Parameters

            Returns any

            {author_cite, author_short, author_type}

            -
            diff --git a/docs/functions/src_extractor_html_to_cite_metadata_to_cite.extractCiteFromMetadata.html b/docs/functions/src_extractor_html_to_cite_metadata_to_cite.extractCiteFromMetadata.html index 9ac1b93..ab4a2ca 100644 --- a/docs/functions/src_extractor_html_to_cite_metadata_to_cite.extractCiteFromMetadata.html +++ b/docs/functions/src_extractor_html_to_cite_metadata_to_cite.extractCiteFromMetadata.html @@ -1,7 +1,7 @@ extractCiteFromMetadata | ai-research-agent
            • Extract cite info from common property names in webpage's metadata

              Parameters

              • doc: Document

                dom object of document

              Returns any

              {author, date, title, source}

              -
            diff --git a/docs/functions/src_extractor_html_to_cite_url_to_domain.convertURLToDomain.html b/docs/functions/src_extractor_html_to_cite_url_to_domain.convertURLToDomain.html index cdc8934..07c52fc 100644 --- a/docs/functions/src_extractor_html_to_cite_url_to_domain.convertURLToDomain.html +++ b/docs/functions/src_extractor_html_to_cite_url_to_domain.convertURLToDomain.html @@ -5,7 +5,7 @@ https://wiki.mozilla.org/TLD_List https://en.wikipedia.org/wiki/List_of_Internet_top-level_domains

            Parameters

            Returns string

            rootDomain

            -
            diff --git a/docs/functions/src_extractor_html_to_cite_url_to_favicon.extractFavicon.html b/docs/functions/src_extractor_html_to_cite_url_to_favicon.extractFavicon.html index a002371..748b2da 100644 --- a/docs/functions/src_extractor_html_to_cite_url_to_favicon.extractFavicon.html +++ b/docs/functions/src_extractor_html_to_cite_url_to_favicon.extractFavicon.html @@ -1,10 +1,10 @@ extractFavicon | ai-research-agent
            • Gets favicon for any URL by parsing the HTML and looking for <link rel="icon"> tags and validates domain.com/favicon.ico by checking for a valid response.

              -

              Parameters

              • url: string
              • options: any = {}

              Returns Promise<string>

              Favicon URL or null if not found

              +

            Parameters

            Returns Promise<string>

            Favicon URL or null if not found

            const favicons = await extractFavicon('https://github.com/')
             
            -
            diff --git a/docs/functions/src_extractor_html_to_content_extract_content.extractMainContent.html b/docs/functions/src_extractor_html_to_content_extract_content.extractMainContent.html deleted file mode 100644 index 5d58e8c..0000000 --- a/docs/functions/src_extractor_html_to_content_extract_content.extractMainContent.html +++ /dev/null @@ -1,77 +0,0 @@ -extractMainContent | ai-research-agent
            • The function extracts main content with regex patterns, cleaning HTML, scoring nodes -based on content indicators like paragraphs and id/class names, selecting -the top candidate, extracting it, and cleaning up content around it.

              -

              Article-extraction-benchmark

              -

              Parameters

              • htmlString: string

                The HTML string to extract content from.

                -
              • options: {
                    removeHTML: boolean;
                } = {}

                Additional options for extraction

                -
                • removeHTML: boolean

                  [default=false] - remove HTML tags and return text with linebreaks

                  -

              Returns string

              The extracted main article content.

              -

              This function works through the following steps to extract the main content:

              -
                -
              1. -

                Define regular expressions:

                -
                  -
                • Various regex patterns are defined to identify content and non-content areas.
                • -
                -
              2. -
              3. -

                Define helper functions:

                -
                  -
                • normalizeSpaces: Normalizes whitespace in a string.
                • -
                • stripTags: Removes all HTML tags from a string.
                • -
                • getTextLength: Calculates the length of text after stripping tags.
                • -
                • calculateLinkDensity: Calculates the ratio of link text to total text.
                • -
                -
              4. -
              5. -

                Clean HTML:

                -
                  -
                • Remove unlikely candidates (e.g., ads, sidebars) from the HTML.
                • -
                -
              6. -
              7. -

                Define scoring function:

                -
                  -
                • scoreNode: Assigns a score to an HTML node based on content and attributes.
                • -
                • Increases score for positive indicators (e.g., article, body, content tags).
                • -
                • Decreases score for negative indicators (e.g., hidden, footer, sidebar tags).
                • -
                • Adds to score based on paragraph tags and text length.
                • -
                -
              8. -
              9. -

                Find and score candidate nodes:

                -
                  -
                • Identify potential content nodes in the cleaned HTML.
                • -
                • Score each node using the scoreNode function.
                • -
                -
              10. -
              11. -

                Select top candidate:

                -
                  -
                • Sort candidates by score and select the highest-scoring node.
                • -
                -
              12. -
              13. -

                Extract content:

                -
                  -
                • Use regex to extract content around the top candidate node.
                • -
                -
              14. -
              15. -

                Clean up extracted content:

                -
                  -
                • Remove script and style tags and their contents.
                • -
                • Process anchor tags based on content density.
                • -
                • Keep only specific HTML tags (a, p, img, h1-h6, ul, ol, li).
                • -
                • Remove excess whitespace from the final content.
                • -
                -
              16. -
              -

              Based on Postlight and Readability

              -
              var url = "https://www.nytimes.com/2024/08/28/business/telegram-ceo-pavel-durov-charged.html"
              const html = await (await fetch(url)).text();
              var articleContent = extractMainArticle(html, {removeHTML: true}); -
              - -
            diff --git a/docs/functions/src_extractor_html_to_content_extract_content_extractor1_content.extractContentHTML.html b/docs/functions/src_extractor_html_to_content_extract_content_extractor1_content.extractContentHTML.html new file mode 100644 index 0000000..50ccea0 --- /dev/null +++ b/docs/functions/src_extractor_html_to_content_extract_content_extractor1_content.extractContentHTML.html @@ -0,0 +1,77 @@ +extractContentHTML | ai-research-agent
            • The function extracts main content with regex patterns, cleaning HTML, scoring nodes +based on content indicators like paragraphs and id/class names, selecting +the top candidate, extracting it, and cleaning up content around it.

              +
                +
              1. +

                Define regular expressions:

                +
                  +
                • Various regex patterns are defined to identify content and non-content areas.
                • +
                +
              2. +
              3. +

                Define helper functions:

                +
                  +
                • normalizeSpaces: Normalizes whitespace in a string.
                • +
                • stripTags: Removes all HTML tags from a string.
                • +
                • getTextLength: Calculates the length of text after stripping tags.
                • +
                • calculateLinkDensity: Calculates the ratio of link text to total text.
                • +
                +
              4. +
              5. +

                Clean HTML:

                +
                  +
                • Remove unlikely candidates (e.g., ads, sidebars) from the HTML.
                • +
                +
              6. +
              7. +

                Define scoring function:

                +
                  +
                • scoreNode: Assigns a score to an HTML node based on content and attributes.
                • +
                • Increases score for positive indicators (e.g., article, body, content tags).
                • +
                • Decreases score for negative indicators (e.g., hidden, footer, sidebar tags).
                • +
                • Adds to score based on paragraph tags and text length.
                • +
                +
              8. +
              9. +

                Find and score candidate nodes:

                +
                  +
                • Identify potential content nodes in the cleaned HTML.
                • +
                • Score each node using the scoreNode function.
                • +
                +
              10. +
              11. +

                Select top candidate:

                +
                  +
                • Sort candidates by score and select the highest-scoring node.
                • +
                +
              12. +
              13. +

                Extract content:

                +
                  +
                • Use regex to extract content around the top candidate node.
                • +
                +
              14. +
              15. +

                Clean up extracted content:

                +
                  +
                • Remove script and style tags and their contents.
                • +
                • Process anchor tags based on content density.
                • +
                • Keep only specific HTML tags (a, p, img, h1-h6, ul, ol, li).
                • +
                • Remove excess whitespace from the final content.
                • +
                +
              16. +
              +

              Article Extraction Benchmark

              +

              Parameters

              • html: any
              • Optionaloptions: {
                    minContentLength: number;
                    minScore: number;
                    minTextLength: number;
                    retryLength: number;
                } = {}
                • minContentLength: number

                  default=140 - Minimum length of content to be considered valid

                  +
                • minScore: number

                  default=20 - Minimum score for content to be considered valid

                  +
                • minTextLength: number

                  default=25 - Minimum length of text to be considered valid

                  +
                • retryLength: number

                  default=250 - Length to retry content extraction if initial attempt fails

                  +

              Returns Element

              Extracted HTML element of main content such as article body

              +
              var url = "https://www.nytimes.com/2024/08/28/business/telegram-ceo-pavel-durov-charged.html"
              const html = await (await fetch(url)).text();
              var articleContent = extractContentHTML(html); +
              + +

              Based on Mozilla Readability (2015), Arc90 (2010)

              +
            diff --git a/docs/functions/src_extractor_html_to_content_extract_content_extractor2_content.extractContentHTML2.html b/docs/functions/src_extractor_html_to_content_extract_content_extractor2_content.extractContentHTML2.html new file mode 100644 index 0000000..1660440 --- /dev/null +++ b/docs/functions/src_extractor_html_to_content_extract_content_extractor2_content.extractContentHTML2.html @@ -0,0 +1,41 @@ +extractContentHTML2 | ai-research-agent
              1. +
              2. The algorithm starts by loading the HTML content using linkedom, a lightweight DOM parser for Node.js.
              3. +
              4. It then applies a series of cleaning and scoring techniques to identify the main content of +the page, starting with stripping unlikely candidates (e.g., elements with class names like "comment" +or "sidebar").
              5. +
              6. The HTML is converted into a series of paragraph elements, which are then scored based on various +factors such as text length, number of commas, and the presence of certain class names or IDs.
              7. +
              8. The algorithm assigns scores to parent and grandparent elements based on the scores of their +children, with parents receiving the full score and grandparents receiving half.
              9. +
              10. After scoring, the algorithm finds the top candidate element by selecting the node with the +highest score.
              11. +
              12. The top candidate's siblings are then examined to see if they should be included in the main +content, based on their scores and other factors like link density.
              13. +
              14. The algorithm then cleans the selected content by removing unnecessary tags, attributes, and empty +elements.
              15. +
              16. It also handles special cases like cleaning up header tags, images, and other potentially irrelevant +content.
              17. +
              18. Throughout the process, the algorithm uses various regular expressions and scoring heuristics to +identify positive and negative indicators of content relevance.
              19. +
              20. Finally, the cleaned and extracted content is returned as an HTML string, representing the main +body of the article or webpage.
              21. +
              +

              Article Extraction Benchmark

              +

              Parameters

              • html: string

                The HTML content to extract from.

                +
              • Optionalopts: {
                    cleanConditionally: boolean;
                    stripUnlikelyCandidates: boolean;
                    weightNodes: boolean;
                }

                The options for content extraction.

                +
                • cleanConditionally: boolean

                  default=true - Clean the node to remove superfluous content +like forms, ads, etc. Initially, pass in the most restrictive options which will return the highest +quality content. On each failure, retry with slightly more lax options.

                  +
                • stripUnlikelyCandidates: boolean

                  default=true - Remove elements that match non-article- +like criteria first (e.g., elements with a classname of "comment").

                  +
                • weightNodes: boolean

                  default=true - Modify an element's score based on certain classNames or +IDs (e.g., subtract if a node has a className of 'comment', add if a node has an ID of 'entry-content').

                  +

              Returns string

              The extracted content as an HTML string, or null if extraction fails.

              +

              Based on Postlight Mercury Parser (2017-)

              +
              var url =  "https://en.wikipedia.org/wiki/David_Hilbert"
              var html = await (await fetch(url)).text();
              var content = extractContentHTML2(html);
              console.log(content); // HTML content of main article body +
              + +
            diff --git a/docs/functions/src_extractor_html_to_content_html_special_chars.convertHTMLSpecialChars.html b/docs/functions/src_extractor_html_to_content_html_special_chars.convertHTMLSpecialChars.html index bc1bf86..78ce309 100644 --- a/docs/functions/src_extractor_html_to_content_html_special_chars.convertHTMLSpecialChars.html +++ b/docs/functions/src_extractor_html_to_content_html_special_chars.convertHTMLSpecialChars.html @@ -1,13 +1,13 @@ convertHTMLSpecialChars | ai-research-agent
            • Converts HTML special characters like &<>"'`’ to & escaped codes or vice versa. -It handles named entities, decimal numeric character references, and hexadecimal numeric character references.

              +It handles named entities and hexadecimal numeric character references.

              Parameters

              • str: string

                The string to process.

              • unescape: boolean = true

                default=true - If true, converts & codes to characters. If false, converts characters to codes.

              Returns string

              The processed string.

              -
              convertHTMLSpecialChars('&lt;p&gt;This &amp; that &copy; 2023 &quot;Quotes&quot; &#39;Apostrophes&#39; &euro;100 &#x263A;&lt;/p&gt;', true)
              // Returns: "<p>This & that © 2023 "Quotes" 'Apostrophes' €100 ☺</p>" +
              var normalHTML = convertHTMLSpecialChars('&lt;p&gt;This &amp; that &copy; 2023 '+
              '&quot;Quotes&quot;&#39;Apostrophes&#39; &euro;100 &#x263A;&lt;/p&gt;', true)
              console.log(normalHTML) // Returns: "<p>This & that © 2023 "Quotes" 'Apostrophes' €100 ☺</p>"
              -
            diff --git a/docs/functions/src_extractor_html_to_content_html_to_basic_html.addDOMFunctions.html b/docs/functions/src_extractor_html_to_content_html_to_basic_html.addDOMFunctions.html index 4135b20..9a9123b 100644 --- a/docs/functions/src_extractor_html_to_content_html_to_basic_html.addDOMFunctions.html +++ b/docs/functions/src_extractor_html_to_content_html_to_basic_html.addDOMFunctions.html @@ -1,4 +1,4 @@ -addDOMFunctions | ai-research-agent
            diff --git a/docs/functions/src_extractor_html_to_content_html_to_basic_html.convertHTMLToBasicHTML.html b/docs/functions/src_extractor_html_to_content_html_to_basic_html.convertHTMLToBasicHTML.html index 58c1e52..6c70ac7 100644 --- a/docs/functions/src_extractor_html_to_content_html_to_basic_html.convertHTMLToBasicHTML.html +++ b/docs/functions/src_extractor_html_to_content_html_to_basic_html.convertHTMLToBasicHTML.html @@ -4,11 +4,13 @@ to HTML anyways to display it, and it is better to edit basic HTML in a rich text editor.

            Mozilla DOM Reference
            Source Code of Browser HTML DOM
            -How Blink Works
            RegExp JS V8 Code

            Parameters

            Returns string

            basic text formatting html

            -
            diff --git a/docs/functions/src_extractor_html_to_content_html_to_content.extractContent.html b/docs/functions/src_extractor_html_to_content_html_to_content.extractContent.html deleted file mode 100644 index e47c833..0000000 --- a/docs/functions/src_extractor_html_to_content_html_to_content.extractContent.html +++ /dev/null @@ -1,6 +0,0 @@ -extractContent | ai-research-agent
            • Extracts the main content and cite from a document or HTML string

              -

              Parameters

              • documentOrHTML: any
              • options: any = {}

              Returns any

              {title, author_cite, author_short, author, date, source, html}

              -
            diff --git a/docs/functions/src_extractor_html_to_content_html_to_content.extractContentAndCite.html b/docs/functions/src_extractor_html_to_content_html_to_content.extractContentAndCite.html new file mode 100644 index 0000000..4288749 --- /dev/null +++ b/docs/functions/src_extractor_html_to_content_html_to_content.extractContentAndCite.html @@ -0,0 +1,14 @@ +extractContentAndCite | ai-research-agent
            • Extracts the main content and citation information from a document or HTML string

              +

              Parameters

              • documentOrHTML: any

                The document or HTML string to extract content from

                +
              • options: {
                    formatting: boolean;
                    images: boolean;
                    links: boolean;
                    url: string;
                    useExtractor2: boolean;
                } = {}

                Optional configuration options

                +
                • formatting: boolean

                  default=true - Whether to preserve formatting in the extracted content

                  +
                • images: boolean

                  default=true - Whether to include images in the extracted content

                  +
                • links: boolean

                  default=true - Whether to include links in the extracted content

                  +
                • url: string

                  The URL of the original document, if available, for absolutify-ing URLs

                  +
                • useExtractor2: boolean

                  default=false - false uses Mozilla Readability, true uses Postlight Mercury

                  +

              Returns ExtractedContent

              An object containing extracted information

              +

              If there's an error parsing the HTML

              +
            diff --git a/docs/functions/src_extractor_html_to_content_readability2.extractContentHTML.html b/docs/functions/src_extractor_html_to_content_readability2.extractContentHTML.html deleted file mode 100644 index 5779874..0000000 --- a/docs/functions/src_extractor_html_to_content_readability2.extractContentHTML.html +++ /dev/null @@ -1,24 +0,0 @@ -extractContentHTML | ai-research-agent
            • Extracts main content from HTML documents using algorithm based on Readability. -Employs regex patterns, HTML cleaning, node scoring, and content selection.

              -
                -
              1. Define regex patterns for content identification
              2. -
              3. Clean HTML by removing unlikely content
              4. -
              5. Score nodes based on content quality indicators
              6. -
              7. Select the best candidate for main content
              8. -
              9. Extract top candidate
              10. -
              11. Clean the selected content
              12. -
              -

              Parameters

              • html: string

                The HTML string to extract content from

                -
              • options: {
                    minContentLength: number;
                    minScore: number;
                    minTextLength: number;
                    retryLength: number;
                } = {}
                • minContentLength: number

                  default=140 - Minimum length of content to be considered valid

                  -
                • minScore: number

                  default=20 - Minimum score for content to be considered valid

                  -
                • minTextLength: number

                  default=25 - Minimum length of text to be considered valid

                  -
                • retryLength: number

                  default=250 - Length to retry content extraction if initial attempt fails

                  -

              Returns string

              Extracted HTML of main content such as article body

              -

              Mozilla (2015), Arc90 (2010)

              -
              var url = "https://en.wikipedia.org/wiki/David_Hilbert";
              var html = await (await fetch(url)).text();
              var content = extractContentHTML(html); -
              - -
            diff --git a/docs/functions/src_extractor_url_to_content_pdf_to_content.extractPDF.html b/docs/functions/src_extractor_url_to_content_pdf_to_content.convertPDFToHTML.html similarity index 55% rename from docs/functions/src_extractor_url_to_content_pdf_to_content.extractPDF.html rename to docs/functions/src_extractor_url_to_content_pdf_to_content.convertPDFToHTML.html index 7ce5fc1..64f023a 100644 --- a/docs/functions/src_extractor_url_to_content_pdf_to_content.extractPDF.html +++ b/docs/functions/src_extractor_url_to_content_pdf_to_content.convertPDFToHTML.html @@ -1,11 +1,14 @@ -extractPDF | ai-research-agent
            • Extracts formatted text from PDF with parsing of linebreaks , +convertPDFToHTML | ai-research-agent

              • Extracts formatted text from PDF with parsing of linebreaks, page headers, footnotes, and inferring section headings based on standard deviation of range from average text height
                https://en.wikipedia.org/wiki/History_of_PDF
                https://github.com/mozilla/pdf.js/releases
                https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html

                Parameters

                • pdfURLOrBuffer: string

                  URL to a PDF file or buffer from fs.readFile

                  -
                • options: {
                      addHeadingsTags: boolean;
                      addPageNumbers: boolean;
                      addSentenceLineBreaks: boolean;
                      moveFootnotes: boolean;
                      removeHyphens: boolean;
                      removePageHeaders: boolean;
                      timeout: boolean;
                  } = {}
                  • addHeadingsTags: boolean

                    default=true - Adds H1 tags to heading titles in document

                    +
                  • Optionaloptions: {
                        addHeadingsTags: boolean;
                        addPageNumbers: boolean;
                        addSentenceLineBreaks: boolean;
                        moveFootnotes: boolean;
                        removeHyphens: boolean;
                        removePageHeaders: boolean;
                        timeout: boolean;
                    } = {}
                      +
                    • +
                    +
                    • addHeadingsTags: boolean

                      default=true - Adds H1 tags to heading titles in document

                    • addPageNumbers: boolean

                      default=true - Adds # to end of each page

                    • addSentenceLineBreaks: boolean

                      default=true - Inserts line breaks at the end of sentence ranges

                    • moveFootnotes: boolean

                      default=true - Moves footnotes to end of document

                      @@ -13,7 +16,7 @@
                    • removePageHeaders: boolean

                      default=true - Removes repeated headers found on each page

                    • timeout: boolean

                      default=5 - http request timeout

                Returns any

                HTML formatted text or {error} if error in parsing

                -
              diff --git a/docs/functions/src_extractor_url_to_content_pdf_to_content.isUrlPDF.html b/docs/functions/src_extractor_url_to_content_pdf_to_content.isUrlPDF.html index 8e64de1..1758cb8 100644 --- a/docs/functions/src_extractor_url_to_content_pdf_to_content.isUrlPDF.html +++ b/docs/functions/src_extractor_url_to_content_pdf_to_content.isUrlPDF.html @@ -3,7 +3,7 @@ Useful for hidden pdf url that does not end with pdf

              Parameters

              • url: string

                The URL to check.

              Returns Promise<boolean>

              True if the URL points to a PDF, false otherwise.

              -
            diff --git a/docs/functions/src_extractor_url_to_content_scrape_url.scrapeURL.html b/docs/functions/src_extractor_url_to_content_scrape_url.scrapeURL.html index 431e735..2b9399e 100644 --- a/docs/functions/src_extractor_url_to_content_scrape_url.scrapeURL.html +++ b/docs/functions/src_extractor_url_to_content_scrape_url.scrapeURL.html @@ -3,11 +3,15 @@ Scraping internet pages is a free speech right globally.

            Parameters

            Returns Promise<any>

            diff --git a/docs/functions/src_extractor_url_to_content_url_to_content.extract.html b/docs/functions/src_extractor_url_to_content_url_to_content.extract.html index a5133da..5063ae3 100644 --- a/docs/functions/src_extractor_url_to_content_url_to_content.extract.html +++ b/docs/functions/src_extractor_url_to_content_url_to_content.extract.html @@ -7,7 +7,10 @@ footnotes, and adding linebreaks based on standard deviation of range text height.


            Parameters

            Returns Article

            -
            diff --git a/docs/functions/src_extractor_url_to_content_youtube_embed.embedYoutubePlayer.html b/docs/functions/src_extractor_url_to_content_youtube_embed.embedYoutubePlayer.html index 37a0ed3..f37e0df 100644 --- a/docs/functions/src_extractor_url_to_content_youtube_embed.embedYoutubePlayer.html +++ b/docs/functions/src_extractor_url_to_content_youtube_embed.embedYoutubePlayer.html @@ -5,7 +5,7 @@
            // <div id="player"></div>
            const YT = embedYoutubePlayer();
            new YT.Player('player', {
            height: '360',
            width: '640',
            videoId: 'dQw4w9WgXcQ',
            events: {
            'onReady': onPlayerReady,
            'onStateChange': null,
            'onTimeChange': onTimeChange,
            }
            });
            function onPlayerReady(event) {
            event.target.playVideo();
            }
            function onTimeChange(time) {
            console.log(time)
            }
            -
            diff --git a/docs/functions/src_extractor_url_to_content_youtube_to_text.extractYoutubeText.html b/docs/functions/src_extractor_url_to_content_youtube_to_text.extractYoutubeText.html index 6ade13a..e7b9f6c 100644 --- a/docs/functions/src_extractor_url_to_content_youtube_to_text.extractYoutubeText.html +++ b/docs/functions/src_extractor_url_to_content_youtube_to_text.extractYoutubeText.html @@ -1,10 +1,13 @@ extractYoutubeText | ai-research-agent
            • Fetch youtube.com video's webpage HTML for embedded transcript; if blocked, use scraper of alternative sites

              -

              Parameters

              • videoUrl: string
              • options: {
                    addTimestamps: boolean;
                    timeout: boolean;
                } = {}
                • addTimestamps: boolean

                  default=true - true to return timestamps, default true

                  +

                  Parameters

                  • videoUrl: string
                  • Optionaloptions: {
                        addTimestamps: boolean;
                        timeout: boolean;
                    } = {}
                      +
                    • +
                    +
                    • addTimestamps: boolean

                      default=true - true to return timestamps, default true

                    • timeout: boolean

                      default=5 - http request timeout

                  Returns any

                  {content, timestamps} where content is the full text of the transcript, and timestamps is an array of [characterIndex, timeSeconds]

                  -
            diff --git a/docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToHNSW.html b/docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToHNSW.html new file mode 100644 index 0000000..c949ed8 --- /dev/null +++ b/docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToHNSW.html @@ -0,0 +1,15 @@ +convertEmbeddingsToHNSW | ai-research-agent
            • Generates vectors for a set of documents and creates an HNSW index using +hnswlib-node WASM JS for efficient similarity search.

              +

              ANN Benchmarks

              +Pinecone - HNSW
              +Wikipedia - HNSW

              +

              Parameters

              • documents: string[]

                An array of document texts to be vectorized.

                +
              • Optionaloptions: {
                    maxElements: number;
                    numDimensions: number;
                } = {}

                Optional parameters for vector generation and indexing.

                +
                • maxElements: number

                  The maximum number of data points.

                  +
                • numDimensions: number

                  The length of data point vector that will be indexed.

                  +

              Returns Promise<HierarchicalNSW>

              The created HNSW index.

              +
            diff --git a/docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToUMAP.html b/docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToUMAP.html new file mode 100644 index 0000000..e3d21ee --- /dev/null +++ b/docs/functions/src_graph_embeddings_to_graph.convertEmbeddingsToUMAP.html @@ -0,0 +1,13 @@ +convertEmbeddingsToUMAP | ai-research-agent
            • Converts embeddings to UMAP coordinates.

              +

              Understanding UMAP
              +UMAP Algorithm Overview

              +

              Parameters

              • embeddingsDict: {}

                The dictionary of embeddings.

                +
                • Optionaloptions: {
                      numberDimensions: number;
                      numberDistance: number;
                      numberNeighbors: number;
                  } = {}
                  • numberDimensions: number

                    [default=2] - The number of dimensions for UMAP output.

                    +
                  • numberDistance: number

                    [default=0.1] - The minimum distance parameter for UMAP.

                    +
                  • numberNeighbors: number

                    [default=15] - The number of nearest neighbors for UMAP.

                    +

                Returns Promise<PlotDataPoint[]>

                An array of plot data points.

                +
              diff --git a/docs/functions/src_graph_embeddings_to_graph.convertTextToEmbeddingVector.html b/docs/functions/src_graph_embeddings_to_graph.convertTextToEmbeddingVector.html new file mode 100644 index 0000000..8c2a1ab --- /dev/null +++ b/docs/functions/src_graph_embeddings_to_graph.convertTextToEmbeddingVector.html @@ -0,0 +1,17 @@ +convertTextToEmbeddingVector | ai-research-agent
              • Text embeddings convert words or phrases into numerical vectors in a high-dimensional +space, where each dimension represents a semantic feature extracted by a model like +MiniLM-L6-v2. In this concept space, words with similar meanings have vectors that +are close together, allowing for quantitative comparisons of semantic similarity. +These vector representations enable powerful applications in natural language processing, +including semantic search, text classification, and clustering, by leveraging the +geometric properties of the embedding space to capture and analyze the relationships +between words and concepts. +Text Embeddings, Classification, and Semantic Search (Youtube)

                +


                +

                Parameters

                • text: string

                  The text to embed.

                  +
                • Optionaloptions: {
                      modelName: string;
                      precision: number;
                  } = {}
                  • modelName: string

                    default="Xenova/all-MiniLM-L6-v2" - The name of the model to use

                    +
                  • precision: number

                    default=3 - The number of decimal places to round to.

                    +

                Returns Promise<{
                    embedding: number[];
                    embeddingsDict: {};
                }>

              diff --git a/docs/functions/src_graph_embeddings_to_graph.cosineSimilarity.html b/docs/functions/src_graph_embeddings_to_graph.cosineSimilarity.html new file mode 100644 index 0000000..8d29584 --- /dev/null +++ b/docs/functions/src_graph_embeddings_to_graph.cosineSimilarity.html @@ -0,0 +1,4 @@ +cosineSimilarity | ai-research-agent
              • Parameters

                • vecA: any
                • vecB: any

                Returns number

              diff --git a/docs/functions/src_graph_embeddings_to_graph.getAllEmbeddings.html b/docs/functions/src_graph_embeddings_to_graph.getAllEmbeddings.html new file mode 100644 index 0000000..6442f8d --- /dev/null +++ b/docs/functions/src_graph_embeddings_to_graph.getAllEmbeddings.html @@ -0,0 +1,8 @@ +getAllEmbeddings | ai-research-agent
              • Retrieves all embeddings from the HNSW index.

                +

                Parameters

                • index: HierarchicalNSW

                  The HNSW index containing the embeddings.

                  +
                • precision: number = 3

                Returns Promise<number[][]>

                A promise that resolves to an array of embedding vectors.

                +

                getAllEmbeddings

                +
              diff --git a/docs/functions/src_graph_embeddings_to_graph.getEmbeddingPipeline.html b/docs/functions/src_graph_embeddings_to_graph.getEmbeddingPipeline.html new file mode 100644 index 0000000..91fe24d --- /dev/null +++ b/docs/functions/src_graph_embeddings_to_graph.getEmbeddingPipeline.html @@ -0,0 +1,6 @@ +getEmbeddingPipeline | ai-research-agent
              • Initialize HuggingFace Transformers pipeline for embedding text.

                +

                Returns Promise<AutoTokenizer>

                The pipeline.

                +
              diff --git a/docs/functions/src_graph_embeddings_to_graph.searchWithQuery.html b/docs/functions/src_graph_embeddings_to_graph.searchWithQuery.html new file mode 100644 index 0000000..59adff2 --- /dev/null +++ b/docs/functions/src_graph_embeddings_to_graph.searchWithQuery.html @@ -0,0 +1,4 @@ +searchWithQuery | ai-research-agent
              • Parameters

                • index: any
                • query: any
                • options: {} = {}

                  Returns Promise<any>

                diff --git a/docs/functions/src_graph_save_hnsw.convertEmbeddingsIndexToBase64.html b/docs/functions/src_graph_save_hnsw.convertEmbeddingsIndexToBase64.html new file mode 100644 index 0000000..f7b8a6c --- /dev/null +++ b/docs/functions/src_graph_save_hnsw.convertEmbeddingsIndexToBase64.html @@ -0,0 +1,10 @@ +convertEmbeddingsIndexToBase64 | ai-research-agent
                • Writes an HNSW index to a base64 encoded string.

                  +

                  Parameters

                  • index: any

                    The HNSW index object.

                    +
                  • numDimensions: number

                    The number of dimensions in the index.

                    +
                  • maxElements: number

                    The maximum number of elements in the index.

                    +

                  Returns string

                  A base64 encoded string representation of the index.

                  +

                  If there's an error during the index serialization process.

                  +
                diff --git a/docs/functions/src_match_compare_letters.calculateSimilarityByCharacter.html b/docs/functions/src_match_compare_letters.calculateSimilarityByCharacter.html index 30dfea9..b0a3982 100644 --- a/docs/functions/src_match_compare_letters.calculateSimilarityByCharacter.html +++ b/docs/functions/src_match_compare_letters.calculateSimilarityByCharacter.html @@ -9,7 +9,7 @@
              • s2: string

                Second string

              • Returns number

                Jaro-Winkler similarity score

                Matthew Jaro, William Winkler (1990)

                -
                diff --git a/docs/functions/src_match_match_quasar.matchQUASAR.html b/docs/functions/src_match_match_quasar.matchQUASAR.html index 1c16cb1..a87f8eb 100644 --- a/docs/functions/src_match_match_quasar.matchQUASAR.html +++ b/docs/functions/src_match_match_quasar.matchQUASAR.html @@ -1,13 +1,13 @@ -matchQUASAR | ai-research-agent
                • QUASAR: Quotes-Unifying Alphanumeric Search-All RegExp

                  +matchQUASAR | ai-research-agent
                  • QUASAR: Quotes-Unifying Alphanumeric Search-All RegExp

                    Search document for all words of query ignoring casing but "words in quotes" as necessarily together like users expect in web search engines. Single line function that can be used anywhere, such as UI inputs to filter a data list.

                    -

                    Parameters

                    • document: string
                    • query: string

                    Returns boolean

                    true if doc has all words and "words in phrases"

                    -
                    var isFound = matchQUASAR(`Ask not what your country can do for you, ask what you can do for your country. 
                    There is nothing to fear but fear itself.`, ` "Ask not" "but fear itself" nothing`) +

                    Parameters

                    • document: string
                    • query: string

                    Returns boolean

                    true if doc has all words and "phrases in quotes"

                    +
                    var isFound = matchQUASAR(`Ask not what your country can do for you, 
                    ask what you can do for your country. There is nothing to fear but fear itself.`,
                    ` "Ask not" "but fear itself" nothing`) // returns true
                    -
                  diff --git a/docs/functions/src_match_softmax.calculateProbabilitySoftmax.html b/docs/functions/src_match_softmax.calculateProbabilitySoftmax.html index 521e273..fdb6f6c 100644 --- a/docs/functions/src_match_softmax.calculateProbabilitySoftmax.html +++ b/docs/functions/src_match_softmax.calculateProbabilitySoftmax.html @@ -1,5 +1,4 @@ -calculateProbabilitySoftmax | ai-research-agent
                  • Calculates the Softmax of an array of numbers.

                    -

                    The Softmax +calculateProbabilitySoftmax | ai-research-agent

                    diff --git a/docs/functions/src_match_weigh_relevance_frequency.calculatePhraseSpecificity.html b/docs/functions/src_match_weigh_relevance_frequency.calculatePhraseSpecificity.html index b5a2989..dab7fff 100644 --- a/docs/functions/src_match_weigh_relevance_frequency.calculatePhraseSpecificity.html +++ b/docs/functions/src_match_weigh_relevance_frequency.calculatePhraseSpecificity.html @@ -2,7 +2,7 @@ Words are tokenized into phrases and their specificity is calculated based on how many Wiki pages they appear in.

                    Parameters

                    • phrase: string
                    • options: any

                    Returns number

                    domain specificity 0-12~

                    -
                  diff --git a/docs/functions/src_match_weigh_relevance_frequency.weighRelevanceTermFrequency.html b/docs/functions/src_match_weigh_relevance_frequency.weighRelevanceTermFrequency.html index b911a26..82e3ff6 100644 --- a/docs/functions/src_match_weigh_relevance_frequency.weighRelevanceTermFrequency.html +++ b/docs/functions/src_match_weigh_relevance_frequency.weighRelevanceTermFrequency.html @@ -4,7 +4,7 @@ Data Science". https://www.youtube.com/watch?v=ruBm9WywevM

                  Parameters

                  • document: string

                    a single document to calculate the score for

                  • query: string

                    phrase to search tf and idf for each word

                    -
                  • options: {
                        avgDocWordCount: number;
                        normalizeLength: number;
                        saturationWeight: number;
                        totalWikiPages: number;
                    } = {}
                    • avgDocWordCount: number

                      Estimated average word count of all documents

                      +
                    • Optionaloptions: {
                          avgDocWordCount: number;
                          normalizeLength: number;
                          saturationWeight: number;
                          totalWikiPages: number;
                      } = {}
                      • avgDocWordCount: number

                        Estimated average word count of all documents

                      • normalizeLength: number

                        normalizeLength controls the document length normalization. It ranges from 0 to 1, with 0.75 being a common default value. When normalizeLength=1: Full length normalization is applied. @@ -14,7 +14,7 @@ As saturationWeight increases: The impact of term frequency increases (i.e., multiple occurrences of a term in a document become more significant).

                      • totalWikiPages: number

                        Total number of Wikipedia pages used to calculate IDF

                  Returns number

                  score for term specificity

                  -
                diff --git a/docs/functions/src_search_web_search_stream.searchSTREAM.html b/docs/functions/src_search_web_search_stream.searchSTREAM.html index 577de6a..a9e39bd 100644 --- a/docs/functions/src_search_web_search_stream.searchSTREAM.html +++ b/docs/functions/src_search_web_search_stream.searchSTREAM.html @@ -7,26 +7,16 @@ 5. Uses a Research Agent prompt with key sentences from relevant sources to generate an answer via Groq Llama, OpenAI, or Anthropic API, and suggests follow-up queries.

                Parameters

                • query: string

                  The search query string.

                  -
                • options: {
                      categoryIndex: number;
                      maxRetries: number;
                      maxTopResultsToExtract: number;
                      recencyIndex: number;
                      selectedDomain: string;
                  } = {}

                  default { -categoryIndex = 0, -recencyIndex = 0, -maxRetries = 8, -maxTopResultsToExtract = 3, -} - -categoryIndex=0 - Index of the search category. -recencyIndex=0 - Index representing the recency of results. -maxRetries=5 - Maximum number of retry attempts for the search. -maxTopResultsToExtract=6 - Maximum number of top results to extract and analyze.

                  -
                  • categoryIndex: number

                    default=0 - Index of the search category.

                    +
                  • Optionaloptions: {
                        categoryIndex: number;
                        maxRetries: number;
                        maxTopResultsToExtract: number;
                        recencyIndex: number;
                        selectedDomain: string;
                    } = {}
                    • categoryIndex: number

                      default=0 - Index of the search category.

                    • maxRetries: number

                      default=5 - Maximum number of retry attempts for the search.

                    • maxTopResultsToExtract: number

                      default=6 - Maximum number of top results to extract and analyze.

                    • recencyIndex: number

                      default=0 - Index representing the recency of results.

                    • selectedDomain: string

                      default=null - Use your custom domain SearXNG

                Returns Promise<any[]>

                A promise that resolves to an array containing the search results, extracted information, and generated answer.

                -
                const advancedResults = await searchSTREAM('Latest developments in quantum computing', {
                categoryIndex: 2,
                recencyIndex: 1,
                maxRetries: 5,
                maxTopResultsToExtract: 10
                }); +
                const advancedResults = await searchSTREAM('Latest developments in quantum computing', {
                categoryIndex: 2,
                recencyIndex: 1,
                maxRetries: 5,
                maxTopResultsToExtract: 10
                });
                -
                diff --git a/docs/functions/src_search_web_search_web.searchWeb.html b/docs/functions/src_search_web_search_web.searchWeb.html index 9104549..d55e17e 100644 --- a/docs/functions/src_search_web_search_web.searchWeb.html +++ b/docs/functions/src_search_web_search_web.searchWeb.html @@ -3,7 +3,10 @@ times to retry other domains if first time fails. SearXNG is a free internet metasearch engine which aggregates results from more than 70 search services. Users are neither tracked nor profiled.

                Parameters

                • query: string

                  The search query string.

                  -
                • options: {
                      category: number;
                      maxRetries: number;
                      recency: number;
                      selectedDomain: string;
                  } = {}
                  • category: number

                    default=0 - ["general", "news", "videos", "images", +

                  • Optionaloptions: {
                        category: number;
                        maxRetries: number;
                        recency: number;
                        selectedDomain: string;
                    } = {}
                      +
                    • +
                    +
                    • category: number

                      default=0 - ["general", "news", "videos", "images", "science", "map", "music", "it", "files", "social+media"]

                    • maxRetries: number

                      default=3 - Maximum number of retry attempts if the initial search fails.

                    • recency: number

                      default=0 - ["", "day", "week", "month", "year"]

                      @@ -14,7 +17,7 @@
                      const advancedResults = await searchWeb('Node.js', {
                      category: 2,
                      recency: 1,
                      maxRetries: 5
                      });
                      -
                diff --git a/docs/functions/src_search_web_search_wikipedia.searchWikipedia.html b/docs/functions/src_search_web_search_wikipedia.searchWikipedia.html index e63ba46..bed615f 100644 --- a/docs/functions/src_search_web_search_wikipedia.searchWikipedia.html +++ b/docs/functions/src_search_web_search_wikipedia.searchWikipedia.html @@ -3,7 +3,7 @@ documentation and is difficult to parse and clean up results.

                Parameters

                • query: string

                  search phrase

                  -
                • options: {
                      filterDisambiguation: boolean;
                      imageSize: number;
                      images: boolean;
                      limitSearchResults: number;
                      plainText: boolean;
                      searchInTitleOnly: boolean;
                      summarySentenceLimit: number;
                  } = {}
                  • filterDisambiguation: boolean

                    default=true Filter disambiguation pages

                    +
                  • Optionaloptions: {
                        filterDisambiguation: boolean;
                        imageSize: number;
                        images: boolean;
                        limitSearchResults: number;
                        plainText: boolean;
                        searchInTitleOnly: boolean;
                        summarySentenceLimit: number;
                    } = {}
                    • filterDisambiguation: boolean

                      default=true Filter disambiguation pages

                    • imageSize: number

                      default=200 Image size in pixels

                    • images: boolean

                      default=true Include image in results

                    • limitSearchResults: number

                      default=1 Limit number of search results

                      @@ -14,7 +14,7 @@
                      await searchWikipedia("JavaScript", { plainText: true })
                       
                      -
                diff --git a/docs/functions/src_similarity_similarity_concept.vectorizeTextAsConcept.html b/docs/functions/src_similarity_similarity_concept.vectorizeTextAsConcept.html deleted file mode 100644 index 62059d1..0000000 --- a/docs/functions/src_similarity_similarity_concept.vectorizeTextAsConcept.html +++ /dev/null @@ -1,16 +0,0 @@ -vectorizeTextAsConcept | ai-research-agent
                • Text embeddings convert words or phrases into numerical vectors in a high-dimensional -space, where each dimension represents a semantic feature extracted by a model like -MiniLM-L6-v2. In this concept space, words with similar meanings have vectors that -are close together, allowing for quantitative comparisons of semantic similarity. -These vector representations enable powerful applications in natural language processing, -including semantic search, text classification, and clustering, by leveraging the -geometric properties of the embedding space to capture and analyze the relationships -between words and concepts.

                  - -

                  Text Embeddings, Classification, and Semantic Search (Youtube)

                  -

                  Parameters

                  • input: string | string[]
                  • options: {
                        batchSize: number;
                        modelName: string;
                    } = {}
                    • batchSize: number

                      default=512 - chunk size for each batch

                      -
                    • modelName: string

                      default="Xenova/all-MiniLM-L6-v2" - The name of the model to use

                      -

                  Returns Promise<number[][]>

                diff --git a/docs/functions/src_similarity_similarity_concept.weighRelevanceConceptVector.html b/docs/functions/src_similarity_similarity_concept.weighRelevanceConceptVector.html index 84e6c89..8189732 100644 --- a/docs/functions/src_similarity_similarity_concept.weighRelevanceConceptVector.html +++ b/docs/functions/src_similarity_similarity_concept.weighRelevanceConceptVector.html @@ -1,8 +1,11 @@ weighRelevanceConceptVector | ai-research-agent
                • Rerank documents' chunks based on relevance to query, based on cosine similarity of their concept vectors generated by a 20MB MiniLM transformer model downloaded locally. -"A Complete Overview of Word Embeddings" https://www.youtube.com/watch?v=5MaWmXwxFNQ&t=323s

                  -

                  Parameters

                  • documents: string[]
                  • query: string
                  • options: any = {}

                  Returns Promise<{
                      content: string;
                      similarity: number;
                  }[]>

                diff --git a/docs/functions/src_similarity_similarity_concept.weighRelevanceConceptVectorAPI.html b/docs/functions/src_similarity_similarity_concept.weighRelevanceConceptVectorAPI.html index d7256b2..6102e37 100644 --- a/docs/functions/src_similarity_similarity_concept.weighRelevanceConceptVectorAPI.html +++ b/docs/functions/src_similarity_similarity_concept.weighRelevanceConceptVectorAPI.html @@ -6,10 +6,13 @@ or longer passage, depending on the model being used.

              • sentences: string[]

                A list of strings which will be compared against the source_sentence.

                -
              • options: {
                    HF_API_KEY: string;
                    model: string;
                } = {}

                Returns Promise<any>

                array of 0-1 similarity scores for each sentence

                -
              • diff --git a/docs/functions/src_tokenize_sentences.splitSentences.html b/docs/functions/src_tokenize_sentences.splitSentences.html index 222ca91..e9e57c0 100644 --- a/docs/functions/src_tokenize_sentences.splitSentences.html +++ b/docs/functions/src_tokenize_sentences.splitSentences.html @@ -1,11 +1,14 @@ splitSentences | ai-research-agent
                • Splits text into sentences, handling 220+ common abbreviations, and inferring acronyms, numbers, URLs, times, names, etc.

                  Parameters

                  • inputText: string

                    The text to be split into sentences.

                    -
                  • options: {
                        maxSize: number;
                        minSize: number;
                        splitOnHtmlTags: boolean;
                    } = {}
                    • maxSize: number

                      default=600 - Maximum size for a sentence.

                      +
                    • Optionaloptions: {
                          maxSize: number;
                          minSize: number;
                          splitOnHtmlTags: boolean;
                      } = {}
                        +
                      • +
                      +
                      • maxSize: number

                        default=600 - Maximum size for a sentence.

                      • minSize: number

                        default=20 - Minimum size for a sentence.

                      • splitOnHtmlTags: boolean

                        default=true - Split on HTML tags like P, DIV, UL, OL.

                  Returns string[]

                  An array of sentences.

                  -
                diff --git a/docs/functions/src_tokenize_stopwords.isWordCommonIgnored.html b/docs/functions/src_tokenize_stopwords.isWordCommonIgnored.html index dbaebda..a48a862 100644 --- a/docs/functions/src_tokenize_stopwords.isWordCommonIgnored.html +++ b/docs/functions/src_tokenize_stopwords.isWordCommonIgnored.html @@ -1,7 +1,7 @@ isWordCommonIgnored | ai-research-agent
                diff --git a/docs/functions/src_tokenize_text_to_chunks.splitTextSemanticChars.html b/docs/functions/src_tokenize_text_to_chunks.splitTextSemanticChars.html index bc2a0c4..2aa8885 100644 --- a/docs/functions/src_tokenize_text_to_chunks.splitTextSemanticChars.html +++ b/docs/functions/src_tokenize_text_to_chunks.splitTextSemanticChars.html @@ -28,7 +28,7 @@
                const text = "# Heading\n\nThis is a paragraph.\n\n- List item 1\n- List item 2\n\n";
                const chunks = splitTextSemanticChars(text);
                console.log(chunks);
                // Output: ['# Heading', 'This is a paragraph.', '- List item 1', '- List item 2']
                -
                diff --git a/docs/functions/src_tokenize_tokenize_topics.tokenizeTopics.html b/docs/functions/src_tokenize_tokenize_topics.tokenizeTopics.html index 0b8cc82..7ec2629 100644 --- a/docs/functions/src_tokenize_tokenize_topics.tokenizeTopics.html +++ b/docs/functions/src_tokenize_tokenize_topics.tokenizeTopics.html @@ -7,13 +7,16 @@ not just split into words and find frequency. Examples are "white house" or "state of the art" which should be searched as a phrase but would return different context if split into words. As Led Zeppelin famously put it: ♫ "'Cause you know sometimes words have two meanings."

                -

                Parameters

                • phrase: string
                • options: {
                      checkRootWords: number;
                      checkTypos: number;
                      ignoreStopWords: number;
                      phrasesModel: any;
                      typosModel: any;
                  } = {}
                  • checkRootWords: number

                    check for word's root stem

                    +

                    Parameters

                    • phrase: string
                    • Optionaloptions: {
                          checkRootWords: number;
                          checkTypos: number;
                          ignoreStopWords: number;
                          phrasesModel: any;
                          typosModel: any;
                      } = {}
                        +
                      • +
                      +
                      • checkRootWords: number

                        check for word's root stem

                      • checkTypos: number

                        check for typos

                      • ignoreStopWords: number

                        ignore 300+ overused words

                      • phrasesModel: any

                        remote model

                      • typosModel: any

                        remote model

                    Returns Token[]

                    ex. [[50, 0, "Albert Einstein"],...]

                    -
                diff --git a/docs/functions/src_tokenize_word_to_root_stem.convertWordToRootStem.html b/docs/functions/src_tokenize_word_to_root_stem.convertWordToRootStem.html index 673af18..85289f6 100644 --- a/docs/functions/src_tokenize_word_to_root_stem.convertWordToRootStem.html +++ b/docs/functions/src_tokenize_word_to_root_stem.convertWordToRootStem.html @@ -9,7 +9,7 @@
                var rootWord = convertWordToRootStem("running"); // returns "run"
                 
                -
                diff --git a/docs/functions/src_topic_distribution_topic_distribution.weighTopicDirichletDistribution.html b/docs/functions/src_topic_distribution_topic_distribution.weighTopicDirichletDistribution.html index 73ff380..d0d9faa 100644 --- a/docs/functions/src_topic_distribution_topic_distribution.weighTopicDirichletDistribution.html +++ b/docs/functions/src_topic_distribution_topic_distribution.weighTopicDirichletDistribution.html @@ -10,7 +10,7 @@ https://www.geeksforgeeks.org/latent-dirichlet-allocation/
                "Topic Models" https://www.youtube.com/watch?v=yK7nN3FcgUs

                Parameters

                • sentences: string[]

                  Array of input sentences.

                  -
                • options: {
                      alpha: number;
                      beta: number;
                      numberOfIterations: number;
                      numberOfTermsPerTopic: number;
                      topicCount: number;
                      valueBurnIn: number;
                      valueSampleLag: number;
                  } = {}

                  Configuration options for LDA.

                  +
                • Optionaloptions: {
                      alpha: number;
                      beta: number;
                      numberOfIterations: number;
                      numberOfTermsPerTopic: number;
                      topicCount: number;
                      valueBurnIn: number;
                      valueSampleLag: number;
                  } = {}

                  Configuration options for LDA.

                  • alpha: number

                    default=0.1 - Dirichlet prior on document-topic distributions.

                  • beta: number

                    default=0.01 - Dirichlet prior on topic-word distributions.

                  • numberOfIterations: number

                    default=1000 - Number of iterations for the LDA algorithm.

                    @@ -21,7 +21,7 @@

                Returns any[]

                • Array of topics, each containing term-probability pairs.
                -
                diff --git a/docs/functions/src_topics_ngrams.extractNounEdgeGrams.html b/docs/functions/src_topics_ngrams.extractNounEdgeGrams.html index bb675f0..b348724 100644 --- a/docs/functions/src_topics_ngrams.extractNounEdgeGrams.html +++ b/docs/functions/src_topics_ngrams.extractNounEdgeGrams.html @@ -18,7 +18,7 @@
                  let terms = [["The", 1], ["quick", 2], ["brown", 2], ["fox", 3], ["jumps", 4]];
                  let nGrams = {};
                  extractNounEdgeGrams(3, terms, 0, nGrams, 3, 1);
                  // nGrams might now contain: {3: {"brown fox jumps": [1]}}
                  -
                  diff --git a/docs/functions/src_topics_rank_sentences_keyphrases.rankSentencesCentralToKeyphrase.html b/docs/functions/src_topics_rank_sentences_keyphrases.rankSentencesCentralToKeyphrase.html index 4424113..8c559d0 100644 --- a/docs/functions/src_topics_rank_sentences_keyphrases.rankSentencesCentralToKeyphrase.html +++ b/docs/functions/src_topics_rank_sentences_keyphrases.rankSentencesCentralToKeyphrase.html @@ -13,7 +13,7 @@ https://dl.acm.org/doi/10.1145/3321408.3326659 https://doi.org/10.1145/3321408.3326659

                  Parameters

                  • sentencesWithKeyphrases: any[]

                    Array of objects, each containing {text, keyphrases}

                  • options: {} = {}

                    Returns any[]

                    Updated array with added weights: [{text, keyphrases, weight}]

                    -
                    diff --git a/docs/functions/src_topics_seektopic_keyphrases.extractSEEKTOPIC.html b/docs/functions/src_topics_seektopic_keyphrases.extractSEEKTOPIC.html index 5261290..d72b24f 100644 --- a/docs/functions/src_topics_seektopic_keyphrases.extractSEEKTOPIC.html +++ b/docs/functions/src_topics_seektopic_keyphrases.extractSEEKTOPIC.html @@ -5,7 +5,7 @@ to find which sentences centralize and tie together keyphrase concepts referred to most by other sentences. Based on the TextRank & PageRank algorithms, it randomly surfs nodes to find probability of being at that node, thus ranking influence.

                    Parameters

                    • docText: string

                      input text to analyze

                      -
                    • options: {
                          heavyWeightQuery: string;
                          limitTopKeyphrases: number;
                          limitTopSentences: number;
                          maxWords: number;
                          minKeyPhraseLength: number;
                          minWordLength: number;
                          minWords: number;
                          phrasesModel: any;
                          topKeyphrasesPercent: number;
                      } = {}
                      • heavyWeightQuery: string

                        query to give heavy weight to

                        +
                      • Optional options: {
                            heavyWeightQuery: string;
                            limitTopKeyphrases: number;
                            limitTopSentences: number;
                            maxWords: number;
                            minKeyPhraseLength: number;
                            minWordLength: number;
                            minWords: number;
                            phrasesModel: any;
                            topKeyphrasesPercent: number;
                        } = {}
                        • heavyWeightQuery: string

                          query to give heavy weight to

                        • limitTopKeyphrases: number

                          default=10 - maximum number of top keyphrases to return

                        • limitTopSentences: number

                          default=5 - maximum number of top sentences to return

                        • maxWords: number

                          default=5 - maximum words in a keyphrase

                          @@ -46,7 +46,7 @@
                          extractSEEKTOPIC(testDoc, { phrasesModel, heavyWeightQuery: "self attention", limitTopSentences: 10,
                           
                          -
                    diff --git a/docs/functions/src_trafilatura_extractor_baseline.baseline.html b/docs/functions/src_trafilatura_extractor_baseline.baseline.html new file mode 100644 index 0000000..910ed7d --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_baseline.baseline.html @@ -0,0 +1,4 @@ +baseline | ai-research-agent
                    • Parameters

                      • filecontent: any

                      Returns any[]

                    diff --git a/docs/functions/src_trafilatura_extractor_baseline.basicCleaning.html b/docs/functions/src_trafilatura_extractor_baseline.basicCleaning.html new file mode 100644 index 0000000..6555149 --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_baseline.basicCleaning.html @@ -0,0 +1,4 @@ +basicCleaning | ai-research-agent
                    • Parameters

                      • tree: any

                      Returns any

                    diff --git a/docs/functions/src_trafilatura_extractor_baseline.html2txt.html b/docs/functions/src_trafilatura_extractor_baseline.html2txt.html new file mode 100644 index 0000000..fc88606 --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_baseline.html2txt.html @@ -0,0 +1,4 @@ +html2txt | ai-research-agent
                    • Parameters

                      • content: any
                      • clean: boolean = true

                      Returns any

                    diff --git a/docs/functions/src_trafilatura_extractor_baseline.loadHtml.html b/docs/functions/src_trafilatura_extractor_baseline.loadHtml.html new file mode 100644 index 0000000..5c259a5 --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_baseline.loadHtml.html @@ -0,0 +1,4 @@ +loadHtml | ai-research-agent
                    • Parameters

                      • content: any

                      Returns any

                    diff --git a/docs/functions/src_trafilatura_extractor_baseline.trim.html b/docs/functions/src_trafilatura_extractor_baseline.trim.html new file mode 100644 index 0000000..7949024 --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_baseline.trim.html @@ -0,0 +1,4 @@ +trim | ai-research-agent
                    • Parameters

                      • text: any

                      Returns any

                    diff --git a/docs/variables/src_extract_content_xpaths.COMMENTS_XPATH.html b/docs/functions/src_trafilatura_extractor_core.bareExtraction.html similarity index 75% rename from docs/variables/src_extract_content_xpaths.COMMENTS_XPATH.html rename to docs/functions/src_trafilatura_extractor_core.bareExtraction.html index 4dadf10..fa6b1cc 100644 --- a/docs/variables/src_extract_content_xpaths.COMMENTS_XPATH.html +++ b/docs/functions/src_trafilatura_extractor_core.bareExtraction.html @@ -1,4 +1,4 @@ -COMMENTS_XPATH | ai-research-agent
                    COMMENTS_XPATH: string[] = ...
                    • Parameters

                      • filecontent: any
                      • options: any

                      Returns Document

                    diff --git a/docs/functions/src_trafilatura_extractor_core.extract.html b/docs/functions/src_trafilatura_extractor_core.extract.html new file mode 100644 index 0000000..455091a --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_core.extract.html @@ -0,0 +1,4 @@ +extract | ai-research-agent
                    • Parameters

                      • htmlOrDom: any
                      • options: {} = {}

                        Returns any

                      diff --git a/docs/functions/src_trafilatura_extractor_courlan_clean.cleanQuery.html b/docs/functions/src_trafilatura_extractor_courlan_clean.cleanQuery.html new file mode 100644 index 0000000..2d61be5 --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_courlan_clean.cleanQuery.html @@ -0,0 +1,11 @@ +cleanQuery | ai-research-agent
                      • Strip unwanted query elements

                        +

                        Parameters

                        • querystring: string

                          The query string to clean

                          +
                        • Optional strict: boolean = false

                          Whether to use strict cleaning

                          +
                        • Optional language: string = null

                          The language to use for cleaning

                          +

                        Returns string

                          +
                        • The cleaned query string
                        • +
                        +
                      diff --git a/docs/functions/src_trafilatura_extractor_courlan_clean.cleanUrl.html b/docs/functions/src_trafilatura_extractor_courlan_clean.cleanUrl.html new file mode 100644 index 0000000..3935c18 --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_courlan_clean.cleanUrl.html @@ -0,0 +1,10 @@ +cleanUrl | ai-research-agent
                      • Helper function: chained scrubbing and normalization

                        +

                        Parameters

                        • url: string

                          The URL to clean

                          +
                        • Optional language: string = null

                          The language to use for cleaning

                          +

                        Returns string

                          +
                        • The cleaned URL or null if invalid
                        • +
                        +
                      diff --git a/docs/functions/src_trafilatura_extractor_courlan_clean.decodePunycode.html b/docs/functions/src_trafilatura_extractor_courlan_clean.decodePunycode.html new file mode 100644 index 0000000..032cfaf --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_courlan_clean.decodePunycode.html @@ -0,0 +1,9 @@ +decodePunycode | ai-research-agent
                      • Probe for punycode in lower-cased hostname and try to decode it

                        +

                        Parameters

                        • string: string

                          The string to decode

                          +

                        Returns string

                          +
                        • The decoded string
                        • +
                        +
                      diff --git a/docs/functions/src_trafilatura_extractor_courlan_clean.normalizeFragment.html b/docs/functions/src_trafilatura_extractor_courlan_clean.normalizeFragment.html new file mode 100644 index 0000000..6430bd0 --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_courlan_clean.normalizeFragment.html @@ -0,0 +1,10 @@ +normalizeFragment | ai-research-agent
                      • Look for trackers in URL fragments using query analysis, normalize the output

                        +

                        Parameters

                        • fragment: string

                          The fragment to normalize

                          +
                        • Optional language: string = null

                          The language to use for normalization

                          +

                        Returns string

                          +
                        • The normalized fragment
                        • +
                        +
                      diff --git a/docs/functions/src_trafilatura_extractor_courlan_clean.normalizePart.html b/docs/functions/src_trafilatura_extractor_courlan_clean.normalizePart.html new file mode 100644 index 0000000..7cae3dd --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_courlan_clean.normalizePart.html @@ -0,0 +1,9 @@ +normalizePart | ai-research-agent
                      • Normalize URL parts (specifically path and fragment) while accounting for certain characters

                        +

                        Parameters

                        • urlPart: string

                          The URL part to normalize

                          +

                        Returns string

                          +
                        • The normalized URL part
                        • +
                        +
                      diff --git a/docs/functions/src_trafilatura_extractor_courlan_clean.normalizeUrl.html b/docs/functions/src_trafilatura_extractor_courlan_clean.normalizeUrl.html new file mode 100644 index 0000000..909f49c --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_courlan_clean.normalizeUrl.html @@ -0,0 +1,12 @@ +normalizeUrl | ai-research-agent
                      • Takes a URL string or a parsed URL and returns a normalized URL string

                        +

                        Parameters

                        • parsedUrl: string | URL

                          The URL to normalize

                          +
                        • Optional strict: boolean = false

                          Whether to use strict normalization

                          +
                        • Optional language: string = null

                          The language to use for normalization

                          +
                        • Optional trailingSlash: boolean = true

                          Whether to keep trailing slashes

                          +

                        Returns string

                          +
                        • The normalized URL
                        • +
                        +
                      diff --git a/docs/functions/src_trafilatura_extractor_courlan_clean.scrubUrl.html b/docs/functions/src_trafilatura_extractor_courlan_clean.scrubUrl.html new file mode 100644 index 0000000..a0526b6 --- /dev/null +++ b/docs/functions/src_trafilatura_extractor_courlan_clean.scrubUrl.html @@ -0,0 +1,9 @@ +scrubUrl | ai-research-agent
                      • Strip unnecessary parts and make sure only one URL is considered

                        +

                        Parameters

                        • url: string

                          The URL to scrub

                          +

                        Returns string

                          +
                        • The scrubbed URL
                        • +
                        +
                      diff --git a/docs/functions/src_extract_content_courlan_core.checkUrl.html b/docs/functions/src_trafilatura_extractor_courlan_core.checkUrl.html similarity index 58% rename from docs/functions/src_extract_content_courlan_core.checkUrl.html rename to docs/functions/src_trafilatura_extractor_courlan_core.checkUrl.html index 1d1b98b..031690c 100644 --- a/docs/functions/src_extract_content_courlan_core.checkUrl.html +++ b/docs/functions/src_trafilatura_extractor_courlan_core.checkUrl.html @@ -1,4 +1,4 @@ -checkUrl | ai-research-agent