From 4c66fefa217e53daf8d59f0f08c79b90e714646a Mon Sep 17 00:00:00 2001 From: vtempest Date: Fri, 13 Sep 2024 18:03:56 -0700 Subject: [PATCH] tardigrade crawler --- docs/assets/highlight.css | 47 +++++-------------- docs/assets/navigation.js | 2 +- docs/assets/search.js | 2 +- docs/functions/convertEmbeddingsToHNSW.html | 8 ++-- docs/functions/convertHTMLSpecialChars.html | 2 +- docs/functions/convertHTMLToBasicHTML.html | 3 +- docs/functions/convertMarkdownToHtml.html | 2 +- docs/functions/convertPDFToHTML.html | 2 +- docs/functions/embedYoutubePlayer.html | 2 +- docs/functions/exportEmbeddingsIndex.html | 2 +- docs/functions/extractContentHTML.html | 2 +- docs/functions/extractContentHTML2.html | 2 +- docs/functions/extractFavicon.html | 2 +- docs/functions/extractSEEKTOPIC.html | 2 +- ...on.html => extractTopicTermGroupsLDA.html} | 2 +- docs/functions/getAllEmbeddings.html | 2 +- .../importVectorIndexFromString.html | 2 +- docs/functions/matchQUASAR.html | 2 +- docs/functions/scrapeURL.html | 9 ++-- docs/functions/searchSTREAM.html | 2 +- docs/functions/searchVectorIndex.html | 2 +- docs/functions/searchWeb.html | 2 +- docs/functions/searchWikipedia.html | 2 +- docs/functions/splitTextSemanticChars.html | 2 +- docs/functions/stemWordToRoot.html | 2 +- .../functions/suggestNextWordCompletions.html | 4 +- docs/index.html | 14 +++--- docs/modules.html | 2 +- index.js | 4 +- package.json | 2 +- readme.md | 21 +++------ src/extractor/url-to-content/scrape-url.js | 5 +- src/similarity/similarity-vector.js | 7 ++- src/topics/topic-distribution.js | 2 +- test/topic-distribution.test.js | 4 +- 35 files changed, 74 insertions(+), 100 deletions(-) rename docs/functions/{weighTopicDirichletDistribution.html => extractTopicTermGroupsLDA.html} (74%) diff --git a/docs/assets/highlight.css b/docs/assets/highlight.css index 385c713..3d1f58d 100644 --- a/docs/assets/highlight.css +++ b/docs/assets/highlight.css @@ -3,28 +3,22 @@ --dark-hl-0: #569CD6; --light-hl-1: #000000; 
--dark-hl-1: #D4D4D4; - --light-hl-2: #000000; + --light-hl-2: #001080; --dark-hl-2: #9CDCFE; - --light-hl-3: #000000; + --light-hl-3: #795E26; --dark-hl-3: #DCDCAA; - --light-hl-4: #000000; + --light-hl-4: #A31515; --dark-hl-4: #CE9178; - --light-hl-5: #A31515; - --dark-hl-5: #CE9178; - --light-hl-6: #001080; - --dark-hl-6: #9CDCFE; - --light-hl-7: #795E26; - --dark-hl-7: #DCDCAA; - --light-hl-8: #008000; - --dark-hl-8: #6A9955; - --light-hl-9: #0070C1; - --dark-hl-9: #4FC1FF; - --light-hl-10: #EE0000; - --dark-hl-10: #D7BA7D; - --light-hl-11: #AF00DB; - --dark-hl-11: #C586C0; - --light-hl-12: #098658; - --dark-hl-12: #B5CEA8; + --light-hl-5: #008000; + --dark-hl-5: #6A9955; + --light-hl-6: #0070C1; + --dark-hl-6: #4FC1FF; + --light-hl-7: #EE0000; + --dark-hl-7: #D7BA7D; + --light-hl-8: #AF00DB; + --dark-hl-8: #C586C0; + --light-hl-9: #098658; + --dark-hl-9: #B5CEA8; --light-code-background: #FFFFFF; --dark-code-background: #1E1E1E; } @@ -40,9 +34,6 @@ --hl-7: var(--light-hl-7); --hl-8: var(--light-hl-8); --hl-9: var(--light-hl-9); - --hl-10: var(--light-hl-10); - --hl-11: var(--light-hl-11); - --hl-12: var(--light-hl-12); --code-background: var(--light-code-background); } } @@ -57,9 +48,6 @@ --hl-7: var(--dark-hl-7); --hl-8: var(--dark-hl-8); --hl-9: var(--dark-hl-9); - --hl-10: var(--dark-hl-10); - --hl-11: var(--dark-hl-11); - --hl-12: var(--dark-hl-12); --code-background: var(--dark-code-background); } } @@ -74,9 +62,6 @@ --hl-7: var(--light-hl-7); --hl-8: var(--light-hl-8); --hl-9: var(--light-hl-9); - --hl-10: var(--light-hl-10); - --hl-11: var(--light-hl-11); - --hl-12: var(--light-hl-12); --code-background: var(--light-code-background); } @@ -91,9 +76,6 @@ --hl-7: var(--dark-hl-7); --hl-8: var(--dark-hl-8); --hl-9: var(--dark-hl-9); - --hl-10: var(--dark-hl-10); - --hl-11: var(--dark-hl-11); - --hl-12: var(--dark-hl-12); --code-background: var(--dark-code-background); } @@ -107,7 +89,4 @@ .hl-7 { color: var(--hl-7); } .hl-8 { color: var(--hl-8); } .hl-9 { 
color: var(--hl-9); } -.hl-10 { color: var(--hl-10); } -.hl-11 { color: var(--hl-11); } -.hl-12 { color: var(--hl-12); } pre, code { background: var(--code-background); } diff --git a/docs/assets/navigation.js b/docs/assets/navigation.js index 1847640..867014f 100644 --- a/docs/assets/navigation.js +++ b/docs/assets/navigation.js @@ -1 +1 @@ -window.navigationData = "data:application/octet-stream;base64,H4sIAAAAAAAACpXWTY/aMBCA4f+SM2qlVdUDN5YPgbpsaWKKqqoH40wTi8STOhMWVPW/V6GUJos9Tm9Ifv0MISbw9WdEcKJoHCk0R7A0L/eQptpktcDlc7KLRlElKY/G0ffGKNJo6ree9E1OZRGNooM2aTR+/+7XiMO368lmIN6mYXwp1k9JBUrLYppLWzP463QYLvBR1lq1LwN2pwzTa2kPKb4Ygcs/qU/uh2F4M1sIDLzbWxPmBJxI4O2+MOircigt8ACGu23dLIx+wYaaPQhstzFqr2NYaK/nGm8KeQbrQu8rjjxV2D3qK5PCyam6QhYmK5Xzqq9L4c1TTcAA7fIABA2BId8pvK/+i3wYZj6E0YU8aoWG8a5FmErm8w/i42Y1ZbBbw3AZGLCS4EmarJEZrDGFIoaqOLtgf82OoElR/DtWbrjf8Nytu8z3eP2IAXXZHvvPoAjt5cgvLJYJWc/jh8mZIaUklX/aTpJJ7EI7ywxSKysr2MbOU35b5ACQVuWJiOeTtdPorAeZzkfgtzpRENzB3g/tYB8G9EFXkGrJMH8TDqsKTQm032wFzuPaL0JU+8hPoJSGtPL+aXCXHE1Q7tCmAmNE50O4X3BUk2VQ0zOcqO2nWFYFXAwn662ZES+gszyGAo7SKJiiUVBdv0STzco1ht8xeJQAWy4s/GjAKOczjclDQxJd6kJaTefHc3u7pCL377W/Do0QWGk101arvACa6Zqs3jet6p3j33I37NtvmRky8ZoLAAA=" \ No newline at end of file +window.navigationData = 
"data:application/octet-stream;base64,H4sIAAAAAAAACpXWUY+aQBDA8e/Cs2mTS9MH3zhPq6leLaw1TdOHdZmDjbBDl8HTNP3uDdazcC6z3JvJ/vmNCKz8+B0QHCkYBwrNASxNix0kiTZpJXD+GG+DUVBKyoJx8FQbRRpN9b4nfZdRkQejYK9NEow/fvgz4vDNKlwPxJvUj8/FahmXoLTMJ5m0FYO/TofhAu9lpVXz0WO3Sj+9knaf4LMROP+X9snd0A+vH2YCPd/22vg5AUcSeL0uDPqqHEoL3IPhLls786PfsaZ6BwKbwxi10zEsNOdzide5PIF1obcVRx5LbN/qC5PA0am6QhYmK5XzrC9L/oMnmoABmuUBCBoCQ3134W31JvJumHnnR2fyoBUaxrsUfiqeTj+LL+vFhMGujZ8TWGolwBafLNZltXwIGfc2ZgakYMBKgqU0aS1TWGECeQRlfnJN6K/ZERTm+f/71g13G567duf5PV43YkBdNM/VN1CE9vxMzSwWMdme/Y3JmSGFJJV93YRxGLnQ1jKDVMrKEjaR8zG6LnIASKuyWETTcOU0WutepvUT9FutyAtuYdcPbWHnB/Rel5BoyTAvCYeVuaYYmq1DgfN27RY+qvlPiaGQhrTqfStxlxxNUGzRJgIjROcu3y04qk5TqOgRjtT0EyzKHM6Gk+2tmRHPoNMsghwO0iiYoFFQXh6icL1wjeGPGDyq2QhnFn7VYJRzT2Ny35BYFzqXVtPp/tRcLqnI/ULQX9+M+PkXmXeFWo4LAAA=" \ No newline at end of file diff --git a/docs/assets/search.js b/docs/assets/search.js index bdb7695..be14f23 100644 --- a/docs/assets/search.js +++ b/docs/assets/search.js @@ -1 +1 @@ -window.searchData = 
"data:application/octet-stream;base64,H4sIAAAAAAAACqWZbU/jOBCA/4s/RyyeQFv6jWVBu7plj6Pl0KlCpzQZ2ogkzjouL6r47ycnaTOuHeRw3xCZeeykz9jOZMukeKnYdLFlT2mRsOnoJGBFlCObshUWKCOFP6NitYlWeC0SzG6xzN5YwDYyY1P2uClilYqi+tIffLRWecYCFmdRVWHFpoy9B47hYlE8o1TXkXxKxEsxF9+bvMORnHG+g6R5KaT6G2Ml5I8iwdcrKfKZkmmxcgz1QfTAu/o+v/45KzFOo+xiHcmq/74OIwcOdPPtai40o3+EfYgvGl+VjGLlILZXBoKuouc0FkU/rw3wxuZLTP4RG7VZ4k0WvaF0oa0gX3ylML8XMpmLWyFcT8EMGPgwLlKF/U9CXx0KFIXCQvU4YAd9Hg9efPhEqczF16hK4489Pgj0vw9d1JdahyQtVlVd2M47ccQNvJcuey6+/5rd99/MYeT/GOju+vzGbyAdOXCgOb6qudhD+sc5CPzUMHPxhMUHayWNGjhAuxLMhWb0j2CEDayU2eXlH/M/b35c9JfJPsQXvUJ1nmXdb+jcic2QAeh9Ur17u9lmjC88j1S8/uvufHZ+68CSq96LciyjEu9uXevD/po3DCMZr2fz28vzaxePXB6GvMdlL+8elwNh6VNaYpJG/chdxDAwOeD0okmMN7zMUjVDvRHE6DLVDBiE1eU4wzwqVBr3Hajcgd7DbFYrrNQvfFV6b78QeZlhDXYN1RvsO9wLpqv1LM3TLJKpevv6picbxcp5lukPHjTcLWb4HBUxXogixrI96J7f/Ogbsjfhc8POUeZXEn9vsIhd7xQfRA8acC7KNP6WyjReZ6i+pZWS6XKjx+gbtD/DNfBDwNK6eKZb9oyy0uApg6Pw6IwF7DHFLNGvWM2cAhaLPMdCbzqJiDf1nw9tWPNAdXAT/eWYBYvjIORH4wl/eAgWu+T6Qv2PHaP7T53IWbDgrkRuJXIjEViwAFciWIlgJIYsWISuxNBKDI3EExYsTlyJJ1biiZF4yoLFqSvx1Eo8NRJHLFiMXIkjK3FkJI5ZsBi7EsdW4thInLBgMXElTqzEiZF4xoLFmSvxzEo8MwXQPnCnO9yWhx/YU+vj9schkGkQ115wp0PcloibFnHtBnd6xG2RuGkS135wp0vclombNnHtCHf6xG2huGkU155wp1PcloqbVnHtCnd6xW2xuGkW175wp1vcloubdnHtDHf6xW3BuGkYaGfAaRjYhoFpGGhnwGkY2IbBwRpVL1LuVcqxTJmGgXYGnIaBbRiYhoF2BpyGgW0YmIaBdgachoFtGJiGgXYGnIaBbRiYhoF2BpyGgW0YmIaBdgachoFtGJiGgXYGnIaBbRiYhoXamdBpWGgbFpqGhdqZ0GlYaBsWmoaF2pnQaVhoGxYe7IT1VujeCx2boWlYqJ0JnYaFtmHtv+oTiH5fxaQ5xusDRPsKi/s3QSXWRfXCgi37tz2v8P1Jacv4CZtu39+788l0+06OKPpaMxmLusmj0qCeEurpEKo+XFVN8zNuTvUdNeyg4VCmEkvdJmrObmSiQCYKQ6B5231WwmIS5BBimTzaMPLzDPp1FL4qJbDr0pD5jcgER8Ohqu3JEOKYEMdDiG9Nd0UJVTdhCHJCkBMvZH2zLbBsm8AdkEzRb4ZN96+zvOHQKRIfuZ+Q+z56RyGV4lcoLSOuu8Yd56zjnA3iNH1aS+JjcnPHnwSCQaR14VcYLfFx97WggxGF/QxuURXik9KvdsbMyLPjfg9v97Eraz925boVJpsvYx2YPEO/R7hCFWUZkuZeBwNCA2/cnpU3DT3CI78H+P0ezdew5/oVtaY8SpFX7bczQiZgL27d/fu9iarIKFmgID9S0/jTb/SUQwoV/Aq1aXlVSmKUGyiyHoPfgtygyEMzeGRZAr91qeG96L4i4ZBlBPzWkZbTtRQJjZQX+NVX3WqrulYfgZ
GVHPxW8hqmd4Sq7dtZRwEgJQt+Jau/0b0ImSgh6494HYxM0HN+TbOvwFeliTHtDJLjCinY0K9g69aT3PW74qbH1qgTlakBJ7UR+tWGCVco88eu9UbI9JjlVy01udp3I5dvcde6JGCy4IR+C04NrtfrZNeKS4zmHaGTygw9KvMhYGVaYpYWyKaLh/f3/wAYwqnGCSEAAA=="; \ No newline at end of file +window.searchData = "data:application/octet-stream;base64,H4sIAAAAAAAACqWZbU/cOBCA/4s/RxSPYd++bSlcqys9jk0PnVboFJJhiUjiNPHyIsR/r5yEjb2eIIf7htiZx07yeOxMXlglH2u2WL+w+7RI2GJyFLAiypEt2AYLrCKF36Nis402eC4TzC6xzJ5ZwLZVxhbsdlvEKpVF/Wk4+OBO5RkLWJxFdY01WzD2GhDDxbJ4wEqdR9V9Ih+LUH5t8/ZHIuN8B0nzUlbqH4yVrL4VCT6dVTJfqSotNsRQ70SPvKqv4fn3VYlxGmUnd1FVD1/XfuTIgS6+nIVSM4ZH2IX4ovFJVVGsCGL3y0jQWfSQxrIY5nUB3tj8BpN/5VZtb/Aii56xotBOkC++VphfySoJ5aWU1F2wA0bejJNU4fCd0L+OBcpCYaEGHHCDPo4HLz58YKmE8nNUp/H7Hu8F+l+HXtSnWockLTZ1s7DJKyHiRl5Lnx3Krz9WV8MXsx/5Pwb6eb688BtIR44cKMQnFcodZHicvcAPDRPKeyzeqZVm1MgBukoQSs0YHsEKG7lSVqenf4Z/XXw7GV4muxBf9AbVMsv6Z0juxHbICPQuqdm9abYd4wvPIxXf/f1zuVpeEljjV++iHFdRiT8vqfqw+80bhlEV363Cy9PlOcUzfh6HvMKbQd4V3oyEpfdpiUkaDSPfIsaBjQPOINqI8YaXWapWqDeCGClT7YBRWL0cV5hHhUrjoQMVHeg9zHazwVr9wCel9/YTmZcZNmBqqMFg3+EeMd3crdI8zaIqVc+fn/Vko1iRZ5nh4FHDXWKGD1ER44ksYiy7g+7y4tvQkIMJHxs2xCo/q/DXFouYeqd4J3pkKQ5lmcYa8Eclt2X9/ctyuCa7sdRg1wFLmwWzeGEPWNWpLNiCwYE4mLOA3aaYJfq1qp1HwGKZ51jojSaR8bb587oLa2+iDm6jPx2yYH0YCH4wnfHr62D9ltz80PzjjdH/p0nkLFhzKpE7idxKBBasgUoEJxGsRMGCtaAShZMorMQjFqyPqMQjJ/HISjxmwfqYSjx2Eo+txAkL1hMqceIkTqzEKQvWUypx6iROrcQZC9YzKnHmJM6sxDkL1nMqce4kzm0BtA+cdIe78vA9exp9aH8IgWyDuPaCkw5xVyJuW8S1G5z0iLsicdskrv3gpEvclYnbNnHtCCd94q5Q3DaKa0846RR3peK2VVy7wkmvuCsWt83i2hdOusVdubhtF9fOcNIv7grGbcNAOwOkYeAaBrZhoJ0B0jBwDYO9GtUUKbpKEWXKNgy0M0AaBq5hYBsG2hkgDQPXMLANA+0MkIaBaxjYhoF2BkjDwDUMbMNAOwOkYeAaBrZhoJ0B0jBwDQPbMNDOAGkYuIaBbZjQzgjSMOEaJmzDhHZGkIYJ1zBhGya0M4I0TLiGib2dsNkK6b2Q2Axtw4R2RpCGCdew7l/NCUS/o2LSHt31AaJ7bcXd25+Sd0X9yIIX9l93XuG709EL40ds8fL62p9PFi+vxhFF/9ZOxqFu86i0qMcG9XgMVR+u6rbhGbcn+Z4qeqgYy1TyRreG2rObMVEwJgpjoHnXcVbSYRrIMcQyuXVhxuMZ9XQUPiklse/MGPObGBOcjIeqrg9jEKcGcTqG+Nx2VJRUTePFQM4M5MwL2VxsByy7xm8PNKboN8O249db3nLMKRo+cj8hd73znmKsFL+F0jHiplPcc+Y9Zz6K0/ZmHYkPjYs7
/CAQLKK5LvwWRke8fftC0MMMhf0M7lA14r3Sr3TWzIx7x0fdvAalsMo3zdthlkRWvTIWr/BbvW8fzrLuw1mu22pV+5WtBxvPxu/RbFBFWYZGo7CHgUEDb9yOlbfNQYNnPGfwe87tl7WH5tW3odxWMq+773AG2QB7cZtO4q9tVEdWKQAT5Edqm4i6R2ByjAIAfgWgbZ/VqsIot1CGKuCnSosybprFM8od+NW7lveoe5QGxyhP4FefOk7fnjRoxrIFv3XbtO3qvm1owIwdAvx2iAamd5q66wE6RwwwSgH4lQL9ve9RVomSVfNBsIcZE/ScX9s4LPBJaWJsdhmNsmIsWOG3YJveWfXWO4vbfl2rTlSmFtxYG8JvbdhwXQ1v+zaeQTaPb36rpSHXu87mzXPct0ENsFFwhEfBuQ5YmZaYpQWyxfr69fU3LvU8QvcgAAA="; \ No newline at end of file diff --git a/docs/functions/convertEmbeddingsToHNSW.html b/docs/functions/convertEmbeddingsToHNSW.html index a23c5a5..f77b3fd 100644 --- a/docs/functions/convertEmbeddingsToHNSW.html +++ b/docs/functions/convertEmbeddingsToHNSW.html @@ -1,6 +1,5 @@ convertEmbeddingsToHNSW | ai-research-agent

Function convertEmbeddingsToHNSW

  • Generates vectors for a set of documents and creates an HNSW index using -hnswlib-node WASM JS for efficient similarity search.

    -

    "If AI is Humanity's Last Invention, then Vector Space is the Final Frontier."

    +hnswlib in C++ compiled to WASM JS for efficient similarity search.

    ANN Benchmarks

    Pinecone - HNSW

    Wikipedia - HNSW

    @@ -9,8 +8,9 @@
    • maxElements: number

      The maximum number of data points.

    • numDimensions: number

      The length of data point vector that will be indexed.

Returns Promise<HierarchicalNSW>

The created HNSW index.

-
diff --git a/docs/functions/convertHTMLSpecialChars.html b/docs/functions/convertHTMLSpecialChars.html index 98de201..095a8c0 100644 --- a/docs/functions/convertHTMLSpecialChars.html +++ b/docs/functions/convertHTMLSpecialChars.html @@ -4,7 +4,7 @@
  • unescape: boolean = true

    default=true - If true, converts & codes to characters. If false, converts characters to codes.

  • Returns string

    The processed string.

    -
    var normalHTML = convertHTMLSpecialChars('&lt;p&gt;This &amp; that &copy; 2023 '+
    '&quot;Quotes&quot;&#39;Apostrophes&#39; &euro;100 &#x263A;&lt;/p&gt;', true)
    console.log(normalHTML) // Returns: "<p>This & that © 2023 "Quotes" 'Apostrophes' €100 ☺</p>" +
    var normalHTML = convertHTMLSpecialChars('&lt;p&gt;This &amp; that &copy; 2023 '+
    '&quot;Quotes&quot;&#39;Apostrophes&#39; &euro;100 &#x263A;&lt;/p&gt;', true)
    console.log(normalHTML) // Returns: "<p>This & that © 2023 "Quotes" 'Apostrophes' €100 ☺</p>"
    diff --git a/docs/functions/convertMarkdownToHtml.html b/docs/functions/convertMarkdownToHtml.html index cdd0542..ee35d1d 100644 --- a/docs/functions/convertMarkdownToHtml.html +++ b/docs/functions/convertMarkdownToHtml.html @@ -9,7 +9,7 @@

    Parameters

    • markdown: string

      The Markdown-formatted text to be converted.

    Returns string

    The resulting HTML string.

    -
    const markdown = "# Header\n\nThis is **bold** and *italic* text.\n\n* List item 1\n* List item 2";
    const html = convertMarkdownToHtml(markdown);
    console.log(html);
    // Output:
    // <h1>Header</h1>
    // <p>This is <strong>bold</strong> and <em>italic</em> text.</p>
    // <ul><li>List item 1</li><li>List item 2</li></ul> +
    const markdown = "# Header\n\nThis is **bold** and *italic* text.\n\n* List item 1\n* List item 2";
    const html = convertMarkdownToHtml(markdown);
    console.log(html);
    // Output:
    // <h1>Header</h1>
    // <p>This is <strong>bold</strong> and <em>italic</em> text.</p>
    // <ul><li>List item 1</li><li>List item 2</li></ul>
    diff --git a/docs/functions/extractContentHTML.html b/docs/functions/extractContentHTML.html index b2a9aa3..4d392bc 100644 --- a/docs/functions/extractContentHTML.html +++ b/docs/functions/extractContentHTML.html @@ -67,7 +67,7 @@
  • minTextLength: number

    default=25 - Minimum length of text to be considered valid

  • retryLength: number

    default=250 - Length to retry content extraction if initial attempt fails

  • Returns Element

    Extracted HTML element of main content such as article body

    -
    var url = "https://www.nytimes.com/2024/08/28/business/telegram-ceo-pavel-durov-charged.html"
    const html = await (await fetch(url)).text();
    var articleContent = extractContentHTML(html); +
    var url = "https://www.nytimes.com/2024/08/28/business/telegram-ceo-pavel-durov-charged.html"
    const html = await (await fetch(url)).text();
    var articleContent = extractContentHTML(html);

    Based on Mozilla Readability (2015), Arc90 (2010)

    diff --git a/docs/functions/extractContentHTML2.html b/docs/functions/extractContentHTML2.html index 2de6a04..bc3415a 100644 --- a/docs/functions/extractContentHTML2.html +++ b/docs/functions/extractContentHTML2.html @@ -32,7 +32,7 @@ IDs (e.g., subtract if a node has a className of 'comment', add if a node has an ID of 'entry-content').

    Returns string

    The extracted content as an HTML string, or null if extraction fails.

    Based on Postlight Mercury Parser (2017-)

    -
    var url =  "https://en.wikipedia.org/wiki/David_Hilbert"
    var html = await (await fetch(url)).text();
    var content = extractContentHTML(html);
    console.log(content); // HTML content of main article body +
    var url =  "https://en.wikipedia.org/wiki/David_Hilbert"
    var html = await (await fetch(url)).text();
    var content = extractContentHTML(html);
    console.log(content); // HTML content of main article body

    Function weighTopicDirichletDistribution

    • Latent Dirichlet (pronounced Dee-ruesh-ley) allocation is used +extractTopicTermGroupsLDA | ai-research-agent

      Function extractTopicTermGroupsLDA

      • Latent Dirichlet (pronounced Dee-ruesh-ley) allocation is used in natural language processing to discover abstract topics in a collection of documents. It is a generative probabilistic model that assumes documents are mixtures of topics, where a topic diff --git a/docs/functions/getAllEmbeddings.html b/docs/functions/getAllEmbeddings.html index 4d502b6..be20432 100644 --- a/docs/functions/getAllEmbeddings.html +++ b/docs/functions/getAllEmbeddings.html @@ -2,7 +2,7 @@

        Parameters

        • index: HierarchicalNSW

          The HNSW index containing the embeddings.

        • Optionalprecision: number = 3

          The number of decimal places to round to.

        Returns number[][]

        An array of embedding vectors.

        -
      diff --git a/docs/functions/importVectorIndexFromString.html b/docs/functions/importVectorIndexFromString.html index 8b76d16..db3546b 100644 --- a/docs/functions/importVectorIndexFromString.html +++ b/docs/functions/importVectorIndexFromString.html @@ -4,7 +4,7 @@
    • space: string = "cosine"

      The space type of the index (e.g., 'l2', 'ip', 'cosine').

    Returns Promise<any>

    A promise that resolves to the imported HNSW index object.

    If there's an error during the index deserialization process.

    -
    diff --git a/docs/functions/matchQUASAR.html b/docs/functions/matchQUASAR.html index 9c0735f..4b8f2c9 100644 --- a/docs/functions/matchQUASAR.html +++ b/docs/functions/matchQUASAR.html @@ -3,7 +3,7 @@ in web search engines. Single line function that can be used anywhere, such as UI inputs to filter a data list.

    Parameters

    • document: string
    • query: string

    Returns boolean

    true if doc has all words and "phrases in quotes"

    -
    var isFound = matchQUASAR(`Ask not what your country can do for you, 
    ask what you can do for your country. is nothing to fear but fear itself.`,
    ` "Ask not" "but fear itself" nothing`) // returns true +
    var isFound = matchQUASAR(`Ask not what your country can do for you, 
    ask what you can do for your country. is nothing to fear but fear itself.`,
    ` "Ask not" "but fear itself" nothing`) // returns true

    Gulakov, A. (2024)

    diff --git a/docs/functions/scrapeURL.html b/docs/functions/scrapeURL.html index ec6c839..6f25f67 100644 --- a/docs/functions/scrapeURL.html +++ b/docs/functions/scrapeURL.html @@ -3,9 +3,10 @@ Scraping internet pages is a free speech right globally.

      -
    1. Docker container with NodeJS server API takes url and renders with puppeteer DOM to get all HTML.
    2. +
    3. Docker container with NodeJS server API renders with puppeteer DOM to get all HTML loaded by +secondary in-page API requests after the initial page request, including user login and cookie storage.
    4. Bypass Cloudflare bot check: A webpage proxy that request through Chromium (puppeteer) - can be used -to bypass Cloudflare anti bot / anti ddos on any application (like curl)
    5. +to bypass Cloudflare anti bot using cookie id javascript method.
    6. Send your request to the server with the port 3000 and add your URL to the "url" query string like this: http://localhost:3000/?url=https://example.org
    @@ -23,11 +24,11 @@

    Returns Promise<any>

    • HTML, JSON, arraybuffer, or error object
    -
    await scrapeURL("https://hckrnews.com", {timeout: 5, userAgentIndex: 1})
    +
    await scrapeURL("https://hckrnews.com", {timeout: 5, userAgentIndex: 1})
     

    Gulakov, A. (2024)

    -
    diff --git a/docs/functions/searchSTREAM.html b/docs/functions/searchSTREAM.html index e793d50..2ec7adf 100644 --- a/docs/functions/searchSTREAM.html +++ b/docs/functions/searchSTREAM.html @@ -15,7 +15,7 @@
  • recencyIndex: number

    default=0 - Index representing the recency of results.

  • Returns Promise<any[]>

    A promise that resolves to an array containing the search results, extracted information, and generated answer.

    -
    const advancedResults = await searchSTREAM('Latest developments in quantum computing', {
    categoryIndex: 2,
    recencyIndex: 1,
    maxRetries: 5,
    maxTopResultsToExtract: 10
    }); +
    const advancedResults = await searchSTREAM('Latest developments in quantum computing', {
    categoryIndex: 2,
    recencyIndex: 1,
    maxRetries: 5,
    maxTopResultsToExtract: 10
    });

    Gulakov, A. (2024)

    diff --git a/docs/functions/searchVectorIndex.html b/docs/functions/searchVectorIndex.html index 0998a56..9c451df 100644 --- a/docs/functions/searchVectorIndex.html +++ b/docs/functions/searchVectorIndex.html @@ -1,4 +1,4 @@ -searchVectorIndex | ai-research-agent

    Function searchVectorIndex

    • Parameters

      • index: any
      • query: any
      • options: {} = {}

        Returns Promise<any>

      Function searchVectorIndex

      • Parameters

        • index: any
        • query: any
        • options: {} = {}

          Returns Promise<any>

        diff --git a/docs/functions/searchWeb.html b/docs/functions/searchWeb.html index d6e0ccc..f8884bf 100644 --- a/docs/functions/searchWeb.html +++ b/docs/functions/searchWeb.html @@ -11,7 +11,7 @@
      • recency: number

        default=0 - ["", "day", "week", "month", "year"]

      • Returns Promise<{
            cached: string;
            engines: string[];
            snippet: string;
            title: string;
            url: string;
        }[]>

        An array of search result objects.

        Throws an error if the search fails after all retry attempts.

        -
        const advancedResults = await searchWeb('Node.js', {
        category: 2,
        recency: 1,
        maxRetries: 5
        }); +
        const advancedResults = await searchWeb('Node.js', {
        category: 2,
        recency: 1,
        maxRetries: 5
        });

        Gulakov, A. (2024)

        diff --git a/docs/functions/searchWikipedia.html b/docs/functions/searchWikipedia.html index 06e4622..4bd18e1 100644 --- a/docs/functions/searchWikipedia.html +++ b/docs/functions/searchWikipedia.html @@ -11,7 +11,7 @@
      • searchInTitleOnly: boolean

        default=false Search in title only

      • summarySentenceLimit: number

        default=3 Limit summary to this many sentences

      • Returns any

        {results: [ {title, summary, image}, ...]}

        -
        await searchWikipedia("JavaScript", { plainText: true })
        +
        await searchWikipedia("JavaScript", { plainText: true })
         

        ai-research-agent

        +

        Javascript Docs (airesearch.wiki)

        +

        Live Demo (qwksearch.com)

        Being is Becoming
        - Whatever the future of research can be,
        - That is what it Must Become.
        + Whatever Research Can Be,
        + That is What It Must Become.
        If AI is Humanity's Last Invention,
        Then Vector Space is the Final Frontier.

        -

        +

        searchSTREAM Docs

        @@ -58,7 +60,7 @@

        extract Docs

          -
        • Extract URL or HTML to main content, based on Readability with improved version
        • +
        • Extract URL or HTML to main content, improved version combining Mozilla Readability and Postlight Mercury
        • using 100+ custom adapters for major websites.
        • Strips to basic HTML for reading mode or saving research notes
        • Youtube - get full transcript for video if detected a youtube video
        • @@ -149,7 +151,7 @@

          suggestNextWordCompletions Docs

          Search-on-keystroke and load this JSON index for word and phrase completion, sorted by how common the terms are with IDF, for search autocomplete dropdown. Tokenizing by word can often have a meaning widely different than if it is part of a phrase, so it is better to extract phrases by first-word next-words pairings. Search results will be more accurate if we infer likely phrases and search for those words occurring together and not just split into words and find frequency. Examples are "white house" or "state of the art" which should be searched as a phrase but would return different context if split into words. As Led Zeppelin famously put it: ♫ "'Cause you know sometimes words have two meanings."

          PRs Welcome

          -
        diff --git a/docs/modules.html b/docs/modules.html index ff17a53..9c000df 100644 --- a/docs/modules.html +++ b/docs/modules.html @@ -15,6 +15,7 @@ extractContentHTML2 extractFavicon extractSEEKTOPIC +extractTopicTermGroupsLDA generateLanguageModelReply getAllEmbeddings getEmbeddingModel @@ -32,7 +33,6 @@ weighRelevanceConceptVectorAPI weighRelevanceTermFrequency weighSimilarityByCharacter -weighTopicDirichletDistribution