fix seektopic edge cases, tests working
vtempest committed Aug 7, 2024
1 parent 1f397f2 commit f265d56
Showing 13 changed files with 771 additions and 111 deletions.
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
 {
   "name": "ai-research-agent",
-  "version": "0.9.1",
+  "version": "0.9.2",
   "module": "index.js",
   "type": "module",
   "author": "vtempest",
2 changes: 1 addition & 1 deletion src/extractor/html-to-content/html-to-content.js
@@ -19,7 +19,7 @@ export default async function extractContent(documentOrHTML, options = {}) {
     formatting = true,
     absoluteURLs = 1,
     url = "",
-    usePostlightExtractor = 1,
+    usePostlightExtractor = 0,
   } = options;
   if (typeof documentOrHTML === "string") var html = documentOrHTML;
   else var html = documentOrHTML.documentElement.innerHTML;
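The hunk above flips the default extractor from Postlight to the built-in path. A minimal usage sketch of the changed default, assuming the default export shown in the hunk; the import path, sample HTML, and URL are illustrative, not from the diff:

import extractContent from "./src/extractor/html-to-content/html-to-content.js";

const html =
  "<html><body><article><h1>Title</h1><p>Body text.</p></article></body></html>";

// As of this commit usePostlightExtractor defaults to 0, so the built-in
// extraction runs; pass 1 to restore the previous Postlight-first behavior.
const content = await extractContent(html, {
  url: "https://example.com/article", // illustrative base URL for absoluteURLs
  usePostlightExtractor: 1,
});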
14 changes: 7 additions & 7 deletions src/keyphrases/ngrams.js
@@ -1,4 +1,4 @@
-import stopWords from "../tokenize/stopwords";
+import { isStopWord } from "../tokenize/stopwords";
 
 /**
  * Searches terms from index for ngram of given size
@@ -26,16 +26,16 @@ export default function extractNounEdgeGrams(
 
     var nextWords = terms.slice(index, index + nGramSize);
     // is noun or is a stop word like 'state of the art'
 
     if (
       isNoun(nextWords[0]) &&
-      isNoun(nextWords[nGramSize - 1]) &&
-      nextWords.every(
+      isNoun(nextWords[nGramSize - 1])
+      && nextWords.every(
         (word) =>
-          word[4]?.length >= minWordLength &&
-          (isNoun(word) || stopWords.includes(word[4]))
-      )
+          word[0]?.length >= minWordLength &&
+          (isNoun(word) || isStopWord(word[0])) )
     ) {
-      var nextWordsString = nextWords.map((v) => v[4]).join(" ");
+      var nextWordsString = nextWords.map(v => v[0]).join(" ");
       if (!nGrams[nGramSize][nextWordsString])
         nGrams[nGramSize][nextWordsString] = [];
 
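This hunk changes two things: the stop-word check now goes through the named isStopWord helper, and the surface string is read from token index 0 rather than 4. A standalone sketch of the edge-gram condition under those assumptions; isNoun and the exact token layout are inferred from the diff, not shown in full:

import { isStopWord } from "../tokenize/stopwords";

// An n-gram qualifies when it starts and ends on a noun and every word is
// long enough and is either a noun or a stop word, which admits phrases
// like "state of the art". Tokens are arrays with the word string at [0].
function isNounEdgeGram(nextWords, isNoun, minWordLength) {
  return (
    isNoun(nextWords[0]) &&
    isNoun(nextWords[nextWords.length - 1]) &&
    nextWords.every(
      (word) =>
        word[0]?.length >= minWordLength &&
        (isNoun(word) || isStopWord(word[0]))
    )
  );
}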
6 changes: 3 additions & 3 deletions src/keyphrases/rank-sentences-keyphrases.js
@@ -18,7 +18,7 @@
  * @returns {Array<Object>} Updated array with added weights: [{text, keyphrases, weight}]
  * @category Topics
  */
-function rankSentencesCentralToKeyphrase(sentencesWithKeyphrases) {
+export function rankSentencesCentralToKeyphrase(sentencesWithKeyphrases) {
   // Define graph data structure
   const graph = {
     vertices: new Map(),
@@ -106,8 +106,8 @@ function rankSentencesCentralToKeyphrase(sentencesWithKeyphrases) {
   let probabilityDistribution = [];
 
   // Perform random walks to distribute weights
-  const ITERATIONS = 10000;
-  const RESET_INTERVAL = 1000; // Reset to a random vertex to avoid getting stuck
+  const ITERATIONS = 1000;
+  const RESET_INTERVAL = 100; // Reset to a random vertex to avoid getting stuck
   for (let i = 0; i < ITERATIONS; i++) {
     let totalWeight = 0;
 
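For context on the constants above: the ranking runs repeated random walks over a keyphrase graph, resetting to a random vertex every RESET_INTERVAL steps so the walk cannot get trapped, and visit counts then approximate each vertex's influence. A minimal sketch of such a loop, assuming an edge-list graph; the names and structure are illustrative, not this module's internals:

function randomWalkVisits(edges, iterations = 1000, resetInterval = 100) {
  // edges: Map(vertex -> [{ target, weight }])
  const vertices = [...edges.keys()];
  const pickRandom = () =>
    vertices[Math.floor(Math.random() * vertices.length)];
  const visits = new Map(vertices.map((v) => [v, 0]));
  let current = pickRandom();

  for (let i = 0; i < iterations; i++) {
    // Reset periodically, or when stuck at a vertex with no outgoing edges.
    if (i % resetInterval === 0 || !(edges.get(current) ?? []).length)
      current = pickRandom();

    visits.set(current, visits.get(current) + 1);

    // Step to a neighbor with probability proportional to edge weight.
    const out = edges.get(current) ?? [];
    const totalWeight = out.reduce((sum, e) => sum + e.weight, 0);
    let r = Math.random() * totalWeight;
    for (const e of out) {
      r -= e.weight;
      if (r <= 0) { current = e.target; break; }
    }
  }
  return visits; // higher counts approximate higher stationary probability
}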
56 changes: 28 additions & 28 deletions src/keyphrases/seektopic-keyphrases.js
@@ -11,7 +11,7 @@ import extractNounEdgeGrams from "./ngrams.js";
  * concepts referred to most by other sentences. Based on the
  * TextRank & PageRank algorithms, it randomly surfs links to nodes
  * to find probability of being at that node, thus ranking influence.
- * @param {string} inputString - input text to analyze
+ * @param {string} docText - input text to analyze
  * @param {Object} options
  * @param {Object} options.phrasesModel - phrases model
  * @param {Object} options.typosModel - typos model
@@ -25,35 +25,35 @@ import extractNounEdgeGrams from "./ngrams.js";
  * @param {string} options.heavyWeightQuery - query to give heavy weight to
  * @returns {Array<Object>} - [{text, keyphrases, weight}] array of sentences
  * @example extractSEEKTOPIC(testDoc, { phrasesModel, heavyWeightQuery: "self attention", limitTopSentences: 10,
- * @category Topics
- */
-export function extractSEEKTOPIC(inputString, options = {}) {
+ * @category Topics
+ */
+export function extractSEEKTOPIC(docText, options = {}) {
   var {
     phrasesModel,
     typosModel,
     maxWords = 5,
-    minWords = 2,
+    minWords = 1,
     minWordLength = 3,
-    topKeyphrasesPercent = 0.2,
+    topKeyphrasesPercent = 0.5,
     limitTopSentences = 5,
     limitTopKeyphrases = 10,
     minKeyPhraseLength = 5,
     heavyWeightQuery = "",
   } = options;
 
   //if not string throw error
-  if (typeof inputString != "string") {
-    throw new Error("inputString must be a string");
+  if (typeof docText != "string") {
+    throw new Error("docText must be a string");
   }
 
   //add space before < to ensure split sentences
-  inputString = inputString
+  docText = docText
     .replace(/</g, " <")
     .replace(/>/g, "> ")
     .replace(/&.{2,5};/g, ""); //&quot; &amp; &lt; &gt; &nbsp;
 
   //split into sentences
-  var sentencesArray = splitSentences(inputString);
+  var sentencesArray = splitSentences(docText);
   var sentenceNumber = 0;
 
   //extract ngrams
@@ -62,17 +62,17 @@ export function extractSEEKTOPIC(inputString, options = {}) {
   // create sentenceKeysMap [{text,index,tokens,keyphrases:[{text,weight}] }]
   var sentenceKeysMap = [];
 
-  for (var i = 0; i < sentencesArray.length; i++) {
-    var text = sentencesArray[i];
+  for (var index in sentencesArray) {
+    var text = sentencesArray[index];
 
     var tokens = tokenizeTopics(text, {
       phrasesModel,
-      typosModel,
+      // typosModel,
     });
 
     sentenceKeysMap.push({
       text,
-      index: i,
+      index,
       keyphrases: [],
       tokens,
     });
@@ -85,7 +85,7 @@ export function extractSEEKTOPIC(inputString, options = {}) {
       i,
       nGrams,
       minWordLength,
-      sentenceNumber++
+      index
     );
   }
 
@@ -151,14 +151,15 @@ export function extractSEEKTOPIC(inputString, options = {}) {
     if (keyphrase.keyphrase == heavyWeightQuery) keyphrase.weight += 5000;
   });
 
-  var keysUniqueMap = {};
+
+  var keysUniqueMap = {};
   //deduplicate and enforce unique values
 
   keyphraseObjects = keyphrasesFolded
     .sort((a, b) => b.weight - a.weight)
-    .filter(k=>!keysUniqueMap[k.keyphrase] && (keysUniqueMap[k.keyphrase] = 1))
-    .map(k => ({sentences: [...new Set(k.sentences)], ...k}));
+    .filter(
+      (k) => !keysUniqueMap[k.keyphrase] && (keysUniqueMap[k.keyphrase] = 1)
+    )
+    .map((k) => ({ sentences: [...new Set(k.sentences)], ...k }));
 
   var limitKeyPhrases = Math.floor(
     keyphrasesFolded.length * topKeyphrasesPercent
@@ -171,7 +172,7 @@ export function extractSEEKTOPIC(inputString, options = {}) {
     .map((keyphraseObject) => {
       var phraseTokenized = tokenizeTopics(keyphraseObject.keyphrase, {
         phrasesModel,
-        typosModel,
+        // typosModel,
       });
       //if wiki entity, triple weight
       if (phraseTokenized.filter((r) => r[0] == 50)[0]) {
@@ -198,24 +199,23 @@ export function extractSEEKTOPIC(inputString, options = {}) {
   for (var keyphraseObject of keyphraseObjects) {
     const { keyphrase, sentences, weight } = keyphraseObject;
     for (var sentenceNumber of sentences)
-      sentenceKeysMap[sentenceNumber].keyphrases.push({ keyphrase, weight });
+      sentenceKeysMap[sentenceNumber].keyphrases?.push({ keyphrase, weight });
   }
 
   //run text rank
   var topSentences = rankSentencesCentralToKeyphrase(sentenceKeysMap)
     ?.sort((a, b) => b.weight - a.weight)
-    .slice(0, limitTopSentences) //cut off top S limit
-    .map(s => ({
+    .slice(0, limitTopSentences) //cut off top S limit
+    .map((s) => ({
+      ...s,
       text: s.text, //remove to not show text
-      keyphrases: s.keyphrases.map(k => k.keyphrase),
-      ...s
+      keyphrases: s.keyphrases.map((k) => k.keyphrase),
     }));
 
   // limit keyphrases to top K
-  var keyphrases = keyphraseObjects
-    .slice(0, limitTopKeyphrases).map(k => ({
-      sentences: k.sentences.join(","),
+  var keyphrases = keyphraseObjects.slice(0, limitTopKeyphrases).map((k) => ({
+    ...k,
+    sentences: k.sentences.join(","),
   }));
 
   return { topSentences, keyphrases, sentences: sentencesArray };
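A usage sketch reflecting the renamed docText parameter and the new option defaults; docText and phrasesModel are assumed caller-side inputs, as in the JSDoc @example above:

import { extractSEEKTOPIC } from "./src/keyphrases/seektopic-keyphrases.js";

const { topSentences, keyphrases, sentences } = extractSEEKTOPIC(docText, {
  phrasesModel,
  heavyWeightQuery: "self attention", // matching keyphrases get +5000 weight
  limitTopSentences: 5,      // default
  limitTopKeyphrases: 10,    // default
  topKeyphrasesPercent: 0.5, // new default, raised from 0.2
  minWords: 1,               // new default, lowered from 2
});

// topSentences: [{ text, keyphrases: ["..."], weight, ... }]
// keyphrases:   [{ keyphrase, sentences: "0,4,7", weight, ... }]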
23 changes: 16 additions & 7 deletions src/search-web/answer-engine.js
@@ -21,6 +21,8 @@ export async function searchSTREAM(query, options = {}) {
     maxTopResultsToExtract = 6,
   } = options;
 
+  var timerStart = Date.now();
+
   //try archive.org if original url fails or bot-blocked
 
   const optionUseCacheBackup = 1;
@@ -37,20 +39,27 @@ export async function searchSTREAM(query, options = {}) {
   }
 
   results = results.slice(0, maxTopResultsToExtract);
-  console.log("Search results:", results);
 
-  for (const r of results) {
-    let extraction = await extract(r.url);
+  for (var i in results) {
+
+    let {url, cached} = results[i];
+
+    let extraction = await extract(url);
+
     if (optionUseCacheBackup && extraction.error)
-      extraction = await extract(r.cached);
+      extraction = await extract(cached);
 
     if (extraction.error) console.error("error", extraction);
+
+    else{
+      results[i] = extraction;
+    }
     // extraction.html = "";
     // html.html = splitSentences.default(html.html);
-    console.log("Extracted content:", extraction);
+    // console.log("Extracted content:", extraction);
   }
 
-  return results;
+
+  var timeElapsed = Date.now() - timerStart;
+
+  return {results, timeElapsed};
 }
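With this change searchSTREAM times the whole pipeline and returns the elapsed milliseconds alongside the extracted results instead of a bare array. A call sketch; the query string and option value are placeholders:

const { results, timeElapsed } = await searchSTREAM("transformer attention", {
  maxTopResultsToExtract: 6,
});
console.log(`extracted ${results.length} results in ${timeElapsed}ms`);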
30 changes: 26 additions & 4 deletions src/search-web/search-web.js
@@ -29,10 +29,32 @@ export async function searchWeb(query, options = {}) {
   ];
   const RECENCY_LIST = ["", "day", "week", "month", "year"];
 
-  const SEARX_DOMAINS =
-    "baresearch.org,copp.gg,darmarit.org,etsi.me,fairsuch.net,nogoo.me,northboot.xyz,nyc1.sx.ggtyler.dev,ooglester.com,opnxng.com,paulgo.io,priv.au,s.mble.dk,s.trung.fun,search.blitzw.in,search.bus-hit.me,search.charliewhiskey.net,search.citw.lgbt,search.darkness.services,search.datura.network,search.dotone.nl,search.einfachzocken.eu,search.gcomm.ch,search.hbubli.cc,search.im-in.space,search.incogniweb.net,search.indst.eu,search.inetol.net,search.leptons.xyz,search.mdosch.de,search.nadeko.net,search.nerdvpn.de,search.ngn.tf,search.ononoki.org,search.privacyredirect.com,search.projectsegfau.lt,search.rhscz.eu,search.rowie.at,search.sapti.me,search.smnz.de,search.tommy-tran.com,searx.aleteoryx.me,searx.ankha.ac,searx.be,searx.catfluori.de,searx.colbster937.dev,searx.daetalytica.io,searx.dresden.network,searx.electroncash.de,searx.foss.family,searx.hu,searx.juancord.xyz,searx.lunar.icu,searx.mv-software.de,searx.mxchange.org,searx.namejeff.xyz,searx.nobulart.com,searx.numeriquement.fr,searx.oakleycord.dev,searx.ox2.fr,searx.perennialte.ch,searx.rhscz.eu,searx.ro,searx.sev.monster,searx.thefloatinglab.world,searx.tiekoetter.com,searx.tuxcloud.net,searx.work,searx.zhenyapav.com,searxng.brihx.fr,searxng.ch,searxng.hweeren.com,searxng.online,searxng.shreven.org,searxng.site,skyrimhater.com,sx.ca.zorby.top,sx.catgirl.cloud,sx.thatxtreme.dev,sx.zorby.top,www.gruble.de,www.jabber-germany.de,xo.wtf".split(
-      ","
-    );
+  const SEARX_DOMAINS = ['baresearch.org', 'copp.gg', 'darmarit.org',
+    'etsi.me', 'fairsuch.net', 'nogoo.me', 'northboot.xyz',
+    'nyc1.sx.ggtyler.dev', 'ooglester.com', 'opnxng.com', 'paulgo.io',
+    'priv.au', 's.mble.dk', 's.trung.fun', 'search.blitzw.in',
+    'search.charliewhiskey.net', 'search.citw.lgbt',
+    'search.darkness.services', 'search.datura.network', 'search.dotone.nl',
+    'search.einfachzocken.eu', 'search.gcomm.ch', 'search.hbubli.cc',
+    'search.im-in.space', 'search.incogniweb.net', 'search.indst.eu',
+    'search.inetol.net', 'search.leptons.xyz', 'search.mdosch.de',
+    'search.nadeko.net', 'search.nerdvpn.de', 'search.ngn.tf',
+    'search.ononoki.org', 'search.privacyredirect.com', 'search.sapti.me',
+    'search.rhscz.eu', 'search.rowie.at', 'search.projectsegfau.lt',
+    'search.tommy-tran.com', 'searx.aleteoryx.me', 'searx.ankha.ac',
+    'searx.be', 'searx.catfluori.de', 'searx.colbster937.dev',
+    'searx.daetalytica.io', 'searx.dresden.network', 'searx.electroncash.de',
+    'searx.foss.family', 'searx.hu', 'searx.juancord.xyz', 'searx.lunar.icu',
+    'searx.mv-software.de', 'searx.mxchange.org', 'searx.namejeff.xyz',
+    'searx.nobulart.com', 'searx.numeriquement.fr', 'searx.oakleycord.dev',
+    'searx.ox2.fr', 'searx.perennialte.ch', 'searx.rhscz.eu', 'searx.ro',
+    'searx.sev.monster', 'searx.thefloatinglab.world', 'searx.tiekoetter.com',
+    'searx.tuxcloud.net', 'searx.work', 'searx.zhenyapav.com',
+    'searxng.brihx.fr', 'searxng.ch', 'searxng.hweeren.com', 'search.smnz.de',
+    'searxng.online', 'searxng.shreven.org', 'searxng.site', 'skyrimhater.com',
+    'sx.ca.zorby.top', 'sx.catgirl.cloud', 'sx.thatxtreme.dev',
+    'sx.zorby.top', 'www.gruble.de', 'www.jabber-germany.de', 'xo.wtf'];
 
   try {
     //select a random domain if none is provided
     const searchDomain =
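The hunk ends on a truncated assignment, so the selection logic itself is not shown. A minimal sketch of what random selection over the new array could look like, assuming an options.searchDomain override; this is an illustration, not the file's literal code:

// Use the caller-supplied domain if given, otherwise pick a random
// SearX instance from the list above.
const searchDomain =
  options.searchDomain ||
  SEARX_DOMAINS[Math.floor(Math.random() * SEARX_DOMAINS.length)];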