Skip to content

Commit

Permalink
Improvements to assess-query-targets script
Browse files Browse the repository at this point in the history
  • Loading branch information
nonword committed Oct 17, 2024
1 parent 1bb299d commit ea4cd80
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 25 deletions.
11 changes: 10 additions & 1 deletion lib/elasticsearch/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,10 @@ const buildEsMatchClause = (matcher, field, value, boost) => {

/**
* Given a field name and a value, returns a plain object representing an ES
* `prefix` clause that can be inserted into a ES query
* `prefix` clause that can be inserted into a ES query.
*
* For example:
* const clause = prefixMatch('idIsbn', '1234')
*/
const prefixMatch = (field, value, boost = 1) => {
return buildEsMatchClause('prefix', field, value, boost)
Expand All @@ -85,6 +88,9 @@ const prefixMatch = (field, value, boost = 1) => {
/**
* Given a field name and a value, returns a plain object representing an ES
* `term` clause that can be inserted into an ES query
*
* For example:
* const clause = termMatch('idBarcode', '1234567891011')
*/
const termMatch = (field, value, boost = 1) => {
return buildEsMatchClause('term', field, value, boost)
Expand All @@ -93,6 +99,9 @@ const termMatch = (field, value, boost = 1) => {
/**
* Given a field name and a value, returns a plain object representing an ES
* `match_phrase` clause that can be inserted into an ES query
*
* For example:
* const clause = phraseMatch('title', 'Importance of Being')
*/
const phraseMatch = (field, value, boost = 1) => {
return {
Expand Down
105 changes: 81 additions & 24 deletions scripts/assess-query-targets.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
/**
* Assess query targets performance of the discovery-api
* Assess relevancy performance of discovery-api by running several sample
* searches and reporting on how many of the named bibs appear among the top
* results.
*
* This script loads "query targets" from a CSV, representing scoped keyword queries with 1 or more "good hits"
*
* Query targets CSV should be placed in /data/query-targets.csv, downloaded from
* https://docs.google.com/spreadsheets/d/1cLYqK8MqJ8XtR4cVCCdYRc3hMPZNwoxirko93q8CAts/edit#gid=0
*
* For each query target, the script runs the query against the app, reporting on the successes and failures.
* For each query target, the script runs the query against the app, reporting on
* the successes and failures. Success is determined by how many of the named
* bibs appear close to the top of the results.
*
* Usage:
* ENV=qa node scripts/assess-query-targets.js [--rows 2,3] [--offset O] [--limit L]
Expand Down Expand Up @@ -38,6 +42,16 @@ setKmsCredentials(fromIni({ profile: argv.profile }))

const PORT = 3333

// Minimum ratio of passing "good hits" required for each overall label. A
// query is given the label with the highest threshold that its pass ratio
// meets or exceeds. Frozen so that these shared thresholds can not be
// modified accidentally at runtime.
const OVERALL_PASS_RATIO = Object.freeze({
  // If 100% pass, mark test as "pass"
  pass: 1,
  // If 50+% pass, mark test as "mixed"
  mixed: 0.5,
  // Otherwise ( < 50% pass), mark test as "fail"
  fail: 0
})

/**
* Start the app server
*/
Expand Down Expand Up @@ -93,29 +107,46 @@ const runQuery = async (query) => {
* returns two arrays of objects representing passes and fails
*/
const analyzeResults = (results, goodHits) => {
return Object.keys(goodHits)
.reduce((h, expectedIndex) => {
const bibid = goodHits[expectedIndex].bibid
if (!bibid) {
console.warn('No "good hits" defined?')
return h
}
const actualIndex = results.findIndex((result) => result.result.uri === bibid)
// Consider it a pass if within 5 of expected index (adjusting for number of good hits)
const pass = actualIndex >= 0 && Math.abs(actualIndex - expectedIndex) <= (5 + goodHits.length)

const document = actualIndex >= 0 ? results[actualIndex].result : null
const report = {
bibid,
expectedIndex,
actualIndex,
document
}
h[pass ? 'pass' : 'fail'].push(report)
return goodHits
.map((goodHit) => analyzeResultsForTargetBibId(results, goodHit, goodHits.length))
.reduce((h, analysis) => {
h[analysis.pass ? 'pass' : 'fail'].push(analysis)
return h
}, { pass: [], fail: [] })
}

/**
 * Checks a single expected bib ("good hit") against an array of search
 * results and builds a report object describing whether it passed.
 *
 * @param {Array} results - Search results; each is read via `.result.uri`
 * @param {object} goodHit - Expected hit, with `bibid` and `rank` properties
 * @param {number} numberOfTargetHits - Number of good hits for the query
 * @returns {object} Report with `bibid`, `expectedIndex`, `actualIndex`,
 *   `document` (null when the bib is not in the results), and `pass`
 */
const analyzeResultsForTargetBibId = (results, goodHit, numberOfTargetHits) => {
  const { bibid, rank } = goodHit

  // Position of the target bib in the results (-1 when absent):
  const actualIndex = results.findIndex((entry) => entry.result.uri === bibid)
  const found = actualIndex >= 0

  // A hit may land up to 5 places from its expected rank and still pass,
  // widened by the number of good hits defined for the query:
  const tolerance = 5 + numberOfTargetHits

  return {
    bibid,
    expectedIndex: rank,
    actualIndex,
    // Include the matched document in the report, when there is one:
    document: found ? results[actualIndex].result : null,
    pass: found && Math.abs(actualIndex - rank) <= tolerance
  }
}

/**
* Given an Express instance, an array of queries, and an index, executes the
* next test query against the server, reports on the result, and calls itself
Expand All @@ -141,6 +172,9 @@ const testNextQuery = async (queries, index = 0) => {
console.info(tableFormat.table(table))
}

if (!query.orderedHits) {
console.warn('No "good hits" defined?')
}
// Analyze results:
const { pass, fail } = analyzeResults(results, query.orderedHits)
const ratioPass = pass.length / query.orderedHits.length
Expand All @@ -164,7 +198,7 @@ const testNextQuery = async (queries, index = 0) => {
}

// Summarize results:
reportOn(`${pass.length} out of ${query.orderedHits.length} PASS`, ratioPass === 1 ? 'success' : (ratioPass < 0.5 ? 'failure' : 'mixed'))
reportOn(`${pass.length} out of ${query.orderedHits.length} PASS`, overallPassFailLabel(ratioPass))

// More queries to run?
if (queries[index + 1]) {
Expand All @@ -176,6 +210,28 @@ const testNextQuery = async (queries, index = 0) => {
}
}

/**
 * Translates a pass ratio (between 0-1) into an overall label, using the
 * thresholds configured in OVERALL_PASS_RATIO. The label returned is the one
 * with the highest threshold that `ratio` meets or exceeds.
 *
 * E.g. overallPassFailLabel(0.3) => 'fail'
 *      overallPassFailLabel(0.7) => 'mixed'
 *      overallPassFailLabel(1.0) => 'pass'
 *
 * Returns an empty string if `ratio` meets none of the thresholds.
 */
const overallPassFailLabel = (ratio) => {
  // Order the configured [label, threshold] pairs from lowest to highest
  // threshold:
  const ascending = Object.entries(OVERALL_PASS_RATIO)
    .sort(([, thresholdA], [, thresholdB]) => thresholdA - thresholdB)

  // Walk upward, remembering the last label whose threshold is satisfied:
  let matched = ''
  for (const [label, threshold] of ascending) {
    if (ratio >= threshold) {
      matched = label
    }
  }
  return matched
}

// Promise-based wrapper around setTimeout, so that callers can
// `await delay(ms)`:
const delay = async (time) => {
  await new Promise((done) => {
    setTimeout(done, time)
  })
}

Expand All @@ -185,10 +241,11 @@ const delay = async (time) => new Promise((resolve) => setTimeout(resolve, time)
const parseTargetQueryRow = (row, index) => {
row.orderedHits = row['good hits']
.split('\n')
.map((hit) => {
.map((hit, index) => {
return {
url: hit,
bibid: hit.split('/').pop()
bibid: hit.split('/').pop(),
rank: index
}
})
delete row['good hits']
Expand Down

0 comments on commit ea4cd80

Please sign in to comment.