Skip to content

Commit

Permalink
Updates from Elastic consulting
Browse files Browse the repository at this point in the history
 - named queries
 - min_should_match
 - more targeted standard_numbers searches
 - include some contrib matchers in 'all' search
  • Loading branch information
nonword committed Jul 2, 2024
1 parent 62c4296 commit 1a3a5dc
Show file tree
Hide file tree
Showing 105 changed files with 84,678 additions and 175,190 deletions.
7 changes: 4 additions & 3 deletions config/production.env
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Currently, this is our ES 5.3 for Prod:
ENCRYPTED_ELASTICSEARCH_URI=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAMMwgcAGCSqGSIb3DQEHBqCBsjCBrwIBADCBqQYJKoZIhvcNAQcBMB4GCWCGSAFlAwQBLjARBAyUXsOWkXt7ZqqD7uQCARCAfMIrLEGKd51lkX/hzWYlUms176JOee1HhdP/WIf5nd3xTwXJz1m4yOWwDsefoJEL4sez/2ZP43MVds+1lcM3PwkHMyN01sa2xpwVDrT68A/f22bDMtp6A9BmJqr7VBB5oq2j4FoXu2+0uaQWQRWJ0Ns+vQ6nbiR5V4Vv8zE=
ENCRYPTED_RESOURCES_INDEX=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHIwcAYJKoZIhvcNAQcGoGMwYQIBADBcBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDF+IpeiMrtx/UveMsgIBEIAvIolGVXJPBMseyyep3/fyf/qP+KdATwfxT6Gw4dnatbiL5609NBeyyLNIprEM90g=
# Greg built self-hosted production domain:
ENCRYPTED_ELASTICSEARCH_URI=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHgwdgYJKoZIhvcNAQcGoGkwZwIBADBiBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDIYpOz/BbRlJZUul7gIBEIA1idumQ6fdf/j5/pzF4t96MGGH/eV1gD4WCyLUnScgNYqtRNK0ajRO6XVroswsrJtgCwUerDM=
# resources-2024-06-13:
ENCRYPTED_RESOURCES_INDEX=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHIwcAYJKoZIhvcNAQcGoGMwYQIBADBcBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDHp+Hkdg9AqFe18QggIBEIAvZqSBQgs8v/wpkOVl5q54mQtuvBZpnXYDGc+zxiUHLZLKsqiCs58dGFnZvekDWXs=

ENCRYPTED_SCSB_URL=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHwwegYJKoZIhvcNAQcGoG0wawIBADBmBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDKPFC8wFkVM5CyT6VQIBEIA5m4eLBkpChRA//ZNEWsRqIDGZmevb/thzI03a0NiAW6VfybSAYpFthh+bj/yAk1VEEBF6r1T4A2GP
ENCRYPTED_SCSB_API_KEY=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAIMwgYAGCSqGSIb3DQEHBqBzMHECAQAwbAYJKoZIhvcNAQcBMB4GCWCGSAFlAwQBLjARBAw8tglwVzGKBduDD9wCARCAP4biSz13FvZVHyQ8LKCb0+uLcKUKmzWqC5abVJI0kTmQJvjr9ViHsuP9/qj94Y8E7K96sb+fn0+HZk8So6CssA==
Expand Down
2 changes: 2 additions & 0 deletions config/qa.env
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,5 @@ HIDE_NYPL_SOURCE=

BIB_HAS_VOLUMES_THRESHOLD=0.8
BIB_HAS_DATES_THRESHOLD=0.8

NAME_QUERIES=true
6 changes: 5 additions & 1 deletion lib/jsonld_serializers.js
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,10 @@ class SearchResultsSerializer extends JsonLdListSerializer {
}
}
element.result = Object.assign(element.result, item.record)

// If using named queries, pass them through:
if (item.matched_queries) element.matchedQueries = item.matched_queries

if (item.title) element.result.prefLabel = item.title
return element
}
Expand Down Expand Up @@ -400,7 +404,7 @@ class ResourceResultsSerializer extends SearchResultsSerializer {
}

static serialize (resp, opts) {
const results = resp.hits.hits.map((h) => ({ score: h._score, record: ResourceSerializer.serialize(h._source) }))
const results = resp.hits.hits.map((h) => ({ score: h._score, record: ResourceSerializer.serialize(h._source), matched_queries: h.matched_queries }))
const totalResults = typeof resp.hits.total?.value === 'number' ? resp.hits.total.value : resp.hits.total
opts = Object.assign({ extraRootProperties: { totalResults } }, opts)
return (new ResourceResultsSerializer(results, opts)).format()
Expand Down
167 changes: 116 additions & 51 deletions lib/resources.js
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ const SEARCH_SCOPES = {
'formerTitle',
'addedAuthorTitle',
'placeOfPublication',
{ field: 'items.idBarcode', on: (q) => /\d{6,}/.test(q) }
{ field: 'items.idBarcode', on: (q) => /\d{6,}/.test(q) },
// Try to detect shelfmark searches (e.g. JFD 16-5143)
{ field: 'items.shelfMark', on: (q) => /^[A-Z]{1,3} \d{2,}/.test(q) }
]
},
title: {
Expand Down Expand Up @@ -122,7 +124,7 @@ const SEARCH_SCOPES = {
fields: ['shelfMark', 'items.shelfMark']
},
standard_number: {
fields: ['shelfMark', 'identifierV2.value', 'uri', 'identifier', 'items.shelfMark', { field: 'items.idBarcode', on: (q) => /\d{6,}/.test(q) }, 'idIsbn_clean']
fields: null // ['shelfMark', 'identifierV2.value', 'uri', 'identifier', 'items.shelfMark', { field: 'items.idBarcode', on: (q) => /\d{6,}/.test(q) }, 'idIsbn_clean']
}
}

Expand Down Expand Up @@ -451,15 +453,6 @@ module.exports = function (app, _private = null) {
}
}
},
{
nested: {
path: 'items',
query: {
exists: { field: 'items.electronicLocator' }
},
inner_hits: { name: 'electronicResources' }
}
},
// Add a catch-all to ensure we return the bib document even when
// numItems=0 or applied item filters exclude all items:
{ match_all: {} }
Expand Down Expand Up @@ -698,10 +691,6 @@ module.exports = function (app, _private = null) {
const itemsQueryContext = (options) => {
const excludeClauses = []

// Always remove electronicResources from items because (if they exist
// there, the "electronicResources" inner_hits query will extract them)
excludeClauses.push({ exists: { field: 'items.electronicLocator' } })

if (!options.merge_checkin_card_items) excludeClauses.push({ term: { 'items.type': 'nypl:CheckinCardItem' } })

return excludeClauses.length ? { must_not: excludeClauses } : { must: { match_all: {} } }
Expand Down Expand Up @@ -736,7 +725,14 @@ module.exports = function (app, _private = null) {
.then((resp) => {
// Build relevance report (for debugging):
const relevanceReport = resp.itemListElement
.map((r) => `${r.searchResultScore}: ${r.result.title[0]}`)
.map((r, ind) => {
const out = []
out.push(`${ind + 1}: ${r.searchResultScore} score > ${r.result.uri}:`)
if (params.search_scope === 'contributor') out.push(`(${r.result.creatorLiteral || r.result.contributorLiteral})`)
if (params.search_scope === 'standard_number') out.push(`(${r.result.items && r.result.items[0]?.shelfMark})`)
out.push(`${r.result.title[0]} (displayed as "${r.result.titleDisplay[0]}")`)
return out.join(' ')
})
app.logger.debug(`Relevances:\n ${relevanceReport.join('\n')}`)

resp.debug = {
Expand Down Expand Up @@ -999,6 +995,14 @@ const escapeQuery = function (str) {
return str
}

/**
 * Optionally tags an Elasticsearch query clause with a `_name`.
 *
 * Naming is opt-in: it only happens when the NAME_QUERIES environment
 * variable is exactly the string 'true'. In every other case the clause is
 * handed back untouched.
 *
 * Note that when naming is enabled the given object itself is modified (the
 * `_name` property is set on it) and that same object is returned.
 *
 * @param {object} query - The clause body to (maybe) name
 * @param {string} name - The label to store under `_name`
 * @returns {object} The same `query` object that was passed in
 */
const namedQuery = (query, name) => {
  // The env var is read on every call, so toggling it takes effect immediately:
  if (process.env.NAME_QUERIES === 'true') {
    query._name = name
  }

  return query
}

/**
* For a set of parsed request params, returns a plain object representing the
* keyword aspects of the ES query.
Expand All @@ -1018,6 +1022,8 @@ const buildElasticQueryForKeywords = function (params) {
// Fill these with our top-level clauses:
const should = []

if (!SEARCH_SCOPES[params.search_scope].fields) return null

// We have an array of fields to match.
// Separate the root-level fields from nested fields by building an object like this:
// {
Expand All @@ -1043,13 +1049,25 @@ const buildElasticQueryForKeywords = function (params) {
}, { _root: [] })

should.push({
multi_match: namedQuery({
fields: fieldMap._root,
query: escapeQuery(params.q),
type: 'most_fields',
// For 4+ term queries, require 75% of terms:
minimum_should_match: '3<75%'
}, 'bib multi_match')
/*
query_string: {
fields: fieldMap._root,
query: escapeQuery(params.q),
default_operator: 'OR'
default_operator: 'OR',
// For 4+ term queries, require 75% of terms:
minimum_should_match: '3<75%'
}
*/
})

/*
if (params.search_scope === 'standard_number') {
should.push({
query_string: {
Expand All @@ -1059,6 +1077,7 @@ const buildElasticQueryForKeywords = function (params) {
}
})
}
*/

// Add nested queries (if any) to things that *should* match:
Object.keys(fieldMap)
Expand All @@ -1068,10 +1087,10 @@ const buildElasticQueryForKeywords = function (params) {
nested: {
path: nestedName,
query: {
query_string: {
multi_match: {
fields: fieldMap[nestedName],
query: escapeQuery(params.q),
default_operator: 'AND'
type: 'phrase'
}
}
}
Expand Down Expand Up @@ -1248,19 +1267,24 @@ const buildElasticQuery = function (params) {
})

if (params.q) {
query.bool = {}

// Merge keyword-specific ES query into the query we're building:
query.bool = { must: [buildElasticQueryForKeywords(params)] }
const queryStringMatcher = buildElasticQueryForKeywords(params)
if (queryStringMatcher) {
query.bool.must = [queryStringMatcher]
}

// Mike, Elastic consultant, additions:
query.bool.should = []

// Boost NYPL hits:
query.bool.should.push({
term: {
nyplSource: {
nyplSource: namedQuery({
value: 'sierra-nypl',
boost: 1
}
}, 'nypl-owned')
}
})

Expand All @@ -1270,11 +1294,11 @@ const buildElasticQuery = function (params) {
path: 'items',
query: {
regexp: {
'items.holdingLocation.id': {
'items.holdingLocation.id': namedQuery({
value: 'loc:(ma|pa|sc).*',
flags: 'ALL',
boost: 1
}
}, 'on-site')
}
}
}
Expand All @@ -1290,50 +1314,50 @@ const buildElasticQuery = function (params) {
query.bool.should = query.bool.should.concat([
{
match_phrase: {
'title.folded': {
'title.folded': namedQuery({
query: querySansQuotes,
boost: 50
}
}, 'match_phrase title.folded')
}
},
{
prefix: {
'title.keywordLowercasedStripped': {
'title.keywordLowercasedStripped': namedQuery({
value: querySansQuotes,
boost: 25
}
}, 'prefix title.keywordLowercasedStripped')
}
},
{
match_phrase: {
'title.shingle': {
'title.shingle': namedQuery({
query: querySansQuotes,
boost: 75
}
}, 'match_phrase title.shingle')
}
},
{
term: {
'title.keywordLowercasedStripped': {
'title.keywordLowercasedStripped': namedQuery({
value: querySansQuotes,
boost: 105
}
}, 'term title.keywordLowercasedStripped')
}
},
{
term: {
'title.keywordLowercased': {
'title.keywordLowercased': namedQuery({
value: querySansQuotes,
boost: 110
}
}, 'term title.keywordLowercased')
}
},
{
term: {
'title.keyword': {
'title.keyword': namedQuery({
value: querySansQuotes,
boost: 150
}
}, 'term title.keyword')
}
}
])
Expand All @@ -1343,26 +1367,31 @@ const buildElasticQuery = function (params) {
query.bool.should = query.bool.should.concat([
{
match_phrase: {
'contributorLiteral.folded': {
'contributorLiteral.folded': namedQuery({
query: querySansQuotes,
boost: 25
}
}, 'match_phrase contributorLiteral.folded')
}
},
{
match_phrase: {
'creatorLiteral.folded': {
'creatorLiteral.folded': namedQuery({
query: querySansQuotes,
boost: 100
}
}, 'match_phrase creatorLiteral.folded')
}
},
}
])
}

if (['all', 'contributor'].includes(params.search_scope)) {
query.bool.should = query.bool.should.concat([
{
match_phrase: {
creatorModifiedName: {
creatorModifiedName: namedQuery({
query: querySansQuotes,
boost: 1000
}
boost: 100
}, 'match_phrase creatorModifiedName')
}
}
])
Expand All @@ -1371,28 +1400,64 @@ const buildElasticQuery = function (params) {
if (['all', 'standard_number'].includes(params.search_scope)) {
query.bool.should.push({
term: {
'shelfMark.raw': {
'shelfMark.raw': namedQuery({
value: querySansQuotes,
boost: 50
}
}, 'term shelfMark.raw')
}
})
}

// ['shelfMark', 'identifierV2.value', 'uri', 'identifier', 'items.shelfMark', { field: 'items.idBarcode', on: (q) => /\d{6,}/.test(q) }, 'idIsbn_clean']
if (params.search_scope === 'standard_number') {
query.bool.should = query.bool.should.concat([
{ prefix: { idIsbn: { value: querySansQuotes } } },
{ prefix: { idLccn: { value: querySansQuotes } } },
{ prefix: { idOclc: { value: querySansQuotes } } },
{ prefix: { 'identifierV2.value': namedQuery({ value: querySansQuotes }, 'term identifierV2.value') } },
{ term: { uri: namedQuery({ value: querySansQuotes }, 'term uri') } },
{ prefix: { idIsbn: namedQuery({ value: querySansQuotes }, 'prefix idIsbn') } },
{ prefix: { idLccn: namedQuery({ value: querySansQuotes }, 'prefix idLccn') } },
{ prefix: { idOclc: namedQuery({ value: querySansQuotes }, 'prefix idOclc') } },
{
prefix: {
'shelfMark.raw': {
'shelfMark.raw': namedQuery({
value: querySansQuotes,
boost: 10
}
}, 'prefix shelfMark.raw')
}
}
])

// Also match standard numbers held on the nested `items` documents. Either
// inner clause matching is enough for the nested clause to match.
// Each inner clause gets its own distinct name (query type + field) so that
// the named-query output identifies which one actually matched.
query.bool.should.push({
  nested: {
    path: 'items',
    query: {
      bool: {
        should: [
          {
            // I would like this to be a prefix, but doesn't seem to work:
            term: {
              'items.identifierV2.value': namedQuery({ value: querySansQuotes }, 'term items.identifierV2.value')
            }
          },
          {
            prefix: {
              'items.shelfMark': namedQuery({ value: querySansQuotes }, 'prefix items.shelfMark')
            }
          }
        ]
      }
    }
  }
})

if (/\d{6,}/.test(querySansQuotes)) {
query.bool.should.push({
prefix: {
idIsbn_clean: namedQuery({
value: querySansQuotes
}, 'prefix idIsbn_clean')
}
})
}
}
}

Expand Down
Loading

0 comments on commit 1a3a5dc

Please sign in to comment.