Skip to content

Commit

Permalink
Merge pull request #411 from NYPL/qa2-refactor
Browse files Browse the repository at this point in the history
QA2 - ES relevance improvements and query generation refactor
  • Loading branch information
nonword authored Oct 21, 2024
2 parents 0ffb3c3 + f29693c commit 15409b3
Show file tree
Hide file tree
Showing 172 changed files with 155,684 additions and 253,191 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ For local development, it's easiest to just use local node binaries:

```
nvm use; npm i
nvm use; ENV=qa npm start
nvm use; ENV=qa LOCAL=true npm start
```

Note that when developing locally, if connecting to an IP ACL-protected index (a practice we're currently deprecating), you may need to [add your IP to the access control policy of the relevant ES domain](https://github.com/NYPL/aws/blob/b5c0af0ec8357af9a645d8b47a5dbb0090966071/common/elasticsearch.md#2-make-the-domain-public-restrict-by-ip). If your IP has not been authorized, you will see errors such as the following in the application logs:
Expand Down
5 changes: 4 additions & 1 deletion app.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
const express = require('express')

const esClient = require('./lib/es-client')
const esClient = require('./lib/elasticsearch/client')
const loadConfig = require('./lib/load-config')
const { preflightCheck } = require('./lib/preflight_check')

const swaggerDocs = require('./swagger.v1.1.x.json')

const pjson = require('./package.json')

const app = express()
Expand Down Expand Up @@ -52,6 +53,8 @@ app.init = async () => {
app.get('/api/v0.1/discovery/swagger', function (req, res) {
res.send(swaggerDocs)
})

return app
}

app.start = async () => {
Expand Down
7 changes: 4 additions & 3 deletions config/production.env
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Currently, this is our ES 5.3 for Prod:
ENCRYPTED_ELASTICSEARCH_URI=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAMMwgcAGCSqGSIb3DQEHBqCBsjCBrwIBADCBqQYJKoZIhvcNAQcBMB4GCWCGSAFlAwQBLjARBAyUXsOWkXt7ZqqD7uQCARCAfMIrLEGKd51lkX/hzWYlUms176JOee1HhdP/WIf5nd3xTwXJz1m4yOWwDsefoJEL4sez/2ZP43MVds+1lcM3PwkHMyN01sa2xpwVDrT68A/f22bDMtp6A9BmJqr7VBB5oq2j4FoXu2+0uaQWQRWJ0Ns+vQ6nbiR5V4Vv8zE=
ENCRYPTED_RESOURCES_INDEX=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHIwcAYJKoZIhvcNAQcGoGMwYQIBADBcBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDF+IpeiMrtx/UveMsgIBEIAvIolGVXJPBMseyyep3/fyf/qP+KdATwfxT6Gw4dnatbiL5609NBeyyLNIprEM90g=
# Greg built self-hosted production domain:
ENCRYPTED_ELASTICSEARCH_URI=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAJYwgZMGCSqGSIb3DQEHBqCBhTCBggIBADB9BgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDFWw8ECX9Pz81z0kvAIBEIBQGec9PCpwuvEgLH6imhqP6tx1fj8Vlf2ZipnUy06jzmpE262Qvk9LPAq7sIYPVkTCZctwilwcU9oC6yxasVoUlK87la77v03CeZsPIDwciFY=
ENCRYPTED_RESOURCES_INDEX=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHIwcAYJKoZIhvcNAQcGoGMwYQIBADBcBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDGBdRKmRzMe2BVCBQAIBEIAvOT7nIumnss0wJN0N4tSrhW+wh7lJ7yr7VwQhe7TVUJEmADJDEnnyvm18FMLBR4c=
ENCRYPTED_ELASTICSEARCH_API_KEY=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAJ4wgZsGCSqGSIb3DQEHBqCBjTCBigIBADCBhAYJKoZIhvcNAQcBMB4GCWCGSAFlAwQBLjARBAyPOPaQCBbvKQhJoPQCARCAV2TlWlRh+xKnCegpprEQgfldZGcVW48RND0LVd/pQpVTJnRTtbCpP7damT7k8ziJVdWZ3jsfs5fw5YnKc/EIQ1M//DRUzOJL98ir5LTTxE7QhflKDtUY+Q==

ENCRYPTED_SCSB_URL=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHwwegYJKoZIhvcNAQcGoG0wawIBADBmBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDKPFC8wFkVM5CyT6VQIBEIA5m4eLBkpChRA//ZNEWsRqIDGZmevb/thzI03a0NiAW6VfybSAYpFthh+bj/yAk1VEEBF6r1T4A2GP
ENCRYPTED_SCSB_API_KEY=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAIMwgYAGCSqGSIb3DQEHBqBzMHECAQAwbAYJKoZIhvcNAQcBMB4GCWCGSAFlAwQBLjARBAw8tglwVzGKBduDD9wCARCAP4biSz13FvZVHyQ8LKCb0+uLcKUKmzWqC5abVJI0kTmQJvjr9ViHsuP9/qj94Y8E7K96sb+fn0+HZk8So6CssA==
Expand Down
24 changes: 0 additions & 24 deletions config/qa-new-domain.env

This file was deleted.

9 changes: 6 additions & 3 deletions config/qa.env
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Currently, this is our ES 5.3 for QA:
ENCRYPTED_ELASTICSEARCH_URI=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAALswgbgGCSqGSIb3DQEHBqCBqjCBpwIBADCBoQYJKoZIhvcNAQcBMB4GCWCGSAFlAwQBLjARBAyWLvUSzA/IAQCHl0MCARCAdNpF/Z1VJESwJ7hcwo/BqZz2mTDPA9NAPQ4zuPLsItz9A2lfHaP03bPuo9nq8VP5AKLOa4zPL0VoBmwEjj9qCCb+LSpQ3m+OoyM3BxG98/qYEcwXXOa8+0fH1x5asVrup/YICJdeD6jOewxttzzxCCGXEklL
ENCRYPTED_RESOURCES_INDEX=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHIwcAYJKoZIhvcNAQcGoGMwYQIBADBcBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDI88/9macimvmLyWCAIBEIAvMUOAtF2Miq+8u7/A9fzBz57LavqkeLJmv8dd7WQzdA9lhqPkjUK0pzYtxsPe6Nk=
# Greg built self-hosted qa domain:
ENCRYPTED_ELASTICSEARCH_URI=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAJYwgZMGCSqGSIb3DQEHBqCBhTCBggIBADB9BgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDMIkDoQ9C/cCDCAq1wIBEIBQ+L3OgUGeOW9rs1CWkhpBjwM4LbbVRFIWedqew4UXIeSNMJ8cO9SNe4YGCUIoKwCDYt7W7ip3VtDRRRMVvz6QJw+Eg8ugTMVs2pbNFGNvaAQ=
ENCRYPTED_RESOURCES_INDEX=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHIwcAYJKoZIhvcNAQcGoGMwYQIBADBcBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDGjIeRRI3cN5LwfcmgIBEIAvkB3K4sxagpqspNa3b3BnJQgbw6Ic0jDhuXzk8gmrCiGNMR90YjTZmyLj9aVUN3M=
ENCRYPTED_ELASTICSEARCH_API_KEY=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAJ4wgZsGCSqGSIb3DQEHBqCBjTCBigIBADCBhAYJKoZIhvcNAQcBMB4GCWCGSAFlAwQBLjARBAx+kryf2KUmGdBYD9sCARCAV3ygz3eXIdq8JX/wpG9JRWlTNMRcpNE1qT0zNlN4t+ZvXEoedLQa/3p1YjgHw06GIAdA9xtkMV4eH9a1K8uCvjP8XxxNKekcMj59TlResnu9QF3r7pGXuQ==

ENCRYPTED_SCSB_URL=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAH8wfQYJKoZIhvcNAQcGoHAwbgIBADBpBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDBKllElmWYLxGOGopQIBEIA8JJyKde/8m8iCJGKR5D8HoTJhXHeyvw9eIDeuUNKiXLfJwoVz+PDAZSxkCQtM9O91zGhXbe3l6Bk1RlYJ
ENCRYPTED_SCSB_API_KEY=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAGMwYQYJKoZIhvcNAQcGoFQwUgIBADBNBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDNw8KXkyN8HvtjAX0gIBEIAgX+XG2fxTj6kSchrd/dfHB05KU5pkT0LtPxUTuNCXoLc=
Expand All @@ -22,3 +23,5 @@ HIDE_NYPL_SOURCE=

BIB_HAS_VOLUMES_THRESHOLD=0.8
BIB_HAS_DATES_THRESHOLD=0.8

NAME_QUERIES=true
75 changes: 75 additions & 0 deletions lib/api-request.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/**
* This class wraps request params to ease interpreting the query
**/

// Build regex pattern for matching a phrase fully enclosed in quotes (or smart quotes):
const QUOTE_CHARS = '"\u201C\u201D\u201E\u201F\u2033\u2036'
const IN_QUOTES_PATTERN = new RegExp(`^[${QUOTE_CHARS}][^${QUOTE_CHARS}]+[${QUOTE_CHARS}]$`)

/**
 * Thin wrapper around a params object that answers questions about the
 * query it represents (is there a keyword? is it quoted? which scopes apply?).
 *
 * Note: the params object is stored by reference, and `params.q` is rewritten
 * in place by the constructor, so the caller's object sees the rewritten `q`.
 */
class ApiRequest {
  // Param names that double as search_scope values:
  static ADVANCED_SEARCH_PARAMS = ['title', 'subject', 'contributor']
  // Param names that carry a standard identifier number:
  static IDENTIFIER_NUMBER_PARAMS = ['isbn', 'issn', 'lccn', 'oclc']

  /**
   * @param {object} params - Query params; `q` and `search_scope` are read
   *   directly, along with the keys named in the static lists above.
   */
  constructor (params) {
    this.params = params

    // Make some substitutions for folks querying with "date:1997", etc.
    // The leading \b restricts each substitution to a whole field prefix, so
    // ordinary words that merely end in the same letters (e.g. "mandate:",
    // "dislocation:") are left untouched.
    if (this.params.q) {
      this.params.q = this.params.q
        .replace(/\bdate:/g, 'dateStartYear:')
        .replace(/\blocation:/g, 'locations:')
        .replace(/\bsubject:/g, 'subjectLiteral:')
    }
  }

  /**
   * Get array of params in the query that are also valid search-scopes.
   *
   * @returns {string[]} Names from ADVANCED_SEARCH_PARAMS that have a truthy
   *   value in the params.
   */
  advancedSearchParamsThatAreAlsoScopes () {
    return ApiRequest.ADVANCED_SEARCH_PARAMS
      .filter((key) => this.params[key])
  }

  /**
   * @returns {boolean} True if a non-empty keyword query (`q`) is present.
   */
  hasKeyword () {
    return !!this.params.q
  }

  /**
   * @returns {boolean} True if there is a keyword query or at least one
   *   advanced-search param with a value.
   */
  hasSearch () {
    return this.hasKeyword() || this.advancedSearchParamsThatAreAlsoScopes().length > 0
  }

  /**
   * Returns true if search_scope matches one of the named scopes
   *
   * @param {string[]} scopes - Scope names to test against.
   * @returns {boolean}
   **/
  hasScope (scopes) {
    return scopes.includes(this.params.search_scope)
  }

  /**
   * Returns true if query is fully wrapped in quotes (or smart quotes)
   *
   * When `q` is absent, `test` coerces it to the string "undefined", which
   * does not match, so the result is false.
   *
   * @returns {boolean}
   **/
  queryIsFullyQuoted () {
    return IN_QUOTES_PATTERN.test(this.params.q)
  }

  /**
   * Get keyword query without surrounding quotes (if fully quoted)
   *
   * @returns {string|undefined} `q` minus its first and last character when
   *   fully quoted; otherwise `q` unchanged (undefined when absent).
   **/
  querySansQuotes () {
    return this.queryIsFullyQuoted()
      ? this.params.q.substring(1, this.params.q.length - 1)
      : this.params.q
  }

  /**
   * @returns {boolean} True if any key of the params is one of
   *   IDENTIFIER_NUMBER_PARAMS (the key's value is not inspected).
   */
  hasIdentifierNumberParam () {
    return Object.keys(this.params)
      .some((userParam) => ApiRequest.IDENTIFIER_NUMBER_PARAMS.includes(userParam))
  }

  /**
   * Factory equivalent to `new ApiRequest(params)`.
   *
   * @param {object} params
   * @returns {ApiRequest}
   */
  static fromParams (params) {
    return new ApiRequest(params)
  }
}

module.exports = ApiRequest
11 changes: 8 additions & 3 deletions lib/availability_resolver.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ const scsbClient = require('./scsb-client')
const logger = require('./logger')
const ResourceSerializer = require('./jsonld_serializers').ResourceSerializer
const NyplSourceMapper = require('discovery-store-models/lib/nypl-source-mapper')
const { nonRecapItemStatusAggregation } = require('./es-client')
const { nonRecapItemStatusAggregation } = require('./elasticsearch/client')
const { deepValue, isInRecap } = require('./util')

const ITEM_STATUSES = {
Expand Down Expand Up @@ -94,7 +94,9 @@ class AvailabilityResolver {
logger.debug('_fixItemStatusAggregation: Skipping because no SCSB statuses provided')
return Promise.resolve()
}
const bnum = resp.hits.hits[0]._id
const bnum = resp.hits.hits[0]?._id
if (!bnum) return Promise.resolve()

const { nyplSource } = NyplSourceMapper.instance().splitIdentifier(bnum)

// Get total number of items:
Expand Down Expand Up @@ -135,7 +137,10 @@ class AvailabilityResolver {
nonRecapStatusAggregation
)
logger.debug(`_fixItemStatusAggregation: Final aggregation: ${JSON.stringify(resp.aggregations.item_status._nested.buckets)}`)
}).catch((e) => console.error('Got error: ', e))
}).catch((e) => {
console.error('Got error: ', e)
return Promise.resolve()
})
}
}

Expand Down
48 changes: 35 additions & 13 deletions lib/es-client.js → lib/elasticsearch/client.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
const elasticsearch = require('@elastic/elasticsearch')
const url = require('node:url')
const { deepValue } = require('./util')
const logger = require('./logger')
const { IndexConnectionError } = require('./errors')
const { deepValue } = require('../util')
const logger = require('../logger')
const { IndexConnectionError } = require('../errors')

const clientWrapper = {}

Expand All @@ -11,16 +11,38 @@ const clientWrapper = {}
*/
clientWrapper.esClient = function () {
if (!this._esClient) {
// Parse ES connection string:
const { protocol, auth, host, port } = url.parse(process.env.ELASTICSEARCH_URI)
const [username, password] = auth ? auth.split(':') : []
const options = {
node: `${protocol}//${host}`,
port,
auth: { username, password }
// Parse ES connection string, which is likely multiple http base URIs
// separated by a comma:
const elasticUris = process.env.ELASTICSEARCH_URI.split(',')
const urisParsed = elasticUris.map((uri) => {
// Extract parts of the URI:
const { protocol, auth, host } = url.parse(uri)
const [username, password] = auth ? auth.split(':') : []
return {
protocol,
host,
username,
password
}
})
// Build ES client connection config:
const config = {}
config.nodes = urisParsed.map((uri) => `${uri.protocol}//${uri.host}`)

// Configure auth:
if (process.env.ELASTICSEARCH_API_KEY) {
// Auth with `apiKey`:
config.auth = { apiKey: process.env.ELASTICSEARCH_API_KEY }
} else if (urisParsed[0].username) {
// Auth with username, password:
config.auth = { username: urisParsed[0].username, password: urisParsed[0].password }
}
logger.info(`Connecting to ES at ${host}:${port} ${username && password ? 'with creds' : 'w/out creds'}`)
this._esClient = new elasticsearch.Client(options)

// Log out some of the connection details for debugging purposes:
const authMethod = urisParsed[0].username ? 'with creds' : (process.env.ELASTICSEARCH_API_KEY ? 'with apiKey' : 'w/out creds')
logger.info(`Connecting to ES at ${urisParsed.map((u) => u.host).join(',')}/${process.env.RESOURCES_INDEX} ${authMethod}`)

this._esClient = new elasticsearch.Client(config)
}
return this._esClient
}.bind(clientWrapper)
Expand All @@ -30,7 +52,7 @@ clientWrapper.esClient = function () {
* the configured ES
*/
clientWrapper.search = function (body) {
logger.debug(`Performing ES search: ${JSON.stringify(body)}`)
logger.debug(`Performing ES search: ${JSON.stringify(body, null, 2)}`)
return this.esClient().search({
index: process.env.RESOURCES_INDEX,
body
Expand Down
103 changes: 103 additions & 0 deletions lib/elasticsearch/config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Patterns used to decide whether a keyword query looks like an item
// identifier. Neither has the `g` flag, so `.test` is stateless.
// A run of six or more digits anywhere in the query:
const BARCODE_LIKE = /\d{6,}/
// Try to detect shelfmark searches (e.g. JFD 16-5143):
const SHELFMARK_LIKE = /^[A-Z]{1,3} \d{2,}/

// Configure search scopes.
//
// Each key is a search-scope name; `fields` lists the index fields for that
// scope. An entry is either a plain field string (a trailing "^N" is
// presumably an Elasticsearch boost — confirm against the query builder) or a
// `{ field, on }` object whose `on(q)` predicate tests the query string
// (presumably the field is only searched when it returns true — confirm).
const SEARCH_SCOPES = {
  all: {
    fields: [
      'title^5',
      'title.folded^2',
      'description.folded',
      'subjectLiteral^2',
      'subjectLiteral.folded',
      'creatorLiteral^2',
      'creatorLiteral.folded',
      'contributorLiteral.folded',
      'note.label.folded',
      'publisherLiteral.folded',
      'seriesStatement.folded',
      'titleAlt.folded',
      'titleDisplay.folded',
      'contentsTitle.folded',
      'donor.folded',
      'parallelTitle.folded^5',
      'parallelTitleDisplay.folded',
      'parallelTitleAlt.folded',
      'parallelSeriesStatement.folded',
      'parallelCreatorLiteral.folded',
      'parallelPublisher',
      'uniformTitle.folded',
      'parallelUniformTitle',
      'formerTitle',
      'addedAuthorTitle',
      'placeOfPublication',
      {
        field: 'items.idBarcode',
        on: (query) => BARCODE_LIKE.test(query)
      },
      {
        field: 'items.shelfMark',
        on: (query) => SHELFMARK_LIKE.test(query)
      }
    ]
  },

  title: {
    fields: [
      'title^5',
      'title.folded^2',
      'titleAlt.folded',
      'uniformTitle.folded',
      'titleDisplay.folded',
      'seriesStatement.folded',
      'contentsTitle.folded',
      'donor.folded',
      'parallelTitle.folded^5',
      'parallelTitleDisplay.folded',
      'parallelSeriesStatement.folded',
      'parallelTitleAlt.folded',
      'parallelCreatorLiteral.folded',
      'parallelUniformTitle',
      'formerTitle',
      'addedAuthorTitle'
    ]
  },

  contributor: {
    fields: [
      'creatorLiteral^4',
      'creatorLiteral.folded^2',
      'contributorLiteral.folded',
      'parallelCreatorLiteral.folded',
      'parallelContributorLiteral.folded'
    ]
  },

  subject: {
    fields: [
      'subjectLiteral^2',
      'subjectLiteral.folded',
      'parallelSubjectLiteral.folded'
    ]
  },

  series: {
    fields: [
      'seriesStatement.folded'
    ]
  },

  journal_title: {
    fields: null
  },

  // The two scopes below declare no fields here:
  // we do custom field matching for these search-scopes.
  callnumber: {},
  standard_number: {}
}

/**
 * Build the config entry shared by every repeatable `match` filter.
 *
 * @param {string} field - Index field the filter is matched against.
 * @param {string} [path] - Optional `path` value; only set when given
 *   (presumably the nested-document path — confirm against the consumer).
 * @returns {object} `{ operator: 'match', field, repeatable: true[, path] }`
 */
function repeatableMatch (field, path) {
  const entry = { operator: 'match', field, repeatable: true }
  if (path) entry.path = path
  return entry
}

// Filter configuration, keyed by filter name. Most filters are repeatable
// `match` filters on a single index field; the two date filters use the
// `custom` operator and declare an `int` type instead of a field.
const FILTER_CONFIG = {
  owner: repeatableMatch('items.owner_packed', 'items'),
  subjectLiteral: repeatableMatch('subjectLiteral_exploded'),
  holdingLocation: repeatableMatch('items.holdingLocation_packed', 'items'),
  buildingLocation: repeatableMatch('buildingLocationIds'),
  language: repeatableMatch('language_packed'),
  materialType: repeatableMatch('materialType_packed'),
  mediaType: repeatableMatch('mediaType_packed'),
  carrierType: repeatableMatch('carrierType_packed'),
  publisher: repeatableMatch('publisherLiteral.raw'),
  contributorLiteral: repeatableMatch('contributorLiteral.raw'),
  creatorLiteral: repeatableMatch('creatorLiteral.raw'),
  issuance: repeatableMatch('issuance_packed'),
  createdYear: repeatableMatch('createdYear'),
  dateAfter: { operator: 'custom', type: 'int' },
  dateBefore: { operator: 'custom', type: 'int' }
}

module.exports = {
SEARCH_SCOPES,
FILTER_CONFIG
}
Loading

0 comments on commit 15409b3

Please sign in to comment.