Skip to content

Commit

Permalink
Updates following searchathon testing
Browse files Browse the repository at this point in the history
Updates:
 - Add search_scope=journal_title (to ease testing and because it really ought
   to be implemented in the discovery-api, not the front-end)
 - Match on shelfmark lowercased
 - Allow smart-quotes in fully-quoted searches
 - Remove boost=1 on quoted creator/contrib phrase matching
 - Match on new idIsbn.clean and idIssn.clean (added 2024-10-17)
 - Updates qa index to 2024-10-16 (added shelfMark.keywordLowercased)
  • Loading branch information
nonword committed Oct 17, 2024
1 parent ee44d51 commit ca37f6a
Show file tree
Hide file tree
Showing 41 changed files with 5,877 additions and 6,373 deletions.
2 changes: 1 addition & 1 deletion config/qa.env
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Greg built self-hosted qa domain:
ENCRYPTED_ELASTICSEARCH_URI=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAJYwgZMGCSqGSIb3DQEHBqCBhTCBggIBADB9BgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDMIkDoQ9C/cCDCAq1wIBEIBQ+L3OgUGeOW9rs1CWkhpBjwM4LbbVRFIWedqew4UXIeSNMJ8cO9SNe4YGCUIoKwCDYt7W7ip3VtDRRRMVvz6QJw+Eg8ugTMVs2pbNFGNvaAQ=
ENCRYPTED_RESOURCES_INDEX=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHIwcAYJKoZIhvcNAQcGoGMwYQIBADBcBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDNxOM6EK7KwwZE56SgIBEIAvjEuTWDQzzrtq8VX+gNIpc/LYi3Z1H6CDjlwAjmR0IvkPQT2DladPXJbaYiyD3/E=
ENCRYPTED_RESOURCES_INDEX=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAHIwcAYJKoZIhvcNAQcGoGMwYQIBADBcBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDKS+EEsLB1iiqsC/YwIBEIAvBJpaq8IQcbTPYdgywjBfeUcMaWENUOftxSiVsdG01Rh56dDv/ITrYTHAqqXK/DU=
ENCRYPTED_ELASTICSEARCH_API_KEY=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAJ4wgZsGCSqGSIb3DQEHBqCBjTCBigIBADCBhAYJKoZIhvcNAQcBMB4GCWCGSAFlAwQBLjARBAx+kryf2KUmGdBYD9sCARCAV3ygz3eXIdq8JX/wpG9JRWlTNMRcpNE1qT0zNlN4t+ZvXEoedLQa/3p1YjgHw06GIAdA9xtkMV4eH9a1K8uCvjP8XxxNKekcMj59TlResnu9QF3r7pGXuQ==

ENCRYPTED_SCSB_URL=AQECAHh7ea2tyZ6phZgT4B9BDKwguhlFtRC6hgt+7HbmeFsrsgAAAH8wfQYJKoZIhvcNAQcGoHAwbgIBADBpBgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDBKllElmWYLxGOGopQIBEIA8JJyKde/8m8iCJGKR5D8HoTJhXHeyvw9eIDeuUNKiXLfJwoVz+PDAZSxkCQtM9O91zGhXbe3l6Bk1RlYJ
Expand Down
8 changes: 6 additions & 2 deletions lib/api-request.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
* This class wraps request params to ease interpreting the query
**/

// Characters accepted as an opening or closing quote: the plain ASCII double
// quote plus the "smart" double-quote and double-prime variants
// (U+201C, U+201D, U+201E, U+201F, U+2033, U+2036).
const QUOTE_CHARS = '"\u201C\u201D\u201E\u201F\u2033\u2036'

// Matches a string that is one quoted phrase and nothing else. The opening
// and closing quote characters need not be the same variant.
const IN_QUOTES_PATTERN = new RegExp(
  [
    '^',
    `[${QUOTE_CHARS}]`, // opening quote
    `[^${QUOTE_CHARS}]+`, // one or more non-quote characters
    `[${QUOTE_CHARS}]`, // closing quote
    '$'
  ].join('')
)

class ApiRequest {
static ADVANCED_SEARCH_PARAMS = ['title', 'subject', 'contributor']
static IDENTIFIER_NUMBER_PARAMS = ['isbn', 'issn', 'lccn', 'oclc']
Expand Down Expand Up @@ -43,10 +47,10 @@ class ApiRequest {
}

/**
* Returns true if query is fully wrapped in quotes
* Returns true if query is fully wrapped in quotes (or smart quotes)
**/
queryIsFullyQuoted () {
return /^"[^"]+"$/.test(this.params.q)
return IN_QUOTES_PATTERN.test(this.params.q)
}

/**
Expand Down
3 changes: 3 additions & 0 deletions lib/elasticsearch/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ const SEARCH_SCOPES = {
series: {
fields: ['seriesStatement.folded']
},
journal_title: {
fields: null
},
callnumber: {
fields: null
},
Expand Down
72 changes: 46 additions & 26 deletions lib/elasticsearch/elastic-query-builder.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ class ElasticQueryBuilder {
case 'title':
this.buildTitleQuery()
break
case 'journal_title':
this.buildTitleQuery()
this.requireIssuance('urn:biblevel:s')
break
case 'subject':
this.buildSubjectQuery()
break
Expand Down Expand Up @@ -187,6 +191,18 @@ class ElasticQueryBuilder {
this.boostOnItemShelfmarkPrefix()
}

/**
* Require a specific issuance (e.g. require 'urn:biblevel:s' for journal-
* title searches)
*/
requireIssuance (issuance) {
this.query.addFilter({
term: {
'issuance.id': issuance
}
})
}

/**
* Require exact phrase-match on named fields
*/
Expand All @@ -213,8 +229,9 @@ class ElasticQueryBuilder {
prefixMatch('identifierV2.value', q),
termMatch('uri', q),
termMatch('items.idBarcode', q),
termMatch('idIsbn_clean', q),
prefixMatch('items.shelfMark.raw', q)
termMatch('idIsbn.clean', q),
termMatch('idIssn.clean', q),
prefixMatch('items.shelfMark.keywordLowercased', q)
]
}
})
Expand All @@ -233,17 +250,9 @@ class ElasticQueryBuilder {
}
})
} else if (this.request.params.issn) {
this.query.addMust({ term: { idIssn: this.request.params.issn } })
this.query.addMust({ term: { 'idIssn.clean': this.request.params.issn } })
} else if (this.request.params.isbn) {
this.query.addMust({
bool: {
should: [
{ term: { idIsbn: this.request.params.isbn } },
{ term: { idIsbn_clean: this.request.params.isbn } }
],
minimum_should_match: 1
}
})
this.query.addMust({ term: { 'idIsbn.clean': this.request.params.isbn } })
} else if (this.request.params.oclc) {
this.query.addMust({ term: { idOclc: this.request.params.oclc } })
}
Expand All @@ -258,8 +267,8 @@ class ElasticQueryBuilder {
this.query.addMust({
bool: {
should: [
prefixMatch('shelfMark.raw', q),
prefixMatch('items.shelfMark.raw', q)
prefixMatch('shelfMark.keywordLowercased', q),
prefixMatch('items.shelfMark.keywordLowercased', q)
]
}
})
Expand Down Expand Up @@ -350,14 +359,18 @@ class ElasticQueryBuilder {
* Boost bibs with items having exact callnumber match
**/
boostOnItemShelfmarkExact (boost = 50) {
this.query.addShould(termMatch('items.shelfMark.raw', this.request.querySansQuotes(), 50))
this.query.addShould(termMatch('items.shelfMark.keywordLowercased', this.request.querySansQuotes(), boost))
// Specially boost case-sensitive match:
this.query.addShould(termMatch('items.shelfMark.raw', this.request.querySansQuotes(), boost * 2))
}

/**
* Boost bibs with items having prefix callnumber match
**/
boostOnItemShelfmarkPrefix (boost = 10) {
this.query.addShould(prefixMatch('items.shelfMark.raw', boost))
this.query.addShould(prefixMatch('items.shelfMark.keywordLowercased', this.request.querySansQuotes(), boost))
// Specially boost case-sensitive match:
this.query.addShould(prefixMatch('items.shelfMark.raw', this.request.querySansQuotes(), boost * 2))
}

/**
Expand All @@ -367,20 +380,27 @@ class ElasticQueryBuilder {
const q = this.request.querySansQuotes()

this.query.addShoulds([
prefixMatch('identifierV2.value', q, 10),
termMatch('uri', q, 10),
prefixMatch('idIsbn', q, 10),
prefixMatch('identifierV2.value', q, 5),
termMatch('uri', q, 5),

// Tiered matches on isbn:
prefixMatch('idIsbn.clean', q, 10),
termMatch('idIsbn.clean', q, 20),
termMatch('idIsbn', q, 20),

// Tiered matches on issn:
prefixMatch('idIssn.clean', q, 10),
termMatch('idIssn.clean', q, 20),
termMatch('idIssn', q, 20),

prefixMatch('idLccn', q, 10),
prefixMatch('idOclc', q, 10),
prefixMatch('shelfMark.raw', q, 10),
prefixMatch('shelfMark.keywordLowercased', q, 10),
prefixMatch('items.shelfMark.keywordLowercased', q, 10),

// I would like this to be a prefix, but doesn't seem to work:
termMatch('items.identifierV2.value', q),
prefixMatch('items.shelfMark.raw', q)
termMatch('items.identifierV2.value', q)
])

if (/\d{6,}/.test(this.request.querySansQuotes())) {
this.query.addShould(prefixMatch('idIsbn_clean', q))
}
}

/**
Expand Down
2 changes: 1 addition & 1 deletion lib/elasticsearch/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ const termMatch = (field, value, boost = 1) => {
* Given a field name and a value, returns a plain object representing an ES
* `match_phrase` clause that can be inserted into an ES query
*/
const phraseMatch = (field, value, boost = 1) => {
const phraseMatch = (field, value, boost = 0) => {
return {
match_phrase: {
[field]: namedQuery({
Expand Down
2 changes: 2 additions & 0 deletions swagger.v1.1.x.json
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
"enum": [
"all",
"title",
"journal_title",
"contributor",
"subject",
"series",
Expand Down Expand Up @@ -287,6 +288,7 @@
"enum": [
"all",
"title",
"journal_title",
"contributor",
"subject",
"series",
Expand Down
4 changes: 4 additions & 0 deletions test/api-request.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ describe('ApiRequest', function () {
request = ApiRequest.fromParams({ q: '"toast"' })
expect(request.queryIsFullyQuoted()).to.eq(true)

// Test smart-quotes, the great scourge
request = ApiRequest.fromParams({ q: '“toast“' })
expect(request.queryIsFullyQuoted()).to.eq(true)

request = ApiRequest.fromParams({ q: '"toast" and jam' })
expect(request.queryIsFullyQuoted()).to.eq(false)
})
Expand Down
8 changes: 4 additions & 4 deletions test/elastic-query-builder.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ describe('ElasticQueryBuilder', () => {
// Expect boosting on several identifier fields:
expect(inst.query.toJson().bool.should)
.to.be.a('array')
.have.lengthOf.at.least(9).at.most(11)
.have.lengthOf.at.least(15).at.most(16)
})
})

Expand All @@ -118,13 +118,13 @@ describe('ElasticQueryBuilder', () => {

// Expect multiple term/prefix matches on identifier fields:
expect(inst.query.toJson()).to.nested
.include({ 'bool.must[0].bool.should[0].prefix.shelfMark\\.raw.value': 'toast' })
.include({ 'bool.must[0].bool.should[1].nested.query.prefix.items\\.shelfMark\\.raw.value': 'toast' })
.include({ 'bool.must[0].bool.should[0].prefix.shelfMark\\.keywordLowercased.value': 'toast' })
.include({ 'bool.must[0].bool.should[1].nested.query.prefix.items\\.shelfMark\\.keywordLowercased.value': 'toast' })

// Expect boosting on several identifier fields:
expect(inst.query.toJson().bool.should)
.to.be.a('array')
.have.lengthOf.at.least(2).at.most(3)
.have.lengthOf.at.least(4).at.most(5)
})
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
{
"_index": "resources-2024-07-11",
"_id": "b22144813",
"_score": 11,
"_score": 16,
"_source": {
"extent": [
"1234, [1] pages, x leaves : illustrations ;",
Expand Down Expand Up @@ -737,7 +737,7 @@
]
},
"sort": [
11,
16,
"b22144813"
],
"inner_hits": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"body": {
"took": 7,
"took": 13,
"timed_out": false,
"_shards": {
"total": 2,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
{
"_index": "resources-2024-07-11",
"_id": "b22144813",
"_score": 11,
"_score": 6,
"_source": {
"extent": [
"1234, [1] pages, x leaves : illustrations ;",
Expand Down Expand Up @@ -737,7 +737,7 @@
]
},
"sort": [
11,
6,
"b22144813"
],
"inner_hits": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
{
"_index": "resources-2024-07-11",
"_id": "b22144813",
"_score": 11,
"_score": 6,
"_source": {
"extent": [
"1234, [1] pages, x leaves : illustrations ;",
Expand Down Expand Up @@ -737,7 +737,7 @@
]
},
"sort": [
11,
6,
"b22144813"
],
"inner_hits": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"body": {
"took": 6,
"took": 36,
"timed_out": false,
"_shards": {
"total": 2,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"body": {
"took": 18,
"took": 117,
"timed_out": false,
"_shards": {
"total": 2,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"body": {
"took": 9,
"took": 11,
"timed_out": false,
"_shards": {
"total": 2,
Expand All @@ -18,7 +18,7 @@
{
"_index": "resources-2024-07-11",
"_id": "b13627363",
"_score": 23,
"_score": 6,
"_source": {
"extent": [
"128 p. illus."
Expand Down Expand Up @@ -164,7 +164,7 @@
]
},
"sort": [
23,
6,
"b13627363"
],
"inner_hits": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"body": {
"took": 33,
"took": 4,
"timed_out": false,
"_shards": {
"total": 2,
Expand All @@ -18,7 +18,7 @@
{
"_index": "resources-2024-07-11",
"_id": "b11826883",
"_score": 677.532,
"_score": 1299.1257,
"_source": {
"extent": [
"v. : ill. ;"
Expand Down Expand Up @@ -532,7 +532,7 @@
]
},
"sort": [
677.532,
1299.1257,
"b11826883"
],
"inner_hits": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"body": {
"took": 18,
"took": 148,
"timed_out": false,
"_shards": {
"total": 2,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"body": {
"took": 6,
"took": 4,
"timed_out": false,
"_shards": {
"total": 2,
Expand All @@ -18,7 +18,7 @@
{
"_index": "resources-2024-07-11",
"_id": "b22144813",
"_score": 21,
"_score": 6,
"_source": {
"extent": [
"1234, [1] pages, x leaves : illustrations ;",
Expand Down Expand Up @@ -737,7 +737,7 @@
]
},
"sort": [
21,
6,
"b22144813"
],
"inner_hits": {
Expand Down
Loading

0 comments on commit ca37f6a

Please sign in to comment.