From dc667b15472f3036080ac39bfa1e2ac0468f885d Mon Sep 17 00:00:00 2001 From: Paul Pilone Date: Thu, 18 Aug 2022 22:56:55 -0400 Subject: [PATCH 1/2] 236 - Fixes DOI searches --- api/lambdas/search-by-keywords.test.ts | 34 ++++++ api/lib/search-query-builder.test.ts | 159 +++++++++++++++++++++++++ api/lib/search-query-builder.ts | 14 ++- 3 files changed, 205 insertions(+), 2 deletions(-) diff --git a/api/lambdas/search-by-keywords.test.ts b/api/lambdas/search-by-keywords.test.ts index d0d0d6e0..07d87bf9 100644 --- a/api/lambdas/search-by-keywords.test.ts +++ b/api/lambdas/search-by-keywords.test.ts @@ -63,6 +63,8 @@ describe('search-by-keywords.handler', () => { const uuid2 = randomUUID(); const uuid3 = randomUUID(); + const doiUUID = randomUUID(); + beforeAll(async () => { const doc1 = { uuid: uuid1, @@ -79,6 +81,10 @@ describe('search-by-keywords.handler', () => { uuid: uuid3, dc_title: 'This is a document with author text.', dc_contributor_author: 'Ocean Sea', + dc_identifier_uri: [ + 'http://hdl.handle.net/11329/1029', + 'http://dx.doi.org/10.25607/OBP-561', + ], }; await osClient.addDocument(esUrl, documentsIndexName, doc1); @@ -88,6 +94,12 @@ describe('search-by-keywords.handler', () => { }); afterAll(async () => { + await osClient.deleteByQuery( + esUrl, + documentsIndexName, + { match: { uuid: doiUUID } } + ); + await osClient.deleteByQuery(esUrl, documentsIndexName, { match: { uuid: uuid1 } }); await osClient.deleteByQuery(esUrl, documentsIndexName, { match: { uuid: uuid2 } }); await osClient.deleteByQuery(esUrl, documentsIndexName, { match: { uuid: uuid3 } }); @@ -184,6 +196,28 @@ describe('search-by-keywords.handler', () => { ); }); + test('should find documents with the DOI metadata field', (done) => { + const proxyEvent = { + queryStringParameters: { + keywords: ':dc_identifier_uri:10.25607/OBP-561', + }, + }; + + searchHandler( + proxyEvent, + (results) => { + expect(results.hits.total.value).toEqual(1); + + const uuids = results.hits.hits.map( + (h) => h._source.uuid + ); + + expect(uuids).toEqual([uuid3]); + }, + done + ); + }); + describe('and using boolean operators', () => { test('should find matching documents using the OR boolean operator', (done) => { const proxyEvent = { diff --git a/api/lib/search-query-builder.test.ts b/api/lib/search-query-builder.test.ts index 0e6af4c6..0986f479 100644 --- a/api/lib/search-query-builder.test.ts +++ b/api/lib/search-query-builder.test.ts @@ -127,5 +127,164 @@ describe('search-document-builder', () => { test.todo('should boost the dc_title keyword field if searching all fields'); test.todo('shoudl boost the dc_description_abstract field if searching all fields'); + + describe('when escaping query string special characters', () => { + // + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ / + test('should escape + in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a + in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\+ in it)'); + }); + + test('should escape - in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a - in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\- in it)'); + }); + + test('should escape = in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a = in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\= in it)'); + }); + + test('should escape && in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a && in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\&& in it)'); + }); + + test('should escape || in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a || in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\|| in it)'); + }); + + test('should escape ! in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a ! in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\! in it)'); + }); + + test('should escape ( and ) in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a ( and ) in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\( and \\) in it)'); + }); + + test('should escape { and } in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a { and } in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\{ and \\} in it)'); + }); + + test('should escape [ and ] in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a [ and ] in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\[ and \\] in it)'); + }); + + test('should escape : in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a : in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\: in it)'); + }); + + test('should escape / in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a / in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\/ in it)'); + }); + + test('should escape \\ in the query string term component', () => { + const keywordComps = [ + { + operator: '', + field: '*', + term: 'term with a \\ in it', + }, + ]; + + const result = formatQueryString(keywordComps); + expect(result).toEqual('*:(term with a \\\\ in it)'); + }); + }); }); }); diff --git a/api/lib/search-query-builder.ts b/api/lib/search-query-builder.ts index 2038de61..11836812 100644 --- a/api/lib/search-query-builder.ts +++ b/api/lib/search-query-builder.ts @@ -29,6 +29,15 @@ export const nestedQuery = (termPhrase: unknown) => ({ }, }); +// Elasticsearch's query_string query has a list of special characters. We don't +// want to necessarily escape them all (e.g. the user can use a wildcard if they want) +// but there are a few obvious ones we need to escape. +// https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html +// All special characters: + - = && || > < ! ( ) { } [ ] ^ " ~ * ? : \ / +// Characters we currently want to escape: + - = && || ! ( ) { } [ ] : \ / +const queryStringSpecialCharacters = /\+|-|=|&{2}|\|{2}|!|\(|\)|{|}|\[|]|:|\/|\\/g; +const encodeQueryStringTermComp = (term: string): string => term.replace(queryStringSpecialCharacters, '\\$&'); + const formatKeywordComp = (keywordComp: SearchKeywordComps) => { let openSearchOperator; @@ -43,7 +52,8 @@ const formatKeywordComp = (keywordComp: SearchKeywordComps) => { openSearchOperator = 'OR'; } - return `${openSearchOperator} ${keywordComp.field}:(${keywordComp.term})`; + const escapedKeywordCompTerm = encodeQueryStringTermComp(keywordComp.term); + return `${openSearchOperator} ${keywordComp.field}:(${escapedKeywordCompTerm})`; }; /** @@ -90,7 +100,7 @@ export const buildSort = (sortParams: string[] = []) => { * Helper function that builds the `query` field of the Elasticsearch search * document. * - * @param keywords An array of search keyword components. + * @param keywordComps An array of search keyword components. * @param terms An array of terms that will be used as filters in the * query. * @param termURIs A list of term URIs (ontology URIs) that can be From 8a7d2c7fbf7469f1513306e6be300300d9a0541d Mon Sep 17 00:00:00 2001 From: Paul Pilone Date: Mon, 22 Aug 2022 22:02:21 -0400 Subject: [PATCH 2/2] 247 - Removes old bulk-items stuff (#248) --- .eslint-ratchet | 2 +- .tsc-ratchet | 2 +- ingest/downloader/bulk-items-helper.rb | 33 --------- ingest/downloader/bulk-items-template.yaml | 23 ------ ingest/lambdas/bulk-items.js | 81 ---------------------- 5 files changed, 2 insertions(+), 139 deletions(-) delete mode 100755 ingest/downloader/bulk-items-helper.rb delete mode 100755 ingest/downloader/bulk-items-template.yaml delete mode 100755 ingest/lambdas/bulk-items.js diff --git a/.eslint-ratchet b/.eslint-ratchet index d4d5a4b7..08839f6b 100644 --- a/.eslint-ratchet +++ b/.eslint-ratchet @@ -1 +1 @@ -274 +200 diff --git a/.tsc-ratchet b/.tsc-ratchet index d136d6a7..9b252fd0 100644 --- a/.tsc-ratchet +++ b/.tsc-ratchet @@ -1 +1 @@ -125 +113 diff --git a/ingest/downloader/bulk-items-helper.rb b/ingest/downloader/bulk-items-helper.rb deleted file mode 100755 index 21281661..00000000 --- a/ingest/downloader/bulk-items-helper.rb +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env ruby - -require 'net/http' -require 'json' -require 'uri' - -BULK_INDEXER_HOST = "http://localhost:3000" -BULK_INDEXER_PATH = "/items" - -def bulk_index(limit, offset) - params = { limit: limit, offset: offset } - - uri = URI(BULK_INDEXER_HOST + BULK_INDEXER_PATH) - uri.query = URI.encode_www_form(params) - - req = Net::HTTP::Get.new(uri) - req['Accept'] = 'application/json' - - res = Net::HTTP.start(uri.hostname, uri.port) { |http| http.request(req) } - JSON.parse(res.body) -end - -limit = 50 -offset = 0 - -objects = bulk_index(limit, offset) -puts "Uploaded metadata for objects from #{offset} to #{offset + objects.size}..." - -while objects.size > 0 - offset += objects.size - objects = bulk_index(limit, offset) - puts "Uploaded metadata for objects from #{offset} to #{offset + objects.size}..." -end \ No newline at end of file diff --git a/ingest/downloader/bulk-items-template.yaml b/ingest/downloader/bulk-items-template.yaml deleted file mode 100755 index 21201bb3..00000000 --- a/ingest/downloader/bulk-items-template.yaml +++ /dev/null @@ -1,23 +0,0 @@ -AWSTemplateFormatVersion : '2010-09-09' -Transform: AWS::Serverless-2016-10-31 -Description: Describes the infrasturcture to fetch items (documents) directly from the Ocean Best Practices /items API. This API can be used to bulk index items as opposed to using the RSS feed methods. - -Resources: - - BulkItems: - Type: AWS::Serverless::Function - Properties: - Handler: bulk-items.handler - Runtime: nodejs12.x - Timeout: 500 - Events: - GetItems: - Type: Api - Properties: - Path: '/items' - Method: get - # Role: !GetAtt MetadataDownloaderExecutionRole.Arn - # Environment: - # Variables: - # DOCUMENT_METADATA_BUCKET: !Ref DocumentMetadataBucketName - # CodeUri: s3://oop-indexer-functions/items.zip \ No newline at end of file diff --git a/ingest/lambdas/bulk-items.js b/ingest/lambdas/bulk-items.js deleted file mode 100755 index 0d28f546..00000000 --- a/ingest/lambdas/bulk-items.js +++ /dev/null @@ -1,81 +0,0 @@ -'use strict'; - -const AWS = require('aws-sdk'); -const s3 = new AWS.S3(); - -const https = require('https'); - -const metadataBucket = process.env.DOCUMENT_METADATA_BUCKET || 'oop-doc-metadata'; - -const obp = { - host: "repository.oceanbestpractices.org", - itemsPath: "/rest/items", - metadataPath: "/rest/items/{uuid}/metadata", - findPath: '/rest/items/find-by-metadata-field', - accepts: 'application/json', - userAgent: "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", -}; - -exports.handler = (event, context, callback) => { - const queryParams = event.queryStringParameters; - const offset = queryParams.offset !== undefined ? queryParams.offset : 0; - const limit = queryParams.limit !== undefined ? queryParams.limit : 100; - - const opts = { - hostname: obp.host, - path: obp.itemsPath + '?offset=' + offset + '&limit=' + limit + '&expand=metadata,bitstreams', - headers: { - 'Accept': obp.accepts, - 'User-Agent': obp.userAgent - } - }; - - https.get(opts, function(res) { - var body = ''; - res.on('data', function(chunk) { - body += chunk; - }); - - res.on('end', function() { - var items = JSON.parse(body); - if (items.length > 0) { - Promise.all(items.map((i) => { - var metadata = i.metadata; - metadata.push({'key': 'handle', 'value': i.handle}); - - // Get the thumbnail retrieveLink from the document. - const thumbnailLink = findThumbnailLink(i.bitstreams); - if (thumbnailLink !== null) { - metadata.push({"key": "thumbnail", "value": thumbnailLink}); - } - - return putMetadata(i.uuid, JSON.stringify(metadata)); - })).then((values) => { - callback(null, {statusCode: 200, body: JSON.stringify(values)}); - }); - } else { - callback(null, { statusCode: 200, body: JSON.stringify([])}); - } - }); - }).on('error', function(err) { - callback(err, {statusCode: 500, body: JSON.stringify({'err': err})}); - }); -} - -function putMetadata(docUUID, metadata, callback) { - const params = { - Body: metadata, - Bucket: metadataBucket, - Key: docUUID + '.json', - }; - - return s3.putObject(params).promise(); -} - -function findThumbnailLink(bitstreams) { - var thumbnailBitstream = bitstreams.find(function(b) { - return b.bundleName === "THUMBNAIL"; - }); - - return thumbnailBitstream !== undefined ? thumbnailBitstream.retrieveLink : null; -} \ No newline at end of file