Merge pull request 'release_1.1.9' (#13) from release_1.1.9 into master
Dominick Leppich committed Nov 15, 2024
2 parents f1fea75 + fd4687b commit 8d81e1e
Showing 24 changed files with 264 additions and 43 deletions.
2 changes: 2 additions & 0 deletions docs/de/migration.md
@@ -155,6 +155,8 @@ Diese URLs sollten gültig und auflösbar sein.
Der Host-Teil dieser URLs wird aus der Anfrage generiert.

## Migration der Mets-Datei
**Achtung:** Bitte erstellen Sie vorher ein Backup. In einigen Fällen sind die METS-Dateien inkonsistent, d. h. die gespeicherten Werte und die Vokabularreferenzen stimmen nicht überein. Das Migrationsskript verwendet die Vokabularreferenzen, um die richtigen Werte zu finden. Wenn die Vokabularreferenzen falsch und die Werte richtig sind, beschädigt die Migration die Daten!

Dieser Schritt kann nur durchgeführt werden, wenn die Migration der Vokabulardaten erfolgreich abgeschlossen wurde!

Wenn die Datei `migration.csv` vorhanden ist, führen Sie den folgenden Befehl in der aktivierten Python-Umgebung aus:
2 changes: 2 additions & 0 deletions docs/en/migration.md
@@ -127,6 +127,8 @@ blau,123
This file maps all record values to the corresponding record IDs in the reference vocabulary.

## Mets file migration
**Caution:** Please create a backup beforehand. In some cases the METS files are inconsistent, i.e. the stored values and the vocabulary references do not match. The migration script uses the vocabulary references to find the correct values. If the vocabulary references are wrong and the values are correct, the migration will corrupt the data!

This step can only be done after the vocabulary data migration has been successfully completed!

With the `migration.csv` file present, run the following command in the activated Python environment:
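A minimal sketch of what this call can look like (the `--mapping-file` and `--metadata-directory` flag names are assumptions inferred from the migration script's arguments and may differ; adjust paths and connection settings to your setup):

python metadata-migrator.py --vocabulary-server-port 8081 --mapping-file migration.csv --metadata-directory /path/to/goobi/metadata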
24 changes: 18 additions & 6 deletions migration/lib/api.py
@@ -1,6 +1,7 @@
import logging
import requests
import json
import sys

SCHEMA_INSERTION_URL = 'http://{{HOST}}:{{PORT}}/api/v1/schemas'
SCHEMA_LOOKUP_URL = 'http://{{HOST}}:{{PORT}}/api/v1/schemas/{{SCHEMA_ID}}'
@@ -61,7 +62,16 @@ def query(self, url, obj=None, method='POST'):
response = requests.request(method, url=url, headers=HEADERS, data=payload)
try:
# Check for success
if response.status_code // 100 != 2:
if response.status_code == 401 or response.status_code == 403:
error_msg = 'API call was not successful, reason: Authentication'
logging.critical(error_msg)
sys.exit(1)
if response.status_code == 404:
error_msg = f'API call was not successful, reason: Entity not found {url}'
logging.warning(error_msg)
raise Exception(error_msg)
elif response.status_code // 100 != 2:
error_msg = f'API call was not successful, reason:\n{extract_error_from_response(response)}'
logging.warning(error_msg)
raise Exception(error_msg)
@@ -111,18 +121,20 @@ def insert_record(self, record):
result = self.query(url, record)
return result['id']

def find_record(self, ctx, vocabulary_id, search_term):
def find_record(self, ctx, vocabulary_id, search_term, search_field=None):
url = self.urls[RECORD_SEARCH].replace('{{VOCABULARY_ID}}', str(vocabulary_id)).replace('{{SEARCH_TERM}}', search_term)
result = self.query(url, obj=None, method='GET')
if not '_embedded' in result:
raise Exception(f'Record search for search term "{search_term}" has no results')
raise Exception(f'Record search in vocabulary "{vocabulary_id}" for search term "{search_term}" has no results')
results = result['_embedded']['vocabularyRecordList']
# Filter for exact searches
results = [r for r in results if ctx.record_contains_value(r, search_term)]
results = [r for r in results if ctx.record_contains_value(r, search_term, search_field=search_field)]

if len(results) == 0:
raise Exception(f'Record search for search term "{search_term}" has no results')
raise Exception(f'Record search in vocabulary "{vocabulary_id}" for search term "{search_term}" has no results')
elif len(results) > 1:
raise Exception(f'Record search for search term "{search_term}" has no unique result, {len(results)} records found')
ids = [r['id'] for r in results]
raise Exception(f'Record search in vocabulary "{vocabulary_id}" for search term "{search_term}" has no unique result, {len(results)} records found: {ids}')

return results[0]['id']

34 changes: 28 additions & 6 deletions migration/lib/mets_context.py
@@ -6,7 +6,7 @@
RECORD_PATTERN = re.compile('^(\\d+).*$')

class Context:
def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix):
def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic):
self.api = api
self.dry = dry
self.verbose = verbose
@@ -15,12 +15,24 @@ def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, map
self.mapping_file = mapping_file
self.preferred_mets_main_value_language = preferred_mets_main_value_language
self.manual_id_fix = manual_id_fix
self.trust = trust
self.enable_relation_vocabulary_column_logic = enable_relation_vocabulary_column_logic
self.vocabulary_name_id_map = {}
self.vocabulary_id_name_map = {}
self.vocabulary_id_map = {}
self.record_id_map = {}
self.vocabulary_id_schema_id_map = {}
self.schema_id_main_field_id_map = {}

def find_vocabulary_by_name(self, identifier):
if not identifier in self.vocabulary_name_id_map:
error = f'Vocabulary name "{identifier}" not found'
if self.continue_on_error:
logging.error(error)
else:
raise Exception(error)
return self.vocabulary_name_id_map[identifier]

def lookup_vocabulary_name(self, identifier):
if not identifier in self.vocabulary_id_name_map:
error = f'Vocabulary name not found for vocabulary with ID {identifier}'
@@ -69,12 +81,22 @@ def retrieve_main_field_id(self, schema_id):
self.schema_id_main_field_id_map[schema_id] = main_definitions[0]['id']
return self.schema_id_main_field_id_map[schema_id]

def record_contains_value(self, record, value):
def record_contains_value(self, record, value, search_field=None):
field_id = None
if search_field is not None:
vocabulary = self.api.lookup_vocabulary(record['vocabularyId'])
schema = self.api.lookup_schema(vocabulary['schemaId'])
ids = [d['id'] for d in schema['definitions'] if d['name'] == search_field]
if len(ids) != 1:
logging.critical(f'Non unique "{search_field}" fields found: {ids}!')
sys.exit(1)
field_id = ids[0]
for f in record['fields']:
for v in f['values']:
for t in v['translations']:
if t['value'] == value:
return True
if field_id is None or f['definitionId'] == field_id:
for v in f['values']:
for t in v['translations']:
if t['value'] == value:
return True
return False

def extract_language_values(self, field):
119 changes: 112 additions & 7 deletions migration/lib/mets_manipulator.py
@@ -40,14 +40,14 @@ def process_mets_file(self):
self.ctx.log_processed(self.file_path)

def process_node(self, node):
if self.is_vocabulary_reference(node) and not self.is_already_migrated(node):
self.process_vocabulary_reference(node)
if self.ctx.dry:
dump_node(node)
if self.is_manual_id_reference(node):
self.process_manual_id_reference(node)
if self.ctx.dry:
dump_node(node)
elif self.is_vocabulary_reference(node) and not self.is_already_migrated(node):
self.process_vocabulary_reference(node)
if self.ctx.dry:
dump_node(node)
for child in node:
self.process_node(child)

Expand All @@ -67,6 +67,14 @@ def generate_record_uri(self, record_id):
return self.record_endpoint.replace('{{ID}}', str(record_id))

def process_vocabulary_reference(self, node):
if self.ctx.trust == 'ID':
self.process_vocabulary_reference_by_id(node)
elif self.ctx.trust == 'Value':
self.process_vocabulary_reference_by_value(node)
else:
raise Exception(f'Unknown trust source "{self.ctx.trust}"')

def process_vocabulary_reference_by_id(self, node):
try:
# Extract old vocabulary and record ids
valueURI = node.attrib['valueURI']
@@ -132,18 +140,115 @@ def process_vocabulary_reference(self, node):
error = f'Unable to retrieve vocabulary and record id from valueURI: {valueURI}\n\t\t{e}'
logging.debug(error)
self.ctx.log_issue(self.file_path, error)

def process_vocabulary_reference_by_value(self, node):
try:
vocabulary_name = node.attrib['authority']

if vocabulary_name == 'geonames':
return
vocabulary_id = self.ctx.find_vocabulary_by_name(vocabulary_name)
except Exception as e:
error = f'Unable to retrieve vocabulary by name: {vocabulary_name}\n\t\t{e}'
logging.debug(error)
self.ctx.log_issue(self.file_path, error)
return

try:
value = node.text

search_field=None
inverse_search_field=None
if self.ctx.enable_relation_vocabulary_column_logic and 'Relationship' in vocabulary_name:
parent = node.getparent()
if parent is None:
logging.warning('No parent found!')
dump_node(node)
return

entity_type = None
for sibling in parent:
if sibling.attrib['name'] == 'RelationEntityType':
entity_type = sibling.text
break

entity_type_in_relation_count = vocabulary_name.count(entity_type)
if entity_type_in_relation_count == 1:
# Find out relation direction
separator_position = vocabulary_name.index('-')
entity_type_position = vocabulary_name.index(entity_type)

# use second column of vocabulary: `Reverse relationship` (The relation vocabulary is specified from `A->B`, the relation references an entity of type `A` and is therefore of type `B`)
if entity_type_position < separator_position:
search_field='Reverse relationship'
inverse_search_field='Relationship type'
else:
search_field='Relationship type'
inverse_search_field='Reverse relationship'
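# Hypothetical example: for a vocabulary named 'Person-Institution Relationship' with RelationEntityType 'Person',
# 'Person' occurs before the '-', so the record is looked up via its 'Reverse relationship' column.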

try:
new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=search_field)
except Exception:
new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=inverse_search_field)
old_value = node.text
record_data = self.ctx.api.lookup_record(new_record_id)

v = self.ctx.api.lookup_vocabulary(record_data['vocabularyId'])
s = self.ctx.api.lookup_schema(v['schemaId'])
ids = [d['id'] for d in s['definitions'] if d['name'] == search_field] # We need the field that we originally searched on
if len(ids) != 1:
logging.critical(f'Non unique "{search_field}" fields found: {ids}!')
sys.exit(1)

field_data = [f for f in record_data['fields'] if f['definitionId'] == ids[0]]
if len(field_data) != 1:
logging.critical(f'Record [{new_record_id}] has no unique search column entry field')
sys.exit(1)

# Replace node text if not matching any translation of main value
translated_main_values = self.ctx.extract_language_values(field_data[0])
new_value = self.ctx.extract_preferred_language(translated_main_values)

#dump_node(node)
logging.warning(f'Relation was saved in the wrong direction; correct direction found and corrected: "{old_value}" -> "{new_value}"')
node.text = new_value

else:
new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=None)

# Set all attributes accordingly
node.attrib['authority'] = vocabulary_name
node.attrib['authorityURI'] = self.generate_vocabulary_uri(vocabulary_id)
node.attrib['valueURI'] = self.generate_record_uri(new_record_id)

self.changed = True
except Exception as e:
error = f'Unable to find record by value: {value}\n\t\t{e}'
logging.error(error)
self.ctx.log_issue(self.file_path, error)

def process_manual_id_reference(self, node):
try:
if node.text is None:
return
record_id_old = int(node.text)
record_id_new = self.ctx.lookup_record_id(record_id_old)
node.text = str(record_id_new)

if 'authority' in node.attrib or 'authorityURI' in node.attrib or 'valueURI' in node.attrib:
record = self.ctx.api.lookup_record(record_id_new)
vocabulary = self.ctx.api.lookup_vocabulary(record['vocabularyId'])
node.attrib['authority'] = vocabulary['name']
node.attrib['authorityURI'] = self.generate_vocabulary_uri(vocabulary['id'])
node.attrib['valueURI'] = self.generate_record_uri(record_id_new)

self.changed = True
except Exception as e:
msg = f'Unable to read ID {node.text}!'
logging.critical(msg)
raise Exception(msg)
logging.warning(msg)
#raise Exception(msg)

def dump_node(node):
attributes = ' '.join(f'{k}="{v}"' for k, v in node.attrib.items())
logging.info(f'<{node.tag} {attributes} />')
value = node.text
logging.info(f'<{node.tag} {attributes}>{value}</{node.tag}>')
2 changes: 2 additions & 0 deletions migration/lib/mets_migrator.py
@@ -43,6 +43,8 @@ def load_mapping_file(self):

if not vocabulary_id_new in self.ctx.vocabulary_id_name_map:
self.ctx.vocabulary_id_name_map[vocabulary_id_new] = vocabulary_name
if not vocabulary_name in self.ctx.vocabulary_name_id_map:
self.ctx.vocabulary_name_id_map[vocabulary_name] = vocabulary_id_new
if not vocabulary_id_old in self.ctx.vocabulary_id_map:
self.ctx.vocabulary_id_map[vocabulary_id_old] = vocabulary_id_new
if not record_id_old in self.ctx.record_id_map:
4 changes: 3 additions & 1 deletion migration/metadata-migrator.py
@@ -14,7 +14,7 @@ def main():
args.vocabulary_server_port,
args.vocabulary_server_token
)
ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix)
ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic)

try:
migrator = MetsMigrator(ctx)
@@ -39,6 +39,8 @@ def parse_args():
parser.add_argument('--vocabulary-server-port', type=str, default='8081', help='vocabulary server port')
parser.add_argument('--vocabulary-server-token', type=str, default=None, help='vocabulary server security token')
parser.add_argument('--preferred-mets-main-value-language', type=str, default='eng', help='Default language to use for mets value writing, if present and prior value invalid')
parser.add_argument('--trust', required=False, type=str, default='ID', help='Set the data source to trust for the migration. Possible values are "ID" and "Value". If "ID" is set, the record ID is parsed from the valueURI and used to find the migrated record. If "Value" is set, the value of the XML element is used to find the newly migrated record. Defaults to "ID".')
parser.add_argument('--enable-relation-vocabulary-column-logic', required=False, default=False, action='store_const', const=True, help='Activate the logic for finding the correct relationship vocabulary column (reverse vs. non-reverse, artist dictionary)')
parser.add_argument('--manual-id-fix', type=str, default=None, help='Manually fix the record ID of elements whose name attribute matches this parameter. Caution, this must not be executed twice!')
parser.add_argument('--log', required=False, default='INFO', help='logger level (possible values are: NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL)')
parser.add_argument('--verbose', required=False, default=False, action='store_const', const=True, help='verbose output')
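As a sketch of how the new options combine with an existing migration call (all other flags elided), a value-based run with the relation column logic enabled would be invoked like:

python metadata-migrator.py ... --trust Value --enable-relation-vocabulary-column-logic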
4 changes: 2 additions & 2 deletions module-core/pom.xml
@@ -10,7 +10,7 @@
</parent>
<groupId>io.goobi.vocabulary</groupId>
<artifactId>vocabulary-server-core</artifactId>
<version>1.1.8</version>
<version>1.1.9</version>
<name>Vocabulary-Server-Core</name>
<description>Spring Boot based RESTful web service for vocabulary management</description>
<packaging>jar</packaging>
@@ -35,7 +35,7 @@
<dependency>
<groupId>io.goobi.vocabulary</groupId>
<artifactId>vocabulary-server-exchange</artifactId>
<version>1.1.8</version>
<version>1.1.9</version>
<scope>compile</scope>
</dependency>

27 changes: 27 additions & 0 deletions FieldDefinitionController.java (new file)
@@ -0,0 +1,27 @@
package io.goobi.vocabulary.api;

import io.goobi.vocabulary.api.assemblers.FieldDefinitionAssembler;
import io.goobi.vocabulary.exchange.FieldDefinition;
import io.goobi.vocabulary.service.manager.FieldDefinitionDTOManager;
import org.springframework.hateoas.EntityModel;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

@RestController
@RequestMapping("/api/v1")
public class FieldDefinitionController {
private final FieldDefinitionDTOManager manager;
private final FieldDefinitionAssembler assembler;

public FieldDefinitionController(FieldDefinitionDTOManager manager, FieldDefinitionAssembler assembler) {
this.manager = manager;
this.assembler = assembler;
}

@GetMapping("/fieldDefinitions/{id}")
public EntityModel<FieldDefinition> one(@PathVariable long id) {
return assembler.toModel(manager.get(id));
}
}
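The new endpoint can be exercised with a plain GET request; a hypothetical call, assuming the vocabulary server's default port 8081 mentioned elsewhere in this release and an illustrative definition ID:

curl http://localhost:8081/api/v1/fieldDefinitions/42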
@@ -1,6 +1,6 @@
package io.goobi.vocabulary.api;

import io.goobi.vocabulary.maintenance.selfcheck.SelfCheckResult;
import io.goobi.vocabulary.monitoring.SelfCheckResult;
import io.goobi.vocabulary.service.manager.MaintenanceManager;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
@@ -1,6 +1,6 @@
package io.goobi.vocabulary.api;

import io.goobi.vocabulary.maintenance.MonitoringResult;
import io.goobi.vocabulary.monitoring.MonitoringResult;
import io.goobi.vocabulary.service.manager.MaintenanceManager;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.GetMapping;