Merge pull request 'release_1.1.9' (#13) from release_1.1.9 into master
Dominick Leppich committed Nov 15, 2024
2 parents f1fea75 + fd4687b commit 8d81e1e
Showing 24 changed files with 264 additions and 43 deletions.
2 changes: 2 additions & 0 deletions docs/de/migration.md
@@ -155,6 +155,8 @@ Diese URLs sollten gültig und auflösbar sein.
Der Host-Teil dieser URLs wird aus der Anfrage generiert.

## Migration der Mets-Datei
**Achtung:** Bitte erstellen Sie vorher ein Backup. In einigen Fällen sind die METS-Dateien inkonsistent, d. h. die gespeicherten Werte und die Vokabularreferenzen stimmen nicht überein. Das Migrationsskript verwendet die Vokabularreferenzen, um die richtigen Werte zu finden. Wenn die Vokabularreferenzen falsch und die Werte richtig sind, beschädigt die Migration die Daten!

Dieser Schritt kann nur durchgeführt werden, wenn die Migration der Vokabulardaten erfolgreich abgeschlossen wurde!

Wenn die Datei `migration.csv` vorhanden ist, führen Sie den folgenden Befehl in der aktivierten Python-Umgebung aus:
2 changes: 2 additions & 0 deletions docs/en/migration.md
@@ -127,6 +127,8 @@ blau,123
This file maps all record values to the corresponding record IDs in the reference vocabulary.

## Mets file migration
**Caution:** Please create a backup beforehand. In some cases the METS files are inconsistent, i.e. the stored values and the vocabulary references do not match. The migration script uses the vocabulary references to find the correct values. If the vocabulary references are wrong and the values are correct, the migration will corrupt the data!

This step can only be done after the vocabulary data migration has been successfully completed!

With the `migration.csv` file present, run the following command in the activated Python environment:
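A minimal sketch of what this call can look like (the `--mapping-file` and `--metadata-directory` flag names are assumptions inferred from the migration script's arguments and may differ; adjust paths and connection settings to your setup):

python metadata-migrator.py --vocabulary-server-port 8081 --mapping-file migration.csv --metadata-directory /path/to/goobi/metadata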
24 changes: 18 additions & 6 deletions migration/lib/api.py
@@ -1,6 +1,7 @@
import logging
import requests
import json
import sys

SCHEMA_INSERTION_URL = 'http://{{HOST}}:{{PORT}}/api/v1/schemas'
SCHEMA_LOOKUP_URL = 'http://{{HOST}}:{{PORT}}/api/v1/schemas/{{SCHEMA_ID}}'
@@ -61,7 +62,16 @@ def query(self, url, obj=None, method='POST'):
response = requests.request(method, url=url, headers=HEADERS, data=payload)
try:
# Check for success
if response.status_code // 100 != 2:
if response.status_code == 401 or response.status_code == 403:
error_msg = 'API call was not successful, reason: Authentication'
logging.critical(error_msg)
sys.exit(1)
if response.status_code == 404:
error_msg = f'API call was not successful, reason: Entity not found {url}'
logging.warning(error_msg)
raise Exception(error_msg)
elif response.status_code // 100 != 2:
error_msg = f'API call was not successful, reason:\n{extract_error_from_response(response)}'
logging.warning(error_msg)
raise Exception(error_msg)
@@ -111,18 +121,20 @@ def insert_record(self, record):
result = self.query(url, record)
return result['id']

def find_record(self, ctx, vocabulary_id, search_term):
def find_record(self, ctx, vocabulary_id, search_term, search_field=None):
url = self.urls[RECORD_SEARCH].replace('{{VOCABULARY_ID}}', str(vocabulary_id)).replace('{{SEARCH_TERM}}', search_term)
result = self.query(url, obj=None, method='GET')
if not '_embedded' in result:
raise Exception(f'Record search for search term "{search_term}" has no results')
raise Exception(f'Record search in vocabulary "{vocabulary_id}" for search term "{search_term}" has no results')
results = result['_embedded']['vocabularyRecordList']
# Filter for exact searches
results = [r for r in results if ctx.record_contains_value(r, search_term)]
results = [r for r in results if ctx.record_contains_value(r, search_term, search_field=search_field)]

if len(results) == 0:
raise Exception(f'Record search for search term "{search_term}" has no results')
raise Exception(f'Record search in vocabulary "{vocabulary_id}" for search term "{search_term}" has no results')
elif len(results) > 1:
raise Exception(f'Record search for search term "{search_term}" has no unique result, {len(results)} records found')
ids = [r['id'] for r in results]
raise Exception(f'Record search in vocabulary "{vocabulary_id}" for search term "{search_term}" has no unique result, {len(results)} records found: {ids}')

return results[0]['id']

34 changes: 28 additions & 6 deletions migration/lib/mets_context.py
@@ -6,7 +6,7 @@
RECORD_PATTERN = re.compile('^(\\d+).*$')

class Context:
def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix):
def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic):
self.api = api
self.dry = dry
self.verbose = verbose
@@ -15,12 +15,24 @@ def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, map
self.mapping_file = mapping_file
self.preferred_mets_main_value_language = preferred_mets_main_value_language
self.manual_id_fix = manual_id_fix
self.trust = trust
self.enable_relation_vocabulary_column_logic = enable_relation_vocabulary_column_logic
self.vocabulary_name_id_map = {}
self.vocabulary_id_name_map = {}
self.vocabulary_id_map = {}
self.record_id_map = {}
self.vocabulary_id_schema_id_map = {}
self.schema_id_main_field_id_map = {}

def find_vocabulary_by_name(self, identifier):
if not identifier in self.vocabulary_name_id_map:
error = f'Vocabulary name "{identifier}" not found'
if self.continue_on_error:
logging.error(error)
else:
raise Exception(error)
return self.vocabulary_name_id_map[identifier]

def lookup_vocabulary_name(self, identifier):
if not identifier in self.vocabulary_id_name_map:
error = f'Vocabulary name not found for vocabulary with ID {identifier}'
@@ -69,12 +81,22 @@ def retrieve_main_field_id(self, schema_id):
self.schema_id_main_field_id_map[schema_id] = main_definitions[0]['id']
return self.schema_id_main_field_id_map[schema_id]

def record_contains_value(self, record, value):
def record_contains_value(self, record, value, search_field=None):
field_id = None
if search_field is not None:
vocabulary = self.api.lookup_vocabulary(record['vocabularyId'])
schema = self.api.lookup_schema(vocabulary['schemaId'])
ids = [d['id'] for d in schema['definitions'] if d['name'] == search_field]
if len(ids) != 1:
logging.critical(f'Non unique "{search_field}" fields found: {ids}!')
sys.exit(1)
field_id = ids[0]
for f in record['fields']:
for v in f['values']:
for t in v['translations']:
if t['value'] == value:
return True
if field_id is None or f['definitionId'] == field_id:
for v in f['values']:
for t in v['translations']:
if t['value'] == value:
return True
return False

def extract_language_values(self, field):
119 changes: 112 additions & 7 deletions migration/lib/mets_manipulator.py
@@ -40,14 +40,14 @@ def process_mets_file(self):
self.ctx.log_processed(self.file_path)

def process_node(self, node):
if self.is_vocabulary_reference(node) and not self.is_already_migrated(node):
self.process_vocabulary_reference(node)
if self.ctx.dry:
dump_node(node)
if self.is_manual_id_reference(node):
self.process_manual_id_reference(node)
if self.ctx.dry:
dump_node(node)
elif self.is_vocabulary_reference(node) and not self.is_already_migrated(node):
self.process_vocabulary_reference(node)
if self.ctx.dry:
dump_node(node)
for child in node:
self.process_node(child)

Expand All @@ -67,6 +67,14 @@ def generate_record_uri(self, record_id):
return self.record_endpoint.replace('{{ID}}', str(record_id))

def process_vocabulary_reference(self, node):
if self.ctx.trust == 'ID':
self.process_vocabulary_reference_by_id(node)
elif self.ctx.trust == 'Value':
self.process_vocabulary_reference_by_value(node)
else:
raise Exception(f'Unknown trust source "{self.ctx.trust}"')

def process_vocabulary_reference_by_id(self, node):
try:
# Extract old vocabulary and record ids
valueURI = node.attrib['valueURI']
@@ -132,18 +140,115 @@ def process_vocabulary_reference(self, node):
error = f'Unable to retrieve vocabulary and record id from valueURI: {valueURI}\n\t\t{e}'
logging.debug(error)
self.ctx.log_issue(self.file_path, error)

def process_vocabulary_reference_by_value(self, node):
try:
vocabulary_name = node.attrib['authority']

if vocabulary_name == 'geonames':
return
vocabulary_id = self.ctx.find_vocabulary_by_name(vocabulary_name)
except Exception as e:
error = f'Unable to retrieve vocabulary by name: {vocabulary_name}\n\t\t{e}'
logging.debug(error)
self.ctx.log_issue(self.file_path, error)
return

try:
value = node.text

search_field=None
inverse_search_field=None
if self.ctx.enable_relation_vocabulary_column_logic and 'Relationship' in vocabulary_name:
parent = node.getparent()
if parent is None:
logging.warning('No parent found!')
dump_node(node)
return

entity_type = None
for sibling in parent:
if sibling.attrib['name'] == 'RelationEntityType':
entity_type = sibling.text
break

entity_type_in_relation_count = vocabulary_name.count(entity_type)
if entity_type_in_relation_count == 1:
# Find out relation direction
separator_position = vocabulary_name.index('-')
entity_type_position = vocabulary_name.index(entity_type)

# use second column of vocabulary: `Reverse relationship` (The relation vocabulary is specified from `A->B`, the relation references an entity of type `A` and is therefore of type `B`)
if entity_type_position < separator_position:
search_field='Reverse relationship'
inverse_search_field='Relationship type'
else:
search_field='Relationship type'
inverse_search_field='Reverse relationship'
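# Hypothetical example: for a vocabulary named 'Person-Institution Relationship' with RelationEntityType 'Person',
# 'Person' occurs before the '-', so the record is looked up via its 'Reverse relationship' column.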

try:
new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=search_field)
except Exception:
new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=inverse_search_field)
old_value = node.text
record_data = self.ctx.api.lookup_record(new_record_id)

v = self.ctx.api.lookup_vocabulary(record_data['vocabularyId'])
s = self.ctx.api.lookup_schema(v['schemaId'])
ids = [d['id'] for d in s['definitions'] if d['name'] == search_field] # We need the field that we originally searched on
if len(ids) != 1:
logging.critical(f'Non unique "{search_field}" fields found: {ids}!')
sys.exit(1)

field_data = [f for f in record_data['fields'] if f['definitionId'] == ids[0]]
if len(field_data) != 1:
logging.critical(f'Record [{new_record_id}] has no unique search column entry field')
sys.exit(1)

# Replace node text if not matching any translation of main value
translated_main_values = self.ctx.extract_language_values(field_data[0])
new_value = self.ctx.extract_preferred_language(translated_main_values)

#dump_node(node)
logging.warning(f'Relation was saved in the wrong direction; correct direction found and corrected: "{old_value}" -> "{new_value}"')
node.text = new_value

else:
new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=None)

# Set all attributes accordingly
node.attrib['authority'] = vocabulary_name
node.attrib['authorityURI'] = self.generate_vocabulary_uri(vocabulary_id)
node.attrib['valueURI'] = self.generate_record_uri(new_record_id)

self.changed = True
except Exception as e:
error = f'Unable to find record by value: {value}\n\t\t{e}'
logging.error(error)
self.ctx.log_issue(self.file_path, error)

def process_manual_id_reference(self, node):
try:
if node.text is None:
return
record_id_old = int(node.text)
record_id_new = self.ctx.lookup_record_id(record_id_old)
node.text = str(record_id_new)

if 'authority' in node.attrib or 'authorityURI' in node.attrib or 'valueURI' in node.attrib:
record = self.ctx.api.lookup_record(record_id_new)
vocabulary = self.ctx.api.lookup_vocabulary(record['vocabularyId'])
node.attrib['authority'] = vocabulary['name']
node.attrib['authorityURI'] = self.generate_vocabulary_uri(vocabulary['id'])
node.attrib['valueURI'] = self.generate_record_uri(record_id_new)

self.changed = True
except Exception as e:
msg = f'Unable to read ID {node.text}!'
logging.critical(msg)
raise Exception(msg)
logging.warning(msg)
#raise Exception(msg)

def dump_node(node):
attributes = ' '.join(f'{k}="{v}"' for k, v in node.attrib.items())
logging.info(f'<{node.tag} {attributes} />')
value = node.text
logging.info(f'<{node.tag} {attributes}>{value}</{node.tag}>')
2 changes: 2 additions & 0 deletions migration/lib/mets_migrator.py
@@ -43,6 +43,8 @@ def load_mapping_file(self):

if not vocabulary_id_new in self.ctx.vocabulary_id_name_map:
self.ctx.vocabulary_id_name_map[vocabulary_id_new] = vocabulary_name
if not vocabulary_name in self.ctx.vocabulary_name_id_map:
self.ctx.vocabulary_name_id_map[vocabulary_name] = vocabulary_id_new
if not vocabulary_id_old in self.ctx.vocabulary_id_map:
self.ctx.vocabulary_id_map[vocabulary_id_old] = vocabulary_id_new
if not record_id_old in self.ctx.record_id_map:
4 changes: 3 additions & 1 deletion migration/metadata-migrator.py
@@ -14,7 +14,7 @@ def main():
args.vocabulary_server_port,
args.vocabulary_server_token
)
ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix)
ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic)

try:
migrator = MetsMigrator(ctx)
@@ -39,6 +39,8 @@ def parse_args():
parser.add_argument('--vocabulary-server-port', type=str, default='8081', help='vocabulary server port')
parser.add_argument('--vocabulary-server-token', type=str, default=None, help='vocabulary server security token')
parser.add_argument('--preferred-mets-main-value-language', type=str, default='eng', help='Default language to use for mets value writing, if present and prior value invalid')
parser.add_argument('--trust', required=False, type=str, default='ID', help='Set the data source to trust for the migration. Possible values are "ID" and "Value". If "ID" is set, the record ID is parsed from the valueURI and used to find the migrated record. If "Value" is set, the value of the XML element is used to find the newly migrated record. Defaults to "ID".')
parser.add_argument('--enable-relation-vocabulary-column-logic', required=False, default=False, action='store_const', const=True, help='Activate the logic for finding the correct relationship vocabulary column (reverse vs. non-reverse, artist dictionary)')
parser.add_argument('--manual-id-fix', type=str, default=None, help='Manually fix the record ID of elements whose name attribute matches this parameter. Caution, this must not be executed twice!')
parser.add_argument('--log', required=False, default='INFO', help='logger level (possible values are: NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL)')
parser.add_argument('--verbose', required=False, default=False, action='store_const', const=True, help='verbose output')
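As a sketch of how the new options combine with an existing migration call (all other flags elided), a value-based run with the relation column logic enabled would be invoked like:

python metadata-migrator.py ... --trust Value --enable-relation-vocabulary-column-logic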
4 changes: 2 additions & 2 deletions module-core/pom.xml
@@ -10,7 +10,7 @@
</parent>
<groupId>io.goobi.vocabulary</groupId>
<artifactId>vocabulary-server-core</artifactId>
<version>1.1.8</version>
<version>1.1.9</version>
<name>Vocabulary-Server-Core</name>
<description>Spring Boot based RESTful web service for vocabulary management</description>
<packaging>jar</packaging>
@@ -35,7 +35,7 @@
<dependency>
<groupId>io.goobi.vocabulary</groupId>
<artifactId>vocabulary-server-exchange</artifactId>
<version>1.1.8</version>
<version>1.1.9</version>
<scope>compile</scope>
</dependency>

27 changes: 27 additions & 0 deletions FieldDefinitionController.java (new file)
@@ -0,0 +1,27 @@
package io.goobi.vocabulary.api;

import io.goobi.vocabulary.api.assemblers.FieldDefinitionAssembler;
import io.goobi.vocabulary.exchange.FieldDefinition;
import io.goobi.vocabulary.service.manager.FieldDefinitionDTOManager;
import org.springframework.hateoas.EntityModel;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

@RestController
@RequestMapping("/api/v1")
public class FieldDefinitionController {
private final FieldDefinitionDTOManager manager;
private final FieldDefinitionAssembler assembler;

public FieldDefinitionController(FieldDefinitionDTOManager manager, FieldDefinitionAssembler assembler) {
this.manager = manager;
this.assembler = assembler;
}

@GetMapping("/fieldDefinitions/{id}")
public EntityModel<FieldDefinition> one(@PathVariable long id) {
return assembler.toModel(manager.get(id));
}
}
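The new endpoint can be exercised with a plain GET request; a hypothetical call, assuming the vocabulary server's default port 8081 mentioned elsewhere in this release and an illustrative definition ID:

curl http://localhost:8081/api/v1/fieldDefinitions/42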
@@ -1,6 +1,6 @@
package io.goobi.vocabulary.api;

import io.goobi.vocabulary.maintenance.selfcheck.SelfCheckResult;
import io.goobi.vocabulary.monitoring.SelfCheckResult;
import io.goobi.vocabulary.service.manager.MaintenanceManager;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
@@ -1,6 +1,6 @@
package io.goobi.vocabulary.api;

import io.goobi.vocabulary.maintenance.MonitoringResult;
import io.goobi.vocabulary.monitoring.MonitoringResult;
import io.goobi.vocabulary.service.manager.MaintenanceManager;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.GetMapping;