From ccedf2b43df412fcc5d711c6cbcf43e76e5c8891 Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Thu, 21 Nov 2024 08:59:59 +0100
Subject: [PATCH 01/19] version: set to version 1.1.11-SNAPSHOT

---
 module-core/pom.xml     | 4 ++--
 module-exchange/pom.xml | 2 +-
 pom.xml                 | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/module-core/pom.xml b/module-core/pom.xml
index d779cf4..ef1cbae 100644
--- a/module-core/pom.xml
+++ b/module-core/pom.xml
@@ -10,7 +10,7 @@
     </parent>
     <groupId>io.goobi.vocabulary</groupId>
     <artifactId>vocabulary-server-core</artifactId>
-    <version>1.1.10</version>
+    <version>1.1.11-SNAPSHOT</version>
     <name>Vocabulary-Server-Core</name>
     <description>Spring Boot based RESTful web service for vocabulary management</description>
     <packaging>jar</packaging>
@@ -35,7 +35,7 @@
         <dependency>
             <groupId>io.goobi.vocabulary</groupId>
             <artifactId>vocabulary-server-exchange</artifactId>
-            <version>1.1.10</version>
+            <version>1.1.11-SNAPSHOT</version>
             <scope>compile</scope>
         </dependency>
 
diff --git a/module-exchange/pom.xml b/module-exchange/pom.xml
index 7ca1a99..c7e2b70 100644
--- a/module-exchange/pom.xml
+++ b/module-exchange/pom.xml
@@ -4,7 +4,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>io.goobi.vocabulary</groupId>
     <artifactId>vocabulary-server-exchange</artifactId>
-    <version>1.1.10</version>
+    <version>1.1.11-SNAPSHOT</version>
     <name>Vocabulary Exchange</name>
     <description>Vocabulary data exchange classes</description>
     <packaging>jar</packaging>
diff --git a/pom.xml b/pom.xml
index ad2b989..226d729 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>io.goobi.vocabulary</groupId>
     <artifactId>vocabulary-server</artifactId>
-    <version>1.1.10</version>
+    <version>1.1.11-SNAPSHOT</version>
     <name>Vocabulary-Server</name>
     <packaging>pom</packaging>
     <description>RESTful webservice for vocabulary management</description>

From 27992d36ff9dd175226717a0624389b76e3f2ba1 Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Thu, 21 Nov 2024 10:39:22 +0100
Subject: [PATCH 02/19] task: delete metadata for intentionally missing
 vocabulary values

---
 migration/lib/mets_context.py     | 10 +++++++++-
 migration/lib/mets_manipulator.py | 26 +++++++++++++++++++++++---
 migration/lib/mets_migrator.py    | 26 ++++++++++++++++++++++++++
 migration/metadata-migrator.py    |  3 ++-
 4 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/migration/lib/mets_context.py b/migration/lib/mets_context.py
index 37b2bed..b406a40 100644
--- a/migration/lib/mets_context.py
+++ b/migration/lib/mets_context.py
@@ -6,7 +6,7 @@
 RECORD_PATTERN = re.compile('^(\\d+).*$')
 
 class Context:
-    def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic):
+    def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic, delete_missing_vocabulary_references):
         self.api = api
         self.dry = dry
         self.verbose = verbose
@@ -17,6 +17,8 @@ def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, map
         self.manual_id_fix = manual_id_fix
         self.trust = trust
         self.enable_relation_vocabulary_column_logic = enable_relation_vocabulary_column_logic
+        self.delete_missing_vocabulary_references = delete_missing_vocabulary_references
+        self.removable_metadata_map = {}
         self.vocabulary_name_id_map = {}
         self.vocabulary_id_name_map = {}
         self.vocabulary_id_map = {}
@@ -133,6 +135,12 @@ def robust_find_record_id(self, parts):
             return vocabulary_id, record_id
         except:
             return None, None
+    
+    def is_removable_metadata(self, vocabulary_id, value):
+        if not vocabulary_id in self.removable_metadata_map:
+            return False
+
+        return value in self.removable_metadata_map[vocabulary_id]
 
     def log_processed(self, file):
         with open('mets_migration.log', 'a') as f:
diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index 49d403e..093b2de 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -187,8 +187,10 @@ def process_vocabulary_reference_by_value(self, node):
                         inverse_search_field='Reverse relationship'
 
                 try:
+                    # First, try to find the value in the correct column
                     new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=search_field)
                 except:
+                    # If failed, try to find the value in the other column (assuming the value was stored incorrectly)
                     new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=inverse_search_field)
                     old_value = node.text
                     record_data = self.ctx.api.lookup_record(new_record_id)
@@ -223,9 +225,27 @@ def process_vocabulary_reference_by_value(self, node):
 
             self.changed = True
         except Exception as e:
-            error = f'Unable to find record by value: {value}\n\t\t{e}'
-            logging.error(error)
-            self.ctx.log_issue(self.file_path, error)
+            # If this fails as well and the value is not found, remove the metadata if configured
+            if 'has no results' in e.__str__() and self.ctx.is_removable_metadata(vocabulary_id, node.text):
+                logging.warn(f'Removing node due to intentionally missing vocabulary value: "{node.text}"')
+                self.remove_metadata_node(node)
+            else:
+                error = f'Unable to find record by value: {value}\n\t\t{e}'
+                logging.error(error)
+                self.ctx.log_issue(self.file_path, error)
+
+    def remove_metadata_node(self, node):
+        parent = node.getparent()
+        if parent != None and parent.attrib['type'] == 'group':
+            node = parent
+        parent = node.getparent()
+
+        if parent == None:
+            dump_node(node)
+            raise Exception(f'Unable to remove node due to missing parent')
+
+        parent.remove(node)
+        self.changed = True
 
     def process_manual_id_reference(self, node):
         try:
diff --git a/migration/lib/mets_migrator.py b/migration/lib/mets_migrator.py
index 1969a07..429fb0e 100644
--- a/migration/lib/mets_migrator.py
+++ b/migration/lib/mets_migrator.py
@@ -11,6 +11,7 @@ def __init__(self, ctx):
     
     def migrate(self):
         self.load_mapping_file()
+        self.load_delete_missing_vocabulary_references_file()
         self.mets_files = self.scan_for_mets_files()
         logging.info(f'{len(self.mets_files)} mets file(s) found!')
         logging.info(f'Start processing ...')
@@ -52,6 +53,31 @@ def load_mapping_file(self):
                 else:
                     raise Exception(f'Mapping file contains duplicate entry for old record {record_id_old}')
 
+    def load_delete_missing_vocabulary_references_file(self):
+        if self.ctx.delete_missing_vocabulary_references == None:
+            return
+
+        header = None
+        with open(self.ctx.delete_missing_vocabulary_references, 'r') as f:
+            for line in f:
+                line = line.strip()
+                if header == None:
+                    header = line
+                    if header != CSV_DELIMITER.join(['vocabulary_id', 'value']):
+                        raise Exception('Header mismatch in mapping file!')
+                    continue
+
+                parts = line.split(CSV_DELIMITER)
+                if len(parts) != 2:
+                    raise Exception(f'Wrong number of fields in line: {line}')
+                
+                vocabulary_id = int(parts[0])
+                value = parts[1]
+
+                if not vocabulary_id in self.ctx.removable_metadata_map:
+                    self.ctx.removable_metadata_map[vocabulary_id] = []
+                self.ctx.removable_metadata_map[vocabulary_id].append(value)
+
     def scan_for_mets_files(self):
         results = []
         for root, dirs, files in os.walk(self.ctx.metadata_directory):
diff --git a/migration/metadata-migrator.py b/migration/metadata-migrator.py
index 582f97e..8f38dd9 100644
--- a/migration/metadata-migrator.py
+++ b/migration/metadata-migrator.py
@@ -14,7 +14,7 @@ def main():
         args.vocabulary_server_port,
         args.vocabulary_server_token
     )
-    ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic)
+    ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic, args.delete_missing_vocabulary_references)
 
     try:
         migrator = MetsMigrator(ctx)
@@ -41,6 +41,7 @@ def parse_args():
     parser.add_argument('--preferred-mets-main-value-language', type=str, default='eng', help='Default language to use for mets value writing, if present and prior value invalid')
     parser.add_argument('--trust', required=False, type=str, default='ID', help='Set the data source to trust for the migration. Possible values are: "ID" and "Value". If "ID" is set, the record ID is parsed from the valueURI and used to find the migrated record. If "Value" is set, the XML elements value is used to find the newly migrated record by value. Defaults to "ID".')
     parser.add_argument('--enable-relation-vocabulary-column-logic', required=False, default=False, action='store_const', const=True, help='Activate relationship vocabulary correct column finding logic (reverse vs non-reverse, artist dictionary)')
+    parser.add_argument('--delete-missing-vocabulary-references', type=str, required=False, default=None, help='vocabulary and value mapping file defining intentionally removed vocabulary values that should be removed in the Mets files as well.')
     parser.add_argument('--manual-id-fix', type=str, default=None, help='Manually fix the record ID of elements whose name attribute matches this parameter. Caution, this must not be executed twice!')
     parser.add_argument('--log', required=False, default='INFO', help='logger level (possible values are: NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL)')
     parser.add_argument('--verbose', required=False, default=False, action='store_const', const=True, help='verbose output')

From 80f81af19034ddb9efab0c78b919643dec0a03d5 Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Fri, 22 Nov 2024 11:36:30 +0100
Subject: [PATCH 03/19] task: add force mode to mets migration

---
 migration/lib/mets_context.py     | 3 ++-
 migration/lib/mets_manipulator.py | 9 +++++----
 migration/metadata-migrator.py    | 3 ++-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/migration/lib/mets_context.py b/migration/lib/mets_context.py
index b406a40..30cacbd 100644
--- a/migration/lib/mets_context.py
+++ b/migration/lib/mets_context.py
@@ -6,10 +6,11 @@
 RECORD_PATTERN = re.compile('^(\\d+).*$')
 
 class Context:
-    def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic, delete_missing_vocabulary_references):
+    def __init__(self, api, dry, verbose, force, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic, delete_missing_vocabulary_references):
         self.api = api
         self.dry = dry
         self.verbose = verbose
+        self.force = force
         self.continue_on_error = continue_on_error
         self.metadata_directory = metadata_directory
         self.mapping_file = mapping_file
diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index 093b2de..cf5aeed 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -44,10 +44,11 @@ def process_node(self, node):
             self.process_manual_id_reference(node)
             if self.ctx.dry:
                 dump_node(node)
-        elif self.is_vocabulary_reference(node) and not self.is_already_migrated(node):
-            self.process_vocabulary_reference(node)
-            if self.ctx.dry:
-                dump_node(node)
+        elif self.is_vocabulary_reference(node):
+            if self.ctx.force or not self.is_already_migrated(node):
+                self.process_vocabulary_reference(node)
+                if self.ctx.dry:
+                    dump_node(node)
         for child in node:
             self.process_node(child)
 
diff --git a/migration/metadata-migrator.py b/migration/metadata-migrator.py
index 8f38dd9..a1edc87 100644
--- a/migration/metadata-migrator.py
+++ b/migration/metadata-migrator.py
@@ -14,7 +14,7 @@ def main():
         args.vocabulary_server_port,
         args.vocabulary_server_token
     )
-    ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic, args.delete_missing_vocabulary_references)
+    ctx = Context(api, args.dry, args.verbose, args.force, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic, args.delete_missing_vocabulary_references)
 
     try:
         migrator = MetsMigrator(ctx)
@@ -33,6 +33,7 @@ class RawTextDefaultsHelpFormatter(argparse.RawTextHelpFormatter, argparse.Argum
 def parse_args():
     parser = argparse.ArgumentParser(prog='metadata-migrator.py', formatter_class=RawTextDefaultsHelpFormatter, description='Metadata migration tool.')
     parser.add_argument('--dry', required=False, default=False, action='store_const', const=True, help='Don\'t persist changes but only print replacements to the console')
+    parser.add_argument('--force', '-f', required=False, default=False, action='store_const', const=True, help='Force a re-execution of the migration on already migrated metadata')
     parser.add_argument('--metadata-directory', '-d', required=True, help='directory to recursively scan for metadata to update')
     parser.add_argument('--mapping-file', '-m', required=True, help='vocabulary and record mapping file')
     parser.add_argument('--vocabulary-server-host', type=str, default='localhost', help='vocabulary server host')

From 5cc3dd94cdf23acba7871d685e466c66ebe197b9 Mon Sep 17 00:00:00 2001
From: Marcel Neumann <marcel.neuman@intranda.com>
Date: Fri, 22 Nov 2024 13:58:49 +0100
Subject: [PATCH 04/19] synchronise english and german version

---
 docs/en/setup.md | 151 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 109 insertions(+), 42 deletions(-)

diff --git a/docs/en/setup.md b/docs/en/setup.md
index a769a46..b7639c5 100644
--- a/docs/en/setup.md
+++ b/docs/en/setup.md
@@ -7,42 +7,120 @@ This documentation describes the process of bootstrapping the vocabulary server.
 - Adapt configuration file properly and remove unmodified lines.
     - Database credentials and database name.
     - Base URL and port.
-- **TODO** *Install the `vocabulary-server.jar` and the `application.properties` configuration file both directly into a new folder (e. g. `/opt/digiverso/vocabulary/`)*
+    - Security token (this must also be configured identically in Goobi).
+- Create a systemd service unit to start the vocabulary server automatically.
 
-## Start as systemd service
-- **TODO** *Create a systemd service unit for the vocabulary server (The application should be able to correctly shutdown on SIGTERM)*
-- **TODO** *Admin documentation here*
-- Run `java -jar vocabulary-server-VERSION.jar`.
-- If startup succeeds, you will see a line like this after a few seconds:
-```bash
-Started VocabularyServerApplication in 4.244 seconds (process running for 4.581)
-```
+## Setting up Goobi Workflow to communicate with the vocabulary server
 
-## Goobi Workflow Setup Communication
-- Goobi Workflow uses the new vocabulary server since version `24.07`.
-- Configure the `vocabularyServerHost` and `vocabularyServerPort` variables in `goobi_config.properties` according to your vocabulary server configuration.
+- Goobi Workflow has been using the new vocabulary server since version `24.07`.
+- Configure the variables `vocabularyServerHost`, `vocabularyServerPort` and `vocabularyServerToken` in the `goobi_config.properties` file according to the configuration of your vocabulary server.
 - Restart Goobi Workflow for the changes to take effect.
-- Navigate to `Administration` > `Vocabularies` to check if everything works. You should see a list of vocabularies if everything is okay (not right now, but after you have created some vocabularies or migrated the existing ones). You will see a red error message if something doesn't work.
-
-## Initial Setup
-- For proper operation, the vocabulary server needs some initial data.
-- This data contains language specifications (if multi-language vocabularies are used) and field type definitions. 
-- You can use the following script that installs some sample languages and field types.
-- Download the [Initial Data Script](https://jenkins.intranda.com/job/intranda/job/vocabulary-server/job/develop/lastSuccessfulBuild/artifact/install/default_setup.sh).
-- Change the `HOST` variable at the top according to the vocabulary server configuration, leave the `/api/v1` suffix unchanged.
-- Run the script.
-
-## Security
-- You can setup Apache url restrictions in order to secure the vocabulary server from unauthorized access.
-- **TODO** *Admins, please find out what and how to do it in detail.*
-
-## Installation Test
-- For all commands, change host and port accordingly.
-- After the initial setup, check that types were created successfully:
+- Navigate to `Administration` > `Vocabularies` to check that everything is working. You should see a list of vocabularies if everything is OK (not now, but after you have created some vocabularies or migrated the existing ones). If something is not working, you will see a red error message.
+
+
+## Initial setup
+
+- For proper operation, the vocabulary server requires some initial data.
+- This data contains language information (if multilingual vocabularies are used) and field type definitions.
+- You can use the following script, which installs some sample languages and field types.
+- Download the [Initial Data Script](https://github.com/intranda/goobi-vocabulary-server/raw/develop/install/default_setup.sh).
+- Change the variables `HOST` and `TOKEN` at the beginning according to the configuration of the vocabulary server, leave the suffix `/api/v1` unchanged.
+- Execute the script.
+
+## Installation script
+The vocabulary server requires Java 17, the systemd service assumes that Java 17 is the system default.
+
+For the above three points, under Ubuntu:
+``bash
+export VOC_PORT=8081
+export VOC_TOKEN=$(</dev/urandom tr -dc '[:alnum:]' | head -c17)
+export VOC_PATH=/opt/digiverso/vocabulary
+export VOC_USER=vocabulary
+export VOC_SQL_USER=${VOC_USER}
+export VOC_SQL_DB=${VOC_USER}
+export PW_SQL_VOC=$(</dev/urandom tr -dc '[:alnum:]' | head -c17)
+
+# create install folder
+sudo mkdir ${VOC_PATH}
+# download and link vocabulary server application file
+wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-server-core.jar -O - | sudo tee ${VOC_PATH}/vocabulary-server-core.jar >/dev/null
+
+# create system user which will run the service
+sudo adduser --system --home ${VOC_PATH}/home --shell /usr/sbin/nologin --no-create-home --disabled-login ${VOC_USER}
+
+# download the vocabulary migration tools
+wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/migration-tool.zip -O /tmp/migration-tool.zip
+sudo unzip /tmp/migration-tool.zip -d "${VOC_PATH}"
+
+# download and set up the config file
+wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/application.properties -O - | sudo tee ${VOC_PATH}/application.properties >/dev/null
+sudo sed -re "s|^(server.port=).*|\1${VOC_PORT}|" \
+     -e "s|^#?(security.token=).*|\1${VOC_TOKEN}|" \
+     -e "s|^(spring.datasource.username=).*|\1${VOC_SQL_USER}|" \
+     -e "s|^(spring.datasource.password=).*|\1${PW_SQL_VOC}|" \
+     -e "s|^(spring.datasource.url=).*|\1jdbc:mariadb://localhost:3306/${VOC_SQL_DB}|" \
+     -i ${VOC_PATH}/application.properties
+sudo chown ${VOC_USER}: ${VOC_PATH}/application.properties
+sudo chmod 600 ${VOC_PATH}/application.properties
+
+# install a systemd service unit file
+cat << EOF | sudo tee /etc/systemd/system/vocabulary.service
+[Unit]
+Description=Goobi Vocabulary Server
+After=mysql.service remote-fs.target
+Requires=mysql.service remote-fs.target
+
+[Service]
+WorkingDirectory=${VOC_PATH}
+Restart=always
+RestartSec=20s
+StartLimitInterval=100s
+StartLimitBurst=4
+ExecStart=/usr/bin/java -jar vocabulary-server-core.jar
+User=${VOC_USER}
+NoNewPrivileges=true
+ProtectSystem=true
+PrivateTmp=yes
+
+[Install]
+WantedBy=default.target tomcat9.service
+EOF
+sudo systemctl daemon-reload
+sudo systemctl enable vocabulary.service
+
+# create and configure the database
+sudo mysql -e "CREATE DATABASE ${VOC_SQL_DB} CHARACTER SET = 'utf8mb4' COLLATE = 'utf8mb4_unicode_ci';
+               CREATE USER '${VOC_SQL_USER}'@'localhost' IDENTIFIED BY '${PW_SQL_VOC}';
+               GRANT ALL PRIVILEGES ON ${VOC_SQL_DB}.* TO '${VOC_SQL_USER}'@'localhost' WITH GRANT OPTION;
+               FLUSH PRIVILEGES;"
+
+# append vocabulary server address to the Goobi workflow config
+grep ^vocabularyServerHost= /opt/digiverso/goobi/config/goobi_config.properties || echo "vocabularyServerHost=localhost"   | sudo tee -a /opt/digiverso/goobi/config/goobi_config.properties
+grep ^vocabularyServerPort= /opt/digiverso/goobi/config/goobi_config.properties || echo "vocabularyServerPort=${VOC_PORT}" | sudo tee -a /opt/digiverso/goobi/config/goobi_config.properties
+grep ^vocabularyServerToken= /opt/digiverso/goobi/config/goobi_config.properties || echo "vocabularyServerToken=${VOC_TOKEN}" | sudo tee -a /opt/digiverso/goobi/config/goobi_config.properties
+
+# start the vocabulary server and wait for startup
+sudo systemctl restart vocabulary.service & sudo journalctl -u vocabulary.service  -f -n 0 | grep -q "Started VocabularyServerApplication in"
+
+# initial set up
+wget https://github.com/intranda/goobi-vocabulary-server/raw/develop/install/default_setup.sh -O /tmp/default_setup.sh
+bash /tmp/default_setup.sh
+
+## test
+curl -s http://localhost:${VOC_PORT}/api/v1/types --header "Authorization: Bearer $VOC_TOKEN" | jq -r '._embedded.fieldTypeList[] .name'
+```
+
+## Accessibility
+- You can make the vocabulary server accessible from outside by placing a reverse proxy with access control in front of it.
+
+## Installation test
+- Change the host and port accordingly for all commands.
+- After the initial setup, check whether the field types have been created successfully:
 ```bash
-curl http://localhost:8081/api/v1/types | jq -r '._embedded.fieldTypeList[] .name'
+curl "http://localhost:${VOC_PORT:-8081}/api/v1/types" --header "Authorization: Bearer $VOC_TOKEN" | jq -r '._embedded.fieldTypeList[] .name'
 ```
-- The result should look like:
+
+- The result should look like this:
 ```bash
 Anything
 Boolean
@@ -56,14 +134,3 @@ skos:related
 skos:closeMatch
 skos:exactMatch
 ```
-- If a data migration has been done, check that all vocabularies have been migrated:
-```bash
-curl http://localhost:8081/api/v1/vocabularies/all | jq -r '._embedded.vocabularyList[] .name'                
-```
-- Check that links resolve correctly (see configuration):
-```bash
-curl http://localhost:8081/api/v1/records/1 | jq
-```
-The `_links` JSON element should contain references to other resources. 
-These URLs should be valid and resolvable. 
-The host part of these URLs is generated from the request.

From fccadc1b8af42054c2ee56f9d000eb28a8c2d307 Mon Sep 17 00:00:00 2001
From: Marcel Neumann <marcel.neuman@intranda.com>
Date: Fri, 22 Nov 2024 14:00:53 +0100
Subject: [PATCH 05/19] fix syntax

---
 docs/en/setup.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/setup.md b/docs/en/setup.md
index b7639c5..a1cc2f9 100644
--- a/docs/en/setup.md
+++ b/docs/en/setup.md
@@ -31,7 +31,7 @@ This documentation describes the process of bootstrapping the vocabulary server.
 The vocabulary server requires Java 17, the systemd service assumes that Java 17 is the system default.
 
 For the above three points, under Ubuntu:
-``bash
+```bash
 export VOC_PORT=8081
 export VOC_TOKEN=$(</dev/urandom tr -dc '[:alnum:]' | head -c17)
 export VOC_PATH=/opt/digiverso/vocabulary

From f9eebe955ceac8a1d07623cb155939da8f5347bc Mon Sep 17 00:00:00 2001
From: Marcel Neumann <marcel.neuman@intranda.com>
Date: Fri, 22 Nov 2024 14:08:25 +0100
Subject: [PATCH 06/19] synchronise english and german version

---
 docs/en/migration.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/docs/en/migration.md b/docs/en/migration.md
index 49a2d97..7671cf4 100644
--- a/docs/en/migration.md
+++ b/docs/en/migration.md
@@ -30,6 +30,23 @@ If you don't want to create any field types, you can start the data migration wi
 ```bash
 python vocabulary-migrator.py --vocabulary-server-host localhost --vocabulary-server-port 8081 --goobi-database-host localhost --goobi-database-port 3306 --goobi-database-name goobi --goobi-database-user goobi --goobi-database-password goobi --continue-on-error --fallback-language eng
 ```
+
+### Script 
+The following script performs both steps described above (setting up the virtual Python environment and migrating the vocabulary data) for a typical installation. Run it as root:
+```bash
+cd /opt/digiverso/vocabulary/migration
+python3 -m venv vmenv
+. vmenv/bin/activate
+pip install requests mysql-connector-python==8.4.0 alive_progress lxml
+VOC_PORT=$(sudo grep -oP '^server.port=\K.*' /opt/digiverso/vocabulary/application.properties)
+VOC_TOKEN=$(sudo grep -oP '^security.token=\K.*' /opt/digiverso/vocabulary/application.properties)
+DB_GOOBI_PW=$(sudo xmlstarlet sel -t -v '//Resource/@password' -n /etc/tomcat9/Catalina/localhost/goobi.xml)
+python vocabulary-migrator.py --vocabulary-server-host localhost --vocabulary-server-port "${VOC_PORT}" --vocabulary-server-token "${VOC_TOKEN}" --goobi-database-host localhost --goobi-database-port 3306 --goobi-database-name goobi --goobi-database-user goobi --goobi-database-password "${DB_GOOBI_PW}" --continue-on-error --fallback-language ger
+
+# Test
+curl -s http://localhost:8081/api/v1/vocabularies --header "Authorization: Bearer $VOC_TOKEN" | jq -r '._embedded.vocabularyList[] .name'
+```
+
 **Hint** Change the parameters according to your configuration. The `fallback-language` parameter defines the default language to be used for a multi-lingual vocabulary field for which no default language could be derived. The `continue-on-error` option prevents the migration tool to stop on data migration errors. These errors can occur if the data could not be inserted into the new vocabulary server. Possible reasons might be:
 - The vocabulary record is empty.
 - The vocabulary record contains data that is incompatible with some type restrictions.

From db92373e58e1ec4e542524aa33f970ece822b0ba Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 25 Nov 2024 08:41:30 +0100
Subject: [PATCH 07/19] docs: python venv installation fix

---
 docs/de/migration.md | 1 +
 docs/en/migration.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/de/migration.md b/docs/de/migration.md
index 5f5469f..8e2593c 100644
--- a/docs/de/migration.md
+++ b/docs/de/migration.md
@@ -11,6 +11,7 @@ Erstellen Sie zunächst eine virtuelle Python-Umgebung, aktivieren Sie diese und
 ```bash
 python3 -m venv vmenv
 . vmenv/bin/activate
+pip install wheel # This needs to be done beforehand as a separate call
 pip install requests mysql-connector-python==8.4.0 alive_progress lxml
 ```
 
diff --git a/docs/en/migration.md b/docs/en/migration.md
index 7671cf4..f301dcb 100644
--- a/docs/en/migration.md
+++ b/docs/en/migration.md
@@ -11,6 +11,7 @@ First, create a virtual Python environment, activate it and install all required
 ```bash
 python -m venv vmenv
 . vmenv/bin/activate
+pip install wheel # This needs to be done beforehand as a separate call
 pip install requests mysql-connector-python==8.4.0 alive_progress lxml
 ```
 

From 52c96c1a7a8fda872ade7b4207c3b872c69679f0 Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 25 Nov 2024 09:26:25 +0100
Subject: [PATCH 08/19] fix: don't crash on malformed mets files

---
 migration/lib/mets_manipulator.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index cf5aeed..b19ac99 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -30,7 +30,13 @@ def create_backup(self):
         logging.debug(f'Backed up mets file: {backup_filename}')
 
     def process_mets_file(self):
-        tree = ET.parse(self.file_path)
+        try:
+            tree = ET.parse(self.file_path)
+        except Exception as e:
+            logging.error(f'Error parsing mets file {self.file_path}, skipping')
+            self.ctx.log_issue(self.file_path, error)
+            return
+
         root = tree.getroot()
         self.process_node(root)
         

From d489d4b8867683ed33d59653b005ef47fa67914a Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 25 Nov 2024 09:47:47 +0100
Subject: [PATCH 09/19] fix: missing error variable

---
 migration/lib/mets_manipulator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index b19ac99..92f42f3 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -33,7 +33,8 @@ def process_mets_file(self):
         try:
             tree = ET.parse(self.file_path)
         except Exception as e:
-            logging.error(f'Error parsing mets file {self.file_path}, skipping')
+            error = f'Error parsing mets file {self.file_path}, skipping'
+            logging.error(error)
             self.ctx.log_issue(self.file_path, error)
             return
 

From c91dc126cd5733fa2b4098e8431ef976b941ba7b Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 25 Nov 2024 09:58:15 +0100
Subject: [PATCH 10/19] fix: better logging for unsuccessful mets file
 processing

---
 migration/lib/mets_manipulator.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index 92f42f3..e05349e 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -37,14 +37,18 @@ def process_mets_file(self):
             logging.error(error)
             self.ctx.log_issue(self.file_path, error)
             return
-
-        root = tree.getroot()
-        self.process_node(root)
+        try:
+            root = tree.getroot()
+            self.process_node(root)
         
-        if self.changed and not self.ctx.dry:
-            self.create_backup()
-            tree.write(self.file_path, encoding='utf-8', xml_declaration=True)
-            self.ctx.log_processed(self.file_path)
+            if self.changed and not self.ctx.dry:
+                self.create_backup()
+                tree.write(self.file_path, encoding='utf-8', xml_declaration=True)
+                self.ctx.log_processed(self.file_path)
+        except Exception as e:
+            error = f'Something very unexpected happened during processing of mets file {self.file_path}: {e}'
+            logging.critical(error)
+            raise Exception(error)
 
     def process_node(self, node):
         if self.is_manual_id_reference(node):

From 8b56306f811dec245e91a20140a217f912be80dd Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 25 Nov 2024 10:27:58 +0100
Subject: [PATCH 11/19] fix: relation column logic for same type relations

---
 migration/lib/mets_manipulator.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index e05349e..1f660b6 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -197,6 +197,9 @@ def process_vocabulary_reference_by_value(self, node):
                     else:
                         search_field='Relationship type'
                         inverse_search_field='Reverse relationship'
+                else:
+                    search_field='Relationship type'
+                    inverse_search_field='Reverse relationship'
 
                 try:
                     # First, try to find the value in the correct column

From 89d2abfc5f0754442ddce7843c47a7b7086ad708 Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 25 Nov 2024 10:59:58 +0100
Subject: [PATCH 12/19] fix: relation column logic bug

---
 migration/lib/mets_manipulator.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index 1f660b6..422e4d5 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -181,14 +181,14 @@ def process_vocabulary_reference_by_value(self, node):
                 entity_type = None
                 for sibling in parent:
                     if sibling.attrib['name'] == 'RelationEntityType':
-                        entity_type = sibling.text
+                        entity_type = sibling.text.lower()
                         break
                 
-                entity_type_in_relation_count = vocabulary_name.count(entity_type)
+                entity_type_in_relation_count = vocabulary_name.lower().count(entity_type)
                 if entity_type_in_relation_count == 1:
                     # Find out relation direction
                     separator_position = vocabulary_name.index('-')
-                    entity_type_position = vocabulary_name.index(entity_type)
+                    entity_type_position = vocabulary_name.lower().index(entity_type)
 
                     # use second column of vocabulary: `Reverse relationship` (The relation vocabulary is specified from `A->B`, the relation references an entity of type `A` and is therefore of type `B`)
                     if entity_type_position < separator_position:
@@ -197,9 +197,11 @@ def process_vocabulary_reference_by_value(self, node):
                     else:
                         search_field='Relationship type'
                         inverse_search_field='Reverse relationship'
-                else:
+                elif entity_type_in_relation_count == 2:
                     search_field='Relationship type'
                     inverse_search_field='Reverse relationship'
+                else:
+                    raise Exception(f'Unable to perform relation column logic on relation [{vocabulary_name}] with search entity: {entity_type}')
 
                 try:
                     # First, try to find the value in the correct column
@@ -227,7 +229,7 @@ def process_vocabulary_reference_by_value(self, node):
                     new_value =  self.ctx.extract_preferred_language(translated_main_values)
 
                     #dump_node(node)
-                    logging.warn(f'Relation is saved in the wrong direction, correct direction found and corrected: "{old_value}" -> "{new_value}"')
+                    logging.warn(f'Relation [{vocabulary_name}] is saved in the wrong direction, correct direction found and corrected: "{old_value}" -> "{new_value}"')
                     node.text = new_value
 
             else:

From b30031cf45bdfb3dd21e2e01f1af11a4ad9af94a Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 25 Nov 2024 11:18:03 +0100
Subject: [PATCH 13/19] fix: don't invert value for A -> A relationships

---
 migration/lib/mets_manipulator.py | 49 ++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index 422e4d5..0fc49c8 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -171,6 +171,7 @@ def process_vocabulary_reference_by_value(self, node):
 
             search_field=None
             inverse_search_field=None
+            perform_inversion_fix=False
             if self.ctx.enable_relation_vocabulary_column_logic and 'Relationship' in vocabulary_name:
                 parent = node.getparent()
                 if parent == None:
@@ -197,6 +198,8 @@ def process_vocabulary_reference_by_value(self, node):
                     else:
                         search_field='Relationship type'
                         inverse_search_field='Reverse relationship'
+                    perform_inversion_fix=True
+
                 elif entity_type_in_relation_count == 2:
                     search_field='Relationship type'
                     inverse_search_field='Reverse relationship'
@@ -210,27 +213,31 @@ def process_vocabulary_reference_by_value(self, node):
                     # If failed, try to find the value in the other column (assuming the value was stored incorrectly)
                     new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=inverse_search_field)
                     old_value = node.text
-                    record_data = self.ctx.api.lookup_record(new_record_id)
-
-                    v = self.ctx.api.lookup_vocabulary(record_data['vocabularyId'])
-                    s = self.ctx.api.lookup_schema(v['schemaId'])
-                    ids = [d['id'] for d in s['definitions'] if d['name'] == search_field] # We need the value, that we actually originally searched for
-                    if len(ids) != 1:
-                        logging.critical(f'Non unique "{search_field}" fields found: {ids}!')
-                        sys.exit(1)
-
-                    field_data = [f for f in record_data['fields'] if f['definitionId'] == ids[0]]
-                    if len(field_data) != 1:
-                        logging.critical(f'Record [{new_record_id}] has no unique search column entry field')
-                        sys.exit(1)
-
-                    # Replace node text if not matching any translation of main value
-                    translated_main_values = self.ctx.extract_language_values(field_data[0])
-                    new_value =  self.ctx.extract_preferred_language(translated_main_values)
-
-                    #dump_node(node)
-                    logging.warn(f'Relation [{vocabulary_name}] is saved in the wrong direction, correct direction found and corrected: "{old_value}" -> "{new_value}"')
-                    node.text = new_value
+
+                    if perform_inversion_fix:
+                        record_data = self.ctx.api.lookup_record(new_record_id)
+
+                        v = self.ctx.api.lookup_vocabulary(record_data['vocabularyId'])
+                        s = self.ctx.api.lookup_schema(v['schemaId'])
+                        ids = [d['id'] for d in s['definitions'] if d['name'] == search_field] # We need the value, that we actually originally searched for
+                        if len(ids) != 1:
+                            logging.critical(f'Non unique "{search_field}" fields found: {ids}!')
+                            sys.exit(1)
+
+                        field_data = [f for f in record_data['fields'] if f['definitionId'] == ids[0]]
+                        if len(field_data) != 1:
+                            logging.critical(f'Record [{new_record_id}] has no unique search column entry field')
+                            sys.exit(1)
+
+                        # Replace node text if not matching any translation of main value
+                        translated_main_values = self.ctx.extract_language_values(field_data[0])
+                        new_value =  self.ctx.extract_preferred_language(translated_main_values)
+
+                        #dump_node(node)
+                        logging.warn(f'Relation [{vocabulary_name}] is saved in the wrong direction, correct direction found and corrected: "{old_value}" -> "{new_value}"')
+                        node.text = new_value
+                    else:
+                        logging.debug(f'Relation [{vocabulary_name}] value "{value}" found in column "{inverse_search_field}", keeping as is')
 
             else:
                 new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=None)

From 1bdf2eb37aaa70d393646c2ee396e8ca652617da Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 25 Nov 2024 11:25:31 +0100
Subject: [PATCH 14/19] fix: log direction changes

---
 migration/lib/mets_manipulator.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index 0fc49c8..7195535 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -234,7 +234,9 @@ def process_vocabulary_reference_by_value(self, node):
                         new_value =  self.ctx.extract_preferred_language(translated_main_values)
 
                         #dump_node(node)
-                        logging.warn(f'Relation [{vocabulary_name}] is saved in the wrong direction, correct direction found and corrected: "{old_value}" -> "{new_value}"')
+                        warn_message = f'Relation [{vocabulary_name}] is saved in the wrong direction, correct direction found and corrected: "{old_value}" -> "{new_value}"'
+                        logging.warn(warn_message)
+                        self.ctx.log_issue(self.file_path, warn_message)
                         node.text = new_value
                     else:
                         logging.debug(f'Relation [{vocabulary_name}] value "{value}" found in column "{inverse_search_field}", keeping as is')

From 8f4553d448bb05c4560048af9b5a916ca2b62c4e Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 25 Nov 2024 11:26:37 +0100
Subject: [PATCH 15/19] fix: log intentional deletion

---
 migration/lib/mets_manipulator.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py
index 7195535..f06e264 100644
--- a/migration/lib/mets_manipulator.py
+++ b/migration/lib/mets_manipulator.py
@@ -253,7 +253,9 @@ def process_vocabulary_reference_by_value(self, node):
         except Exception as e:
             # If this fails as well and the value is not found, remove the metadata if configured
             if 'has no results' in e.__str__() and self.ctx.is_removable_metadata(vocabulary_id, node.text):
-                logging.warn(f'Removing node due to intentionally missing vocabulary value: "{node.text}"')
+                warn_message = f'Removing node due to intentionally missing vocabulary value: "{node.text}"'
+                logging.warn(warn_message)
+                self.ctx.log_issue(self.file_path, warn_message)
                 self.remove_metadata_node(node)
             else:
                 error = f'Unable to find record by value: {value}\n\t\t{e}'

From 8313ead6aa4de06fde99c9815f9b3ea1e08aaf21 Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Fri, 29 Nov 2024 08:51:08 +0100
Subject: [PATCH 16/19] task: make install script more generic and applicable
 to external init folders

---
 install/{install_samples.sh => install.sh} | 41 ++++++++++++++++------
 1 file changed, 30 insertions(+), 11 deletions(-)
 rename install/{install_samples.sh => install.sh} (76%)

diff --git a/install/install_samples.sh b/install/install.sh
similarity index 76%
rename from install/install_samples.sh
rename to install/install.sh
index 09a37fe..166a687 100755
--- a/install/install_samples.sh
+++ b/install/install.sh
@@ -1,7 +1,6 @@
 #!/usr/bin/env bash
 MY_PATH="$(dirname -- "${BASH_SOURCE[0]}")"
 SAMPLES_DIR=$MY_PATH/samples
-CACHE_FILE=$MY_PATH/cache.txt
 
 # Check for parameters
 FAIL=0
@@ -29,11 +28,17 @@ if [ -z $SAMPLE ]; then
     ls $SAMPLES_DIR
     read SAMPLE
 fi
-SAMPLE_PATH=$SAMPLES_DIR/$SAMPLE
+
+if [ -d "$SAMPLE" ]; then
+    SAMPLE_PATH="$SAMPLE"
+else
+    SAMPLE_PATH="$SAMPLES_DIR/$SAMPLE"
+fi
 if [ ! -e $SAMPLE_PATH ]; then
     echo "Sample \"$SAMPLE\" does not exist!"
     exit 1
 fi
+CACHE_FILE="$SAMPLE_PATH/cache.txt"
 
 curl_call() {
     curl --location "$HOST:$PORT/api/v1/$1" \
@@ -59,16 +64,22 @@ for INSTALL_DIR in $(ls $SAMPLE_PATH); do
         ITEM_NAME=$(echo "$ITEM" | cut -d'.' -f1)
         ITEM_IDENTIFIER=${ENDPOINT}_"${ITEM_NAME}"
 
-        ID=$(cat $CACHE_FILE | grep "$ITEM_IDENTIFIER;" | cut -d';' -f2)
+        if [ -f "$CACHE_FILE" ]; then
+            ID=$(cat $CACHE_FILE | grep "$ITEM_IDENTIFIER;" | cut -d';' -f2)
+        else
+            ID=""
+        fi
         if [ -z "$ID" ]; then
             JSON=$(cat $SAMPLE_PATH/$INSTALL_DIR/$ITEM)
 
             # Replace ID placeholders
-            for CACHE_LINE in $(cat $CACHE_FILE); do
-                TEMPLATE_PLACEHOLDER="{{$(echo $CACHE_LINE | cut -d';' -f1)}}"
-                CACHE_ID=$(echo $CACHE_LINE | cut -d';' -f2)
-                JSON=$(echo $JSON | sed "s/$TEMPLATE_PLACEHOLDER/$CACHE_ID/g")
-            done
+            if [ -f "$CACHE_FILE" ]; then
+                for CACHE_LINE in $(cat $CACHE_FILE); do
+                    TEMPLATE_PLACEHOLDER="{{$(echo $CACHE_LINE | cut -d';' -f1)}}"
+                    CACHE_ID=$(echo $CACHE_LINE | cut -d';' -f2)
+                    JSON=$(echo $JSON | sed "s/$TEMPLATE_PLACEHOLDER/$CACHE_ID/g")
+                done
+            fi
 
             RESULT=$(curl_call $ENDPOINT "$JSON")
         
@@ -91,7 +102,11 @@ for INSTALL_DIR in $(ls $SAMPLE_PATH); do
     for ITEM in $(ls $SAMPLE_PATH/$INSTALL_DIR | grep ".csv"); do
         VOCABULARY_NAME=$(echo "$ITEM" | cut -d'.' -f1)
         VOCABULARY_IDENTIFIER=vocabularies_${VOCABULARY_NAME}
-        VOCABULARY_ID=$(cat $CACHE_FILE | grep $VOCABULARY_IDENTIFIER | cut -d';' -f2)
+        if [ -f "$CACHE_FILE" ]; then
+            VOCABULARY_ID=$(cat $CACHE_FILE | grep $VOCABULARY_IDENTIFIER | cut -d';' -f2)
+        else
+            VOCABULARY_ID=""
+        fi
 
         if [ ! -z "$VOCABULARY_ID" ]; then
             curl_file_upload_call "vocabularies/$VOCABULARY_ID/import/csv" "$SAMPLE_PATH/$INSTALL_DIR/$ITEM"
@@ -103,8 +118,12 @@ for INSTALL_DIR in $(ls $SAMPLE_PATH); do
     for ITEM in $(ls $SAMPLE_PATH/$INSTALL_DIR | grep ".xlsx"); do
         VOCABULARY_NAME=$(echo "$ITEM" | cut -d'.' -f1)
         VOCABULARY_IDENTIFIER=vocabularies_${VOCABULARY_NAME}
-        VOCABULARY_ID=$(cat $CACHE_FILE | grep $VOCABULARY_IDENTIFIER | cut -d';' -f2)
-
+        if [ -f "$CACHE_FILE" ]; then
+            VOCABULARY_ID=$(cat $CACHE_FILE | grep $VOCABULARY_IDENTIFIER | cut -d';' -f2)
+        else
+            VOCABULARY_ID=""
+        fi
+        
         if [ ! -z "$VOCABULARY_ID" ]; then
             curl_file_upload_call "vocabularies/$VOCABULARY_ID/import/excel" "$SAMPLE_PATH/$INSTALL_DIR/$ITEM"
             echo -e "\tImported \"$VOCABULARY_NAME\" vocabulary records"

From 41d56e302a4c201bedb02147c6b10b4ec9614b37 Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Fri, 29 Nov 2024 09:17:35 +0100
Subject: [PATCH 17/19] task: rework and cleanup vocabulary initialization
 logic

---
 .github/workflows/develop-build.yml           |  4 +-
 .github/workflows/release-build.yml           |  4 +-
 .gitignore                                    |  1 +
 docs/de/setup.md                              | 29 +++++-----
 docs/en/setup.md                              | 30 +++++------
 install/.gitignore                            |  2 +-
 install/default_setup.sh                      | 54 -------------------
 .../samples/minimal/1_languages/Deutsch.json  |  4 ++
 .../samples/minimal/1_languages/English.json  |  4 ++
 .../minimal/1_languages/Fran\303\247ais.json" |  4 ++
 install/samples/minimal/2_types/Any_Text.json |  4 ++
 install/samples/minimal/2_types/Anything.json |  3 ++
 install/samples/minimal/2_types/Boolean.json  |  6 +++
 install/samples/minimal/2_types/Number.json   |  4 ++
 14 files changed, 63 insertions(+), 90 deletions(-)
 delete mode 100755 install/default_setup.sh
 create mode 100644 install/samples/minimal/1_languages/Deutsch.json
 create mode 100644 install/samples/minimal/1_languages/English.json
 create mode 100644 "install/samples/minimal/1_languages/Fran\303\247ais.json"
 create mode 100644 install/samples/minimal/2_types/Any_Text.json
 create mode 100644 install/samples/minimal/2_types/Anything.json
 create mode 100644 install/samples/minimal/2_types/Boolean.json
 create mode 100644 install/samples/minimal/2_types/Number.json

diff --git a/.github/workflows/develop-build.yml b/.github/workflows/develop-build.yml
index 2005d1f..aaa92f2 100644
--- a/.github/workflows/develop-build.yml
+++ b/.github/workflows/develop-build.yml
@@ -34,7 +34,7 @@ jobs:
       - name: Create ZIP archive for setup scripts
         uses: montudor/action-zip@v1
         with:
-          args: zip -qq -r setup-scripts.zip install
+          args: zip -qq -r vocabulary-init-script.zip install
       - name: Get current date
         id: date
         run: echo "::set-output name=date::$(date +'%Y-%m-%d %H:%M:%S %Z')"
@@ -53,5 +53,5 @@ jobs:
           files: |
             module-*/target/*.jar
             module-core/src/main/resources/application.properties
-            setup-scripts.zip
+            vocabulary-init-script.zip
             migration-tool.zip
diff --git a/.github/workflows/release-build.yml b/.github/workflows/release-build.yml
index 1611dae..6a309d0 100644
--- a/.github/workflows/release-build.yml
+++ b/.github/workflows/release-build.yml
@@ -34,7 +34,7 @@ jobs:
       - name: Create ZIP archive for setup scripts
         uses: montudor/action-zip@v1
         with:
-          args: zip -qq -r setup-scripts.zip install
+          args: zip -qq -r vocabulary-init-script.zip install
       - name: Release
         id: create_release
         uses: softprops/action-gh-release@v2
@@ -46,5 +46,5 @@ jobs:
           files: |
             module-*/target/*.jar
             module-core/src/main/resources/application.properties
-            setup-scripts.zip
+            vocabulary-init-script.zip
             migration-tool.zip
diff --git a/.gitignore b/.gitignore
index 157eb5a..819c586 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,4 +36,5 @@ build/
 
 ### Python ###
 /migration/vmenv
+/migration/migration.csv
 __pycache__/
diff --git a/docs/de/setup.md b/docs/de/setup.md
index 7b6dacc..92f48a3 100644
--- a/docs/de/setup.md
+++ b/docs/de/setup.md
@@ -13,16 +13,21 @@ Diese Dokumentation beschreibt den Prozess der Installation und Ersteinrichtung
 ## Einrichtung von Goobi Workflow zur Kommunikation mit dem Vokabularserver
 - Goobi Workflow verwendet seit Version `24.07` den neuen Vokabularserver.
 - Konfigurieren Sie die Variablen `vocabularyServerHost`, `vocabularyServerPort` und `vocabularyServerToken` in der Datei `goobi_config.properties` entsprechend der Konfiguration Ihres Vokabularservers.
+- Alternativ zu `vocabularyServerHost` und `vocabularyServerPort` kann auch die Variable `vocabularyServerAddress` gesetzt werden (beispielsweise `vocabularyServerAddress=https://external.address.com/vocabulary`). Diese Variable erlaubt auch eine SSL-Verbindung.
 - Starten Sie Goobi Workflow neu, damit die Änderungen wirksam werden.
 - Navigieren Sie zu `Administration` > `Vocabulare`, um zu überprüfen, ob alles funktioniert. Sie sollten eine Liste von Vokabularen sehen, wenn alles in Ordnung ist (nicht jetzt, sondern nachdem Sie einige Vokabulare erstellt oder die bestehenden migriert haben). Wenn etwas nicht funktioniert, wird eine rote Fehlermeldung angezeigt.
 
 ## Ersteinrichtung
-- Für den ordnungsgemäßen Betrieb benötigt der Vokabularserver einige Ausgangsdaten.
+- Im Falle einer Datenmigration benötigt der Vokabularserver einige Ausgangsdaten.
 - Diese Daten enthalten Sprachangaben (wenn mehrsprachige Vokabulare verwendet werden) und Feldtypdefinitionen. 
-- Sie können das folgende Skript verwenden, welches einige Beispielsprachen und Feldtypen installiert.
-- Laden Sie das [Initial Data Script](https://github.com/intranda/goobi-vocabulary-server/raw/develop/install/default_setup.sh) herunter.
-- Ändern Sie die Variablen `HOST` und `TOKEN` am Anfang entsprechend der Konfiguration des Vokabularservers, lassen Sie das Suffix `/api/v1` unverändert.
-- Führen Sie das Skript aus.
+- Diese Daten sind in einem Minimaldatensatz hinterlegt und können einfach mithilfe des dafür vorgesehenen Installationsskripts installiert werden.
+- Laden Sie das [Vokabular-Initialisierungs-Tool](https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-init-script.zip) herunter.
+- Falls nicht bereits geschehen, starten Sie den Vokabularserver.
+- Entpacken Sie das Archiv.
+- Passen Sie die Variablen `HOST`, `PORT` und `TOKEN` an Ihre Konfiguration an und starten Sie das Skript wie folgt:
+```bash
+HOST=localhost PORT=8081 TOKEN=secret /path/to/the/script/install.sh minimal
+```
 
 ## Installationsskript
 Der Vokabularserver benötigt Java 17, der Systemd-Service geht davon aus, dass Java 17 der System-Default ist.
@@ -100,8 +105,9 @@ grep ^vocabularyServerToken= /opt/digiverso/goobi/config/goobi_config.properties
 sudo systemctl restart vocabulary.service & sudo journalctl -u vocabulary.service  -f -n 0 | grep -q "Started VocabularyServerApplication in"
 
 # initial set up
-wget https://github.com/intranda/goobi-vocabulary-server/raw/develop/install/default_setup.sh -O /tmp/default_setup.sh
-bash /tmp/default_setup.sh
+wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-init-script.zip -O /tmp/vocabulary-init-script.zip
+sudo unzip /tmp/vocabulary-init-script.zip -d "${VOC_PATH}"
+HOST=localhost PORT=${VOC_PORT} TOKEN=${VOC_TOKEN} ${VOC_PATH}/install/install.sh minimal
 
 ## test
 curl -s http://localhost:${VOC_PORT}/api/v1/types --header "Authorization: Bearer $VOC_TOKEN" | jq -r '._embedded.fieldTypeList[] .name'
@@ -118,15 +124,8 @@ curl "http://localhost:${VOC_PORT:-8081}/api/v1/types" --header "Authorization:
 ```
 - Das Ergebnis sollte wie folgt aussehen:
 ```bash
+Any Text
 Anything
 Boolean
 Number
-Word
-skos:prefLabel
-skos:altLabel
-skos:definition
-skos:editorialNote
-skos:related
-skos:closeMatch
-skos:exactMatch
 ```
diff --git a/docs/en/setup.md b/docs/en/setup.md
index a1cc2f9..dab318f 100644
--- a/docs/en/setup.md
+++ b/docs/en/setup.md
@@ -14,18 +14,22 @@ This documentation describes the process of bootstrapping the vocabulary server.
 
 - Goobi Workflow has been using the new vocabulary server since version `24.07`.
 - Configure the variables `vocabularyServerHost`, `vocabularyServerPort` and `vocabularyServerToken` in the `goobi_config.properties` file according to the configuration of your vocabulary server.
+- As an alternative to `vocabularyServerHost` and `vocabularyServerPort`, the variable `vocabularyServerAddress` can also be set (e.g. `vocabularyServerAddress=https://external.address.com/vocabulary`). This variable also allows an SSL connection.
 - Restart Goobi Workflow for the changes to take effect.
 - Navigate to `Administration` > `Vocabularies` to check that everything is working. You should see a list of vocabularies if everything is OK (not now, but after you have created some vocabularies or migrated the existing ones). If something is not working, you will see a red error message.
 
 
 ## Initial setup
-
-- For proper operation, the vocabulary server requires some initial data.
+- In the case of data migration, the vocabulary server requires some initial data.
 - This data contains language information (if multilingual vocabularies are used) and field type definitions.
-- You can use the following script, which installs some sample languages and field types.
-- Download the [Initial Data Script](https://github.com/intranda/goobi-vocabulary-server/raw/develop/install/default_setup.sh).
-- Change the variables `HOST` and `TOKEN` at the beginning according to the configuration of the vocabulary server, leave the suffix `/api/v1` unchanged.
-- Execute the script.
+- This data is stored in a minimal data set and can be easily installed using the installation script provided.
+- Download the [Vocabulary-Initialization-Tool](https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-init-script.zip).
+- If you have not already done so, start the vocabulary server.
+- Unpack the archive.
+- Adapt the variables `HOST`, `PORT` and `TOKEN` to your configuration and start the script as follows:
+```bash
+HOST=localhost PORT=8081 TOKEN=secret /path/to/the/script/install.sh minimal
+```
 
 ## Installation script
 The vocabulary server requires Java 17, the systemd service assumes that Java 17 is the system default.
@@ -103,8 +107,9 @@ grep ^vocabularyServerToken= /opt/digiverso/goobi/config/goobi_config.properties
 sudo systemctl restart vocabulary.service & sudo journalctl -u vocabulary.service  -f -n 0 | grep -q "Started VocabularyServerApplication in"
 
 # initial set up
-wget https://github.com/intranda/goobi-vocabulary-server/raw/develop/install/default_setup.sh -O /tmp/default_setup.sh
-bash /tmp/default_setup.sh
+wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-init-script.zip -O /tmp/vocabulary-init-script.zip
+sudo unzip /tmp/vocabulary-init-script.zip -d "${VOC_PATH}"
+HOST=localhost PORT=${VOC_PORT} TOKEN=${VOC_TOKEN} ${VOC_PATH}/install/install.sh minimal
 
 ## test
 curl -s http://localhost:${VOC_PORT}/api/v1/types --header "Authorization: Bearer $VOC_TOKEN" | jq -r '._embedded.fieldTypeList[] .name'
@@ -122,15 +127,8 @@ curl ‘http://localhost:${VOC_PORT:-8081}/api/v1/types’ --header ‘Authorisa
 
 - The result should look like this:
 ```bash
+Any Text
 Anything
 Boolean
 Number
-Word
-skos:prefLabel
-skos:altLabel
-skos:definition
-skos:editorialNote
-skos:related
-skos:closeMatch
-skos:exactMatch
 ```
diff --git a/install/.gitignore b/install/.gitignore
index 3415572..ea122ca 100644
--- a/install/.gitignore
+++ b/install/.gitignore
@@ -1 +1 @@
-/cache.txt
\ No newline at end of file
+cache.txt
\ No newline at end of file
diff --git a/install/default_setup.sh b/install/default_setup.sh
deleted file mode 100755
index 2b2b056..0000000
--- a/install/default_setup.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env bash
-HOST="localhost:${VOC_PORT:-8081}/api/v1"
-TOKEN="${VOC_TOKEN:-CHANGEME}"
-
-curl_call() {
-  curl --location "$HOST/$1" --header 'Content-Type: application/json' --header "Authorization: Bearer $TOKEN" --data "$2"
-}
-
-create_language() {
-  curl_call "languages" "{\"abbreviation\":\"$1\", \"name\":\"$2\"}"
-}
-
-create_type() {
-  if [ -n "$3" ]
-  then
-    if [ -n "$2" ]
-    then
-      curl_call "types" "{\"name\":\"$1\",\"validation\":\"$2\",\"selectableValues\":[$3]}"
-    else
-      curl_call "types" "{\"name\":\"$1\",\"selectableValues\":[$3]}"
-    fi
-  else
-    if [ -n "$2" ]
-    then
-      echo "{\"name\":\"$1\",\"validation\":\"$2\"}"
-      curl_call "types" "{\"name\":\"$1\",\"validation\":\"$2\"}"
-    else
-      curl_call "types" "{\"name\":\"$1\"}"
-    fi
-  fi
-}
-
-create_language "eng" "English"
-create_language "ger" "Deutsch"
-create_language "fre" "Français"
-create_type "Anything"
-curl_call "types" "{\"name\":\"Any Text\",\"large\":true}"
-create_type "Boolean" "" "\"true\",\"false\""
-create_type "Number" "\\\\d+"
-create_type "Word" "\\\\w+"
-create_type "skos:prefLabel"
-create_type "skos:altLabel"
-create_type "skos:definition"
-create_type "skos:editorialNote"
-create_type "skos:related"
-create_type "skos:closeMatch" "https?.*"
-create_type "skos:exactMatch" "https?.*"
-create_type "dct:title"
-create_type "dct:creator"
-create_type "dct:created" "\\\\d{4}\\\\-\\\\d{2}\\\\-\\\\d{2}"
-create_type "dct:license" "https?.*"
-echo
-
-curl -s "${HOST}"/types --header "Authorization: Bearer $TOKEN" | jq -r '._embedded.fieldTypeList[] .name' | grep Anything -q || { echo "ERROR while executing $0"; exit 1; }
diff --git a/install/samples/minimal/1_languages/Deutsch.json b/install/samples/minimal/1_languages/Deutsch.json
new file mode 100644
index 0000000..5a2cdbb
--- /dev/null
+++ b/install/samples/minimal/1_languages/Deutsch.json
@@ -0,0 +1,4 @@
+{
+    "abbreviation": "ger",
+    "name": "Deutsch"
+}
\ No newline at end of file
diff --git a/install/samples/minimal/1_languages/English.json b/install/samples/minimal/1_languages/English.json
new file mode 100644
index 0000000..34fc8a6
--- /dev/null
+++ b/install/samples/minimal/1_languages/English.json
@@ -0,0 +1,4 @@
+{
+    "abbreviation": "eng",
+    "name": "English"
+}
\ No newline at end of file
diff --git "a/install/samples/minimal/1_languages/Fran\303\247ais.json" "b/install/samples/minimal/1_languages/Fran\303\247ais.json"
new file mode 100644
index 0000000..e796791
--- /dev/null
+++ "b/install/samples/minimal/1_languages/Fran\303\247ais.json"
@@ -0,0 +1,4 @@
+{
+    "abbreviation": "fre",
+    "name": "Français"
+}
\ No newline at end of file
diff --git a/install/samples/minimal/2_types/Any_Text.json b/install/samples/minimal/2_types/Any_Text.json
new file mode 100644
index 0000000..1dd14d8
--- /dev/null
+++ b/install/samples/minimal/2_types/Any_Text.json
@@ -0,0 +1,4 @@
+{
+    "name": "Any Text",
+    "large": true
+}
\ No newline at end of file
diff --git a/install/samples/minimal/2_types/Anything.json b/install/samples/minimal/2_types/Anything.json
new file mode 100644
index 0000000..ea7aa0f
--- /dev/null
+++ b/install/samples/minimal/2_types/Anything.json
@@ -0,0 +1,3 @@
+{
+    "name": "Anything"
+}
\ No newline at end of file
diff --git a/install/samples/minimal/2_types/Boolean.json b/install/samples/minimal/2_types/Boolean.json
new file mode 100644
index 0000000..6c9b600
--- /dev/null
+++ b/install/samples/minimal/2_types/Boolean.json
@@ -0,0 +1,6 @@
+{
+    "name": "Boolean",
+    "selectableValues": [
+        "true", "false"
+    ]
+}
\ No newline at end of file
diff --git a/install/samples/minimal/2_types/Number.json b/install/samples/minimal/2_types/Number.json
new file mode 100644
index 0000000..9390686
--- /dev/null
+++ b/install/samples/minimal/2_types/Number.json
@@ -0,0 +1,4 @@
+{
+    "name": "Number",
+    "validation": "\\d+"
+}
\ No newline at end of file

From 7da236ffc9614d2cf1d37c778fbcb8a755de6455 Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Fri, 29 Nov 2024 09:33:32 +0100
Subject: [PATCH 18/19] task: make install script cache aware

---
 install/install.sh | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/install/install.sh b/install/install.sh
index 166a687..43db0e1 100755
--- a/install/install.sh
+++ b/install/install.sh
@@ -40,6 +40,25 @@ if [ ! -e $SAMPLE_PATH ]; then
 fi
 CACHE_FILE="$SAMPLE_PATH/cache.txt"
 
+if [ -f "$CACHE_FILE" ]; then
+    while true; do
+        read -n 1 -s -p "An existing import cache was found, do you want to install from scratch? [Y]es|[N]o " answer
+        case $answer in
+            y|Y)
+                rm "$CACHE_FILE"
+                break
+                ;;
+            n|N)
+                break
+                ;;
+            *)
+                echo -e "\nInvalid choice!"
+                ;;
+        esac
+    done
+    echo ""
+fi
+
 curl_call() {
     curl --location "$HOST:$PORT/api/v1/$1" \
         --header 'Content-Type: application/json' \
@@ -58,6 +77,10 @@ curl_file_upload_call() {
 }
 
 for INSTALL_DIR in $(ls $SAMPLE_PATH); do
+    if [ "$INSTALL_DIR" == "cache.txt" ]; then
+        continue
+    fi
+
     ENDPOINT=$(echo $INSTALL_DIR | cut -d'_' -f2)
     echo "Installing $ENDPOINT"
     for ITEM in $(ls $SAMPLE_PATH/$INSTALL_DIR | grep ".json"); do

From c7e254ee2c151b88326d9f43b65e2ddd61903154 Mon Sep 17 00:00:00 2001
From: Dominick Leppich <dominick.leppich@intranda.com>
Date: Mon, 2 Dec 2024 09:27:02 +0100
Subject: [PATCH 19/19] version: set to version 1.1.11

---
 module-core/pom.xml     | 4 ++--
 module-exchange/pom.xml | 2 +-
 pom.xml                 | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/module-core/pom.xml b/module-core/pom.xml
index ef1cbae..2c3bd3f 100644
--- a/module-core/pom.xml
+++ b/module-core/pom.xml
@@ -10,7 +10,7 @@
     </parent>
     <groupId>io.goobi.vocabulary</groupId>
     <artifactId>vocabulary-server-core</artifactId>
-    <version>1.1.11-SNAPSHOT</version>
+    <version>1.1.11</version>
     <name>Vocabulary-Server-Core</name>
     <description>Spring Boot based RESTful web service for vocabulary management</description>
     <packaging>jar</packaging>
@@ -35,7 +35,7 @@
         <dependency>
             <groupId>io.goobi.vocabulary</groupId>
             <artifactId>vocabulary-server-exchange</artifactId>
-            <version>1.1.11-SNAPSHOT</version>
+            <version>1.1.11</version>
             <scope>compile</scope>
         </dependency>
 
diff --git a/module-exchange/pom.xml b/module-exchange/pom.xml
index c7e2b70..d5a56c2 100644
--- a/module-exchange/pom.xml
+++ b/module-exchange/pom.xml
@@ -4,7 +4,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>io.goobi.vocabulary</groupId>
     <artifactId>vocabulary-server-exchange</artifactId>
-    <version>1.1.11-SNAPSHOT</version>
+    <version>1.1.11</version>
     <name>Vocabulary Exchange</name>
     <description>Vocabulary data exchange classes</description>
     <packaging>jar</packaging>
diff --git a/pom.xml b/pom.xml
index 226d729..60673f4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>io.goobi.vocabulary</groupId>
     <artifactId>vocabulary-server</artifactId>
-    <version>1.1.11-SNAPSHOT</version>
+    <version>1.1.11</version>
     <name>Vocabulary-Server</name>
     <packaging>pom</packaging>
     <description>RESTful webservice for vocabulary management</description>