diff --git a/.github/workflows/develop-build.yml b/.github/workflows/develop-build.yml index 2005d1f..aaa92f2 100644 --- a/.github/workflows/develop-build.yml +++ b/.github/workflows/develop-build.yml @@ -34,7 +34,7 @@ jobs: - name: Create ZIP archive for setup scripts uses: montudor/action-zip@v1 with: - args: zip -qq -r setup-scripts.zip install + args: zip -qq -r vocabulary-init-script.zip install - name: Get current date id: date run: echo "::set-output name=date::$(date +'%Y-%m-%d %H:%M:%S %Z')" @@ -53,5 +53,5 @@ jobs: files: | module-*/target/*.jar module-core/src/main/resources/application.properties - setup-scripts.zip + vocabulary-init-script.zip migration-tool.zip diff --git a/.github/workflows/release-build.yml b/.github/workflows/release-build.yml index 1611dae..6a309d0 100644 --- a/.github/workflows/release-build.yml +++ b/.github/workflows/release-build.yml @@ -34,7 +34,7 @@ jobs: - name: Create ZIP archive for setup scripts uses: montudor/action-zip@v1 with: - args: zip -qq -r setup-scripts.zip install + args: zip -qq -r vocabulary-init-script.zip install - name: Release id: create_release uses: softprops/action-gh-release@v2 @@ -46,5 +46,5 @@ jobs: files: | module-*/target/*.jar module-core/src/main/resources/application.properties - setup-scripts.zip + vocabulary-init-script.zip migration-tool.zip diff --git a/.gitignore b/.gitignore index 157eb5a..819c586 100644 --- a/.gitignore +++ b/.gitignore @@ -36,4 +36,5 @@ build/ ### Python ### /migration/vmenv +/migration/migration.csv __pycache__/ diff --git a/docs/de/migration.md b/docs/de/migration.md index 5f5469f..8e2593c 100644 --- a/docs/de/migration.md +++ b/docs/de/migration.md @@ -11,6 +11,7 @@ Erstellen Sie zunächst eine virtuelle Python-Umgebung, aktivieren Sie diese und ```bash python3 -m venv vmenv . 
vmenv/bin/activate +pip install wheel # This needs to be done beforehand as a separate call pip install requests mysql-connector-python==8.4.0 alive_progress lxml ``` diff --git a/docs/de/setup.md b/docs/de/setup.md index 7b6dacc..92f48a3 100644 --- a/docs/de/setup.md +++ b/docs/de/setup.md @@ -13,16 +13,21 @@ Diese Dokumentation beschreibt den Prozess der Installation und Ersteinrichtung ## Einrichtung von Goobi Workflow zur Kommunikation mit dem Vokabularserver - Goobi Workflow verwendet seit Version `24.07` den neuen Vokabularserver. - Konfigurieren Sie die Variablen `vocabularyServerHost`, `vocabularyServerPort` und `vocabularyServerToken` in der Datei `goobi_config.properties` entsprechend der Konfiguration Ihres Vokabularservers. +- Alternativ zu `vocabularyServerHost` und `vocabularyServerPort` kann auch die Variable `vocabularyServerAddress` gesetzt werden (beispielsweise `vocabularyServerAddress=https://external.address.com/vocabulary`). Diese Variable erlaubt auch eine SSL Verbindung. - Starten Sie Goobi Workflow neu, damit die Änderungen wirksam werden. - Navigieren Sie zu `Administration` > `Vocabulare`, um zu überprüfen, ob alles funktioniert. Sie sollten eine Liste von Vokabularen sehen, wenn alles in Ordnung ist (nicht jetzt, sondern nachdem Sie einige Vokabulare erstellt oder die bestehenden migriert haben). Wenn etwas nicht funktioniert, wird eine rote Fehlermeldung angezeigt. ## Ersteinrichtung -- Für den ordnungsgemäßen Betrieb benötigt der Vokabularserver einige Ausgangsdaten. +- Im Falle einer Datenmigration benötigt der Vokabularserver einige Ausgangsdaten. - Diese Daten enthalten Sprachangaben (wenn mehrsprachige Vokabulare verwendet werden) und Feldtypdefinitionen. -- Sie können das folgende Skript verwenden, welches einige Beispielsprachen und Feldtypen installiert. -- Laden Sie das [Initial Data Script](https://github.com/intranda/goobi-vocabulary-server/raw/develop/install/default_setup.sh) herunter. 
-- Ändern Sie die Variablen `HOST` und `TOKEN` am Anfang entsprechend der Konfiguration des Vokabularservers, lassen Sie das Suffix `/api/v1` unverändert. -- Führen Sie das Skript aus. +- Diese Daten sind in einem Minimaldatensatz hinterlegt und können einfach mithilfe des dafür vorgesehenen Installationsscriptes installiert werden. +- Laden Sie das [Vokabular-Initialisierungs-Tool](https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-init-script.zip) herunter. +- Falls nicht bereits geschehen, starten Sie den Vokabularserver. +- Entpacken Sie das Archiv. +- Passen Sie die Variablen `HOST`, `PORT` und `TOKEN` an Ihre Konfiguration an und starten Sie das Script wie folgt: +```bash +HOST=localhost PORT=8081 TOKEN=secret /path/to/the/script/install.sh minimal +``` ## Installationsskript Der Vokabularserver benötigt Java 17, der Systemd-Service geht davon aus, dass Java 17 der System-Default ist. @@ -100,8 +105,9 @@ grep ^vocabularyServerToken= /opt/digiverso/goobi/config/goobi_config.properties sudo systemctl restart vocabulary.service & sudo journalctl -u vocabulary.service -f -n 0 | grep -q "Started VocabularyServerApplication in" # initial set up -wget https://github.com/intranda/goobi-vocabulary-server/raw/develop/install/default_setup.sh -O /tmp/default_setup.sh -bash /tmp/default_setup.sh +wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-init-script.zip -O /tmp/vocabulary-init-script.zip +sudo unzip /tmp/vocabulary-init-script.zip -d "${VOC_PATH}" +HOST=localhost PORT=${VOC_PORT} TOKEN=${VOC_TOKEN} ${VOC_PATH}/vocabulary-init-script/install.sh minimal ## test curl -s http://localhost:${VOC_PORT}/api/v1/types --header "Authorization: Bearer $VOC_TOKEN" | jq -r '._embedded.fieldTypeList[] .name' @@ -118,15 +124,8 @@ curl "http://localhost:${VOC_PORT:-8081}/api/v1/types" --header "Authorization: ``` - Das Ergebnis sollte wie folgt aussehen: ```bash +Any Text Anything Boolean Number -Word 
-skos:prefLabel -skos:altLabel -skos:definition -skos:editorialNote -skos:related -skos:closeMatch -skos:exactMatch ``` diff --git a/docs/en/migration.md b/docs/en/migration.md index 49a2d97..f301dcb 100644 --- a/docs/en/migration.md +++ b/docs/en/migration.md @@ -11,6 +11,7 @@ First, create a virtual Python environment, activate it and install all required ```bash python -m venv vmenv . vmenv/bin/activate +pip install wheel # This needs to be done beforehand as a separate call pip install requests mysql-connector-python==8.4.0 alive_progress lxml ``` @@ -30,6 +31,23 @@ If you don't want to create any field types, you can start the data migration wi ```bash python vocabulary-migrator.py --vocabulary-server-host localhost --vocabulary-server-port 8081 --goobi-database-host localhost --goobi-database-port 3306 --goobi-database-name goobi --goobi-database-user goobi --goobi-database-password goobi --continue-on-error --fallback-language eng ``` + +### Script +The above two points, the virtual Python environment and the migration of the vocabulary data in a typical installation, as root: +```bash +cd /opt/digiverso/vocabulary/migration +python3 -m venv vmenv +. 
vmenv/bin/activate +pip install requests mysql-connector-python==8.4.0 alive_progress lxml +VOC_PORT=$(sudo grep -oP '^server.port=\K.*' /opt/digiverso/vocabulary/application.properties) +VOC_TOKEN=$(sudo grep -oP '^security.token=\K.*' /opt/digiverso/vocabulary/application.properties) +DB_GOOBI_PW=$(sudo xmlstarlet sel -t -v '//Resource/@password' -n /etc/tomcat9/Catalina/localhost/goobi.xml) +python vocabulary-migrator.py --vocabulary-server-host localhost --vocabulary-server-port "${VOC_PORT}" --vocabulary-server-token "${VOC_TOKEN}" --goobi-database-host localhost --goobi-database-port 3306 --goobi-database-name goobi --goobi-database-user goobi --goobi-database-password "${DB_GOOBI_PW}" --continue-on-error --fallback-language ger + +# Test +curl -s http://localhost:8081/api/v1/vocabularies --header "Authorization: Bearer $VOC_TOKEN" | jq -r '._embedded.vocabularyList[] .name' +``` + **Hint** Change the parameters according to your configuration. The `fallback-language` parameter defines the default language to be used for a multi-lingual vocabulary field for which no default language could be derived. The `continue-on-error` option prevents the migration tool to stop on data migration errors. These errors can occur if the data could not be inserted into the new vocabulary server. Possible reasons might be: - The vocabulary record is empty. - The vocabulary record contains data that is incompatible with some type restrictions. diff --git a/docs/en/setup.md b/docs/en/setup.md index a769a46..dab318f 100644 --- a/docs/en/setup.md +++ b/docs/en/setup.md @@ -7,63 +7,128 @@ This documentation describes the process of bootstrapping the vocabulary server. - Adapt configuration file properly and remove unmodified lines. - Database credentials and database name. - Base URL and port. -- **TODO** *Install the `vocabulary-server.jar` and the `application.properties` configuration file both directly into a new folder (e. g. 
`/opt/digiverso/vocabulary/`)* + - Security token (this must also be configured identically in Goobi). +- Create a Systemd service to start the service automatically. -## Start as systemd service -- **TODO** *Create a systemd service unit for the vocabulary server (The application should be able to correctly shutdown on SIGTERM)* -- **TODO** *Admin documentation here* -- Run `java -jar vocabulary-server-VERSION.jar`. -- If startup succeeds, you will see a line like this after a few seconds: -```bash -Started VocabularyServerApplication in 4.244 seconds (process running for 4.581) -``` +## Setting up Goobi Workflow to communicate with the vocabulary server -## Goobi Workflow Setup Communication -- Goobi Workflow uses the new vocabulary server since version `24.07`. -- Configure the `vocabularyServerHost` and `vocabularyServerPort` variables in `goobi_config.properties` according to your vocabulary server configuration. +- Goobi Workflow has been using the new vocabulary server since version `24.07`. +- Configure the variables `vocabularyServerHost`, `vocabularyServerPort` and `vocabularyServerToken` in the `goobi_config.properties` file according to the configuration of your vocabulary server. +- As an alternative to `vocabularyServerHost` and `vocabularyServerPort`, the variable `vocabularyServerAddress` can also be set (e. g. `vocabularyServerAddress=https://external.address.com/vocabulary`). This variable also allows an SSL connection. - Restart Goobi Workflow for the changes to take effect. -- Navigate to `Administration` > `Vocabularies` to check if everything works. You should see a list of vocabularies if everything is okay (not right now, but after you have created some vocabularies or migrated the existing ones). You will see a red error message if something doesn't work. - -## Initial Setup -- For proper operation, the vocabulary server needs some initial data. 
-- This data contains language specifications (if multi-language vocabularies are used) and field type definitions. -- You can use the following script that installs some sample languages and field types. -- Download the [Initial Data Script](https://jenkins.intranda.com/job/intranda/job/vocabulary-server/job/develop/lastSuccessfulBuild/artifact/install/default_setup.sh). -- Change the `HOST` variable at the top according to the vocabulary server configuration, leave the `/api/v1` suffix unchanged. -- Run the script. - -## Security -- You can setup Apache url restrictions in order to secure the vocabulary server from unauthorized access. -- **TODO** *Admins, please find out what and how to do it in detail.* - -## Installation Test -- For all commands, change host and port accordingly. -- After the initial setup, check that types were created successfully: +- Navigate to `Administration` > `Vocabularies` to check that everything is working. You should see a list of vocabularies if everything is OK (not now, but after you have created some vocabularies or migrated the existing ones). If something is not working, you will see a red error message. + + +## Initial setup +- In the case of data migration, the vocabulary server requires some initial data. +- This data contains language information (if multilingual vocabularies are used) and field type definitions. +- This data is stored in a minimal data set and can be easily installed using the installation script provided. +- Download the [Vocabulary-Initialization-Tool](https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-init-script.zip). +- If you have not already done so, start the vocabulary server. +- Unpack the archive. 
+- Adapt the variables `HOST`, `PORT` and `TOKEN` to your configuration and start the script as follows: ```bash -curl http://localhost:8081/api/v1/types | jq -r '._embedded.fieldTypeList[] .name' +HOST=localhost PORT=8081 TOKEN=secret /path/to/the/script/install.sh minimal ``` -- The result should look like: + +## Installation script +The vocabulary server requires Java 17, the systemd service assumes that Java 17 is the system default. + +For the above three points, under Ubuntu: ```bash -Anything -Boolean -Number -Word -skos:prefLabel -skos:altLabel -skos:definition -skos:editorialNote -skos:related -skos:closeMatch -skos:exactMatch +export VOC_PORT=8081 +export VOC_TOKEN=$(openssl rand -hex 32) +export VOC_USER=vocabulary +export VOC_PATH=/opt/digiverso/vocabulary +export VOC_SQL_USER=vocabulary +export VOC_SQL_DB=vocabulary +export PW_SQL_VOC=$(openssl rand -hex 24) + +# create the installation directory and download the vocabulary server application +sudo mkdir -p ${VOC_PATH} +wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-server-core.jar -O - | sudo tee ${VOC_PATH}/vocabulary-server-core.jar >/dev/null + +# create system user which will run the service +sudo adduser --system --home ${VOC_PATH}/home --shell /usr/sbin/nologin --no-create-home --disabled-login ${VOC_USER} + +# download the vocabulary migration tools +wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/migration-tool.zip -O /tmp/migration-tool.zip +sudo unzip /tmp/migration-tool.zip -d "${VOC_PATH}" + +# download and set up the config file +wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/application.properties -O - | sudo tee ${VOC_PATH}/application.properties >/dev/null +sudo sed -re "s|^(server.port=).*|\1${VOC_PORT}|" \ + -e "s|^#?(security.token=).*|\1${VOC_TOKEN}|" \ + -e "s|^(spring.datasource.username=).*|\1${VOC_SQL_USER}|" \ + -e "s|^(spring.datasource.password=).*|\1${PW_SQL_VOC}|" \ + -e "s|^(spring.datasource.url=).*|\1jdbc:mariadb://localhost:3306/${VOC_SQL_DB}|" \ + -i ${VOC_PATH}/application.properties +sudo chown ${VOC_USER}: ${VOC_PATH}/application.properties +sudo chmod 600 ${VOC_PATH}/application.properties + +# install a systemd service unit file +cat << EOF | sudo tee /etc/systemd/system/vocabulary.service +[Unit] +Description=Goobi Vocabulary Server +After=mysql.service remote-fs.target +Requires=mysql.service remote-fs.target + 
+[Service] +WorkingDirectory=${VOC_PATH} +Restart=always +RestartSec=20s +StartLimitInterval=100s +StartLimitBurst=4 +ExecStart=/usr/bin/java -jar vocabulary-server-core.jar +User=${VOC_USER} +NoNewPrivileges=true +ProtectSystem=true +PrivateTmp=yes + +[Install] +WantedBy=default.target tomcat9.service +EOF +sudo systemctl daemon-reload +sudo systemctl enable vocabulary.service + +# create and configure the database +sudo mysql -e "CREATE DATABASE ${VOC_SQL_DB} CHARACTER SET = 'utf8mb4' COLLATE = 'utf8mb4_unicode_ci'; + CREATE USER '${VOC_SQL_USER}'@'localhost' IDENTIFIED BY '${PW_SQL_VOC}'; + GRANT ALL PRIVILEGES ON ${VOC_SQL_DB}.* TO '${VOC_SQL_USER}'@'localhost' WITH GRANT OPTION; + FLUSH PRIVILEGES;" + +# append vocabulary server address to the Goobi workflow config +grep ^vocabularyServerHost= /opt/digiverso/goobi/config/goobi_config.properties || echo "vocabularyServerHost=localhost" | sudo tee -a /opt/digiverso/goobi/config/goobi_config.properties +grep ^vocabularyServerPort= /opt/digiverso/goobi/config/goobi_config.properties || echo "vocabularyServerPort=${VOC_PORT}" | sudo tee -a /opt/digiverso/goobi/config/goobi_config.properties +grep ^vocabularyServerToken= /opt/digiverso/goobi/config/goobi_config.properties || echo "vocabularyServerToken=${VOC_TOKEN}" | sudo tee -a /opt/digiverso/goobi/config/goobi_config.properties + +# start the vocabulary server and wait for startup +sudo systemctl restart vocabulary.service & sudo journalctl -u vocabulary.service -f -n 0 | grep -q "Started VocabularyServerApplication in" + +# initial set up +wget https://github.com/intranda/goobi-vocabulary-server/releases/latest/download/vocabulary-init-script.zip -O /tmp/vocabulary-init-script.zip +sudo unzip /tmp/vocabulary-init-script.zip -d "${VOC_PATH}" +HOST=localhost PORT=${VOC_PORT} TOKEN=${VOC_TOKEN} ${VOC_PATH}/vocabulary-init-script/install.sh minimal + +## test +curl -s http://localhost:${VOC_PORT}/api/v1/types --header "Authorization: Bearer $VOC_TOKEN" | jq -r 
'._embedded.fieldTypeList[] .name' ``` -- If a data migration has been done, check that all vocabularies have been migrated: + +## Accessibility +- You can make the vocabulary server accessible from outside by connecting a proxy with access control in front of it. + +## Installation test +- Change the host and port accordingly for all commands. +- After the initial setup, check whether the field types have been created successfully: ```bash -curl http://localhost:8081/api/v1/vocabularies/all | jq -r '._embedded.vocabularyList[] .name' +curl "http://localhost:${VOC_PORT:-8081}/api/v1/types" --header "Authorization: Bearer $VOC_TOKEN" | jq -r '._embedded.fieldTypeList[] .name' ``` -- Check that links resolve correctly (see configuration): + +- The result should look like this: ```bash -curl http://localhost:8081/api/v1/records/1 | jq +Any Text +Anything +Boolean +Number ``` -The `_links` JSON element should contain references to other resources. -These URLs should be valid and resolvable. -The host part of these URLs is generated from the request. 
diff --git a/install/.gitignore b/install/.gitignore index 3415572..ea122ca 100644 --- a/install/.gitignore +++ b/install/.gitignore @@ -1 +1 @@ -/cache.txt \ No newline at end of file +cache.txt \ No newline at end of file diff --git a/install/default_setup.sh b/install/default_setup.sh deleted file mode 100755 index 2b2b056..0000000 --- a/install/default_setup.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -HOST="localhost:${VOC_PORT:-8081}/api/v1" -TOKEN="${VOC_TOKEN:-CHANGEME}" - -curl_call() { - curl --location "$HOST/$1" --header 'Content-Type: application/json' --header "Authorization: Bearer $TOKEN" --data "$2" -} - -create_language() { - curl_call "languages" "{\"abbreviation\":\"$1\", \"name\":\"$2\"}" -} - -create_type() { - if [ -n "$3" ] - then - if [ -n "$2" ] - then - curl_call "types" "{\"name\":\"$1\",\"validation\":\"$2\",\"selectableValues\":[$3]}" - else - curl_call "types" "{\"name\":\"$1\",\"selectableValues\":[$3]}" - fi - else - if [ -n "$2" ] - then - echo "{\"name\":\"$1\",\"validation\":\"$2\"}" - curl_call "types" "{\"name\":\"$1\",\"validation\":\"$2\"}" - else - curl_call "types" "{\"name\":\"$1\"}" - fi - fi -} - -create_language "eng" "English" -create_language "ger" "Deutsch" -create_language "fre" "Français" -create_type "Anything" -curl_call "types" "{\"name\":\"Any Text\",\"large\":true}" -create_type "Boolean" "" "\"true\",\"false\"" -create_type "Number" "\\\\d+" -create_type "Word" "\\\\w+" -create_type "skos:prefLabel" -create_type "skos:altLabel" -create_type "skos:definition" -create_type "skos:editorialNote" -create_type "skos:related" -create_type "skos:closeMatch" "https?.*" -create_type "skos:exactMatch" "https?.*" -create_type "dct:title" -create_type "dct:creator" -create_type "dct:created" "\\\\d{4}\\\\-\\\\d{2}\\\\-\\\\d{2}" -create_type "dct:license" "https?.*" -echo - -curl -s "${HOST}"/types --header "Authorization: Bearer $TOKEN" | jq -r '._embedded.fieldTypeList[] .name' | grep Anything -q || { echo 
"ERROR while executing $0"; exit 1; } diff --git a/install/install_samples.sh b/install/install.sh similarity index 68% rename from install/install_samples.sh rename to install/install.sh index 09a37fe..43db0e1 100755 --- a/install/install_samples.sh +++ b/install/install.sh @@ -1,7 +1,6 @@ #!/usr/bin/env bash MY_PATH="$(dirname -- "${BASH_SOURCE[0]}")" SAMPLES_DIR=$MY_PATH/samples -CACHE_FILE=$MY_PATH/cache.txt # Check for parameters FAIL=0 @@ -29,11 +28,36 @@ if [ -z $SAMPLE ]; then ls $SAMPLES_DIR read SAMPLE fi -SAMPLE_PATH=$SAMPLES_DIR/$SAMPLE + +if [ -d "$SAMPLE" ]; then + SAMPLE_PATH="$SAMPLE" +else + SAMPLE_PATH="$SAMPLES_DIR/$SAMPLE" +fi if [ ! -e $SAMPLE_PATH ]; then echo "Sample \"$SAMPLE\" does not exist!" exit 1 fi +CACHE_FILE="$SAMPLE_PATH/cache.txt" + +if [ -f "$CACHE_FILE" ]; then + while true; do + read -n 1 -s -p "An existing import cache was found, do you want to install from scratch? [Y]es|[N]o " answer + case $answer in + y|Y) + rm "$CACHE_FILE" + break + ;; + n|N) + break + ;; + *) + echo -e "\nInvalid choice!" + ;; + esac + done + echo "" +fi curl_call() { curl --location "$HOST:$PORT/api/v1/$1" \ @@ -53,22 +77,32 @@ curl_file_upload_call() { } for INSTALL_DIR in $(ls $SAMPLE_PATH); do + if [ "$INSTALL_DIR" == "cache.txt" ]; then + continue + fi + ENDPOINT=$(echo $INSTALL_DIR | cut -d'_' -f2) echo "Installing $ENDPOINT" for ITEM in $(ls $SAMPLE_PATH/$INSTALL_DIR | grep ".json"); do ITEM_NAME=$(echo "$ITEM" | cut -d'.' 
-f1) ITEM_IDENTIFIER=${ENDPOINT}_"${ITEM_NAME}" - ID=$(cat $CACHE_FILE | grep "$ITEM_IDENTIFIER;" | cut -d';' -f2) + if [ -f "$CACHE_FILE" ]; then + ID=$(cat $CACHE_FILE | grep "$ITEM_IDENTIFIER;" | cut -d';' -f2) + else + ID="" + fi if [ -z "$ID" ]; then JSON=$(cat $SAMPLE_PATH/$INSTALL_DIR/$ITEM) # Replace ID placeholders - for CACHE_LINE in $(cat $CACHE_FILE); do - TEMPLATE_PLACEHOLDER="{{$(echo $CACHE_LINE | cut -d';' -f1)}}" - CACHE_ID=$(echo $CACHE_LINE | cut -d';' -f2) - JSON=$(echo $JSON | sed "s/$TEMPLATE_PLACEHOLDER/$CACHE_ID/g") - done + if [ -f "$CACHE_FILE" ]; then + for CACHE_LINE in $(cat $CACHE_FILE); do + TEMPLATE_PLACEHOLDER="{{$(echo $CACHE_LINE | cut -d';' -f1)}}" + CACHE_ID=$(echo $CACHE_LINE | cut -d';' -f2) + JSON=$(echo $JSON | sed "s/$TEMPLATE_PLACEHOLDER/$CACHE_ID/g") + done + fi RESULT=$(curl_call $ENDPOINT "$JSON") @@ -91,7 +125,11 @@ for INSTALL_DIR in $(ls $SAMPLE_PATH); do for ITEM in $(ls $SAMPLE_PATH/$INSTALL_DIR | grep ".csv"); do VOCABULARY_NAME=$(echo "$ITEM" | cut -d'.' -f1) VOCABULARY_IDENTIFIER=vocabularies_${VOCABULARY_NAME} - VOCABULARY_ID=$(cat $CACHE_FILE | grep $VOCABULARY_IDENTIFIER | cut -d';' -f2) + if [ -f "$CACHE_FILE" ]; then + VOCABULARY_ID=$(cat $CACHE_FILE | grep $VOCABULARY_IDENTIFIER | cut -d';' -f2) + else + VOCABULARY_ID="" + fi if [ ! -z "$VOCABULARY_ID" ]; then curl_file_upload_call "vocabularies/$VOCABULARY_ID/import/csv" "$SAMPLE_PATH/$INSTALL_DIR/$ITEM" @@ -103,8 +141,12 @@ for INSTALL_DIR in $(ls $SAMPLE_PATH); do for ITEM in $(ls $SAMPLE_PATH/$INSTALL_DIR | grep ".xlsx"); do VOCABULARY_NAME=$(echo "$ITEM" | cut -d'.' -f1) VOCABULARY_IDENTIFIER=vocabularies_${VOCABULARY_NAME} - VOCABULARY_ID=$(cat $CACHE_FILE | grep $VOCABULARY_IDENTIFIER | cut -d';' -f2) - + if [ -f "$CACHE_FILE" ]; then + VOCABULARY_ID=$(cat $CACHE_FILE | grep $VOCABULARY_IDENTIFIER | cut -d';' -f2) + else + VOCABULARY_ID="" + fi + if [ ! 
-z "$VOCABULARY_ID" ]; then curl_file_upload_call "vocabularies/$VOCABULARY_ID/import/excel" "$SAMPLE_PATH/$INSTALL_DIR/$ITEM" echo -e "\tImported \"$VOCABULARY_NAME\" vocabulary records" diff --git a/install/samples/minimal/1_languages/Deutsch.json b/install/samples/minimal/1_languages/Deutsch.json new file mode 100644 index 0000000..5a2cdbb --- /dev/null +++ b/install/samples/minimal/1_languages/Deutsch.json @@ -0,0 +1,4 @@ +{ + "abbreviation": "ger", + "name": "Deutsch" +} \ No newline at end of file diff --git a/install/samples/minimal/1_languages/English.json b/install/samples/minimal/1_languages/English.json new file mode 100644 index 0000000..34fc8a6 --- /dev/null +++ b/install/samples/minimal/1_languages/English.json @@ -0,0 +1,4 @@ +{ + "abbreviation": "eng", + "name": "English" +} \ No newline at end of file diff --git "a/install/samples/minimal/1_languages/Fran\303\247ais.json" "b/install/samples/minimal/1_languages/Fran\303\247ais.json" new file mode 100644 index 0000000..e796791 --- /dev/null +++ "b/install/samples/minimal/1_languages/Fran\303\247ais.json" @@ -0,0 +1,4 @@ +{ + "abbreviation": "fre", + "name": "Français" +} \ No newline at end of file diff --git a/install/samples/minimal/2_types/Any_Text.json b/install/samples/minimal/2_types/Any_Text.json new file mode 100644 index 0000000..1dd14d8 --- /dev/null +++ b/install/samples/minimal/2_types/Any_Text.json @@ -0,0 +1,4 @@ +{ + "name": "Any Text", + "large": true +} \ No newline at end of file diff --git a/install/samples/minimal/2_types/Anything.json b/install/samples/minimal/2_types/Anything.json new file mode 100644 index 0000000..ea7aa0f --- /dev/null +++ b/install/samples/minimal/2_types/Anything.json @@ -0,0 +1,3 @@ +{ + "name": "Anything" +} \ No newline at end of file diff --git a/install/samples/minimal/2_types/Boolean.json b/install/samples/minimal/2_types/Boolean.json new file mode 100644 index 0000000..6c9b600 --- /dev/null +++ b/install/samples/minimal/2_types/Boolean.json @@ -0,0 
+1,6 @@ +{ + "name": "Boolean", + "selectableValues": [ + "true", "false" + ] +} \ No newline at end of file diff --git a/install/samples/minimal/2_types/Number.json b/install/samples/minimal/2_types/Number.json new file mode 100644 index 0000000..9390686 --- /dev/null +++ b/install/samples/minimal/2_types/Number.json @@ -0,0 +1,4 @@ +{ + "name": "Number", + "validation": "\\d+" +} \ No newline at end of file diff --git a/migration/lib/mets_context.py b/migration/lib/mets_context.py index 37b2bed..30cacbd 100644 --- a/migration/lib/mets_context.py +++ b/migration/lib/mets_context.py @@ -6,10 +6,11 @@ RECORD_PATTERN = re.compile('^(\\d+).*$') class Context: - def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic): + def __init__(self, api, dry, verbose, force, continue_on_error, metadata_directory, mapping_file, preferred_mets_main_value_language, manual_id_fix, trust, enable_relation_vocabulary_column_logic, delete_missing_vocabulary_references): self.api = api self.dry = dry self.verbose = verbose + self.force = force self.continue_on_error = continue_on_error self.metadata_directory = metadata_directory self.mapping_file = mapping_file @@ -17,6 +18,8 @@ def __init__(self, api, dry, verbose, continue_on_error, metadata_directory, map self.manual_id_fix = manual_id_fix self.trust = trust self.enable_relation_vocabulary_column_logic = enable_relation_vocabulary_column_logic + self.delete_missing_vocabulary_references = delete_missing_vocabulary_references + self.removable_metadata_map = {} self.vocabulary_name_id_map = {} self.vocabulary_id_name_map = {} self.vocabulary_id_map = {} @@ -133,6 +136,12 @@ def robust_find_record_id(self, parts): return vocabulary_id, record_id except: return None, None + + def is_removable_metadata(self, vocabulary_id, value): + if not vocabulary_id in self.removable_metadata_map: + return False + + 
return value in self.removable_metadata_map[vocabulary_id] def log_processed(self, file): with open('mets_migration.log', 'a') as f: diff --git a/migration/lib/mets_manipulator.py b/migration/lib/mets_manipulator.py index 49d403e..f06e264 100644 --- a/migration/lib/mets_manipulator.py +++ b/migration/lib/mets_manipulator.py @@ -30,24 +30,36 @@ def create_backup(self): logging.debug(f'Backed up mets file: {backup_filename}') def process_mets_file(self): - tree = ET.parse(self.file_path) - root = tree.getroot() - self.process_node(root) + try: + tree = ET.parse(self.file_path) + except Exception as e: + error = f'Error parsing mets file {self.file_path}, skipping' + logging.error(error) + self.ctx.log_issue(self.file_path, error) + return + try: + root = tree.getroot() + self.process_node(root) - if self.changed and not self.ctx.dry: - self.create_backup() - tree.write(self.file_path, encoding='utf-8', xml_declaration=True) - self.ctx.log_processed(self.file_path) + if self.changed and not self.ctx.dry: + self.create_backup() + tree.write(self.file_path, encoding='utf-8', xml_declaration=True) + self.ctx.log_processed(self.file_path) + except Exception as e: + error = f'Something very unexpected happened during processing of mets file {self.file_path}: {e}' + logging.critical(error) + raise Exception(error) def process_node(self, node): if self.is_manual_id_reference(node): self.process_manual_id_reference(node) if self.ctx.dry: dump_node(node) - elif self.is_vocabulary_reference(node) and not self.is_already_migrated(node): - self.process_vocabulary_reference(node) - if self.ctx.dry: - dump_node(node) + elif self.is_vocabulary_reference(node): + if self.ctx.force or not self.is_already_migrated(node): + self.process_vocabulary_reference(node) + if self.ctx.dry: + dump_node(node) for child in node: self.process_node(child) @@ -159,6 +171,7 @@ def process_vocabulary_reference_by_value(self, node): search_field=None inverse_search_field=None + 
perform_inversion_fix=False if self.ctx.enable_relation_vocabulary_column_logic and 'Relationship' in vocabulary_name: parent = node.getparent() if parent == None: @@ -169,14 +182,14 @@ def process_vocabulary_reference_by_value(self, node): entity_type = None for sibling in parent: if sibling.attrib['name'] == 'RelationEntityType': - entity_type = sibling.text + entity_type = sibling.text.lower() break - entity_type_in_relation_count = vocabulary_name.count(entity_type) + entity_type_in_relation_count = vocabulary_name.lower().count(entity_type) if entity_type_in_relation_count == 1: # Find out relation direction separator_position = vocabulary_name.index('-') - entity_type_position = vocabulary_name.index(entity_type) + entity_type_position = vocabulary_name.lower().index(entity_type) # use second column of vocabulary: `Reverse relationship` (The relation vocabulary is specified from `A->B`, the relation references an entity of type `A` and is therefore of type `B`) if entity_type_position < separator_position: @@ -185,33 +198,48 @@ def process_vocabulary_reference_by_value(self, node): else: search_field='Relationship type' inverse_search_field='Reverse relationship' + perform_inversion_fix=True + + elif entity_type_in_relation_count == 2: + search_field='Relationship type' + inverse_search_field='Reverse relationship' + else: + raise Exception(f'Unable to perform relation column logic on relation [{vocabulary_name}] with search entity: {entity_type}') try: + # First, try to find the value in the correct column new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=search_field) except: + # If failed, try to find the value in the other column (assuming the value was stored incorrectly) new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=inverse_search_field) old_value = node.text - record_data = self.ctx.api.lookup_record(new_record_id) - - v = 
self.ctx.api.lookup_vocabulary(record_data['vocabularyId']) - s = self.ctx.api.lookup_schema(v['schemaId']) - ids = [d['id'] for d in s['definitions'] if d['name'] == search_field] # We need the value, that we actually originally searched for - if len(ids) != 1: - logging.critical(f'Non unique "{search_field}" fields found: {ids}!') - sys.exit(1) - - field_data = [f for f in record_data['fields'] if f['definitionId'] == ids[0]] - if len(field_data) != 1: - logging.critical(f'Record [{new_record_id}] has no unique search column entry field') - sys.exit(1) - - # Replace node text if not matching any translation of main value - translated_main_values = self.ctx.extract_language_values(field_data[0]) - new_value = self.ctx.extract_preferred_language(translated_main_values) - #dump_node(node) - logging.warn(f'Relation is saved in the wrong direction, correct direction found and corrected: "{old_value}" -> "{new_value}"') - node.text = new_value + if perform_inversion_fix: + record_data = self.ctx.api.lookup_record(new_record_id) + + v = self.ctx.api.lookup_vocabulary(record_data['vocabularyId']) + s = self.ctx.api.lookup_schema(v['schemaId']) + ids = [d['id'] for d in s['definitions'] if d['name'] == search_field] # We need the value, that we actually originally searched for + if len(ids) != 1: + logging.critical(f'Non unique "{search_field}" fields found: {ids}!') + sys.exit(1) + + field_data = [f for f in record_data['fields'] if f['definitionId'] == ids[0]] + if len(field_data) != 1: + logging.critical(f'Record [{new_record_id}] has no unique search column entry field') + sys.exit(1) + + # Replace node text if not matching any translation of main value + translated_main_values = self.ctx.extract_language_values(field_data[0]) + new_value = self.ctx.extract_preferred_language(translated_main_values) + + #dump_node(node) + warn_message = f'Relation [{vocabulary_name}] is saved in the wrong direction, correct direction found and corrected: "{old_value}" -> 
"{new_value}"' + logging.warn(warn_message) + self.ctx.log_issue(self.file_path, warn_message) + node.text = new_value + else: + logging.debug(f'Relation [{vocabulary_name}] value "{value}" found in column "{inverse_search_field}", keeping as is') else: new_record_id = self.ctx.api.find_record(self.ctx, vocabulary_id, value, search_field=None) @@ -223,9 +251,29 @@ def process_vocabulary_reference_by_value(self, node): self.changed = True except Exception as e: - error = f'Unable to find record by value: {value}\n\t\t{e}' - logging.error(error) - self.ctx.log_issue(self.file_path, error) + # If this fails as well and the value is not found, remove the metadata if configured + if 'has no results' in e.__str__() and self.ctx.is_removable_metadata(vocabulary_id, node.text): + warn_message = f'Removing node due to intentionally missing vocabulary value: "{node.text}"' + logging.warn(warn_message) + self.ctx.log_issue(self.file_path, warn_message) + self.remove_metadata_node(node) + else: + error = f'Unable to find record by value: {value}\n\t\t{e}' + logging.error(error) + self.ctx.log_issue(self.file_path, error) + + def remove_metadata_node(self, node): + parent = node.getparent() + if parent != None and parent.attrib['type'] == 'group': + node = parent + parent = node.getparent() + + if parent == None: + dump_node(node) + raise Exception(f'Unable to remove node due to missing parent') + + parent.remove(node) + self.changed = True def process_manual_id_reference(self, node): try: diff --git a/migration/lib/mets_migrator.py b/migration/lib/mets_migrator.py index 1969a07..429fb0e 100644 --- a/migration/lib/mets_migrator.py +++ b/migration/lib/mets_migrator.py @@ -11,6 +11,7 @@ def __init__(self, ctx): def migrate(self): self.load_mapping_file() + self.load_delete_missing_vocabulary_references_file() self.mets_files = self.scan_for_mets_files() logging.info(f'{len(self.mets_files)} mets file(s) found!') logging.info(f'Start processing ...') @@ -52,6 +53,31 @@ def 
load_mapping_file(self): else: raise Exception(f'Mapping file contains duplicate entry for old record {record_id_old}') + def load_delete_missing_vocabulary_references_file(self): + if self.ctx.delete_missing_vocabulary_references == None: + return + + header = None + with open(self.ctx.delete_missing_vocabulary_references, 'r') as f: + for line in f: + line = line.strip() + if header == None: + header = line + if header != CSV_DELIMITER.join(['vocabulary_id', 'value']): + raise Exception('Header mismatch in mapping file!') + continue + + parts = line.split(CSV_DELIMITER) + if len(parts) != 2: + raise Exception(f'Wrong number of fields in line: {line}') + + vocabulary_id = int(parts[0]) + value = parts[1] + + if not vocabulary_id in self.ctx.removable_metadata_map: + self.ctx.removable_metadata_map[vocabulary_id] = [] + self.ctx.removable_metadata_map[vocabulary_id].append(value) + def scan_for_mets_files(self): results = [] for root, dirs, files in os.walk(self.ctx.metadata_directory): diff --git a/migration/metadata-migrator.py b/migration/metadata-migrator.py index 582f97e..a1edc87 100644 --- a/migration/metadata-migrator.py +++ b/migration/metadata-migrator.py @@ -14,7 +14,7 @@ def main(): args.vocabulary_server_port, args.vocabulary_server_token ) - ctx = Context(api, args.dry, args.verbose, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic) + ctx = Context(api, args.dry, args.verbose, args.force, args.continue_on_error, args.metadata_directory, args.mapping_file, args.preferred_mets_main_value_language, args.manual_id_fix, args.trust, args.enable_relation_vocabulary_column_logic, args.delete_missing_vocabulary_references) try: migrator = MetsMigrator(ctx) @@ -33,6 +33,7 @@ class RawTextDefaultsHelpFormatter(argparse.RawTextHelpFormatter, argparse.Argum def parse_args(): parser = 
argparse.ArgumentParser(prog='metadata-migrator.py', formatter_class=RawTextDefaultsHelpFormatter, description='Metadata migration tool.') parser.add_argument('--dry', required=False, default=False, action='store_const', const=True, help='Don\'t persist changes but only print replacements to the console') + parser.add_argument('--force', '-f', required=False, default=False, action='store_const', const=True, help='Force a re-execution of the migration on already migrated metadata') parser.add_argument('--metadata-directory', '-d', required=True, help='directory to recursively scan for metadata to update') parser.add_argument('--mapping-file', '-m', required=True, help='vocabulary and record mapping file') parser.add_argument('--vocabulary-server-host', type=str, default='localhost', help='vocabulary server host') @@ -41,6 +42,7 @@ def parse_args(): parser.add_argument('--preferred-mets-main-value-language', type=str, default='eng', help='Default language to use for mets value writing, if present and prior value invalid') parser.add_argument('--trust', required=False, type=str, default='ID', help='Set the data source to trust for the migration. Possible values are: "ID" and "Value". If "ID" is set, the record ID is parsed from the valueURI and used to find the migrated record. If "Value" is set, the XML elements value is used to find the newly migrated record by value. 
Defaults to "ID".') parser.add_argument('--enable-relation-vocabulary-column-logic', required=False, default=False, action='store_const', const=True, help='Activate relationship vocabulary correct column finding logic (reverse vs non-reverse, artist dictionary)') + parser.add_argument('--delete-missing-vocabulary-references', type=str, required=False, default=None, help='vocabulary and value mapping file defining intentionally removed vocabulary values that should be removed in the Mets files as well.') parser.add_argument('--manual-id-fix', type=str, default=None, help='Manually fix the record ID of elements whose name attribute matches this parameter. Caution, this must not be executed twice!') parser.add_argument('--log', required=False, default='INFO', help='logger level (possible values are: NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL)') parser.add_argument('--verbose', required=False, default=False, action='store_const', const=True, help='verbose output') diff --git a/module-core/pom.xml b/module-core/pom.xml index d779cf4..2c3bd3f 100644 --- a/module-core/pom.xml +++ b/module-core/pom.xml @@ -10,7 +10,7 @@ io.goobi.vocabulary vocabulary-server-core - 1.1.10 + 1.1.11 Vocabulary-Server-Core Spring Boot based RESTful web service for vocabulary management jar @@ -35,7 +35,7 @@ io.goobi.vocabulary vocabulary-server-exchange - 1.1.10 + 1.1.11 compile diff --git a/module-exchange/pom.xml b/module-exchange/pom.xml index 7ca1a99..d5a56c2 100644 --- a/module-exchange/pom.xml +++ b/module-exchange/pom.xml @@ -4,7 +4,7 @@ 4.0.0 io.goobi.vocabulary vocabulary-server-exchange - 1.1.10 + 1.1.11 Vocabulary Exchange Vocabulary data exchange classes jar diff --git a/pom.xml b/pom.xml index ad2b989..60673f4 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ 4.0.0 io.goobi.vocabulary vocabulary-server - 1.1.10 + 1.1.11 Vocabulary-Server pom RESTful webservice for vocabulary management