diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/balancer/BalancerService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/balancer/BalancerService.java index 7b6f4f13ef..75c7c38f4a 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/balancer/BalancerService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/balancer/BalancerService.java @@ -24,7 +24,7 @@ public BalancerService(BalancerConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-balancer service with parameters : {}", config); + log.info("Started pipelines-balancer service"); StepConfiguration stepConfig = config.stepConfig; // Prefetch is one, since this is a long-running process. diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/hdfs/HdfsViewService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/hdfs/HdfsViewService.java index 39b95a4f82..5c9ac9e9a0 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/hdfs/HdfsViewService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/hdfs/HdfsViewService.java @@ -30,8 +30,7 @@ public HdfsViewService(HdfsViewConfiguration config) { @Override protected void startUp() throws Exception { - log.info( - "Started pipelines-{}-hdfs-view service with parameters : {}", config.stepType, config); + log.info("Started pipelines-{}-hdfs-view service", config.stepType); // Prefetch is one, since this is a long-running process. StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/indexing/EventsIndexingService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/indexing/EventsIndexingService.java index 304f8ff4e8..6a6894f080 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/indexing/EventsIndexingService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/indexing/EventsIndexingService.java @@ -29,7 +29,7 @@ public EventsIndexingService(EventsIndexingConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-event-indexing service with parameters : {}", config); + log.info("Started pipelines-event-indexing service"); // Prefetch is one, since this is a long-running process. StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/interpretation/EventsInterpretationService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/interpretation/EventsInterpretationService.java index 42695409f8..8d1694a9e9 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/interpretation/EventsInterpretationService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/events/interpretation/EventsInterpretationService.java @@ -29,7 +29,7 @@ public EventsInterpretationService(EventsInterpretationConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-event-interpretation service with parameters : {}", config); + log.info("Started pipelines-event-interpretation service"); // Prefetch is one, since this is a long-running process. 
StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/hdfs/HdfsViewService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/hdfs/HdfsViewService.java index 043df6b3f5..990349fa31 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/hdfs/HdfsViewService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/hdfs/HdfsViewService.java @@ -32,7 +32,7 @@ public HdfsViewService(HdfsViewConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-hdfs-view service with parameters : {}", config); + log.info("Started pipelines-hdfs-view service"); // Prefetch is one, since this is a long-running process. StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/identifier/IdentifierService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/identifier/IdentifierService.java index 106125f857..355d56c3e7 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/identifier/IdentifierService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/identifier/IdentifierService.java @@ -33,8 +33,7 @@ public IdentifierService(IdentifierConfiguration config) { @Override protected void startUp() throws Exception { - log.info( - "Started pipelines-occurrence-identifier dataset service with parameters : {}", config); + log.info("Started pipelines-occurrence-identifier dataset service"); // Prefetch is one, since this is a long-running process. StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/indexing/IndexingService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/indexing/IndexingService.java index ff2f84f786..e8d8d33aad 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/indexing/IndexingService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/indexing/IndexingService.java @@ -36,7 +36,7 @@ public IndexingService(IndexingConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-occurrence-indexing service with parameters : {}", config); + log.info("Started pipelines-occurrence-indexing service"); // Prefetch is one, since this is a long-running process. 
StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/interpretation/InterpretationService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/interpretation/InterpretationService.java index 39515356af..559831e34e 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/interpretation/InterpretationService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/occurrences/interpretation/InterpretationService.java @@ -36,7 +36,7 @@ public InterpretationService(InterpreterConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-occurrence-interpretation service with parameters : {}", config); + log.info("Started pipelines-occurrence-interpretation service"); // Prefetch is one, since this is a long-running process. StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/cleaner/CleanerService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/cleaner/CleanerService.java index 58706bb1c8..a82d55c7cc 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/cleaner/CleanerService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/cleaner/CleanerService.java @@ -25,7 +25,7 @@ public CleanerService(CleanerConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-validator-cleaner service with parameters : {}", config); + log.info("Started pipelines-validator-cleaner service"); // create the listener. StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/metrics/MetricsCollectorService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/metrics/MetricsCollectorService.java index 4493f9e3f0..d7ef2ae58c 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/metrics/MetricsCollectorService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/metrics/MetricsCollectorService.java @@ -29,7 +29,7 @@ public MetricsCollectorService(MetricsCollectorConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-validator-metrics-collector with parameters : {}", config); + log.info("Started pipelines-validator-metrics-collector"); // Prefetch is one, since this is a long-running process. 
StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/validator/ArchiveValidatorService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/validator/ArchiveValidatorService.java index 2d4d65e412..84c4febcbf 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/validator/ArchiveValidatorService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/validators/validator/ArchiveValidatorService.java @@ -28,7 +28,7 @@ public ArchiveValidatorService(ArchiveValidatorConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-validator-archive-validator service with parameters : {}", config); + log.info("Started pipelines-validator-archive-validator service"); // Prefetch is one, since this is a long-running process. StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/abcd/AbcdToAvroService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/abcd/AbcdToAvroService.java index 4cb87449d8..9ab0fc6cce 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/abcd/AbcdToAvroService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/abcd/AbcdToAvroService.java @@ -37,7 +37,7 @@ public AbcdToAvroService(XmlToAvroConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-verbatim-to-avro-from-abcd service with parameters : {}", config); + log.info("Started pipelines-verbatim-to-avro-from-abcd service"); // create the listener. StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/dwca/DwcaToAvroService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/dwca/DwcaToAvroService.java index 368cc99338..d7ca238562 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/dwca/DwcaToAvroService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/dwca/DwcaToAvroService.java @@ -28,7 +28,7 @@ public DwcaToAvroService(DwcaToAvroConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-verbatim-to-avro-from-dwca service with parameters : {}", config); + log.info("Started pipelines-verbatim-to-avro-from-dwca service"); // Prefetch is one, since this is a long-running process. 
StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/fragmenter/FragmenterService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/fragmenter/FragmenterService.java index 1665750ede..99a3601985 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/fragmenter/FragmenterService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/fragmenter/FragmenterService.java @@ -37,7 +37,7 @@ public FragmenterService(FragmenterConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-verbatim-fragmenter service with parameters : {}", config); + log.info("Started pipelines-verbatim-fragmenter service"); // Prefetch is one, since this is a long-running process. StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/xml/XmlToAvroService.java b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/xml/XmlToAvroService.java index ab7ae34599..92b37b99a8 100644 --- a/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/xml/XmlToAvroService.java +++ b/gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/tasks/verbatims/xml/XmlToAvroService.java @@ -35,7 +35,7 @@ public XmlToAvroService(XmlToAvroConfiguration config) { @Override protected void startUp() throws Exception { - log.info("Started pipelines-verbatim-to-avro-from-xml service with parameters : {}", config); + log.info("Started pipelines-verbatim-to-avro-from-xml service"); // create the listener. 
StepConfiguration c = config.stepConfig; listener = new MessageListener(c.messaging.getConnectionParameters(), 1); diff --git a/gbif/ingestion/clustering-gbif-oozie/README.md b/gbif/ingestion/clustering-gbif-oozie/README.md new file mode 100644 index 0000000000..f89cf9be6e --- /dev/null +++ b/gbif/ingestion/clustering-gbif-oozie/README.md @@ -0,0 +1,12 @@ +# Oozie workflow + +Install by building the shaded artifact and using the script: + +``` +cd ../clustering-gbif +mvn install -Pextra-artifacts + +cd ../clustering-gbif-oozie +./install-workflow.sh dev GITHUB_KEY +``` + diff --git a/gbif/ingestion/clustering-gbif-oozie/install-workflow.sh b/gbif/ingestion/clustering-gbif-oozie/install-workflow.sh new file mode 100755 index 0000000000..f5400fa3b7 --- /dev/null +++ b/gbif/ingestion/clustering-gbif-oozie/install-workflow.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -e +set -o pipefail + +ENV=$1 +TOKEN=$2 + +echo "Installing clustering workflow for $ENV" + +echo "Get latest clustering config profiles from GitHub" +curl -Ss -H "Authorization: token $TOKEN" -H 'Accept: application/vnd.github.v3.raw' -O -L https://api.github.com/repos/gbif/gbif-configuration/contents/clustering/$ENV/clustering.properties + +START=$(date +%Y-%m-%d)T$(grep '^startHour=' clustering.properties | cut -d= -f 2)Z +FREQUENCY="$(grep '^frequency=' clustering.properties | cut -d= -f 2)" +OOZIE=$(grep '^oozie.url=' clustering.properties | cut -d= -f 2) + +# Gets the Oozie id of the current coordinator job if it exists +WID=$(oozie jobs -oozie $OOZIE -jobtype coordinator -filter name=Clustering | awk 'NR==3 {print $1}') +if [ -n "$WID" ]; then + echo "Killing current coordinator job" $WID + sudo -u hdfs oozie job -oozie $OOZIE -kill $WID +fi + +echo "Assembling jar for $ENV" +# Oozie uses timezone UTC +mvn -Dclustering.frequency="$FREQUENCY" -Dclustering.start="$START" -DskipTests -Duser.timezone=UTC clean install + +echo "Copy to Hadoop" +sudo -u hdfs hdfs dfs -rm -r /clustering-workflow/ +sudo -u hdfs hdfs dfs -copyFromLocal target/clustering-workflow / +sudo -u hdfs hdfs dfs -copyFromLocal /etc/hive/conf/hive-site.xml /clustering-workflow/lib/ + +echo "Start Oozie clustering datasets job" +sudo -u hdfs oozie job --oozie $OOZIE -config clustering.properties -run diff --git a/gbif/ingestion/clustering-gbif-oozie/pom.xml b/gbif/ingestion/clustering-gbif-oozie/pom.xml new file mode 100644 index 0000000000..53fbb62527 --- /dev/null +++ b/gbif/ingestion/clustering-gbif-oozie/pom.xml @@ -0,0 +1,63 @@ + + + 4.0.0 + + + org.gbif.pipelines + ingestion + 2.18.0-SNAPSHOT + ../pom.xml + + + clustering-oozie-workflow + + Clustering :: Oozie workflow + + + + + src/main/resources + true + + **/coordinator.xml + + + + src/main/resources + false + + + + + maven-assembly-plugin + + false + clustering-workflow + + src/main/assembly/oozie.xml + + + + + make-oozie + package + + single + + + + + + + + + + org.gbif.pipelines + clustering-gbif + ${project.version} + + + + diff --git a/gbif/ingestion/clustering-gbif-oozie/resume-workflow.sh b/gbif/ingestion/clustering-gbif-oozie/resume-workflow.sh new file mode 100755 index 0000000000..5b9c3841ab --- /dev/null +++ b/gbif/ingestion/clustering-gbif-oozie/resume-workflow.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -e +set -o pipefail + +ENV=$1 +TOKEN=$2 + +echo "Resuming clustering workflow for $ENV" + +echo "Get latest clustering config profiles from GitHub" +curl -Ss -H "Authorization: token $TOKEN" -H 'Accept: application/vnd.github.v3.raw' -O -L 
https://api.github.com/repos/gbif/gbif-configuration/contents/clustering/$ENV/clustering.properties + +OOZIE=$(grep '^oozie.url=' clustering.properties | cut -d= -f 2) + +# Gets the Oozie id of the current coordinator job if it exists +WID=$(oozie jobs -oozie $OOZIE -jobtype coordinator -filter name=Clustering | awk 'NR==3 {print $1}') +if [ -n "$WID" ]; then + echo "Resuming current coordinator job" $WID + sudo -u hdfs oozie job -oozie $OOZIE -resume $WID +fi diff --git a/gbif/ingestion/clustering-gbif-oozie/src/main/assembly/oozie.xml b/gbif/ingestion/clustering-gbif-oozie/src/main/assembly/oozie.xml new file mode 100644 index 0000000000..749ce29921 --- /dev/null +++ b/gbif/ingestion/clustering-gbif-oozie/src/main/assembly/oozie.xml @@ -0,0 +1,28 @@ + + + + + clustering + + dir + + false + + + ${project.build.outputDirectory} + / + + + + + lib + 0644 + ${artifact.artifactId}.${artifact.extension} + + org.gbif.pipelines:clustering-gbif + + + + diff --git a/gbif/ingestion/clustering-gbif-oozie/src/main/resources/coordinator.xml b/gbif/ingestion/clustering-gbif-oozie/src/main/resources/coordinator.xml new file mode 100644 index 0000000000..71ff7dce66 --- /dev/null +++ b/gbif/ingestion/clustering-gbif-oozie/src/main/resources/coordinator.xml @@ -0,0 +1,11 @@ + + + + + hdfs://ha-nn/clustering-workflow + + + diff --git a/gbif/ingestion/clustering-gbif-oozie/src/main/resources/workflow.xml b/gbif/ingestion/clustering-gbif-oozie/src/main/resources/workflow.xml new file mode 100644 index 0000000000..adfc50eb50 --- /dev/null +++ b/gbif/ingestion/clustering-gbif-oozie/src/main/resources/workflow.xml @@ -0,0 +1,59 @@ + + + + + ${wf:conf("hadoop.jobtracker")} + ${wf:conf("hdfs.namenode")} + + + oozie.launcher.mapred.job.queue.name + ${wf:conf("hadoop.queuename")} + + + oozie.action.sharelib.for.spark + spark2 + + + + + + + + + ${wf:conf("hadoop.jobtracker")} + ${wf:conf("hdfs.namenode")} + yarn-cluster + Clustering + org.gbif.pipelines.clustering.Cluster + lib/clustering-gbif.jar + + ${wf:conf("gbif.clustering.spark.opts")} --conf spark.executor.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native --conf spark.driver.extraLibraryPath=/opt/cloudera/parcels/CDH/lib/hadoop/lib/native + --hive-db + ${wf:conf("gbif.clustering.hive.db")} + --source-table + ${wf:conf("gbif.clustering.source.table")} + --hive-table-prefix + ${wf:conf("gbif.clustering.hive.table.prefix")} + --hbase-table + ${wf:conf("gbif.clustering.hbase.table")} + --hbase-regions + ${wf:conf("gbif.clustering.hbase.regions")} + --hbase-zk + ${wf:conf("gbif.clustering.hbase.zk")} + --target-dir + ${wf:conf("gbif.clustering.target.dir")} + --hash-count-threshold + ${wf:conf("gbif.clustering.hash.count.threshold")} + + + + + + + + Clustering failed:[${wf:errorMessage(wf:lastErrorNode())}] + + + + + diff --git a/gbif/ingestion/clustering-gbif-oozie/suspend-workflow.sh b/gbif/ingestion/clustering-gbif-oozie/suspend-workflow.sh new file mode 100755 index 0000000000..acc689fd66 --- /dev/null +++ b/gbif/ingestion/clustering-gbif-oozie/suspend-workflow.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -e +set -o pipefail + +ENV=$1 +TOKEN=$2 + +echo "Suspending clustering workflow for $ENV" + +echo "Get latest clustering config profiles from GitHub" +curl -Ss -H "Authorization: token $TOKEN" -H 'Accept: application/vnd.github.v3.raw' -O -L https://api.github.com/repos/gbif/gbif-configuration/contents/clustering/$ENV/clustering.properties + +OOZIE=$(grep '^oozie.url=' clustering.properties | cut -d= -f 2) + +# Gets the Oozie id of the current 
coordinator job if it exists +WID=$(oozie jobs -oozie $OOZIE -jobtype coordinator -filter name=Clustering | awk 'NR==3 {print $1}') +if [ -n "$WID" ]; then + echo "Suspending current coordinator job" $WID + sudo -u hdfs oozie job -oozie $OOZIE -suspend $WID +fi diff --git a/gbif/ingestion/clustering-gbif/README.md b/gbif/ingestion/clustering-gbif/README.md index a3a4345d42..a69690920f 100644 --- a/gbif/ingestion/clustering-gbif/README.md +++ b/gbif/ingestion/clustering-gbif/README.md @@ -1,24 +1,13 @@ ## Occurrence Clustering -Provides utilities to cluster GBIF occurrence records. +Processes occurrence data and establishes links between similar records. -Status: Having explored approaches using Spark ML (no success due to data skew) and -[LinkedIn ScANNs](https://github.com/linkedin/scanns) (limited success, needs more investigation) -this uses a declarative rule-based approach making use of domain knowledge. A multi-blocking stage -groups candidate record pairs, followed by a pair-wise comparison to detect links within the group. +The output is a set of HFiles suitable for bulk loading into HBase which drives the pipeline lookup and +public related occurrence API. -The initial focus is on specimens to locate: - 1. Physical records (e.g. Isotypes and specimens split and deposited in multiple herbaria) - 2. Database duplicates across datasets (huge biases observed within datasets (e.g. gutworm datasets)) - 3. Strong links between sequence records, citations and specimens +Build the project: `mvn spotless:apply test package -Pextra-artifacts` -This is intended to be run regularly (e.g. daily) and therefore performance is of critical concern. - -The output is bulk loaded into HBase, suitable for an API to deployed on. - -Build the project: `mvn clean package` - -To run (while in exploration - will be made into Oozie workflow later): +To run this against a completely new table: Setup hbase: ``` @@ -40,34 +29,30 @@ create 'occurrence_relationships_experimental', ]} ``` -Remove hive tables from the target database: -``` -drop table occurrence_clustering_hashed; -drop table occurrence_clustering_hashed_all; -drop table occurrence_clustering_candidates; -drop table occurrence_relationships; -``` - -Run the job (In production this configuration takes 2.6 hours with ~2.3B records) +Run the job +(In production this configuration takes ~2hours with ~2.3B records) ``` hdfs dfs -rm -r /tmp/clustering nohup sudo -u hdfs spark2-submit --class org.gbif.pipelines.clustering.Cluster \ --master yarn --num-executors 100 \ - --executor-cores 6 \ + --executor-cores 4 \ --conf spark.dynamicAllocation.enabled=false \ - --conf spark.sql.shuffle.partitions=1000 \ + --conf spark.sql.shuffle.partitions=1200 \ --executor-memory 64G \ --driver-memory 4G \ - clustering-gbif-2.14.0-SNAPSHOT.jar \ - --hive-db prod_h \ - --hive-table-hashed occurrence_clustering_hashed \ - --hive-table-candidates occurrence_clustering_candidates \ - --hive-table-relationships occurrence_relationships \ + --conf spark.executor.memoryOverhead=4096 \ + --conf spark.debug.maxToStringFields=100000 \ + --conf spark.network.timeout=600s \ + clustering-gbif-2.18.0-SNAPSHOT.jar \ + --hive-db prod_TODO \ + --source-table occurrence \ + --hive-table-prefix clustering \ --hbase-table occurrence_relationships_experimental \ --hbase-regions 100 \ --hbase-zk c5zk1.gbif.org,c5zk2.gbif.org,c5zk3.gbif.org \ - --hfile-dir /tmp/clustering & + --target-dir /tmp/clustering \ + --hash-count-threshold 100 & ``` Load HBase diff --git 
a/gbif/ingestion/clustering-gbif/pom.xml b/gbif/ingestion/clustering-gbif/pom.xml index c77c5594df..73a6d37bd5 100644 --- a/gbif/ingestion/clustering-gbif/pom.xml +++ b/gbif/ingestion/clustering-gbif/pom.xml @@ -24,45 +24,21 @@ true - - - - net.alchim31.maven - scala-maven-plugin - ${maven-scala-plugin.version} - - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - /var/tmp/sbt - - - - - org.gbif.pipelines core + + + + * + * + + - org.apache.spark - spark-catalyst_2.11 - ${spark.version} + org.projectlombok + lombok provided @@ -77,18 +53,6 @@ ${spark.version} provided - - org.apache.spark - spark-yarn_2.11 - ${spark.version} - provided - - - org.apache.spark - spark-hive_2.11 - ${spark.version} - provided - org.apache.hbase hbase-client @@ -158,11 +122,13 @@ org.codehaus.janino commons-compiler ${commons-compiler.version} + test org.codehaus.janino janino ${janino.version} + test @@ -183,6 +149,8 @@ false false + + false *:* @@ -204,6 +172,13 @@ + + + + com.fasterxml.jackson + gbif.com.fasterxml.jackson + + diff --git a/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/Cluster.java b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/Cluster.java new file mode 100644 index 0000000000..ba0b9c67fc --- /dev/null +++ b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/Cluster.java @@ -0,0 +1,376 @@ +package org.gbif.pipelines.clustering; + +import static org.gbif.pipelines.clustering.HashUtilities.recordHashes; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.time.Duration; +import java.time.Instant; +import java.util.*; +import lombok.Builder; +import lombok.Data; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FsShell; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.Cell; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.ConnectionFactory; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2; +import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.SnappyCodec; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.encoders.RowEncoder; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.gbif.pipelines.core.parsers.clustering.OccurrenceRelationships; +import org.gbif.pipelines.core.parsers.clustering.RelationshipAssertion; +import scala.Tuple2; + +/** + * Regenerates the HBase and Hive tables for a data clustering run. 
This will read the occurrence + * Hive table, generate HFiles for the relationships, and create intermediate tables in Hive for + * diagnostics, replacing any that exist. Finally, the HBase table is truncated, preserving its + * partitioning, and bulk loaded with the new data.
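Editorial aside, not part of this patch: the class javadoc that follows expects readers of the relationships table to retry across the brief outage while the table is truncated and reloaded, and generateHFiles() later in this file salts row keys as `<salt>:<id1>:<id2>` with `salt = Math.abs(id1.hashCode()) % hbaseRegions` so a reader can prefix-scan by occurrence id. A minimal sketch of such a reader is below, assuming the standard HBase 1.x client Scan API; the class name, retry count, and sleep interval are illustrative only.

```java
import java.io.IOException;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

/** Hypothetical reader sketch; not part of this patch. */
class RelatedOccurrenceReader {

  /** Prefix-scans relationships for one occurrence, retrying across the brief reload outage. */
  static ResultScanner scanWithRetry(Table table, String gbifId, int hbaseRegions)
      throws IOException, InterruptedException {
    // Same salting scheme as generateHFiles(): the salt is derived from id1 only,
    // so all relationships of an occurrence share one row-key prefix.
    int salt = Math.abs(gbifId.hashCode()) % hbaseRegions;
    Scan scan =
        new Scan()
            .setRowPrefixFilter(Bytes.toBytes(salt + ":" + gbifId + ":"))
            .addFamily(Bytes.toBytes("o")); // column family written by the HFiles
    IOException last = null;
    for (int attempt = 0; attempt < 5; attempt++) { // retry count is illustrative
      try {
        return table.getScanner(scan);
      } catch (IOException e) { // e.g. thrown while the table is disabled during the swap
        last = e;
        Thread.sleep(10_000L);
      }
    }
    throw last;
  }
}
```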
+ * This process works on the deliberate design principle that readers of the table will have + * implemented a retry mechanism for the brief outage during the table swap which is expected to be + * on an e.g. weekly basis. + */ +@Builder +@Data +public class Cluster implements Serializable { + private String hiveDB; + private String sourceTable; + private String hiveTablePrefix; + private String hbaseTable; + private int hbaseRegions; + private String hbaseZK; + private String targetDir; + private int hashCountThreshold; + + private static final StructType HASH_ROW_SCHEMA = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("gbifId", DataTypes.StringType, false), + DataTypes.createStructField("datasetKey", DataTypes.StringType, false), + DataTypes.createStructField("hash", DataTypes.StringType, false) + }); + + private static final StructType RELATIONSHIP_SCHEMA = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id1", DataTypes.StringType, false), + DataTypes.createStructField("id2", DataTypes.StringType, false), + DataTypes.createStructField("reasons", DataTypes.StringType, false), + DataTypes.createStructField("dataset1", DataTypes.StringType, false), + DataTypes.createStructField("dataset2", DataTypes.StringType, false), + DataTypes.createStructField("o1", DataTypes.StringType, false), + DataTypes.createStructField("o2", DataTypes.StringType, false) + }); + + public static void main(String[] args) throws IOException { + CommandLineParser.parse(args).build().run(); + } + + /** Run the full process, generating relationships and refreshing the HBase table. */ + private void run() throws IOException { + removeTargetDir(); // fail fast if given bad config + + SparkSession spark = + SparkSession.builder() + .appName("Occurrence clustering") + .config("spark.sql.warehouse.dir", new File("spark-warehouse").getAbsolutePath()) + .enableHiveSupport() + .getOrCreate(); + spark.sql("use " + hiveDB); + + createCandidatePairs(spark); + Dataset relationships = generateRelationships(spark); + generateHFiles(relationships); + replaceHBaseTable(); + } + + /** + * Reads the input, and through a series of record hashing creates a table of candidate record + * pairs for proper comparison. + */ + private void createCandidatePairs(SparkSession spark) { + // Read the input fields needed for generating the hashes + Dataset occurrences = + spark.sql( + String.format( + "SELECT" + + " gbifId, datasetKey, basisOfRecord, typeStatus, " + + " CAST(taxonKey AS String) AS taxonKey, CAST(speciesKey AS String) AS speciesKey, " + + " decimalLatitude, decimalLongitude, year, month, day, recordedBy, " + + " recordNumber, fieldNumber, occurrenceID, otherCatalogNumbers, institutionCode, " + + " collectionCode, catalogNumber " + + "FROM %s", + sourceTable)); + Dataset hashes = + occurrences + .flatMap( + row -> recordHashes(new RowOccurrenceFeatures(row)), + RowEncoder.apply(HASH_ROW_SCHEMA)) + .dropDuplicates(); + spark.sql("DROP TABLE IF EXISTS " + hiveTablePrefix + "_hashes"); + hashes.write().format("parquet").saveAsTable(hiveTablePrefix + "_hashes"); + + // To avoid NxN, filter to a threshold to exclude e.g. 
gut worm analysis datasets + Dataset hashCounts = + spark.sql( + String.format( + "SELECT hash, count(*) AS c FROM %s_hashes GROUP BY hash", hiveTablePrefix)); + spark.sql("DROP TABLE IF EXISTS " + hiveTablePrefix + "_hash_counts"); + hashCounts.write().format("parquet").saveAsTable(hiveTablePrefix + "_hash_counts"); + Dataset filteredHashes = + spark.sql( + String.format( + "SELECT t1.gbifID, t1.datasetKey, t1.hash " + + "FROM %s_hashes t1" + + " JOIN %s_hash_counts t2 ON t1.hash=t2.hash " + + "WHERE t2.c <= %d", + hiveTablePrefix, hiveTablePrefix, hashCountThreshold)); + spark.sql("DROP TABLE IF EXISTS " + hiveTablePrefix + "_hashes_filtered"); + filteredHashes.write().format("parquet").saveAsTable(hiveTablePrefix + "_hashes_filtered"); + + // distinct cross join to generate the candidate pairs for comparison + Dataset candidates = + spark.sql( + String.format( + "SELECT " + + "t1.gbifId as id1, t1.datasetKey as ds1, t2.gbifId as id2, t2.datasetKey as ds2 " + + "FROM %s_hashes_filtered t1 JOIN %s_hashes_filtered t2 ON t1.hash = t2.hash " + + "WHERE " + + " t1.gbifId < t2.gbifId AND " + + " t1.datasetKey != t2.datasetKey " + + "GROUP BY t1.gbifId, t1.datasetKey, t2.gbifId, t2.datasetKey", + hiveTablePrefix, hiveTablePrefix)); + spark.sql("DROP TABLE IF EXISTS " + hiveTablePrefix + "_candidates"); + candidates.write().format("parquet").saveAsTable(hiveTablePrefix + "_candidates"); + } + + /** Reads the candidate pairs table, and runs the record to record comparison. */ + private Dataset generateRelationships(SparkSession spark) { + // Spark DF naming convention requires that we alias each term to avoid naming collision while + // still having named fields to access (i.e. not relying on the column number of the term). All + // taxa keys are converted to String to allow shared routines between GBIF and ALA + // (https://github.com/gbif/pipelines/issues/484) + Dataset pairs = + spark.sql( + String.format( + "SELECT " + + " t1.gbifId AS t1_gbifId, t1.datasetKey AS t1_datasetKey, t1.basisOfRecord AS t1_basisOfRecord, t1.publishingorgkey AS t1_publishingOrgKey, t1.datasetName AS t1_datasetName, t1.publisher AS t1_publishingOrgName, " + + " CAST(t1.kingdomKey AS String) AS t1_kingdomKey, CAST(t1.phylumKey AS String) AS t1_phylumKey, CAST(t1.classKey AS String) AS t1_classKey, CAST(t1.orderKey AS String) AS t1_orderKey, CAST(t1.familyKey AS String) AS t1_familyKey, CAST(t1.genusKey AS String) AS t1_genusKey, CAST(t1.speciesKey AS String) AS t1_speciesKey, CAST(t1.acceptedTaxonKey AS String) AS t1_acceptedTaxonKey, CAST(t1.taxonKey AS String) AS t1_taxonKey, " + + " t1.scientificName AS t1_scientificName, t1.acceptedScientificName AS t1_acceptedScientificName, t1.kingdom AS t1_kingdom, t1.phylum AS t1_phylum, t1.order_ AS t1_order, t1.family AS t1_family, t1.genus AS t1_genus, t1.species AS t1_species, t1.genericName AS t1_genericName, t1.specificEpithet AS t1_specificEpithet, t1.taxonRank AS t1_taxonRank, " + + " t1.typeStatus AS t1_typeStatus, t1.preparations AS t1_preparations, " + + " t1.decimalLatitude AS t1_decimalLatitude, t1.decimalLongitude AS t1_decimalLongitude, t1.countryCode AS t1_countryCode, " + + " t1.year AS t1_year, t1.month AS t1_month, t1.day AS t1_day, from_unixtime(floor(t1.eventDate/1000)) AS t1_eventDate, " + + " t1.recordNumber AS t1_recordNumber, t1.fieldNumber AS t1_fieldNumber, t1.occurrenceID AS t1_occurrenceID, t1.otherCatalogNumbers AS t1_otherCatalogNumbers, t1.institutionCode AS t1_institutionCode, t1.collectionCode AS t1_collectionCode, t1.catalogNumber AS 
t1_catalogNumber, " + + " t1.recordedBy AS t1_recordedBy, t1.recordedByID AS t1_recordedByID, " + + " t1.ext_multimedia AS t1_media, " + + "" + + " t2.gbifId AS t2_gbifId, t2.datasetKey AS t2_datasetKey, t2.basisOfRecord AS t2_basisOfRecord, t2.publishingorgkey AS t2_publishingOrgKey, t2.datasetName AS t2_datasetName, t2.publisher AS t2_publishingOrgName, " + + " CAST(t2.kingdomKey AS String) AS t2_kingdomKey, CAST(t2.phylumKey AS String) AS t2_phylumKey, CAST(t2.classKey AS String) AS t2_classKey, CAST(t2.orderKey AS String) AS t2_orderKey, CAST(t2.familyKey AS String) AS t2_familyKey, CAST(t2.genusKey AS String) AS t2_genusKey, CAST(t2.speciesKey AS String) AS t2_speciesKey, CAST(t2.acceptedTaxonKey AS String) AS t2_acceptedTaxonKey, CAST(t2.taxonKey AS String) AS t2_taxonKey, " + + " t2.scientificName AS t2_scientificName, t2.acceptedScientificName AS t2_acceptedScientificName, t2.kingdom AS t2_kingdom, t2.phylum AS t2_phylum, t2.order_ AS t2_order, t2.family AS t2_family, t2.genus AS t2_genus, t2.species AS t2_species, t2.genericName AS t2_genericName, t2.specificEpithet AS t2_specificEpithet, t2.taxonRank AS t2_taxonRank, " + + " t2.typeStatus AS t2_typeStatus, t2.preparations AS t2_preparations, " + + " t2.decimalLatitude AS t2_decimalLatitude, t2.decimalLongitude AS t2_decimalLongitude, t2.countryCode AS t2_countryCode, " + + " t2.year AS t2_year, t2.month AS t2_month, t2.day AS t2_day, from_unixtime(floor(t2.eventDate/1000)) AS t2_eventDate, " + + " t2.recordNumber AS t2_recordNumber, t2.fieldNumber AS t2_fieldNumber, t2.occurrenceID AS t2_occurrenceID, t2.otherCatalogNumbers AS t2_otherCatalogNumbers, t2.institutionCode AS t2_institutionCode, t2.collectionCode AS t2_collectionCode, t2.catalogNumber AS t2_catalogNumber, " + + " t2.recordedBy AS t2_recordedBy, t2.recordedByID AS t2_recordedByID, " + + " t2.ext_multimedia AS t2_media " + + "" + + "FROM %s h" + + " JOIN %s t1 ON h.id1 = t1.gbifID " + + " JOIN %s t2 ON h.id2 = t2.gbifID", + hiveTablePrefix + "_candidates", sourceTable, sourceTable)); + + // Compare all candidate pairs and generate the relationships + Dataset relationships = + pairs.flatMap(row -> relateRecords(row), RowEncoder.apply(RELATIONSHIP_SCHEMA)); + spark.sql("DROP TABLE IF EXISTS " + hiveTablePrefix + "_relationships"); + relationships.write().format("parquet").saveAsTable(hiveTablePrefix + "_relationships"); + return relationships; + } + + /** Partitions the relationships to match the target table layout and creates the HFiles. 
*/ + private void generateHFiles(Dataset relationships) throws IOException { + // convert to HFiles, prepared with modulo salted keys + JavaPairRDD, String> sortedRelationships = + relationships + .javaRDD() + .flatMapToPair( + row -> { + String id1 = row.getString(0); + String id2 = row.getString(1); + + // salt only on the id1 to enable prefix scanning using an occurrence ID + int salt = Math.abs(id1.hashCode()) % hbaseRegions; + String saltedRowKey = salt + ":" + id1 + ":" + id2; + + List, String>> cells = new ArrayList<>(); + cells.add(new Tuple2<>(new Tuple2<>(saltedRowKey, "id1"), id1)); + cells.add(new Tuple2<>(new Tuple2<>(saltedRowKey, "id2"), id2)); + cells.add(new Tuple2<>(new Tuple2<>(saltedRowKey, "reasons"), row.getString(2))); + cells.add(new Tuple2<>(new Tuple2<>(saltedRowKey, "dataset1"), row.getString(3))); + cells.add(new Tuple2<>(new Tuple2<>(saltedRowKey, "dataset2"), row.getString(4))); + cells.add( + new Tuple2<>(new Tuple2<>(saltedRowKey, "occurrence1"), row.getString(5))); + cells.add( + new Tuple2<>(new Tuple2<>(saltedRowKey, "occurrence2"), row.getString(6))); + + return cells.iterator(); + }) + .repartitionAndSortWithinPartitions( + new SaltPrefixPartitioner(hbaseRegions), new Tuple2StringComparator()); + + sortedRelationships + .mapToPair( + cell -> { + ImmutableBytesWritable k = new ImmutableBytesWritable(Bytes.toBytes(cell._1._1)); + Cell row = + new KeyValue( + Bytes.toBytes(cell._1._1), // key + Bytes.toBytes("o"), // column family + Bytes.toBytes(cell._1._2), // cell + Bytes.toBytes(cell._2) // cell value + ); + return new Tuple2<>(k, row); + }) + .saveAsNewAPIHadoopFile( + targetDir, + ImmutableBytesWritable.class, + KeyValue.class, + HFileOutputFormat2.class, + hadoopConf()); + } + + /** Truncates the target table preserving its layout and loads in the HFiles. */ + private void replaceHBaseTable() throws IOException { + Configuration conf = hadoopConf(); + try (Connection connection = ConnectionFactory.createConnection(conf); + HTable table = new HTable(conf, hbaseTable); + Admin admin = connection.getAdmin()) { + LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf); + + // bulkload requires files to be in hbase ownership + FsShell shell = new FsShell(conf); + try { + System.out.println("Executing chown -R hbase:hbase for " + targetDir); + shell.run(new String[] {"-chown", "-R", "hbase:hbase", targetDir}); + } catch (Exception e) { + throw new IOException("Unable to modify FS ownership to hbase", e); + } + + System.out.println(String.format("Truncating table[%s]", hbaseTable)); + Instant start = Instant.now(); + admin.disableTable(table.getName()); + admin.truncateTable(table.getName(), true); + System.out.println( + String.format( + "Table[%s] truncated in %d ms", + hbaseTable, Duration.between(start, Instant.now()).toMillis())); + System.out.println(String.format("Loading table[%s] from [%s]", hbaseTable, targetDir)); + loader.doBulkLoad(new Path(targetDir), table); + System.out.println( + String.format( + "Table [%s] truncated and reloaded in %d ms", + hbaseTable, Duration.between(start, Instant.now()).toMillis())); + removeTargetDir(); + + } catch (Exception e) { + throw new IOException(e); + } + } + + /** Removes the target directory provided it is in the /tmp location */ + private void removeTargetDir() throws IOException { + // defensive, cleaning only /tmp in hdfs (we assume people won't do /tmp/../... 
+ String regex = "/tmp/.+"; + if (targetDir.matches(regex)) { + FsShell shell = new FsShell(new Configuration()); + try { + System.out.println( + String.format( + "Deleting working directory [%s] which translates to [-rm -r -skipTrash %s ]", + targetDir, targetDir)); + + shell.run(new String[] {"-rm", "-r", "-skipTrash", targetDir}); + } catch (Exception e) { + throw new IOException("Unable to delete the working directory", e); + } + } else { + throw new IllegalArgumentIOException("Target directory must be within /tmp"); + } + } + + /** Runs the record to record comparison */ + private Iterator relateRecords(Row row) throws IOException { + Set relationships = new HashSet<>(); + + RowOccurrenceFeatures o1 = new RowOccurrenceFeatures(row, "t1_", "t1_media"); + RowOccurrenceFeatures o2 = new RowOccurrenceFeatures(row, "t2_", "t2_media"); + RelationshipAssertion assertions = + OccurrenceRelationships.generate(o1, o2); + + // store any relationship bidirectionally matching RELATIONSHIP_SCHEMA + if (assertions != null) { + relationships.add( + RowFactory.create( + assertions.getOcc1().get("gbifId"), + assertions.getOcc2().get("gbifId"), + assertions.getJustificationAsDelimited(), + assertions.getOcc1().get("datasetKey"), + assertions.getOcc2().get("datasetKey"), + assertions.getOcc1().asJson(), + assertions.getOcc2().asJson())); + + relationships.add( + RowFactory.create( + assertions.getOcc2().get("gbifId"), + assertions.getOcc1().get("gbifId"), + assertions.getJustificationAsDelimited(), + assertions.getOcc2().get("datasetKey"), + assertions.getOcc1().get("datasetKey"), + assertions.getOcc2().asJson(), + assertions.getOcc1().asJson())); + } + return relationships.iterator(); + } + + /** Creates the Hadoop configuration suitable for HDFS and HBase use. */ + private Configuration hadoopConf() throws IOException { + Configuration conf = HBaseConfiguration.create(); + conf.set("hbase.zookeeper.quorum", hbaseZK); + conf.set(FileOutputFormat.COMPRESS, "true"); + conf.setClass(FileOutputFormat.COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class); + Job job = new Job(conf, "Clustering"); // name not actually used + HTable table = new HTable(conf, hbaseTable); + HFileOutputFormat2.configureIncrementalLoad(job, table); + return job.getConfiguration(); // job created a copy of the conf + } + + /** Necessary as the Tuple2 does not implement a comparator in Java */ + static class Tuple2StringComparator implements Comparator>, Serializable { + @Override + public int compare(Tuple2 o1, Tuple2 o2) { + if (o1._1.equals(o2._1)) return o1._2.compareTo(o2._2); + else return o1._1.compareTo(o2._1); + } + } +} diff --git a/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/CommandLineParser.java b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/CommandLineParser.java new file mode 100644 index 0000000000..91dedad2e7 --- /dev/null +++ b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/CommandLineParser.java @@ -0,0 +1,53 @@ +package org.gbif.pipelines.clustering; + +import java.util.Arrays; +import java.util.List; + +/** Utility to read the named arguments. */ +class CommandLineParser { + + static Cluster.ClusterBuilder parse(String[] args) { + if (args.length != 16) { + String provided = args.length == 0 ? 
"no args" : String.join(" ", args); + throw new IllegalArgumentException("Incorrect configuration provided: " + provided); + } + Cluster.ClusterBuilder builder = nextOption(new Cluster.ClusterBuilder(), Arrays.asList(args)); + System.out.println("Clustering started with configuration: " + builder); + return builder; + } + + private static Cluster.ClusterBuilder nextOption( + Cluster.ClusterBuilder builder, List list) { + if (list.isEmpty()) return builder; + + String option = list.get(0); + List tail = list.subList(1, list.size()); + if (option.startsWith("-")) { + switch (option) { + case "--hive-db": + return nextOption(builder.hiveDB(tail.get(0)), tail.subList(1, tail.size())); + case "--source-table": + return nextOption(builder.sourceTable(tail.get(0)), tail.subList(1, tail.size())); + case "--hive-table-prefix": + return nextOption(builder.hiveTablePrefix(tail.get(0)), tail.subList(1, tail.size())); + case "--hbase-table": + return nextOption(builder.hbaseTable(tail.get(0)), tail.subList(1, tail.size())); + case "--hbase-regions": + return nextOption( + builder.hbaseRegions(Integer.parseInt(tail.get(0))), tail.subList(1, tail.size())); + case "--hbase-zk": + return nextOption(builder.hbaseZK(tail.get(0)), tail.subList(1, tail.size())); + case "--target-dir": + return nextOption(builder.targetDir(tail.get(0)), tail.subList(1, tail.size())); + case "--hash-count-threshold": + return nextOption( + builder.hashCountThreshold(Integer.parseInt(tail.get(0))), + tail.subList(1, tail.size())); + default: + throw new IllegalArgumentException("Unknown option " + option); + } + } else { + throw new IllegalArgumentException("Unknown option " + option); + } + } +} diff --git a/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/HashUtilities.java b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/HashUtilities.java index 8cb85d1bae..1fcdc8ac31 100644 --- a/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/HashUtilities.java +++ b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/HashUtilities.java @@ -3,42 +3,136 @@ import static org.gbif.pipelines.core.parsers.clustering.OccurrenceRelationships.concatIfEligible; import static org.gbif.pipelines.core.parsers.clustering.OccurrenceRelationships.hashOrNull; import static org.gbif.pipelines.core.parsers.clustering.OccurrenceRelationships.isEligibleCode; -import static org.gbif.pipelines.core.parsers.clustering.OccurrenceRelationships.isNumeric; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; import org.gbif.pipelines.core.parsers.clustering.OccurrenceFeatures; import org.gbif.pipelines.core.parsers.clustering.OccurrenceRelationships; -/** Utility functions for hashing records to pregroup. */ -public class HashUtilities { +/** Utility functions for hashing records to pre-group. 
*/ +class HashUtilities { + private static final Set SPECIMEN_BASIS_OF_RECORD_SET = + Stream.of( + "PRESERVED_SPECIMEN", + "MATERIAL_SAMPLE", + "LIVING_SPECIMEN", + "FOSSIL_SPECIMEN", + "MATERIAL_CITATION") + .collect(Collectors.toSet()); + + static Iterator recordHashes(OccurrenceFeatures o) { + Double lat = o.getDecimalLatitude(); + Double lng = o.getDecimalLongitude(); + Integer year = o.getYear(); + Integer month = o.getMonth(); + Integer day = o.getDay(); + String taxonKey = o.getTaxonKey(); + List typeStatus = o.getTypeStatus(); + List recordedBy = o.getRecordedBy(); + String speciesKey = o.getSpeciesKey(); + Set identifiers = hashCodesAndIDs(o, true); + + Set hashes = new HashSet<>(); + + // generic grouping on species, time and space + if (noNulls(lat, lng, year, month, day, speciesKey)) { + hashes.add( + RowFactory.create( + o.getId(), + o.getDatasetKey(), + String.join( + "|", + speciesKey, + String.valueOf(Math.round(lat * 1000)), + String.valueOf(Math.round(lng * 1000)), + year.toString(), + month.toString(), + day.toString()))); + } + + // identifiers overlap for the same species + if (noNulls(speciesKey)) { + for (String id : identifiers) { + hashes.add( + RowFactory.create(o.getId(), o.getDatasetKey(), String.join("|", speciesKey, id))); + } + } + + // anything claiming a type for the same name is of interest (regardless of type stated) + if (noNulls(taxonKey, typeStatus) && !typeStatus.isEmpty()) + hashes.add( + RowFactory.create(o.getId(), o.getDatasetKey(), String.join("|", taxonKey, "TYPE"))); + + // all similar species recorded by the same person within the same year are of interest + if (noNulls(taxonKey, year, recordedBy)) { + for (String r : recordedBy) { + hashes.add( + RowFactory.create(o.getId(), o.getDatasetKey(), String.join("|", year.toString(), r))); + } + } + + // append the specimen specific hashes + hashes.addAll(specimenHashes(o)); + + return hashes.iterator(); + } /** - * Hashes the occurrenceID, catalogCode and otherCatalogNumber into a Set of codes. The - * catalogNumber is constructed as it's code and also prefixed in the commonly used format of - * IC:CC:CN and IC:CN. No code in the resulting set will be numeric as the cardinality of e.g. - * catalogNumber=100 is too high to compute, and this is intended to be used to detect explicit - * relationships with otherCatalogNumber. - * - * @return a set of hashes suitable for grouping specimen related records without other filters. + * Generate hashes for specimens combining the various IDs and accepted species. Specimens often + * link by record identifiers, while other occurrence data skews here greatly for little benefit. 
*/ - public static Set hashCatalogNumbers(OccurrenceFeatures o) { - Stream cats = + private static Set specimenHashes(OccurrenceFeatures o) { + Set hashes = new HashSet<>(); + String bor = o.getBasisOfRecord(); + if (SPECIMEN_BASIS_OF_RECORD_SET.contains(bor)) { + + // non-numeric identifiers for specimens used across datasets + Set codes = hashCodesAndIDs(o, true); + for (String code : codes) { + hashes.add( + RowFactory.create( + o.getId(), + o.getDatasetKey(), + String.join("|", o.getSpeciesKey(), OccurrenceRelationships.normalizeID(code)))); + } + + // stricter code hashing (non-numeric) but without species + Set codesStrict = hashCodesAndIDs(o, false); + for (String code : codesStrict) { + hashes.add( + RowFactory.create( + o.getId(), + o.getDatasetKey(), + String.join("|", OccurrenceRelationships.normalizeID(code)))); + } + } + return hashes; + } + + /** Hashes all the various ways that record codes and identifiers are commonly used. */ + static Set hashCodesAndIDs(OccurrenceFeatures o, boolean allowNumerics) { + Stream ids = Stream.of( - hashOrNull(o.getCatalogNumber(), false), - hashOrNull(o.getOccurrenceID(), false), + hashOrNull(o.getOccurrenceID(), allowNumerics), + hashOrNull(o.getRecordNumber(), allowNumerics), + hashOrNull(o.getFieldNumber(), allowNumerics), + hashOrNull(o.getCatalogNumber(), allowNumerics), concatIfEligible( ":", o.getInstitutionCode(), o.getCollectionCode(), o.getCatalogNumber()), concatIfEligible(":", o.getInstitutionCode(), o.getCatalogNumber())); - if (o.getOtherCatalogNumbers() != null) { - cats = - Stream.concat(cats, o.getOtherCatalogNumbers().stream().map(c -> hashOrNull(c, false))); + ids = + Stream.concat( + ids, o.getOtherCatalogNumbers().stream().map(c -> hashOrNull(c, allowNumerics))); } + return ids.filter(c -> isEligibleCode(c)).collect(Collectors.toSet()); + } - return cats.map(OccurrenceRelationships::normalizeID) - .filter(c -> isEligibleCode(c) && !isNumeric(c)) - .collect(Collectors.toSet()); + /** Return true of no nulls or empty strings provided */ + static boolean noNulls(Object... o) { + return Arrays.stream(o).noneMatch(s -> s == null || "".equals(s)); } } diff --git a/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/RowOccurrenceFeatures.java b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/RowOccurrenceFeatures.java index 861408645f..2e039b3c73 100644 --- a/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/RowOccurrenceFeatures.java +++ b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/RowOccurrenceFeatures.java @@ -130,7 +130,7 @@ public String asJson() throws IOException { @Override public String getId() { - return get("id"); + return get("gbifId"); } @Override diff --git a/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/SaltPrefixPartitioner.java b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/SaltPrefixPartitioner.java new file mode 100644 index 0000000000..86d19b082d --- /dev/null +++ b/gbif/ingestion/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/SaltPrefixPartitioner.java @@ -0,0 +1,22 @@ +package org.gbif.pipelines.clustering; + +import lombok.AllArgsConstructor; +import org.apache.spark.Partitioner; +import scala.Tuple2; + +/** Partitions by the prefix on the given key extracted from the given key. 
*/ +@AllArgsConstructor +class SaltPrefixPartitioner extends Partitioner { + private final int numPartitions; + + @Override + public int getPartition(Object key) { + String k = ((Tuple2) key)._1; + return Integer.parseInt(k.substring(0, k.indexOf(":"))); + } + + @Override + public int numPartitions() { + return numPartitions; + } +} diff --git a/gbif/ingestion/clustering-gbif/src/main/scala/org/gbif/pipelines/clustering/Cluster.scala b/gbif/ingestion/clustering-gbif/src/main/scala/org/gbif/pipelines/clustering/Cluster.scala deleted file mode 100755 index 24b8e87dfc..0000000000 --- a/gbif/ingestion/clustering-gbif/src/main/scala/org/gbif/pipelines/clustering/Cluster.scala +++ /dev/null @@ -1,391 +0,0 @@ -package org.gbif.pipelines.clustering - -import java.io.File -import org.apache.hadoop.hbase.client.HTable -import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue} -import org.apache.hadoop.hbase.io.ImmutableBytesWritable -import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat, HFileOutputFormat2} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.hadoop.mapreduce.Job -import org.apache.spark.Partitioner -import org.apache.spark.sql.catalyst.encoders.RowEncoder -import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} -import org.apache.spark.sql.{Row, SparkSession} -import org.gbif.pipelines.core.parsers.clustering.{OccurrenceRelationships, RelationshipAssertion} - -import scala.collection.JavaConversions._ - -object Cluster { - - /** - * Reads the salt from the encoded key structure. - */ - class SaltPartitioner(partitionCount: Int) extends Partitioner { - - def getPartition(key: Any): Int = { - // (salt:id1:id2:type, column) - val k = key.asInstanceOf[(String, String)]._1 - Integer.valueOf(k.substring(0, k.indexOf(":"))) - } - - override def numPartitions: Int = partitionCount - } - - // To aid running in Oozie, all properties are supplied as main arguments - val usage = """ - Usage: Cluster \ - [--hive-db database] \ - [--hive-table-hashed tableName] \ - [--hive-table-candidates tableName] \ - [--hive-table-relationships tableName] \ - [--hbase-table tableName] \ - [--hbase-regions numberOfRegions] \ - [--hbase-zk zookeeperEnsemble] \ - [--hfile-dir directoryForHFiles] - """ - - def main(args: Array[String]): Unit = { - val parsedArgs = checkArgs(args) // sanitize input - assert(parsedArgs.size==8, usage) - System.err.println("Configuration: " + parsedArgs) // Oozie friendly logging use - - val hiveDatabase = parsedArgs.get('hiveDatabase).get - val hiveTableHashed = parsedArgs.get('hiveTableHashed).get - val hiveTableCandidates = parsedArgs.get('hiveTableCandidates).get - val hiveTableRelationships = parsedArgs.get('hiveTableRelationships).get - val hbaseTable = parsedArgs.get('hbaseTable).get - val hbaseRegions = parsedArgs.get('hbaseRegions).get.toInt - val hbaseZK = parsedArgs.get('hbaseZK).get - val hfileDir = parsedArgs.get('hfileDir).get - - val warehouseLocation = new File("spark-warehouse").getAbsolutePath - - val spark = SparkSession - .builder() - .appName("Occurrence clustering") - .config("spark.sql.warehouse.dir", warehouseLocation) - .enableHiveSupport() - .getOrCreate() - import spark.sql - - spark.sql("use " + hiveDatabase) - - val runAll = true; // developers: set to false to short circuit to the clustering stage - - if (runAll) { - val occurrences = sql(SQL_OCCURRENCE) - - val schema = StructType( - StructField("gbifId", StringType, nullable = false) :: - StructField("datasetKey", StringType, nullable = false) :: - 
StructField("hash", StringType, nullable = false) :: Nil - ) - val hashEncoder = RowEncoder(schema) - - import spark.implicits._ - - // species+ids for specimens only - val hashSpecimenIds = occurrences.flatMap(r => { - val records = scala.collection.mutable.ListBuffer[Row]() - - // specimens often link by record identifiers, while occurrence data skews here greatly for little benefit - val bor = Option(r.getAs[String]("basisOfRecord")) - if (specimenBORs.contains(bor.getOrElse("ignore"))) { - var idsUsed = Set[Option[String]]( - Option(r.getAs[String]("occurrenceID")), - Option(r.getAs[String]("fieldNumber")), - Option(r.getAs[String]("recordNumber")), - Option(r.getAs[String]("catalogNumber")), - triplify(r), // ic:cc:cn format - scopeCatalogNumber(r) // ic:cn format like ENA uses - ) - - val other = Option(r.getAs[Seq[String]]("otherCatalogNumbers")) - if (!other.isEmpty && other.get.length>0) { - idsUsed ++= other.get.map(x => Option(x)) - } - - // clean IDs - val filteredIds = idsUsed.filter(s => { - s match { - case Some(s) => s!=null && s.length>0 && !omitIds.contains(s.toUpperCase()) - case _ => false - } - }) - - filteredIds.foreach(id => { - id match { - case Some(id) => - val speciesKey = Option(r.getAs[Int]("speciesKey")) - if (!speciesKey.isEmpty) { - records.append( - Row( - r.getAs[String]("gbifId"), - r.getAs[String]("datasetKey"), - speciesKey + "|" + OccurrenceRelationships.normalizeID(id) - )) - } - case None => // skip - } - }) - - // a special treatment for catalogNumber and otherCatalogNumber overlap - val features = new RowOccurrenceFeatures(r) - val hashedIDs = HashUtilities.hashCatalogNumbers(features) // normalised, non-numeric codes - hashedIDs.foreach(id => { - records.append( - Row( - r.getAs[String]("gbifId"), - r.getAs[String]("datasetKey"), - id // only the ID overlap is suffice here - )) - }) - } - records - })(hashEncoder).toDF() - - val hashAll = occurrences.flatMap(r => { - val records = scala.collection.mutable.ListBuffer[Row]() - - // all records of species at same location, time should be compared regardless of BOR - // TODO: consider improving this for null values (and when one side is null) - will the ID link above suffice? 
- val lat = Option(r.getAs[Double]("decimalLatitude")) - val lng = Option(r.getAs[Double]("decimalLongitude")) - val year = Option(r.getAs[Int]("year")) - val month = Option(r.getAs[Int]("month")) - val day = Option(r.getAs[Int]("day")) - val taxonKey = Option(r.getAs[Int]("taxonKey")) - val typeStatus = Option(r.getAs[Seq[String]]("typeStatus")) - val recordedBy = Option(r.getAs[Seq[String]]("recordedBy")) - val speciesKey = Option(r.getAs[Int]("speciesKey")) - - if (!lat.isEmpty && !lng.isEmpty && !year.isEmpty && !month.isEmpty && !day.isEmpty && !speciesKey.isEmpty) { - records.append( - Row( - r.getAs[String]("gbifId"), - r.getAs[String]("datasetKey"), - r.getAs[Integer]("speciesKey") + "|" + Math.round(lat.get*1000) + "|" + Math.round(lng.get*1000) + "|" + year.get + "|" + month.get + "|" + day.get - )) - } - // any type record of a taxon is of interest - if (!taxonKey.isEmpty && !typeStatus.isEmpty) { - typeStatus.get.foreach(t => { - records.append( - Row( - r.getAs[String]("gbifId"), - r.getAs[String]("datasetKey"), - taxonKey.get + "|" + t - )) - }) - } - - // all similar species recorded by the same person within the same year is of interest (misses recordings over new year) - if (!taxonKey.isEmpty && !year.isEmpty && !recordedBy.isEmpty) { - recordedBy.get.foreach(name => { - records.append( - Row( - r.getAs[String]("gbifId"), - r.getAs[String]("datasetKey"), - taxonKey.get + "|" + year.get + "|" + name - )) - }) - } - - records - })(hashEncoder).toDF() - - val deduplicatedHashedRecords = hashAll.union(hashSpecimenIds).dropDuplicates() - - // persist for debugging, enable for further processing in SQL - deduplicatedHashedRecords.write.saveAsTable(hiveTableHashed + "_all") // for diagnostics in hive - - //deduplicatedHashedRecords.createOrReplaceTempView("DF_hashed_all") - - // defend against NxN runtime issues by capping the number of records in a candidate group to 1000 - val hashCounts = deduplicatedHashedRecords.groupBy("hash").count().withColumnRenamed("count", "c"); - hashCounts.createOrReplaceTempView("DF_hash_counts") - val hashedFiltered = sql("SELECT t1.gbifID, t1.datasetKey, t1.hash " + - "FROM " + hiveTableHashed + "_all t1 JOIN DF_hash_counts t2 ON t1.hash=t2.hash " + - "WHERE t2.c <= 1000") - hashedFiltered.write.saveAsTable(hiveTableHashed) // for diagnostics in hive - - //hashedFiltered.createOrReplaceTempView("DF_hashed") - - // Cross join to distinct pairs of records spanning 2 datasets - val candidates = sql("SELECT t1.gbifId as id1, t1.datasetKey as ds1, t2.gbifId as id2, t2.datasetKey as ds2 " + - "FROM " + - hiveTableHashed + " t1 JOIN " + hiveTableHashed + " t2 ON t1.hash = t2.hash " + - "WHERE " + - " t1.gbifId < t2.gbifId AND " + - " t1.datasetKey != t2.datasetKey " + - "GROUP BY t1.gbifId, t1.datasetKey, t2.gbifId, t2.datasetKey"); - - candidates.write.saveAsTable(hiveTableCandidates) // for diagnostics in hive - } - - // Spark DF naming convention requires that we alias each term to avoid naming collision while still having - // named fields to access (i.e. not relying the column number of the term). 
All taxa keys are converted to String - // to allow shared routines between GBIF and ALA (https://github.com/gbif/pipelines/issues/484) - val pairs = sql(""" - SELECT - - t1.gbifId AS t1_gbifId, t1.datasetKey AS t1_datasetKey, t1.basisOfRecord AS t1_basisOfRecord, t1.publishingorgkey AS t1_publishingOrgKey, t1.datasetName AS t1_datasetName, t1.publisher AS t1_publishingOrgName, - CAST(t1.kingdomKey AS String) AS t1_kingdomKey, CAST(t1.phylumKey AS String) AS t1_phylumKey, CAST(t1.classKey AS String) AS t1_classKey, CAST(t1.orderKey AS String) AS t1_orderKey, CAST(t1.familyKey AS String) AS t1_familyKey, CAST(t1.genusKey AS String) AS t1_genusKey, CAST(t1.speciesKey AS String) AS t1_speciesKey, CAST(t1.acceptedTaxonKey AS String) AS t1_acceptedTaxonKey, CAST(t1.taxonKey AS String) AS t1_taxonKey, - t1.scientificName AS t1_scientificName, t1.acceptedScientificName AS t1_acceptedScientificName, t1.kingdom AS t1_kingdom, t1.phylum AS t1_phylum, t1.order_ AS t1_order, t1.family AS t1_family, t1.genus AS t1_genus, t1.species AS t1_species, t1.genericName AS t1_genericName, t1.specificEpithet AS t1_specificEpithet, t1.taxonRank AS t1_taxonRank, - t1.typeStatus AS t1_typeStatus, t1.preparations AS t1_preparations, - t1.decimalLatitude AS t1_decimalLatitude, t1.decimalLongitude AS t1_decimalLongitude, t1.countryCode AS t1_countryCode, - t1.year AS t1_year, t1.month AS t1_month, t1.day AS t1_day, from_unixtime(floor(t1.eventDate/1000)) AS t1_eventDate, - t1.recordNumber AS t1_recordNumber, t1.fieldNumber AS t1_fieldNumber, t1.occurrenceID AS t1_occurrenceID, t1.otherCatalogNumbers AS t1_otherCatalogNumbers, t1.institutionCode AS t1_institutionCode, t1.collectionCode AS t1_collectionCode, t1.catalogNumber AS t1_catalogNumber, - t1.recordedBy AS t1_recordedBy, t1.recordedByID AS t1_recordedByID, - t1.ext_multimedia AS t1_media, - - t2.gbifId AS t2_gbifId, t2.datasetKey AS t2_datasetKey, t2.basisOfRecord AS t2_basisOfRecord, t2.publishingorgkey AS t2_publishingOrgKey, t2.datasetName AS t2_datasetName, t2.publisher AS t2_publishingOrgName, - CAST(t2.kingdomKey AS String) AS t2_kingdomKey, CAST(t2.phylumKey AS String) AS t2_phylumKey, CAST(t2.classKey AS String) AS t2_classKey, CAST(t2.orderKey AS String) AS t2_orderKey, CAST(t2.familyKey AS String) AS t2_familyKey, CAST(t2.genusKey AS String) AS t2_genusKey, CAST(t2.speciesKey AS String) AS t2_speciesKey, CAST(t2.acceptedTaxonKey AS String) AS t2_acceptedTaxonKey, CAST(t2.taxonKey AS String) AS t2_taxonKey, - t2.scientificName AS t2_scientificName, t2.acceptedScientificName AS t2_acceptedScientificName, t2.kingdom AS t2_kingdom, t2.phylum AS t2_phylum, t2.order_ AS t2_order, t2.family AS t2_family, t2.genus AS t2_genus, t2.species AS t2_species, t2.genericName AS t2_genericName, t2.specificEpithet AS t2_specificEpithet, t2.taxonRank AS t2_taxonRank, - t2.typeStatus AS t2_typeStatus, t2.preparations AS t2_preparations, - t2.decimalLatitude AS t2_decimalLatitude, t2.decimalLongitude AS t2_decimalLongitude, t2.countryCode AS t2_countryCode, - t2.year AS t2_year, t2.month AS t2_month, t2.day AS t2_day, from_unixtime(floor(t2.eventDate/1000)) AS t2_eventDate, - t2.recordNumber AS t2_recordNumber, t2.fieldNumber AS t2_fieldNumber, t2.occurrenceID AS t2_occurrenceID, t2.otherCatalogNumbers AS t2_otherCatalogNumbers, t2.institutionCode AS t2_institutionCode, t2.collectionCode AS t2_collectionCode, t2.catalogNumber AS t2_catalogNumber, - t2.recordedBy AS t2_recordedBy, t2.recordedByID AS t2_recordedByID, - t2.ext_multimedia AS t2_media - - FROM """ + 
hiveTableCandidates + """ h - JOIN occurrence t1 ON h.id1 = t1.gbifID - JOIN occurrence t2 ON h.id2 = t2.gbifID - """); - - import org.apache.spark.sql.types._ - // schema holds redundant information, but aids diagnostics in Hive at low cost - val relationshipSchema = StructType( - StructField("id1", StringType, nullable = false) :: - StructField("id2", StringType, nullable = false) :: - StructField("reasons", StringType, nullable = false) :: - StructField("dataset1", StringType, nullable = false) :: - StructField("dataset2", StringType, nullable = false) :: - StructField("o1", StringType, nullable = false) :: - StructField("o2", StringType, nullable = false) :: Nil - - ) - val relationshipEncoder = RowEncoder(relationshipSchema) - - val relationships = pairs.flatMap(row => { - val records = scala.collection.mutable.ListBuffer[Row]() - - val o1 = new RowOccurrenceFeatures(row, "t1_", "t1_media") - val o2 = new RowOccurrenceFeatures(row, "t2_", "t2_media") - - val relationships: Option[RelationshipAssertion[RowOccurrenceFeatures]] = Option(OccurrenceRelationships.generate(o1,o2)) - relationships match { - case Some(r) => { - // store both ways - records.append(Row( - r.getOcc1.get("gbifId"), - r.getOcc2.get("gbifId"), - r.getJustificationAsDelimited, - r.getOcc1.get("datasetKey"), - r.getOcc2.get("datasetKey"), - r.getOcc1.asJson(), - r.getOcc2.asJson())) - - records.append(Row( - r.getOcc2.get("gbifId"), - r.getOcc1.get("gbifId"), - r.getJustificationAsDelimited, - r.getOcc2.get("datasetKey"), - r.getOcc1.get("datasetKey"), - r.getOcc2.asJson(), - r.getOcc1.asJson())) - } - case None => // skip - } - - records - - })(relationshipEncoder).toDF().dropDuplicates() - - relationships.write.saveAsTable(hiveTableRelationships) // for diagnostics in hive - - // convert to HBase, with modulo salted keys - val relationshipsSorted = relationships.rdd.flatMap(r => { - // index based access as cannot access by schema using flatMap and rdd - val id1 = r.getString(0) - val id2 = r.getString(1) - val relationshipType = r.getString(2) - val dataset1 = r.getString(3) - val dataset2 = r.getString(4) - val occurrence1 = r.getString(5) - val occurrence2 = r.getString(6) - - // we salt in HBase only on the id1 to enable prefix scanning using an occurrence ID - val salt = Math.abs(id1.hashCode) % hbaseRegions - - val saltedRowKey = salt + ":" + id1 + ":" + id2 - val cells = scala.collection.mutable.ListBuffer[((String, String), String)]() - - // while only occurrence2 is needed it is not expensive to store each which aids diagnostics - cells.append(((saltedRowKey, "id1"),id1)) - cells.append(((saltedRowKey, "id2"),id2)) - cells.append(((saltedRowKey, "reasons"),relationshipType)) - cells.append(((saltedRowKey, "dataset1"),dataset1)) - cells.append(((saltedRowKey, "dataset2"),dataset2)) - cells.append(((saltedRowKey, "occurrence1"),occurrence1)) - cells.append(((saltedRowKey, "occurrence2"),occurrence2)) - - cells - }).repartitionAndSortWithinPartitions(new SaltPartitioner(hbaseRegions)).map(cell => { - val k = new ImmutableBytesWritable(Bytes.toBytes(cell._1._1)) - val row = new KeyValue(Bytes.toBytes(cell._1._1), // key - Bytes.toBytes("o"), // column family - Bytes.toBytes(cell._1._2), // cell - Bytes.toBytes(cell._2) // cell value - ) - - (k, row) - }) - - val conf = HBaseConfiguration.create() - conf.set("hbase.zookeeper.quorum", hbaseZK); - // NOTE: job creates a copy of the conf - val job = new Job(conf,"Relationships") // name not actually used since we don't submit MR - job.setJarByClass(this.getClass) - 
val table = new HTable(conf, hbaseTable) - HFileOutputFormat2.configureIncrementalLoad(job, table); - val conf2 = job.getConfiguration // important - - relationshipsSorted.saveAsNewAPIHadoopFile(hfileDir, classOf[ImmutableBytesWritable], classOf[KeyValue], classOf[HFileOutputFormat], conf2) - } - - /** - * Sanitizes application arguments. - */ - private def checkArgs(args: Array[String]) : Map[Symbol, String] = { - assert(args != null && args.length==16, usage) - - def nextOption(map : Map[Symbol, String], list: List[String]) : Map[Symbol, String] = { - def isSwitch(s : String) = (s(0) == '-') - list match { - case Nil => map - case "--hive-db" :: value :: tail => - nextOption(map ++ Map('hiveDatabase -> value), tail) - case "--hive-table-hashed" :: value :: tail => - nextOption(map ++ Map('hiveTableHashed -> value), tail) - case "--hive-table-candidates" :: value :: tail => - nextOption(map ++ Map('hiveTableCandidates -> value), tail) - case "--hive-table-relationships" :: value :: tail => - nextOption(map ++ Map('hiveTableRelationships -> value), tail) - case "--hbase-table" :: value :: tail => - nextOption(map ++ Map('hbaseTable -> value), tail) - case "--hbase-regions" :: value :: tail => - nextOption(map ++ Map('hbaseRegions -> value), tail) - case "--hbase-zk" :: value :: tail => - nextOption(map ++ Map('hbaseZK -> value), tail) - case "--hfile-dir" :: value :: tail => - nextOption(map ++ Map('hfileDir -> value), tail) - case option :: tail => println("Unknown option "+option) - System.exit(1) - map - } - } - nextOption(Map(), args.toList) - } -} diff --git a/gbif/ingestion/clustering-gbif/src/main/scala/org/gbif/pipelines/clustering/package.scala b/gbif/ingestion/clustering-gbif/src/main/scala/org/gbif/pipelines/clustering/package.scala deleted file mode 100644 index c028cdb0fa..0000000000 --- a/gbif/ingestion/clustering-gbif/src/main/scala/org/gbif/pipelines/clustering/package.scala +++ /dev/null @@ -1,52 +0,0 @@ -package org.gbif.pipelines - -import org.apache.spark.sql.Row - -package object clustering { - // IDs to skip - val omitIds = List("NO APLICA", "NA", "[]", "NO DISPONIBLE", "NO DISPONIBL", "NO NUMBER", "--", "UNKNOWN") - - // SPECIMENS - val specimenBORs = List("PRESERVED_SPECIMEN", "MATERIAL_SAMPLE", "LIVING_SPECIMEN", "FOSSIL_SPECIMEN", "MATERIAL_CITATION") - - // SQL to extract fields necessary for grouping for candidate pairs - val SQL_OCCURRENCE = """ -SELECT - gbifId, datasetKey, basisOfRecord, publishingorgkey, datasetName, publisher, - kingdomKey, phylumKey, classKey, orderKey, familyKey, genusKey, speciesKey, acceptedTaxonKey, taxonKey, - scientificName, acceptedScientificName, kingdom, phylum, order_ AS order, family, genus, species, genericName, specificEpithet, taxonRank, - typeStatus, preparations, - decimalLatitude, decimalLongitude, countryCode, - year, month, day, from_unixtime(floor(eventDate/1000)) AS eventDate, - recordNumber, fieldNumber, occurrenceID, otherCatalogNumbers, institutionCode, collectionCode, catalogNumber, - recordedBy, recordedByID, - ext_multimedia -FROM occurrence -""" - - case class SimpleOccurrence(gbifID: String, decimalLatitude: Double) - - /** - * @return A triplified version of the codes if all present, otherwise None - */ - def triplify(r: Row) : Option[String] = { - val ic = Option(r.getAs[String]("institutionCode")); - val cc = Option(r.getAs[String]("collectionCode")); - val cn = Option(r.getAs[String]("catalogNumber")); - - if (!ic.isEmpty && !cc.isEmpty && !cn.isEmpty) Option(ic.get + ":" + cc.get + ":" + cn.get) - else 
None - } - - /** - * @return The catalogNumber prefixed by the institutionCode if both present, otherwise None - */ - def scopeCatalogNumber(r: Row) : Option[String] = { - // This format of catalog number used by e.g. ENA datasets - val ic = Option(r.getAs[String]("institutionCode")); - val cn = Option(r.getAs[String]("catalogNumber")); - - if (!ic.isEmpty && !cn.isEmpty) Option(ic.get + ":" + cn.get) - else None - } -} diff --git a/gbif/ingestion/pom.xml b/gbif/ingestion/pom.xml index 640422d74e..eaa406794a 100644 --- a/gbif/ingestion/pom.xml +++ b/gbif/ingestion/pom.xml @@ -17,6 +17,7 @@ clustering-gbif + clustering-gbif-oozie ingest-gbif-beam ingest-gbif-fragmenter ingest-gbif-java diff --git a/gbif/validator/validator-checklists/src/main/java/org/gbif/pipelines/validator/checklists/cli/ChecklistValidatorService.java b/gbif/validator/validator-checklists/src/main/java/org/gbif/pipelines/validator/checklists/cli/ChecklistValidatorService.java index 1048ca878d..40f1a73838 100644 --- a/gbif/validator/validator-checklists/src/main/java/org/gbif/pipelines/validator/checklists/cli/ChecklistValidatorService.java +++ b/gbif/validator/validator-checklists/src/main/java/org/gbif/pipelines/validator/checklists/cli/ChecklistValidatorService.java @@ -53,8 +53,7 @@ public static ValidationWsClient createValidationWsClient( @Override protected void startUp() throws Exception { - log.info( - "Started pipelines-validator-checklist-validator service with parameters : {}", config); + log.info("Started pipelines-validator-checklist-validator service"); // Prefetch is one, since this is a long-running process. listener = new MessageListener(config.messaging.getConnectionParameters(), 1); publisher = new DefaultMessagePublisher(config.messaging.getConnectionParameters()); diff --git a/livingatlas/pipelines/pom.xml b/livingatlas/pipelines/pom.xml index 3d966eae84..fc442a6186 100644 --- a/livingatlas/pipelines/pom.xml +++ b/livingatlas/pipelines/pom.xml @@ -302,6 +302,11 @@ org.gbif.registry registry-ws-client + + + + +
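
Editor's note: the removed Scala package object's triplify and scopeCatalogNumber helpers only concatenate the catalogue codes when every part is present. An equivalent Java sketch for reference (illustration only, names kept from the Scala original):

import java.util.Optional;

// Java rendering of the deleted Scala helpers (illustration only).
public class CatalogNumberHelpers {

  /** institutionCode:collectionCode:catalogNumber, only if all three are present. */
  static Optional<String> triplify(String ic, String cc, String cn) {
    if (ic != null && cc != null && cn != null) {
      return Optional.of(ic + ":" + cc + ":" + cn);
    }
    return Optional.empty();
  }

  /** institutionCode:catalogNumber, the format used by e.g. ENA datasets. */
  static Optional<String> scopeCatalogNumber(String ic, String cn) {
    if (ic != null && cn != null) {
      return Optional.of(ic + ":" + cn);
    }
    return Optional.empty();
  }

  public static void main(String[] args) {
    System.out.println(triplify("NHMUK", "ENT", "12345"));      // Optional[NHMUK:ENT:12345]
    System.out.println(scopeCatalogNumber("ENA", "AB123456"));  // Optional[ENA:AB123456]
  }
}
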
diff --git a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/IndexRecordTransform.java b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/IndexRecordTransform.java index e88eda5361..b23bca7855 100644 --- a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/IndexRecordTransform.java +++ b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/IndexRecordTransform.java @@ -446,31 +446,13 @@ private static void applyMultimedia( if (!images.isEmpty()) { indexRecord.getStrings().put(IMAGE_ID, isr.getImageItems().get(0).getIdentifier()); - indexRecord - .getMultiValues() - .put( - IMAGE_IDS, - isr.getImageItems().stream() - .map(Image::getIdentifier) - .collect(Collectors.toList())); + indexRecord.getMultiValues().put(IMAGE_IDS, images); } if (!sounds.isEmpty()) { - indexRecord - .getMultiValues() - .put( - SOUND_IDS, - isr.getImageItems().stream() - .map(Image::getIdentifier) - .collect(Collectors.toList())); + indexRecord.getMultiValues().put(SOUND_IDS, sounds); } if (!videos.isEmpty()) { - indexRecord - .getMultiValues() - .put( - VIDEO_IDS, - isr.getImageItems().stream() - .map(Image::getIdentifier) - .collect(Collectors.toList())); + indexRecord.getMultiValues().put(VIDEO_IDS, videos); } List mir = @@ -886,6 +868,8 @@ public static Set getAddedValues() { .add(DwcTerm.associatedOccurrences.simpleName()) .add(DwcTerm.identifiedByID.simpleName()) .add(DwcTerm.recordedByID.simpleName()) + .add(STATE_CONSERVATION) + .add(COUNTRY_CONSERVATION) .build(); } @@ -907,12 +891,6 @@ private static void addSpeciesListInfo( for (ConservationStatus conservationStatus : conservationStatuses) { if (conservationStatus.getRegion() != null) { if (conservationStatus.getRegion().equalsIgnoreCase(stateProvince)) { - - if (isNotBlank(conservationStatus.getSourceStatus())) { - indexRecord - .getStrings() - .put(RAW_STATE_CONSERVATION, conservationStatus.getSourceStatus()); - } if (isNotBlank(conservationStatus.getStatus())) { indexRecord.getStrings().put(STATE_CONSERVATION, conservationStatus.getStatus()); } diff --git a/livingatlas/pipelines/src/test/java/au/org/ala/pipelines/beam/CompleteIngestPipelineTestIT.java b/livingatlas/pipelines/src/test/java/au/org/ala/pipelines/beam/CompleteIngestPipelineTestIT.java index c7b4323085..b70c4a6ee1 100644 --- a/livingatlas/pipelines/src/test/java/au/org/ala/pipelines/beam/CompleteIngestPipelineTestIT.java +++ b/livingatlas/pipelines/src/test/java/au/org/ala/pipelines/beam/CompleteIngestPipelineTestIT.java @@ -150,6 +150,10 @@ public static void checkSingleRecordContent(String currentIndexName) throws Exce assertEquals("id1|id2", record.get().get("raw_identifiedByID")); assertEquals("id1", ((List) record.get().get("identifiedByID")).get(0)); assertEquals("id2", ((List) record.get().get("identifiedByID")).get(1)); + + // raw state and raw country conservation + assertEquals("Extinct", record.get().get("raw_stateConservation")); + assertEquals("Vulnerable", record.get().get("raw_countryConservation")); } public void loadTestDataset(String datasetID, String inputPath) throws Exception { diff --git a/livingatlas/pipelines/src/test/java/au/org/ala/pipelines/converters/CoreTsvConverterTest.java b/livingatlas/pipelines/src/test/java/au/org/ala/pipelines/converters/CoreTsvConverterTest.java index 3037e7c4e4..bfbc4ef70c 100644 --- a/livingatlas/pipelines/src/test/java/au/org/ala/pipelines/converters/CoreTsvConverterTest.java +++ 
b/livingatlas/pipelines/src/test/java/au/org/ala/pipelines/converters/CoreTsvConverterTest.java @@ -826,26 +826,45 @@ public void converterTest() { .setFirstLoaded(6L) .build(); + Image im1 = + Image.newBuilder() + .setCreated("ir_Image") + .setAudience("ir_Audienc") + .setCreator("ir_Creator") + .setContributor("ir_Contributor") + .setDatasetId("ir_DatasetId") + .setLicense("ir_License") + .setLatitude(77d) + .setLongitude(777d) + .setSpatial("ir_Spatial") + .setTitle("ir_Title") + .setRightsHolder("ir_RightsHolder") + .setIdentifier("ir_Identifier1") + .setFormat("image") + .build(); + + Image im2 = + Image.newBuilder() + .setCreated("ir_Audio") + .setAudience("ir_Audienc") + .setCreator("ir_Creator") + .setContributor("ir_Contributor") + .setDatasetId("ir_DatasetId") + .setLicense("ir_License") + .setLatitude(77d) + .setLongitude(777d) + .setSpatial("ir_Spatial") + .setTitle("ir_Title") + .setRightsHolder("ir_RightsHolder") + .setIdentifier("ir_Identifier2") + .setFormat("audio") + .build(); + ImageRecord ir = ImageRecord.newBuilder() .setId(DwcTerm.occurrenceID.simpleName()) .setCreated(7L) - .setImageItems( - Collections.singletonList( - Image.newBuilder() - .setCreated("ir_Image") - .setAudience("ir_Audienc") - .setCreator("ir_Creator") - .setContributor("ir_Contributor") - .setDatasetId("ir_DatasetId") - .setLicense("ir_License") - .setLatitude(77d) - .setLongitude(777d) - .setSpatial("ir_Spatial") - .setTitle("ir_Title") - .setRightsHolder("ir_RightsHolder") - .setIdentifier("ir_Identifier") - .build())) + .setImageItems(Arrays.asList(im1, im2)) .build(); TaxonProfile tp = TaxonProfile.newBuilder().setId(DwcTerm.occurrenceID.simpleName()).build(); @@ -903,6 +922,10 @@ public void converterTest() { // Should Assert.assertEquals(String.join("\t", expected), result); + + Assert.assertEquals(1, source.getMultiValues().get("imageIDs").size()); + Assert.assertEquals(1, source.getMultiValues().get("soundIDs").size()); + Assert.assertNull(source.getMultiValues().get("videoIDs")); } @Test diff --git a/livingatlas/pipelines/src/test/resources/complete-pipeline-java/dr893/meta.xml b/livingatlas/pipelines/src/test/resources/complete-pipeline-java/dr893/meta.xml index 376fecdda4..7ff8ba40e2 100755 --- a/livingatlas/pipelines/src/test/resources/complete-pipeline-java/dr893/meta.xml +++ b/livingatlas/pipelines/src/test/resources/complete-pipeline-java/dr893/meta.xml @@ -52,5 +52,7 @@ + + diff --git a/livingatlas/pipelines/src/test/resources/complete-pipeline-java/dr893/occurrence.csv b/livingatlas/pipelines/src/test/resources/complete-pipeline-java/dr893/occurrence.csv index 934387d5fd..b9d94f985b 100644 --- a/livingatlas/pipelines/src/test/resources/complete-pipeline-java/dr893/occurrence.csv +++ b/livingatlas/pipelines/src/test/resources/complete-pipeline-java/dr893/occurrence.csv @@ -1,6 +1,6 @@ -not-an-uuid-1,,,,http://www.bowerbird.org.au/observations/49875,,Jenny Holmes,,,MORDELLIDAE,urn:lsid:biodiversity.org.au:afd.taxon:58c067e4-a55c-45bf-894c-e5edb7c771d8,family,Animalia,Arthropoda,Insecta,Coleoptera,Mordellidae,,,-37.1516626,142.8546633,EPSG:4326,,,,,,Australia,Victoria,,,2015,11,27,27/11/15,,,Pintail,Identification aligned to national taxonomy. 
Originally supplied as [[ Animalia | Arthropoda | Insecta | Coleoptera | Mordellidae | | ]],,https://biocache.ala.org.au/occurrences/fffb8bf8-caf2-47f5-8ed8-e0777570e870,nonDwcFieldSalinity,native,native,id1|id2, -not-an-uuid-2,,,,http://www.bowerbird.org.au/observations/48531,,Adam Edmonds,,,Scoparia crocospila,urn:lsid:biodiversity.org.au:afd.taxon:0b444e3f-22a2-4809-b102-a0ac65636425,species,Animalia,Arthropoda,Insecta,Lepidoptera,Crambidae,Scoparia,,-38.211987,144.322971,EPSG:4326,,,,,,Australia,Victoria,,,2015,10,2,2/10/15,,,Moth Grovedale 2-10-15,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Insecta | Lepidoptera | Pyralidae | Scoparia | ]],,https://biocache.ala.org.au/occurrences/fffcb15c-2a7f-4c32-ad80-08b3c03a40f3,salty,captive,introduced,,id3|id4 -not-an-uuid-3,,,,http://www.bowerbird.org.au/observations/98064,,Janet Grevillea,,,Argiope keyserlingi,urn:lsid:biodiversity.org.au:afd.taxon:22b63a56-adb9-468b-b086-e6552f48cee3,species,Animalia,Arthropoda,Arachnida,Araneae,Araneidae,Argiope,,-33.06852769,151.6010284,EPSG:4326,,,,,,Australia,New South Wales,,,2017,11,2,2/11/17,,,St Andrews Cross spider,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Arachnida | Araneae | Araneidae | Argiope | ]],,https://biocache.ala.org.au/occurrences/fffdad7f-93df-4ea8-97f8-b4a78bab1021,fresh,native,introduced,id1,id3 -not-an-uuid-4,,,,http://www.bowerbird.org.au/observations/4810,,Reiner Richter,,,Rattus fuscipes,urn:lsid:biodiversity.org.au:afd.taxon:96e81ce0-ba02-4f33-b831-b598eece44a7,species,Animalia,Chordata,Mammalia,Rodentia,Muridae,Rattus,Bush Rat,-38.1369,145.2833,EPSG:4326,,,,,,Australia,Victoria,,,2013,7,31,31/7/13,,,Bush Rat,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Chordata | Mammalia | Rodentia | Muridae | Rattus | ]],,https://biocache.ala.org.au/occurrences/fffe4a61-a032-45e4-b5b4-53ae230e804a,fresh,captive,other,id1|id2,id3|id4 -not-an-uuid-5,,,,http://www.bowerbird.org.au/observations/76433,,Carol Page,,,Scioglyptis chionomera,urn:lsid:biodiversity.org.au:afd.taxon:d3c54055-181d-4135-87b7-97920e6bd73e,species,Animalia,Arthropoda,Insecta,Lepidoptera,Geometridae,Scioglyptis,,-37.9909881,145.1256931,EPSG:4326,,,,,,Australia,Victoria,,,2016,11,26,26/11/16,,,Tattered Geometrid,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Insecta | Lepidoptera | Geometridae | Scioglyptis | ]],,https://biocache.ala.org.au/occurrences/ffff4965-d9bc-4baa-a7dd-84ba594fc298,salty,other,native,id1|id2,id3 -not-an-uuid-6,,,,http://www.bowerbird.org.au/observations/49878,,Jenny Holmes,,,Erythrotriorchis radiatus,urn:lsid:biodiversity.org.au:afd.taxon:1405a1d4-557c-40ac-9f44-a6d41e9136cd,species,Animalia,,,,,,,-35.260319,149.425934,EPSG:4326,,,,,,Australia,New South Wales,,,2015,11,27,27/11/15,,,,,,,,, +not-an-uuid-1,,,,http://www.bowerbird.org.au/observations/49875,,Jenny Holmes,,,MORDELLIDAE,urn:lsid:biodiversity.org.au:afd.taxon:58c067e4-a55c-45bf-894c-e5edb7c771d8,family,Animalia,Arthropoda,Insecta,Coleoptera,Mordellidae,,,-37.1516626,142.8546633,EPSG:4326,,,,,,Australia,Victoria,,,2015,11,27,27/11/15,,,Pintail,Identification aligned to national taxonomy. 
Originally supplied as [[ Animalia | Arthropoda | Insecta | Coleoptera | Mordellidae | | ]],,https://biocache.ala.org.au/occurrences/fffb8bf8-caf2-47f5-8ed8-e0777570e870,nonDwcFieldSalinity,native,native,id1|id2,,Endangered,Vulnerable +not-an-uuid-2,,,,http://www.bowerbird.org.au/observations/48531,,Adam Edmonds,,,Scoparia crocospila,urn:lsid:biodiversity.org.au:afd.taxon:0b444e3f-22a2-4809-b102-a0ac65636425,species,Animalia,Arthropoda,Insecta,Lepidoptera,Crambidae,Scoparia,,-38.211987,144.322971,EPSG:4326,,,,,,Australia,Victoria,,,2015,10,2,2/10/15,,,Moth Grovedale 2-10-15,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Insecta | Lepidoptera | Pyralidae | Scoparia | ]],,https://biocache.ala.org.au/occurrences/fffcb15c-2a7f-4c32-ad80-08b3c03a40f3,salty,captive,introduced,,id3|id4,Endangered, +not-an-uuid-3,,,,http://www.bowerbird.org.au/observations/98064,,Janet Grevillea,,,Argiope keyserlingi,urn:lsid:biodiversity.org.au:afd.taxon:22b63a56-adb9-468b-b086-e6552f48cee3,species,Animalia,Arthropoda,Arachnida,Araneae,Araneidae,Argiope,,-33.06852769,151.6010284,EPSG:4326,,,,,,Australia,New South Wales,,,2017,11,2,2/11/17,,,St Andrews Cross spider,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Arachnida | Araneae | Araneidae | Argiope | ]],,https://biocache.ala.org.au/occurrences/fffdad7f-93df-4ea8-97f8-b4a78bab1021,fresh,native,introduced,id1,id3,,Vulnerable +not-an-uuid-4,,,,http://www.bowerbird.org.au/observations/4810,,Reiner Richter,,,Rattus fuscipes,urn:lsid:biodiversity.org.au:afd.taxon:96e81ce0-ba02-4f33-b831-b598eece44a7,species,Animalia,Chordata,Mammalia,Rodentia,Muridae,Rattus,Bush Rat,-38.1369,145.2833,EPSG:4326,,,,,,Australia,Victoria,,,2013,7,31,31/7/13,,,Bush Rat,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Chordata | Mammalia | Rodentia | Muridae | Rattus | ]],,https://biocache.ala.org.au/occurrences/fffe4a61-a032-45e4-b5b4-53ae230e804a,fresh,captive,other,id1|id2,id3|id4,, +not-an-uuid-5,,,,http://www.bowerbird.org.au/observations/76433,,Carol Page,,,Scioglyptis chionomera,urn:lsid:biodiversity.org.au:afd.taxon:d3c54055-181d-4135-87b7-97920e6bd73e,species,Animalia,Arthropoda,Insecta,Lepidoptera,Geometridae,Scioglyptis,,-37.9909881,145.1256931,EPSG:4326,,,,,,Australia,Victoria,,,2016,11,26,26/11/16,,,Tattered Geometrid,Identification aligned to national taxonomy. 
Originally supplied as [[ Animalia | Arthropoda | Insecta | Lepidoptera | Geometridae | Scioglyptis | ]],,https://biocache.ala.org.au/occurrences/ffff4965-d9bc-4baa-a7dd-84ba594fc298,salty,other,native,id1|id2,id3,Extinct,Vulnerable +not-an-uuid-6,,,,http://www.bowerbird.org.au/observations/49878,,Jenny Holmes,,,Erythrotriorchis radiatus,urn:lsid:biodiversity.org.au:afd.taxon:1405a1d4-557c-40ac-9f44-a6d41e9136cd,species,Animalia,,,,,,,-35.260319,149.425934,EPSG:4326,,,,,,Australia,New South Wales,,,2015,11,27,27/11/15,,,,,,,,,,,,, diff --git a/livingatlas/pipelines/src/test/resources/complete-pipeline/dr893/meta.xml b/livingatlas/pipelines/src/test/resources/complete-pipeline/dr893/meta.xml index 376fecdda4..7ff8ba40e2 100755 --- a/livingatlas/pipelines/src/test/resources/complete-pipeline/dr893/meta.xml +++ b/livingatlas/pipelines/src/test/resources/complete-pipeline/dr893/meta.xml @@ -52,5 +52,7 @@ + + diff --git a/livingatlas/pipelines/src/test/resources/complete-pipeline/dr893/occurrence.csv b/livingatlas/pipelines/src/test/resources/complete-pipeline/dr893/occurrence.csv index 934387d5fd..2b5ffd6e82 100644 --- a/livingatlas/pipelines/src/test/resources/complete-pipeline/dr893/occurrence.csv +++ b/livingatlas/pipelines/src/test/resources/complete-pipeline/dr893/occurrence.csv @@ -1,6 +1,6 @@ -not-an-uuid-1,,,,http://www.bowerbird.org.au/observations/49875,,Jenny Holmes,,,MORDELLIDAE,urn:lsid:biodiversity.org.au:afd.taxon:58c067e4-a55c-45bf-894c-e5edb7c771d8,family,Animalia,Arthropoda,Insecta,Coleoptera,Mordellidae,,,-37.1516626,142.8546633,EPSG:4326,,,,,,Australia,Victoria,,,2015,11,27,27/11/15,,,Pintail,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Insecta | Coleoptera | Mordellidae | | ]],,https://biocache.ala.org.au/occurrences/fffb8bf8-caf2-47f5-8ed8-e0777570e870,nonDwcFieldSalinity,native,native,id1|id2, -not-an-uuid-2,,,,http://www.bowerbird.org.au/observations/48531,,Adam Edmonds,,,Scoparia crocospila,urn:lsid:biodiversity.org.au:afd.taxon:0b444e3f-22a2-4809-b102-a0ac65636425,species,Animalia,Arthropoda,Insecta,Lepidoptera,Crambidae,Scoparia,,-38.211987,144.322971,EPSG:4326,,,,,,Australia,Victoria,,,2015,10,2,2/10/15,,,Moth Grovedale 2-10-15,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Insecta | Lepidoptera | Pyralidae | Scoparia | ]],,https://biocache.ala.org.au/occurrences/fffcb15c-2a7f-4c32-ad80-08b3c03a40f3,salty,captive,introduced,,id3|id4 -not-an-uuid-3,,,,http://www.bowerbird.org.au/observations/98064,,Janet Grevillea,,,Argiope keyserlingi,urn:lsid:biodiversity.org.au:afd.taxon:22b63a56-adb9-468b-b086-e6552f48cee3,species,Animalia,Arthropoda,Arachnida,Araneae,Araneidae,Argiope,,-33.06852769,151.6010284,EPSG:4326,,,,,,Australia,New South Wales,,,2017,11,2,2/11/17,,,St Andrews Cross spider,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Arachnida | Araneae | Araneidae | Argiope | ]],,https://biocache.ala.org.au/occurrences/fffdad7f-93df-4ea8-97f8-b4a78bab1021,fresh,native,introduced,id1,id3 -not-an-uuid-4,,,,http://www.bowerbird.org.au/observations/4810,,Reiner Richter,,,Rattus fuscipes,urn:lsid:biodiversity.org.au:afd.taxon:96e81ce0-ba02-4f33-b831-b598eece44a7,species,Animalia,Chordata,Mammalia,Rodentia,Muridae,Rattus,Bush Rat,-38.1369,145.2833,EPSG:4326,,,,,,Australia,Victoria,,,2013,7,31,31/7/13,,,Bush Rat,Identification aligned to national taxonomy. 
Originally supplied as [[ Animalia | Chordata | Mammalia | Rodentia | Muridae | Rattus | ]],,https://biocache.ala.org.au/occurrences/fffe4a61-a032-45e4-b5b4-53ae230e804a,fresh,captive,other,id1|id2,id3|id4 -not-an-uuid-5,,,,http://www.bowerbird.org.au/observations/76433,,Carol Page,,,Scioglyptis chionomera,urn:lsid:biodiversity.org.au:afd.taxon:d3c54055-181d-4135-87b7-97920e6bd73e,species,Animalia,Arthropoda,Insecta,Lepidoptera,Geometridae,Scioglyptis,,-37.9909881,145.1256931,EPSG:4326,,,,,,Australia,Victoria,,,2016,11,26,26/11/16,,,Tattered Geometrid,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Insecta | Lepidoptera | Geometridae | Scioglyptis | ]],,https://biocache.ala.org.au/occurrences/ffff4965-d9bc-4baa-a7dd-84ba594fc298,salty,other,native,id1|id2,id3 -not-an-uuid-6,,,,http://www.bowerbird.org.au/observations/49878,,Jenny Holmes,,,Erythrotriorchis radiatus,urn:lsid:biodiversity.org.au:afd.taxon:1405a1d4-557c-40ac-9f44-a6d41e9136cd,species,Animalia,,,,,,,-35.260319,149.425934,EPSG:4326,,,,,,Australia,New South Wales,,,2015,11,27,27/11/15,,,,,,,,, +not-an-uuid-1,,,,http://www.bowerbird.org.au/observations/49875,,Jenny Holmes,,,MORDELLIDAE,urn:lsid:biodiversity.org.au:afd.taxon:58c067e4-a55c-45bf-894c-e5edb7c771d8,family,Animalia,Arthropoda,Insecta,Coleoptera,Mordellidae,,,-37.1516626,142.8546633,EPSG:4326,,,,,,Australia,Victoria,,,2015,11,27,27/11/15,,,Pintail,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Insecta | Coleoptera | Mordellidae | | ]],,https://biocache.ala.org.au/occurrences/fffb8bf8-caf2-47f5-8ed8-e0777570e870,nonDwcFieldSalinity,native,native,id1|id2,,Endangered,Vulnerable +not-an-uuid-2,,,,http://www.bowerbird.org.au/observations/48531,,Adam Edmonds,,,Scoparia crocospila,urn:lsid:biodiversity.org.au:afd.taxon:0b444e3f-22a2-4809-b102-a0ac65636425,species,Animalia,Arthropoda,Insecta,Lepidoptera,Crambidae,Scoparia,,-38.211987,144.322971,EPSG:4326,,,,,,Australia,Victoria,,,2015,10,2,2/10/15,,,Moth Grovedale 2-10-15,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Insecta | Lepidoptera | Pyralidae | Scoparia | ]],,https://biocache.ala.org.au/occurrences/fffcb15c-2a7f-4c32-ad80-08b3c03a40f3,salty,captive,introduced,,id3|id4,Endangered, +not-an-uuid-3,,,,http://www.bowerbird.org.au/observations/98064,,Janet Grevillea,,,Argiope keyserlingi,urn:lsid:biodiversity.org.au:afd.taxon:22b63a56-adb9-468b-b086-e6552f48cee3,species,Animalia,Arthropoda,Arachnida,Araneae,Araneidae,Argiope,,-33.06852769,151.6010284,EPSG:4326,,,,,,Australia,New South Wales,,,2017,11,2,2/11/17,,,St Andrews Cross spider,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Arachnida | Araneae | Araneidae | Argiope | ]],,https://biocache.ala.org.au/occurrences/fffdad7f-93df-4ea8-97f8-b4a78bab1021,fresh,native,introduced,id1,id3,,Vulnerable +not-an-uuid-4,,,,http://www.bowerbird.org.au/observations/4810,,Reiner Richter,,,Rattus fuscipes,urn:lsid:biodiversity.org.au:afd.taxon:96e81ce0-ba02-4f33-b831-b598eece44a7,species,Animalia,Chordata,Mammalia,Rodentia,Muridae,Rattus,Bush Rat,-38.1369,145.2833,EPSG:4326,,,,,,Australia,Victoria,,,2013,7,31,31/7/13,,,Bush Rat,Identification aligned to national taxonomy. 
Originally supplied as [[ Animalia | Chordata | Mammalia | Rodentia | Muridae | Rattus | ]],,https://biocache.ala.org.au/occurrences/fffe4a61-a032-45e4-b5b4-53ae230e804a,fresh,captive,other,id1|id2,id3|id4,, +not-an-uuid-5,,,,http://www.bowerbird.org.au/observations/76433,,Carol Page,,,Scioglyptis chionomera,urn:lsid:biodiversity.org.au:afd.taxon:d3c54055-181d-4135-87b7-97920e6bd73e,species,Animalia,Arthropoda,Insecta,Lepidoptera,Geometridae,Scioglyptis,,-37.9909881,145.1256931,EPSG:4326,,,,,,Australia,Victoria,,,2016,11,26,26/11/16,,,Tattered Geometrid,Identification aligned to national taxonomy. Originally supplied as [[ Animalia | Arthropoda | Insecta | Lepidoptera | Geometridae | Scioglyptis | ]],,https://biocache.ala.org.au/occurrences/ffff4965-d9bc-4baa-a7dd-84ba594fc298,salty,other,native,id1|id2,id3,Extinct,Vulnerable +not-an-uuid-6,,,,http://www.bowerbird.org.au/observations/49878,,Jenny Holmes,,,Erythrotriorchis radiatus,urn:lsid:biodiversity.org.au:afd.taxon:1405a1d4-557c-40ac-9f44-a6d41e9136cd,species,Animalia,,,,,,,-35.260319,149.425934,EPSG:4326,,,,,,Australia,New South Wales,,,2015,11,27,27/11/15,,,,,,,,,,, diff --git a/livingatlas/solr/conf/managed-schema b/livingatlas/solr/conf/managed-schema index 109740a550..b51638686e 100644 --- a/livingatlas/solr/conf/managed-schema +++ b/livingatlas/solr/conf/managed-schema @@ -17,7 +17,7 @@ + @@ -538,6 +539,7 @@ + @@ -712,6 +714,7 @@ + diff --git a/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/BasicInterpreter.java b/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/BasicInterpreter.java index c547d7614b..528db783aa 100644 --- a/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/BasicInterpreter.java +++ b/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/BasicInterpreter.java @@ -7,12 +7,14 @@ import static org.gbif.api.vocabulary.OccurrenceIssue.OCCURRENCE_STATUS_INFERRED_FROM_INDIVIDUAL_COUNT; import static org.gbif.api.vocabulary.OccurrenceIssue.OCCURRENCE_STATUS_UNPARSABLE; import static org.gbif.api.vocabulary.OccurrenceIssue.TYPE_STATUS_INVALID; +import static org.gbif.pipelines.core.utils.ModelUtils.DEFAULT_SEPARATOR; import static org.gbif.pipelines.core.utils.ModelUtils.addIssue; -import static org.gbif.pipelines.core.utils.ModelUtils.extractOptListValue; +import static org.gbif.pipelines.core.utils.ModelUtils.extractListValue; import static org.gbif.pipelines.core.utils.ModelUtils.extractOptValue; import com.google.common.base.Strings; import java.util.ArrayList; +import java.util.List; import java.util.Optional; import java.util.function.BiConsumer; import java.util.function.Consumer; @@ -287,27 +289,42 @@ public static BiConsumer interpretOccurrenceStatus( /** {@link DwcTerm#otherCatalogNumbers} interpretation. */ public static void interpretOtherCatalogNumbers(ExtendedRecord er, BasicRecord br) { - extractOptListValue(er, DwcTerm.otherCatalogNumbers).ifPresent(br::setOtherCatalogNumbers); + List list = extractListValue(DEFAULT_SEPARATOR + "|;", er, DwcTerm.otherCatalogNumbers); + if (!list.isEmpty()) { + br.setOtherCatalogNumbers(list); + } } /** {@link DwcTerm#recordedBy} interpretation. */ public static void interpretRecordedBy(ExtendedRecord er, BasicRecord br) { - extractOptListValue(er, DwcTerm.recordedBy).ifPresent(br::setRecordedBy); + List list = extractListValue(er, DwcTerm.recordedBy); + if (!list.isEmpty()) { + br.setRecordedBy(list); + } } /** {@link DwcTerm#identifiedBy} interpretation. 
*/ public static void interpretIdentifiedBy(ExtendedRecord er, BasicRecord br) { - extractOptListValue(er, DwcTerm.identifiedBy).ifPresent(br::setIdentifiedBy); + List list = extractListValue(er, DwcTerm.identifiedBy); + if (!list.isEmpty()) { + br.setIdentifiedBy(list); + } } /** {@link DwcTerm#preparations} interpretation. */ public static void interpretPreparations(ExtendedRecord er, BasicRecord br) { - extractOptListValue(er, DwcTerm.preparations).ifPresent(br::setPreparations); + List list = extractListValue(er, DwcTerm.preparations); + if (!list.isEmpty()) { + br.setPreparations(list); + } } /** {@link org.gbif.dwc.terms.GbifTerm#projectId} interpretation. */ public static void interpretProjectId(ExtendedRecord er, BasicRecord br) { - extractOptListValue(er, GbifTerm.projectId).ifPresent(br::setProjectId); + List list = extractListValue(er, GbifTerm.projectId); + if (!list.isEmpty()) { + br.setProjectId(list); + } } /** Sets the coreId field. */ diff --git a/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/CoreInterpreter.java b/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/CoreInterpreter.java index 9bd6853d92..fb56487b3c 100644 --- a/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/CoreInterpreter.java +++ b/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/CoreInterpreter.java @@ -2,8 +2,8 @@ import static org.gbif.api.vocabulary.OccurrenceIssue.REFERENCES_URI_INVALID; import static org.gbif.pipelines.core.utils.ModelUtils.addIssue; +import static org.gbif.pipelines.core.utils.ModelUtils.extractListValue; import static org.gbif.pipelines.core.utils.ModelUtils.extractNullAwareOptValue; -import static org.gbif.pipelines.core.utils.ModelUtils.extractOptListValue; import static org.gbif.pipelines.core.utils.ModelUtils.extractOptValue; import static org.gbif.pipelines.core.utils.ModelUtils.extractValue; @@ -78,12 +78,18 @@ public static void interpretLicense(ExtendedRecord er, Consumer consumer /** {@link DwcTerm#datasetID} interpretation. */ public static void interpretDatasetID(ExtendedRecord er, Consumer> consumer) { - extractOptListValue(er, DwcTerm.datasetID).ifPresent(consumer); + List list = extractListValue(er, DwcTerm.datasetID); + if (!list.isEmpty()) { + consumer.accept(list); + } } /** {@link DwcTerm#datasetName} interpretation. */ public static void interpretDatasetName(ExtendedRecord er, Consumer> consumer) { - extractOptListValue(er, DwcTerm.datasetName).ifPresent(consumer); + List list = extractListValue(er, DwcTerm.datasetName); + if (!list.isEmpty()) { + consumer.accept(list); + } } /** {@link DwcTerm#parentEventID} interpretation. */ @@ -134,7 +140,10 @@ public static BiConsumer interpretLineages( /** {@link DwcTerm#samplingProtocol} interpretation. */ public static void interpretSamplingProtocol(ExtendedRecord er, Consumer> consumer) { - extractOptListValue(er, DwcTerm.samplingProtocol).ifPresent(consumer); + List list = extractListValue(er, DwcTerm.samplingProtocol); + if (!list.isEmpty()) { + consumer.accept(list); + } } /** {@link DwcTerm#locationID} interpretation. 
*/ diff --git a/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/GrscicollInterpreter.java b/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/GrscicollInterpreter.java index 59fd7c1481..c6b2ef0f8f 100644 --- a/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/GrscicollInterpreter.java +++ b/sdks/core/src/main/java/org/gbif/pipelines/core/interpreters/core/GrscicollInterpreter.java @@ -134,7 +134,9 @@ private static boolean isSpecimenRecord(ExtendedRecord er) { return bor == BasisOfRecord.PRESERVED_SPECIMEN || bor == BasisOfRecord.FOSSIL_SPECIMEN - || bor == BasisOfRecord.LIVING_SPECIMEN; + || bor == BasisOfRecord.LIVING_SPECIMEN + || bor == BasisOfRecord.MATERIAL_SAMPLE + || bor == BasisOfRecord.MATERIAL_CITATION; } @VisibleForTesting diff --git a/sdks/core/src/main/java/org/gbif/pipelines/core/utils/ModelUtils.java b/sdks/core/src/main/java/org/gbif/pipelines/core/utils/ModelUtils.java index b1c1fc2084..c7a791e45e 100644 --- a/sdks/core/src/main/java/org/gbif/pipelines/core/utils/ModelUtils.java +++ b/sdks/core/src/main/java/org/gbif/pipelines/core/utils/ModelUtils.java @@ -2,13 +2,14 @@ import static org.gbif.pipelines.core.utils.IdentificationUtils.extractFromIdentificationExtension; +import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; -import java.util.stream.Stream; import lombok.AccessLevel; import lombok.NoArgsConstructor; import org.gbif.api.vocabulary.Extension; @@ -68,15 +69,20 @@ public static Optional extractOptValue(ExtendedRecord er, Term term) { return Optional.ofNullable(extractValue(er, term)); } - public static Optional> extractOptListValue(ExtendedRecord er, Term term) { + public static List extractListValue(ExtendedRecord er, Term term) { + return extractListValue(DEFAULT_SEPARATOR, er, term); + } + + public static List extractListValue(String separatorRegex, ExtendedRecord er, Term term) { return extractOptValue(er, term) .filter(x -> !x.isEmpty()) .map( x -> - Stream.of(x.split(DEFAULT_SEPARATOR)) + Arrays.stream(x.split(separatorRegex)) .map(String::trim) .filter(v -> !v.isEmpty()) - .collect(Collectors.toList())); + .collect(Collectors.toList())) + .orElse(Collections.emptyList()); } public static boolean hasValue(ExtendedRecord er, Term term) { diff --git a/sdks/core/src/test/java/org/gbif/pipelines/core/interpreters/core/BasicInterpreterTest.java b/sdks/core/src/test/java/org/gbif/pipelines/core/interpreters/core/BasicInterpreterTest.java index 6e20428902..da9798c649 100644 --- a/sdks/core/src/test/java/org/gbif/pipelines/core/interpreters/core/BasicInterpreterTest.java +++ b/sdks/core/src/test/java/org/gbif/pipelines/core/interpreters/core/BasicInterpreterTest.java @@ -268,6 +268,28 @@ public void interpretOtherCatalogNumbersTest() { assertIssueSize(br, 0); } + @Test + public void interpretSemicolonOtherCatalogNumbersTest() { + final String number1 = "111"; + final String number2 = "22"; + + // State + Map coreMap = new HashMap<>(1); + coreMap.put(DwcTerm.otherCatalogNumbers.qualifiedName(), number1 + " ; " + number2 + " ; "); + ExtendedRecord er = ExtendedRecord.newBuilder().setId(ID).setCoreTerms(coreMap).build(); + + BasicRecord br = BasicRecord.newBuilder().setId(ID).build(); + + // When + BasicInterpreter.interpretOtherCatalogNumbers(er, br); + + // Should + Assert.assertEquals(2, br.getOtherCatalogNumbers().size()); + 
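
Editor's note: this test exercises the widened separator in interpretOtherCatalogNumbers, which now splits on the default separator or ';', trims each token, drops empties, and only sets the field when something survives. A standalone sketch of the splitting step, assuming DEFAULT_SEPARATOR is the escaped pipe "\\|" (the pipe-delimited test data suggests this, but it is not shown in the patch):

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

// Standalone rendering of the new extractListValue splitting logic; the real
// method reads the term value from the ExtendedRecord, this inlines that step.
public class ListSplitDemo {

  static List<String> split(String raw, String separatorRegex) {
    if (raw == null || raw.isEmpty()) {
      return Collections.emptyList();
    }
    return Arrays.stream(raw.split(separatorRegex))
        .map(String::trim)
        .filter(v -> !v.isEmpty())
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    // otherCatalogNumbers additionally accepts ';', as in the test value "111 ; 22 ; "
    String otherCatalogNumbersSeparator = "\\|" + "|;";
    System.out.println(split("111 ; 22 ; ", otherCatalogNumbersSeparator)); // [111, 22]
    System.out.println(split("id1|id2", "\\|"));                            // [id1, id2]
  }
}
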
Assert.assertTrue(br.getOtherCatalogNumbers().contains(number1)); + Assert.assertTrue(br.getOtherCatalogNumbers().contains(number2)); + assertIssueSize(br, 0); + } + @Test public void interpretRecordedByTest() { final String person1 = "person 1";