diff --git a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/hive/HiveDatasetFinder.java b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/hive/HiveDatasetFinder.java index a73ae704865..b3a3c846e92 100644 --- a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/hive/HiveDatasetFinder.java +++ b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/hive/HiveDatasetFinder.java @@ -17,7 +17,6 @@ package org.apache.gobblin.data.management.copy.hive; -import com.google.common.base.Throwables; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.net.URISyntaxException; @@ -25,12 +24,7 @@ import java.util.Iterator; import java.util.List; import java.util.Properties; - -import javax.annotation.Nonnull; - -import lombok.Data; -import lombok.Getter; -import lombok.extern.slf4j.Slf4j; +import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.reflect.ConstructorUtils; @@ -43,12 +37,18 @@ import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; +import com.google.common.base.Throwables; import com.google.common.collect.AbstractIterator; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; +import javax.annotation.Nonnull; +import lombok.Data; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; + import org.apache.gobblin.config.client.ConfigClient; import org.apache.gobblin.config.client.ConfigClientCache; import org.apache.gobblin.config.client.ConfigClientUtils; @@ -80,6 +80,9 @@ public class HiveDatasetFinder implements IterableDatasetFinder { public static final String DEFAULT_TABLE_PATTERN = "*"; public static final String TABLE_FILTER = HIVE_DATASET_PREFIX + ".tableFilter"; + // Property used to filter tables only physically within a folder, represented by a regex + public static final String TABLE_FOLDER_ALLOWLIST_FILTER = HIVE_DATASET_PREFIX + ".tableFolderAllowlistFilter"; + /* * By setting the prefix, only config keys with this prefix will be used to build a HiveDataset. * By passing scoped configurations the same config keys can be used in different contexts. @@ -118,6 +121,8 @@ public class HiveDatasetFinder implements IterableDatasetFinder { protected final Function configStoreDatasetUriBuilder; protected final Optional> tableFilter; + protected final Optional tableFolderAllowlistRegex; + protected final String datasetConfigPrefix; protected final ConfigClient configClient; private final Config jobConfig; @@ -194,6 +199,8 @@ protected HiveDatasetFinder(FileSystem fs, Properties properties, HiveMetastoreC } else { this.tableFilter = Optional.absent(); } + this.tableFolderAllowlistRegex = properties.containsKey(TABLE_FOLDER_ALLOWLIST_FILTER) ? + Optional.of(Pattern.compile(properties.getProperty(TABLE_FOLDER_ALLOWLIST_FILTER))): Optional.absent(); } protected static HiveMetastoreClientPool createClientPool(Properties properties) throws IOException { @@ -262,7 +269,10 @@ protected HiveDataset computeNext() { try (AutoReturnableObject client = HiveDatasetFinder.this.clientPool.getClient()) { Table table = client.get().getTable(dbAndTable.getDb(), dbAndTable.getTable()); - if (tableFilter.isPresent() && !tableFilter.get().apply(table)) { + if ((tableFilter.isPresent() && !tableFilter.get().apply(table)) + || !shouldAllowTableLocation(tableFolderAllowlistRegex, table)) { + log.info("Ignoring table {} as its underlying location {} does not pass allowlist regex {}", dbAndTable, + table.getSd().getLocation(), tableFolderAllowlistRegex.get()); continue; } @@ -294,6 +304,12 @@ protected HiveDataset computeNext() { }; } + protected static boolean shouldAllowTableLocation(Optional regex, Table table) { + if (!regex.isPresent()) { + return true; + } + return regex.get().matcher(table.getSd().getLocation()).matches(); + } /** * @deprecated Use {@link #createHiveDataset(Table, Config)} instead diff --git a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/hive/HiveDatasetFinderTest.java b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/hive/HiveDatasetFinderTest.java index 07945202551..a9805d39540 100644 --- a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/hive/HiveDatasetFinderTest.java +++ b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/hive/HiveDatasetFinderTest.java @@ -215,6 +215,51 @@ public void testDatasetConfig() throws Exception { } + @Test + public void testHiveTableFolderAllowlistFilter() throws Exception { + List dbAndTables = Lists.newArrayList(); + dbAndTables.add(new HiveDatasetFinder.DbAndTable("db1", "table1")); + // This table is created on /tmp/test + HiveMetastoreClientPool pool = getTestPool(dbAndTables); + + Properties properties = new Properties(); + properties.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST, ""); + // Try a regex with multiple groups + properties.put(HiveDatasetFinder.TABLE_FOLDER_ALLOWLIST_FILTER, "(/tmp/|a).*"); + + HiveDatasetFinder finder = new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), properties, pool); + List datasets = Lists.newArrayList(finder.getDatasetsIterator()); + + Assert.assertEquals(datasets.size(), 1); + + properties.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST, ""); + // The table located at /tmp/test should be filtered + properties.put(HiveDatasetFinder.TABLE_FOLDER_ALLOWLIST_FILTER, "/a/b"); + + finder = new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), properties, pool); + datasets = Lists.newArrayList(finder.getDatasetsIterator()); + + Assert.assertEquals(datasets.size(), 0); + + // Test empty filter + properties.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST, ""); + // The table located at /tmp/test should be filtered + properties.put(HiveDatasetFinder.TABLE_FOLDER_ALLOWLIST_FILTER, ""); + + finder = new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), properties, pool); + datasets = Lists.newArrayList(finder.getDatasetsIterator()); + + Assert.assertEquals(datasets.size(), 0); + + // Test no regex config + properties.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST, ""); + + finder = new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), properties, pool); + datasets = Lists.newArrayList(finder.getDatasetsIterator()); + + Assert.assertEquals(datasets.size(), 0); + } + private HiveMetastoreClientPool getTestPool(List dbAndTables) throws Exception { SetMultimap entities = HashMultimap.create();