From fd963afda8411773d1a36833583b95ba939c63e1 Mon Sep 17 00:00:00 2001 From: mozhenghua Date: Thu, 26 May 2022 12:08:10 +0800 Subject: [PATCH 1/7] add hudi support for TIS --- .../org/apache/hudi/common/fs/FSUtils.java | 45 +- .../hudi/common/table/HoodieTableConfig.java | 3 + .../hudi/sink/utils/HiveSyncContext.java | 4 +- .../org/apache/hudi/hive/HiveSyncTool.java | 608 +++++++++--------- .../apache/hudi/hive/HoodieHiveClient.java | 1 + packaging/hudi-flink-bundle/package.sh | 2 +- packaging/hudi-utilities-bundle/package.sh | 1 + tisInstall.sh | 1 + 8 files changed, 344 insertions(+), 321 deletions(-) create mode 100644 packaging/hudi-utilities-bundle/package.sh create mode 100644 tisInstall.sh diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java index 8461fe0b3526..ff5904de5a5d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java @@ -53,6 +53,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -95,30 +96,42 @@ public static Configuration prepareHadoopConf(Configuration conf) { return conf; } - static ServiceLoader extraFileSystemLoader; + static IExtraHadoopFileSystemGetter extraFileSystemLoader; public static FileSystem getFs(String path, Configuration conf) { - + if (extraFileSystemLoader == null || FSUtils.class.getClassLoader() != extraFileSystemLoader.getClass().getClassLoader()) { + LOG.info("start to get create instance of extraFileSystemLoader"); + extraFileSystemLoader = getExtraFileSystemLoader(conf, 0); + } if (extraFileSystemLoader == null) { - extraFileSystemLoader - = ServiceLoader.load(IExtraHadoopFileSystemGetter.class, FSUtils.class.getClassLoader()); + throw new IllegalStateException("extraFileSystemLoader can not be null"); } - for (IExtraHadoopFileSystemGetter loader : extraFileSystemLoader) { - LOG.info("load hdfs of " + path + " from extrnal System"); - return loader.getHadoopFileSystem(path); + LOG.info("load hdfs of " + path + " from extrnal System"); + return extraFileSystemLoader.getHadoopFileSystem(path); + } + + private static IExtraHadoopFileSystemGetter getExtraFileSystemLoader(Configuration conf, int retryCount) { + + try { + Thread.sleep(2000); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + + ServiceLoader svcLoader + = ServiceLoader.load(IExtraHadoopFileSystemGetter.class, FSUtils.class.getClassLoader()); + Iterator it = svcLoader.iterator(); + while (it.hasNext()) { + return it.next(); } -// FileSystem fs; -// prepareHadoopConf(conf); -// try { -// fs = new Path(path).getFileSystem(conf); -// } catch (IOException e) { -// throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), e); -// } -// return fs; - throw new IllegalStateException("has not find any extraFileSystemLoader"); + if (retryCount < 3) { + return getExtraFileSystemLoader(conf, retryCount + 1); + } else { + throw new IllegalStateException("has not find any extraFileSystemLoader,retryCount:" + retryCount); + } } public static FileSystem getFs(String path, Configuration conf, boolean localByDefault) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index e4b60e2ea385..59caac3cffc6 100644 --- 
a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -293,6 +293,9 @@ public static void delete(FileSystem fs, Path metadataFolder, Set delete */ public static void create(FileSystem fs, Path metadataFolder, Properties properties) throws IOException { +// if(1==1){ +// throw new IllegalStateException(""); +// } if (!fs.exists(metadataFolder)) { fs.mkdirs(metadataFolder); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java index 1c051c8cd230..c928e483940d 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java @@ -19,6 +19,7 @@ package org.apache.hudi.sink.utils; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HiveSyncTool; @@ -75,7 +76,8 @@ private static HiveSyncConfig buildSyncConfig(Configuration conf) { hiveSyncConfig.hiveUser = conf.getString(FlinkOptions.HIVE_SYNC_USERNAME); hiveSyncConfig.hivePass = conf.getString(FlinkOptions.HIVE_SYNC_PASSWORD); hiveSyncConfig.jdbcUrl = conf.getString(FlinkOptions.HIVE_SYNC_JDBC_URL); - hiveSyncConfig.partitionFields = Arrays.asList(FilePathUtils.extractPartitionKeys(conf)); + // hiveSyncConfig.partitionFields = Arrays.asList(FilePathUtils.extractPartitionKeys(conf)); + hiveSyncConfig.partitionFields = Arrays.asList(org.apache.hadoop.util.StringUtils.split(conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_FIELDS))); hiveSyncConfig.partitionValueExtractorClass = conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME); hiveSyncConfig.useJdbc = conf.getBoolean(FlinkOptions.HIVE_SYNC_USE_JDBC); hiveSyncConfig.useFileListingFromMetadata = conf.getBoolean(FlinkOptions.METADATA_ENABLED); diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index b37b28ed2763..ef66d22cb618 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -18,7 +18,9 @@ package org.apache.hudi.hive; -import com.beust.jcommander.JCommander; +import static org.apache.parquet.schema.OriginalType.UTF8; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hive.conf.HiveConf; @@ -49,8 +51,7 @@ import java.util.Map; import java.util.stream.Collectors; -import static org.apache.parquet.schema.OriginalType.UTF8; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import com.beust.jcommander.JCommander; /** * Tool to sync a hoodie HDFS table with a hive metastore table. 
Either use it as a api @@ -62,325 +63,326 @@ @SuppressWarnings("WeakerAccess") public class HiveSyncTool extends AbstractSyncTool { - private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class); - public static final String SUFFIX_SNAPSHOT_TABLE = "_rt"; - public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro"; - - protected final HiveSyncConfig cfg; - protected HoodieHiveClient hoodieHiveClient = null; - protected String snapshotTableName = null; - protected Option roTableName = null; - - public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { - super(configuration.getAllProperties(), fs); - - try { - this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs); - } catch (RuntimeException e) { - if (cfg.ignoreExceptions) { - LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e); - } else { - throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e); - } - } + private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class); + public static final String SUFFIX_SNAPSHOT_TABLE = "_rt"; + public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro"; + + protected final HiveSyncConfig cfg; + protected HoodieHiveClient hoodieHiveClient = null; + protected String snapshotTableName = null; + protected Option roTableName = null; + + public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { + super(configuration.getAllProperties(), fs); + + try { + this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs); + } catch (RuntimeException e) { + if (cfg.ignoreExceptions) { + LOG.error("Got runtime exception when hive syncing, but continuing as ignoreExceptions config is set ", e); + } else { + throw new HoodieHiveSyncException("Got runtime exception when hive syncing", e); + } + } - this.cfg = cfg; - // Set partitionFields to empty, when the NonPartitionedExtractor is used - if (NonPartitionedExtractor.class.getName().equals(cfg.partitionValueExtractorClass)) { - LOG.warn("Set partitionFields to empty, since the NonPartitionedExtractor is used"); - cfg.partitionFields = new ArrayList<>(); - } - if (hoodieHiveClient != null) { - switch (hoodieHiveClient.getTableType()) { - case COPY_ON_WRITE: - this.snapshotTableName = cfg.tableName; - this.roTableName = Option.empty(); - break; - case MERGE_ON_READ: - this.snapshotTableName = cfg.tableName + SUFFIX_SNAPSHOT_TABLE; - this.roTableName = cfg.skipROSuffix ? 
Option.of(cfg.tableName) : - Option.of(cfg.tableName + SUFFIX_READ_OPTIMIZED_TABLE); - break; - default: - LOG.error("Unknown table type " + hoodieHiveClient.getTableType()); - throw new InvalidTableException(hoodieHiveClient.getBasePath()); - } - } - } - - @Override - public void syncHoodieTable() { - try { - if (hoodieHiveClient != null) { - doSync(); - } - } catch (RuntimeException re) { - throw new HoodieException("Got runtime exception when hive syncing " + cfg.tableName, re); - } finally { - if (hoodieHiveClient != null) { - hoodieHiveClient.close(); - } + this.cfg = cfg; + // Set partitionFields to empty, when the NonPartitionedExtractor is used + if (NonPartitionedExtractor.class.getName().equals(cfg.partitionValueExtractorClass)) { + LOG.warn("Set partitionFields to empty, since the NonPartitionedExtractor is used"); + cfg.partitionFields = new ArrayList<>(); + } + if (hoodieHiveClient != null) { + switch (hoodieHiveClient.getTableType()) { + case COPY_ON_WRITE: + this.snapshotTableName = cfg.tableName; + this.roTableName = Option.empty(); + break; + case MERGE_ON_READ: + this.snapshotTableName = cfg.tableName + SUFFIX_SNAPSHOT_TABLE; + this.roTableName = cfg.skipROSuffix ? Option.of(cfg.tableName) : + Option.of(cfg.tableName + SUFFIX_READ_OPTIMIZED_TABLE); + break; + default: + LOG.error("Unknown table type " + hoodieHiveClient.getTableType()); + throw new InvalidTableException(hoodieHiveClient.getBasePath()); + } + } } - } - - protected void doSync() { - switch (hoodieHiveClient.getTableType()) { - case COPY_ON_WRITE: - syncHoodieTable(snapshotTableName, false, false); - break; - case MERGE_ON_READ: - // sync a RO table for MOR - syncHoodieTable(roTableName.get(), false, true); - // sync a RT table for MOR - syncHoodieTable(snapshotTableName, true, false); - break; - default: - LOG.error("Unknown table type " + hoodieHiveClient.getTableType()); - throw new InvalidTableException(hoodieHiveClient.getBasePath()); + + @Override + public void syncHoodieTable() { + try { + if (hoodieHiveClient != null) { + doSync(); + } + } catch (RuntimeException re) { + throw new HoodieException("Got runtime exception when hive syncing " + cfg.tableName, re); + } finally { + if (hoodieHiveClient != null) { + hoodieHiveClient.close(); + } + } } - } - - protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, - boolean readAsOptimized) { - LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieHiveClient.getBasePath() - + " of type " + hoodieHiveClient.getTableType()); - - // check if the database exists else create it - if (cfg.autoCreateDatabase) { - try { - if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) { - hoodieHiveClient.createDatabase(cfg.databaseName); + + protected void doSync() { + switch (hoodieHiveClient.getTableType()) { + case COPY_ON_WRITE: + syncHoodieTable(snapshotTableName, false, false); + break; + case MERGE_ON_READ: + // sync a RO table for MOR + syncHoodieTable(roTableName.get(), false, true); + // sync a RT table for MOR + syncHoodieTable(snapshotTableName, true, false); + break; + default: + LOG.error("Unknown table type " + hoodieHiveClient.getTableType()); + throw new InvalidTableException(hoodieHiveClient.getBasePath()); } - } catch (Exception e) { - // this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing - LOG.warn("Unable to create database", e); - } - } else { - if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) { - throw new 
HoodieHiveSyncException("hive database does not exist " + cfg.databaseName); - } } - // Check if the necessary table exists - boolean tableExists = hoodieHiveClient.doesTableExist(tableName); + protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, + boolean readAsOptimized) { + LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieHiveClient.getBasePath() + + " of type " + hoodieHiveClient.getTableType()); + + // check if the database exists else create it + if (cfg.autoCreateDatabase) { + try { + if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) { + hoodieHiveClient.createDatabase(cfg.databaseName); + } + } catch (Exception e) { + // this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing + LOG.warn("Unable to create database", e); + } + } else { + if (!hoodieHiveClient.doesDataBaseExist(cfg.databaseName)) { + throw new HoodieHiveSyncException("hive database does not exist " + cfg.databaseName); + } + } - // check if isDropPartition - boolean isDropPartition = hoodieHiveClient.isDropPartition(); + // Check if the necessary table exists + boolean tableExists = hoodieHiveClient.doesTableExist(tableName); - // Get the parquet schema for this table looking at the latest commit - MessageType schema = hoodieHiveClient.getDataSchema(); + // check if isDropPartition + boolean isDropPartition = hoodieHiveClient.isDropPartition(); - // Currently HoodieBootstrapRelation does support reading bootstrap MOR rt table, - // so we disable the syncAsSparkDataSourceTable here to avoid read such kind table - // by the data source way (which will use the HoodieBootstrapRelation). - // TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071], we can remove this logical. - if (hoodieHiveClient.isBootstrap() - && hoodieHiveClient.getTableType() == HoodieTableType.MERGE_ON_READ - && !readAsOptimized) { - cfg.syncAsSparkDataSourceTable = false; - } + // Get the parquet schema for this table looking at the latest commit + MessageType schema = hoodieHiveClient.getDataSchema(); - // Sync schema if needed - boolean schemaChanged = syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema); + // Currently HoodieBootstrapRelation does support reading bootstrap MOR rt table, + // so we disable the syncAsSparkDataSourceTable here to avoid read such kind table + // by the data source way (which will use the HoodieBootstrapRelation). + // TODO after we support bootstrap MOR rt table in HoodieBootstrapRelation[HUDI-2071], we can remove this logical. + if (hoodieHiveClient.isBootstrap() + && hoodieHiveClient.getTableType() == HoodieTableType.MERGE_ON_READ + && !readAsOptimized) { + cfg.syncAsSparkDataSourceTable = false; + } - LOG.info("Schema sync complete. Syncing partitions for " + tableName); - // Get the last time we successfully synced partitions - Option lastCommitTimeSynced = Option.empty(); - if (tableExists) { - lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced(tableName); - } - LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null")); - List writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced); - LOG.info("Storage partitions scan complete. 
Found " + writtenPartitionsSince.size()); - - // Sync the partitions if needed - boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince, isDropPartition); - boolean meetSyncConditions = schemaChanged || partitionsChanged; - if (!cfg.isConditionalSync || meetSyncConditions) { - hoodieHiveClient.updateLastCommitTimeSynced(tableName); - } - LOG.info("Sync complete for " + tableName); - } - - /** - * Get the latest schema from the last commit and check if its in sync with the hive table schema. If not, evolves the - * table schema. - * - * @param tableExists - does table exist - * @param schema - extracted schema - */ - private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, - boolean readAsOptimized, MessageType schema) { - // Append spark table properties & serde properties - Map tableProperties = ConfigUtils.toMap(cfg.tableProperties); - Map serdeProperties = ConfigUtils.toMap(cfg.serdeProperties); - if (cfg.syncAsSparkDataSourceTable) { - Map sparkTableProperties = getSparkTableProperties(cfg.sparkSchemaLengthThreshold, schema); - Map sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized); - tableProperties.putAll(sparkTableProperties); - serdeProperties.putAll(sparkSerdeProperties); - } - boolean schemaChanged = false; - // Check and sync schema - if (!tableExists) { - LOG.info("Hive table " + tableName + " is not found. Creating it"); - HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase()); - String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat); - - if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) { - // Parquet input format had an InputFormat class visible under the old naming scheme. - inputFormatClassName = useRealTimeInputFormat - ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName() - : com.uber.hoodie.hadoop.HoodieInputFormat.class.getName(); - } - - String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat); - String serDeFormatClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat); - - // Custom serde will not work with ALTER TABLE REPLACE COLUMNS - // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive - // /ql/exec/DDLTask.java#L3488 - hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, - outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties); - schemaChanged = true; - } else { - // Check if the table schema has evolved - Map tableSchema = hoodieHiveClient.getTableSchema(tableName); - SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields, cfg.supportTimestamp); - if (!schemaDiff.isEmpty()) { - LOG.info("Schema difference found for " + tableName); - hoodieHiveClient.updateTableDefinition(tableName, schema); - // Sync the table properties if the schema has changed - if (cfg.tableProperties != null || cfg.syncAsSparkDataSourceTable) { - hoodieHiveClient.updateTableProperties(tableName, tableProperties); - LOG.info("Sync table properties for " + tableName + ", table properties is: " + tableProperties); + // Sync schema if needed + boolean schemaChanged = syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema); + + LOG.info("Schema sync complete. 
Syncing partitions for " + tableName); + // Get the last time we successfully synced partitions + Option lastCommitTimeSynced = Option.empty(); + if (tableExists) { + lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced(tableName); } - schemaChanged = true; - } else { - LOG.info("No Schema difference for " + tableName); - } - } - return schemaChanged; - } - - /** - * Get Spark Sql related table properties. This is used for spark datasource table. - * @param schema The schema to write to the table. - * @return A new parameters added the spark's table properties. - */ - private Map getSparkTableProperties(int schemaLengthThreshold, MessageType schema) { - // Convert the schema and partition info used by spark sql to hive table properties. - // The following code refers to the spark code in - // https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala - GroupType originGroupType = schema.asGroupType(); - List partitionNames = cfg.partitionFields; - List partitionCols = new ArrayList<>(); - List dataCols = new ArrayList<>(); - Map column2Field = new HashMap<>(); - - for (Type field : originGroupType.getFields()) { - column2Field.put(field.getName(), field); + LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null")); + List writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced); + LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size()); + + // Sync the partitions if needed + boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince, isDropPartition); + boolean meetSyncConditions = schemaChanged || partitionsChanged; + if (!cfg.isConditionalSync || meetSyncConditions) { + hoodieHiveClient.updateLastCommitTimeSynced(tableName); + } + LOG.info("Sync complete for " + tableName); } - // Get partition columns and data columns. - for (String partitionName : partitionNames) { - // Default the unknown partition fields to be String. - // Keep the same logical with HiveSchemaUtil#getPartitionKeyType. - partitionCols.add(column2Field.getOrDefault(partitionName, - new PrimitiveType(Type.Repetition.REQUIRED, BINARY, partitionName, UTF8))); + + /** + * Get the latest schema from the last commit and check if its in sync with the hive table schema. If not, evolves the + * table schema. + * + * @param tableExists - does table exist + * @param schema - extracted schema + */ + private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, + boolean readAsOptimized, MessageType schema) { + // Append spark table properties & serde properties + Map tableProperties = ConfigUtils.toMap(cfg.tableProperties); + Map serdeProperties = ConfigUtils.toMap(cfg.serdeProperties); + if (cfg.syncAsSparkDataSourceTable) { + Map sparkTableProperties = getSparkTableProperties(cfg.sparkSchemaLengthThreshold, schema); + Map sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized); + tableProperties.putAll(sparkTableProperties); + serdeProperties.putAll(sparkSerdeProperties); + } + boolean schemaChanged = false; + // Check and sync schema + if (!tableExists) { + LOG.info("Hive table " + tableName + " is not found. 
Creating it"); + HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase()); + String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat); + + if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) { + // Parquet input format had an InputFormat class visible under the old naming scheme. + inputFormatClassName = useRealTimeInputFormat + ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName() + : com.uber.hoodie.hadoop.HoodieInputFormat.class.getName(); + } + + String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat); + String serDeFormatClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat); + + // Custom serde will not work with ALTER TABLE REPLACE COLUMNS + // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive + // /ql/exec/DDLTask.java#L3488 + hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, + outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties); + schemaChanged = true; + } else { + // Check if the table schema has evolved + Map tableSchema = hoodieHiveClient.getTableSchema(tableName); + SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields, cfg.supportTimestamp); + if (!schemaDiff.isEmpty()) { + LOG.info("Schema difference found for " + tableName + ",difference:" + schemaDiff.toString()); + hoodieHiveClient.updateTableDefinition(tableName, schema); + // Sync the table properties if the schema has changed + if (cfg.tableProperties != null || cfg.syncAsSparkDataSourceTable) { + hoodieHiveClient.updateTableProperties(tableName, tableProperties); + LOG.info("Sync table properties for " + tableName + ", table properties is: " + tableProperties); + } + schemaChanged = true; + } else { + LOG.info("No Schema difference for " + tableName); + } + } + return schemaChanged; } - for (Type field : originGroupType.getFields()) { - if (!partitionNames.contains(field.getName())) { - dataCols.add(field); - } + /** + * Get Spark Sql related table properties. This is used for spark datasource table. + * + * @param schema The schema to write to the table. + * @return A new parameters added the spark's table properties. + */ + private Map getSparkTableProperties(int schemaLengthThreshold, MessageType schema) { + // Convert the schema and partition info used by spark sql to hive table properties. + // The following code refers to the spark code in + // https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala + GroupType originGroupType = schema.asGroupType(); + List partitionNames = cfg.partitionFields; + List partitionCols = new ArrayList<>(); + List dataCols = new ArrayList<>(); + Map column2Field = new HashMap<>(); + + for (Type field : originGroupType.getFields()) { + column2Field.put(field.getName(), field); + } + // Get partition columns and data columns. + for (String partitionName : partitionNames) { + // Default the unknown partition fields to be String. + // Keep the same logical with HiveSchemaUtil#getPartitionKeyType. 
+ partitionCols.add(column2Field.getOrDefault(partitionName, + new PrimitiveType(Type.Repetition.REQUIRED, BINARY, partitionName, UTF8))); + } + + for (Type field : originGroupType.getFields()) { + if (!partitionNames.contains(field.getName())) { + dataCols.add(field); + } + } + + List reOrderedFields = new ArrayList<>(); + reOrderedFields.addAll(dataCols); + reOrderedFields.addAll(partitionCols); + GroupType reOrderedType = new GroupType(originGroupType.getRepetition(), originGroupType.getName(), reOrderedFields); + + Map sparkProperties = new HashMap<>(); + sparkProperties.put("spark.sql.sources.provider", "hudi"); + // Split the schema string to multi-parts according the schemaLengthThreshold size. + String schemaString = Parquet2SparkSchemaUtils.convertToSparkSchemaJson(reOrderedType); + int numSchemaPart = (schemaString.length() + schemaLengthThreshold - 1) / schemaLengthThreshold; + sparkProperties.put("spark.sql.sources.schema.numParts", String.valueOf(numSchemaPart)); + // Add each part of schema string to sparkProperties + for (int i = 0; i < numSchemaPart; i++) { + int start = i * schemaLengthThreshold; + int end = Math.min(start + schemaLengthThreshold, schemaString.length()); + sparkProperties.put("spark.sql.sources.schema.part." + i, schemaString.substring(start, end)); + } + // Add partition columns + if (!partitionNames.isEmpty()) { + sparkProperties.put("spark.sql.sources.schema.numPartCols", String.valueOf(partitionNames.size())); + for (int i = 0; i < partitionNames.size(); i++) { + sparkProperties.put("spark.sql.sources.schema.partCol." + i, partitionNames.get(i)); + } + } + return sparkProperties; } - List reOrderedFields = new ArrayList<>(); - reOrderedFields.addAll(dataCols); - reOrderedFields.addAll(partitionCols); - GroupType reOrderedType = new GroupType(originGroupType.getRepetition(), originGroupType.getName(), reOrderedFields); - - Map sparkProperties = new HashMap<>(); - sparkProperties.put("spark.sql.sources.provider", "hudi"); - // Split the schema string to multi-parts according the schemaLengthThreshold size. - String schemaString = Parquet2SparkSchemaUtils.convertToSparkSchemaJson(reOrderedType); - int numSchemaPart = (schemaString.length() + schemaLengthThreshold - 1) / schemaLengthThreshold; - sparkProperties.put("spark.sql.sources.schema.numParts", String.valueOf(numSchemaPart)); - // Add each part of schema string to sparkProperties - for (int i = 0; i < numSchemaPart; i++) { - int start = i * schemaLengthThreshold; - int end = Math.min(start + schemaLengthThreshold, schemaString.length()); - sparkProperties.put("spark.sql.sources.schema.part." + i, schemaString.substring(start, end)); + private Map getSparkSerdeProperties(boolean readAsOptimized) { + Map sparkSerdeProperties = new HashMap<>(); + sparkSerdeProperties.put("path", cfg.basePath); + sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized)); + return sparkSerdeProperties; } - // Add partition columns - if (!partitionNames.isEmpty()) { - sparkProperties.put("spark.sql.sources.schema.numPartCols", String.valueOf(partitionNames.size())); - for (int i = 0; i < partitionNames.size(); i++) { - sparkProperties.put("spark.sql.sources.schema.partCol." + i, partitionNames.get(i)); - } + + /** + * Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the + * partition path does not match, it updates the partition path). 
+ */ + private boolean syncPartitions(String tableName, List writtenPartitionsSince, boolean isDropPartition) { + boolean partitionsChanged; + try { + List hivePartitions = hoodieHiveClient.scanTablePartitions(tableName); + List partitionEvents = + hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, isDropPartition); + + List newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD); + if (!newPartitions.isEmpty()) { + LOG.info("New Partitions " + newPartitions); + hoodieHiveClient.addPartitionsToTable(tableName, newPartitions); + } + + List updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE); + if (!updatePartitions.isEmpty()) { + LOG.info("Changed Partitions " + updatePartitions); + hoodieHiveClient.updatePartitionsToTable(tableName, updatePartitions); + } + + List dropPartitions = filterPartitions(partitionEvents, PartitionEventType.DROP); + if (!dropPartitions.isEmpty()) { + LOG.info("Drop Partitions " + dropPartitions); + hoodieHiveClient.dropPartitionsToTable(tableName, dropPartitions); + } + + partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty() || !dropPartitions.isEmpty(); + } catch (Exception e) { + throw new HoodieHiveSyncException("Failed to sync partitions for table " + tableName, e); + } + return partitionsChanged; } - return sparkProperties; - } - - private Map getSparkSerdeProperties(boolean readAsOptimized) { - Map sparkSerdeProperties = new HashMap<>(); - sparkSerdeProperties.put("path", cfg.basePath); - sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized)); - return sparkSerdeProperties; - } - - /** - * Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the - * partition path does not match, it updates the partition path). 
- */ - private boolean syncPartitions(String tableName, List writtenPartitionsSince, boolean isDropPartition) { - boolean partitionsChanged; - try { - List hivePartitions = hoodieHiveClient.scanTablePartitions(tableName); - List partitionEvents = - hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince, isDropPartition); - - List newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD); - if (!newPartitions.isEmpty()) { - LOG.info("New Partitions " + newPartitions); - hoodieHiveClient.addPartitionsToTable(tableName, newPartitions); - } - - List updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE); - if (!updatePartitions.isEmpty()) { - LOG.info("Changed Partitions " + updatePartitions); - hoodieHiveClient.updatePartitionsToTable(tableName, updatePartitions); - } - - List dropPartitions = filterPartitions(partitionEvents, PartitionEventType.DROP); - if (!dropPartitions.isEmpty()) { - LOG.info("Drop Partitions " + dropPartitions); - hoodieHiveClient.dropPartitionsToTable(tableName, dropPartitions); - } - - partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty() || !dropPartitions.isEmpty(); - } catch (Exception e) { - throw new HoodieHiveSyncException("Failed to sync partitions for table " + tableName, e); + + private List filterPartitions(List events, PartitionEventType eventType) { + return events.stream().filter(s -> s.eventType == eventType).map(s -> s.storagePartition) + .collect(Collectors.toList()); } - return partitionsChanged; - } - - private List filterPartitions(List events, PartitionEventType eventType) { - return events.stream().filter(s -> s.eventType == eventType).map(s -> s.storagePartition) - .collect(Collectors.toList()); - } - - public static void main(String[] args) { - // parse the params - final HiveSyncConfig cfg = new HiveSyncConfig(); - JCommander cmd = new JCommander(cfg, null, args); - if (cfg.help || args.length == 0) { - cmd.usage(); - System.exit(1); + + public static void main(String[] args) { + // parse the params + final HiveSyncConfig cfg = new HiveSyncConfig(); + JCommander cmd = new JCommander(cfg, null, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration()); + HiveConf hiveConf = new HiveConf(); + hiveConf.addResource(fs.getConf()); + new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable(); } - FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration()); - HiveConf hiveConf = new HiveConf(); - hiveConf.addResource(fs.getConf()); - new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable(); - } } diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java index acaf6caa18e6..2165f84390a6 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java @@ -187,6 +187,7 @@ List getPartitionEvents(List tablePartitions, List Date: Tue, 21 Jun 2022 11:59:09 +0800 Subject: [PATCH 2/7] add scala2.12 profile release --- .../org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java | 2 +- .../main/java/org/apache/hudi/common/config/ConfigProperty.java | 2 +- tisInstall.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java index bc84ece50348..fd48def3b7eb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java @@ -100,7 +100,7 @@ public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException TimestampBasedAvroKeyGenerator(TypedProperties config, String recordKeyField, String partitionPathField) throws IOException { super(config, recordKeyField, partitionPathField); String dateTimeParserClass = config.getString(Config.DATE_TIME_PARSER_PROP, HoodieDateTimeParserImpl.class.getName()); - this.parser = KeyGenUtils.createDateTimeParser(config, dateTimeParserClass); + this.parser = KeyGenUtils.h(config, dateTimeParserClass); this.inputDateTimeZone = parser.getInputDateTimeZone(); this.outputDateTimeZone = parser.getOutputDateTimeZone(); this.outputDateFormat = parser.getOutputDateFormat(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java index 961291430358..a5c5c9f527ad 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java @@ -166,4 +166,4 @@ public ConfigProperty noDefaultValue() { return configProperty; } } -} \ No newline at end of file +} diff --git a/tisInstall.sh b/tisInstall.sh index b69694bc97f2..7bcd82e90aca 100644 --- a/tisInstall.sh +++ b/tisInstall.sh @@ -1 +1 @@ -mvn install -Dmaven.test.skip=true -pl hudi-flink,hudi-common,packaging/hudi-flink-bundle +mvn install -Dmaven.test.skip=true -pl hudi-flink,hudi-common,packaging/hudi-flink-bundle,packaging/hudi-utilities-bundle -Pscala-2.12 From d57e0c3106fe39799efa40f2e81e6938b1448825 Mon Sep 17 00:00:00 2001 From: mozhenghua Date: Tue, 21 Jun 2022 12:01:13 +0800 Subject: [PATCH 3/7] Revert "add scala2.12 profile release" This reverts commit 8da9956e --- .../org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java | 2 +- .../main/java/org/apache/hudi/common/config/ConfigProperty.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java index fd48def3b7eb..bc84ece50348 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java @@ -100,7 +100,7 @@ public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException TimestampBasedAvroKeyGenerator(TypedProperties config, String recordKeyField, String partitionPathField) throws IOException { super(config, recordKeyField, partitionPathField); String dateTimeParserClass = config.getString(Config.DATE_TIME_PARSER_PROP, HoodieDateTimeParserImpl.class.getName()); - this.parser = KeyGenUtils.h(config, dateTimeParserClass); + this.parser = KeyGenUtils.createDateTimeParser(config, dateTimeParserClass); this.inputDateTimeZone = parser.getInputDateTimeZone(); this.outputDateTimeZone = parser.getOutputDateTimeZone(); this.outputDateFormat = parser.getOutputDateFormat(); diff --git 
a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java index a5c5c9f527ad..961291430358 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigProperty.java @@ -166,4 +166,4 @@ public ConfigProperty noDefaultValue() { return configProperty; } } -} +} \ No newline at end of file From b4dc030a40a2026053bd0181cf955d46c0600cf9 Mon Sep 17 00:00:00 2001 From: mozhenghua Date: Wed, 22 Jun 2022 09:00:56 +0800 Subject: [PATCH 4/7] add scala2.12 profile release --- tisInstall.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tisInstall.sh b/tisInstall.sh index 7bcd82e90aca..3d7be95754f5 100644 --- a/tisInstall.sh +++ b/tisInstall.sh @@ -1 +1,4 @@ -mvn install -Dmaven.test.skip=true -pl hudi-flink,hudi-common,packaging/hudi-flink-bundle,packaging/hudi-utilities-bundle -Pscala-2.12 +# for spark3 +#mvn install -Pspark3,flink-bundle-shade-hive3 -Dmaven.test.skip=true -pl hudi-flink,hudi-common,packaging/hudi-flink-bundle,packaging/hudi-utilities-bundle + +mvn deploy -Pspark3,flink-bundle-shade-hive3 -Dmaven.test.skip=true -pl hudi-flink,hudi-common,packaging/hudi-flink-bundle,packaging/hudi-utilities-bundle -DaltDeploymentRepository=base::default::http://localhost:8080/release From 8dc62ffedb9723c044e88a7dd9014ee684136b4a Mon Sep 17 00:00:00 2001 From: mozhenghua Date: Fri, 15 Jul 2022 11:24:41 +0800 Subject: [PATCH 5/7] some fix --- .../org/apache/hudi/common/fs/FSUtils.java | 76 +- packaging/hudi-hadoop-mr-bundle/pom.xml | 366 +- pom.xml | 3152 +++++++++-------- tisInstall.sh | 20 +- 4 files changed, 1835 insertions(+), 1779 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java index ff5904de5a5d..8c3a30dcaed4 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java @@ -96,43 +96,51 @@ public static Configuration prepareHadoopConf(Configuration conf) { return conf; } - static IExtraHadoopFileSystemGetter extraFileSystemLoader; - + // static IExtraHadoopFileSystemGetter extraFileSystemLoader; public static FileSystem getFs(String path, Configuration conf) { - if (extraFileSystemLoader == null || FSUtils.class.getClassLoader() != extraFileSystemLoader.getClass().getClassLoader()) { - LOG.info("start to get create instance of extraFileSystemLoader"); - extraFileSystemLoader = getExtraFileSystemLoader(conf, 0); - } - if (extraFileSystemLoader == null) { - throw new IllegalStateException("extraFileSystemLoader can not be null"); - } - - LOG.info("load hdfs of " + path + " from extrnal System"); - return extraFileSystemLoader.getHadoopFileSystem(path); - } - - private static IExtraHadoopFileSystemGetter getExtraFileSystemLoader(Configuration conf, int retryCount) { - + FileSystem fs; + prepareHadoopConf(conf); try { - Thread.sleep(2000); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - - ServiceLoader svcLoader - = ServiceLoader.load(IExtraHadoopFileSystemGetter.class, FSUtils.class.getClassLoader()); - Iterator it = svcLoader.iterator(); - while (it.hasNext()) { - return it.next(); - } - - if (retryCount < 3) { - return getExtraFileSystemLoader(conf, retryCount + 1); - } else { - throw new IllegalStateException("has not find any extraFileSystemLoader,retryCount:" + 
retryCount); - } - } + LOG.info("load hdfs of " + path + " from extrnal System"); + fs = new Path(path).getFileSystem(conf); + } catch (IOException e) { + throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), e); + } + return fs; +// if (extraFileSystemLoader == null || FSUtils.class.getClassLoader() != extraFileSystemLoader.getClass().getClassLoader()) { +// LOG.info("start to get create instance of extraFileSystemLoader"); +// extraFileSystemLoader = getExtraFileSystemLoader(conf, 0); +// } +// if (extraFileSystemLoader == null) { +// throw new IllegalStateException("extraFileSystemLoader can not be null"); +// } +// +// LOG.info("load hdfs of " + path + " from extrnal System"); +// return extraFileSystemLoader.getHadoopFileSystem(path); + } + +// private static IExtraHadoopFileSystemGetter getExtraFileSystemLoader(Configuration conf, int retryCount) { +// +// try { +// Thread.sleep(2000); +// } catch (InterruptedException e) { +// throw new RuntimeException(e); +// } +// +// ServiceLoader svcLoader +// = ServiceLoader.load(IExtraHadoopFileSystemGetter.class, FSUtils.class.getClassLoader()); +// Iterator it = svcLoader.iterator(); +// while (it.hasNext()) { +// return it.next(); +// } +// +// if (retryCount < 3) { +// return getExtraFileSystemLoader(conf, retryCount + 1); +// } else { +// throw new IllegalStateException("has not find any extraFileSystemLoader,retryCount:" + retryCount); +// } +// } public static FileSystem getFs(String path, Configuration conf, boolean localByDefault) { if (localByDefault) { diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index 06899a087ebc..c3c0c7c6510a 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -15,192 +15,200 @@ See the License for the specific language governing permissions and limitations under the License. --> - - - hudi - org.apache.hudi - 0.10.1 - ../../pom.xml - - 4.0.0 - hudi-hadoop-mr-bundle - jar + + + hudi + org.apache.hudi + 0.10.1 + ../../pom.xml + + 4.0.0 + hudi-hadoop-mr-bundle + jar - - true - ${project.parent.basedir} - + + true + ${project.parent.basedir} + - - - - org.apache.rat - apache-rat-plugin - - - org.apache.maven.plugins - maven-shade-plugin - ${maven-shade-plugin.version} - - - package - - shade - - - ${shadeSources} - ${project.build.directory}/dependency-reduced-pom.xml - - - - - - true - - - META-INF/LICENSE - target/classes/META-INF/LICENSE - - - - - org.apache.hudi:hudi-common - org.apache.hudi:hudi-hadoop-mr + + + + org.apache.rat + apache-rat-plugin + + + org.apache.maven.plugins + maven-shade-plugin + ${maven-shade-plugin.version} + + + + + + + package + + shade + + + ${shadeSources} + ${project.build.directory}/dependency-reduced-pom.xml + + + + + + true + + + META-INF/LICENSE + target/classes/META-INF/LICENSE + + + + + org.apache.hudi:hudi-common + org.apache.hudi:hudi-hadoop-mr - org.apache.parquet:parquet-avro - com.esotericsoftware:kryo-shaded - org.objenesis:objenesis - com.esotericsoftware:minlog - org.apache.avro:avro - org.apache.hbase:hbase-common - org.apache.hbase:hbase-client - org.apache.hbase:hbase-protocol - org.apache.hbase:hbase-server - org.apache.htrace:htrace-core - com.yammer.metrics:metrics-core - com.google.guava:guava - - - - - com.yammer.metrics. - org.apache.hudi.com.yammer.metrics. - - - com.esotericsoftware.kryo. - org.apache.hudi.com.esotericsoftware.kryo. - - - org.objenesis. - org.apache.hudi.org.objenesis. - - - com.esotericsoftware.minlog. 
- org.apache.hudi.com.esotericsoftware.minlog. - - - org.apache.avro. - org.apache.hudi.org.apache.avro. - - - com.google.common. - org.apache.hudi.com.google.common. - - - false - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - META-INF/services/javax.* - - - - ${project.artifactId}-${project.version} - - - - - - - - src/main/resources - - - src/test/resources - - - + org.apache.parquet:parquet-avro + com.esotericsoftware:kryo-shaded + org.objenesis:objenesis + com.esotericsoftware:minlog + org.apache.avro:avro + org.apache.hbase:hbase-common + org.apache.hbase:hbase-client + org.apache.hbase:hbase-protocol + org.apache.hbase:hbase-server + org.apache.htrace:htrace-core + com.yammer.metrics:metrics-core + com.google.guava:guava + + + + + com.yammer.metrics. + org.apache.hudi.com.yammer.metrics. + + + com.esotericsoftware.kryo. + org.apache.hudi.com.esotericsoftware.kryo. + + + org.objenesis. + org.apache.hudi.org.objenesis. + + + com.esotericsoftware.minlog. + org.apache.hudi.com.esotericsoftware.minlog. + + + org.apache.avro. + org.apache.hudi.org.apache.avro. + + + com.google.common. + org.apache.hudi.com.google.common. + + + false + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + META-INF/services/javax.* + + + + ${project.artifactId}-${project.version} + + + + + + + + src/main/resources + + + src/test/resources + + + - - - - org.apache.hudi - hudi-common - ${project.version} - - - org.apache.hudi - hudi-hadoop-mr - ${project.version} - + + + + org.apache.hudi + hudi-common + ${project.version} + + + org.apache.hudi + hudi-hadoop-mr + ${project.version} + - - - org.apache.parquet - parquet-avro - compile - + + + org.apache.parquet + parquet-avro + compile + - - org.apache.avro - avro - compile - + + org.apache.avro + avro + compile + - - org.apache.htrace - htrace-core - ${htrace.version} - compile - + + org.apache.htrace + htrace-core + ${htrace.version} + compile + - - org.apache.hbase - hbase-common - ${hbase.version} - + + org.apache.hbase + hbase-common + ${hbase.version} + - - org.apache.hbase - hbase-server - ${hbase.version} - compile - - - org.apache.hbase - hbase-common - - - javax.servlet - * - - - org.codehaus.jackson - * - - - org.mortbay.jetty - * - - - tomcat - * - - - - + + org.apache.hbase + hbase-server + ${hbase.version} + compile + + + org.apache.hbase + hbase-common + + + javax.servlet + * + + + org.codehaus.jackson + * + + + org.mortbay.jetty + * + + + tomcat + * + + + + diff --git a/pom.xml b/pom.xml index 0a4394c36e14..f8aeabcffb0e 100644 --- a/pom.xml +++ b/pom.xml @@ -17,1621 +17,1643 @@ limitations under the License. 
--> - - 4.0.0 + + 4.0.0 - - org.apache - apache - 21 - + + org.apache + apache + 21 + - org.apache.hudi - hudi - pom - 0.10.1 - Apache Hudi brings stream style processing on big data - https://github.com/apache/hudi - Hudi + org.apache.hudi + hudi + pom + 0.10.1 + Apache Hudi brings stream style processing on big data + https://github.com/apache/hudi + Hudi - - hudi-common - hudi-cli - hudi-client - hudi-aws - hudi-hadoop-mr - hudi-spark-datasource - hudi-timeline-service - hudi-utilities - hudi-sync - packaging/hudi-hadoop-mr-bundle - packaging/hudi-hive-sync-bundle - packaging/hudi-spark-bundle - packaging/hudi-presto-bundle - packaging/hudi-utilities-bundle - packaging/hudi-timeline-server-bundle - docker/hoodie/hadoop - hudi-integ-test - packaging/hudi-integ-test-bundle - hudi-examples - hudi-flink - hudi-kafka-connect - packaging/hudi-flink-bundle - packaging/hudi-kafka-connect-bundle - + + hudi-common + hudi-cli + hudi-client + hudi-aws + hudi-hadoop-mr + hudi-spark-datasource + hudi-timeline-service + hudi-utilities + hudi-sync + packaging/hudi-hadoop-mr-bundle + packaging/hudi-hive-sync-bundle + packaging/hudi-spark-bundle + packaging/hudi-presto-bundle + packaging/hudi-utilities-bundle + packaging/hudi-timeline-server-bundle + docker/hoodie/hadoop + hudi-integ-test + packaging/hudi-integ-test-bundle + hudi-examples + hudi-flink + hudi-kafka-connect + packaging/hudi-flink-bundle + packaging/hudi-kafka-connect-bundle + - - - Apache License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - - + + + Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + - - The Apache Software Foundation - https://www.apache.org - + + The Apache Software Foundation + https://www.apache.org + - - 3.2.0 - 3.0.0-M4 - 3.0.0-M4 - 3.2.4 - 3.1.1 - 3.8.0 - 2.4 - 0.15 - 1.7 - 3.0.0-M1 - 0.37.0 + + 3.2.0 + 3.0.0-M4 + 3.0.0-M4 + 3.2.4 + 3.1.1 + 3.8.0 + 2.4 + 0.15 + 1.7 + 3.0.0-M1 + 0.37.0 - 1.8 - 2.6.7 - 2.6.7.3 - 2.6.7.1 - 2.7.4 - 2.10.0 - 2.0.0 - 5.3.4 - 2.17 - 1.10.1 - 5.7.0-M1 - 5.7.0-M1 - 1.7.0-M1 - 3.3.3 - 1.2.17 - 2.17.0 - 1.7.30 - 2.9.9 - 2.7.3 - org.apache.hive - 2.3.1 - core - 4.1.1 - 1.6.0 - 0.16 - 0.8.0 - 4.4.1 - ${spark2.version} - - 1.13.1 - 2.4.4 - 3.1.2 - hudi-spark2 - 1.8.2 - 2.11.12 - 2.12.10 - ${scala11.version} - 2.11 - 0.12 - 3.3.1 - 3.0.1 - file://${project.basedir}/src/test/resources/log4j-surefire.properties - 0.12.0 - 9.4.15.v20190215 - 3.1.0-incubating - 1.2.3 - 1.9.13 - 1.4.199 - 3.1.2 - false - ${skipTests} - ${skipTests} - ${skipTests} - ${skipTests} - ${skipTests} - ${skipTests} - UTF-8 - ${project.basedir} - provided - - compile - org.apache.hudi.spark. - provided - - -Xmx2g - 0.8.5 - compile - org.apache.hudi. 
- true - 2.7.1 - 4.7 - 1.12.22 - 3.17.3 - 3.1.0 - 1.1.0 - 8000 - http://localhost:${dynamodb-local.port} - + 1.8 + 2.6.7 + 2.6.7.3 + 2.6.7.1 + 2.7.4 + 2.10.0 + 2.0.0 + 5.3.4 + 2.17 + 1.10.1 + 5.7.0-M1 + 5.7.0-M1 + 1.7.0-M1 + 3.3.3 + 1.2.17 + 2.17.0 + 1.7.30 + 2.9.9 + 2.7.3 + org.apache.hive + 2.3.1 + core + 4.1.1 + 1.6.0 + 0.16 + 0.8.0 + 4.4.1 + ${spark2.version} + + 1.13.1 + 2.4.4 - - scm:git:git@github.com:apache/hudi.git - scm:git:git@github.com:apache/hudi.git - git@github.com:apache/hudi.git - HEAD - + + + 3.2.1 + hudi-spark2 + 1.8.2 + 2.11.12 + 2.12.10 + ${scala11.version} + 2.11 + 0.12 + 3.3.1 + 3.0.1 + file://${project.basedir}/src/test/resources/log4j-surefire.properties + + 0.12.0 + 9.4.15.v20190215 + 3.1.0-incubating + 1.2.3 + 1.9.13 + 1.4.199 + 3.1.2 + false + ${skipTests} + ${skipTests} + ${skipTests} + ${skipTests} + ${skipTests} + ${skipTests} + UTF-8 + ${project.basedir} + provided + + compile + org.apache.hudi.spark. + provided + + -Xmx2g + 0.8.5 + compile + org.apache.hudi. + true + 2.7.1 + 4.7 + 1.12.22 + 3.17.3 + 3.1.0 + 1.1.0 + 8000 + http://localhost:${dynamodb-local.port} + - - JIRA - https://issues.apache.org/jira/browse/HUDI - + + scm:git:git@github.com:apache/hudi.git + scm:git:git@github.com:apache/hudi.git + git@github.com:apache/hudi.git + HEAD + - - - Dev Mailing List - dev@hudi.apache.org - dev-subscribe@hudi.apache.org - dev-unsubscribe@hudi.apache.org - - - User Mailing List - users@hudi.apache.org - users-subscribe@hudi.apache.org - users-unsubscribe@hudi.apache.org - - - Commits Mailing List - commits@hudi.apache.org - commits-subscribe@hudi.apache.org - commits-unsubscribe@hudi.apache.org - - + + JIRA + https://issues.apache.org/jira/browse/HUDI + - - - - org.apache.maven.plugins - maven-source-plugin - 2.2.1 - - - attach-sources - - jar-no-fork - - - - - - org.apache.maven.plugins - maven-checkstyle-plugin - 3.0.0 - - - com.puppycrawl.tools - checkstyle - 8.18 - - - - true - true - UTF-8 - style/checkstyle.xml - style/checkstyle-suppressions.xml - checkstyle.suppressions.file - true - warning - true - - ${project.build.sourceDirectory} - - - basedir=${maven.multiModuleProjectDirectory} - **\/generated-sources\/ - - - - compile - - check - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - ${java.version} - ${java.version} - - - - org.apache.maven.plugins - maven-release-plugin - 2.5.3 - - true - false - release - deploy - - - - org.apache.maven.plugins - maven-deploy-plugin - ${maven-deploy-plugin.version} - - - default-deploy - deploy - - deploy - - - - - - org.apache.maven.plugins - maven-failsafe-plugin - ${maven-failsafe-plugin.version} - - ${skipITs} - @{argLine} - - - - org.jacoco - jacoco-maven-plugin - ${jacoco.version} - - - io.fabric8 - docker-maven-plugin - ${maven-docker-plugin.version} - - ${skipDocker} - - - + + + Dev Mailing List + dev@hudi.apache.org + dev-subscribe@hudi.apache.org + dev-unsubscribe@hudi.apache.org + + + User Mailing List + users@hudi.apache.org + users-subscribe@hudi.apache.org + users-unsubscribe@hudi.apache.org + + + Commits Mailing List + commits@hudi.apache.org + commits-subscribe@hudi.apache.org + commits-unsubscribe@hudi.apache.org + + - - - - org.apache.maven.plugins - maven-surefire-plugin - ${maven-surefire-plugin.version} - - 3 - @{argLine} - - - ${surefire-log4j.file} - - - - - - org.apache.maven.plugins - maven-jar-plugin - ${maven-jar-plugin.version} - - - net.alchim31.maven - scala-maven-plugin - ${scala-maven-plugin.version} - - false - - - - 
org.apache.maven.plugins - maven-compiler-plugin - - - - org.apache.rat - apache-rat-plugin - ${apache-rat-plugin.version} - - true - false - 0 - - - - AL2 - Apache License 2.0 - - - Licensed to the Apache Software Foundation (ASF) under one - - - - - - Apache License 2.0 - - - - NOTICE - DISCLAIMER - **/.* - **/*.json - **/*.log - **/*.sqltemplate - **/compose_env - **/*NOTICE* - **/*LICENSE* - **/dependency-reduced-pom.xml - **/test/resources/*.data - **/test/resources/*.commit - **/target/** - **/generated-sources/** - .github/** - **/*.sql - - - - - compile - - check - - - - - - org.apache.avro - avro-maven-plugin - ${avro.version} - - - generate-sources - - schema - - - ${project.basedir}/src/main/avro/ - ${project.build.directory}/generated-sources/src/main/java/ - - String - - - - - - org.scalastyle - scalastyle-maven-plugin - 1.0.0 - - false - true - true - false - ${project.basedir}/src/main/scala - ${project.basedir}/src/test/scala - ${main.basedir}/style/scalastyle.xml - UTF-8 - - - - compile - - check - - - - - - - + + + + + org.apache.maven.plugins + maven-shade-plugin + ${maven-shade-plugin.version} + + true + with-spark-${spark.version} + false + + - - - - - log4j - log4j - ${log4j.version} - + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + attach-sources + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + 3.0.0 + + + com.puppycrawl.tools + checkstyle + 8.18 + + + + true + true + UTF-8 + style/checkstyle.xml + style/checkstyle-suppressions.xml + checkstyle.suppressions.file + true + warning + true + + ${project.build.sourceDirectory} + + + basedir=${maven.multiModuleProjectDirectory} + **\/generated-sources\/ + + + + compile + + check + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven-compiler-plugin.version} + + ${java.version} + ${java.version} + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + true + false + release + deploy + + + + org.apache.maven.plugins + maven-deploy-plugin + ${maven-deploy-plugin.version} + + + default-deploy + deploy + + deploy + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + ${maven-failsafe-plugin.version} + + ${skipITs} + @{argLine} + + + + org.jacoco + jacoco-maven-plugin + ${jacoco.version} + + + io.fabric8 + docker-maven-plugin + ${maven-docker-plugin.version} + + ${skipDocker} + + + - - - com.fasterxml.jackson.core - jackson-annotations - ${fasterxml.version} - - - com.fasterxml.jackson.core - jackson-core - ${fasterxml.version} - - - com.fasterxml.jackson.core - jackson-databind - ${fasterxml.jackson.databind.version} - - - com.fasterxml.jackson.datatype - jackson-datatype-guava - ${fasterxml.version} - - - com.fasterxml.jackson.module - jackson-module-scala_${scala.binary.version} - ${fasterxml.jackson.module.scala.version} - + + + + org.apache.maven.plugins + maven-surefire-plugin + ${maven-surefire-plugin.version} + + 3 + @{argLine} + + + ${surefire-log4j.file} + + + + + + org.apache.maven.plugins + maven-jar-plugin + ${maven-jar-plugin.version} + + + net.alchim31.maven + scala-maven-plugin + ${scala-maven-plugin.version} + + false + + + + org.apache.maven.plugins + maven-compiler-plugin + + + + org.apache.rat + apache-rat-plugin + ${apache-rat-plugin.version} + + true + false + 0 + + + + AL2 + Apache License 2.0 + + + Licensed to the Apache Software Foundation (ASF) under one + + + + + + Apache License 2.0 + + + + NOTICE + DISCLAIMER + **/.* + **/*.json + **/*.log + **/*.sqltemplate + **/compose_env + **/*NOTICE* + **/*LICENSE* + 
**/dependency-reduced-pom.xml + **/test/resources/*.data + **/test/resources/*.commit + **/target/** + **/generated-sources/** + .github/** + **/*.sql + + + + + compile + + check + + + + + + org.apache.avro + avro-maven-plugin + ${avro.version} + + + generate-sources + + schema + + + ${project.basedir}/src/main/avro/ + ${project.build.directory}/generated-sources/src/main/java/ + + String + + + + + + org.scalastyle + scalastyle-maven-plugin + 1.0.0 + + false + true + true + false + ${project.basedir}/src/main/scala + ${project.basedir}/src/test/scala + ${main.basedir}/style/scalastyle.xml + UTF-8 + + + + compile + + check + + + + + + + - - - org.glassfish.jersey.core - jersey-server - ${glassfish.version} - - - org.glassfish.jersey.connectors - jersey-apache-connector - ${glassfish.version} - - - org.glassfish.jersey.containers - jersey-container-servlet-core - ${glassfish.version} - + + + + + log4j + log4j + ${log4j.version} + - - - org.apache.avro - avro - ${avro.version} - provided - + + + com.fasterxml.jackson.core + jackson-annotations + ${fasterxml.version} + + + com.fasterxml.jackson.core + jackson-core + ${fasterxml.version} + + + com.fasterxml.jackson.core + jackson-databind + ${fasterxml.jackson.databind.version} + + + com.fasterxml.jackson.datatype + jackson-datatype-guava + ${fasterxml.version} + + + com.fasterxml.jackson.module + jackson-module-scala_${scala.binary.version} + ${fasterxml.jackson.module.scala.version} + - - - org.apache.parquet - parquet-avro - ${parquet.version} - provided - + + + org.glassfish.jersey.core + jersey-server + ${glassfish.version} + + + org.glassfish.jersey.connectors + jersey-apache-connector + ${glassfish.version} + + + org.glassfish.jersey.containers + jersey-container-servlet-core + ${glassfish.version} + - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-hive_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-sql_${scala.binary.version} - tests - ${spark.version} - test - - - org.apache.spark - spark-core_${scala.binary.version} - tests - ${spark.version} - test - - - org.apache.spark - spark-catalyst_${scala.binary.version} - tests - ${spark.version} - test - + + + org.apache.avro + avro + ${avro.version} + provided + - - - org.apache.spark - spark-avro_${scala.binary.version} - ${spark.version} - provided - + + + org.apache.parquet + parquet-avro + ${parquet.version} + provided + - - - org.apache.flink - flink-streaming-java_${scala.binary.version} - ${flink.version} - provided - - - org.apache.flink - flink-clients_${scala.binary.version} - ${flink.version} - provided - - - org.apache.flink - flink-connector-kafka_${scala.binary.version} - ${flink.version} - provided - + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + tests + ${spark.version} + test + + + org.apache.spark + spark-core_${scala.binary.version} + tests + ${spark.version} + test + + + org.apache.spark + spark-catalyst_${scala.binary.version} + tests + ${spark.version} + test + - - - io.dropwizard.metrics - metrics-graphite - ${metrics.version} - - - io.dropwizard.metrics - metrics-core - ${metrics.version} - - - 
io.dropwizard.metrics - metrics-jmx - ${metrics.version} - + + + org.apache.spark + spark-avro_${scala.binary.version} + ${spark.version} + provided + - - io.prometheus - simpleclient - ${prometheus.version} - - - io.prometheus - simpleclient_httpserver - ${prometheus.version} - - - io.prometheus - simpleclient_dropwizard - ${prometheus.version} - - - io.prometheus - simpleclient_pushgateway - ${prometheus.version} - + + + org.apache.flink + flink-streaming-java_${scala.binary.version} + ${flink.version} + provided + + + org.apache.flink + flink-clients_${scala.binary.version} + ${flink.version} + provided + + + org.apache.flink + flink-connector-kafka_${scala.binary.version} + ${flink.version} + provided + - - com.beust - jcommander - 1.72 - + + + io.dropwizard.metrics + metrics-graphite + ${metrics.version} + + + io.dropwizard.metrics + metrics-core + ${metrics.version} + + + io.dropwizard.metrics + metrics-jmx + ${metrics.version} + - - joda-time - joda-time - ${joda.version} - + + io.prometheus + simpleclient + ${prometheus.version} + + + io.prometheus + simpleclient_httpserver + ${prometheus.version} + + + io.prometheus + simpleclient_dropwizard + ${prometheus.version} + + + io.prometheus + simpleclient_pushgateway + ${prometheus.version} + - - xerces - xercesImpl - 2.9.1 - + + com.beust + jcommander + 1.72 + - - xalan - xalan - 2.7.1 - + + joda-time + joda-time + ${joda.version} + - - org.rocksdb - rocksdbjni - 5.17.2 - + + xerces + xercesImpl + 2.9.1 + - - - org.apache.httpcomponents - fluent-hc - ${http.version} - - - org.apache.httpcomponents - httpcore - ${http.version} - - - org.apache.httpcomponents - httpclient - ${http.version} - + + xalan + xalan + 2.7.1 + - - - org.codehaus.jackson - jackson-core-asl - ${codehaus-jackson.version} - - - org.codehaus.jackson - jackson-mapper-asl - ${codehaus-jackson.version} - - - org.codehaus.jackson - jackson-jaxrs - ${codehaus-jackson.version} - - - org.codehaus.jackson - jackson-xc - ${codehaus-jackson.version} - + + org.rocksdb + rocksdbjni + 5.17.2 + - - - org.apache.hadoop - hadoop-client - ${hadoop.version} - provided - - - javax.servlet - servlet-api - - - javax.xml.bind - jaxb-api - - - - - org.apache.hadoop - hadoop-common - ${hadoop.version} - provided - - - jdk.tools - jdk.tools - - - javax.xml.bind - jaxb-api - - - - - org.apache.hadoop - hadoop-hdfs - ${hadoop.version} - provided - - - org.apache.hadoop - hadoop-auth - ${hadoop.version} - provided - - - org.apache.hadoop - hadoop-mapreduce-client-core - ${hadoop.version} - provided - - - javax.xml.bind - jaxb-api - - - - - org.apache.hadoop - hadoop-mapreduce-client-common - ${hadoop.version} - provided - - - javax.xml.bind - jaxb-api - - - - - org.apache.hadoop - hadoop-hdfs - tests - test - ${hadoop.version} - - - org.apache.hadoop - hadoop-common - tests - ${hadoop.version} - - - jdk.tools - jdk.tools - - - javax.xml.bind - jaxb-api - - - + + + org.apache.httpcomponents + fluent-hc + ${http.version} + + + org.apache.httpcomponents + httpcore + ${http.version} + + + org.apache.httpcomponents + httpclient + ${http.version} + - - - ${hive.groupid} - hive-service - ${hive.version} - provided - - - javax.mail - * - - - org.eclipse.jetty.aggregate - * - - - org.pentaho - * - - - org.apache.logging.log4j - * - - - org.slf4j - slf4j-api - - - org.slf4j - slf4j-log4j12 - - - - - ${hive.groupid} - hive-shims - ${hive.version} - provided - - - javax.mail - mail - - - javax.xml.bind - jaxb-api - - - org.eclipse.jetty.aggregate - * - - - org.pentaho - * - - - - - ${hive.groupid} - 
hive-jdbc - ${hive.version} - provided - - - javax.mail - mail - - - org.eclipse.jetty.aggregate - * - - - - - ${hive.groupid} - hive-serde - ${hive.version} - provided - - - javax.mail - mail - - - - - ${hive.groupid} - hive-metastore - ${hive.version} - provided - - - javax.transaction - jta - - - javax.transaction - transaction-api - - - javax.mail - mail - - - org.eclipse.jetty.aggregate - * - - - - - ${hive.groupid} - hive-common - ${hive.version} - provided - - - javax.mail - mail - - - org.eclipse.jetty.aggregate - * - - - org.apache.logging.log4j - * - - - - - ${hive.groupid} - hive-exec - ${hive.version} - provided - ${hive.exec.classifier} - - - javax.mail - mail - - - org.eclipse.jetty.aggregate - * - - - org.pentaho - * - - - org.apache.logging.log4j - * - - - - - org.apache.hive - hive-exec - ${hive.version} - provided - - - commons-lang - commons-lang - - - org.apache.commons - commons-lang3 - - - guava - com.google.guava - - - org.eclipse.jetty.aggregate - * - - - javax.mail - mail - - - org.apache.zookeeper - zookeeper - - - org.pentaho - * - - - com.esotericsoftware - kryo-shaded - - - org.apache.logging.log4j - * - - - + + + org.codehaus.jackson + jackson-core-asl + ${codehaus-jackson.version} + + + org.codehaus.jackson + jackson-mapper-asl + ${codehaus-jackson.version} + + + org.codehaus.jackson + jackson-jaxrs + ${codehaus-jackson.version} + + + org.codehaus.jackson + jackson-xc + ${codehaus-jackson.version} + - - com.google.code.gson - gson - 2.3.1 - test - + + + org.apache.hadoop + hadoop-client + ${hadoop.version} + provided + + + javax.servlet + servlet-api + + + javax.xml.bind + jaxb-api + + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + provided + + + jdk.tools + jdk.tools + + + javax.xml.bind + jaxb-api + + + + + org.apache.hadoop + hadoop-hdfs + ${hadoop.version} + provided + + + org.apache.hadoop + hadoop-auth + ${hadoop.version} + provided + + + org.apache.hadoop + hadoop-mapreduce-client-core + ${hadoop.version} + provided + + + javax.xml.bind + jaxb-api + + + + + org.apache.hadoop + hadoop-mapreduce-client-common + ${hadoop.version} + provided + + + javax.xml.bind + jaxb-api + + + + + org.apache.hadoop + hadoop-hdfs + tests + test + ${hadoop.version} + + + org.apache.hadoop + hadoop-common + tests + ${hadoop.version} + + + jdk.tools + jdk.tools + + + javax.xml.bind + jaxb-api + + + - - - org.apache.curator - curator-framework - ${zk-curator.version} - + + + ${hive.groupid} + hive-service + ${hive.version} + provided + + + javax.mail + * + + + org.eclipse.jetty.aggregate + * + + + org.pentaho + * + + + org.apache.logging.log4j + * + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + + + ${hive.groupid} + hive-shims + ${hive.version} + provided + + + javax.mail + mail + + + javax.xml.bind + jaxb-api + + + org.eclipse.jetty.aggregate + * + + + org.pentaho + * + + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + provided + + + javax.mail + mail + + + org.eclipse.jetty.aggregate + * + + + + + ${hive.groupid} + hive-serde + ${hive.version} + provided + + + javax.mail + mail + + + + + ${hive.groupid} + hive-metastore + ${hive.version} + provided + + + javax.transaction + jta + + + javax.transaction + transaction-api + + + javax.mail + mail + + + org.eclipse.jetty.aggregate + * + + + + + ${hive.groupid} + hive-common + ${hive.version} + provided + + + javax.mail + mail + + + org.eclipse.jetty.aggregate + * + + + org.apache.logging.log4j + * + + + + + ${hive.groupid} + hive-exec + ${hive.version} + provided + ${hive.exec.classifier} + 
+ + javax.mail + mail + + + org.eclipse.jetty.aggregate + * + + + org.pentaho + * + + + org.apache.logging.log4j + * + + + + + org.apache.hive + hive-exec + ${hive.version} + provided + + + commons-lang + commons-lang + + + org.apache.commons + commons-lang3 + + + guava + com.google.guava + + + org.eclipse.jetty.aggregate + * + + + javax.mail + mail + + + org.apache.zookeeper + zookeeper + + + org.pentaho + * + + + com.esotericsoftware + kryo-shaded + + + org.apache.logging.log4j + * + + + - - org.apache.curator - curator-client - ${zk-curator.version} - + + com.google.code.gson + gson + 2.3.1 + test + - - org.apache.curator - curator-recipes - ${zk-curator.version} - + + + org.apache.curator + curator-framework + ${zk-curator.version} + - - org.junit.jupiter - junit-jupiter-api - ${junit.jupiter.version} - test - + + org.apache.curator + curator-client + ${zk-curator.version} + - - org.junit.jupiter - junit-jupiter-engine - ${junit.jupiter.version} - test - + + org.apache.curator + curator-recipes + ${zk-curator.version} + - - org.junit.vintage - junit-vintage-engine - ${junit.vintage.version} - test - + + org.junit.jupiter + junit-jupiter-api + ${junit.jupiter.version} + test + - - org.junit.jupiter - junit-jupiter-params - ${junit.jupiter.version} - test - + + org.junit.jupiter + junit-jupiter-engine + ${junit.jupiter.version} + test + - - org.mockito - mockito-junit-jupiter - test - ${mockito.jupiter.version} - + + org.junit.vintage + junit-vintage-engine + ${junit.vintage.version} + test + - - org.junit.platform - junit-platform-runner - ${junit.platform.version} - test - + + org.junit.jupiter + junit-jupiter-params + ${junit.jupiter.version} + test + - - org.junit.platform - junit-platform-suite-api - ${junit.platform.version} - test - + + org.mockito + mockito-junit-jupiter + test + ${mockito.jupiter.version} + - - org.junit.platform - junit-platform-commons - ${junit.platform.version} - test - + + org.junit.platform + junit-platform-runner + ${junit.platform.version} + test + - - - com.esotericsoftware - kryo - 4.0.0 - test - + + org.junit.platform + junit-platform-suite-api + ${junit.platform.version} + test + - - - org.awaitility - awaitility - ${awaitility.version} - test - + + org.junit.platform + junit-platform-commons + ${junit.platform.version} + test + - - org.apache.flink - flink-test-utils_${scala.binary.version} - ${flink.version} - test - - - org.apache.logging.log4j - * - - - - - - org.apache.logging.log4j - log4j-core - ${log4j.test.version} - test - + + + com.esotericsoftware + kryo + 4.0.0 + test + - - - - - Maven Central - Maven Repository - https://repo.maven.apache.org/maven2 - - true - - - false - - - - cloudera-repo-releases - https://repository.cloudera.com/artifactory/public/ - - true - - - false - - - - confluent - https://packages.confluent.io/maven/ - - + + + org.awaitility + awaitility + ${awaitility.version} + test + - - - release - - - deployArtifacts - true - - - - - - org.apache.maven.plugins - maven-source-plugin - 2.2.1 - - - attach-sources - - jar-no-fork - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - ${maven-javadoc-plugin.version} - - - attach-javadocs - - jar - - - - - none - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.4 - - - sign-artifacts - verify - - sign - - - - - - - - - warn-log - - - env.HUDI_QUIETER_LOGGING - - - - file://${project.basedir}/src/test/resources/log4j-surefire-quiet.properties - - - - - unit-tests - - false - true - true - - - - - org.apache.maven.plugins - maven-surefire-plugin - 
${maven-surefire-plugin.version} - - - org.junit.jupiter - junit-jupiter-engine - ${junit.jupiter.version} - - - - ${skipUTs} - 120 - functional - - **/*FunctionalTestSuite.java - **/IT*.java - **/testsuite/**/Test*.java - - - - - org.jacoco - jacoco-maven-plugin - - - - prepare-agent - - - - post-unit-tests - test - - report - - - ${project.reporting.outputDirectory}/jacoco-ut - - - - - - - - - functional-tests - - true - false - true - - - - - org.apache.maven.plugins - maven-surefire-plugin - ${maven-surefire-plugin.version} - - - org.apache.maven.surefire - surefire-junit47 - ${maven-surefire-plugin.version} - - - - ${skipFTs} - 1 - true - - **/*FunctionalTestSuite.java - - - - - org.jacoco - jacoco-maven-plugin - - - - prepare-agent - - - - post-functional-tests - test - - report - - - ${project.reporting.outputDirectory}/jacoco-ft - - - - - - - - - integration-tests - - true - true - false - - - - - org.apache.maven.plugins - maven-surefire-plugin - ${maven-surefire-plugin.version} - - ${skipUTs} - - - - org.apache.maven.plugins - maven-failsafe-plugin - - ${skipITs} - - **/IT*.java - - - ${dynamodb-local.endpoint} - - - - - integration-test - - integration-test - - - - verify-integration-test - verify - - verify - - - - - - org.jacoco - jacoco-maven-plugin - - - - prepare-agent - - - - post-integration-tests - test - - report - - - ${project.reporting.outputDirectory}/jacoco-it - - - - - - - - - javadocs - - - - org.apache.maven.plugins - maven-compiler-plugin - - ${java.version} - ${java.version} - - - - net.alchim31.maven - scala-maven-plugin - ${scala-maven-plugin.version} - - - doc - generate-sources - - compile - - - - ${project.basedir}/src/main/scala - - false - - - - - - -P:genjavadoc:out=${project.build.directory}/genjavadoc - - - - com.typesafe.genjavadoc - genjavadoc-plugin_${scala.version} - ${genjavadoc-plugin.version} - - - - **/*.scala - - - - - org.codehaus.mojo - build-helper-maven-plugin - ${build-helper-maven-plugin.version} - - - generate-sources - - add-source - - - - ${project.build.directory}/genjavadoc - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - ${maven-javadoc-plugin.version} - - - aggregate - - aggregate - - - - - - none - true - - https://avro.apache.org/docs/${avro.version}/api/java - https://docs.spring.io/spring-shell/docs/1.2.0.RELEASE - https://fasterxml.github.io/jackson-databind/javadoc/2.6 - https://hadoop.apache.org/docs/r${hadoop.version}/api - https://hbase.apache.org/1.2/apidocs - https://hive.apache.org/javadocs/r2.3.6/api - https://javadoc.io/static/io.javalin/javalin/2.3.0 - https://javadoc.io/doc/org.apache.parquet/parquet-avro/${parquet.version} - https://javadoc.io/static/org.apache.parquet/parquet-hadoop/${parquet.version} - https://logging.apache.org/log4j/1.2/apidocs - https://metrics.dropwizard.io/4.1.0/apidocs - https://spark.apache.org/docs/${spark.version}/api/java - - - - **/*$.java - - - - - - - - - scala-2.11 - - - scala-2.12 - - ${scala12.version} - 2.12 - true - true - - - - scala-2.12 - - - - - - org.apache.maven.plugins - maven-enforcer-plugin - ${maven-enforcer-plugin.version} - - - enforce-versions - - enforce - - - - - - *:*_2.11 - - - - - - - - - - + + org.apache.flink + flink-test-utils_${scala.binary.version} + ${flink.version} + test + + + org.apache.logging.log4j + * + + + + + + org.apache.logging.log4j + log4j-core + ${log4j.test.version} + test + - - - spark2 - - - spark3 - - ${spark3.version} - ${spark3.version} - ${scala12.version} - 2.12 - hudi-spark3 - 3.1.0 - 2.4.1 - 
${fasterxml.spark3.version} - ${fasterxml.spark3.version} - ${fasterxml.spark3.version} - ${fasterxml.spark3.version} - true - true - - - - spark3 - - - + + + + + Maven Central + Maven Repository + https://repo.maven.apache.org/maven2 + + true + + + false + + + + cloudera-repo-releases + https://repository.cloudera.com/artifactory/public/ + + true + + + false + + + + confluent + https://packages.confluent.io/maven/ + + - - spark3.0.x - - 3.0.3 - ${spark3.version} - ${spark3.version} - ${scala12.version} - 2.12 - hudi-spark3 - 3.0.1 - 2.4.1 - ${fasterxml.spark3.version} - ${fasterxml.spark3.version} - ${fasterxml.spark3.version} - ${fasterxml.spark3.version} - true - true - - - - spark3.0.x - - - + + + release + + + deployArtifacts + true + + + + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + attach-sources + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + ${maven-javadoc-plugin.version} + + + attach-javadocs + + jar + + + + + none + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.4 + + + sign-artifacts + verify + + sign + + + + + + + + + warn-log + + + env.HUDI_QUIETER_LOGGING + + + + file://${project.basedir}/src/test/resources/log4j-surefire-quiet.properties + + + + + unit-tests + + false + true + true + + + + + org.apache.maven.plugins + maven-surefire-plugin + ${maven-surefire-plugin.version} + + + org.junit.jupiter + junit-jupiter-engine + ${junit.jupiter.version} + + + + ${skipUTs} + 120 + functional + + **/*FunctionalTestSuite.java + **/IT*.java + **/testsuite/**/Test*.java + + + + + org.jacoco + jacoco-maven-plugin + + + + prepare-agent + + + + post-unit-tests + test + + report + + + ${project.reporting.outputDirectory}/jacoco-ut + + + + + + + + + functional-tests + + true + false + true + + + + + org.apache.maven.plugins + maven-surefire-plugin + ${maven-surefire-plugin.version} + + + org.apache.maven.surefire + surefire-junit47 + ${maven-surefire-plugin.version} + + + + ${skipFTs} + 1 + true + + **/*FunctionalTestSuite.java + + + + + org.jacoco + jacoco-maven-plugin + + + + prepare-agent + + + + post-functional-tests + test + + report + + + ${project.reporting.outputDirectory}/jacoco-ft + + + + + + + + + integration-tests + + true + true + false + + + + + org.apache.maven.plugins + maven-surefire-plugin + ${maven-surefire-plugin.version} + + ${skipUTs} + + + + org.apache.maven.plugins + maven-failsafe-plugin + + ${skipITs} + + **/IT*.java + + + ${dynamodb-local.endpoint} + + + + + integration-test + + integration-test + + + + verify-integration-test + verify + + verify + + + + + + org.jacoco + jacoco-maven-plugin + + + + prepare-agent + + + + post-integration-tests + test + + report + + + ${project.reporting.outputDirectory}/jacoco-it + + + + + + + + + javadocs + + + + org.apache.maven.plugins + maven-compiler-plugin + + ${java.version} + ${java.version} + + + + net.alchim31.maven + scala-maven-plugin + ${scala-maven-plugin.version} + + + doc + generate-sources + + compile + + + + ${project.basedir}/src/main/scala + + false + + + + + + -P:genjavadoc:out=${project.build.directory}/genjavadoc + + + + com.typesafe.genjavadoc + genjavadoc-plugin_${scala.version} + ${genjavadoc-plugin.version} + + + + **/*.scala + + + + + org.codehaus.mojo + build-helper-maven-plugin + ${build-helper-maven-plugin.version} + + + generate-sources + + add-source + + + + ${project.build.directory}/genjavadoc + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + ${maven-javadoc-plugin.version} + + + aggregate + + aggregate + + + + + + none + 
true + + https://avro.apache.org/docs/${avro.version}/api/java + https://docs.spring.io/spring-shell/docs/1.2.0.RELEASE + https://fasterxml.github.io/jackson-databind/javadoc/2.6 + https://hadoop.apache.org/docs/r${hadoop.version}/api + https://hbase.apache.org/1.2/apidocs + https://hive.apache.org/javadocs/r2.3.6/api + https://javadoc.io/static/io.javalin/javalin/2.3.0 + https://javadoc.io/doc/org.apache.parquet/parquet-avro/${parquet.version} + https://javadoc.io/static/org.apache.parquet/parquet-hadoop/${parquet.version} + + https://logging.apache.org/log4j/1.2/apidocs + https://metrics.dropwizard.io/4.1.0/apidocs + https://spark.apache.org/docs/${spark.version}/api/java + + + + **/*$.java + + + + + + + + + scala-2.11 + + + scala-2.12 + + ${scala12.version} + 2.12 + true + true + + + + scala-2.12 + + + + + + org.apache.maven.plugins + maven-enforcer-plugin + ${maven-enforcer-plugin.version} + + + enforce-versions + + enforce + + + + + + *:*_2.11 + + + + + + + + + + + + + + spark2 + + + spark3 + + ${spark3.version} + ${spark3.version} + ${scala12.version} + 2.12 + hudi-spark3 + 3.1.0 + 2.4.1 + ${fasterxml.spark3.version} + ${fasterxml.spark3.version} + ${fasterxml.spark3.version} + + ${fasterxml.spark3.version} + + true + true + + + + spark3 + + + + + + spark3.0.x + + 3.0.3 + ${spark3.version} + ${spark3.version} + ${scala12.version} + 2.12 + hudi-spark3 + 3.0.1 + 2.4.1 + ${fasterxml.spark3.version} + ${fasterxml.spark3.version} + ${fasterxml.spark3.version} + + ${fasterxml.spark3.version} + + true + true + + + + spark3.0.x + + + - - skipShadeSources - - false - - - - skipShadeSources - - - - + + skipShadeSources + + false + + + + skipShadeSources + + + + diff --git a/tisInstall.sh b/tisInstall.sh index 3d7be95754f5..b9ede565a745 100644 --- a/tisInstall.sh +++ b/tisInstall.sh @@ -1,4 +1,22 @@ # for spark3 #mvn install -Pspark3,flink-bundle-shade-hive3 -Dmaven.test.skip=true -pl hudi-flink,hudi-common,packaging/hudi-flink-bundle,packaging/hudi-utilities-bundle -mvn deploy -Pspark3,flink-bundle-shade-hive3 -Dmaven.test.skip=true -pl hudi-flink,hudi-common,packaging/hudi-flink-bundle,packaging/hudi-utilities-bundle -DaltDeploymentRepository=base::default::http://localhost:8080/release +#MVN_PROFILE="spark2,flink-bundle-shade-hive2" + +#MVN_PROFILE="spark3,flink-bundle-shade-hive3" +compileAndDeploy(){ +MVN_PROFILE = $1 +echo "compile and deploy with maven profile:$MVN_PROFILE " +mvn deploy -P$MVN_PROFILE \ + -Dmaven.test.skip=true \ + -pl hudi-flink,hudi-common\ +,packaging/hudi-flink-bundle\ +,packaging/hudi-utilities-bundle\ +,packaging/hudi-hadoop-mr-bundle\ +,packaging/hudi-spark-bundle \ +-DaltDeploymentRepository=base::default::http://localhost:8080/release +} + +compileAndDeploy "spark2,flink-bundle-shade-hive2" + +compileAndDeploy "spark3,flink-bundle-shade-hive3" From 3124c05005dc1205db7b4e464f7af0f218b403ac Mon Sep 17 00:00:00 2001 From: mozhenghua Date: Fri, 15 Jul 2022 14:13:24 +0800 Subject: [PATCH 6/7] small fix --- packaging/hudi-flink-bundle/pom.xml | 2 +- packaging/hudi-utilities-bundle/pom.xml | 2 +- pom.xml | 6 +++--- tisInstall.sh | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/packaging/hudi-flink-bundle/pom.xml b/packaging/hudi-flink-bundle/pom.xml index f92f4149dad7..e21f6f771942 100644 --- a/packaging/hudi-flink-bundle/pom.xml +++ b/packaging/hudi-flink-bundle/pom.xml @@ -25,7 +25,7 @@ ../../pom.xml 4.0.0 - tis-hudi-flink-bundle_${scala.binary.version}_hive_${hive.version} + 
tis-hudi-flink-bundle_${scala.binary.version}_hive_${hive.version}_spark_${spark.version}
   jar
diff --git a/packaging/hudi-utilities-bundle/pom.xml b/packaging/hudi-utilities-bundle/pom.xml
index 1a0865225573..639987e6d921 100644
--- a/packaging/hudi-utilities-bundle/pom.xml
+++ b/packaging/hudi-utilities-bundle/pom.xml
@@ -23,7 +23,7 @@
   ../../pom.xml
   4.0.0
- tis-hudi-utilities-bundle_${scala.binary.version}
+ tis-hudi-utilities-bundle_${scala.binary.version}_hive_${hive.version}_spark_${spark.version}
   jar
diff --git a/pom.xml b/pom.xml
index f8aeabcffb0e..66ef91f6f78b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -211,9 +211,9 @@
   maven-shade-plugin
   ${maven-shade-plugin.version}
- true
- with-spark-${spark.version}
- false
+ true
+ false
+
diff --git a/tisInstall.sh b/tisInstall.sh
index b9ede565a745..da17f011775f 100644
--- a/tisInstall.sh
+++ b/tisInstall.sh
@@ -5,9 +5,9 @@
 #MVN_PROFILE="spark3,flink-bundle-shade-hive3"
 compileAndDeploy(){
-MVN_PROFILE = $1
+MVN_PROFILE="$1"
 echo "compile and deploy with maven profile:$MVN_PROFILE "
-mvn deploy -P$MVN_PROFILE \
+mvn clean deploy -P$MVN_PROFILE \
 -Dmaven.test.skip=true \
 -pl hudi-flink,hudi-common\
 ,packaging/hudi-flink-bundle\
 ,packaging/hudi-utilities-bundle\

From 014501463570105dd413f5142d6161674110448c Mon Sep 17 00:00:00 2001
From: mozhenghua
Date: Wed, 26 Jul 2023 09:45:19 +0800
Subject: [PATCH 7/7] support hadoop3

---
 packaging/hudi-flink-bundle/pom.xml |  4 ++--
 pom.xml                             | 17 ++++++++++++++++-
 tisInstall.sh                       |  4 ++--
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/packaging/hudi-flink-bundle/pom.xml b/packaging/hudi-flink-bundle/pom.xml
index e21f6f771942..901201f5f3a4 100644
--- a/packaging/hudi-flink-bundle/pom.xml
+++ b/packaging/hudi-flink-bundle/pom.xml
@@ -36,7 +36,7 @@
   3.1.0
   1.11.1
- 2.3.1
+
   0.9.3
@@ -680,7 +680,7 @@
   flink-bundle-shade-hive3
- 3.1.2
+
   compile
diff --git a/pom.xml b/pom.xml
index 66ef91f6f78b..c824242b4504 100644
--- a/pom.xml
+++ b/pom.xml
@@ -121,7 +121,7 @@
- 3.2.1
+ 3.3.1
   hudi-spark2
   1.8.2
   2.11.12
@@ -1654,6 +1654,21 @@
+
+
+
+ hive2
+
+ 2.3.1
+
+
+
+ hive3
+
+ 3.1.3
+
+
+
diff --git a/tisInstall.sh b/tisInstall.sh
index da17f011775f..3704ef7ed4f2 100644
--- a/tisInstall.sh
+++ b/tisInstall.sh
@@ -17,6 +17,6 @@ mvn clean deploy -P$MVN_PROFILE \
 -DaltDeploymentRepository=base::default::http://localhost:8080/release
 }

-compileAndDeploy "spark2,flink-bundle-shade-hive2"
+compileAndDeploy "spark2,flink-bundle-shade-hive2,hive2"

-compileAndDeploy "spark3,flink-bundle-shade-hive3"
+compileAndDeploy "spark3,flink-bundle-shade-hive3,hive3"
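
Taken together, the last two patches leave tisInstall.sh as the single entry point for publishing the TIS-specific bundles for both the Spark 2 / Hive 2 and Spark 3 / Hive 3 stacks: the new hive2 and hive3 profiles appear to pin the Hive version (2.3.1 and 3.1.3 respectively), and the renamed bundle artifactIds encode the Scala, Hive and Spark versions they were built against. The sketch below shows how the finished script is typically run and what one compileAndDeploy call expands to. It is an illustration only; the deployment URL, profile names and module list are copied from the script above, while the concrete artifact names that end up in the repository depend on whichever profiles are active.

#!/bin/bash
# Usage sketch. Assumes a Maven repository manager is already listening on
# http://localhost:8080/release, the URL hard-coded in tisInstall.sh.
#
# Entry point: the script runs compileAndDeploy once per stack,
#   sh tisInstall.sh
#
# and a single compileAndDeploy "spark3,flink-bundle-shade-hive3,hive3" call
# expands to the following Maven invocation (flags and module list copied
# from the function body):
mvn clean deploy -Pspark3,flink-bundle-shade-hive3,hive3 \
  -Dmaven.test.skip=true \
  -pl hudi-flink,hudi-common,packaging/hudi-flink-bundle,packaging/hudi-utilities-bundle,packaging/hudi-hadoop-mr-bundle,packaging/hudi-spark-bundle \
  -DaltDeploymentRepository=base::default::http://localhost:8080/release

# Each run publishes bundles whose artifactIds now carry the build stack, e.g.
# tis-hudi-flink-bundle_${scala.binary.version}_hive_${hive.version}_spark_${spark.version},
# presumably so that the Spark 2 / Hive 2 and Spark 3 / Hive 3 artifacts can
# coexist in the same repository.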