apache · ahmedabu98 · Nov 12, 2024 · Jun 3, 2024 · Jun 4, 2024 · Jun 4, 2024
diff --git a/.github/trigger_files/beam_PostCommit_Python_Xlang_Gcp_Direct.json b/.github/trigger_files/beam_PostCommit_Python_Xlang_Gcp_Direct.json
@@ -1,4 +1,4 @@
 {
   "comment": "Modify this file in a trivial way to cause this test suite to run",
-  "modification": 2
+  "modification": 1
 }
diff --git a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto
@@ -72,8 +72,10 @@ message ManagedTransforms {
       "beam:schematransform:org.apache.beam:kafka_write:v1"];
     BIGQUERY_READ = 4 [(org.apache.beam.model.pipeline.v1.beam_urn) =
       "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"];
-    BIGQUERY_WRITE = 5 [(org.apache.beam.model.pipeline.v1.beam_urn) =
+    BIGQUERY_STORAGE_WRITE = 5 [(org.apache.beam.model.pipeline.v1.beam_urn) =
       "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"];
+    BIGQUERY_FILE_LOADS = 6 [(org.apache.beam.model.pipeline.v1.beam_urn) =
+      "beam:schematransform:org.apache.beam:bigquery_fileloads:v1"];
   }
 }
 

diff --git a/...va/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java b/...va/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java
@@ -17,9 +17,12 @@
  */
 package org.apache.beam.sdk.io.gcp.bigquery;
 
+import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn;
+
 import com.google.auto.service.AutoService;
 import java.util.Collections;
 import java.util.List;
+import org.apache.beam.model.pipeline.v1.ExternalTransforms;
 import org.apache.beam.sdk.annotations.Internal;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
@@ -49,8 +52,6 @@
 public class BigQueryFileLoadsWriteSchemaTransformProvider
     extends TypedSchemaTransformProvider<BigQueryWriteConfiguration> {
 
-  private static final String IDENTIFIER =
-      "beam:schematransform:org.apache.beam:bigquery_fileloads:v1";
   static final String INPUT_TAG = "input";
 
   @Override
@@ -60,7 +61,7 @@ protected SchemaTransform from(BigQueryWriteConfiguration configuration) {
 
   @Override
   public String identifier() {
-    return IDENTIFIER;
+    return getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS);
   }
 
   @Override

diff --git a/.../apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java b/.../apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java
@@ -17,6 +17,7 @@
  */
 package org.apache.beam.sdk.io.gcp.bigquery.providers;
 
+import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn;
 import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument;
 import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull;
 
@@ -26,6 +27,7 @@
 import java.util.Collections;
 import java.util.List;
 import javax.annotation.Nullable;
+import org.apache.beam.model.pipeline.v1.ExternalTransforms;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead;
@@ -78,7 +80,7 @@ protected SchemaTransform from(BigQueryDirectReadSchemaTransformConfiguration co
 
   @Override
   public String identifier() {
-    return "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"; // getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_READ);
+    return getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_READ);
   }
 
   @Override

diff --git a/...he/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/...he/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java
@@ -18,6 +18,7 @@
 package org.apache.beam.sdk.io.gcp.bigquery.providers;
 
 import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration.DYNAMIC_DESTINATIONS;
+import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn;
 import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument;
 
 import com.google.api.services.bigquery.model.TableConstraints;
@@ -27,6 +28,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Optional;
+import org.apache.beam.model.pipeline.v1.ExternalTransforms;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method;
@@ -98,7 +100,7 @@ protected SchemaTransform from(BigQueryWriteConfiguration configuration) {
 
   @Override
   public String identifier() {
-    return "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"; // getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE);
+    return getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_STORAGE_WRITE);
   }
 
   @Override

diff --git a/...e-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java b/...e-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java
@@ -24,12 +24,12 @@
 import java.util.stream.LongStream;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
-import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider;
 import org.apache.beam.sdk.io.gcp.testing.BigqueryClient;
 import org.apache.beam.sdk.managed.Managed;
 import org.apache.beam.sdk.schemas.Schema;
 import org.apache.beam.sdk.testing.PAssert;
 import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.testing.TestPipelineOptions;
 import org.apache.beam.sdk.transforms.Create;
 import org.apache.beam.sdk.transforms.MapElements;
 import org.apache.beam.sdk.transforms.PeriodicImpulse;
@@ -39,15 +39,22 @@
 import org.apache.beam.sdk.values.TypeDescriptors;
 import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
 import org.joda.time.Duration;
+import org.joda.time.Instant;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TestName;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
 
 /** This class tests the execution of {@link Managed} BigQueryIO. */
 @RunWith(JUnit4.class)
 public class BigQueryManagedIT {
+  @Rule public TestName testName = new TestName();
+  @Rule public transient TestPipeline writePipeline = TestPipeline.create();
+  @Rule public transient TestPipeline readPipeline = TestPipeline.create();
+
   private static final Schema SCHEMA =
       Schema.of(
           Schema.Field.of("str", Schema.FieldType.STRING),
@@ -79,34 +86,58 @@ public static void setUpTestEnvironment() throws IOException, InterruptedExcepti
   public static void cleanup() {
     BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID);
   }
+
+  @Test
+  public void testBatchFileLoadsWriteRead() {
+    String table =
+        String.format("%s:%s.%s", PROJECT, BIG_QUERY_DATASET_ID, testName.getMethodName());
+    Map<String, Object> config = ImmutableMap.of("table", table);
+
+    // file loads requires a GCS temp location; reuse the test's tempRoot for it
+    String tempLocation = writePipeline.getOptions().as(TestPipelineOptions.class).getTempRoot();
+    writePipeline.getOptions().setTempLocation(tempLocation);
+
+    // batch write (isStreaming = false)
+    PCollectionRowTuple.of("input", getInput(writePipeline, false))
+        .apply(Managed.write(Managed.BIGQUERY).withConfig(config));
+    writePipeline.run().waitUntilFinish();
+
+    // read the table back in a separate pipeline and validate its contents
+    PCollection<Row> outputRows =
+        readPipeline
+            .apply(Managed.read(Managed.BIGQUERY).withConfig(config))
+            .getSinglePCollection();
+    PAssert.that(outputRows).containsInAnyOrder(ROWS);
+    readPipeline.run().waitUntilFinish();
+  }
 
   @Test
   public void testStreamingStorageWriteRead() {
-    String table = String.format("%s:%s.managed_storage_write_read", PROJECT, BIG_QUERY_DATASET_ID);
-
-    Map<String, Object> writeConfig =
-        ImmutableMap.<String, Object>builder().put("table", table).build();
-    Pipeline p = Pipeline.create();
-    PCollectionRowTuple.of("input", getInput(p, true))
-        .apply(Managed.write(Managed.BIGQUERY).withConfig(writeConfig));
-    p.run().waitUntilFinish();
-
-    Map<String, Object> readConfig =
-        ImmutableMap.<String, Object>builder().put("table", table).build();
-    Pipeline q = Pipeline.create();
+    String table =
+        String.format("%s:%s.%s", PROJECT, BIG_QUERY_DATASET_ID, testName.getMethodName());
+    Map<String, Object> config = ImmutableMap.of("table", table);
+
+    // streaming write
+    PCollectionRowTuple.of("input", getInput(writePipeline, true))
+        .apply(Managed.write(Managed.BIGQUERY).withConfig(config));
+    writePipeline.run().waitUntilFinish();
+
+    // read and validate
     PCollection<Row> outputRows =
-        PCollectionRowTuple.empty(p)
-            .apply(Managed.read(Managed.BIGQUERY).withConfig(readConfig))
-            .get(BigQueryDirectReadSchemaTransformProvider.OUTPUT_TAG);
+        readPipeline
+            .apply(Managed.read(Managed.BIGQUERY).withConfig(config))
+            .getSinglePCollection();
     PAssert.that(outputRows).containsInAnyOrder(ROWS);
-    q.run().waitUntilFinish();
+
+    readPipeline.run().waitUntilFinish();
   }
 
   public PCollection<Row> getInput(Pipeline p, boolean isStreaming) {
     if (isStreaming) {
       return p.apply(
               PeriodicImpulse.create()
-                  .stopAfter(Duration.millis(20))
+                  .startAt(new Instant(0))
+                  .stopAt(new Instant(19))
                   .withInterval(Duration.millis(1)))
           .apply(
               MapElements.into(TypeDescriptors.rows())

diff --git a/sdks/java/managed/expansion-service/build.gradle b/sdks/java/managed/expansion-service/build.gradle
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+apply plugin: 'org.apache.beam.module'
+apply plugin: 'application'
+mainClassName = "org.apache.beam.sdk.expansion.service.ExpansionService"
+
+applyJavaNature(
+        automaticModuleName: 'org.apache.beam.sdk.managed.expansion.service',
+        exportJavadoc: false,
+        validateShadowJar: false,
+        shadowClosure: {},
+)
+
+// TODO(https://github.com/apache/beam/pull/32486/) Use library.java.kafka_clients once >=3.1.0 is set as default
+configurations.runtimeClasspath {
+    // Pin kafka-clients to 3.1.2: versions before 3.1.0 are missing auth callback classes
+    resolutionStrategy.force 'org.apache.kafka:kafka-clients:3.1.2'
+}
+
+shadowJar {
+    mergeServiceFiles()
+    outputs.upToDateWhen { false }
+}
+
+description = "Apache Beam :: SDKs :: Java :: Managed :: Expansion Service"
+ext.summary = "Expansion service for Managed Transforms"
+
+dependencies {
+    runtimeOnly project(":sdks:java:expansion-service")
+
+    // **** IcebergIO and dependencies ****
+    runtimeOnly project(":sdks:java:io:iceberg")
+    // Needed when writing to GCS
+    runtimeOnly library.java.bigdataoss_gcs_connector
+    runtimeOnly library.java.hadoop_client
+    // For HiveCatalog
+    runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:1.4.2")
+    runtimeOnly project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow")
+
+    // **** KafkaIO and dependencies ****
+    runtimeOnly project(":sdks:java:io:kafka")
+    runtimeOnly library.java.kafka_clients
+
+    runtimeOnly library.java.slf4j_jdk14
+}
diff --git a/sdks/java/managed/expansion-service/container/build.gradle b/sdks/java/managed/expansion-service/container/build.gradle
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+plugins {
+    id 'java'
+}
+
+group = 'org.apache.beam.sdk.managed.expansion.service.container'
+// NOTE(review): version is hard-coded here; confirm it should not come from the root project.
+version = '2.61.0-SNAPSHOT'
+
+repositories {
+    mavenCentral()
+}
+
+dependencies {
+    testImplementation platform('org.junit:junit-bom:5.9.1')
+    testImplementation 'org.junit.jupiter:junit-jupiter'
+}
+
+test {
+    useJUnitPlatform()
+}
diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java
@@ -99,15 +99,17 @@ public class Managed {
       ImmutableMap.<String, String>builder()
           .put(ICEBERG, getUrn(ExternalTransforms.ManagedTransforms.Urns.ICEBERG_WRITE))
           .put(KAFKA, getUrn(ExternalTransforms.ManagedTransforms.Urns.KAFKA_WRITE))
-          .put(BIGQUERY, getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE))
+          .put(BIGQUERY, getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_STORAGE_WRITE))
           .build();
 
   /**
    * Instantiates a {@link Managed.ManagedTransform} transform for the specified source. The
    * supported managed sources are:
    *
    * <ul>
-   *   <li>{@link Managed#ICEBERG} : Read from Apache Iceberg
+   *   <li>{@link Managed#ICEBERG} : Read from Apache Iceberg tables
+   *   <li>{@link Managed#KAFKA} : Read from Apache Kafka topics
+   *   <li>{@link Managed#BIGQUERY} : Read from GCP BigQuery tables
    * </ul>
    */
   public static ManagedTransform read(String source) {
@@ -127,18 +129,23 @@ public static ManagedTransform read(String source) {
    * managed sinks are:
    *
    * <ul>
-   *   <li>{@link Managed#ICEBERG} : Write to Apache Iceberg
+   *   <li>{@link Managed#ICEBERG} : Write to Apache Iceberg tables
+   *   <li>{@link Managed#KAFKA} : Write to Apache Kafka topics
+   *   <li>{@link Managed#BIGQUERY} : Write to GCP BigQuery tables
    * </ul>
    */
   public static ManagedTransform write(String sink) {
+    List<String> supportedIdentifiers = new ArrayList<>(WRITE_TRANSFORMS.values());
+    supportedIdentifiers.add(getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS));
+
     return new AutoValue_Managed_ManagedTransform.Builder()
         .setIdentifier(
             Preconditions.checkNotNull(
                 WRITE_TRANSFORMS.get(sink.toLowerCase()),
                 "An unsupported sink was specified: '%s'. Please specify one of the following sinks: %s",
                 sink,
                 WRITE_TRANSFORMS.keySet()))
-        .setSupportedIdentifiers(new ArrayList<>(WRITE_TRANSFORMS.values()))
+        .setSupportedIdentifiers(supportedIdentifiers)
         .build();
   }