From 541e5c85031a4c20a2c979e1b126ddaad2e0bd24 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:48:45 -0700 Subject: [PATCH] Make BQ file load limit controls public (#32101) --- .../beam/sdk/io/gcp/bigquery/BigQueryIO.java | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java index 2a16bf31a6cb..aa094ace7562 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java @@ -3199,14 +3199,34 @@ public Write withMaxFilesPerBundle(int maxFilesPerBundle) { return toBuilder().setMaxFilesPerBundle(maxFilesPerBundle).build(); } - @VisibleForTesting - Write withMaxFileSize(long maxFileSize) { + /** + * Controls the maximum byte size per file to be loaded into BigQuery. If the amount of data + * written to one file reaches this threshold, we will close that file and continue writing in a + * new file. + * + *

<p>The default value (4 TiB) stays within BigQuery's maximum size for a single file in a load + * job. + * + * @see <a href="https://cloud.google.com/bigquery/quotas#load_jobs">BigQuery Load Job + * Limits</a> + */ + public Write withMaxFileSize(long maxFileSize) { checkArgument(maxFileSize > 0, "maxFileSize must be > 0, but was: %s", maxFileSize); return toBuilder().setMaxFileSize(maxFileSize).build(); } - @VisibleForTesting - Write withMaxFilesPerPartition(int maxFilesPerPartition) { + /** + * Controls how many files will be assigned to a single BigQuery load job. If the number of + * files increases past this threshold, we will spill it over into multiple load jobs as + * necessary. + * + *

<p>The default value (10,000 files) respects BigQuery's maximum number of source URIs per job + * configuration. + * + * @see <a href="https://cloud.google.com/bigquery/quotas#load_jobs">BigQuery Load Job + * Limits</a> + */ + public Write withMaxFilesPerPartition(int maxFilesPerPartition) { checkArgument( maxFilesPerPartition > 0, "maxFilesPerPartition must be > 0, but was: %s",