Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into ml_transform_criteo
Browse files Browse the repository at this point in the history
  • Loading branch information
AnandInguva committed Feb 7, 2024
2 parents cebdefe + 4f3963f commit e031888
Show file tree
Hide file tree
Showing 213 changed files with 3,595 additions and 749 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/beam_PostCommit_Java_DataflowV2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ name: PostCommit Java Dataflow V2

on:
schedule:
- cron: '30 3/6 * * *'
- cron: '30 3/8 * * *'
pull_request_target:
paths: ['release/trigger_all_tests.json', '.github/trigger_files/beam_PostCommit_Java_DataflowV2.json']
workflow_dispatch:
Expand Down Expand Up @@ -54,7 +54,7 @@ jobs:
beam_PostCommit_Java_DataflowV2:
name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
runs-on: [self-hosted, ubuntu-20.04, main]
timeout-minutes: 240
timeout-minutes: 360
strategy:
matrix:
job_name: [beam_PostCommit_Java_DataflowV2]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ jobs:
uses: actions/upload-artifact@v4
if: failure()
with:
name: Python Test Results
name: Python ${{ matrix.python_version }} Test Results
path: '**/pytest*.xml'
- name: Publish Python Test Results
uses: EnricoMi/publish-unit-test-result-action@v2
Expand Down
12 changes: 6 additions & 6 deletions .test-infra/metrics/sync/github/sync_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,6 @@
'Beam Metrics Report',
'Build and Version Runner Docker Image',
'PreCommit GHA',
'pr-bot-prs-needing-attention',
'PreCommit RAT',
'Assign or close an issue',
'PostCommit Website Test',
Expand All @@ -339,8 +338,10 @@
'PreCommit Whitespace',
'Publish Beam SDK Snapshots',
'Cancel Stale Dataflow Jobs',
'pr-bot-new-prs',
'pr-bot-pr-updates',
'pr-bot-new-prs'
'pr-bot-prs-needing-attention',
'pr-bot-update-reviewers'
]

MISC_TESTS = [
Expand All @@ -355,7 +356,6 @@
'Cancel',
'PostCommit PortableJar Spark',
'PreCommit Integration and Load Test Framework',
'pr-bot-update-reviewers',
'PostCommit TransformService Direct',
'Cut Release Branch',
'Generate issue report',
Expand Down Expand Up @@ -404,7 +404,7 @@ def get_dashboard_category(workflow_name):
return 'go'
if workflow_name in MISC_TESTS:
return 'misc'

print(f'No category found for workflow: {workflow_name}')
print('Falling back to rules based assignment')

Expand Down Expand Up @@ -471,7 +471,7 @@ def get_token():
git_integration = GithubIntegration(GH_APP_ID, GH_PEM_KEY)
token = git_integration.get_access_token(GH_APP_INSTALLATION_ID).token
return f'Bearer {token}'

@backoff.on_exception(backoff.constant, aiohttp.ClientResponseError, max_tries=5)
async def fetch(url, semaphore, params=None, headers=None, request_id=None):
async with semaphore:
Expand Down Expand Up @@ -574,7 +574,7 @@ def append_workflow_runs(workflow, runs):
else:
workflow_ids_to_fetch_extra_runs.pop(workflow_id, None)
print(f"Successfully fetched details for: {workflow.filename}")

page = math.ceil(
int(GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH) / number_of_entries_per_page
) + 1
Expand Down
34 changes: 21 additions & 13 deletions .test-infra/tools/stale_bq_datasets_cleaner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Deletes stale and old BQ datasets that are left after tests.
#

set -euo pipefail
set -exuo pipefail

PROJECT=apache-beam-testing
MAX_RESULT=1500
Expand All @@ -36,19 +36,27 @@ for dataset in ${BQ_DATASETS[@]}; do
if [[ $dataset =~ $template ]]; then
# The BQ API reports LAST MODIFIED TIME in miliseconds, while unix works in seconds since epoch
# thus why we need to convert to seconds.
[[ `bq --format=json --project_id=$PROJECT show $dataset` =~ \"lastModifiedTime\":\"([0-9]+)\" ]]
LAST_MODIFIED_MS=${BASH_REMATCH[1]}
LAST_MODIFIED=$(($LAST_MODIFIED_MS / 1000))
if [[ $GRACE_PERIOD -gt $LAST_MODIFIED ]]; then
if bq --project_id=$PROJECT rm -r -f $dataset; then
if [[ $OSTYPE == "linux-gnu"* ]]; then
# date command usage depending on OS
echo "Deleted $dataset (modified `date -d @$LAST_MODIFIED`)"
elif [[ $OSTYPE == "darwin"* ]]; then
echo "Deleted $dataset (modified `date -r @$LAST_MODIFIED`)"

failed=0
ds=`bq --format=json --project_id=$PROJECT show $dataset` || failed=1
if [[ $failed -eq 1 ]]; then
echo "Could not find dataset $dataset - it may have already been deleted, skipping"
else
[[ $ds =~ \"lastModifiedTime\":\"([0-9]+)\" ]]
LAST_MODIFIED_MS=${BASH_REMATCH[1]}
LAST_MODIFIED=$(($LAST_MODIFIED_MS / 1000))
if [[ $GRACE_PERIOD -gt $LAST_MODIFIED ]]; then
if bq --project_id=$PROJECT rm -r -f $dataset; then
if [[ $OSTYPE == "linux-gnu"* ]]; then
# date command usage depending on OS
echo "Deleted $dataset (modified `date -d @$LAST_MODIFIED`)"
elif [[ $OSTYPE == "darwin"* ]]; then
echo "Deleted $dataset (modified `date -r @$LAST_MODIFIED`)"
fi
else
echo "Tried and failed to delete $dataset"
failed_calls+=1
fi
else
failed_calls+=1
fi
fi
break
Expand Down
55 changes: 32 additions & 23 deletions .test-infra/tools/stale_dataflow_prebuilt_image_cleaner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ PRIVATE_REPOSITORIES=(java-postcommit-it python-postcommit-it jenkins github-act
# set as the same as 6-week release period
if [[ $OSTYPE == "linux-gnu"* ]]; then
# date command usage depending on OS
DELETE_BEFORE_DAY=$(date --iso-8601=s -d '6 weeks ago')
DELETE_BEFORE_PUBLIC=$(date --iso-8601=s -d '6 weeks ago')
DELETE_BEFORE_PRIVATE=$(date --iso-8601=s -d '3 days ago')
elif [[ $OSTYPE == "darwin"* ]]; then
DELETE_BEFORE_DAY=$(date -j -v-6w '+%Y-%m-%dT%H:%M:%S')
DELETE_BEFORE_PUBLIC=$(date -j -v-6w '+%Y-%m-%dT%H:%M:%S')
DELETE_BEFORE_PRIVATE=$(date -j -v-3d '+%Y-%m-%dT%H:%M:%S')
fi

REPOSITORIES=("${PUBLIC_REPOSITORIES[@]/#/gcr.io/apache-beam-testing/}" "${PRIVATE_REPOSITORIES[@]/#/us.gcr.io/apache-beam-testing/}")

echo $REPOSITORIES
REPOSITORIES=("${PRIVATE_REPOSITORIES[@]/#/us.gcr.io/apache-beam-testing/}" "${PUBLIC_REPOSITORIES[@]/#/gcr.io/apache-beam-testing/}")

# walk repos recursively
IMAGE_NAMES=""
Expand All @@ -54,8 +54,8 @@ while [ -n "$REPOSITORIES" ]; do
REPOSITORIES=("${PENDING_REPOSITORIES[@]}")
done

STALE_IMAGES=""
FAILED_INSPECT=""
HAS_STALE_IMAGES=""
FAILED_IMAGES=""

for image_name in ${IMAGE_NAMES[@]}; do
echo IMAGES FOR image ${image_name}
Expand All @@ -64,22 +64,31 @@ for image_name in ${IMAGE_NAMES[@]}; do
LATEST_IN_TIME=$(gcloud container images list-tags \
${image_name} --sort-by="~TIMESTAMP" --filter="NOT tags:latest " --format="get(digest)" --limit=1)
if [ -n "$LATEST_IN_TIME" ]; then
# decide timestamp cutoff
if [[ $image_name =~ 'us.gcr.io' ]]; then
DELETE_BEFORE_DAY=$DELETE_BEFORE_PRIVATE
else
DELETE_BEFORE_DAY=$DELETE_BEFORE_PUBLIC
fi
# list containers of the image name
echo "Command" gcloud container images list-tags \
${image_name} \
--sort-by=TIMESTAMP --filter="NOT tags:latest AND timestamp.datetime < $DELETE_BEFORE_DAY" \
--format="get(digest)"
STALE_IMAGES_CURRENT=$(gcloud container images list-tags \
--format="get(digest,timestamp.year)"
STALE_IMAGES=$(gcloud container images list-tags \
${image_name} \
--sort-by=TIMESTAMP --filter="NOT tags:latest AND timestamp.datetime < $DELETE_BEFORE_DAY" \
--format="get(digest)")
STALE_IMAGES+=$STALE_IMAGES_CURRENT
for current in ${STALE_IMAGES_CURRENT[@]}; do
--format="get(digest,timestamp.year)")

STALE_IMAGES_CURRENT=($STALE_IMAGES)
for (( i_stale_images_current=0; i_stale_images_current<${#STALE_IMAGES_CURRENT[@]} ; i_stale_images_current+=2 )) ; do
current=${STALE_IMAGES_CURRENT[i_stale_images_current]}
currentyear=${STALE_IMAGES_CURRENT[i_stale_images_current+1]}
# do not delete the one with latest label and the newest image without latest label
# this make sure we leave at least one container under each image name, either labelled "latest" or not
if [ "$LATEST_IN_TIME" != "$current" ]; then
if [[ $image_name =~ 'beamgrafana' || $image_name =~ 'beammetricssyncjenkins' || $image_name =~ 'beammetricssyncgithub' ]]; then
# Skip docker manifest inspect for known single arch images, workaround permission issue & saving API call
if [[ $currentyear > 1970 ]]; then
# Skip docker manifest inspect for those not in epoch to save API call
SHOULD_DELETE=0
else
# Check to see if this image is built on top of earlier images. This is the case for multiarch images,
Expand All @@ -89,24 +98,24 @@ for image_name in ${IMAGE_NAMES[@]}; do
MANIFEST=$(docker manifest inspect ${image_name}@"${current}" || echo "")
if [ -z "$MANIFEST" ]; then
# Sometimes "no such manifest" seen. Skip current if command hit error
FAILED_INSPECT+=" $current"
FAILED_IMAGES+=" $current"
continue
fi
SHOULD_DELETE=0
DIGEST=$(echo $MANIFEST | jq -r '.manifests[0].digest')
if [ "$DIGEST" != "null" ]; then
SHOULD_DELETE=1
for i in ${STALE_IMAGES_CURRENT[@]}; do
if [ "$i" = "$DIGEST" ]; then
for (( j_stale_images_current=0; j_stale_images_current<${#STALE_IMAGES_CURRENT[@]} ; j_stale_images_current+=2 )) ; do
if [ "${STALE_IMAGES_CURRENT[j_stale_images_current]}" = "$DIGEST" ]; then
SHOULD_DELETE=0
break
fi
done
fi
fi

if [ $SHOULD_DELETE = 0 ]
then
if [ $SHOULD_DELETE = 0 ]; then
HAS_STALE_IMAGES="true"
echo "Deleting image. Command: gcloud container images delete ${image_name}@"${current}" --force-delete-tags -q"
gcloud container images delete ${image_name}@"${current}" --force-delete-tags -q || FAILED_TO_DELETE+="${current} "
fi
Expand All @@ -120,18 +129,18 @@ for image_name in ${IMAGE_NAMES[@]}; do
echo "Failed to delete the following images: ${FAILED_TO_DELETE}. Retrying each of them."
for current in $RETRY_DELETE; do
echo "Trying again to delete image ${image_name}@"${current}". Command: gcloud container images delete ${image_name}@"${current}" --force-delete-tags -q"
gcloud container images delete ${image_name}@"${current}" --force-delete-tags -q
gcloud container images delete ${image_name}@"${current}" --force-delete-tags -q || FAILED_IMAGES+=" ${image_name}@${current}"
done
fi
done

if [[ ${STALE_IMAGES} ]]; then
if [[ -n "$HAS_STALE_IMAGES" ]]; then
echo "Deleted multiple images"
else
echo "No stale prebuilt container images found."
fi

if [ -n "$FAILED_INSPECT" ]; then
echo "Failed delete images $FAILED_INSPECT"
if [ -n "$FAILED_IMAGES" ]; then
echo "Failed delete images $FAILED_IMAGES"
exit 1
fi
3 changes: 2 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@

## New Features / Improvements

* X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).
* [Enrichment Transform](https://s.apache.org/enrichment-transform) along with GCP BigTable handler added to Python SDK ([#30001](https://github.com/apache/beam/pull/30001)).
* Allow writing clustered and not time partitioned BigQuery tables (Java) ([#30094](https://github.com/apache/beam/pull/30094)).

## Breaking Changes

Expand Down
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,6 @@ tasks.register("javaPreCommit") {
dependsOn(":sdks:java:extensions:sorter:build")
dependsOn(":sdks:java:extensions:timeseries:build")
dependsOn(":sdks:java:extensions:zetasketch:build")
dependsOn(":sdks:java:fn-execution:build")
dependsOn(":sdks:java:harness:build")
dependsOn(":sdks:java:harness:jmh:build")
dependsOn(":sdks:java:io:bigquery-io-perf-tests:build")
Expand All @@ -313,6 +312,7 @@ tasks.register("javaPreCommit") {
dependsOn(":sdks:java:testing:tpcds:build")
dependsOn(":sdks:java:testing:watermarks:build")
dependsOn(":sdks:java:transform-service:build")
dependsOn(":sdks:java:transform-service:app:build")
dependsOn(":sdks:java:transform-service:launcher:build")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2860,15 +2860,15 @@ class BeamModulePlugin implements Plugin<Project> {
project.evaluationDependsOn(":sdks:python")
project.evaluationDependsOn(":runners:core-construction-java")
project.evaluationDependsOn(":sdks:java:extensions:python")
project.evaluationDependsOn(":sdks:java:transform-service:launcher")
project.evaluationDependsOn(":sdks:java:transform-service:app")

def usesDataflowRunner = config.pythonPipelineOptions.contains("--runner=TestDataflowRunner") || config.pythonPipelineOptions.contains("--runner=DataflowRunner")

// Task for launching transform services
def envDir = project.project(":sdks:python").envdir
def pythonDir = project.project(":sdks:python").projectDir
def externalPort = getRandomPort()
def launcherJar = project.project(':sdks:java:transform-service:launcher').shadowJar.archivePath
def launcherJar = project.project(':sdks:java:transform-service:app').shadowJar.archivePath
def groupId = project.name + randomUUID().toString()
def transformServiceOpts = [
"transform_service_launcher_jar": launcherJar,
Expand All @@ -2895,7 +2895,7 @@ class BeamModulePlugin implements Plugin<Project> {
dependsOn ':sdks:python:expansion-service-container:docker'
dependsOn ':sdks:java:expansion-service:container:docker'
dependsOn ":sdks:python:installGcpTest"
dependsOn project.project(':sdks:java:transform-service:launcher').shadowJar.getPath()
dependsOn project.project(':sdks:java:transform-service:app').shadowJar.getPath()

if (usesDataflowRunner) {
dependsOn ":sdks:python:test-suites:dataflow:py${project.ext.pythonVersion.replace('.', '')}:initializeForDataflowJob"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,16 @@ class GrpcVendoring_1_60_1 {
"com.google.protobuf:protobuf-java:$protobuf_version",
"com.google.protobuf:protobuf-java-util:$protobuf_version",
"com.google.code.gson:gson:$gson_version",
"io.grpc:grpc-alts:$grpc_version",
"io.grpc:grpc-auth:$grpc_version",
"io.grpc:grpc-core:$grpc_version",
"io.grpc:grpc-context:$grpc_version",
"io.grpc:grpc-core:$grpc_version",
"io.grpc:grpc-netty-shaded:$grpc_version",
"io.grpc:grpc-protobuf:$grpc_version",
"io.grpc:grpc-services:$grpc_version",
"io.grpc:grpc-stub:$grpc_version",
"io.grpc:grpc-alts:$grpc_version",
"io.grpc:grpc-testing:$grpc_version",
"io.grpc:grpc-util:$grpc_version",
"com.google.auth:google-auth-library-credentials:$google_auth_version",
"com.google.api.grpc:proto-google-common-protos:$proto_google_common_protos_version",
"io.opencensus:opencensus-api:$opencensus_version",
Expand Down
6 changes: 3 additions & 3 deletions contributor-docs/release-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -564,11 +564,11 @@ The following should be confirmed:
At
[https://hub.docker.com/u/apache](https://hub.docker.com/search?q=apache%2Fbeam&type=image),
visit each repository and navigate to "tags" tab. Verify images are pushed
with tags: `${RELEASE_VERSION}rc{RC_NUM}`
with tags: `${RELEASE_VERSION}rc${RC_NUM}`
Verify that third party licenses are included in Docker. You can do this with a simple script:
RC_TAG=${RELEASE_VERSION}rc{RC_NUM}
RC_TAG=${RELEASE_VERSION}rc${RC_NUM}
for pyver in 3.8 3.9 3.10 3.11; do
docker run --rm --entrypoint sh \
apache/beam_python${pyver}_sdk:${RC_TAG} \
Expand All @@ -577,7 +577,7 @@ Verify that third party licenses are included in Docker. You can do this with a
for javaver in 8 11 17; do
docker run --rm --entrypoint sh \
apache/beam_java${pyver}_sdk:${RC_TAG} \
apache/beam_java${javaver}_sdk:${RC_TAG} \
-c 'ls -al /opt/apache/beam/third_party_licenses/ | wc -l'
done
Expand Down
Loading

0 comments on commit e031888

Please sign in to comment.