Skip to content

Commit

Permalink
Refactor pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
AnandInguva committed Jan 30, 2024
1 parent b498e4a commit 30041ad
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 17 deletions.
28 changes: 12 additions & 16 deletions sdks/python/apache_beam/testing/benchmarks/mltransform/criteo.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def parse_known_args(argv):
return parser.parse_known_args(argv)


def run(argv=None, ):
def run(argv=None):
known_args, pipeline_args = parse_known_args(argv)
options = PipelineOptions(flags=pipeline_args)
data_path = known_args.input
Expand All @@ -99,21 +99,17 @@ def run(argv=None, ):
for i in range(len(x))})
| beam.Map(convert_str_to_int))

# processed_lines | beam.Map(logging.info)

artifact_location = known_args.artifact_location
if not artifact_location:
import tempfile
artifact_location = tempfile.mkdtemp(prefix='criteo-mltransform-')
ml_transform = MLTransform(write_artifact_location=artifact_location)
ml_transform.with_transform(
ComputeAndApplyVocabulary(columns=CATEGORICAL_FEATURE_KEYS))
ml_transform.with_transform(
Bucketize(columns=NUMERIC_FEATURE_KEYS, num_buckets=_NUM_BUCKETS))

transformed_lines = (processed_lines | 'MLTransform' >> ml_transform)

# _ = transformed_lines | beam.Map(logging.info)
transformed_lines = (
processed_lines
| "MLTransform" >>
MLTransform(write_artifact_location=known_args.artifact_location).
with_transform(
ComputeAndApplyVocabulary(
columns=CATEGORICAL_FEATURE_KEYS, frequency_threshold=5)
).with_transform(
Bucketize(columns=NUMERIC_FEATURE_KEYS, num_buckets=_NUM_BUCKETS)))

transformed_lines | beam.Map(logging.info)


if __name__ == '__main__':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ def test_process_criteo_10GB_dataset(self):
# beam pipeline options
extra_opts['input'] = os.path.join(
_INPUT_GCS_BUCKET_ROOT, constants.INPUT_CRITEO_10GB)
logging.info("#################")
extra_opts['artifact_location'] = os.path.join(
_OUTPUT_GCS_BUCKET_ROOT, uuid.uuid4().hex)

logging.info(extra_opts['artifact_location'])
extra_opts['frequency_threshold'] = 0

extra_opts['job_name'] = (
Expand Down

0 comments on commit 30041ad

Please sign in to comment.