Skip to content

Commit

Permalink
Merge pull request #67 from ASFHyP3/harvest-fix
Browse files Browse the repository at this point in the history
Harvest products using download/upload instead of S3 copy
  • Loading branch information
asjohnston-asf authored Jun 27, 2022
2 parents 8e3cfdf + e72808a commit 63213e0
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 60 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [PEP 440](https://www.python.org/dev/peps/pep-0440/)
and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.0.13]
### Changed
- `harvest_products` Lambda function now harvests data products via download and upload, rather than S3 copy.

## [0.0.12]
### Fixed
- Handle the case where a product has no temporal neighbors.
Expand Down
11 changes: 2 additions & 9 deletions harvest_products/cloudformation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,7 @@ Resources:
- dynamodb:Query
Resource: !Sub "arn:aws:dynamodb:${AWS::Region}:${AWS::AccountId}:table/${ProductTable}*"
- Effect: Allow
Action:
- s3:GetObject
- s3:GetObjectTagging
Resource: "arn:aws:s3:::*/*"
- Effect: Allow
Action:
- s3:PutObject
- s3:PutObjectTagging
Action: s3:PutObject
Resource: !Sub "arn:aws:s3:::${ProductBucket}/*"

Lambda:
Expand All @@ -66,7 +59,7 @@ Resources:
EDL_PASSWORD: !Ref EDLPassword
Code: src/
Handler: harvest_products.lambda_handler
MemorySize: 256
MemorySize: 2048
Role: !GetAtt Role.Arn
Runtime: python3.8
Timeout: 900
Expand Down
27 changes: 10 additions & 17 deletions harvest_products/src/harvest_products.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,34 +13,27 @@
S3 = boto3.resource('s3')


def harvest_file(file_url, destination_prefix):
    """Download a file over HTTP and upload it to the harvest S3 bucket.

    Args:
        file_url: URL of the file to download.
        destination_prefix: S3 key prefix under which to store the file in
            the destination bucket; the final key is ``{prefix}/{filename}``.

    Returns:
        The ``https`` URL of the uploaded object in the destination bucket.

    Raises:
        requests.HTTPError: If the download request returns an error status.
    """
    # Bucket name comes from the environment so each deployment can configure it.
    destination_bucket = S3.Bucket(environ['BUCKET_NAME'])
    filename = basename(urlparse(file_url).path)
    destination_key = f'{destination_prefix}/{filename}'
    # NOTE: the whole file is buffered in memory (response.content); the
    # Lambda's MemorySize was raised to 2048 MB to accommodate this.
    response = requests.get(file_url)
    response.raise_for_status()
    # Fall back to a generic binary content type when the extension is unrecognized.
    content_type = guess_type(filename)[0] if guess_type(filename)[0] else 'application/octet-stream'
    destination_bucket.put_object(Body=io.BytesIO(response.content), Key=destination_key, ContentType=content_type)
    return f'https://{destination_bucket.name}.s3.amazonaws.com/{destination_key}'


def harvest(product, job):
    """Harvest a completed job's output files into the harvest bucket.

    Downloads the product file, browse image, and thumbnail image via HTTP
    and uploads each to the destination bucket under a per-product prefix,
    rather than copying between S3 buckets.

    Args:
        product: Product record; ``event_id`` and ``product_id`` form the
            destination key prefix.
        job: Job object exposing ``files`` (dicts with ``filename``, ``size``,
            and ``url``), ``browse_images``, and ``thumbnail_images``; only the
            first entry of each list is harvested.

    Returns:
        dict with the harvested ``browse_url``, ``thumbnail_url``, and
        ``product_url``, plus the product's ``product_name`` and
        ``product_size``.
    """
    product_file = job.files[0]
    destination_prefix = f'{product["event_id"]}/{product["product_id"]}'

    return {
        'browse_url': harvest_file(job.browse_images[0], destination_prefix),
        'thumbnail_url': harvest_file(job.thumbnail_images[0], destination_prefix),
        'product_name': product_file['filename'],
        'product_size': product_file['size'],
        'product_url': harvest_file(product_file['url'], destination_prefix),
    }


Expand Down
52 changes: 18 additions & 34 deletions tests/test_harvest_products.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from os import environ
from unittest import mock
from unittest.mock import MagicMock, call, patch

import responses
from botocore.stub import ANY
Expand All @@ -10,7 +10,7 @@


@responses.activate
def test_harvest_image(s3_stubber):
def test_harvest_file(s3_stubber):
responses.add(responses.GET, 'https://foo.com/file.png', body='image_content')
params = {
'Bucket': environ['BUCKET_NAME'],
Expand All @@ -19,16 +19,16 @@ def test_harvest_image(s3_stubber):
'Body': ANY
}
s3_stubber.add_response(method='put_object', expected_params=params, service_response={})
bucket = harvest_products.S3.Bucket(environ['BUCKET_NAME'])
response = harvest_products.harvest_image('https://foo.com/file.png', bucket, 'prefix')
response = harvest_products.harvest_file('https://foo.com/file.png', 'prefix')

assert response == f'https://{environ["BUCKET_NAME"]}.s3.amazonaws.com/prefix/file.png'


def test_harvest(s3_stubber):
@patch('harvest_products.harvest_file')
def test_harvest(mock_harvest_file: MagicMock):
product = {
'event_id': '1',
'product_id': 'source_prefix',
'event_id': 'event_id',
'product_id': 'product_id',
'granules': [],
'status_code': 'PENDING',
'processing_date': '2020-01-01T00:00:00+00:00'
Expand All @@ -39,10 +39,7 @@ class MockJob:
{
'filename': 'product.zip',
'size': 123,
's3': {
'bucket': 'sourceBucket',
'key': 'source_prefix/product.zip',
},
'url': 'PRODUCT_URL',
},
]
browse_images = [
Expand All @@ -52,36 +49,23 @@ class MockJob:
'THUMBNAIL_IMAGE_URL',
]

params = {
'Bucket': 'sourceBucket',
'Key': 'source_prefix/product.zip',
}
s3_response = {
'ContentLength': 123
}
s3_stubber.add_response(method='head_object', expected_params=params, service_response=s3_response)

params = {
'Bucket': environ['BUCKET_NAME'],
'Key': '1/source_prefix/product.zip',
'CopySource': {
'Bucket': 'sourceBucket',
'Key': 'source_prefix/product.zip'
},
}
s3_stubber.add_response(method='copy_object', expected_params=params, service_response={})

with mock.patch('harvest_products.harvest_image', lambda x, y, z: 'https://foo.com/file.png'):
files = harvest_products.harvest(product, MockJob())
mock_harvest_file.return_value = 'https://foo.com/file.png'
files = harvest_products.harvest(product, MockJob())

assert files == {
'browse_url': 'https://foo.com/file.png',
'thumbnail_url': 'https://foo.com/file.png',
'product_name': 'product.zip',
'product_size': 123,
'product_url': f'https://{environ["BUCKET_NAME"]}.s3.amazonaws.com/1/source_prefix/product.zip'
'product_url': 'https://foo.com/file.png'
}

assert mock_harvest_file.mock_calls == [
call('BROWSE_IMAGE_URL', 'event_id/product_id'),
call('THUMBNAIL_IMAGE_URL', 'event_id/product_id'),
call('PRODUCT_URL', 'event_id/product_id'),
]


def test_update_product_succeeded(tables):
product = {
Expand Down Expand Up @@ -118,7 +102,7 @@ def test_update_product_succeeded(tables):
'product_url': f'https://{environ["BUCKET_NAME"]}.s3.amazonaws.com/1/foo/product.zip'
}

with mock.patch('harvest_products.harvest', lambda x, y: mock_harvest):
with patch('harvest_products.harvest', lambda x, y: mock_harvest):
harvest_products.update_product(product, job)

updated_product = tables.product_table.scan()['Items'][0]
Expand Down

0 comments on commit 63213e0

Please sign in to comment.