Skip to content

Commit

Permalink
Merge pull request #80 from OpenDataAlex/process_tracker_python-71
Browse files Browse the repository at this point in the history
Process tracker python 71
  • Loading branch information
OpenDataAlex authored Jul 19, 2019
2 parents 5066347 + 0be167c commit 002703c
Show file tree
Hide file tree
Showing 3 changed files with 179 additions and 34 deletions.
85 changes: 57 additions & 28 deletions process_tracker/location_tracker.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Location
# For processes dealing with Extract Locations.
import logging
from os.path import basename, normpath
from pathlib import PurePath
from os.path import basename, dirname, join, isdir, normpath

from process_tracker.utilities.aws_utilities import AwsUtilities
from process_tracker.utilities.logging import console
Expand All @@ -24,15 +25,15 @@ def __init__(self, location_path, location_name=None, data_store=None):
raise Exception("Data store is not set.")
else:
self.data_store = data_store
self.session = self.data_store.session

self.location_path = location_path.lower()
self.location_name = location_name
self.location_bucket_name = self.determine_location_bucket_name()

if location_name is None:
self.logger.info("Location name not provided. Generating.")
self.location_name = self.derive_location_name()
else:
self.logger.info("Using provided location name: %s" % location_name)
self.location_name = location_name

self.location_type = self.derive_location_type()

Expand All @@ -43,10 +44,9 @@ def __init__(self, location_path, location_name=None, data_store=None):
location_name=self.location_name,
location_path=location_path,
location_type_id=self.location_type.location_type_id,
location_bucket_name=self.location_bucket_name,
)

self.location_bucket_name = self.determine_location_bucket_name()

def derive_location_name(self):
"""
If location name is not provided, attempt to derive name from path.
Expand All @@ -57,18 +57,50 @@ def derive_location_name(self):

location_prefix = None

location_name = ""
current_name = (
self.session.query(Location)
.filter(Location.location_path == self.location_path)
.first()
)

if "s3" in self.location_path:
# If the path is an S3 Bucket, prefix to name.
self.logger.info("Location appears to be s3 related. Setting prefix.")
location_prefix = "s3"
if current_name is not None:
location_name = current_name.location_name
else:
location_name = ""

if "s3" in self.location_path:
# If the path is an S3 Bucket, prefix to name.
self.logger.info("Location appears to be s3 related. Setting prefix.")
location_prefix = "s3 %s" % self.location_bucket_name
else:
location_prefix = "local"

if location_prefix is not None:
self.logger.info(
"Location prefix provided. Appending to location name."
)
location_name = location_prefix + " - "

if location_prefix is not None:
self.logger.info("Location prefix provided. Appending to location name.")
location_name = location_prefix + " - "
if "." in str(PurePath(self.location_path).name):
location_name += PurePath(self.location_path).parent.name
else:
location_name += PurePath(self.location_path).name

location_name += basename(normpath(self.location_path))
name_count = (
self.session.query(Location)
.filter(Location.location_name.like(location_name + "%"))
.count()
)

if name_count >= 1:
self.logger.info(
"The location name already exists. There are %s instances."
% name_count
)

location_name = "%s - %s" % (location_name, name_count)

self.logger.info("Location name is now %s" % location_name)

return location_name

Expand Down Expand Up @@ -106,29 +138,26 @@ def register_file_count(self, file_count):
"""

self.location.location_file_count = file_count
self.data_store.session.commit()
self.session.commit()

def determine_location_bucket_name(self):
"""
If location is of type 's3', then find which bucket the location belongs to.
:return:
"""
self.logger.info("Determining if location is s3.")
if "s3" in self.location_path or "s3" in self.location_name:
if "s3" in self.location_path or (
self.location_name is not None and "s3" in self.location_name
):

self.logger.info("Location is in s3.")
if self.location.location_bucket_name is None:
self.logger.info("Location bucket was not set.")

self.location.location_bucket_name = AwsUtilities().determine_bucket_name(
path=self.location.location_path
)

self.data_store.session.commit()
location_bucket_name = AwsUtilities().determine_bucket_name(
path=self.location_path
)

else:
self.location.location_bucket_name = None
location_bucket_name = None

self.data_store.session.commit()
self.session.commit()

return self.location.location_bucket_name
return location_bucket_name
4 changes: 2 additions & 2 deletions tests/test_extract_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def test_derive_location_name_local_path(self):
)

given_result = location[0].location_name
expected_result = "extract_dir2"
expected_result = "local - extract_dir2"

self.assertEqual(expected_result, given_result)

Expand All @@ -266,7 +266,7 @@ def test_derive_location_name_s3(self):
)

given_result = location[0].location_name
expected_result = "s3 - extract_dir"
expected_result = "s3 test-test - extract_dir"

self.assertEqual(expected_result, given_result)

Expand Down
124 changes: 120 additions & 4 deletions tests/test_location_tracker.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import unittest

from process_tracker.models.extract import Location

from process_tracker.utilities.data_store import DataStore
from process_tracker.location_tracker import LocationTracker

Expand All @@ -8,16 +10,55 @@ class TestLocationTracker(unittest.TestCase):
@classmethod
def setUpClass(cls):
    """Create one DataStore for the test class and expose its session on the class."""
    store = DataStore()
    cls.data_store = store
    cls.session = store.session

@classmethod
def tearDownClass(cls):
    """Close the class-level session after the tests in this class have run."""
    session = cls.session
    session.close()

def tearDown(self):
    """Issue a delete over the Location query and commit it after each test."""
    session = self.session
    session.query(Location).delete()
    session.commit()

def test_derive_location_name_no_trailing_slash_local(self):
    """
    Testing that a local path given without a trailing slash and without an explicit
    location name gets "local - <last directory>" as its location name.
    :return:
    """
    tracker = LocationTracker(
        location_path="/tmp/testing/test_dir", data_store=self.data_store
    )

    self.assertEqual("local - test_dir", tracker.location_name)

def test_derive_location_name_no_trailing_slash_s3(self):
    """
    Testing that an s3 path given without a trailing slash and without an explicit
    location name gets "s3 <bucket> - <last directory>" as its location name.
    :return:
    """
    tracker = LocationTracker(
        location_path="s3://tmp/testing/test_dir", data_store=self.data_store
    )

    self.assertEqual("s3 tmp - test_dir", tracker.location_name)

def test_derive_location_name_none(self):
"""
Testing that if no location name is provided, and it's not a location path, the last directory is set as the
location name.
:return:
"""
test_path = "/tmp/testing/test_dir"
test_path = "/tmp/testing/test_dir/"

expected_result = "test_dir"
expected_result = "local - test_dir"
given_result = LocationTracker(
location_path=test_path, data_store=self.data_store
).location_name
Expand All @@ -29,9 +70,9 @@ def test_derive_location_name_s3(self):
Testing that if no location name is provided, and it's an s3 location path, the s3 prefix is added.
:return:
"""
test_path = "s3://tmp/testing/test_dir"
test_path = "s3://tmp/testing/test_dir/"

expected_result = "s3 - test_dir"
expected_result = "s3 tmp - test_dir"
given_result = LocationTracker(
location_path=test_path, data_store=self.data_store
).location_name
Expand Down Expand Up @@ -121,3 +162,78 @@ def test_determine_location_bucket_name_local(self):
given_result = location.location.location_bucket_name

self.assertEqual(expected_result, given_result)

def test_determine_location_name_duplicate_name_s3(self):
    """
    Testing that when two different s3 paths produce the same location name,
    the second location's name has a number appended to keep it unique.
    :return:
    """
    # Built first so the derived name is already taken when the second tracker
    # is created (assumes constructing a tracker registers its location).
    first_tracker = LocationTracker(
        location_path="https://duplicate-test.s3.amazonaws.com/this/is/a/test/dir/file.txt",
        data_store=self.data_store,
    )

    second_tracker = LocationTracker(
        location_path="https://duplicate-test.s3.amazonaws.com/this/is/another/test/dir/file.txt",
        data_store=self.data_store,
    )

    self.assertEqual(
        "s3 duplicate-test - dir - 1", second_tracker.location.location_name
    )

def test_determine_location_name_duplicate_name_local(self):
    """
    Testing that when two different local paths produce the same location name,
    the second location's name has a number appended to keep it unique.
    :return:
    """
    # Built first so the derived name is already taken when the second tracker
    # is created (assumes constructing a tracker registers its location).
    first_tracker = LocationTracker(
        location_path="/tmp/duplicate_testing/test_dir/file.txt",
        data_store=self.data_store,
    )

    second_tracker = LocationTracker(
        location_path="/tmp/duplicate_testing_another/test_dir/file.txt",
        data_store=self.data_store,
    )

    self.assertEqual(
        "local - test_dir - 1", second_tracker.location.location_name
    )

def test_determine_location_name_file_not_part_s3(self):
    """
    Testing that when an s3 path ends with a filename, the filename is left out
    of the location name and the containing directory is used instead.
    :return:
    """
    tracker = LocationTracker(
        location_path="https://test-bucket.s3.amazonaws.com/this/is/a/test/dir/file.txt",
        data_store=self.data_store,
    )

    self.assertEqual("s3 test-bucket - dir", tracker.location.location_name)

def test_determine_location_name_file_not_part_local(self):
    """
    Testing that when a local path ends with a filename, the filename is left out
    of the location name and the containing directory is used instead.
    :return:
    """
    tracker = LocationTracker(
        location_path="/local/dir/path/text.txt", data_store=self.data_store
    )

    self.assertEqual("local - path", tracker.location.location_name)

0 comments on commit 002703c

Please sign in to comment.