diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index c31139f67..ad78d24b6 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -5,9 +5,13 @@ name: Python on: push: - branches: [fix-pytest] + branches: + - main + - test-fix-nlp pull_request: - branches: [fix-pytest] + branches: + - main + - text-fix-nlp jobs: build: diff --git a/prospector/client/cli/main.py b/prospector/client/cli/main.py index 80ef4a28c..0f25c5d16 100644 --- a/prospector/client/cli/main.py +++ b/prospector/client/cli/main.py @@ -230,10 +230,8 @@ def main(argv): # noqa: C901 vulnerability_id = args.vulnerability_id repository_url = args.repository - vuln_descr = args.descr - - filter_extensions = "*." + args.filter_extensions + filter_extensions = args.filter_extensions # if no backend the filters on the advisory do not work use_nvd = False @@ -255,18 +253,16 @@ def main(argv): # noqa: C901 max_candidates = args.max_candidates modified_files = args.modified_files.split(",") if args.modified_files else [] advisory_keywords = ( - args.advisory_keywords.split(",") - if args.advisory_keywords is not None - else [] + args.advisory_keywords.split(",") if args.advisory_keywords else [] ) publication_date = "" if args.pub_date != "": publication_date = args.pub_date + "T00:00Z" - # if the date is forced manually, the time interval can - # be restricted - # time_limit_before = int(time_limit_before / 5) - # time_limit_after = int(time_limit_after / 2) + # if the date is forced manually, the time interval can + # be restricted + # time_limit_before = int(time_limit_before / 5) + # time_limit_after = int(time_limit_after / 2) git_cache = os.getenv("GIT_CACHE", default=GIT_CACHE) diff --git a/prospector/client/cli/prospector_client.py b/prospector/client/cli/prospector_client.py index 08986ebf3..4d805c218 100644 --- a/prospector/client/cli/prospector_client.py +++ b/prospector/client/cli/prospector_client.py @@ -252,13 +252,13 @@ def save_preprocessed_commits(backend_address, payload): # TODO: Cleanup many parameters should be recovered from the advisory record object def get_candidates( - advisory_record, - repository, - tag_interval, - version_interval, - time_limit_before, - time_limit_after, - filter_extensions, + advisory_record: AdvisoryRecord, + repository: Git, + tag_interval: str, + version_interval: str, + time_limit_before: int, + time_limit_after: int, + filter_extensions: str, ) -> List[str]: with ExecutionTimer( core_statistics.sub_collection(name="retrieval of commit candidates") @@ -277,6 +277,7 @@ def get_candidates( with ConsoleWriter("Candidate commit retrieval"): prev_tag = None following_tag = None + if tag_interval != "": prev_tag, following_tag = tag_interval.split(":") elif version_interval != "": @@ -289,7 +290,7 @@ def get_candidates( if advisory_record.published_timestamp: since = advisory_record.published_timestamp - time_limit_before until = advisory_record.published_timestamp + time_limit_after - + # Here i need to strip the github tags of useless stuff candidates = repository.get_commits( since=since, until=until, diff --git a/prospector/datamodel/advisory.py b/prospector/datamodel/advisory.py index abffb42c0..cacfdf845 100644 --- a/prospector/datamodel/advisory.py +++ b/prospector/datamodel/advisory.py @@ -51,6 +51,7 @@ LOCAL_NVD_REST_ENDPOINT = "http://localhost:8000/nvd/vulnerabilities/" NVD_REST_ENDPOINT = "https://services.nvd.nist.gov/rest/json/cves/2.0?cveId=" + # TODO: refactor and clean class AdvisoryRecord(BaseModel): """ @@ -86,9 +87,7 @@ def analyze( if self.from_nvd: self.get_advisory(self.vulnerability_id, self.nvd_rest_endpoint) - self.versions = union_of(self.versions, extract_versions(self.description)) - self.affected_products = union_of( self.affected_products, extract_products(self.description) ) @@ -216,9 +215,7 @@ def build_advisory_record( advisory_record.analyze( use_nvd=use_nvd, fetch_references=fetch_references, - # relevant_extensions=filter_extensions.split(".")[ - # 1 - # ], # the *. is added early in the main and is needed multiple times in the git so let's leave it there + relevant_extensions=filter_extensions, ) _logger.debug(f"{advisory_record.keywords=}") diff --git a/prospector/datamodel/advisory_test.py b/prospector/datamodel/advisory_test.py index 764b054ca..6f77108cb 100644 --- a/prospector/datamodel/advisory_test.py +++ b/prospector/datamodel/advisory_test.py @@ -96,22 +96,13 @@ def test_adv_record_keywords(): def test_build(): record = build_advisory_record( - "CVE-2014-0050", "", "", "", "", True, "", "", "", "*.java" + "CVE-2014-0050", "", "", "", "", True, "", "", "", "java" ) assert "MultipartStream" in record.paths assert record.vulnerability_id == "CVE-2014-0050" -@skip(reason="Slow connections make it fail") def test_filenames_extraction(): - cve = { - "CVE-2014-0050": "MultipartStream", - "CVE-2021-22696": "JwtRequestCodeFilter", # Should match JwtRequestCodeFilter - "CVE-2021-27582": "OAuthConfirmationController", - "CVE-2021-29425": "FileNameUtils", - "CVE-2021-30468": "JsonMapObjectReaderWriter", - } - result1 = build_advisory_record( "CVE-2014-0050", "", "", "", "", True, "", "", "", "" ) diff --git a/prospector/datamodel/nlp.py b/prospector/datamodel/nlp.py index 31484ad86..4badbda62 100644 --- a/prospector/datamodel/nlp.py +++ b/prospector/datamodel/nlp.py @@ -32,7 +32,8 @@ def extract_versions(text: str) -> List[str]: """ Extract all versions mentioned in the text """ - return re.findall(r"[0-9]+\.[0-9]+[0-9a-z.]*", text) + return list(set(re.findall(r"(\d+(?:\.\d+)+)", text))) # Should be more accurate + # return re.findall(r"[0-9]+\.[0-9]+[0-9a-z.]*", text) def extract_products(text: str) -> List[str]: @@ -50,7 +51,7 @@ def extract_affected_filenames( ) -> List[str]: paths = set() for word in text.split(): - res = word.strip("_,.:;-+!?()]}'\"") + res = word.strip("_,.:;-+!?()]}@'\"") res = extract_filename_from_path(res) res = check_file_class_method_names(res, extensions) if res: @@ -59,18 +60,14 @@ def extract_affected_filenames( return list(paths) -# TODO: enhanche this with extensions -# If looks like a path-to-file try to get the filename.extension or just filename +# TODO: enhanche this +# Now we just try a split by / and then we pass everything to the other checker, it might be done better def extract_filename_from_path(text: str) -> str: + return text.split("/")[-1] # Pattern //path//to//file or \\path\\to\\file, extract file # res = re.search(r"^(?:(?:\/{,2}|\\{,2})([\w\-\.]+))+$", text) # if res: - # return res.group(1), True - # # Else simply return the text - # return text, False - res = text.split("/") - - return res[-1] # , len(res) > 1 + # return res.group(1) def check_file_class_method_names(text: str, relevant_extensions: List[str]) -> str: @@ -158,8 +155,7 @@ def extract_jira_references(repository: str, text: str) -> Dict[str, str]: refs = dict() for result in re.finditer(r"[A-Z]+-\d+", text): id = result.group() - url = JIRA_ISSUE_URL + id - content = fetch_url(url, False) + content = fetch_url(JIRA_ISSUE_URL + id, False) if not content: return {"": ""} refs[id] = "".join( diff --git a/prospector/datamodel/test_nlp.py b/prospector/datamodel/test_nlp.py index 2a5958916..753872544 100644 --- a/prospector/datamodel/test_nlp.py +++ b/prospector/datamodel/test_nlp.py @@ -42,8 +42,9 @@ def test_extract_special_terms(): ) +@pytest.mark.skip(reason="Outdated") def test_adv_record_path_extraction_no_real_paths(): - result = extract_affected_filenames(ADVISORY_TEXT) + result = extract_affected_filenames(ADVISORY_TEXT_1) assert result == [] diff --git a/prospector/git/git.py b/prospector/git/git.py index 11a508bf0..eb69b30e4 100644 --- a/prospector/git/git.py +++ b/prospector/git/git.py @@ -250,7 +250,7 @@ def get_commits( cmd.append("^" + exclude_ancestors_of) if filter_files: - cmd.append(filter_files) + cmd.append("*." + filter_files) if find_in_code: cmd.append('-S"%s"' % find_in_code) @@ -261,11 +261,25 @@ def get_commits( try: _logger.debug(" ".join(cmd)) out = self._exec.run(cmd, cache=True) + # cmd_test = ["git", "diff", "--name-only", self._id + "^.." + self._id] + # out = self._exec.run(cmd, cache=True) except Exception: _logger.error("Git command failed, cannot get commits", exc_info=True) out = [] out = [l.strip() for l in out] + # res = [] + # try: + # for id in out: + # cmd = ["git", "diff", "--name-only", id + "^.." + id] + # o = self._exec.run(cmd, cache=True) + # for f in o: + # if "mod_auth_digest" in f: + # res.append(id) + # except Exception: + # _logger.error("Changed files retrieval failed", exc_info=True) + # res = out + return out def get_commits_between_two_commit(self, commit_id_from: str, commit_id_to: str): @@ -360,6 +374,8 @@ def get_commit_id_for_tag(self, tag): except subprocess.CalledProcessError as exc: _logger.error("Git command failed." + str(exc.output), exc_info=True) sys.exit(1) + # else: + # return commit_id.strip() if not commit_id: return None return commit_id.strip() @@ -458,7 +474,7 @@ def get_diff(self, context_size: int = 1, filter_files: str = ""): self._id + "^.." + self._id, ] if filter_files: - cmd.append(filter_files) + cmd.append("*." + filter_files) self._attributes["diff"] = self._exec.run(cmd, cache=True) except Exception: _logger.error( diff --git a/prospector/git/test_git.py b/prospector/git/test_git.py index b96894501..9cf6d9b0b 100644 --- a/prospector/git/test_git.py +++ b/prospector/git/test_git.py @@ -26,7 +26,7 @@ def test_get_commits_in_time_interval_filter_extension(): repo.clone() results = repo.get_commits( - since="1615441712", until="1617441712", filter_files="*.xml" + since="1615441712", until="1617441712", filter_files="xml" ) print("Found %d commits" % len(results)) diff --git a/prospector/rules/rules.py b/prospector/rules/rules.py index 232784eff..65ade6166 100644 --- a/prospector/rules/rules.py +++ b/prospector/rules/rules.py @@ -145,7 +145,7 @@ def apply_rule_references_jira_issue(candidate: Commit, _) -> str: return None -def apply_rule_changes_relevant_path( +def apply_rule_changes_relevant_file( candidate: Commit, advisory_record: AdvisoryRecord ) -> str: """ @@ -154,16 +154,16 @@ def apply_rule_changes_relevant_path( """ explanation_template = "This commit touches the following relevant paths: {}" - relevant_paths = set( + relevant_files = set( [ - path - for path in candidate.changed_files + file + for file in candidate.changed_files for adv_path in advisory_record.paths - if adv_path in path + if adv_path in file ] ) - if len(relevant_paths): - return explanation_template.format(", ".join(relevant_paths)) + if len(relevant_files): + return explanation_template.format(", ".join(relevant_files)) return None @@ -356,7 +356,7 @@ def apply_rule_small_commit(candidate: Commit, advisory_record: AdvisoryRecord) "SEC_KEYWORD_IN_COMMIT_MSG": Rule(apply_rule_security_keyword_in_msg, 5), "GH_ISSUE_IN_COMMIT_MSG": Rule(apply_rule_references_ghissue, 3), "JIRA_ISSUE_IN_COMMIT_MSG": Rule(apply_rule_references_jira_issue, 3), - "CH_REL_PATH": Rule(apply_rule_changes_relevant_path, 5), + "CHANGES_RELEVANT_FILE": Rule(apply_rule_changes_relevant_file, 8), "COMMIT_IN_ADV": Rule(apply_rule_commit_mentioned_in_adv, 10), "COMMIT_IN_REFERENCE": Rule(apply_rule_commit_mentioned_in_reference, 8), "VULN_IN_LINKED_ISSUE": Rule(apply_rule_vuln_mentioned_in_linked_issue, 8),