Skip to content

Commit

Permalink
reworked paths and files extraction from advisory text
Browse files Browse the repository at this point in the history
  • Loading branch information
sacca97 authored and copernico committed Oct 7, 2022
1 parent 99cf71b commit fd5e3bc
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 35 deletions.
7 changes: 5 additions & 2 deletions prospector/datamodel/advisory.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from util.http import fetch_url

from .nlp import (
extract_path_tokens,
extract_affected_files_paths,
extract_products,
extract_special_terms,
extract_versions,
Expand Down Expand Up @@ -91,7 +91,10 @@ def analyze(self, use_nvd: bool = False, fetch_references=False):
self.affected_products, extract_products(self.description)
)

self.paths = union_of(self.paths, extract_path_tokens(self.description))
# TODO: if an exact file is found when applying the rules, the relevance must be updated, I think
self.paths = union_of(
self.paths, extract_affected_files_paths(self.description)
)
self.keywords = union_of(self.keywords, extract_special_terms(self.description))
_logger.debug("References: " + str(self.references))
self.references = [
Expand Down
3 changes: 0 additions & 3 deletions prospector/datamodel/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@ def __lt__(self, other) -> bool:
def __eq__(self, other) -> bool:
return self.relevance == other.relevance

def update_relevance(self, relevance: int):
self.relevance += relevance

# def format(self):
# out = "Commit: {} {}".format(self.repository.get_url(), self.commit_id)
# out += "\nhunk_count: %d diff_size: %d" % (self.hunk_count, len(self.diff))
Expand Down
82 changes: 54 additions & 28 deletions prospector/datamodel/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,39 +45,65 @@ def extract_products(text: str) -> List[str]:
return [p for p in result if len(p) > 2]


def extract_path_tokens(text: str, strict_extensions: bool = False) -> List[str]:
"""
Used to look for paths in the text (i.e. vulnerability description)
Input:
text (str)
strict_extensions (bool): this function will always extract tokens with (back) slashes,
but it will only match single file names if they have the correct extension, if this argument is True
Returns:
list: a list of paths that are found
"""
tokens = re.split(r"\s+", text) # split the text into words
tokens = [
token.strip(",.:;-+!?)]}'\"") for token in tokens
def extract_affected_files_paths(text: str, strict_extensions: bool = False):
words = text.split()
words = [
word.strip("_,.:;-+!?)]}'\"") for word in words
] # removing common punctuation marks
paths = []
for token in tokens:
contains_path_separators = ("\\" in token) or ("/" in token)
separated_with_period = "." in token
has_relevant_extension = token.split(".")[-1] in RELEVANT_EXTENSIONS
is_xml_tag = token.startswith("<")
is_property = token.endswith("=")

is_path = contains_path_separators or (
has_relevant_extension if strict_extensions else separated_with_period
)
probably_not_path = is_xml_tag or is_property
if is_path and not probably_not_path:
paths.append(token)
for word in words:
is_xml_tag = word.startswith("<")
is_property = word.endswith("=")
is_unusual = check_unusual_stuff(word)
not_relevant = is_xml_tag or is_property or is_unusual

if not_relevant:
continue

if check_if_path(word):
paths.append(word)

if check_if_file(word):
paths.append(word.split(".")[0].split("::")[0])

return paths


def check_unusual_stuff(text: str) -> bool:
    """Return True when ``text`` contains a double quote or a comma."""
    return any(marker in text for marker in ('"', ","))


def check_if_path(text: str) -> bool:
    """Return True when ``text`` contains a forward slash or a backslash."""
    return any(separator in text for separator in ("/", "\\"))


# TODO: look if there are others
def check_if_file(text: str) -> bool:
    """Heuristically decide whether ``text`` looks like a file or code identifier.

    The word is split on ``"."``; when it contains no period it is split on
    ``"::"`` instead. The first and last resulting pieces are then checked
    against the rules below, and the first rule that applies wins.

    Args:
        text: A single word to classify.

    Returns:
        True if any of these holds, False otherwise:
        - the last piece is a member of ``RELEVANT_EXTENSIONS``;
        - the first or last piece contains an underscore;
        - ``text`` contains ``"."`` or ``"::"`` and the first piece is purely
          alphabetic (e.g. ``Class.method``, ``Class::method``);
        - the first or last piece, ignoring its first character, contains
          both an ASCII lowercase and an ASCII uppercase letter.
    """
    pieces = text.split(".")
    if len(pieces) == 1:
        # No period in the word: fall back to the "::" scope separator.
        pieces = pieces[0].split("::")

    first, last = pieces[0], pieces[-1]

    # Check if there is an extension.
    # NOTE(review): RELEVANT_EXTENSIONS is not defined in this block; it is
    # presumably a module-level collection of extension strings - confirm.
    if last in RELEVANT_EXTENSIONS:
        return True

    # Common name pattern for files or methods with underscores
    if "_" in first or "_" in last:
        return True

    # Common ways to refer to methods inside a class (Class.method, Class::method)
    if ("." in text or "::" in text) and first.isalpha():
        return True

    # Common name for files or methods with an uppercase letter in the middle.
    # The first character is skipped so a merely capitalised word ("Foo")
    # does not count as mixed case.
    mixed_case = r"(?=.*[a-z])(?=.*[A-Z])"
    return any(
        re.match(mixed_case, piece[1:]) is not None for piece in (first, last)
    )


def extract_ghissue_references(repository: str, text: str) -> Dict[str, str]:
"""
Extract identifiers that look like references to GH issues, then extract their content
Expand Down
3 changes: 1 addition & 2 deletions prospector/rules/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
"insecur",
"inject",
"unsafe",
"patch",
"remote execution",
"malicious",
"cwe-",
" rce ", # standalone RCE is a thing
]

Expand Down Expand Up @@ -162,7 +162,6 @@ def apply_rule_changes_relevant_path(
if adv_path in path
]
)

if len(relevant_paths):
return explanation_template.format(", ".join(relevant_paths))

Expand Down

0 comments on commit fd5e3bc

Please sign in to comment.