-
Notifications
You must be signed in to change notification settings - Fork 260
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
data file override support with precedence #916
Changes from all commits
aa48895
724a81e
8890c94
4a73965
bd773dc
3405489
47a2f3b
cb54c27
184b5b3
2513ed9
a10f820
f03d8d6
958ea3c
93e8d62
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
"""Local read-only resources, located by searching the supported paths in order of precedence | ||
|
||
Ideal usage: | ||
|
||
``` | ||
file_path = resources / "filename" | ||
with open(file_path) as f: | ||
f.read() | ||
``` | ||
|
||
Resources that do not have a `shipped` version should wrap path access in a try block: | ||
``` | ||
try: | ||
file_path = resources / "filename" | ||
except GarakException as e: | ||
logging.warning("No resource file found.", exc_info=e) | ||
``` | ||
""" | ||
|
||
import pathlib | ||
|
||
from garak import _config | ||
from garak.exception import GarakException | ||
|
||
|
||
class LocalDataPath(type(pathlib.Path())):
    """Restricted Path object usable only for existing resource files.

    Every navigation step (child access, ``joinpath``, ``glob``/``rglob``) is
    re-evaluated against each entry of ``ORDERED_SEARCH_PATHS`` in order, and
    the first location that exists on disk wins. Requests that resolve to
    nothing, or that would step above a search root, raise ``GarakException``.
    """

    # Search roots in precedence order: the ``data`` directory under the
    # configured data_dir is consulted before the one under package_dir.
    ORDERED_SEARCH_PATHS = [
        _config.transient.data_dir / "data",
        _config.transient.package_dir / "data",
    ]

    def _determine_suffix(self):
        """Return this path relative to the search root that contains it.

        Returns ``None`` when the path is not equal to, or located under, any
        entry of ``ORDERED_SEARCH_PATHS``.
        """
        for path in self.ORDERED_SEARCH_PATHS:
            if path == self or path in self.parents:
                return self.relative_to(path)
        return None

    def _eval_paths(self, segment, next_call, relative):
        """Resolve ``segment`` against every search root and return the first hit.

        Args:
            segment: the value forwarded to ``next_call`` on each candidate path.
            next_call: name of the path method used to apply ``segment``
                (the callers in this class pass "_make_child" or "joinpath").
            relative: the value of ``segment`` that denotes a move to the
                parent directory, in the form the caller receives it.

        Returns:
            A ``LocalDataPath`` for the first projected location that exists.

        Raises:
            GarakException: if the request would leave a search root, if this
                path is not under any search root, or if no candidate exists.
        """
        # A parent request issued from a search root would escape the sandbox.
        if self in self.ORDERED_SEARCH_PATHS and segment == relative:
            raise GarakException(
                "The requested resource does not refer to a valid path"
            )

        prefix_removed = self._determine_suffix()
        if prefix_removed is None:
            # if LocalDataPath is instantiated using a path not in ORDERED_SEARCH_PATHS
            raise GarakException(
                f"The requested resource does not refer to a valid path: {self}"
            )

        for path in self.ORDERED_SEARCH_PATHS:
            current_path = path / prefix_removed
            if segment == relative:
                projected = current_path.parent
            else:
                projected = getattr(current_path, next_call)(segment)
            if projected.exists():
                return LocalDataPath(projected)

        raise GarakException(f"The resource requested does not exist {segment}")

    def _glob(self, pattern, recursive=False):
        """Glob ``pattern`` under every search root, keeping the first match per relative path.

        Args:
            pattern: glob pattern forwarded to ``glob``/``rglob``.
            recursive: use ``rglob`` instead of ``glob`` when true.

        Returns:
            A list of matched paths. When the same root-relative path is found
            under several search roots, only the one from the earliest root in
            ``ORDERED_SEARCH_PATHS`` is kept.

        Raises:
            GarakException: if this path is not under any search root.
        """
        glob_method = "rglob" if recursive else "glob"

        prefix_removed = self._determine_suffix()
        if prefix_removed is None:
            # Same failure mode as _eval_paths; without this guard the join
            # below fails with an unexplained TypeError.
            raise GarakException(
                f"The requested resource does not refer to a valid path: {self}"
            )

        seen_suffixes = set()
        selected_files = []
        for path in self.ORDERED_SEARCH_PATHS:
            candidate_path = path / prefix_removed
            for file in getattr(candidate_path, glob_method)(pattern):
                suffix = LocalDataPath(file)._determine_suffix()
                if suffix not in seen_suffixes:
                    seen_suffixes.add(suffix)
                    selected_files.append(file)

        return selected_files

    def glob(self, pattern):
        """Non-recursive glob across all search roots; see ``_glob``."""
        return self._glob(pattern, recursive=False)

    def rglob(self, pattern):
        """Recursive glob across all search roots; see ``_glob``."""
        return self._glob(pattern, recursive=True)

    def _make_child(self, segment):
        """Resolve a child path through the search roots via ``_eval_paths``."""
        # Here the parent marker is compared as a one-element tuple.
        return self._eval_paths(segment, "_make_child", ("..",))

    def joinpath(self, *pathsegments):
        """Resolve each segment in turn through the search roots.

        Each segment is applied to the result of the previous one, so
        ``joinpath("a", "b")`` resolves ``a`` and then ``b`` beneath it.
        With no segments the path itself is returned.

        Raises:
            GarakException: if any intermediate or final location does not exist.
        """
        projected = self
        for segment in pathsegments:
            projected = projected._eval_paths(segment, "joinpath", "..")
        return projected
|
||
|
||
# Module-level resource root: a LocalDataPath anchored at the first entry of
# LocalDataPath.ORDERED_SEARCH_PATHS (the "data" directory under data_dir).
path = LocalDataPath(_config.transient.data_dir / "data") |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,11 +7,12 @@ | |
import csv | ||
|
||
from garak import _config | ||
from garak.data import path as data_path | ||
from garak.detectors.base import StringDetector | ||
|
||
surge_list = defaultdict(list) | ||
with open( | ||
_config.transient.package_dir / "resources" / "profanity_en.csv", | ||
data_path / "profanity_en.csv", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably worth looking at as part of a larger refactor -- do we want all these datasets in our git? Or should we have them in some other place e.g. HuggingFace hub, and the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have discussed in the past that we may want tooling that will download all datasets to create an I suspect there could be some expansion on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sometimes HF drops connections and things go wrong, so I prefer keeping smaller things closer. What that cutoff is, I don't know - and it does also mean that garak will grow bigger over time. Agree some expansion could work, perhaps using HF by default with a backup URI also (maybe a |
||
"r", | ||
encoding="utf-8", | ||
) as csvfile: | ||
|
@@ -26,7 +27,7 @@ | |
MIN_OFCOM_STRENGTH = 2 | ||
ofcom_list = defaultdict(list) | ||
with open( | ||
_config.transient.package_dir / "resources" / "ofcom-potentially-offensive.txt", | ||
data_path / "ofcom-potentially-offensive.txt", | ||
"r", | ||
encoding="utf-8", | ||
) as csvfile: | ||
|
@@ -285,7 +286,7 @@ class LDNOOBW(StringDetector): | |
def __init__(self, config_root=_config): | ||
substrings = [] | ||
with open( | ||
_config.transient.package_dir / "resources" / "ldnoobw-en.txt", | ||
data_path / "ldnoobw-en.txt", | ||
"r", | ||
encoding="utf-8", | ||
) as wordlist: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Realising now that these aren't particularly transient pieces of information, so they might go elsewhere in config. One advantage of this PR is that if this is worth updating one day, the update can be centralised in `data` and made in far fewer places.