Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

data file override support with precedence #916

Merged
merged 14 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions garak/analyze/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing import Union


from garak import _config
from garak.data import path as data_path

MINIMUM_STD_DEV = (
0.01732 # stddev=0 gives unusable z-scores; give it an arbitrary floor of 3^.5 %
Expand Down Expand Up @@ -132,7 +132,7 @@ def defcon_and_comment(
return zscore_defcon, zscore_comment

def _build_path(self, filename):
    """Resolve `filename` inside the calibration data directory.

    Uses the precedence-aware data path, so a user-supplied calibration
    file overrides the one shipped with the package.
    """
    # The pre-refactor `_config.transient.package_dir / "resources"` lookup
    # is superseded by data_path; keeping both returns left dead code.
    return data_path / "calibration" / filename

def __init__(self, calibration_path: Union[None, str, pathlib.Path] = None) -> None:

Expand Down
7 changes: 2 additions & 5 deletions garak/analyze/misp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,9 @@
import os

from garak import _plugins
import garak._config
from garak.data import path as data_path

# does this utility really have access to _config?
misp_resource_file = (
garak._config.transient.package_dir / "resources" / "misp_descriptions.tsv"
)
misp_resource_file = data_path / "misp_descriptions.tsv"
misp_descriptions = {}
if os.path.isfile(misp_resource_file):
with open(misp_resource_file, "r", encoding="utf-8") as f:
Expand Down
6 changes: 3 additions & 3 deletions garak/analyze/report_digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
import sqlite3

from garak import _config
from garak.data import path as data_path
import garak.analyze.calibration


if not _config.loaded:
_config.load_config()

Expand All @@ -33,9 +35,7 @@
about_z_template = templateEnv.get_template("digest_about_z.jinja")


misp_resource_file = (
_config.transient.package_dir / "resources" / "misp_descriptions.tsv"
)
misp_resource_file = data_path / "misp_descriptions.tsv"
misp_descriptions = {}
if os.path.isfile(misp_resource_file):
with open(misp_resource_file, "r", encoding="utf-8") as f:
Expand Down
100 changes: 100 additions & 0 deletions garak/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Local read only resources found by precedence matching supported paths

Ideal usage:

```
file_path = resources / "filename"
with open(file_path) as f:
f.read()
```

Resources that do not have a `shipped` version should wrap path access in a try block:
```
try:
file_path = resources / "filename"
except GarakException as e:
logging.warn("No resource file found.", exc_info=e)
```
"""

import pathlib

from garak import _config
from garak.exception import GarakException


class LocalDataPath(type(pathlib.Path())):
    """Restricted Path object usable only for existing resource files.

    Relative data paths are resolved against each directory in
    ORDERED_SEARCH_PATHS in order, and the first candidate that exists on
    disk wins; user data (data_dir) therefore overrides files shipped in
    the package (package_dir).
    """

    # Highest precedence first: the user's data dir overrides shipped data.
    # NOTE(review): these values are captured from _config.transient at class
    # definition time; later configuration changes are not picked up.
    ORDERED_SEARCH_PATHS = [
        _config.transient.data_dir / "data",
        _config.transient.package_dir / "data",
    ]

    def _determine_suffix(self):
        """Return this path relative to the search root that contains it,
        or None when the path lies outside every search root."""
        for root in self.ORDERED_SEARCH_PATHS:
            if root == self or root in self.parents:
                return self.relative_to(root)
        return None

    def _eval_paths(self, segment, next_call, relative):
        """Project `segment` onto each search root in precedence order and
        return the first existing candidate as a LocalDataPath.

        Raises GarakException when the request would escape the search
        roots, when this path is outside all roots, or when no candidate
        exists on disk.
        """
        # refuse to step above a search root via ".."
        if self in self.ORDERED_SEARCH_PATHS and segment == relative:
            raise GarakException(
                "The requested resource does not refer to a valid path"
            )

        prefix_removed = self._determine_suffix()
        if prefix_removed is None:
            # instantiated from a path not under any ORDERED_SEARCH_PATHS entry
            raise GarakException(
                f"The requested resource does not refer to a valid path: {self}"
            )
        for root in self.ORDERED_SEARCH_PATHS:
            if segment == relative:
                projected = (root / prefix_removed).parent
            else:
                candidate = root / prefix_removed
                projected = getattr(candidate, next_call)(segment)
            if projected.exists():
                return LocalDataPath(projected)

        raise GarakException(f"The resource requested does not exist {segment}")

    def _glob(self, pattern, recursive=False):
        """Gather files matching `pattern` from every search root,
        deduplicated by root-relative path; earlier (higher-precedence)
        roots win on collisions."""
        glob_method = "rglob" if recursive else "glob"

        prefix_removed = self._determine_suffix()
        if prefix_removed is None:
            # previously this fell through to `root / None` -> TypeError;
            # raise the package exception type instead, as _eval_paths does
            raise GarakException(
                f"The requested resource does not refer to a valid path: {self}"
            )
        candidate_iters = []
        for root in self.ORDERED_SEARCH_PATHS:
            candidate_iters.append(getattr(root / prefix_removed, glob_method)(pattern))

        seen_relative = []
        selected_files = []
        for files in candidate_iters:
            for file in files:
                suffix = LocalDataPath(file)._determine_suffix()
                if suffix not in seen_relative:
                    selected_files.append(file)
                    seen_relative.append(suffix)

        return selected_files

    def glob(self, pattern):
        return self._glob(pattern, recursive=False)

    def rglob(self, pattern):
        return self._glob(pattern, recursive=True)

    def _make_child(self, segment):
        # pathlib internal hook backing the "/" operator (python < 3.12);
        # `segment` arrives as a tuple of path parts -- TODO confirm this
        # hook still exists on the targeted python versions
        return self._eval_paths(segment, "_make_child", ("..",))

    def joinpath(self, *pathsegments):
        # BUG FIX: chain each segment onto the previously projected path.
        # The original recomputed from `self` every iteration, so only the
        # final segment was applied for multi-segment calls, and an empty
        # call raised NameError on the undefined loop variable.
        projected = self
        for segment in pathsegments:
            projected = projected._eval_paths(segment, "joinpath", "..")
        return projected


path = LocalDataPath(_config.transient.data_dir / "data")
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
7 changes: 4 additions & 3 deletions garak/detectors/riskywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
import csv

from garak import _config
from garak.data import path as data_path
from garak.detectors.base import StringDetector

surge_list = defaultdict(list)
with open(
_config.transient.package_dir / "resources" / "profanity_en.csv",
data_path / "profanity_en.csv",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably worth looking at as part of a larger refactor -- do we want all these datasets in our git? Or should we have them in some other place e.g. HuggingFace hub, and the garak.data module can also manage downloading these files?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have discussed in the past that we may want tooling that will download all datasets to create an offline deployment capability.

I suspect there could be some expansion on garak.data for handling access to known datasets. Treating HF as the specific location that tooling can register data as available from might be a good direction.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sometimes HF drops connections and things go wrong, so I prefer keeping smaller things closer. What that cutoff is, I don't know — and it does also mean that garak will grow bigger over time.

Agree some expansion could work, perhaps using HF by default with a backup URI also (maybe a garak-data repo)

"r",
encoding="utf-8",
) as csvfile:
Expand All @@ -26,7 +27,7 @@
MIN_OFCOM_STRENGTH = 2
ofcom_list = defaultdict(list)
with open(
_config.transient.package_dir / "resources" / "ofcom-potentially-offensive.txt",
data_path / "ofcom-potentially-offensive.txt",
"r",
encoding="utf-8",
) as csvfile:
Expand Down Expand Up @@ -285,7 +286,7 @@ class LDNOOBW(StringDetector):
def __init__(self, config_root=_config):
substrings = []
with open(
_config.transient.package_dir / "resources" / "ldnoobw-en.txt",
data_path / "ldnoobw-en.txt",
"r",
encoding="utf-8",
) as wordlist:
Expand Down
3 changes: 2 additions & 1 deletion garak/detectors/specialwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from garak import _config
from garak.detectors.base import StringDetector
from garak.data import path as data_path


class SlursReclaimedSlurs(StringDetector):
Expand All @@ -25,7 +26,7 @@ class SlursReclaimedSlurs(StringDetector):
def __init__(self, config_root=_config):
self.substrings = set([])
with open(
_config.transient.package_dir / "resources" / "slurprompts.jsonl",
data_path / "slurprompts.jsonl",
"r",
encoding="utf-8",
) as slurfile:
Expand Down
5 changes: 2 additions & 3 deletions garak/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import garak.harnesses.probewise
from garak import _config
from garak.evaluators import ThresholdEvaluator
from garak.data import path as data_path
from garak._plugins import enumerate_plugins

logger = getLogger(__name__)
Expand Down Expand Up @@ -232,9 +233,7 @@ def settings_ns_provider(self) -> argparse.Namespace:

def banner():
    """Display a random banner from the data path's `banners` file."""
    # the superseded `_config.transient.package_dir / "resources"` open()
    # header is removed; the data path resolves the shipped file with
    # user-dir precedence
    with open(data_path / "banners", "r", encoding="utf-8") as bannerfile:
        banners = bannerfile.read().lstrip().split("\n\n")
    print(Fore.LIGHTRED_EX + f"\n{random.choice(banners)}\n" + Fore.RESET)

Expand Down
34 changes: 10 additions & 24 deletions garak/payloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import garak._config
import garak.exception
from garak.data import path as data_path


PAYLOAD_SCHEMA = {
Expand All @@ -35,10 +36,7 @@
],
}

PAYLOAD_SEARCH_DIRS = [
garak._config.transient.data_dir / "resources" / "payloads",
garak._config.transient.package_dir / "resources" / "payloads",
]
PAYLOAD_DIR = data_path / "payloads"


def _validate_payload(payload_json):
Expand All @@ -52,17 +50,9 @@ def _validate_payload(payload_json):
def load_payload(
    name: str, path: Union[str, pathlib.Path, None] = None
) -> PayloadGroup:
    """Load a payload group by name.

    :param name: payload group name, e.g. ``"keyedprod_win10"``
    :param path: optional explicit path to the payload JSON; when None the
        file ``<name>.json`` is resolved via the data path, which gives the
        user data dir precedence over files shipped in the package
    """
    if path is None:
        # data path access raises GarakException when no candidate file
        # exists in any search location
        path = PAYLOAD_DIR / f"{name}.json"
    return PayloadGroup(name, path)


class PayloadGroup:
Expand Down Expand Up @@ -155,7 +145,7 @@ def _scan_payload_dir(self, dir) -> dict:
payloads, return name:path dict. optionally filter by type prefixes"""

payloads_found = {}
dir = pathlib.Path(dir)
dir = dir
if not dir.is_dir():
return {}

Expand All @@ -182,11 +172,7 @@ def _scan_payload_dir(self, dir) -> dict:
def _refresh_payloads(self) -> None:
    """Scan the payload data directory for payload objects and refresh
    self.payload_list.

    Precedence between the user data dir and the shipped package data is
    handled inside the data path's glob, so the old reverse-order
    dict-merge over multiple search dirs is no longer needed.
    """
    self.payload_list = self._scan_payload_dir(PAYLOAD_DIR)

def search(
self, types: Union[List[str], None] = None, include_children=True
Expand Down Expand Up @@ -222,10 +208,10 @@ def load(self, name) -> PayloadGroup:
logging.error(msg, exc_info=ke)
raise garak.exception.PayloadFailure(msg) from ke

except FileNotFoundError as fnfe:
except garak.exception.GarakException as ge:
msg = f"Requested payload {name} not found at expected path {path}"
logging.error(msg, exc_info=fnfe)
raise garak.exception.PayloadFailure(msg) from fnfe
logging.error(msg, exc_info=ge)
raise garak.exception.PayloadFailure(msg) from ge

return p

Expand Down
3 changes: 2 additions & 1 deletion garak/probes/continuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from garak import _config
from garak.attempt import Attempt
from garak.data import path as data_path
from garak.probes.base import Probe


Expand Down Expand Up @@ -43,7 +44,7 @@ def __init__(self, config_root=_config):
# append these to each of the prefixes for that term
self._slur_prefixes = []
with open(
_config.transient.package_dir / "resources" / self.source_resource_filename,
data_path / self.source_resource_filename,
"r",
encoding="utf-8",
) as slurfile:
Expand Down
11 changes: 3 additions & 8 deletions garak/probes/dan.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import garak.attempt
from garak import _config
from garak.data import path as data_path
from garak.probes.base import Probe


Expand Down Expand Up @@ -537,11 +538,7 @@ class AutoDANCached(Probe):

def __init__(
self,
autodan_prompt_location: str = _config.transient.package_dir
/ "resources"
/ "autodan"
/ "data"
/ "autodan_prompts.txt",
autodan_prompt_location: str = data_path / "autodan" / "autodan_prompts.txt",
config_root=_config,
):
self.autodan_prompt_location = autodan_prompt_location
Expand Down Expand Up @@ -645,9 +642,7 @@ class DanInTheWild(Probe):
def __init__(self, config_root=_config):
super().__init__(config_root=config_root)

inthewild_path = str(
_config.transient.package_dir / "resources" / "inthewild_jailbreak_llms.txt"
)
inthewild_path = str(data_path / "inthewild_jailbreak_llms.txt")

with open(inthewild_path, "r", encoding="utf-8") as f:
self.prompts = f.readlines()
Expand Down
6 changes: 2 additions & 4 deletions garak/probes/donotanswer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import sys

from garak import _config
from garak.data import path as data_path
from garak.probes.base import Probe

this = sys.modules[__name__]
Expand All @@ -19,10 +20,7 @@
def load_local_data(self):
    """Load this probe's prompts from the donotanswer category file."""
    # prompts
    with open(
        data_path / "donotanswer" / f"{self.dna_category}.txt",
        # BUG FIX: "utf=8" is not a valid codec name and raises LookupError
        # at open() time; the intended encoding is "utf-8"
        encoding="utf-8",
    ) as f:
        self.prompts = f.read().strip().split("\n")
Expand Down
3 changes: 2 additions & 1 deletion garak/probes/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import json

from garak import _config
from garak.data import path as data_path
from garak.probes.base import Probe

payload_library = {}
Expand Down Expand Up @@ -58,7 +59,7 @@ def _load_payloads():

payload_library["slur_terms"] = []
with open(
_config.transient.package_dir / "resources" / "slurprompts_mini.jsonl",
data_path / "slurprompts_mini.jsonl",
"r",
encoding="utf-8",
) as slurfile:
Expand Down
Loading
Loading