Skip to content

Commit

Permalink
data file override support with precedence (#916)
Browse files Browse the repository at this point in the history
* Provides a custom path accessor for project `data` files to allow
users to override project-provided datasets using
`$XDG_DATA_DIR/garak/data` to mirror `garak/data` from the package.
* Separates non-code data or dataset files from source code
* Downloaded datasets are considered `cached files` and do not change
location or behavior in the revision
* Removes argument parser no longer supported by `gcg`
  • Loading branch information
jmartin-tech committed Sep 24, 2024
2 parents b60bd9d + 93e8d62 commit 29c5b9a
Show file tree
Hide file tree
Showing 78 changed files with 326 additions and 165 deletions.
4 changes: 2 additions & 2 deletions garak/analyze/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing import Union


from garak import _config
from garak.data import path as data_path

MINIMUM_STD_DEV = (
0.01732 # stddev=0 gives unusable z-scores; give it an arbitrary floor of 3^.5 %
Expand Down Expand Up @@ -132,7 +132,7 @@ def defcon_and_comment(
return zscore_defcon, zscore_comment

def _build_path(self, filename):
return _config.transient.package_dir / "resources" / "calibration" / filename
return data_path / "calibration" / filename

def __init__(self, calibration_path: Union[None, str, pathlib.Path] = None) -> None:

Expand Down
7 changes: 2 additions & 5 deletions garak/analyze/misp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,9 @@
import os

from garak import _plugins
import garak._config
from garak.data import path as data_path

# does this utility really have access to _config?
misp_resource_file = (
garak._config.transient.package_dir / "resources" / "misp_descriptions.tsv"
)
misp_resource_file = data_path / "misp_descriptions.tsv"
misp_descriptions = {}
if os.path.isfile(misp_resource_file):
with open(misp_resource_file, "r", encoding="utf-8") as f:
Expand Down
6 changes: 3 additions & 3 deletions garak/analyze/report_digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
import sqlite3

from garak import _config
from garak.data import path as data_path
import garak.analyze.calibration


if not _config.loaded:
_config.load_config()

Expand All @@ -33,9 +35,7 @@
about_z_template = templateEnv.get_template("digest_about_z.jinja")


misp_resource_file = (
_config.transient.package_dir / "resources" / "misp_descriptions.tsv"
)
misp_resource_file = data_path / "misp_descriptions.tsv"
misp_descriptions = {}
if os.path.isfile(misp_resource_file):
with open(misp_resource_file, "r", encoding="utf-8") as f:
Expand Down
100 changes: 100 additions & 0 deletions garak/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Local read-only resources found by precedence matching supported paths.

Ideal usage:

```
from garak.data import path as data_path

file_path = data_path / "filename"
with open(file_path) as f:
    f.read()
```

Resources that do not have a `shipped` version should wrap path access in a try block:

```
try:
    file_path = data_path / "filename"
except GarakException as e:
    logging.warning("No resource file found.", exc_info=e)
```
"""

import pathlib

from garak import _config
from garak.exception import GarakException


class LocalDataPath(type(pathlib.Path())):
    """Restricted Path object usable only for existing resource files.

    Path operations are resolved against every entry of
    ``ORDERED_SEARCH_PATHS`` in order; the first candidate that exists on
    disk is returned. Requests that resolve to nothing, or that would step
    above a search root, raise ``GarakException``.
    """

    # Search roots in precedence order: the first root holding a match wins.
    ORDERED_SEARCH_PATHS = [
        _config.transient.data_dir / "data",
        _config.transient.package_dir / "data",
    ]

    def _determine_suffix(self):
        """Return this path relative to the search root containing it.

        Returns ``None`` when the path is not equal to, or located under,
        any entry of ``ORDERED_SEARCH_PATHS``.
        """
        for path in self.ORDERED_SEARCH_PATHS:
            if path == self or path in self.parents:
                return self.relative_to(path)
        return None

    def _eval_paths(self, segment, next_call, relative):
        """Project ``segment`` onto each search root and return the first hit.

        Args:
            segment: the component being appended, in whatever form
                ``next_call`` expects.
            next_call: name of the ``pathlib.Path`` method used to append
                ``segment`` to a candidate path.
            relative: the value of ``segment`` that means "parent
                directory" for ``next_call``.

        Returns:
            A ``LocalDataPath`` for the first projected path that exists.

        Raises:
            GarakException: if the request would move above a search root,
                if this path is not under any search root, or if no search
                root contains the requested resource.
        """
        # Refuse to step out of a search root via a parent reference.
        if self in self.ORDERED_SEARCH_PATHS and segment == relative:
            raise GarakException(
                "The requested resource does not refer to a valid path"
            )

        prefix_removed = self._determine_suffix()
        if prefix_removed is None:
            # if LocalDataPath is instantiated using a path not in ORDERED_SEARCH_PATHS
            raise GarakException(
                f"The requested resource does not refer to a valid path: {self}"
            )

        for path in self.ORDERED_SEARCH_PATHS:
            current_path = path / prefix_removed
            if segment == relative:
                projected = current_path.parent
            else:
                projected = getattr(current_path, next_call)(segment)
            if projected.exists():
                return LocalDataPath(projected)

        raise GarakException(f"The resource requested does not exist {segment}")

    def _glob(self, pattern, recursive=False):
        """Glob ``pattern`` under every search root, de-duplicated by precedence.

        When the same root-relative file is present under several search
        roots, only the one from the earliest root is returned.

        Raises:
            GarakException: if this path is not under any search root.
        """
        glob_method = "rglob" if recursive else "glob"

        prefix_removed = self._determine_suffix()
        if prefix_removed is None:
            # Without this guard `path / None` below fails with a TypeError.
            raise GarakException(
                f"The requested resource does not refer to a valid path: {self}"
            )

        seen_suffixes = set()
        selected_files = []
        for path in self.ORDERED_SEARCH_PATHS:
            candidate_path = path / prefix_removed
            for file in getattr(candidate_path, glob_method)(pattern):
                suffix = LocalDataPath(file)._determine_suffix()
                if suffix not in seen_suffixes:
                    seen_suffixes.add(suffix)
                    selected_files.append(file)

        return selected_files

    def glob(self, pattern):
        """Non-recursive glob across all search roots (see ``_glob``)."""
        return self._glob(pattern, recursive=False)

    def rglob(self, pattern):
        """Recursive glob across all search roots (see ``_glob``)."""
        return self._glob(pattern, recursive=True)

    def _make_child(self, segment):
        """Resolve a child path through the search roots.

        NOTE(review): presumably this overrides the private pathlib hook
        behind the ``/`` operator, which passes the appended component as a
        one-element tuple -- confirm for each supported Python version.
        """
        return self._eval_paths(segment, "_make_child", ("..",))

    def joinpath(self, *pathsegments):
        """Join each segment in turn, resolving every step via the search roots.

        Each segment is applied to the result of the previous one, so
        ``joinpath("a", "b")`` resolves ``a`` and then ``b`` beneath it.
        With no segments the path itself is returned.
        """
        projected = self
        for segment in pathsegments:
            projected = projected._eval_paths(segment, "joinpath", "..")
        return projected


# Module-level entry point for resource lookups, anchored at the first search
# root (`data_dir / "data"`); other modules import it as `data_path`.
path = LocalDataPath(_config.transient.data_dir / "data")
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
7 changes: 4 additions & 3 deletions garak/detectors/riskywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
import csv

from garak import _config
from garak.data import path as data_path
from garak.detectors.base import StringDetector

surge_list = defaultdict(list)
with open(
_config.transient.package_dir / "resources" / "profanity_en.csv",
data_path / "profanity_en.csv",
"r",
encoding="utf-8",
) as csvfile:
Expand All @@ -26,7 +27,7 @@
MIN_OFCOM_STRENGTH = 2
ofcom_list = defaultdict(list)
with open(
_config.transient.package_dir / "resources" / "ofcom-potentially-offensive.txt",
data_path / "ofcom-potentially-offensive.txt",
"r",
encoding="utf-8",
) as csvfile:
Expand Down Expand Up @@ -333,7 +334,7 @@ class LDNOOBW(StringDetector):
def __init__(self, config_root=_config):
substrings = []
with open(
_config.transient.package_dir / "resources" / "ldnoobw-en.txt",
data_path / "ldnoobw-en.txt",
"r",
encoding="utf-8",
) as wordlist:
Expand Down
3 changes: 2 additions & 1 deletion garak/detectors/specialwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from garak import _config
from garak.detectors.base import StringDetector
from garak.data import path as data_path


class SlursReclaimedSlurs(StringDetector):
Expand All @@ -25,7 +26,7 @@ class SlursReclaimedSlurs(StringDetector):
def __init__(self, config_root=_config):
self.substrings = set([])
with open(
_config.transient.package_dir / "resources" / "slurprompts.jsonl",
data_path / "slurprompts.jsonl",
"r",
encoding="utf-8",
) as slurfile:
Expand Down
5 changes: 2 additions & 3 deletions garak/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import garak.harnesses.probewise
from garak import _config
from garak.evaluators import ThresholdEvaluator
from garak.data import path as data_path
from garak._plugins import enumerate_plugins

logger = getLogger(__name__)
Expand Down Expand Up @@ -232,9 +233,7 @@ def settings_ns_provider(self) -> argparse.Namespace:

def banner():
"""Display a random banner"""
with open(
_config.transient.package_dir / "resources" / "banners", "r", encoding="utf-8"
) as bannerfile:
with open(data_path / "banners", "r", encoding="utf-8") as bannerfile:
banners = bannerfile.read().lstrip().split("\n\n")
print(Fore.LIGHTRED_EX + f"\n{random.choice(banners)}\n" + Fore.RESET)

Expand Down
34 changes: 10 additions & 24 deletions garak/payloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import garak._config
import garak.exception
from garak.data import path as data_path


PAYLOAD_SCHEMA = {
Expand All @@ -35,10 +36,7 @@
],
}

PAYLOAD_SEARCH_DIRS = [
garak._config.transient.data_dir / "resources" / "payloads",
garak._config.transient.package_dir / "resources" / "payloads",
]
PAYLOAD_DIR = data_path / "payloads"


def _validate_payload(payload_json):
Expand All @@ -52,17 +50,9 @@ def _validate_payload(payload_json):
def load_payload(
name: str, path: Union[str, pathlib.Path, None] = None
) -> PayloadGroup:
if path is not None:
return PayloadGroup(name, path)
else:
# iterate through search dirs
for dir in PAYLOAD_SEARCH_DIRS:
path = pathlib.Path(dir) / f"{name}.json"
if path.is_file():
return PayloadGroup(name, path)
raise FileNotFoundError(
"File '%s.json' not found in payload search directories" % name
)
if path is None:
path = PAYLOAD_DIR / f"{name}.json"
return PayloadGroup(name, path)


class PayloadGroup:
Expand Down Expand Up @@ -155,7 +145,7 @@ def _scan_payload_dir(self, dir) -> dict:
payloads, return name:path dict. optionally filter by type prefixes"""

payloads_found = {}
dir = pathlib.Path(dir)
dir = dir
if not dir.is_dir():
return {}

Expand All @@ -182,11 +172,7 @@ def _scan_payload_dir(self, dir) -> dict:
def _refresh_payloads(self) -> None:
"""Scan resources/payloads and the XDG_DATA_DIR/payloads for
payload objects, and refresh self.payload_list"""
self.payload_list = {}
for payload_dir in PAYLOAD_SEARCH_DIRS[
::-1
]: # reverse order because | clobbers at top-level key
self.payload_list = self.payload_list | self._scan_payload_dir(payload_dir)
self.payload_list = self._scan_payload_dir(PAYLOAD_DIR)

def search(
self, types: Union[List[str], None] = None, include_children=True
Expand Down Expand Up @@ -222,10 +208,10 @@ def load(self, name) -> PayloadGroup:
logging.error(msg, exc_info=ke)
raise garak.exception.PayloadFailure(msg) from ke

except FileNotFoundError as fnfe:
except garak.exception.GarakException as ge:
msg = f"Requested payload {name} not found at expected path {path}"
logging.error(msg, exc_info=fnfe)
raise garak.exception.PayloadFailure(msg) from fnfe
logging.error(msg, exc_info=ge)
raise garak.exception.PayloadFailure(msg) from ge

return p

Expand Down
3 changes: 2 additions & 1 deletion garak/probes/continuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from garak import _config
from garak.attempt import Attempt
from garak.data import path as data_path
from garak.probes.base import Probe


Expand Down Expand Up @@ -43,7 +44,7 @@ def __init__(self, config_root=_config):
# append these to each of the prefixes for that term
self._slur_prefixes = []
with open(
_config.transient.package_dir / "resources" / self.source_resource_filename,
data_path / self.source_resource_filename,
"r",
encoding="utf-8",
) as slurfile:
Expand Down
11 changes: 3 additions & 8 deletions garak/probes/dan.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import garak.attempt
from garak import _config
from garak.data import path as data_path
from garak.probes.base import Probe


Expand Down Expand Up @@ -537,11 +538,7 @@ class AutoDANCached(Probe):

def __init__(
self,
autodan_prompt_location: str = _config.transient.package_dir
/ "resources"
/ "autodan"
/ "data"
/ "autodan_prompts.txt",
autodan_prompt_location: str = data_path / "autodan" / "autodan_prompts.txt",
config_root=_config,
):
self.autodan_prompt_location = autodan_prompt_location
Expand Down Expand Up @@ -645,9 +642,7 @@ class DanInTheWild(Probe):
def __init__(self, config_root=_config):
super().__init__(config_root=config_root)

inthewild_path = str(
_config.transient.package_dir / "resources" / "inthewild_jailbreak_llms.txt"
)
inthewild_path = str(data_path / "inthewild_jailbreak_llms.txt")

with open(inthewild_path, "r", encoding="utf-8") as f:
self.prompts = f.readlines()
Expand Down
6 changes: 2 additions & 4 deletions garak/probes/donotanswer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import sys

from garak import _config
from garak.data import path as data_path
from garak.probes.base import Probe

this = sys.modules[__name__]
Expand All @@ -19,10 +20,7 @@
def load_local_data(self):
# prompts
with open(
_config.transient.package_dir
/ "resources"
/ "donotanswer"
/ f"{self.dna_category}.txt",
data_path / "donotanswer" / f"{self.dna_category}.txt",
encoding="utf=8",
) as f:
self.prompts = f.read().strip().split("\n")
Expand Down
3 changes: 2 additions & 1 deletion garak/probes/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import json

from garak import _config
from garak.data import path as data_path
from garak.probes.base import Probe

payload_library = {}
Expand Down Expand Up @@ -58,7 +59,7 @@ def _load_payloads():

payload_library["slur_terms"] = []
with open(
_config.transient.package_dir / "resources" / "slurprompts_mini.jsonl",
data_path / "slurprompts_mini.jsonl",
"r",
encoding="utf-8",
) as slurfile:
Expand Down
Loading

0 comments on commit 29c5b9a

Please sign in to comment.