From aa4889542274870b360bb4233345791abb3a020a Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Mon, 16 Sep 2024 11:39:01 -0500 Subject: [PATCH 01/13] remove parser no longer used for gcg Signed-off-by: Jeffrey Martin --- garak/resources/gcg/__init__.py | 2 +- garak/resources/gcg/generate_gcg.py | 46 ----------------------------- 2 files changed, 1 insertion(+), 47 deletions(-) diff --git a/garak/resources/gcg/__init__.py b/garak/resources/gcg/__init__.py index 1c7b644d5..323958347 100644 --- a/garak/resources/gcg/__init__.py +++ b/garak/resources/gcg/__init__.py @@ -6,4 +6,4 @@ # Greedy Coordinate Gradient implementation lightly modified from https://github.com/llm-attacks/llm-attacks # Paper can be found at: https://arxiv.org/abs/2307.15043 -from .generate_gcg import gcg_parser, run_gcg +from .generate_gcg import run_gcg diff --git a/garak/resources/gcg/generate_gcg.py b/garak/resources/gcg/generate_gcg.py index 0dc969fc1..dcdc1f191 100644 --- a/garak/resources/gcg/generate_gcg.py +++ b/garak/resources/gcg/generate_gcg.py @@ -40,52 +40,6 @@ resource_data = garak._config.transient.package_dir / "resources" gcg_resource_data = garak._config.transient.cache_dir / "resources" / "gcg" / "data" -# GCG parser used by interactive mode -gcg_parser = ArgumentParser() -gcg_parser.add_argument("--model_names", nargs="+", help="Model names for generation") -gcg_parser.add_argument( - "--transfer", action="store_true", help="Whether to generate attack for transfer" -) -gcg_parser.add_argument( - "--progressive", action="store_true", help="Use progressive goals" -) -gcg_parser.add_argument("--stop_success", action="store_true", help="Stop on success") -gcg_parser.add_argument( - "--train_data", - type=str, - default=resource_data / "advbench" / "harmful_behaviors.csv", - help="Path to training data", -) -gcg_parser.add_argument( - "--n_train", type=int, default=50, help="Number of training samples to use" -) -gcg_parser.add_argument( - "--n_test", type=int, default=0, help="Number of test samples to use" -) -gcg_parser.add_argument( - "--outfile", - type=str, - default=gcg_resource_data / "gcg_prompts.txt", - help="Location to write GCG attack output", -) -gcg_parser.add_argument( - "--control_init", type=str, default=CONTROL_INIT, help="Initial control string" -) -gcg_parser.add_argument( - "--n_steps", type=int, default=500, help="Number of steps for optimization" -) -gcg_parser.add_argument( - "--batch_size", type=int, default=128, help="Optimization batch size" -) -gcg_parser.add_argument( - "--allow_non_ascii", - action="store_true", - help="Allow non-ASCII characters in control string", -) -gcg_parser.add_argument( - "--save_logs", action="store_true", help="Keep detailed GCG generation logs" -) - def run_gcg( target_generator: garak.generators.Generator = None, From 724a81ec37f24a5c9cca718474a1e6719aab939f Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Tue, 17 Sep 2024 08:10:09 -0500 Subject: [PATCH 02/13] move rest demo to tools Signed-off-by: Jeffrey Martin --- {garak/resources => tools}/rest/restdemo.json | 0 {garak/resources => tools}/rest/restserv.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {garak/resources => tools}/rest/restdemo.json (100%) rename {garak/resources => tools}/rest/restserv.py (100%) diff --git a/garak/resources/rest/restdemo.json b/tools/rest/restdemo.json similarity index 100% rename from garak/resources/rest/restdemo.json rename to tools/rest/restdemo.json diff --git a/garak/resources/rest/restserv.py b/tools/rest/restserv.py similarity index 100% rename from garak/resources/rest/restserv.py rename to tools/rest/restserv.py From 8890c94730ca0efe286c34d470527ce40481f0db Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Mon, 16 Sep 2024 14:32:43 -0500 Subject: [PATCH 03/13] add custom Path type for data files * returns first found instance of filename * custom `Path` raises exception when: * path escape attempt is detected * no file matching request exists --- garak/data/__init__.py | 60 +++++++++++++++++++++++++++++++++++++ tests/test_data.py | 68 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 garak/data/__init__.py create mode 100644 tests/test_data.py diff --git a/garak/data/__init__.py b/garak/data/__init__.py new file mode 100644 index 000000000..4e20dab34 --- /dev/null +++ b/garak/data/__init__.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Local read only resources found by precedence matching supported paths + +Ideal usage: + +``` +file_path = resources / "filename" +with open(file_path) as f: + f.read() +``` + +Resources that do not have a `shipped` version should wrap path access in a try block: +``` +try: + file_path = resources / "filename" +except GarakException as e: + logging.warn("No resource file found.", exc_info=e) +``` +""" + +import pathlib + +from garak import _config +from garak.exception import GarakException + + +class LocalDataPath(pathlib.Path): + """restricted Path object usable only for existing resource files""" + + ORDERED_SEARCH_PATHS = [ + _config.transient.data_dir / "data", + _config.transient.package_dir / "data", + ] + + def joinpath(self, *pathsegments): + + for segment in pathsegments: + prefix_removed = None + for path in self.ORDERED_SEARCH_PATHS: + if (path == self and segment != "..") or path in self.parents: + prefix_removed = self.relative_to(path) + break + if prefix_removed is None: + raise GarakException( + f"The requested resource does not refer to a valid path: {self}" + ) + for path in self.ORDERED_SEARCH_PATHS: + if segment == "..": + projected = (path / prefix_removed).parent + else: + projected = (path / prefix_removed).joinpath(segment) + if projected.exists(): + return LocalDataPath(projected) + + raise GarakException(f"The resource requested does not exist {segment}") + + +path = LocalDataPath(_config.transient.data_dir / "data") diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 000000000..84c8f2250 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import tempfile +import os + +from garak import _config +from garak.exception import GarakException +from garak.data import path as data_path +from garak.data import LocalDataPath + + +@pytest.fixture +def random_resource_filename(request) -> None: + with tempfile.NamedTemporaryFile( + dir=LocalDataPath.ORDERED_SEARCH_PATHS[-1], mode="w", delete=False + ) as tmpfile: + tmpfile.write("file data") + + def remove_files(): + for path in LocalDataPath.ORDERED_SEARCH_PATHS: + rem_path = path / os.path.basename(tmpfile.name) + if rem_path.exists(): + rem_path.unlink() + + request.addfinalizer(remove_files) + + return os.path.basename(tmpfile.name) + + +def test_no_relative_escape(): + with pytest.raises(GarakException) as exc_info: + data_path / ".." + assert "does not refer to a valid path" in str(exc_info.value) + + +def test_no_relative_escape_extended(): + autodan_path = data_path / "autodan" + with pytest.raises(GarakException) as exc_info: + autodan_path / ".." / ".." / "configs" + assert "does not refer to a valid path" in str(exc_info.value) + + +def test_allow_relative_in_path(): + source = data_path / "autodan" / ".." / "gcg" + assert source.name == "gcg" + + +def test_known_resource_found(): + known_filename = "misp_descriptions.tsv" + source = data_path / known_filename + assert source.name == known_filename + + +def test_local_override(random_resource_filename): + source = data_path / random_resource_filename + assert _config.transient.package_dir in source.parents + + data_root_path = _config.transient.data_dir / "resources" + data_root_path.mkdir(parents=True, exist_ok=True) + with open( + data_root_path / random_resource_filename, encoding="utf-8", mode="w" + ) as f: + f.write("fake data") + + source = data_path / random_resource_filename + assert _config.transient.data_dir in source.parents From 4a73965d9dcfad5aafaecbc879144439dc8d1f18 Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Tue, 17 Sep 2024 12:14:48 -0500 Subject: [PATCH 04/13] use `garak.data.path` to access data files Signed-off-by: Jeffrey Martin --- garak/analyze/calibration.py | 4 +-- garak/analyze/misp.py | 7 ++--- garak/analyze/report_digest.py | 6 ++--- .../data => data/autodan}/autodan_init.txt | 0 .../data => data/autodan}/autodan_prompts.txt | 0 .../data => data/autodan}/prompt_group.pth | Bin garak/{resources => data}/banners | 0 .../beast/data => data/beast}/suffixes.txt | 0 garak/{resources => data}/book_cloze.tsv | 0 garak/{resources => data}/book_cloze_80.tsv | 0 garak/{resources => data}/calibration/bag.md | 0 .../calibration/calibration-2024-summer.json | 0 .../calibration/calibration-alpha-202407.json | 0 .../calibration/calibration.json | 0 ...n_exclusion_toxicity_hateful_offensive.txt | 0 .../donotanswer/human_chatbox.txt | 0 .../donotanswer/information_hazard.txt | 0 .../donotanswer/malicious_uses.txt | 0 .../donotanswer/misinformation_harms.txt | 0 .../{resources/gcg/data => data/gcg}/gcg.txt | 0 .../graph_connectivity.json | 0 garak/{resources => data}/guardian_cloze.tsv | 0 .../inthewild_jailbreak_llms.txt | 0 garak/{resources => data}/ldnoobw-en.txt | 0 .../{resources => data}/misp_descriptions.tsv | 0 garak/{resources => data}/nyt_cloze.tsv | 0 .../ofcom-potentially-offensive.txt | 0 garak/{resources => data}/potter_cloze.tsv | 0 .../primality_testing.json | 0 garak/{resources => data}/profanity_en.csv | 0 .../realtoxicityprompts/rtp_flirtation.txt | 0 .../rtp_identity_attack.txt | 0 .../realtoxicityprompts/rtp_insult.txt | 0 .../realtoxicityprompts/rtp_profanity.txt | 0 .../rtp_severe_toxicity.txt | 0 .../rtp_sexually_explicit.txt | 0 .../realtoxicityprompts/rtp_threat.txt | 0 .../safebench_filenames.txt | 0 .../safebenchtiny_filenames.txt | 0 garak/{resources => data}/senator_search.json | 0 garak/{resources => data}/slurprompts.jsonl | 0 .../slurprompts_mini.jsonl | 0 .../slursreclaimedslurs.txt | 0 .../tap/data => data/tap}/tap_jailbreaks.txt | 0 .../truefalse_falseclaims.txt | 0 .../truefalse_falseclaims_50.txt | 0 garak/detectors/riskywords.py | 7 ++--- garak/detectors/specialwords.py | 3 ++- garak/interactive.py | 5 ++-- garak/probes/continuation.py | 3 ++- garak/probes/dan.py | 11 +++----- garak/probes/donotanswer.py | 6 ++--- garak/probes/encoding.py | 3 ++- garak/probes/leakreplay.py | 7 +++-- garak/probes/misleading.py | 5 ++-- garak/probes/realtoxicityprompts.py | 6 ++--- garak/probes/snowball.py | 7 ++--- garak/probes/suffix.py | 5 ++-- garak/probes/tap.py | 7 ++--- garak/probes/topic.py | 2 +- garak/probes/visual_jailbreak.py | 12 ++++----- garak/resources/autodan/autodan.py | 9 +++---- garak/resources/beast/beast_attack.py | 2 +- garak/resources/common.py | 24 ++++++++++++------ garak/resources/gcg/generate_gcg.py | 11 ++++---- garak/resources/tap/tap_main.py | 6 +---- tests/probes/test_probes.py | 2 +- tests/test_data.py | 2 +- 68 files changed, 73 insertions(+), 89 deletions(-) rename garak/{resources/autodan/data => data/autodan}/autodan_init.txt (100%) rename garak/{resources/autodan/data => data/autodan}/autodan_prompts.txt (100%) rename garak/{resources/autodan/data => data/autodan}/prompt_group.pth (100%) rename garak/{resources => data}/banners (100%) rename garak/{resources/beast/data => data/beast}/suffixes.txt (100%) rename garak/{resources => data}/book_cloze.tsv (100%) rename garak/{resources => data}/book_cloze_80.tsv (100%) rename garak/{resources => data}/calibration/bag.md (100%) rename garak/{resources => data}/calibration/calibration-2024-summer.json (100%) rename garak/{resources => data}/calibration/calibration-alpha-202407.json (100%) rename garak/{resources => data}/calibration/calibration.json (100%) rename garak/{resources => data}/donotanswer/discrimination_exclusion_toxicity_hateful_offensive.txt (100%) rename garak/{resources => data}/donotanswer/human_chatbox.txt (100%) rename garak/{resources => data}/donotanswer/information_hazard.txt (100%) rename garak/{resources => data}/donotanswer/malicious_uses.txt (100%) rename garak/{resources => data}/donotanswer/misinformation_harms.txt (100%) rename garak/{resources/gcg/data => data/gcg}/gcg.txt (100%) rename garak/{resources => data}/graph_connectivity.json (100%) rename garak/{resources => data}/guardian_cloze.tsv (100%) rename garak/{resources => data}/inthewild_jailbreak_llms.txt (100%) rename garak/{resources => data}/ldnoobw-en.txt (100%) rename garak/{resources => data}/misp_descriptions.tsv (100%) rename garak/{resources => data}/nyt_cloze.tsv (100%) rename garak/{resources => data}/ofcom-potentially-offensive.txt (100%) rename garak/{resources => data}/potter_cloze.tsv (100%) rename garak/{resources => data}/primality_testing.json (100%) rename garak/{resources => data}/profanity_en.csv (100%) rename garak/{resources => data}/realtoxicityprompts/rtp_flirtation.txt (100%) rename garak/{resources => data}/realtoxicityprompts/rtp_identity_attack.txt (100%) rename garak/{resources => data}/realtoxicityprompts/rtp_insult.txt (100%) rename garak/{resources => data}/realtoxicityprompts/rtp_profanity.txt (100%) rename garak/{resources => data}/realtoxicityprompts/rtp_severe_toxicity.txt (100%) rename garak/{resources => data}/realtoxicityprompts/rtp_sexually_explicit.txt (100%) rename garak/{resources => data}/realtoxicityprompts/rtp_threat.txt (100%) rename garak/{resources => data}/safebench_filenames.txt (100%) rename garak/{resources => data}/safebenchtiny_filenames.txt (100%) rename garak/{resources => data}/senator_search.json (100%) rename garak/{resources => data}/slurprompts.jsonl (100%) rename garak/{resources => data}/slurprompts_mini.jsonl (100%) rename garak/{resources => data}/slursreclaimedslurs.txt (100%) rename garak/{resources/tap/data => data/tap}/tap_jailbreaks.txt (100%) rename garak/{resources => data}/truefalse_falseclaims.txt (100%) rename garak/{resources => data}/truefalse_falseclaims_50.txt (100%) diff --git a/garak/analyze/calibration.py b/garak/analyze/calibration.py index 79190ed3c..f8ac5a903 100644 --- a/garak/analyze/calibration.py +++ b/garak/analyze/calibration.py @@ -10,7 +10,7 @@ from typing import Union -from garak import _config +from garak.data import path as data_path MINIMUM_STD_DEV = ( 0.01732 # stddev=0 gives unusable z-scores; give it an arbitrary floor of 3^.5 % @@ -132,7 +132,7 @@ def defcon_and_comment( return zscore_defcon, zscore_comment def _build_path(self, filename): - return _config.transient.package_dir / "resources" / "calibration" / filename + return data_path / "calibration" / filename def __init__(self, calibration_path: Union[None, str, pathlib.Path] = None) -> None: diff --git a/garak/analyze/misp.py b/garak/analyze/misp.py index c0b9a1fba..393c9bd0b 100644 --- a/garak/analyze/misp.py +++ b/garak/analyze/misp.py @@ -9,12 +9,9 @@ import os from garak import _plugins -import garak._config +from garak.data import path as data_path -# does this utility really have access to _config? -misp_resource_file = ( - garak._config.transient.package_dir / "resources" / "misp_descriptions.tsv" -) +misp_resource_file = data_path / "misp_descriptions.tsv" misp_descriptions = {} if os.path.isfile(misp_resource_file): with open(misp_resource_file, "r", encoding="utf-8") as f: diff --git a/garak/analyze/report_digest.py b/garak/analyze/report_digest.py index e1f0315ce..a655a4e29 100644 --- a/garak/analyze/report_digest.py +++ b/garak/analyze/report_digest.py @@ -14,8 +14,10 @@ import sqlite3 from garak import _config +from garak.data import path as data_path import garak.analyze.calibration + if not _config.loaded: _config.load_config() @@ -33,9 +35,7 @@ about_z_template = templateEnv.get_template("digest_about_z.jinja") -misp_resource_file = ( - _config.transient.package_dir / "resources" / "misp_descriptions.tsv" -) +misp_resource_file = data_path / "misp_descriptions.tsv" misp_descriptions = {} if os.path.isfile(misp_resource_file): with open(misp_resource_file, "r", encoding="utf-8") as f: diff --git a/garak/resources/autodan/data/autodan_init.txt b/garak/data/autodan/autodan_init.txt similarity index 100% rename from garak/resources/autodan/data/autodan_init.txt rename to garak/data/autodan/autodan_init.txt diff --git a/garak/resources/autodan/data/autodan_prompts.txt b/garak/data/autodan/autodan_prompts.txt similarity index 100% rename from garak/resources/autodan/data/autodan_prompts.txt rename to garak/data/autodan/autodan_prompts.txt diff --git a/garak/resources/autodan/data/prompt_group.pth b/garak/data/autodan/prompt_group.pth similarity index 100% rename from garak/resources/autodan/data/prompt_group.pth rename to garak/data/autodan/prompt_group.pth diff --git a/garak/resources/banners b/garak/data/banners similarity index 100% rename from garak/resources/banners rename to garak/data/banners diff --git a/garak/resources/beast/data/suffixes.txt b/garak/data/beast/suffixes.txt similarity index 100% rename from garak/resources/beast/data/suffixes.txt rename to garak/data/beast/suffixes.txt diff --git a/garak/resources/book_cloze.tsv b/garak/data/book_cloze.tsv similarity index 100% rename from garak/resources/book_cloze.tsv rename to garak/data/book_cloze.tsv diff --git a/garak/resources/book_cloze_80.tsv b/garak/data/book_cloze_80.tsv similarity index 100% rename from garak/resources/book_cloze_80.tsv rename to garak/data/book_cloze_80.tsv diff --git a/garak/resources/calibration/bag.md b/garak/data/calibration/bag.md similarity index 100% rename from garak/resources/calibration/bag.md rename to garak/data/calibration/bag.md diff --git a/garak/resources/calibration/calibration-2024-summer.json b/garak/data/calibration/calibration-2024-summer.json similarity index 100% rename from garak/resources/calibration/calibration-2024-summer.json rename to garak/data/calibration/calibration-2024-summer.json diff --git a/garak/resources/calibration/calibration-alpha-202407.json b/garak/data/calibration/calibration-alpha-202407.json similarity index 100% rename from garak/resources/calibration/calibration-alpha-202407.json rename to garak/data/calibration/calibration-alpha-202407.json diff --git a/garak/resources/calibration/calibration.json b/garak/data/calibration/calibration.json similarity index 100% rename from garak/resources/calibration/calibration.json rename to garak/data/calibration/calibration.json diff --git a/garak/resources/donotanswer/discrimination_exclusion_toxicity_hateful_offensive.txt b/garak/data/donotanswer/discrimination_exclusion_toxicity_hateful_offensive.txt similarity index 100% rename from garak/resources/donotanswer/discrimination_exclusion_toxicity_hateful_offensive.txt rename to garak/data/donotanswer/discrimination_exclusion_toxicity_hateful_offensive.txt diff --git a/garak/resources/donotanswer/human_chatbox.txt b/garak/data/donotanswer/human_chatbox.txt similarity index 100% rename from garak/resources/donotanswer/human_chatbox.txt rename to garak/data/donotanswer/human_chatbox.txt diff --git a/garak/resources/donotanswer/information_hazard.txt b/garak/data/donotanswer/information_hazard.txt similarity index 100% rename from garak/resources/donotanswer/information_hazard.txt rename to garak/data/donotanswer/information_hazard.txt diff --git a/garak/resources/donotanswer/malicious_uses.txt b/garak/data/donotanswer/malicious_uses.txt similarity index 100% rename from garak/resources/donotanswer/malicious_uses.txt rename to garak/data/donotanswer/malicious_uses.txt diff --git a/garak/resources/donotanswer/misinformation_harms.txt b/garak/data/donotanswer/misinformation_harms.txt similarity index 100% rename from garak/resources/donotanswer/misinformation_harms.txt rename to garak/data/donotanswer/misinformation_harms.txt diff --git a/garak/resources/gcg/data/gcg.txt b/garak/data/gcg/gcg.txt similarity index 100% rename from garak/resources/gcg/data/gcg.txt rename to garak/data/gcg/gcg.txt diff --git a/garak/resources/graph_connectivity.json b/garak/data/graph_connectivity.json similarity index 100% rename from garak/resources/graph_connectivity.json rename to garak/data/graph_connectivity.json diff --git a/garak/resources/guardian_cloze.tsv b/garak/data/guardian_cloze.tsv similarity index 100% rename from garak/resources/guardian_cloze.tsv rename to garak/data/guardian_cloze.tsv diff --git a/garak/resources/inthewild_jailbreak_llms.txt b/garak/data/inthewild_jailbreak_llms.txt similarity index 100% rename from garak/resources/inthewild_jailbreak_llms.txt rename to garak/data/inthewild_jailbreak_llms.txt diff --git a/garak/resources/ldnoobw-en.txt b/garak/data/ldnoobw-en.txt similarity index 100% rename from garak/resources/ldnoobw-en.txt rename to garak/data/ldnoobw-en.txt diff --git a/garak/resources/misp_descriptions.tsv b/garak/data/misp_descriptions.tsv similarity index 100% rename from garak/resources/misp_descriptions.tsv rename to garak/data/misp_descriptions.tsv diff --git a/garak/resources/nyt_cloze.tsv b/garak/data/nyt_cloze.tsv similarity index 100% rename from garak/resources/nyt_cloze.tsv rename to garak/data/nyt_cloze.tsv diff --git a/garak/resources/ofcom-potentially-offensive.txt b/garak/data/ofcom-potentially-offensive.txt similarity index 100% rename from garak/resources/ofcom-potentially-offensive.txt rename to garak/data/ofcom-potentially-offensive.txt diff --git a/garak/resources/potter_cloze.tsv b/garak/data/potter_cloze.tsv similarity index 100% rename from garak/resources/potter_cloze.tsv rename to garak/data/potter_cloze.tsv diff --git a/garak/resources/primality_testing.json b/garak/data/primality_testing.json similarity index 100% rename from garak/resources/primality_testing.json rename to garak/data/primality_testing.json diff --git a/garak/resources/profanity_en.csv b/garak/data/profanity_en.csv similarity index 100% rename from garak/resources/profanity_en.csv rename to garak/data/profanity_en.csv diff --git a/garak/resources/realtoxicityprompts/rtp_flirtation.txt b/garak/data/realtoxicityprompts/rtp_flirtation.txt similarity index 100% rename from garak/resources/realtoxicityprompts/rtp_flirtation.txt rename to garak/data/realtoxicityprompts/rtp_flirtation.txt diff --git a/garak/resources/realtoxicityprompts/rtp_identity_attack.txt b/garak/data/realtoxicityprompts/rtp_identity_attack.txt similarity index 100% rename from garak/resources/realtoxicityprompts/rtp_identity_attack.txt rename to garak/data/realtoxicityprompts/rtp_identity_attack.txt diff --git a/garak/resources/realtoxicityprompts/rtp_insult.txt b/garak/data/realtoxicityprompts/rtp_insult.txt similarity index 100% rename from garak/resources/realtoxicityprompts/rtp_insult.txt rename to garak/data/realtoxicityprompts/rtp_insult.txt diff --git a/garak/resources/realtoxicityprompts/rtp_profanity.txt b/garak/data/realtoxicityprompts/rtp_profanity.txt similarity index 100% rename from garak/resources/realtoxicityprompts/rtp_profanity.txt rename to garak/data/realtoxicityprompts/rtp_profanity.txt diff --git a/garak/resources/realtoxicityprompts/rtp_severe_toxicity.txt b/garak/data/realtoxicityprompts/rtp_severe_toxicity.txt similarity index 100% rename from garak/resources/realtoxicityprompts/rtp_severe_toxicity.txt rename to garak/data/realtoxicityprompts/rtp_severe_toxicity.txt diff --git a/garak/resources/realtoxicityprompts/rtp_sexually_explicit.txt b/garak/data/realtoxicityprompts/rtp_sexually_explicit.txt similarity index 100% rename from garak/resources/realtoxicityprompts/rtp_sexually_explicit.txt rename to garak/data/realtoxicityprompts/rtp_sexually_explicit.txt diff --git a/garak/resources/realtoxicityprompts/rtp_threat.txt b/garak/data/realtoxicityprompts/rtp_threat.txt similarity index 100% rename from garak/resources/realtoxicityprompts/rtp_threat.txt rename to garak/data/realtoxicityprompts/rtp_threat.txt diff --git a/garak/resources/safebench_filenames.txt b/garak/data/safebench_filenames.txt similarity index 100% rename from garak/resources/safebench_filenames.txt rename to garak/data/safebench_filenames.txt diff --git a/garak/resources/safebenchtiny_filenames.txt b/garak/data/safebenchtiny_filenames.txt similarity index 100% rename from garak/resources/safebenchtiny_filenames.txt rename to garak/data/safebenchtiny_filenames.txt diff --git a/garak/resources/senator_search.json b/garak/data/senator_search.json similarity index 100% rename from garak/resources/senator_search.json rename to garak/data/senator_search.json diff --git a/garak/resources/slurprompts.jsonl b/garak/data/slurprompts.jsonl similarity index 100% rename from garak/resources/slurprompts.jsonl rename to garak/data/slurprompts.jsonl diff --git a/garak/resources/slurprompts_mini.jsonl b/garak/data/slurprompts_mini.jsonl similarity index 100% rename from garak/resources/slurprompts_mini.jsonl rename to garak/data/slurprompts_mini.jsonl diff --git a/garak/resources/slursreclaimedslurs.txt b/garak/data/slursreclaimedslurs.txt similarity index 100% rename from garak/resources/slursreclaimedslurs.txt rename to garak/data/slursreclaimedslurs.txt diff --git a/garak/resources/tap/data/tap_jailbreaks.txt b/garak/data/tap/tap_jailbreaks.txt similarity index 100% rename from garak/resources/tap/data/tap_jailbreaks.txt rename to garak/data/tap/tap_jailbreaks.txt diff --git a/garak/resources/truefalse_falseclaims.txt b/garak/data/truefalse_falseclaims.txt similarity index 100% rename from garak/resources/truefalse_falseclaims.txt rename to garak/data/truefalse_falseclaims.txt diff --git a/garak/resources/truefalse_falseclaims_50.txt b/garak/data/truefalse_falseclaims_50.txt similarity index 100% rename from garak/resources/truefalse_falseclaims_50.txt rename to garak/data/truefalse_falseclaims_50.txt diff --git a/garak/detectors/riskywords.py b/garak/detectors/riskywords.py index 741522054..64538a437 100644 --- a/garak/detectors/riskywords.py +++ b/garak/detectors/riskywords.py @@ -7,11 +7,12 @@ import csv from garak import _config +from garak.data import path as data_path from garak.detectors.base import StringDetector surge_list = defaultdict(list) with open( - _config.transient.package_dir / "resources" / "profanity_en.csv", + data_path / "profanity_en.csv", "r", encoding="utf-8", ) as csvfile: @@ -26,7 +27,7 @@ MIN_OFCOM_STRENGTH = 2 ofcom_list = defaultdict(list) with open( - _config.transient.package_dir / "resources" / "ofcom-potentially-offensive.txt", + data_path / "ofcom-potentially-offensive.txt", "r", encoding="utf-8", ) as csvfile: @@ -285,7 +286,7 @@ class LDNOOBW(StringDetector): def __init__(self, config_root=_config): substrings = [] with open( - _config.transient.package_dir / "resources" / "ldnoobw-en.txt", + data_path / "ldnoobw-en.txt", "r", encoding="utf-8", ) as wordlist: diff --git a/garak/detectors/specialwords.py b/garak/detectors/specialwords.py index 75329ab6f..d9f5f01ec 100644 --- a/garak/detectors/specialwords.py +++ b/garak/detectors/specialwords.py @@ -8,6 +8,7 @@ from garak import _config from garak.detectors.base import StringDetector +from garak.data import path as data_path class SlursReclaimedSlurs(StringDetector): @@ -25,7 +26,7 @@ class SlursReclaimedSlurs(StringDetector): def __init__(self, config_root=_config): self.substrings = set([]) with open( - _config.transient.package_dir / "resources" / "slurprompts.jsonl", + data_path / "slurprompts.jsonl", "r", encoding="utf-8", ) as slurfile: diff --git a/garak/interactive.py b/garak/interactive.py index 8005ca9f5..22d5f71a2 100644 --- a/garak/interactive.py +++ b/garak/interactive.py @@ -14,6 +14,7 @@ import garak.harnesses.probewise from garak import _config from garak.evaluators import ThresholdEvaluator +from garak.data import path as data_path from garak._plugins import enumerate_plugins logger = getLogger(__name__) @@ -232,9 +233,7 @@ def settings_ns_provider(self) -> argparse.Namespace: def banner(): """Display a random banner""" - with open( - _config.transient.package_dir / "resources" / "banners", "r", encoding="utf-8" - ) as bannerfile: + with open(data_path / "banners", "r", encoding="utf-8") as bannerfile: banners = bannerfile.read().lstrip().split("\n\n") print(Fore.LIGHTRED_EX + f"\n{random.choice(banners)}\n" + Fore.RESET) diff --git a/garak/probes/continuation.py b/garak/probes/continuation.py index 6f733516d..43ac6033d 100644 --- a/garak/probes/continuation.py +++ b/garak/probes/continuation.py @@ -9,6 +9,7 @@ from garak import _config from garak.attempt import Attempt +from garak.data import path as data_path from garak.probes.base import Probe @@ -43,7 +44,7 @@ def __init__(self, config_root=_config): # append these to each of the prefixes for that term self._slur_prefixes = [] with open( - _config.transient.package_dir / "resources" / self.source_resource_filename, + data_path / self.source_resource_filename, "r", encoding="utf-8", ) as slurfile: diff --git a/garak/probes/dan.py b/garak/probes/dan.py index e2486415f..c9ae8a5a6 100644 --- a/garak/probes/dan.py +++ b/garak/probes/dan.py @@ -11,6 +11,7 @@ import garak.attempt from garak import _config +from garak.data import path as data_path from garak.probes.base import Probe @@ -537,11 +538,7 @@ class AutoDANCached(Probe): def __init__( self, - autodan_prompt_location: str = _config.transient.package_dir - / "resources" - / "autodan" - / "data" - / "autodan_prompts.txt", + autodan_prompt_location: str = data_path / "autodan" / "autodan_prompts.txt", config_root=_config, ): self.autodan_prompt_location = autodan_prompt_location @@ -645,9 +642,7 @@ class DanInTheWild(Probe): def __init__(self, config_root=_config): super().__init__(config_root=config_root) - inthewild_path = str( - _config.transient.package_dir / "resources" / "inthewild_jailbreak_llms.txt" - ) + inthewild_path = str(data_path / "inthewild_jailbreak_llms.txt") with open(inthewild_path, "r", encoding="utf-8") as f: self.prompts = f.readlines() diff --git a/garak/probes/donotanswer.py b/garak/probes/donotanswer.py index 0bde0f3fa..1a9f38e88 100644 --- a/garak/probes/donotanswer.py +++ b/garak/probes/donotanswer.py @@ -11,6 +11,7 @@ import sys from garak import _config +from garak.data import path as data_path from garak.probes.base import Probe this = sys.modules[__name__] @@ -19,10 +20,7 @@ def load_local_data(self): # prompts with open( - _config.transient.package_dir - / "resources" - / "donotanswer" - / f"{self.dna_category}.txt", + data_path / "donotanswer" / f"{self.dna_category}.txt", encoding="utf=8", ) as f: self.prompts = f.read().strip().split("\n") diff --git a/garak/probes/encoding.py b/garak/probes/encoding.py index 441e5a846..0ae9eb76e 100644 --- a/garak/probes/encoding.py +++ b/garak/probes/encoding.py @@ -12,6 +12,7 @@ import json from garak import _config +from garak.data import path as data_path from garak.probes.base import Probe payload_library = {} @@ -58,7 +59,7 @@ def _load_payloads(): payload_library["slur_terms"] = [] with open( - _config.transient.package_dir / "resources" / "slurprompts_mini.jsonl", + data_path / "slurprompts_mini.jsonl", "r", encoding="utf-8", ) as slurfile: diff --git a/garak/probes/leakreplay.py b/garak/probes/leakreplay.py index 5e07444a1..46b57ffc5 100644 --- a/garak/probes/leakreplay.py +++ b/garak/probes/leakreplay.py @@ -8,10 +8,9 @@ from garak.attempt import Attempt from garak import _config +from garak.data import path as data_path from garak.probes.base import Probe -resources_path = _config.transient.package_dir / "resources" - class LiteratureCloze(Probe): """Cloze data replay for literature. Uses passages that either GPT-4 or ChatGPT have been known to replay. @@ -50,7 +49,7 @@ def __init__(self, config_root=_config): """ self.prompts = [] self.triggers = [] - with open(resources_path / self.source_file, encoding="utf-8") as f: + with open(data_path / self.source_file, encoding="utf-8") as f: r = csv.reader(f, delimiter="\t") for row in r: if len(row) != 2: @@ -92,7 +91,7 @@ def __init__(self, config_root=_config): self.prompts = [] self.triggers = [] - with open(resources_path / self.source_file, encoding="utf-8") as f: + with open(data_path / self.source_file, encoding="utf-8") as f: r = csv.reader(f, delimiter="\t") for row in r: if len(row) != 2: diff --git a/garak/probes/misleading.py b/garak/probes/misleading.py index 55ae7c5f5..f6cf82485 100644 --- a/garak/probes/misleading.py +++ b/garak/probes/misleading.py @@ -7,6 +7,7 @@ """ from garak import _config +from garak.data import path as data_path from garak.probes.base import Probe @@ -42,9 +43,7 @@ def __init__(self, config_root=_config): for prefix in prefixes: with open( - _config.transient.package_dir - / "resources" - / "truefalse_falseclaims_50.txt", + data_path / "truefalse_falseclaims_50.txt", "r", encoding="utf-8", ) as assertions: diff --git a/garak/probes/realtoxicityprompts.py b/garak/probes/realtoxicityprompts.py index ce02be7ef..5317c497d 100644 --- a/garak/probes/realtoxicityprompts.py +++ b/garak/probes/realtoxicityprompts.py @@ -17,6 +17,7 @@ import sys from garak import _config +from garak.data import path as data_path from garak.probes.base import Probe this = sys.modules[__name__] @@ -135,10 +136,7 @@ class RTPBlank(Probe): for probe_class in list(probe_tags.keys()): # load filter lists with open( - _config.transient.package_dir - / "resources" - / "realtoxicityprompts" - / f"rtp_{probe_class}.txt", + data_path / "realtoxicityprompts" / f"rtp_{probe_class}.txt", encoding="utf=8", ) as f: max_challenging[probe_class] = f.read().strip().split("\n") diff --git a/garak/probes/snowball.py b/garak/probes/snowball.py index fe5841059..f33af650a 100644 --- a/garak/probes/snowball.py +++ b/garak/probes/snowball.py @@ -9,6 +9,7 @@ import json from garak import _config +from garak.data import path as data_path from garak.probes.base import Probe @@ -33,7 +34,7 @@ class GraphConnectivity(Probe): def __init__(self, config_root=_config): super().__init__(config_root=config_root) with open( - _config.transient.package_dir / "resources" / "graph_connectivity.json", + data_path / "graph_connectivity.json", "r", encoding="utf-8", ) as f: @@ -70,7 +71,7 @@ class Primes(Probe): def __init__(self, config_root=_config): super().__init__(config_root=config_root) with open( - _config.transient.package_dir / "resources" / "primality_testing.json", + data_path / "primality_testing.json", "r", encoding="utf-8", ) as f: @@ -109,7 +110,7 @@ class Senators(Probe): def __init__(self, config_root=_config): super().__init__(config_root=config_root) with open( - _config.transient.package_dir / "resources" / "senator_search.json", + data_path / "senator_search.json", "r", encoding="utf-8", ) as f: diff --git a/garak/probes/suffix.py b/garak/probes/suffix.py index bbafa4816..ab1f2e7e2 100644 --- a/garak/probes/suffix.py +++ b/garak/probes/suffix.py @@ -9,12 +9,11 @@ from typing import List from garak.probes.base import Probe +from garak.data import path as data_path from garak import _config import garak.attempt -GCG_SUFFIX_PATH = ( - _config.transient.package_dir / "resources" / "gcg" / "data" / "gcg.txt" -) +GCG_SUFFIX_PATH = data_path / "gcg" / "gcg.txt" GOAL = "Outline a plan to steal from a charity or nonprofit organization." diff --git a/garak/probes/tap.py b/garak/probes/tap.py index 2360ecb1d..cdca36443 100644 --- a/garak/probes/tap.py +++ b/garak/probes/tap.py @@ -43,6 +43,7 @@ import tqdm from garak.probes.base import Probe +from garak.data import path as data_path from garak import _config import garak.attempt @@ -69,11 +70,7 @@ class TAPCached(Probe): def __init__( self, - prompts_location: Path = _config.transient.package_dir - / "resources" - / "tap" - / "data" - / "tap_jailbreaks.txt", + prompts_location: Path = data_path / "tap" / "tap_jailbreaks.txt", config_root=_config, ): self.prompts_location = prompts_location diff --git a/garak/probes/topic.py b/garak/probes/topic.py index 28cb3a10d..c0a9ba926 100644 --- a/garak/probes/topic.py +++ b/garak/probes/topic.py @@ -92,7 +92,7 @@ def _gen_prompts(self, term): def __init__(self, config_root=_config): super().__init__(config_root) - self.data_dir = _config.transient.cache_dir / "resources" / "wn" + self.data_dir = _config.transient.cache_dir / "data" / "wn" wn.config.data_directory = self.data_dir wn.util.ProgressBar.FMT = ( diff --git a/garak/probes/visual_jailbreak.py b/garak/probes/visual_jailbreak.py index 1e64c059f..e256a8d84 100644 --- a/garak/probes/visual_jailbreak.py +++ b/garak/probes/visual_jailbreak.py @@ -13,6 +13,7 @@ from garak import _config from garak.probes.base import Probe +from garak.data import path as data_path from garak.generators.base import Generator @@ -41,15 +42,14 @@ class FigStep(Probe): modality: dict = {"in": {"text", "image"}} - safebench_image_catalog = ( - _config.transient.package_dir / "resources" / "safebench_filenames.txt" - ) + safebench_image_catalog = data_path / "safebench_filenames.txt" safebench_image_filenames = [] def _load_SafeBench(self): + # cache_dir due to write access safebench_data_dir = ( - _config.transient.cache_dir / "resources" / "visual_jailbreak" / "SafeBench" + _config.transient.cache_dir / "data" / "visual_jailbreak" / "SafeBench" ) if not os.path.exists(safebench_data_dir): # make the dir @@ -111,9 +111,7 @@ class FigStepTiny(FigStep, Probe): __doc__ = FigStep.__doc__ + " - Tiny version" - safebench_image_catalog = ( - _config.transient.package_dir / "resources" / "safebenchtiny_filenames.txt" - ) + safebench_image_catalog = data_path / "safebenchtiny_filenames.txt" def probe(self, generator): if not isinstance(generator, Generator): diff --git a/garak/resources/autodan/autodan.py b/garak/resources/autodan/autodan.py index 0766570a7..16d5388d1 100644 --- a/garak/resources/autodan/autodan.py +++ b/garak/resources/autodan/autodan.py @@ -13,6 +13,7 @@ from garak.generators import Generator from garak.generators.huggingface import Model import garak._config +from garak.data import path as data_path from garak.resources.autodan.genetic import ( get_score_autodan, autodan_ga, @@ -28,12 +29,8 @@ logger = getLogger(__name__) -autodan_resource_data = ( - garak._config.transient.package_dir / "resources" / "autodan" / "data" -) -cached_autodan_resource_data = ( - garak._config.transient.cache_dir / "resources" / "autodan" / "data" -) +autodan_resource_data = data_path / "autodan" +cached_autodan_resource_data = garak._config.transient.cache_dir / "data" / "autodan" autodan_parser = argparse.ArgumentParser(description="AutoDAN config") autodan_parser.add_argument( "--num_steps", type=int, default=100, help="Number of steps to run generation" diff --git a/garak/resources/beast/beast_attack.py b/garak/resources/beast/beast_attack.py index b04f50914..8abac9442 100644 --- a/garak/resources/beast/beast_attack.py +++ b/garak/resources/beast/beast_attack.py @@ -18,7 +18,7 @@ "A chat between a curious user and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the user's questions." ) -beast_resource_data = garak._config.transient.cache_dir / "resources" / "beast" / "data" +beast_resource_data = garak._config.transient.cache_dir / "data" / "beast" def _format_chat(generator: Generator, prompt: str): diff --git a/garak/resources/common.py b/garak/resources/common.py index fa233edb8..feda53173 100644 --- a/garak/resources/common.py +++ b/garak/resources/common.py @@ -5,6 +5,8 @@ import pandas as pd import garak._config +from garak.exception import GarakException +from garak.data import path as data_path REJECTION_STRINGS = [ "I'm sorry", @@ -43,19 +45,25 @@ def load_advbench(size: int = 0) -> pd.DataFrame: - advbench_base_path = ( - garak._config.transient.package_dir - / "resources" - / "advbench" - / "harmful_behaviors.csv" - ) + try: + advbench_base_path = data_path / "advbench" / "harmful_behaviors.csv" + except GarakException: + logging.info( + "No default harmful_behaviors.csv found download or cached file will be utilized." + ) + + # this utilizes the cache_dir however should this place the file in the data_dir now? advbench_path = ( garak._config.transient.cache_dir - / "resources" + / "data" / "advbench" / "harmful_behaviors.csv" ) - if advbench_base_path.is_file() and not advbench_path.is_file(): + if ( + not advbench_path.is_file() + and advbench_base_path is not None + and advbench_base_path.is_file() + ): shutil.copy2(advbench_base_path, advbench_path) if not advbench_path.is_file(): diff --git a/garak/resources/gcg/generate_gcg.py b/garak/resources/gcg/generate_gcg.py index dcdc1f191..64b460df2 100644 --- a/garak/resources/gcg/generate_gcg.py +++ b/garak/resources/gcg/generate_gcg.py @@ -37,8 +37,7 @@ logger = getLogger(__name__) -resource_data = garak._config.transient.package_dir / "resources" -gcg_resource_data = garak._config.transient.cache_dir / "resources" / "gcg" / "data" +gcg_cache_data = garak._config.transient.cache_dir / "data" / "gcg" def run_gcg( @@ -50,7 +49,7 @@ def run_gcg( train_data: Union[str, None] = None, n_train: int = 50, n_test: int = 0, - outfile: Path = gcg_resource_data / "gcg.txt", + outfile: Path = gcg_cache_data / "gcg.txt", control_init: str = CONTROL_INIT, deterministic: bool = True, n_steps: int = 500, @@ -124,13 +123,13 @@ def run_gcg( msg = "You must specify either a target generator or a list of model names to run GCG!" logger.error(msg) raise RuntimeError(msg) - # TODO: why is the log file being placed in the resources folder? + # TODO: why is the log file being placed in the cache folder? if garak._config.transient.run_id is not None: run_id = garak._config.transient.run_id - logfile = gcg_resource_data / "logs" / f"{run_id}_{model_string}.json" + logfile = gcg_cache_data / "logs" / f"{run_id}_{model_string}.json" else: timestamp = datetime.now().strftime("%Y%m%dT%H%M%S") - logfile = gcg_resource_data / "logs" f"{timestamp}_{model_string}.json" + logfile = gcg_cache_data / "logs" f"{timestamp}_{model_string}.json" # Create logfile directory p = logfile.parent diff --git a/garak/resources/tap/tap_main.py b/garak/resources/tap/tap_main.py index 61bdda3a0..80826fbac 100644 --- a/garak/resources/tap/tap_main.py +++ b/garak/resources/tap/tap_main.py @@ -35,11 +35,7 @@ SAVE_RESULTS = True resources_tap_data_file = ( - garak._config.transient.cache_dir - / "resources" - / "tap" - / "data" - / "tap_jailbreaks.txt" + garak._config.transient.cache_dir / "data" / "tap" / "tap_jailbreaks.txt" ) diff --git a/tests/probes/test_probes.py b/tests/probes/test_probes.py index d18538477..55813c76a 100644 --- a/tests/probes/test_probes.py +++ b/tests/probes/test_probes.py @@ -20,7 +20,7 @@ BCP_LENIENT_RE = re.compile(r"[a-z]{2}([\-A-Za-z]*)") with open( - _config.transient.package_dir / "resources" / "misp_descriptions.tsv", + _config.transient.package_dir / "data" / "misp_descriptions.tsv", "r", encoding="utf-8", ) as misp_data: diff --git a/tests/test_data.py b/tests/test_data.py index 84c8f2250..69d11455e 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -57,7 +57,7 @@ def test_local_override(random_resource_filename): source = data_path / random_resource_filename assert _config.transient.package_dir in source.parents - data_root_path = _config.transient.data_dir / "resources" + data_root_path = _config.transient.data_dir / "data" data_root_path.mkdir(parents=True, exist_ok=True) with open( data_root_path / random_resource_filename, encoding="utf-8", mode="w" From bd773dca8689329596f815659bbc683ed5ddbca0 Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Mon, 16 Sep 2024 17:41:28 -0500 Subject: [PATCH 05/13] move slurprompt termscraper * move to tools path * update to overwrite default `data` file Signed-off-by: Jeffrey Martin --- {garak/resources => tools}/termscrape.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) rename {garak/resources => tools}/termscrape.py (84%) diff --git a/garak/resources/termscrape.py b/tools/termscrape.py similarity index 84% rename from garak/resources/termscrape.py rename to tools/termscrape.py index cfeb6ea63..f51218cdf 100644 --- a/garak/resources/termscrape.py +++ b/tools/termscrape.py @@ -1,7 +1,8 @@ import requests import re import json -import time + +from garak.data import path as data_path endpoint = "https://api.urbandictionary.com/v0/define" @@ -21,8 +22,8 @@ def scrape_search_results(keyphrase): yield example -with open("slurprompts.jsonl", "w", encoding="utf-8") as f: - for line in open("garak/detectors/slursreclaimedslurs.txt", "r", encoding="utf-8"): +with open(data_path / "slurprompts.jsonl", "w", encoding="utf-8") as f: + for line in open(data_path / "slursreclaimedslurs.txt", "r", encoding="utf-8"): term = line.strip() print(f"→ {term}") snippets = scrape_search_results(term) From 3405489fc37a81117991526fc2ab0255604995c3 Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Wed, 18 Sep 2024 08:23:50 -0500 Subject: [PATCH 06/13] support for python 3.10+ Path objects Signed-off-by: Jeffrey Martin --- garak/data/__init__.py | 47 ++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/garak/data/__init__.py b/garak/data/__init__.py index 4e20dab34..9a527c6bc 100644 --- a/garak/data/__init__.py +++ b/garak/data/__init__.py @@ -26,7 +26,7 @@ from garak.exception import GarakException -class LocalDataPath(pathlib.Path): +class LocalDataPath(type(pathlib.Path())): """restricted Path object usable only for existing resource files""" ORDERED_SEARCH_PATHS = [ @@ -34,27 +34,34 @@ class LocalDataPath(pathlib.Path): _config.transient.package_dir / "data", ] - def joinpath(self, *pathsegments): - - for segment in pathsegments: - prefix_removed = None - for path in self.ORDERED_SEARCH_PATHS: - if (path == self and segment != "..") or path in self.parents: - prefix_removed = self.relative_to(path) - break - if prefix_removed is None: - raise GarakException( - f"The requested resource does not refer to a valid path: {self}" - ) - for path in self.ORDERED_SEARCH_PATHS: - if segment == "..": - projected = (path / prefix_removed).parent - else: - projected = (path / prefix_removed).joinpath(segment) - if projected.exists(): - return LocalDataPath(projected) + def _eval_paths(self, segment, next_call, relative): + prefix_removed = None + for path in self.ORDERED_SEARCH_PATHS: + if (path == self and segment != relative) or path in self.parents: + prefix_removed = self.relative_to(path) + break + if prefix_removed is None: + raise GarakException( + f"The requested resource does not refer to a valid path: {self}" + ) + for path in self.ORDERED_SEARCH_PATHS: + if segment == relative: + projected = (path / prefix_removed).parent + else: + current_path = path / prefix_removed + projected = getattr(current_path, next_call)(segment) + if projected.exists(): + return LocalDataPath(projected) raise GarakException(f"The resource requested does not exist {segment}") + def _make_child(self, segment): + return self._eval_paths(segment, "_make_child", ("..",)) + + def joinpath(self, *pathsegments): + for segment in pathsegments: + projected = self._eval_paths(segment, "joinpath", "..") + return projected + path = LocalDataPath(_config.transient.data_dir / "data") From 47a2f3b11a37b8a2a3523ae719bb875079f7e7bb Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Wed, 18 Sep 2024 10:40:42 -0500 Subject: [PATCH 07/13] ensure cache `data` path exists for download Signed-off-by: Jeffrey Martin --- garak/probes/topic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/garak/probes/topic.py b/garak/probes/topic.py index c0a9ba926..5d2e49108 100644 --- a/garak/probes/topic.py +++ b/garak/probes/topic.py @@ -93,6 +93,7 @@ def __init__(self, config_root=_config): super().__init__(config_root) self.data_dir = _config.transient.cache_dir / "data" / "wn" + self.data_dir.parent.mkdir(mode=0o740, parents=True, exist_ok=True) wn.config.data_directory = self.data_dir wn.util.ProgressBar.FMT = ( From 184b5b36b0864518e0b8e962ad36f96f09c1a677 Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Wed, 18 Sep 2024 16:19:50 -0500 Subject: [PATCH 08/13] get payloads from `garak.data.path` Signed-off-by: Jeffrey Martin --- garak/data/__init__.py | 43 ++++++++++++++++--- .../payloads/keyedprod_win10.json | 0 .../{resources => data}/typology_payloads.tsv | 0 garak/payloads.py | 8 ++-- tests/test_payloads.py | 4 +- 5 files changed, 44 insertions(+), 11 deletions(-) rename garak/{resources => data}/payloads/keyedprod_win10.json (100%) rename garak/{resources => data}/typology_payloads.tsv (100%) diff --git a/garak/data/__init__.py b/garak/data/__init__.py index 9a527c6bc..14d9ac0aa 100644 --- a/garak/data/__init__.py +++ b/garak/data/__init__.py @@ -34,13 +34,20 @@ class LocalDataPath(type(pathlib.Path())): _config.transient.package_dir / "data", ] - def _eval_paths(self, segment, next_call, relative): - prefix_removed = None + def _determine_suffix(self): for path in self.ORDERED_SEARCH_PATHS: - if (path == self and segment != relative) or path in self.parents: - prefix_removed = self.relative_to(path) - break + if path == self or path in self.parents: + return self.relative_to(path) + + def _eval_paths(self, segment, next_call, relative): + if self in self.ORDERED_SEARCH_PATHS and segment == relative: + raise GarakException( + f"The requested resource does not refer to a valid path" + ) + + prefix_removed = self._determine_suffix() if prefix_removed is None: + # if LocalDataPath is instantiated using a path not in ORDERED_SEARCH_PATHS raise GarakException( f"The requested resource does not refer to a valid path: {self}" ) @@ -55,6 +62,32 @@ def _eval_paths(self, segment, next_call, relative): raise GarakException(f"The resource requested does not exist {segment}") + def _glob(self, pattern, recursive=False): + glob_method = "rglob" if recursive else "glob" + + prefix_removed = self._determine_suffix() + candidate_files = [] + for path in self.ORDERED_SEARCH_PATHS: + candidate_path = path / prefix_removed + dir_files = getattr(candidate_path, glob_method)(pattern) + candidate_files.append(dir_files) + relative_paths = [] + selected_files = [] + for files in candidate_files: + for file in files: + suffix = LocalDataPath(file)._determine_suffix() + if suffix not in relative_paths: + selected_files.append(file) + relative_paths.append(suffix) + + return selected_files + + def glob(self, pattern): + return self._glob(pattern, recursive=False) + + def rglob(self, pattern): + return self._glob(pattern, recursive=True) + def _make_child(self, segment): return self._eval_paths(segment, "_make_child", ("..",)) diff --git a/garak/resources/payloads/keyedprod_win10.json b/garak/data/payloads/keyedprod_win10.json similarity index 100% rename from garak/resources/payloads/keyedprod_win10.json rename to garak/data/payloads/keyedprod_win10.json diff --git a/garak/resources/typology_payloads.tsv b/garak/data/typology_payloads.tsv similarity index 100% rename from garak/resources/typology_payloads.tsv rename to garak/data/typology_payloads.tsv diff --git a/garak/payloads.py b/garak/payloads.py index c66eed352..3d607b749 100644 --- a/garak/payloads.py +++ b/garak/payloads.py @@ -16,6 +16,7 @@ import garak._config import garak.exception +from garak.data import path as data_path PAYLOAD_SCHEMA = { @@ -36,8 +37,7 @@ } PAYLOAD_SEARCH_DIRS = [ - garak._config.transient.data_dir / "resources" / "payloads", - garak._config.transient.package_dir / "resources" / "payloads", + data_path / "payloads", ] @@ -57,7 +57,7 @@ def load_payload( else: # iterate through search dirs for dir in PAYLOAD_SEARCH_DIRS: - path = pathlib.Path(dir) / f"{name}.json" + path = dir / f"{name}.json" if path.is_file(): return PayloadGroup(name, path) raise FileNotFoundError( @@ -155,7 +155,7 @@ def _scan_payload_dir(self, dir) -> dict: payloads, return name:path dict. optionally filter by type prefixes""" payloads_found = {} - dir = pathlib.Path(dir) + dir = dir if not dir.is_dir(): return {} diff --git a/tests/test_payloads.py b/tests/test_payloads.py index 2f8efa111..8c4cc3c13 100644 --- a/tests/test_payloads.py +++ b/tests/test_payloads.py @@ -29,7 +29,7 @@ def test_core_payloads(payload_name): def payload_typology(): types = [] with open( - garak._config.transient.package_dir / "resources" / "typology_payloads.tsv", + garak._config.transient.package_dir / "data" / "typology_payloads.tsv", "r", encoding="utf-8", ) as typology_file: @@ -63,7 +63,7 @@ def test_payloads_have_valid_tags(payload_name, payload_typology): def test_nonexistent_payload_direct_load(): - with pytest.raises(FileNotFoundError): + with pytest.raises(garak.exception.GarakException): garak.payloads.load_payload("jkasfohgi") From 2513ed9df72a567f84833cf7e5bf8e956b14bb65 Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Wed, 18 Sep 2024 16:44:13 -0500 Subject: [PATCH 09/13] defer to `garak.data` for all payload file selection Signed-off-by: Jeffrey Martin --- garak/payloads.py | 18 ++++-------- tests/test_data.py | 66 ++++++++++++++++++++++++++++++++++++++++++ tests/test_payloads.py | 2 +- 3 files changed, 72 insertions(+), 14 deletions(-) diff --git a/garak/payloads.py b/garak/payloads.py index 3d607b749..3ebf85fc6 100644 --- a/garak/payloads.py +++ b/garak/payloads.py @@ -36,9 +36,7 @@ ], } -PAYLOAD_SEARCH_DIRS = [ - data_path / "payloads", -] +PAYLOAD_DIR = data_path / "payloads" def _validate_payload(payload_json): @@ -55,11 +53,9 @@ def load_payload( if path is not None: return PayloadGroup(name, path) else: - # iterate through search dirs - for dir in PAYLOAD_SEARCH_DIRS: - path = dir / f"{name}.json" - if path.is_file(): - return PayloadGroup(name, path) + path = PAYLOAD_DIR / f"{name}.json" + if path.is_file(): + return PayloadGroup(name, path) raise FileNotFoundError( "File '%s.json' not found in payload search directories" % name ) @@ -182,11 +178,7 @@ def _scan_payload_dir(self, dir) -> dict: def _refresh_payloads(self) -> None: """Scan resources/payloads and the XDG_DATA_DIR/payloads for payload objects, and refresh self.payload_list""" - self.payload_list = {} - for payload_dir in PAYLOAD_SEARCH_DIRS[ - ::-1 - ]: # reverse order because | clobbers at top-level key - self.payload_list = self.payload_list | self._scan_payload_dir(payload_dir) + self.payload_list = self._scan_payload_dir(PAYLOAD_DIR) def search( self, types: Union[List[str], None] = None, include_children=True diff --git a/tests/test_data.py b/tests/test_data.py index 69d11455e..06b434456 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -2,9 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +import random import tempfile import os +from pathlib import Path from garak import _config from garak.exception import GarakException from garak.data import path as data_path @@ -66,3 +68,67 @@ def test_local_override(random_resource_filename): source = data_path / random_resource_filename assert _config.transient.data_dir in source.parents + + +@pytest.fixture +def random_file_tree(request) -> None: + files = [] + temp_dir = tempfile.mkdtemp(dir=LocalDataPath.ORDERED_SEARCH_PATHS[-1]) + temp_dirname = os.path.basename(temp_dir) + temp_dir = Path(temp_dir) + data_dir = LocalDataPath.ORDERED_SEARCH_PATHS[0] / temp_dirname + data_dir.mkdir() + testing_temp_dir = temp_dir / "testing" + testing_temp_dir.mkdir() + testing_data_dir = data_dir / "testing" + testing_data_dir.mkdir() + + for i in range(random.randint(1, 10)): + with tempfile.NamedTemporaryFile( + dir=testing_temp_dir, suffix=".test", mode="w", delete=False + ) as tmpfile: + tmpfile.write("file data") + files.append(os.path.basename(tmpfile.name)) + + override_files = [] + for i in range(random.randint(1, len(files))): + with open(testing_data_dir / files[i], mode="w") as over_file: + over_file.write("override data") + override_files.append(os.path.basename(over_file.name)) + + def remove_files(): + for path in LocalDataPath.ORDERED_SEARCH_PATHS: + for file in files: + rem_path = path / temp_dirname / "testing" / os.path.basename(file) + if rem_path.exists(): + rem_path.unlink() + rem_path.parent.rmdir() + rem_path.parent.parent.rmdir() + + request.addfinalizer(remove_files) + + return (temp_dirname, files, override_files) + + +def test_consolidated_glob(random_file_tree): + dirname, files, override_files = random_file_tree + glob_files = (data_path / dirname / "testing").glob("*.test") + found_override_files = [] + for file in glob_files: + if LocalDataPath.ORDERED_SEARCH_PATHS[0] in file.parents: + found_override_files.append(file) + + assert len(glob_files) == len(files) + assert len(found_override_files) == len(override_files) + + +def test_consolidated_rglob(random_file_tree): + dirname, files, override_files = random_file_tree + glob_files = (data_path / dirname).rglob("*.test") + found_override_files = [] + for file in glob_files: + if file.is_file() and LocalDataPath.ORDERED_SEARCH_PATHS[0] in file.parents: + found_override_files.append(file) + + assert len(glob_files) == len(files) + assert len(found_override_files) == len(override_files) diff --git a/tests/test_payloads.py b/tests/test_payloads.py index 8c4cc3c13..06458d79f 100644 --- a/tests/test_payloads.py +++ b/tests/test_payloads.py @@ -29,7 +29,7 @@ def test_core_payloads(payload_name): def payload_typology(): types = [] with open( - garak._config.transient.package_dir / "data" / "typology_payloads.tsv", + garak.payloads.PAYLOAD_DIR / ".." / "typology_payloads.tsv", "r", encoding="utf-8", ) as typology_file: From a10f820b61201ff5d94cf447b133f0757ecd7583 Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Fri, 20 Sep 2024 08:58:43 -0500 Subject: [PATCH 10/13] nltk_data fallback path in `data` dir Signed-off-by: Jeffrey Martin --- garak/resources/autodan/genetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/garak/resources/autodan/genetic.py b/garak/resources/autodan/genetic.py index dd788940e..eb35dd33d 100644 --- a/garak/resources/autodan/genetic.py +++ b/garak/resources/autodan/genetic.py @@ -37,7 +37,7 @@ def _nltk_data(): return default_path -_nltk_data_path = _config.transient.cache_dir / "nltk_data" +_nltk_data_path = _config.transient.cache_dir / "data" / "nltk_data" nltk.data.path.append(str(_nltk_data_path)) # TODO: Refactor into setup.py From f03d8d679da5d91ce78a3eb997f03b03473afbae Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Fri, 20 Sep 2024 15:46:45 -0500 Subject: [PATCH 11/13] initialize advbench_base_path as local variable Signed-off-by: Jeffrey Martin --- garak/resources/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/garak/resources/common.py b/garak/resources/common.py index feda53173..224ec3980 100644 --- a/garak/resources/common.py +++ b/garak/resources/common.py @@ -45,6 +45,7 @@ def load_advbench(size: int = 0) -> pd.DataFrame: + advbench_base_path = None try: advbench_base_path = data_path / "advbench" / "harmful_behaviors.csv" except GarakException: From 958ea3cdfd3806f8948fc3d1093e31cf77671e94 Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Mon, 23 Sep 2024 15:58:14 -0500 Subject: [PATCH 12/13] adjust expected error when payload file is not found Signed-off-by: Jeffrey Martin --- garak/payloads.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/garak/payloads.py b/garak/payloads.py index 3ebf85fc6..5b6e6058b 100644 --- a/garak/payloads.py +++ b/garak/payloads.py @@ -50,15 +50,9 @@ def _validate_payload(payload_json): def load_payload( name: str, path: Union[str, pathlib.Path, None] = None ) -> PayloadGroup: - if path is not None: - return PayloadGroup(name, path) - else: + if path is None: path = PAYLOAD_DIR / f"{name}.json" - if path.is_file(): - return PayloadGroup(name, path) - raise FileNotFoundError( - "File '%s.json' not found in payload search directories" % name - ) + return PayloadGroup(name, path) class PayloadGroup: @@ -214,10 +208,10 @@ def load(self, name) -> PayloadGroup: logging.error(msg, exc_info=ke) raise garak.exception.PayloadFailure(msg) from ke - except FileNotFoundError as fnfe: + except garak.exception.GarakException as ge: msg = f"Requested payload {name} not found at expected path {path}" - logging.error(msg, exc_info=fnfe) - raise garak.exception.PayloadFailure(msg) from fnfe + logging.error(msg, exc_info=ge) + raise garak.exception.PayloadFailure(msg) from ge return p From 93e8d623379cfb9c5914f4a6a9f0ec09176678a7 Mon Sep 17 00:00:00 2001 From: Jeffrey Martin Date: Tue, 24 Sep 2024 11:13:24 -0500 Subject: [PATCH 13/13] clarify log for advbench source Signed-off-by: Jeffrey Martin --- garak/resources/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/garak/resources/common.py b/garak/resources/common.py index 224ec3980..65a35fb99 100644 --- a/garak/resources/common.py +++ b/garak/resources/common.py @@ -50,7 +50,7 @@ def load_advbench(size: int = 0) -> pd.DataFrame: advbench_base_path = data_path / "advbench" / "harmful_behaviors.csv" except GarakException: logging.info( - "No default harmful_behaviors.csv found download or cached file will be utilized." + "Default 'advbench/harmful_behaviors.csv' not found, a download or cached file will be utilized." ) # this utilizes the cache_dir however should this place the file in the data_dir now?