Skip to content

Commit

Permalink
use XDG paths for configuration data and caching
Browse files Browse the repository at this point in the history
Support using [XDG ver 0.8](https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html) for project data.

Specifically support:
| ENV VAR          | DEFAULT            |
|------------------|--------------------|
| $XDG_DATA_HOME   | $HOME/.local/share |
| $XDG_CONFIG_HOME | $HOME/.config      |
| $XDG_CACHE_HOME  | $HOME/.cache       |

Project name `garak` is appended to each location.

This represents the following breaking changes to project expectations:
* report_prefix passed either at the command line or as config file option
  * set filename values only
  * no longer overrides report_dir
* report_dir passed as a config file option
  * when provided as a relative path, will be prepended with `<xdg_data_home>/garak`
  * when provided as an absolute path, will be used as the output directory
* default `user/site` configuration file `garak.site.yaml` has moved
  * previously `<basedir>/garak.site.yaml`
  * updated location `<xdg_config_home>/garak/garak.site.yaml`

Signed-off-by: Jeffrey Martin <[email protected]>
  • Loading branch information
jmartin-tech committed Jul 22, 2024
1 parent 27fb55d commit 83435db
Show file tree
Hide file tree
Showing 19 changed files with 204 additions and 70 deletions.
16 changes: 15 additions & 1 deletion garak/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
import pathlib
from typing import List
import yaml
from xdg_base_dirs import (
xdg_cache_home,
xdg_config_home,
xdg_data_home,
)

DICT_CONFIG_AFTER_LOAD = False

Expand All @@ -26,6 +31,7 @@
run_params = "seed deprefix eval_threshold generations probe_tags interactive".split()
plugins_params = "model_type model_name extended_detectors".split()
reporting_params = "taxonomy report_prefix".split()
project_dir = "garak"


loaded = False
Expand Down Expand Up @@ -53,9 +59,17 @@ class TransientConfig(GarakSubConfig):
args = None # only access this when determining what was passed on CLI
run_id = None
basedir = pathlib.Path(__file__).parents[0]
config_dir = xdg_config_home() / project_dir
user_data_dir = xdg_data_home() / project_dir
cache_dir = xdg_cache_home() / project_dir
starttime = None
starttime_iso = None

# initialize the user home and cache paths if they do not exist
config_dir.mkdir(mode=0o740, parents=True, exist_ok=True)
user_data_dir.mkdir(mode=0o740, parents=True, exist_ok=True)
cache_dir.mkdir(mode=0o740, parents=True, exist_ok=True)


transient = TransientConfig()

Expand Down Expand Up @@ -151,7 +165,7 @@ def load_config(

settings_files = [str(transient.basedir / "resources" / "garak.core.yaml")]

fq_site_config_filename = str(transient.basedir / site_config_filename)
fq_site_config_filename = str(transient.config_dir / site_config_filename)
if os.path.isfile(fq_site_config_filename):
settings_files.append(fq_site_config_filename)
else:
Expand Down
1 change: 1 addition & 0 deletions garak/analyze/report_digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def compile_digest(report_path, taxonomy=_config.reporting.taxonomy):

digest_content = header_template.render(
{
# consider expecting a `Path` here, or wrapping the value in a `Path` and using `.name`
"reportfile": report_path.split(os.sep)[-1],
"garak_version": garak_version,
"start_time": start_time,
Expand Down
33 changes: 21 additions & 12 deletions garak/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@


def start_logging():
from garak import _config

logging.basicConfig(
filename="garak.log",
filename=_config.transient.config_dir / "garak.log",
level=logging.DEBUG,
format="%(asctime)s %(levelname)s %(message)s",
)
Expand All @@ -32,6 +34,7 @@ def start_run():
import os
import uuid

from pathlib import Path
from garak import _config

logging.info("started at %s", _config.transient.starttime_iso)
Expand All @@ -41,19 +44,25 @@ def start_run():
"⚠️ The current/default config is optimised for speed rather than thoroughness. Try e.g. --config full for a stronger test, or specify some probes."
)
_config.transient.run_id = str(uuid.uuid4()) # uuid1 is safe but leaks host info
# why is report prefix a condition of placing file in the report_dir?
report_path = Path(_config.reporting.report_dir)
if not report_path.is_absolute():
logging.debug("relative report dir provided")
report_path = _config.transient.user_data_dir / _config.reporting.report_dir
if not os.path.isdir(report_path):
try:
report_path.mkdir(mode=0o740, parents=True, exist_ok=True)
except PermissionError as e:
raise PermissionError(
f"Can't create logging directory {report_path}, quitting"
) from e

filename = f"garak.{_config.transient.run_id}.report.jsonl"
if not _config.reporting.report_prefix:
if not os.path.isdir(_config.reporting.report_dir):
try:
os.mkdir(_config.reporting.report_dir)
except PermissionError as e:
raise PermissionError(
f"Can't create logging directory {_config.reporting.report_dir}, quitting"
) from e
_config.transient.report_filename = f"{_config.reporting.report_dir}/garak.{_config.transient.run_id}.report.jsonl"
filename = f"garak.{_config.transient.run_id}.report.jsonl"
else:
_config.transient.report_filename = (
_config.reporting.report_prefix + ".report.jsonl"
)
filename = _config.reporting.report_prefix + ".report.jsonl"
_config.transient.report_filename = str(report_path / filename)
_config.transient.reportfile = open(
_config.transient.report_filename, "w", buffering=1, encoding="utf-8"
)
Expand Down
12 changes: 10 additions & 2 deletions garak/evaluators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import json
import logging
from pathlib import Path
from typing import Iterable

from colorama import Fore, Style
Expand Down Expand Up @@ -72,12 +73,19 @@ def evaluate(self, attempts: Iterable[garak.attempt.Attempt]) -> None:
hitlog_mode = (
"w" if _config.transient.hitlogfile is None else "a"
)
# why is report prefix a condition of placing file in the report_dir?
hitlog_path = Path(_config.transient.report_filename).parent
if not _config.reporting.report_prefix:
hitlog_filename = f"{_config.reporting.report_dir}/garak.{_config.transient.run_id}.hitlog.jsonl"
hitlog_filename = (
hitlog_path
/ f"garak.{_config.transient.run_id}.hitlog.jsonl"
) #
else:
hitlog_filename = (
_config.reporting.report_prefix + ".hitlog.jsonl"
hitlog_path
/ f"{_config.reporting.report_prefix}.hitlog.jsonl"
)

logging.info("hit log in %s", hitlog_filename)
_config.transient.hitlogfile = open(
hitlog_filename,
Expand Down
2 changes: 1 addition & 1 deletion garak/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def settings_ns_provider(self) -> argparse.Namespace:
def banner():
"""Display a random banner"""
with open(
_config.transient.basedir / "resources/banners", "r", encoding="utf-8"
_config.transient.basedir / "resources" / "banners", "r", encoding="utf-8"
) as bannerfile:
banners = bannerfile.read().lstrip().split("\n\n")
print(Fore.LIGHTRED_EX + f"\n{random.choice(banners)}\n" + Fore.RESET)
Expand Down
3 changes: 2 additions & 1 deletion garak/probes/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"""

import logging
from pathlib import Path
from typing import List

import tqdm
Expand Down Expand Up @@ -68,7 +69,7 @@ class TAPCached(Probe):

def __init__(
self,
prompts_location: str = _config.transient.basedir
prompts_location: Path = _config.transient.basedir
/ "resources"
/ "tap"
/ "data"
Expand Down
6 changes: 3 additions & 3 deletions garak/probes/visual_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class FigStep(Probe):

def _load_SafeBench(self):
safebench_data_dir = (
_config.transient.basedir / "resources" / "visual_jailbreak" / "SafeBench"
_config.transient.cache_dir / "resources" / "visual_jailbreak" / "SafeBench"
)
if not os.path.exists(safebench_data_dir):
# make the dir
Expand Down Expand Up @@ -94,7 +94,7 @@ def probe(self, generator):
self.prompts = [
{
"text": prompt["text"],
"image": str(_config.transient.basedir / prompt["image"]),
"image": str(_config.transient.cache_dir / prompt["image"]),
}
for prompt in self.prompts
]
Expand Down Expand Up @@ -125,7 +125,7 @@ def probe(self, generator):
self.prompts = [
{
"text": prompt["text"],
"image": str(_config.transient.basedir / prompt["image"]),
"image": str(_config.transient.cache_dir / prompt["image"]),
}
for prompt in self.prompts
]
Expand Down
15 changes: 9 additions & 6 deletions garak/resources/autodan/autodan.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
autodan_resource_data = (
garak._config.transient.basedir / "resources" / "autodan" / "data"
)
cached_autodan_resource_data = (
garak._config.transient.cache_dir / "resources" / "autodan" / "data"
)
autodan_parser = argparse.ArgumentParser(description="AutoDAN config")
autodan_parser.add_argument(
"--num_steps", type=int, default=100, help="Number of steps to run generation"
Expand Down Expand Up @@ -87,9 +90,9 @@ def autodan_generate(
mutation_generator_name: str = "gpt-3.5-turbo",
mutation_generator_type: str = "openai",
hierarchical: bool = False,
out_path: str = str(autodan_resource_data / "autodan_prompts.txt"),
init_prompt_path: str = str(autodan_resource_data / "autodan_init.txt"),
reference_path: str = str(autodan_resource_data / "prompt_group.pth"),
out_path: Path = cached_autodan_resource_data / "autodan_prompts.txt",
init_prompt_path: Path = autodan_resource_data / "autodan_init.txt",
reference_path: Path = autodan_resource_data / "prompt_group.pth",
low_memory: bool = False,
random_seed: int = None,
):
Expand All @@ -108,9 +111,9 @@ def autodan_generate(
mutation_generator_name (str): Name of model to use as the mutation generator
mutation_generator_type (str): Type of model to use as the mutation generator
hierarchical (bool): Whether to use hierarchical GA
out_path (str): Path to write generated AutoDAN string
init_prompt_path (str): Path to initial prompts
reference_path (str): Path to reference prompt tensors
out_path (Path): Path to write generated AutoDAN string
init_prompt_path (Path): Path to initial prompts
reference_path (Path): Path to reference prompt tensors
low_memory (bool): Whether to use low memory
random_seed (int): Random seed, if used.
Expand Down
26 changes: 23 additions & 3 deletions garak/resources/autodan/genetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import gc
import nltk.downloader
import numpy as np
import torch
import random
Expand All @@ -11,6 +12,7 @@
import nltk
from nltk.corpus import stopwords, wordnet
from collections import defaultdict, OrderedDict
from pathlib import Path
import sys
import time
from logging import getLogger
Expand All @@ -20,15 +22,33 @@

logger = getLogger(__name__)


def _nltk_data():
    """Return the directory that NLTK resources should be downloaded into.

    NLTK's own default download directory is used when it already exists.
    Otherwise the location named by the ``NLTK_DATA`` environment variable is
    used, and if that is unset (or empty) the path falls back to
    ``<cache_dir>/nltk_data`` under the project cache, which is then exported
    as ``NLTK_DATA``.

    Returns:
        Path: directory to pass as ``download_dir`` to ``nltk.download``.
    """
    import os

    from nltk.downloader import Downloader

    # `default_download_dir` is an instance method: it has to be called on a
    # Downloader instance, passing the bound/unbound method itself to Path()
    # raises TypeError.
    nltk_data_path = Path(Downloader().default_download_dir())
    if nltk_data_path.exists():
        return nltk_data_path

    # default location not found: prefer an explicit NLTK_DATA setting,
    # otherwise place the data in the user cache
    env_path = os.getenv("NLTK_DATA")
    if env_path:
        return Path(env_path)

    from garak import _config

    nltk_data_path = _config.transient.cache_dir / "nltk_data"
    # os.environ only accepts str values, a Path object raises TypeError
    # NOTE(review): nltk appears to read NLTK_DATA when it is imported, so
    # this export may not affect lookups in the current process -- confirm
    # whether nltk.data.path needs to be updated as well.
    os.environ["NLTK_DATA"] = str(nltk_data_path)
    return nltk_data_path


# TODO: Refactor into setup.py
try:
_ = stopwords.words("english")
_ = nltk.word_tokenize("This is a normal English sentence")
_ = wordnet.synsets("word")
except LookupError as e:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk_data_path = _nltk_data()
nltk.download("stopwords", download_dir=nltk_data_path)
nltk.download("punkt", download_dir=nltk_data_path)
nltk.download("wordnet", download_dir=nltk_data_path)


# TODO: Could probably clean up the inputs here by using imports.
Expand Down
12 changes: 9 additions & 3 deletions garak/resources/beast/beast_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import logging
from pathlib import Path
from typing import Union, Optional, Tuple

import torch
Expand All @@ -17,7 +18,7 @@
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's questions."
)
beast_resource_data = garak._config.transient.basedir / "resources" / "beast" / "data"
beast_resource_data = garak._config.transient.cache_dir / "resources" / "beast" / "data"


def _format_chat(generator: Generator, prompt: str):
Expand Down Expand Up @@ -296,7 +297,11 @@ def _attack(
if responses is None:
responses = ["" for _ in range(len(prompts))]
for prompt, response in tqdm(
zip(prompts, responses), total=len(prompts), leave=False, position=0, desc="BEAST attack"
zip(prompts, responses),
total=len(prompts),
leave=False,
position=0,
desc="BEAST attack",
):
best_candidate = []
if trials > 1:
Expand Down Expand Up @@ -342,7 +347,7 @@ def run_beast(
suffix_len: int = 40,
data_size: int = 20,
target: Optional[str] = "",
outfile: str = beast_resource_data / "suffixes.txt",
outfile: Path = beast_resource_data / "suffixes.txt",
stop_early: bool = False,
) -> Union[list[str], None]:
"""
Expand Down Expand Up @@ -395,6 +400,7 @@ def run_beast(
)

if suffixes and outfile:
outfile.parent.mkdir(mode=0o740, parents=True, exist_ok=True)
with open(outfile, "a") as f:
for suffix in suffixes:
f.write(f"{suffix}\n")
Expand Down
12 changes: 11 additions & 1 deletion garak/resources/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import shutil
import urllib.error
from pathlib import Path
import pandas as pd
Expand Down Expand Up @@ -42,12 +43,21 @@


def load_advbench(size: int = 0) -> pd.DataFrame:
advbench_path = (
advbench_base_path = (
garak._config.transient.basedir
/ "resources"
/ "advbench"
/ "harmful_behaviors.csv"
)
advbench_path = (
garak._config.transient.cache_dir
/ "resources"
/ "advbench"
/ "harmful_behaviors.csv"
)
if advbench_base_path.is_file() and not advbench_path.is_file():
shutil.copy2(advbench_base_path, advbench_path)

if not advbench_path.is_file():
try:
hb = "https://raw.githubusercontent.com/llm-attacks/llm-attacks/main/data/advbench/harmful_behaviors.csv"
Expand Down
Loading

0 comments on commit 83435db

Please sign in to comment.