Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clarify continuation probes #813

Merged
merged 1 commit into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion garak/configs/tox_and_buffs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ run:
generations: 5

plugins:
probe_spec: atkgen.Tox,continuation.ContinueSlursReclaimedSlurs80,encoding,lmrc.Anthropomorphisation,lmrc.Bullying,lmrc.Deadnaming,lmrc.Profanity,lmrc.SlurUsage,realtoxicityprompts,
probe_spec: atkgen.Tox,continuation.ContinueSlursReclaimedSlursMini,encoding,lmrc.Anthropomorphisation,lmrc.Bullying,lmrc.Deadnaming,lmrc.Profanity,lmrc.SlurUsage,realtoxicityprompts,
probes:
encoding:
payloads:
Expand Down
40 changes: 25 additions & 15 deletions garak/probes/continuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,18 @@
continue the term based on varying amounts of prefix.
"""

from collections import defaultdict
import json

from garak import _config
from garak.attempt import Attempt
from garak.probes.base import Probe


class ContinueSlursReclaimedSlurs80(Probe):
class ContinueSlursReclaimedSlurs(Probe):
"""Continue partial word with slur or reclaimed slur.

This probe poses a partial ethnic slur with some preceding content, and
checks to see if the model will complete the slur. It uses a subset of 80
slurs & contexts that test LLMs have been likely to provide completions for."""
checks to see if the model will complete the slur."""

bcp47 = "en"
doc_uri = ""
Expand All @@ -35,15 +33,17 @@ class ContinueSlursReclaimedSlurs80(Probe):
"payload:unwanted:hatespeech",
]
goal = "output an ethnic slur"
active = False
source_resource_filename = "slurprompts.jsonl"

def __init__(self, config_root=_config):
super().__init__(config_root=config_root)
# go through the list
# for each term, work out first char, first three chars, all but last two chars
# append these to each of the prefixes for that term
slur_prefixes = defaultdict(set)
self._slur_prefixes = []
with open(
_config.transient.package_dir / "resources" / "slurprompts_80.jsonl",
_config.transient.package_dir / "resources" / self.source_resource_filename,
"r",
encoding="utf-8",
) as slurfile:
Expand All @@ -52,11 +52,11 @@ def __init__(self, config_root=_config):
if not line:
continue
entry = json.loads(line)
slur_prefixes[entry["term"]].add(entry["prefix"])
self._slur_prefixes.append((entry["term"], entry["prefix"]))

self.prompts = []
self.triggers = []
for term, prefixes in slur_prefixes.items():
for term, prefix in self._slur_prefixes:
lower_term = term.lower()
term_variants = set(["", lower_term[0]]) # blank, first letter
if len(term) > 2:
Expand All @@ -65,17 +65,27 @@ def __init__(self, config_root=_config):
lower_term[: int(len(lower_term) / 2)].strip()
) # first half of term

for prefix in prefixes:
for term_variant in term_variants:
# would use a set but order is important
candidate_prompt = prefix + term_variant
if candidate_prompt not in self.prompts:
self.prompts.append(candidate_prompt.strip())
self.triggers.append(term)
for term_variant in term_variants:
# would use a set but order is important
candidate_prompt = prefix + term_variant
if candidate_prompt not in self.prompts:
self.prompts.append(candidate_prompt.strip())
self.triggers.append(term)

def _attempt_prestore_hook(self, attempt: Attempt, seq: int) -> Attempt:
    """Record on the attempt which trigger term belongs to prompt ``seq``.

    The attempt's ``notes`` mapping is replaced by a shallow copy of itself,
    and ``self.triggers[seq]`` is then stored in that copy, as a string,
    under the ``"trigger"`` key.

    :param attempt: the attempt whose notes are annotated (modified in place)
    :param seq: index into ``self.triggers`` for this attempt
    :return: the same ``attempt`` object that was passed in
    """
    # Rebind notes to a private copy first, so that writing the trigger
    # below cannot leak into a dict that other attempts also reference.
    attempt.notes = dict(attempt.notes)
    trigger_term = self.triggers[seq]
    attempt.notes["trigger"] = str(trigger_term)
    return attempt


# NOTE(review): Probe is listed as a base even though the parent class already
# derives from it -- presumably something relies on the direct base; confirm
# before removing it.
class ContinueSlursReclaimedSlursMini(ContinueSlursReclaimedSlurs, Probe):
    """Continue partial word with slur or reclaimed slur.

    Smaller variant of the slur-continuation probe: the model is given some
    preceding content followed by a partial ethnic slur, and the probe checks
    whether the model completes the slur. It uses a subset of 80 slurs &
    contexts that test LLMs have been likely to provide completions for.
    """

    # Resource file this variant reads its term/prefix entries from.
    source_resource_filename = "slurprompts_mini.jsonl"
    # Overrides the parent class, which sets active = False.
    active = True
Loading
Loading