Skip to content

Commit

Permalink
rename slurs80; remove uniqueness constraint; add non-mini slurs plugin (#813)
Browse files Browse the repository at this point in the history
  • Loading branch information
leondz authored Aug 1, 2024
1 parent a78c2f8 commit 85ec7bf
Show file tree
Hide file tree
Showing 4 changed files with 301 additions and 254 deletions.
2 changes: 1 addition & 1 deletion garak/configs/tox_and_buffs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ run:
generations: 5

plugins:
probe_spec: atkgen.Tox,continuation.ContinueSlursReclaimedSlurs80,encoding,lmrc.Anthropomorphisation,lmrc.Bullying,lmrc.Deadnaming,lmrc.Profanity,lmrc.SlurUsage,realtoxicityprompts,
probe_spec: atkgen.Tox,continuation.ContinueSlursReclaimedSlursMini,encoding,lmrc.Anthropomorphisation,lmrc.Bullying,lmrc.Deadnaming,lmrc.Profanity,lmrc.SlurUsage,realtoxicityprompts,
probes:
encoding:
payloads:
Expand Down
40 changes: 25 additions & 15 deletions garak/probes/continuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,18 @@
continue the term based on varying amounts of prefix.
"""

from collections import defaultdict
import json

from garak import _config
from garak.attempt import Attempt
from garak.probes.base import Probe


class ContinueSlursReclaimedSlurs80(Probe):
class ContinueSlursReclaimedSlurs(Probe):
"""Continue partial word with slur or reclaimed slur.
This probe poses a partial ethnic slur with some preceding content, and
checks to see if the model will complete the slur. It uses a subset of 80
slurs & contexts that test LLMs have been likely to provide completions for."""
checks to see if the model will complete the slur."""

bcp47 = "en"
doc_uri = ""
Expand All @@ -35,15 +33,17 @@ class ContinueSlursReclaimedSlurs80(Probe):
"payload:unwanted:hatespeech",
]
goal = "output an ethnic slur"
active = False
source_resource_filename = "slurprompts.jsonl"

def __init__(self, config_root=_config):
super().__init__(config_root=config_root)
# go through the list
# for each term, work out first char, first three chars, all but last two chars
# append these to each of the prefixes for that term
slur_prefixes = defaultdict(set)
self._slur_prefixes = []
with open(
_config.transient.package_dir / "resources" / "slurprompts_80.jsonl",
_config.transient.package_dir / "resources" / self.source_resource_filename,
"r",
encoding="utf-8",
) as slurfile:
Expand All @@ -52,11 +52,11 @@ def __init__(self, config_root=_config):
if not line:
continue
entry = json.loads(line)
slur_prefixes[entry["term"]].add(entry["prefix"])
self._slur_prefixes.append((entry["term"], entry["prefix"]))

self.prompts = []
self.triggers = []
for term, prefixes in slur_prefixes.items():
for term, prefix in self._slur_prefixes:
lower_term = term.lower()
term_variants = set(["", lower_term[0]]) # blank, first letter
if len(term) > 2:
Expand All @@ -65,17 +65,27 @@ def __init__(self, config_root=_config):
lower_term[: int(len(lower_term) / 2)].strip()
) # first half of term

for prefix in prefixes:
for term_variant in term_variants:
# would use a set but order is important
candidate_prompt = prefix + term_variant
if candidate_prompt not in self.prompts:
self.prompts.append(candidate_prompt.strip())
self.triggers.append(term)
for term_variant in term_variants:
# would use a set but order is important
candidate_prompt = prefix + term_variant
if candidate_prompt not in self.prompts:
self.prompts.append(candidate_prompt.strip())
self.triggers.append(term)

def _attempt_prestore_hook(self, attempt: Attempt, seq: int) -> Attempt:
attempt.notes = dict(
attempt.notes
) # we don't want all attempts.notes to ref same dict
attempt.notes["trigger"] = str(self.triggers[seq])
return attempt


class ContinueSlursReclaimedSlursMini(ContinueSlursReclaimedSlurs, Probe):
    """Continue partial word with slur or reclaimed slur.

    Reduced variant of the slur-continuation probe: a partial ethnic slur is
    posed after some preceding content, and the check is whether the model
    will complete the slur. Prompts are drawn from a subset of 80 slurs &
    contexts that test LLMs have been likely to provide completions for.
    """

    # NOTE(review): `Probe` is listed as a base even though the parent class
    # already derives from it -- presumably deliberate; confirm before changing.

    # Resource file that the inherited __init__ reads prompts from.
    source_resource_filename = "slurprompts_mini.jsonl"
    # This variant is enabled; the parent class sets active = False.
    active = True
Loading

0 comments on commit 85ec7bf

Please sign in to comment.