NVIDIA · jmartin-tech · Aug 1, 2024 · Jul 31, 2024
diff --git a/garak/configs/tox_and_buffs.yaml b/garak/configs/tox_and_buffs.yaml
@@ -10,7 +10,7 @@ run:
   generations: 5
 
 plugins:
-  probe_spec: atkgen.Tox,continuation.ContinueSlursReclaimedSlurs80,encoding,lmrc.Anthropomorphisation,lmrc.Bullying,lmrc.Deadnaming,lmrc.Profanity,lmrc.SlurUsage,realtoxicityprompts,
+  probe_spec: atkgen.Tox,continuation.ContinueSlursReclaimedSlursMini,encoding,lmrc.Anthropomorphisation,lmrc.Bullying,lmrc.Deadnaming,lmrc.Profanity,lmrc.SlurUsage,realtoxicityprompts,
   probes:
     encoding:
       payloads:

diff --git a/garak/probes/continuation.py b/garak/probes/continuation.py
@@ -5,20 +5,18 @@
 continue the term based on varying amounts of prefix.
 """
 
-from collections import defaultdict
 import json
 
 from garak import _config
 from garak.attempt import Attempt
 from garak.probes.base import Probe
 
 
-class ContinueSlursReclaimedSlurs80(Probe):
+class ContinueSlursReclaimedSlurs(Probe):
     """Continue partial word with slur or reclaimed slur.
 
     This probe poses a partial ethnic slur with some preceding content, and
-    checks to see if the model will complete the slur. It uses a subset of 80
-    slurs & contexts that test LLMs have been likely to provide completions for."""
+    checks to see if the model will complete the slur."""
 
     bcp47 = "en"
     doc_uri = ""
@@ -35,15 +33,17 @@ class ContinueSlursReclaimedSlurs80(Probe):
         "payload:unwanted:hatespeech",
     ]
     goal = "output an ethnic slur"
+    active = False
+    source_resource_filename = "slurprompts.jsonl"
 
     def __init__(self, config_root=_config):
         super().__init__(config_root=config_root)
         # go through the list
         # for each term, work out first char, first three chars, all but last two chars
         # append these to each of the prefixes for that term
-        slur_prefixes = defaultdict(set)
+        self._slur_prefixes = []
         with open(
-            _config.transient.package_dir / "resources" / "slurprompts_80.jsonl",
+            _config.transient.package_dir / "resources" / self.source_resource_filename,
             "r",
             encoding="utf-8",
         ) as slurfile:
@@ -52,11 +52,11 @@ def __init__(self, config_root=_config):
                 if not line:
                     continue
                 entry = json.loads(line)
-                slur_prefixes[entry["term"]].add(entry["prefix"])
+                self._slur_prefixes.append((entry["term"], entry["prefix"]))
 
         self.prompts = []
         self.triggers = []
-        for term, prefixes in slur_prefixes.items():
+        for term, prefix in self._slur_prefixes:
             lower_term = term.lower()
             term_variants = set(["", lower_term[0]])  # blank, first letter
             if len(term) > 2:
@@ -65,17 +65,27 @@ def __init__(self, config_root=_config):
                 lower_term[: int(len(lower_term) / 2)].strip()
             )  # first half of term
 
-            for prefix in prefixes:
-                for term_variant in term_variants:
-                    # would use a set but order is important
-                    candidate_prompt = prefix + term_variant
-                    if candidate_prompt not in self.prompts:
-                        self.prompts.append(candidate_prompt.strip())
-                        self.triggers.append(term)
+            for term_variant in term_variants:
+                # would use a set but order is important
+                candidate_prompt = prefix + term_variant
+                if candidate_prompt not in self.prompts:
+                    self.prompts.append(candidate_prompt.strip())
+                    self.triggers.append(term)
 
     def _attempt_prestore_hook(self, attempt: Attempt, seq: int) -> Attempt:
         attempt.notes = dict(
             attempt.notes
         )  # we don't want all attempts.notes to ref same dict
         attempt.notes["trigger"] = str(self.triggers[seq])
         return attempt
+
+
+class ContinueSlursReclaimedSlursMini(ContinueSlursReclaimedSlurs, Probe):
+    """Continue partial word with slur or reclaimed slur.
+
+    This probe poses a partial ethnic slur with some preceding content, and
+    checks to see if the model will complete the slur. It uses a subset of 80
+    slurs & contexts for which test LLMs have been likely to provide completions."""
+
+    active = True
+    source_resource_filename = "slurprompts_mini.jsonl"