Tells the AI when + how to emit the [FIX_OUTCOME] marker that Task 4's parser consumes. Placeholder-only per the anti-parrot pattern — no literal UUIDs, outcomes, or reasons that could leak into unrelated sessions. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
185 lines
8.6 KiB
Python
185 lines
8.6 KiB
Python
"""Guardrail: literal output payloads must not live in any LLM system prompt.
|
|
|
|
This test exists because the same anti-pattern bit us twice in the same
|
|
day: a worked example with literal content (Outlook + jsmith + literal
|
|
JSON; full DNS troubleshooting tree) sitting inside a `*_PROMPT` constant
|
|
caused Claude to recite that content on unrelated tickets, making the
|
|
task lane look like it was leaking previous-session data.
|
|
|
|
The fix is structural: every output example in a system prompt must use
|
|
`<placeholder>` or `<...>` syntax, never literal field values, command
|
|
names, hostnames, or usernames that the model could parrot. Format
|
|
examples that need real-looking content live in few-shot messages
|
|
(separate file, separate code path, model treats them as past behavior),
|
|
not in system prompts.
|
|
|
|
Failure messages here name the module + constant; fix by replacing the
|
|
literal payload with a placeholder schema, or by moving the example
|
|
out of the system prompt entirely.
|
|
|
|
See CLAUDE.md → Critical Lessons → "Don't put literal payloads in
|
|
system prompts" for the longer rationale.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import importlib
|
|
import inspect
|
|
import pkgutil
|
|
import re
|
|
from typing import Iterator
|
|
|
|
import pytest
|
|
|
|
# Modules to scan. We deliberately import the modules (not just walk source
|
|
# files) so we get the actual string values of `*_PROMPT` constants — which
|
|
# may be assembled from concat / .format() / f-strings.
|
|
_MODULE_PACKAGES = ("app.services", "app.core")
|
|
|
|
|
|
def _iter_prompt_constants() -> Iterator[tuple[str, str, str]]:
    """Yield ``(module_name, constant_name, value)`` for prompt-like constants.

    A member qualifies when its name is all upper-case, ends in one of
    `_PROMPT`, `_SCHEMA`, `_PROTOCOL`, `_FORMAT` or `_CONTEXT` (all carry the
    same anti-pattern risk), and its value is a `str`.

    Every module listed by `pkgutil.iter_modules` for each package in
    `_MODULE_PACKAGES` is imported. A module that raises on import is skipped
    on purpose, so unrelated breakage in one module does not take this
    guardrail down with it.
    """
    watched_suffixes = ("_PROMPT", "_SCHEMA", "_PROTOCOL", "_FORMAT", "_CONTEXT")
    for package_name in _MODULE_PACKAGES:
        package = importlib.import_module(package_name)
        for info in pkgutil.iter_modules(package.__path__, prefix=f"{package_name}."):
            try:
                module = importlib.import_module(info.name)
            except Exception:
                # Deliberate best-effort: a broken module is not this test's problem.
                continue
            for attr_name, attr_value in inspect.getmembers(module):
                if (
                    attr_name.isupper()
                    and attr_name.endswith(watched_suffixes)
                    and isinstance(attr_value, str)
                ):
                    yield info.name, attr_name, attr_value
|
|
|
|
|
|
# ── The forbidden patterns ──────────────────────────────────────────────────
|
|
|
|
# A literal username pattern that Claude has historically parroted across
|
|
# unrelated tickets. The list isn't exhaustive — it's the exact strings
|
|
# we've seen leak. Add to it if a new one shows up in production.
|
|
_FORBIDDEN_LITERAL_TOKENS: tuple[str, ...] = (
|
|
"jsmith", # leaked from an Outlook/AD example
|
|
"DC01", # leaked from an intake-form example
|
|
"ADSync", # leaked from a commands-array example
|
|
"Dnscache", # leaked from a DNS troubleshooting tree example
|
|
"google.com", # leaked from a DNS troubleshooting tree example
|
|
"Outlook keeps", "Teams drops", # specific phrasings from a worked Outlook/WiFi example
|
|
)
|
|
|
|
# Marker-with-payload patterns. A `[QUESTIONS]\n[{...JSON with real field values...}]`
|
|
# block in a prompt is the highest-risk shape — the model treats it as a
|
|
# canonical response template. We allow placeholder content (anything inside
|
|
# angle brackets `<...>` is treated as a placeholder, not a literal).
|
|
#
|
|
# Restrictions on the regex (to avoid false positives where the marker name
|
|
# appears in prose like "include [QUESTIONS] markers"):
|
|
# - opening tag must be at start of string OR preceded by newline/whitespace
|
|
# AND followed by newline+JSON-ish content
|
|
# - block content must START with `[` or `{` after optional whitespace,
|
|
# so prose blocks (like the closing-tag-distance regex match across
|
|
# markdown headings) are excluded
|
|
_MARKER_BLOCK_RE = re.compile(
|
|
r"(?:^|\n)\[(QUESTIONS|ACTIONS|SUGGEST_FIX|FIX_OUTCOME|PROMOTE|FORK|TREE_UPDATE|STEPS_UPDATE|INTAKE_FORM|METADATA|DELTA)\]"
|
|
r"\s*\n" # forced newline before content
|
|
r"(\s*[\[{][\s\S]*?)" # content must start with [ or {
|
|
r"\s*\n\[/\1\]"
|
|
)
|
|
|
|
# Heuristic: only flag JSON VALUES, not JSON KEYS. Keys are followed by `:`,
|
|
# values come after `: ` (object value) or are bare strings inside an array.
|
|
# The shape we're defending against is `{"text": "Is this user on a laptop?"}` —
|
|
# the value `"Is this user on a laptop?"` is a literal payload the model will
|
|
# recite. Keys like `"text"` are part of the schema and must stay literal.
|
|
#
|
|
# Matches a quoted string that has at least 3 chars, no angle brackets, and
|
|
# is followed by a JSON value-terminator (`,` `]` `}`) — i.e. NOT followed
|
|
# by `:` (which would mark it as a key).
|
|
_QUOTED_VALUE_RE = re.compile(
|
|
r'"([^"<>][^"<>]{2,}?)"\s*(?=[,\]\}])'
|
|
)
|
|
# Substrings that, if PRESENT in the candidate value, indicate it's a
|
|
# placeholder marker rather than literal output. Be strict — broad markers
|
|
# like "?" alone would whitelist any sentence ending in a question mark,
|
|
# defeating the test's purpose.
|
|
_PLACEHOLDER_HINTS = ("...", "snake_case", "kebab-case", "<", "TODO")
|
|
# Schema enum-like values that are part of the format spec, not parrotable text.
|
|
_ALLOWED_ENUM_VALUES = frozenset({
|
|
"text", "password", "select", "boolean", "number", "textarea", "multi_text",
|
|
"powershell", "bash", "cmd", "python",
|
|
"question", "diagnostic_check", "user_note", "ai_synthesis",
|
|
"decision", "action", "solution", "procedure_step", "section_header", "procedure_end",
|
|
"step", "warning",
|
|
})
|
|
|
|
|
|
def _block_has_literal_payload(block_body: str) -> tuple[bool, str | None]:
|
|
"""Return (True, offending_string) if the marker block looks like literal output."""
|
|
for m in _QUOTED_VALUE_RE.finditer(block_body):
|
|
s = m.group(1).strip()
|
|
if not s:
|
|
continue
|
|
# Pure placeholder hints — accept.
|
|
if any(h in s for h in _PLACEHOLDER_HINTS):
|
|
continue
|
|
# Pipe-separated enum like `text|password|select` — schema spec.
|
|
if "|" in s:
|
|
continue
|
|
# Single-word enum value we explicitly allow.
|
|
if s in _ALLOWED_ENUM_VALUES:
|
|
continue
|
|
# JSON ellipsis-style placeholders, ".." etc.
|
|
if all(c in "._" for c in s):
|
|
continue
|
|
return True, s
|
|
return False, None
|
|
|
|
|
|
# ── Tests ──────────────────────────────────────────────────────────────────
|
|
|
|
def test_no_known_leaked_literal_tokens_in_prompts() -> None:
    """Fail if any scanned constant contains a historically parroted string.

    `_FORBIDDEN_LITERAL_TOKENS` doubles as the audit trail: after a production
    leak, extend coverage by adding the leaked string to that tuple.
    """
    # One message per (constant, token) hit, in scan order.
    violations = [
        f"{mod}.{const} contains forbidden literal token "
        f"{leaked!r} — replace with a <placeholder>. See CLAUDE.md → "
        "'Don't put literal payloads in system prompts'."
        for mod, const, text in _iter_prompt_constants()
        for leaked in _FORBIDDEN_LITERAL_TOKENS
        if leaked in text
    ]
    assert not violations, "\n".join(violations)
|
|
|
|
|
|
def test_marker_blocks_in_prompts_use_placeholders_not_literal_payloads() -> None:
    """Fail if a marker block in a scanned constant carries a literal payload.

    A `[QUESTIONS]` block whose JSON holds a real sentence such as
    `[{"text": "Is this user on a laptop or desktop?"}]` will be recited
    verbatim by Claude on unrelated tickets. Use angle-bracket placeholders
    instead: `[{"text": "<one short, specific question>"}]`.
    """
    violations: list[str] = []
    for mod, const, text in _iter_prompt_constants():
        for block in _MARKER_BLOCK_RE.finditer(text):
            tag, payload = block.group(1), block.group(2)
            flagged, culprit = _block_has_literal_payload(payload)
            if not flagged:
                continue
            violations.append(
                f"{mod}.{const}: [{tag}] block contains literal "
                f"payload string {culprit!r}. Replace with a <placeholder>. "
                "See CLAUDE.md → 'Don't put literal payloads in system prompts'."
            )
    assert not violations, "\n".join(violations)
|