Files
resolutionflow/backend/tests/test_prompt_anti_parrot.py
Michael Chihlas 2cde6673b0 feat(pilot): [FIX_OUTCOME] system prompt instructions
Tells the AI when + how to emit the [FIX_OUTCOME] marker that Task 4's
parser consumes. Placeholder-only per the anti-parrot pattern — no
literal UUIDs, outcomes, or reasons that could leak into unrelated
sessions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 15:17:21 -04:00

185 lines
8.6 KiB
Python

"""Guardrail: literal output payloads must not live in any LLM system prompt.
This test exists because the same anti-pattern bit us twice in the same
day: a worked example with literal content (Outlook + jsmith + literal
JSON; full DNS troubleshooting tree) sitting inside a `*_PROMPT` constant
caused Claude to recite that content on unrelated tickets, making the
task lane look like it was leaking previous-session data.
The fix is structural: every output example in a system prompt must use
`<placeholder>` or `<...>` syntax, never literal field values, command
names, hostnames, or usernames that the model could parrot. Format
examples that need real-looking content live in few-shot messages
(separate file, separate code path, model treats them as past behavior),
not in system prompts.
Failure messages here name the constant + line; fix by replacing the
literal payload with a placeholder schema, or by moving the example
out of the system prompt entirely.
See CLAUDE.md → Critical Lessons → "Don't put literal payloads in
system prompts" for the longer rationale.
"""
from __future__ import annotations
import importlib
import inspect
import pkgutil
import re
from typing import Iterator
import pytest
# Modules to scan. We deliberately import the modules (not just walk source
# files) so we get the actual string values of `*_PROMPT` constants — which
# may be assembled from concat / .format() / f-strings.
_MODULE_PACKAGES = ("app.services", "app.core")
def _iter_prompt_constants() -> Iterator[tuple[str, str, str]]:
    """Yield ``(module_name, constant_name, value)`` for prompt-like constants.

    A module member qualifies when its name is all-uppercase, ends in one of
    ``_PROMPT``, ``_SCHEMA``, ``_PROTOCOL``, ``_FORMAT`` or ``_CONTEXT`` (all
    carry the same anti-pattern risk), and its value is a ``str``.

    Each package named in ``_MODULE_PACKAGES`` is imported, then every module
    that ``pkgutil.iter_modules`` reports on the package's ``__path__`` is
    imported and inspected. A module that raises on import is skipped, so
    unrelated breakage in one module does not take the whole test down.
    """
    wanted_endings = ("_PROMPT", "_SCHEMA", "_PROTOCOL", "_FORMAT", "_CONTEXT")
    for package_name in _MODULE_PACKAGES:
        package = importlib.import_module(package_name)
        dotted_prefix = f"{package_name}."
        for info in pkgutil.iter_modules(package.__path__, prefix=dotted_prefix):
            try:
                module = importlib.import_module(info.name)
            except Exception:
                # Deliberate best-effort: an unimportable module is ignored.
                continue
            for attr_name, attr_value in inspect.getmembers(module):
                is_candidate = (
                    attr_name.isupper()
                    and attr_name.endswith(wanted_endings)
                    and isinstance(attr_value, str)
                )
                if is_candidate:
                    yield info.name, attr_name, attr_value
# ── The forbidden patterns ──────────────────────────────────────────────────
# A literal username pattern that Claude has historically parroted across
# unrelated tickets. The list isn't exhaustive — it's the exact strings
# we've seen leak. Add to it if a new one shows up in production.
_FORBIDDEN_LITERAL_TOKENS: tuple[str, ...] = (
"jsmith", # leaked from an Outlook/AD example
"DC01", # leaked from an intake-form example
"ADSync", # leaked from a commands-array example
"Dnscache", # leaked from a DNS troubleshooting tree example
"google.com", # leaked from a DNS troubleshooting tree example
"Outlook keeps", "Teams drops", # specific phrasings from a worked Outlook/WiFi example
)
# Marker-with-payload patterns. A `[QUESTIONS]\n[{...JSON with real field values...}]`
# block in a prompt is the highest-risk shape — the model treats it as a
# canonical response template. We allow placeholder content (anything inside
# angle brackets `<...>` is treated as a placeholder, not a literal).
#
# Restrictions on the regex (to avoid false positives where the marker name
# appears in prose like "include [QUESTIONS] markers"):
# - opening tag must be at the very start of the string OR immediately after
#   a newline (no MULTILINE flag, and no leading indentation is accepted),
#   AND be followed by a newline + JSON-ish content
# - block content must START with `[` or `{` after optional whitespace,
#   so prose blocks (like the closing-tag-distance regex match across
#   markdown headings) are excluded
# - closing tag `[/NAME]` must directly follow a newline and repeat the
#   opening marker name (backreference `\1`)
#
# Capture groups: 1 = marker name, 2 = block body (lazy, up to the first
# matching closing tag).
_MARKER_BLOCK_RE = re.compile(
r"(?:^|\n)\[(QUESTIONS|ACTIONS|SUGGEST_FIX|FIX_OUTCOME|PROMOTE|FORK|TREE_UPDATE|STEPS_UPDATE|INTAKE_FORM|METADATA|DELTA)\]"
r"\s*\n" # forced newline before content
r"(\s*[\[{][\s\S]*?)" # content must start with [ or {
r"\s*\n\[/\1\]"
)
# Heuristic: only flag JSON VALUES, not JSON KEYS. Keys are followed by `:`,
# values come after `: ` (object value) or are bare strings inside an array.
# The shape we're defending against is `{"text": "Is this user on a laptop?"}` —
# the value `"Is this user on a laptop?"` is a literal payload the model will
# recite. Keys like `"text"` are part of the schema and must stay literal.
#
# Matches a quoted string that has at least 3 chars, no angle brackets, and
# is followed (after optional whitespace) by a JSON value-terminator
# (`,` `]` `}`) — i.e. NOT followed by `:` (which would mark it as a key).
#
# Capture group 1 is the string content without the surrounding quotes.
# The pattern does no escape handling: any `"` ends the candidate, including
# a backslash-escaped one.
_QUOTED_VALUE_RE = re.compile(
r'"([^"<>][^"<>]{2,}?)"\s*(?=[,\]\}])'
)
# Substrings that, if PRESENT in the candidate value, indicate it's a
# placeholder marker rather than literal output. Be strict — broad markers
# like "?" alone would whitelist any sentence ending in a question mark,
# defeating the test's purpose.
_PLACEHOLDER_HINTS = ("...", "snake_case", "kebab-case", "<", "TODO")
# Schema enum-like values that are part of the format spec, not parrotable text.
_ALLOWED_ENUM_VALUES = frozenset({
"text", "password", "select", "boolean", "number", "textarea", "multi_text",
"powershell", "bash", "cmd", "python",
"question", "diagnostic_check", "user_note", "ai_synthesis",
"decision", "action", "solution", "procedure_step", "section_header", "procedure_end",
"step", "warning",
})
def _block_has_literal_payload(block_body: str) -> tuple[bool, str | None]:
    """Report whether a marker block body contains a literal-looking value.

    Every string captured by ``_QUOTED_VALUE_RE`` is stripped and checked in
    order. The first one that is not exempt is returned as
    ``(True, offending_string)``; if all are exempt (or nothing matches) the
    result is ``(False, None)``.

    A candidate is exempt when it is empty after stripping, contains any of
    ``_PLACEHOLDER_HINTS``, contains a ``|`` (pipe-separated enum spec such as
    ``text|password|select``), is exactly one of ``_ALLOWED_ENUM_VALUES``, or
    consists solely of ``.`` and ``_`` characters (ellipsis-style filler).
    """
    filler_chars = {".", "_"}
    for match in _QUOTED_VALUE_RE.finditer(block_body):
        candidate = match.group(1).strip()
        exempt = (
            not candidate
            or any(hint in candidate for hint in _PLACEHOLDER_HINTS)
            or "|" in candidate
            or candidate in _ALLOWED_ENUM_VALUES
            or set(candidate) <= filler_chars
        )
        if not exempt:
            return True, candidate
    return False, None
# ── Tests ──────────────────────────────────────────────────────────────────
def test_no_known_leaked_literal_tokens_in_prompts() -> None:
    """Prompt constants must be free of strings the model has parroted before.

    One failure line is produced per (constant, token) hit, and all of them
    are reported together. To extend coverage after a production leak, add
    the leaked string to _FORBIDDEN_LITERAL_TOKENS — that list doubles as the
    audit trail.
    """
    failures = [
        f"{module_name}.{const_name} contains forbidden literal token "
        f"{token!r} — replace with a <placeholder>. See CLAUDE.md → "
        f"'Don't put literal payloads in system prompts'."
        for module_name, const_name, value in _iter_prompt_constants()
        for token in _FORBIDDEN_LITERAL_TOKENS
        if token in value
    ]
    assert not failures, "\n".join(failures)
def test_marker_blocks_in_prompts_use_placeholders_not_literal_payloads() -> None:
    """Marker blocks inside system prompts may only carry placeholder values.

    Claude recites a block such as
    `[QUESTIONS]\\n[{"text": "Is this user on a laptop or desktop?"}]\\n[/QUESTIONS]`
    verbatim on unrelated tickets. Write the example with angle-bracket
    placeholders instead, e.g. `[{"text": "<one short, specific question>"}]`.

    Each marker block found by _MARKER_BLOCK_RE contributes at most one
    failure line (its first offending string); all lines are reported together.
    """
    problems: list[str] = []
    for module_name, const_name, value in _iter_prompt_constants():
        for match in _MARKER_BLOCK_RE.finditer(value):
            marker, body = match.group(1, 2)
            flagged, offender = _block_has_literal_payload(body)
            if not flagged:
                continue
            problems.append(
                f"{module_name}.{const_name}: [{marker}] block contains literal "
                f"payload string {offender!r}. Replace with a <placeholder>. "
                f"See CLAUDE.md → 'Don't put literal payloads in system prompts'."
            )
    assert not problems, "\n".join(problems)