All checks were successful
Mirror to GitHub / mirror (push) Successful in 10s
The "AI parrots example content from system prompt" bug bit us twice in one day across two different prompt sites. Patching individual prompts is treating the symptom; this commit makes the rule structural. Audit + sanitize: - assistant_chat_service.ASSISTANT_SYSTEM_PROMPT — already cleaned in prior commits, but the [FORK] schema still had literal "Brief reason" / "Short name" / "One sentence" placeholders. Replaced with <angle-bracket> placeholders. Anti-parrot rule itself rewritten to describe the failure mode abstractly instead of naming "jsmith" so the rule no longer trips the guardrail (and so the model doesn't see "jsmith" as a token at all). - ai_chat_service.py — removed three concrete-example offenders: "Get-Service ADSync" command literal, the "DC01 server_name" intake form payload (in two places), and the inline interview demos using "Azure AD Sync failures" / "Exchange Online mailbox migration". Replaced with technology-neutral schema descriptions. - ai_tree_generator_service.BRANCH_DETAIL_SYSTEM_PROMPT — replaced the fully-fleshed DNS troubleshooting tree (with literal Dnscache / ipconfig / google.com / Start-Service) with a placeholder schema showing only ID-linkage shape. - kb_conversion_service.PROCEDURAL_SYSTEM_PROMPT — replaced the worked Server Manager + DC01 example payload with a placeholder schema. Guardrail (tests/test_prompt_anti_parrot.py): - Imports every module under app/services/ and app/core/ and walks every uppercase string constant ending in _PROMPT, _SCHEMA, _PROTOCOL, _FORMAT, or _CONTEXT. - test 1: known-leaked-token list (jsmith, DC01, ADSync, Dnscache, google.com, "Outlook keeps", "Teams drops") must not appear in any prompt constant. Add to the list when a new leak shows up in prod — the list IS the audit trail. - test 2: marker blocks ([QUESTIONS], [ACTIONS], [SUGGEST_FIX], etc.) must contain placeholders only. 
Distinguishes JSON keys (followed by ':', allowed) from JSON values (followed by ',' / ']' / '}', must be <placeholder>); allows pipe-separated enum types (text|password|select) and a small set of fixed enum values (question, diagnostic_check, decision, action, ...). Verified by feeding the test a known-bad block — caught it correctly. Documented the rule in CLAUDE.md → AI / FlowPilot lessons, naming the test as the enforcement point so future contributors know how to extend it (add to the known-leaked list when a new leak surfaces). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
185 lines
8.5 KiB
Python
185 lines
8.5 KiB
Python
"""Guardrail: literal output payloads must not live in any LLM system prompt.

This test exists because the same anti-pattern bit us twice in the same
day: a worked example with literal content (Outlook + jsmith + literal
JSON; full DNS troubleshooting tree) sitting inside a `*_PROMPT` constant
caused Claude to recite that content on unrelated tickets, making the
task lane look like it was leaking previous-session data.

The fix is structural: every output example in a system prompt must use
`<placeholder>` or `<...>` syntax, never literal field values, command
names, hostnames, or usernames that the model could parrot. Format
examples that need real-looking content live in few-shot messages
(separate file, separate code path, model treats them as past behavior),
not in system prompts.

Failure messages here name the module, the constant, and the offending
string; fix by replacing the literal payload with a placeholder schema,
or by moving the example out of the system prompt entirely.

See CLAUDE.md → Critical Lessons → "Don't put literal payloads in
system prompts" for the longer rationale.
"""
|
|
from __future__ import annotations

import importlib
import inspect
import pkgutil
import re
import warnings
from typing import Iterator

import pytest
|
|
|
|
# Dotted names of the packages whose modules are scanned. We deliberately
# import the modules (not just walk source files) so we get the actual string
# values of `*_PROMPT` constants — which may be assembled from concat /
# .format() / f-strings.
_MODULE_PACKAGES: tuple[str, ...] = ("app.services", "app.core")
|
|
|
|
|
|
def _iter_prompt_constants() -> Iterator[tuple[str, str, str]]:
|
|
"""Yield (module_name, constant_name, value) for every uppercase string
|
|
constant whose name ends in `_PROMPT` (or `_SCHEMA`/`_PROTOCOL`/`_FORMAT`
|
|
— same anti-pattern risk).
|
|
|
|
Skips modules that fail to import to keep the test resilient when an
|
|
individual module has unrelated breakage.
|
|
"""
|
|
suffixes = ("_PROMPT", "_SCHEMA", "_PROTOCOL", "_FORMAT", "_CONTEXT")
|
|
for pkg_name in _MODULE_PACKAGES:
|
|
pkg = importlib.import_module(pkg_name)
|
|
for mod_info in pkgutil.iter_modules(pkg.__path__, prefix=f"{pkg_name}."):
|
|
try:
|
|
mod = importlib.import_module(mod_info.name)
|
|
except Exception:
|
|
continue
|
|
for name, value in inspect.getmembers(mod):
|
|
if not name.isupper() or not name.endswith(suffixes):
|
|
continue
|
|
if not isinstance(value, str):
|
|
continue
|
|
yield mod_info.name, name, value
|
|
|
|
|
|
# ── The forbidden patterns ──────────────────────────────────────────────────
|
|
|
|
# Literal strings (not only usernames) that Claude has historically parroted
# across unrelated tickets. The list isn't exhaustive — it's the exact strings
# we've seen leak. Add to it if a new one shows up in production.
# Each entry's trailing comment records where the leak came from.
_FORBIDDEN_LITERAL_TOKENS: tuple[str, ...] = (
    "jsmith",  # leaked from an Outlook/AD example
    "DC01",  # leaked from an intake-form example
    "ADSync",  # leaked from a commands-array example
    "Dnscache",  # leaked from a DNS troubleshooting tree example
    "google.com",  # leaked from a DNS troubleshooting tree example
    "Outlook keeps", "Teams drops",  # specific phrasings from a worked Outlook/WiFi example
)
|
|
|
|
# Marker-with-payload patterns. A `[QUESTIONS]\n[{...JSON with real field values...}]`
# block in a prompt is the highest-risk shape — the model treats it as a
# canonical response template. We allow placeholder content (anything inside
# angle brackets `<...>` is treated as a placeholder, not a literal).
#
# Restrictions on the regex (to avoid false positives where the marker name
# appears in prose like "include [QUESTIONS] markers"):
#   - opening tag must be at the very start of the string OR immediately
#     after a newline (an indented tag does not match), AND must be followed
#     by a newline and then JSON-ish content
#   - block content must START with `[` or `{` after optional whitespace,
#     so prose blocks (like the closing-tag-distance regex match across
#     markdown headings) are excluded
#   - closing tag must start its own line and repeat the opening marker
#     name (backreference `\1`)
#
# Capture groups: 1 = marker name, 2 = block body (lazy, up to the first
# matching closing tag).
_MARKER_BLOCK_RE = re.compile(
    r"(?:^|\n)\[(QUESTIONS|ACTIONS|SUGGEST_FIX|PROMOTE|FORK|TREE_UPDATE|STEPS_UPDATE|INTAKE_FORM|METADATA|DELTA)\]"
    r"\s*\n"  # forced newline before content
    r"(\s*[\[{][\s\S]*?)"  # content must start with [ or {
    r"\s*\n\[/\1\]"
)
|
|
|
|
# Heuristic: only flag JSON VALUES, not JSON KEYS. Keys are followed by `:`,
|
|
# values come after `: ` (object value) or are bare strings inside an array.
|
|
# The shape we're defending against is `{"text": "Is this user on a laptop?"}` —
|
|
# the value `"Is this user on a laptop?"` is a literal payload the model will
|
|
# recite. Keys like `"text"` are part of the schema and must stay literal.
|
|
#
|
|
# Matches a quoted string that has at least 3 chars, no angle brackets, and
|
|
# is followed by a JSON value-terminator (`,` `]` `}`) — i.e. NOT followed
|
|
# by `:` (which would mark it as a key).
|
|
_QUOTED_VALUE_RE = re.compile(
|
|
r'"([^"<>][^"<>]{2,}?)"\s*(?=[,\]\}])'
|
|
)
|
|
# Substrings that, if PRESENT in the candidate value, indicate it's a
|
|
# placeholder marker rather than literal output. Be strict — broad markers
|
|
# like "?" alone would whitelist any sentence ending in a question mark,
|
|
# defeating the test's purpose.
|
|
_PLACEHOLDER_HINTS = ("...", "snake_case", "kebab-case", "<", "TODO")
|
|
# Schema enum-like values that are part of the format spec, not parrotable text.
|
|
_ALLOWED_ENUM_VALUES = frozenset({
|
|
"text", "password", "select", "boolean", "number", "textarea", "multi_text",
|
|
"powershell", "bash", "cmd", "python",
|
|
"question", "diagnostic_check", "user_note", "ai_synthesis",
|
|
"decision", "action", "solution", "procedure_step", "section_header", "procedure_end",
|
|
"step", "warning",
|
|
})
|
|
|
|
|
|
def _block_has_literal_payload(block_body: str) -> tuple[bool, str | None]:
|
|
"""Return (True, offending_string) if the marker block looks like literal output."""
|
|
for m in _QUOTED_VALUE_RE.finditer(block_body):
|
|
s = m.group(1).strip()
|
|
if not s:
|
|
continue
|
|
# Pure placeholder hints — accept.
|
|
if any(h in s for h in _PLACEHOLDER_HINTS):
|
|
continue
|
|
# Pipe-separated enum like `text|password|select` — schema spec.
|
|
if "|" in s:
|
|
continue
|
|
# Single-word enum value we explicitly allow.
|
|
if s in _ALLOWED_ENUM_VALUES:
|
|
continue
|
|
# JSON ellipsis-style placeholders, ".." etc.
|
|
if all(c in "._" for c in s):
|
|
continue
|
|
return True, s
|
|
return False, None
|
|
|
|
|
|
# ── Tests ──────────────────────────────────────────────────────────────────
|
|
|
|
def test_no_known_leaked_literal_tokens_in_prompts() -> None:
    """Constants must not contain strings the model has historically parroted.

    Adding a new entry to _FORBIDDEN_LITERAL_TOKENS after a production leak is
    the right way to extend coverage — keep this list as the audit trail.

    Matching is a plain, case-sensitive substring test against each constant's
    value. The test also fails when no prompt constants are discovered at all,
    because a guardrail that scans nothing would otherwise pass vacuously.
    """
    scanned = 0
    failures: list[str] = []
    for module_name, const_name, value in _iter_prompt_constants():
        scanned += 1
        failures.extend(
            f"{module_name}.{const_name} contains forbidden literal token "
            f"{token!r} — replace with a <placeholder>. See CLAUDE.md → "
            f"'Don't put literal payloads in system prompts'."
            for token in _FORBIDDEN_LITERAL_TOKENS
            if token in value
        )
    assert scanned, (
        "no prompt constants were discovered — the guardrail scanned nothing; "
        "check the scanned packages and module import errors"
    )
    assert not failures, "\n".join(failures)
|
|
|
|
|
|
def test_marker_blocks_in_prompts_use_placeholders_not_literal_payloads() -> None:
    """Every marker block in a system prompt must contain placeholders only.

    A block like `[QUESTIONS]\\n[{"text": "Is this user on a laptop or desktop?"}]\\n[/QUESTIONS]`
    will be recited verbatim by Claude on unrelated tickets. Use angle-bracket
    placeholders instead: `[{"text": "<one short, specific question>"}]`.

    Only the first offending string of each block is reported. The test also
    fails when no prompt constants are discovered at all, because a guardrail
    that scans nothing would otherwise pass vacuously. (Constants without any
    marker block are fine — only an empty scan is rejected.)
    """
    scanned = 0
    failures: list[str] = []
    for module_name, const_name, value in _iter_prompt_constants():
        scanned += 1
        for m in _MARKER_BLOCK_RE.finditer(value):
            marker = m.group(1)
            body = m.group(2)
            has_literal, offender = _block_has_literal_payload(body)
            if has_literal:
                failures.append(
                    f"{module_name}.{const_name}: [{marker}] block contains literal "
                    f"payload string {offender!r}. Replace with a <placeholder>. "
                    f"See CLAUDE.md → 'Don't put literal payloads in system prompts'."
                )
    assert scanned, (
        "no prompt constants were discovered — the guardrail scanned nothing; "
        "check the scanned packages and module import errors"
    )
    assert not failures, "\n".join(failures)
|