"""Guardrail: literal output payloads must not live in any LLM system prompt. This test exists because the same anti-pattern bit us twice in the same day: a worked example with literal content (Outlook + jsmith + literal JSON; full DNS troubleshooting tree) sitting inside a `*_PROMPT` constant caused Claude to recite that content on unrelated tickets, making the task lane look like it was leaking previous-session data. The fix is structural: every output example in a system prompt must use `` or `<...>` syntax, never literal field values, command names, hostnames, or usernames that the model could parrot. Format examples that need real-looking content live in few-shot messages (separate file, separate code path, model treats them as past behavior), not in system prompts. Failure messages here name the constant + line; fix by replacing the literal payload with a placeholder schema, or by moving the example out of the system prompt entirely. See CLAUDE.md → Critical Lessons → "Don't put literal payloads in system prompts" for the longer rationale. """ from __future__ import annotations import importlib import inspect import pkgutil import re from typing import Iterator import pytest # Modules to scan. We deliberately import the modules (not just walk source # files) so we get the actual string values of `*_PROMPT` constants — which # may be assembled from concat / .format() / f-strings. _MODULE_PACKAGES = ("app.services", "app.core") def _iter_prompt_constants() -> Iterator[tuple[str, str, str]]: """Yield (module_name, constant_name, value) for every uppercase string constant whose name ends in `_PROMPT` (or `_SCHEMA`/`_PROTOCOL`/`_FORMAT` — same anti-pattern risk). Skips modules that fail to import to keep the test resilient when an individual module has unrelated breakage. 
""" suffixes = ("_PROMPT", "_SCHEMA", "_PROTOCOL", "_FORMAT", "_CONTEXT") for pkg_name in _MODULE_PACKAGES: pkg = importlib.import_module(pkg_name) for mod_info in pkgutil.iter_modules(pkg.__path__, prefix=f"{pkg_name}."): try: mod = importlib.import_module(mod_info.name) except Exception: continue for name, value in inspect.getmembers(mod): if not name.isupper() or not name.endswith(suffixes): continue if not isinstance(value, str): continue yield mod_info.name, name, value # ── The forbidden patterns ────────────────────────────────────────────────── # A literal username pattern that Claude has historically parroted across # unrelated tickets. The list isn't exhaustive — it's the exact strings # we've seen leak. Add to it if a new one shows up in production. _FORBIDDEN_LITERAL_TOKENS: tuple[str, ...] = ( "jsmith", # leaked from an Outlook/AD example "DC01", # leaked from an intake-form example "ADSync", # leaked from a commands-array example "Dnscache", # leaked from a DNS troubleshooting tree example "google.com", # leaked from a DNS troubleshooting tree example "Outlook keeps", "Teams drops", # specific phrasings from a worked Outlook/WiFi example ) # Marker-with-payload patterns. A `[QUESTIONS]\n[{...JSON with real field values...}]` # block in a prompt is the highest-risk shape — the model treats it as a # canonical response template. We allow placeholder content (anything inside # angle brackets `<...>` is treated as a placeholder, not a literal). 
#
# Constraints baked into the pattern (they keep prose mentions of a marker,
# such as "include [QUESTIONS] markers", from matching):
#   - the opening tag must sit at the very start of the string or directly
#     after a newline, and a newline must separate it from the content
#   - the content must begin with `[` or `{` (after optional whitespace), so
#     prose that merely sits between an opening and a closing tag (e.g.
#     across markdown headings) is not treated as a payload block
#   - the closing tag must repeat the same marker name and start its own line
_MARKER_NAMES = (
    "QUESTIONS",
    "ACTIONS",
    "SUGGEST_FIX",
    "FIX_OUTCOME",
    "PROMOTE",
    "FORK",
    "TREE_UPDATE",
    "STEPS_UPDATE",
    "INTAKE_FORM",
    "METADATA",
    "DELTA",
)
_MARKER_BLOCK_RE = re.compile(
    r"(?:^|\n)\[(" + "|".join(_MARKER_NAMES) + r")\]"  # opening tag, name captured
    r"\s*\n"  # forced newline before content
    r"(\s*[\[{][\s\S]*?)"  # content must start with [ or {
    r"\s*\n\[/\1\]"  # closing tag with the same marker name
)

# Heuristic: flag JSON VALUES only, never JSON KEYS. A key is followed by `:`;
# a value is followed by `,`, `]` or `}`. The shape being defended against is
# `{"text": "Is this user on a laptop?"}` — the value
# `"Is this user on a laptop?"` is a literal payload the model will recite,
# while the key `"text"` belongs to the schema and has to stay literal.
#
# The pattern matches a double-quoted string of at least 3 characters that
# contains no angle brackets and is followed (after optional whitespace) by a
# JSON value terminator — which is exactly what rules out keys.
_QUOTED_VALUE_RE = re.compile(r'"([^"<>][^"<>]{2,}?)"\s*(?=[,\]\}])')

# Substrings whose PRESENCE in a candidate value marks it as a placeholder
# rather than literal output. Keep this strict — a broad marker such as "?"
# on its own would whitelist every sentence ending in a question mark and
# defeat the purpose of the test.
_PLACEHOLDER_HINTS = (
    "...",
    "snake_case",
    "kebab-case",
    "<",
    "TODO",
)

# Schema enum-like values that are part of the format spec, not parrotable text.
_ALLOWED_ENUM_VALUES = frozenset({
    "text", "password", "select", "boolean", "number", "textarea", "multi_text",
    "powershell", "bash", "cmd", "python",
    "question", "diagnostic_check", "user_note", "ai_synthesis", "decision",
    "action", "solution", "procedure_step", "section_header", "procedure_end",
    "step", "warning",
})


def _block_has_literal_payload(block_body: str) -> tuple[bool, str | None]:
    """Report whether a marker block body contains a literal JSON value.

    Every quoted JSON value found by ``_QUOTED_VALUE_RE`` is inspected in
    order; the first one that is not recognisably a placeholder or a schema
    enum is reported.

    Args:
        block_body: the text between an opening and a closing marker tag.

    Returns:
        ``(True, offending_string)`` for the first literal value found,
        otherwise ``(False, None)``.
    """
    for m in _QUOTED_VALUE_RE.finditer(block_body):
        s = m.group(1).strip()
        if not s:
            # The value was whitespace only — nothing the model could recite.
            continue
        # Pure placeholder hints — accept.
        if any(h in s for h in _PLACEHOLDER_HINTS):
            continue
        # Pipe-separated enum like `text|password|select` — schema spec.
        if "|" in s:
            continue
        # Single-word enum value we explicitly allow.
        if s in _ALLOWED_ENUM_VALUES:
            continue
        # Filler made only of dots/underscores (e.g. "___") — placeholder.
        if all(c in "._" for c in s):
            continue
        return True, s
    return False, None


# ── Tests ──────────────────────────────────────────────────────────────────


def test_no_known_leaked_literal_tokens_in_prompts() -> None:
    """Constants must not contain strings the model has historically parroted.

    Adding a new entry to _FORBIDDEN_LITERAL_TOKENS after a production leak
    is the right way to extend coverage — keep this list as the audit trail.
    """
    failures: list[str] = []
    for module_name, const_name, value in _iter_prompt_constants():
        for token in _FORBIDDEN_LITERAL_TOKENS:
            if token in value:
                failures.append(
                    f"{module_name}.{const_name} contains forbidden literal token "
                    f"{token!r} — replace with a <placeholder>. See CLAUDE.md → "
                    f"'Don't put literal payloads in system prompts'."
                )
    # One assertion at the end so a single run reports every offender.
    assert not failures, "\n".join(failures)


def test_marker_blocks_in_prompts_use_placeholders_not_literal_payloads() -> None:
    """Every marker block in a system prompt must contain placeholders only.

    A block like `[QUESTIONS]\\n[{"text": "Is this user on a laptop or desktop?"}]\\n[/QUESTIONS]`
    will be recited verbatim by Claude on unrelated tickets.
    Use angle-bracket placeholders instead: `[{"text": "<question text>"}]`.
    """
    failures: list[str] = []
    for module_name, const_name, value in _iter_prompt_constants():
        for m in _MARKER_BLOCK_RE.finditer(value):
            marker = m.group(1)  # marker name, e.g. QUESTIONS
            body = m.group(2)  # text between the opening and closing tags
            has_literal, offender = _block_has_literal_payload(body)
            if has_literal:
                failures.append(
                    f"{module_name}.{const_name}: [{marker}] block contains literal "
                    f"payload string {offender!r}. Replace with a <placeholder>. "
                    f"See CLAUDE.md → 'Don't put literal payloads in system prompts'."
                )
    # One assertion at the end so a single run reports every offender.
    assert not failures, "\n".join(failures)