# resolutionflow/backend/tests/test_prompt_anti_parrot.py

"""Guardrail: literal output payloads must not live in any LLM system prompt.

This test exists because the same anti-pattern bit us twice in the same
day: a worked example with literal content (Outlook + jsmith + literal
JSON; full DNS troubleshooting tree) sitting inside a `*_PROMPT` constant
caused Claude to recite that content on unrelated tickets, making the
task lane look like it was leaking previous-session data.

The fix is structural: every output example in a system prompt must use
`<placeholder>` or `<...>` syntax, never literal field values, command
names, hostnames, or usernames that the model could parrot. Format
examples that need real-looking content live in few-shot messages
(separate file, separate code path, model treats them as past behavior),
not in system prompts.

Failure messages here name the constant + line; fix by replacing the
literal payload with a placeholder schema, or by moving the example
out of the system prompt entirely.

See CLAUDE.md → Critical Lessons → "Don't put literal payloads in
system prompts" for the longer rationale.
"""
from __future__ import annotations

import importlib
import inspect
import pkgutil
import re
from typing import Iterator

import pytest

# Packages whose modules are scanned for prompt-like constants. The modules
# are imported (rather than parsed as source text) so the test sees the final
# string value of each constant, even when it is assembled from concatenation,
# .format() or f-strings.
# NOTE: the scan uses `pkgutil.iter_modules`, which lists only the direct
# children of each package — modules inside nested subpackages are not visited.
_MODULE_PACKAGES = ("app.services", "app.core")


def _iter_prompt_constants() -> Iterator[tuple[str, str, str]]:
    """Yield ``(module_name, constant_name, value)`` for every prompt-like constant.

    A constant qualifies when its name is all-uppercase, ends in `_PROMPT`,
    `_SCHEMA`, `_PROTOCOL`, `_FORMAT` or `_CONTEXT` (all carry the same
    anti-pattern risk), and its value is a `str`.

    Only the modules directly inside each package named in `_MODULE_PACKAGES`
    are imported; `pkgutil.iter_modules` does not descend into subpackages.
    NOTE(review): consider `pkgutil.walk_packages` if prompts ever live in
    nested packages.

    A module that fails to import is skipped so that unrelated breakage does
    not take the guardrail down, but every skip is logged as a warning: a
    skipped module is a module whose prompts were NOT checked.

    Raises:
        ImportError: if one of the packages in `_MODULE_PACKAGES` itself
            cannot be imported (only per-module failures are tolerated).
    """
    # Local import keeps this function independent of the file's import list.
    import logging

    log = logging.getLogger(__name__)
    suffixes = ("_PROMPT", "_SCHEMA", "_PROTOCOL", "_FORMAT", "_CONTEXT")
    for pkg_name in _MODULE_PACKAGES:
        pkg = importlib.import_module(pkg_name)
        for mod_info in pkgutil.iter_modules(pkg.__path__, prefix=f"{pkg_name}."):
            try:
                mod = importlib.import_module(mod_info.name)
            except Exception as exc:
                # Best-effort by design, but never silent: surface the gap in
                # coverage so a green run is not mistaken for a full scan.
                log.warning(
                    "anti-parrot scan skipped %s: import failed (%s: %s)",
                    mod_info.name,
                    type(exc).__name__,
                    exc,
                )
                continue
            # getmembers() returns every module attribute, including names the
            # module imported from elsewhere, so a re-exported constant is
            # reported once per module that exposes it.
            for name, value in inspect.getmembers(mod):
                if name.isupper() and name.endswith(suffixes) and isinstance(value, str):
                    yield mod_info.name, name, value


# ── The forbidden patterns ──────────────────────────────────────────────────

# Exact literal strings (a username, hostnames, service names, phrasings) that
# Claude has been seen parroting across unrelated tickets. This is not an
# exhaustive list — it records precisely what has leaked so far. Append a new
# entry whenever another one shows up in production.
_FORBIDDEN_LITERAL_TOKENS: tuple[str, ...] = (
    # leaked from an Outlook/AD example
    "jsmith",
    # leaked from an intake-form example
    "DC01",
    # leaked from a commands-array example
    "ADSync",
    # leaked from a DNS troubleshooting tree example
    "Dnscache",
    "google.com",
    # specific phrasings from a worked Outlook/WiFi example
    "Outlook keeps",
    "Teams drops",
)

# Marker-with-payload patterns. A `[QUESTIONS]\n[{...JSON with real field values...}]`
# block in a prompt is the highest-risk shape — the model treats it as a
# canonical response template. Placeholder content is allowed (anything inside
# angle brackets `<...>` is treated as a placeholder, not a literal).
#
# Restrictions on the regex (to avoid false positives where the marker name
# appears in prose like "include [QUESTIONS] markers"):
# - the opening tag must begin its line: it sits at the start of the string or
#   right after a newline, with nothing but optional spaces/tabs in front of
#   it. Indentation is tolerated so that marker blocks inside indented
#   prompt text are still inspected.
# - the opening tag must be followed by a newline and then JSON-ish content:
#   the block content must START with `[` or `{` after optional whitespace,
#   so prose between a marker name and a distant closing tag (e.g. across
#   markdown headings) is excluded.
# - the closing tag must likewise begin its own line (optional indentation).
#
# Group 1 is the marker name; group 2 is the block body.
_MARKER_BLOCK_RE = re.compile(
    r"(?:^|\n)[ \t]*\[(QUESTIONS|ACTIONS|SUGGEST_FIX|FIX_OUTCOME|PROMOTE|FORK|TREE_UPDATE|STEPS_UPDATE|INTAKE_FORM|METADATA|DELTA)\]"
    r"\s*\n"                          # forced newline before content
    r"(\s*[\[{][\s\S]*?)"             # content must start with [ or {
    r"\s*\n[ \t]*\[/\1\]"             # matching closing tag on its own line
)

# Heuristic: only flag JSON VALUES, not JSON KEYS. Keys are followed by `:`,
# values come after `: ` (object value) or are bare strings inside an array.
# The shape we're defending against is `{"text": "Is this user on a laptop?"}` —
# the value `"Is this user on a laptop?"` is a literal payload the model will
# recite. Keys like `"text"` are part of the schema and must stay literal.
#
# Matches a quoted string that has at least 3 chars, no angle brackets, and
# is followed by a JSON value-terminator (`,` `]` `}`) — i.e. NOT followed
# by `:` (which would mark it as a key).
_QUOTED_VALUE_RE = re.compile(
    r'"([^"<>][^"<>]{2,}?)"\s*(?=[,\]\}])'
)
# Substrings that, if PRESENT in the candidate value, indicate it's a
# placeholder marker rather than literal output. Be strict — broad markers
# like "?" alone would whitelist any sentence ending in a question mark,
# defeating the test's purpose.
_PLACEHOLDER_HINTS = ("...", "snake_case", "kebab-case", "<", "TODO")
# Schema enum-like values that are part of the format spec, not parrotable text.
_ALLOWED_ENUM_VALUES = frozenset({
    "text", "password", "select", "boolean", "number", "textarea", "multi_text",
    "powershell", "bash", "cmd", "python",
    "question", "diagnostic_check", "user_note", "ai_synthesis",
    "decision", "action", "solution", "procedure_step", "section_header", "procedure_end",
    "step", "warning",
})


def _block_has_literal_payload(block_body: str) -> tuple[bool, str | None]:
    """Return (True, offending_string) if the marker block looks like literal output."""
    for m in _QUOTED_VALUE_RE.finditer(block_body):
        s = m.group(1).strip()
        if not s:
            continue
        # Pure placeholder hints — accept.
        if any(h in s for h in _PLACEHOLDER_HINTS):
            continue
        # Pipe-separated enum like `text|password|select` — schema spec.
        if "|" in s:
            continue
        # Single-word enum value we explicitly allow.
        if s in _ALLOWED_ENUM_VALUES:
            continue
        # JSON ellipsis-style placeholders, ".." etc.
        if all(c in "._" for c in s):
            continue
        return True, s
    return False, None


# ── Tests ──────────────────────────────────────────────────────────────────

def test_no_known_leaked_literal_tokens_in_prompts() -> None:
    """No prompt constant may contain a string the model is known to parrot.

    Every scanned constant is checked (case-sensitive substring match) against
    each entry of `_FORBIDDEN_LITERAL_TOKENS`; all hits are reported together.
    To extend coverage after a production leak, add the leaked string to
    `_FORBIDDEN_LITERAL_TOKENS` — that tuple doubles as the audit trail.
    """
    problems = [
        f"{module_name}.{const_name} contains forbidden literal token "
        f"{token!r} — replace with a <placeholder>. See CLAUDE.md → "
        f"'Don't put literal payloads in system prompts'."
        for module_name, const_name, value in _iter_prompt_constants()
        for token in _FORBIDDEN_LITERAL_TOKENS
        if token in value
    ]
    assert not problems, "\n".join(problems)


def test_marker_blocks_in_prompts_use_placeholders_not_literal_payloads() -> None:
    """Marker blocks inside prompt constants may only carry placeholder values.

    Claude recites a block such as
    `[QUESTIONS]\\n[{"text": "Is this user on a laptop or desktop?"}]\\n[/QUESTIONS]`
    verbatim on unrelated tickets. The safe form uses angle-bracket
    placeholders instead: `[{"text": "<one short, specific question>"}]`.

    Each marker block found by `_MARKER_BLOCK_RE` contributes at most one
    failure line (its first offending value); all failures are reported
    together.
    """
    problems: list[str] = []
    for module_name, const_name, value in _iter_prompt_constants():
        for match in _MARKER_BLOCK_RE.finditer(value):
            flagged, offender = _block_has_literal_payload(match.group(2))
            if not flagged:
                continue
            marker = match.group(1)
            problems.append(
                f"{module_name}.{const_name}: [{marker}] block contains literal "
                f"payload string {offender!r}. Replace with a <placeholder>. "
                f"See CLAUDE.md → 'Don't put literal payloads in system prompts'."
            )
    assert not problems, "\n".join(problems)