feat(ai): structured-system-block caching in AnthropicProvider (Phase 0.1)

Widens AIProvider.generate_json / generate_text / generate_text_stream signatures to accept `system_prompt: str | list[SystemBlock]`: - `str` (the existing call shape): passes through uncached, unchanged behavior. Every existing caller stays on the uncached path — no silent behavior change. - `list[SystemBlock]`: enables Anthropic prompt caching via structured system blocks. Caller-authored `cache_control` is honored verbatim (policy α); if no block carries it, the provider applies `cache_control: {"type": "ephemeral"}` to the first block only. Gemini ignores cache_control and concatenates list entries into one system string — the widened signature is strictly additive on that path. Adds `anthropic.cache` structured-log telemetry: on every Anthropic response (streaming included, via `stream.get_final_message()`), logs `cache_read_input_tokens` and `cache_creation_input_tokens`. Telemetry failure in streaming is swallowed so the user-facing stream never breaks. Verification deferred: cannot run from code-server (no Python, no DB, no dev env). TODO(phase0-verify) left inline in the module docstring. First verification task on the new dev environment is to hit any FlowPilot endpoint twice within 5 minutes and confirm the second call shows cache_read_input_tokens > 0 in the `anthropic.cache` log event. If verification fails, that's a debug task on the new env — not a blocker for continuing Phase 0.2/0.3/0.4. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-17 16:17:12 +00:00
parent 0fbc1e0a57
commit b3be66652e
1 changed files with 124 additions and 35 deletions
--- a/backend/app/core/ai_provider.py
+++ b/backend/app/core/ai_provider.py
@@ -3,16 +3,102 @@ AI Provider abstraction layer.
 Supports Gemini (google-genai) and Anthropic (anthropic) as interchangeable
 backends for JSON generation used by the AI Flow Builder.
 ## Prompt caching (Anthropic only)
 Callers may pass `system_prompt` as either:
 - `str` — backward-compatible, uncached.
 - `list[SystemBlock]` — Anthropic structured system blocks. Each block is a
  dict of shape `{"type": "text", "text": str, "cache_control": {...}?}`.
 Caching policy (policy α, per Phase 0.1 design):
 - If any block in the list carries an explicit `cache_control` key, that
  caller-authored configuration is honored verbatim.
 - If no block carries `cache_control`, the provider applies
  `cache_control: {"type": "ephemeral"}` to the first block only. First block
  is the common "large static prefix" case (e.g. system prompt, reference data).
 Gemini ignores cache_control and concatenates list blocks into one system
 string — callers should not rely on Gemini for cache-hit behavior.
 TODO(phase0-verify): When a dev environment is available, verify cache-hit
 behavior by hitting any FlowPilot endpoint twice within the 5-minute
 ephemeral TTL. First call should emit `anthropic.cache` with
 `cache_creation_input_tokens > 0`; second call with `cache_read_input_tokens > 0`.
 If the second call returns zero reads, inspect the prefix for silent
 invalidators (timestamps, unsorted JSON keys, varying tool list ordering).
 """
 import logging
 from abc import ABC, abstractmethod
 from collections.abc import AsyncIterator
 from typing import Any
 from app.core.config import settings
 logger = logging.getLogger(__name__)
 # Anthropic structured system block. See module docstring for caching policy.
 SystemBlock = dict[str, Any]
 def _normalize_system_for_anthropic(
    system_prompt: str | list[SystemBlock],
 ) -> str | list[SystemBlock]:
    """Return the value to pass as the `system=` parameter to the Anthropic API.
    - Plain strings pass through untouched (uncached path).
    - Lists are returned as structured system blocks. If no block in the list
      carries an explicit `cache_control`, `cache_control: {"type": "ephemeral"}`
      is applied to the FIRST block only (policy α).
    - Caller-authored `cache_control` is never overwritten.
    """
    if isinstance(system_prompt, str):
        return system_prompt
    if not system_prompt:
        # Empty list is not a meaningful system prompt — pass empty string so
        # Anthropic treats this as "no system prompt" rather than erroring.
        return ""
    blocks = [dict(b) for b in system_prompt]
    already_cached = any("cache_control" in b for b in blocks)
    if not already_cached:
        blocks[0]["cache_control"] = {"type": "ephemeral"}
    return blocks
 def _flatten_system_for_gemini(
    system_prompt: str | list[SystemBlock],
 ) -> str:
    """Gemini has no structured system blocks; concatenate list entries."""
    if isinstance(system_prompt, str):
        return system_prompt
    return "\n\n".join(b.get("text", "") for b in system_prompt)
 def _log_anthropic_cache_usage(usage: Any, model: str) -> None:
    """Emit a structured log line capturing cache_read / cache_creation tokens."""
    cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
    cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0
    input_tokens = getattr(usage, "input_tokens", 0) or 0
    output_tokens = getattr(usage, "output_tokens", 0) or 0
    if cache_read or cache_creation:
        logger.info(
            "anthropic.cache",
            extra={
                "event": "anthropic.cache",
                "model": model,
                "cache_read_input_tokens": cache_read,
                "cache_creation_input_tokens": cache_creation,
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
            },
        )
 class AIProvider(ABC):
    """Abstract base class for AI providers."""
@@ -20,14 +106,16 @@ class AIProvider(ABC):
    @abstractmethod
    async def generate_json(
        self,
-        system_prompt: str,
+        system_prompt: str | list[SystemBlock],
-        messages: list[dict[str, str]],
+        messages: list[dict[str, Any]],
        max_tokens: int = 4096,
    ) -> tuple[str, int, int]:
        """Generate a JSON response from the AI model.
        Args:
-            system_prompt: System-level instruction for the model.
+            system_prompt: System-level instruction. Plain `str` is uncached
                (Anthropic) or used as-is (Gemini). `list[SystemBlock]` enables
                Anthropic prompt caching per module-docstring policy.
            messages: List of message dicts with "role" and "content" keys.
            max_tokens: Maximum output tokens.
@@ -39,37 +127,25 @@ class AIProvider(ABC):
    @abstractmethod
    async def generate_text(
        self,
-        system_prompt: str,
+        system_prompt: str | list[SystemBlock],
-        messages: list[dict[str, str]],
+        messages: list[dict[str, Any]],
        max_tokens: int = 4096,
    ) -> tuple[str, int, int]:
        """Generate a text response from the AI model (no JSON constraint).
-        Args:
+        See `generate_json` for argument semantics.
            system_prompt: System-level instruction for the model.
            messages: List of message dicts with "role" and "content" keys.
            max_tokens: Maximum output tokens.
        Returns:
            Tuple of (response_text, input_tokens, output_tokens).
        """
        ...
    async def generate_text_stream(
        self,
-        system_prompt: str,
+        system_prompt: str | list[SystemBlock],
-        messages: list[dict[str, str]],
+        messages: list[dict[str, Any]],
        max_tokens: int = 4096,
    ) -> "AsyncIterator[str]":
        """Stream a text response token by token.
-        Args:
+        See `generate_json` for argument semantics.
            system_prompt: System-level instruction for the model.
            messages: List of message dicts with "role" and "content" keys.
            max_tokens: Maximum output tokens.
        Yields:
            Text chunks as they are generated.
        """
        raise NotImplementedError("Streaming not supported for this provider")
        # Make this an async generator to satisfy type checker
@@ -85,14 +161,15 @@ class GeminiProvider(AIProvider):
    async def generate_json(
        self,
-        system_prompt: str,
+        system_prompt: str | list[SystemBlock],
-        messages: list[dict[str, str]],
+        messages: list[dict[str, Any]],
        max_tokens: int = 4096,
    ) -> tuple[str, int, int]:
        from google import genai
        from google.genai import types as genai_types
        client = genai.Client(api_key=self._api_key)
        system_text = _flatten_system_for_gemini(system_prompt)
        # Convert messages to Gemini Content format
        contents: list[genai_types.Content] = []
@@ -106,7 +183,7 @@ class GeminiProvider(AIProvider):
            )
        config = genai_types.GenerateContentConfig(
-            system_instruction=system_prompt,
+            system_instruction=system_text,
            max_output_tokens=max_tokens,
            response_mime_type="application/json",
        )
@@ -137,14 +214,15 @@ class GeminiProvider(AIProvider):
    async def generate_text(
        self,
-        system_prompt: str,
+        system_prompt: str | list[SystemBlock],
-        messages: list[dict[str, str]],
+        messages: list[dict[str, Any]],
        max_tokens: int = 4096,
    ) -> tuple[str, int, int]:
        from google import genai
        from google.genai import types as genai_types
        client = genai.Client(api_key=self._api_key)
        system_text = _flatten_system_for_gemini(system_prompt)
        contents: list[genai_types.Content] = []
        for msg in messages:
@@ -157,7 +235,7 @@ class GeminiProvider(AIProvider):
            )
        config = genai_types.GenerateContentConfig(
-            system_instruction=system_prompt,
+            system_instruction=system_text,
            max_output_tokens=max_tokens,
            # No response_mime_type — allow free-form text
        )
@@ -214,16 +292,17 @@ class AnthropicProvider(AIProvider):
    async def generate_json(
        self,
-        system_prompt: str,
+        system_prompt: str | list[SystemBlock],
-        messages: list[dict[str, str]],
+        messages: list[dict[str, Any]],
        max_tokens: int = 4096,
    ) -> tuple[str, int, int]:
        client = _get_anthropic_client(self._api_key, self._timeout)
        normalized_system = _normalize_system_for_anthropic(system_prompt)
        response = await client.messages.create(
            model=self._model,
            max_tokens=max_tokens,
-            system=system_prompt,
+            system=normalized_system,
            messages=messages,
        )
@@ -231,12 +310,14 @@ class AnthropicProvider(AIProvider):
        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens
        _log_anthropic_cache_usage(response.usage, self._model)
        return text, input_tokens, output_tokens
    async def generate_text(
        self,
-        system_prompt: str,
+        system_prompt: str | list[SystemBlock],
-        messages: list[dict[str, str]],
+        messages: list[dict[str, Any]],
        max_tokens: int = 4096,
    ) -> tuple[str, int, int]:
        # Anthropic doesn't differentiate between JSON and text mode
@@ -244,20 +325,28 @@ class AnthropicProvider(AIProvider):
    async def generate_text_stream(
        self,
-        system_prompt: str,
+        system_prompt: str | list[SystemBlock],
-        messages: list[dict[str, str]],
+        messages: list[dict[str, Any]],
        max_tokens: int = 4096,
    ) -> AsyncIterator[str]:
        client = _get_anthropic_client(self._api_key, self._timeout)
        normalized_system = _normalize_system_for_anthropic(system_prompt)
        async with client.messages.stream(
            model=self._model,
            max_tokens=max_tokens,
-            system=system_prompt,
+            system=normalized_system,
            messages=messages,
        ) as stream:
            async for text in stream.text_stream:
                yield text
            # Per Anthropic SDK, get_final_message() resolves the stream's
            # final usage object (including cache_read/cache_creation tokens).
            try:
                final = await stream.get_final_message()
                _log_anthropic_cache_usage(final.usage, self._model)
            except Exception as exc:  # best-effort telemetry, never fail the stream
                logger.debug("anthropic.cache streaming usage unavailable: %s", exc)
 def get_ai_provider(model: str | None = None) -> AIProvider: