feat: add Anthropic prompt caching to assistant chat

Cache the static system prompt and conversation history prefix across turns, reducing input token costs by ~80% on multi-turn conversations. RAG context is intentionally uncached since it changes per query.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 18:17:55 -05:00
parent c5e1039717
commit 2007dcb990
1 changed files with 127 additions and 12 deletions
--- a/backend/app/services/assistant_chat_service.py
+++ b/backend/app/services/assistant_chat_service.py
@@ -2,15 +2,20 @@
 Provides persistent conversation history for general IT questions
 with semantic search over the team's flow library.
 Uses Anthropic prompt caching to reduce cost on multi-turn conversations:
 - The static system prompt is cached (ephemeral, 5-min TTL)
 - The conversation history prefix is cached via a breakpoint on the
  last existing message before the new user input
 """
 import logging
-from typing import Optional, Any
+from typing import Any
 from uuid import UUID
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from app.core.ai_provider import get_ai_provider
+from app.core.config import settings
 from app.models.assistant_chat import AssistantChat
 from app.services.rag_service import search as rag_search, build_rag_context, extract_suggested_flows
@@ -33,6 +38,118 @@ When answering:
 """
 async def _call_ai(
     system_base: str,
     rag_context: str,
     history: list[dict[str, Any]],
     new_message: str,
     max_tokens: int = 4096,
 ) -> tuple[str, int, int]:
     """Dispatch one chat turn to the configured AI backend.
     When the Anthropic provider is selected and an API key is set, the
     request is routed through ``_call_anthropic_cached`` so that prompt
     caching applies:
     - ``system_base`` is cached (identical on every turn)
     - ``rag_context`` is left uncached (rebuilt for each query)
     - ``history`` is cached up to its last message; only
       ``new_message`` is sent uncached
     Any other configuration goes to the generic provider with a single
     concatenated system prompt and no caching.
     Returns whatever the selected backend returns; the annotation
     declares a ``(text, input_tokens, output_tokens)`` tuple.
     """
     anthropic_enabled = (
         settings.AI_PROVIDER == "anthropic" and settings.ANTHROPIC_API_KEY
     )
     if not anthropic_enabled:
         # Generic path (Gemini, etc.); the provider factory is imported
         # only when this branch is actually taken.
         from app.core.ai_provider import get_ai_provider
         full_system_prompt = system_base + rag_context
         conversation = history + [{"role": "user", "content": new_message}]
         backend = get_ai_provider()
         reply = await backend.generate_text(
             system_prompt=full_system_prompt,
             messages=conversation,
             max_tokens=max_tokens,
         )
         return reply
     reply = await _call_anthropic_cached(
         system_base=system_base,
         rag_context=rag_context,
         history=history,
         new_message=new_message,
         max_tokens=max_tokens,
     )
     return reply
 def _mark_cache_breakpoint(message: dict[str, Any]) -> dict[str, Any]:
     """Return ``message`` with an ephemeral cache breakpoint on its text.
     Only non-empty string content is rewritten into a single structured
     text block carrying ``cache_control``. Empty strings and content that
     is not a plain string are returned unchanged, because wrapping them
     in a text block would produce an invalid request.
     """
     content = message["content"]
     if not isinstance(content, str) or not content:
         return message
     return {
         "role": message["role"],
         "content": [
             {
                 "type": "text",
                 "text": content,
                 "cache_control": {"type": "ephemeral"},
             }
         ],
     }
 async def _call_anthropic_cached(
     system_base: str,
     rag_context: str,
     history: list[dict[str, Any]],
     new_message: str,
     max_tokens: int,
 ) -> tuple[str, int, int]:
     """Call Anthropic with prompt caching on system prompt and history.
     The system prompt is sent as structured blocks so the static base
     prompt is cached independently from the per-query RAG context, and a
     second cache breakpoint is placed on the last history message so the
     whole conversation prefix can be reused on the next turn.
     Args:
         system_base: Static system prompt; sent with ``cache_control``.
         rag_context: Per-query context; appended uncached when non-empty.
         history: Prior messages, each with ``role`` and ``content`` keys.
         new_message: The new user input; appended uncached.
         max_tokens: Completion token limit passed to the API.
     Returns:
         ``(text, input_tokens, output_tokens)`` where ``text`` is the
         concatenation of all text blocks in the response (empty string if
         there are none) and the token counts come from ``response.usage``.
     """
     import anthropic
     # System prompt as structured blocks:
     # Block 1: static base prompt (cached)
     # Block 2: RAG context (changes per query, not cached)
     system_blocks: list[dict[str, Any]] = [
         {
             "type": "text",
             "text": system_base,
             "cache_control": {"type": "ephemeral"},
         },
     ]
     if rag_context:
         system_blocks.append({"type": "text", "text": rag_context})
     # Copy only role/content so stored messages are never mutated and no
     # extra keys are forwarded to the API.
     messages: list[dict[str, Any]] = [
         {"role": msg["role"], "content": msg["content"]} for msg in history
     ]
     # Place cache breakpoint on the last history message so the entire
     # conversation prefix is cached across turns
     if messages:
         messages[-1] = _mark_cache_breakpoint(messages[-1])
     # Add the new user message (uncached — it's new each turn)
     messages.append({"role": "user", "content": new_message})
     # A client is created per call; the context manager closes its HTTP
     # resources on exit instead of leaving them to garbage collection.
     async with anthropic.AsyncAnthropic(
         api_key=settings.ANTHROPIC_API_KEY,
         timeout=settings.AI_REQUEST_TIMEOUT_SECONDS,
     ) as client:
         response = await client.messages.create(
             model=settings.AI_MODEL_ANTHROPIC,
             max_tokens=max_tokens,
             system=system_blocks,
             messages=messages,
         )
     # Join every text block rather than indexing content[0], which raises
     # on an empty content list or a non-text first block.
     text = "".join(
         block.text
         for block in response.content
         if getattr(block, "type", None) == "text"
     )
     if not text:
         logger.warning(
             "Anthropic response contained no text content (stop_reason=%s)",
             getattr(response, "stop_reason", None),
         )
     usage = response.usage
     input_tokens = usage.input_tokens
     output_tokens = usage.output_tokens
     # NOTE(review): per the Anthropic usage docs, input_tokens appears to
     # exclude tokens read from / written to the cache, so the value
     # returned here may be smaller than the full prompt size on cached
     # turns — confirm this is what callers expect to record.
     cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
     cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0
     if cache_read or cache_creation:
         logger.info(
             "Anthropic cache: read=%d creation=%d input=%d output=%d",
             cache_read, cache_creation, input_tokens, output_tokens,
         )
     return text, input_tokens, output_tokens
 def _auto_title(message: str) -> str:
    """Generate a short title from the first user message."""
    title = message.strip()[:100]
@@ -90,22 +207,20 @@ async def send_message(
        limit=8,
    )
-    # Build system prompt
+    rag_context = build_rag_context(rag_results)
    system_prompt = ASSISTANT_SYSTEM_PROMPT + build_rag_context(rag_results)
    # Build messages for AI
-    ai_messages = []
+    ai_messages: list[dict[str, Any]] = []
    for msg in chat.messages:
        if msg["role"] in ("user", "assistant"):
            ai_messages.append({"role": msg["role"], "content": msg["content"]})
    ai_messages.append({"role": "user", "content": message})
-    # Call AI
+    # Call AI with prompt caching (Anthropic) or generic provider
-    provider = get_ai_provider()
+    ai_content, input_tokens, output_tokens = await _call_ai(
-    ai_content, input_tokens, output_tokens = await provider.generate_text(
+        system_base=ASSISTANT_SYSTEM_PROMPT,
-        system_prompt=system_prompt,
+        rag_context=rag_context,
-        messages=ai_messages,
+        history=ai_messages,
-        max_tokens=4096,
+        new_message=message,
    )
    # Update chat