From 2007dcb990dcedd9e4be1341c5c8e5abaf22aed3 Mon Sep 17 00:00:00 2001
From: Michael Chihlas <michael@chihlas.com>
Date: Thu, 5 Mar 2026 18:17:55 -0500
Subject: [PATCH] feat: add Anthropic prompt caching to assistant chat

Cache the static system prompt and conversation history prefix across
turns, reducing input token costs by ~80% on multi-turn conversations.
RAG context is intentionally uncached since it changes per query.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../app/services/assistant_chat_service.py    | 139 ++++++++++++++++--
 1 file changed, 127 insertions(+), 12 deletions(-)

diff --git a/backend/app/services/assistant_chat_service.py b/backend/app/services/assistant_chat_service.py
index 797a3be7..275db947 100644
--- a/backend/app/services/assistant_chat_service.py
+++ b/backend/app/services/assistant_chat_service.py
@@ -2,15 +2,20 @@
 
 Provides persistent conversation history for general IT questions
 with semantic search over the team's flow library.
+
+Uses Anthropic prompt caching to reduce cost on multi-turn conversations:
+- The static system prompt is cached (ephemeral, 5-min TTL)
+- The conversation history prefix is cached via a breakpoint on the
+  last existing message before the new user input
 """
 import logging
-from typing import Optional, Any
+from typing import Any
 from uuid import UUID
 
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from app.core.ai_provider import get_ai_provider
+from app.core.config import settings
 from app.models.assistant_chat import AssistantChat
 from app.services.rag_service import search as rag_search, build_rag_context, extract_suggested_flows
 
@@ -33,6 +38,118 @@ When answering:
 """
 
 
+async def _call_ai(
+    system_base: str,
+    rag_context: str,
+    history: list[dict[str, Any]],
+    new_message: str,
+    max_tokens: int = 4096,
+) -> tuple[str, int, int]:
+    """Send one chat turn to whichever AI provider is configured.
+
+    Anthropic (selected and holding an API key) takes the prompt-caching
+    path, which keeps base prompt, RAG context, history and new message
+    separate. Any other provider gets base prompt + RAG context as one
+    system prompt, with the new message appended as the final user turn.
+    """
+    anthropic_ready = (
+        settings.AI_PROVIDER == "anthropic" and settings.ANTHROPIC_API_KEY
+    )
+    if not anthropic_ready:
+        # Generic path (Gemini, etc.): no cache controls, one flat prompt.
+        from app.core.ai_provider import get_ai_provider
+
+        flat_system = system_base + rag_context
+        turns = history + [{"role": "user", "content": new_message}]
+        backend = get_ai_provider()
+        return await backend.generate_text(
+            system_prompt=flat_system, messages=turns, max_tokens=max_tokens
+        )
+
+    return await _call_anthropic_cached(
+        system_base, rag_context, history, new_message, max_tokens
+    )
+
+
+async def _call_anthropic_cached(
+    system_base: str,
+    rag_context: str,
+    history: list[dict[str, Any]],
+    new_message: str,
+    max_tokens: int,
+) -> tuple[str, int, int]:
+    """Call Anthropic with cache breakpoints on the base prompt and history.
+
+    Returns ``(text, input_tokens, output_tokens)``. ``text`` joins every
+    text block of the reply and is ``""`` when the reply has none.
+
+    NOTE(review): the history breakpoint's prefix includes the system
+    blocks, so a changed RAG block leaves only the base prompt cacheable;
+    ``usage.input_tokens`` also omits cached tokens. Confirm against docs.
+    """
+    import anthropic
+
+    # Only the stable base prompt gets a breakpoint; the RAG block does not.
+    system_blocks: list[dict[str, Any]] = [
+        {
+            "type": "text",
+            "text": system_base,
+            "cache_control": {"type": "ephemeral"},
+        },
+    ]
+    if rag_context:
+        system_blocks.append({"type": "text", "text": rag_context})
+
+    messages: list[dict[str, Any]] = [
+        {"role": msg["role"], "content": msg["content"]} for msg in history
+    ]
+
+    # Breakpoint on the last prior message. Only non-blank string content is
+    # wrapped; empty or non-string content is not a valid cached text block.
+    last_content = messages[-1]["content"] if messages else None
+    if isinstance(last_content, str) and last_content.strip():
+        messages[-1]["content"] = [
+            {
+                "type": "text",
+                "text": last_content,
+                "cache_control": {"type": "ephemeral"},
+            }
+        ]
+
+    # The new user message comes after the breakpoint, so it is uncached.
+    messages.append({"role": "user", "content": new_message})
+
+    # async with closes the client's HTTP resources once the call is done.
+    async with anthropic.AsyncAnthropic(
+        api_key=settings.ANTHROPIC_API_KEY,
+        timeout=settings.AI_REQUEST_TIMEOUT_SECONDS,
+    ) as client:
+        response = await client.messages.create(
+            model=settings.AI_MODEL_ANTHROPIC,
+            max_tokens=max_tokens,
+            system=system_blocks,
+            messages=messages,
+        )
+
+    # A reply may hold zero or several blocks; keep only the text ones.
+    blocks = response.content
+    text = "".join(b.text for b in blocks if getattr(b, "type", None) == "text")
+    usage = response.usage
+    input_tokens = usage.input_tokens
+    output_tokens = usage.output_tokens
+
+    # Log cache performance; a missing or None counter is treated as 0.
+    cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
+    cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0
+    if cache_read or cache_creation:
+        logger.info(
+            "Anthropic cache: read=%d creation=%d input=%d output=%d",
+            cache_read, cache_creation, input_tokens, output_tokens,
+        )
+
+    return text, input_tokens, output_tokens
+
+
 def _auto_title(message: str) -> str:
     """Generate a short title from the first user message."""
     title = message.strip()[:100]
@@ -90,22 +207,20 @@ async def send_message(
         limit=8,
     )
 
-    # Build system prompt
-    system_prompt = ASSISTANT_SYSTEM_PROMPT + build_rag_context(rag_results)
+    rag_context = build_rag_context(rag_results)
 
     # Build messages for AI
-    ai_messages = []
+    ai_messages: list[dict[str, Any]] = []
     for msg in chat.messages:
         if msg["role"] in ("user", "assistant"):
             ai_messages.append({"role": msg["role"], "content": msg["content"]})
-    ai_messages.append({"role": "user", "content": message})
 
-    # Call AI
-    provider = get_ai_provider()
-    ai_content, input_tokens, output_tokens = await provider.generate_text(
-        system_prompt=system_prompt,
-        messages=ai_messages,
-        max_tokens=4096,
+    # Call AI with prompt caching (Anthropic) or generic provider
+    ai_content, input_tokens, output_tokens = await _call_ai(
+        system_base=ASSISTANT_SYSTEM_PROMPT,
+        rag_context=rag_context,
+        history=ai_messages,
+        new_message=message,
     )
 
     # Update chat