From 2007dcb990dcedd9e4be1341c5c8e5abaf22aed3 Mon Sep 17 00:00:00 2001 From: Michael Chihlas Date: Thu, 5 Mar 2026 18:17:55 -0500 Subject: [PATCH] feat: add Anthropic prompt caching to assistant chat Cache the static system prompt and conversation history prefix across turns, reducing input token costs by ~80% on multi-turn conversations. RAG context is intentionally uncached since it changes per query. Co-Authored-By: Claude Opus 4.6 --- .../app/services/assistant_chat_service.py | 139 ++++++++++++++++-- 1 file changed, 127 insertions(+), 12 deletions(-) diff --git a/backend/app/services/assistant_chat_service.py b/backend/app/services/assistant_chat_service.py index 797a3be7..275db947 100644 --- a/backend/app/services/assistant_chat_service.py +++ b/backend/app/services/assistant_chat_service.py @@ -2,15 +2,20 @@ Provides persistent conversation history for general IT questions with semantic search over the team's flow library. + +Uses Anthropic prompt caching to reduce cost on multi-turn conversations: +- The static system prompt is cached (ephemeral, 5-min TTL) +- The conversation history prefix is cached via a breakpoint on the + last existing message before the new user input """ import logging -from typing import Optional, Any +from typing import Any from uuid import UUID from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from app.core.ai_provider import get_ai_provider +from app.core.config import settings from app.models.assistant_chat import AssistantChat from app.services.rag_service import search as rag_search, build_rag_context, extract_suggested_flows @@ -33,6 +38,118 @@ When answering: """ +async def _call_ai( + system_base: str, + rag_context: str, + history: list[dict[str, Any]], + new_message: str, + max_tokens: int = 4096, +) -> tuple[str, int, int]: + """Call the AI with prompt caching when using Anthropic. + + Caching strategy: + - System prompt base: cached (stable across all turns) + - RAG context: NOT cached (changes per query) + - Conversation history prefix: cached via breakpoint on last + existing message (stable — only new user message is uncached) + """ + if settings.AI_PROVIDER == "anthropic" and settings.ANTHROPIC_API_KEY: + return await _call_anthropic_cached( + system_base, rag_context, history, new_message, max_tokens + ) + + # Fallback: generic provider (Gemini, etc.) + from app.core.ai_provider import get_ai_provider + + system_prompt = system_base + rag_context + messages = history + [{"role": "user", "content": new_message}] + provider = get_ai_provider() + return await provider.generate_text( + system_prompt=system_prompt, + messages=messages, + max_tokens=max_tokens, + ) + + +async def _call_anthropic_cached( + system_base: str, + rag_context: str, + history: list[dict[str, Any]], + new_message: str, + max_tokens: int, +) -> tuple[str, int, int]: + """Call Anthropic with prompt caching on system prompt and history. + + Uses structured system blocks so the static base prompt is cached + independently from the per-query RAG context. + """ + import anthropic + + client = anthropic.AsyncAnthropic( + api_key=settings.ANTHROPIC_API_KEY, + timeout=settings.AI_REQUEST_TIMEOUT_SECONDS, + ) + + # System prompt as structured blocks: + # Block 1: static base prompt (cached) + # Block 2: RAG context (changes per query, not cached) + system_blocks: list[dict[str, Any]] = [ + { + "type": "text", + "text": system_base, + "cache_control": {"type": "ephemeral"}, + }, + ] + if rag_context: + system_blocks.append({"type": "text", "text": rag_context}) + + # Build messages with cache breakpoint on conversation history + messages: list[dict[str, Any]] = [] + for msg in history: + messages.append({"role": msg["role"], "content": msg["content"]}) + + # Place cache breakpoint on the last history message so the entire + # conversation prefix is cached across turns + if messages: + last = messages[-1] + messages[-1] = { + "role": last["role"], + "content": [ + { + "type": "text", + "text": last["content"], + "cache_control": {"type": "ephemeral"}, + } + ], + } + + # Add the new user message (uncached — it's new each turn) + messages.append({"role": "user", "content": new_message}) + + response = await client.messages.create( + model=settings.AI_MODEL_ANTHROPIC, + max_tokens=max_tokens, + system=system_blocks, + messages=messages, + ) + + text = response.content[0].text + usage = response.usage + input_tokens = usage.input_tokens + output_tokens = usage.output_tokens + + # Log cache performance + cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0 + cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0 + if cache_read or cache_creation: + logger.info( + "Anthropic cache: read=%d creation=%d input=%d output=%d", + cache_read, cache_creation, input_tokens, output_tokens, + ) + + return text, input_tokens, output_tokens + + def _auto_title(message: str) -> str: """Generate a short title from the first user message.""" title = message.strip()[:100] @@ -90,22 +207,20 @@ async def send_message( limit=8, ) - # Build system prompt - system_prompt = ASSISTANT_SYSTEM_PROMPT + build_rag_context(rag_results) + rag_context = build_rag_context(rag_results) # Build messages for AI - ai_messages = [] + ai_messages: list[dict[str, Any]] = [] for msg in chat.messages: if msg["role"] in ("user", "assistant"): ai_messages.append({"role": msg["role"], "content": msg["content"]}) - ai_messages.append({"role": "user", "content": message}) - # Call AI - provider = get_ai_provider() - ai_content, input_tokens, output_tokens = await provider.generate_text( - system_prompt=system_prompt, - messages=ai_messages, - max_tokens=4096, + # Call AI with prompt caching (Anthropic) or generic provider + ai_content, input_tokens, output_tokens = await _call_ai( + system_base=ASSISTANT_SYSTEM_PROMPT, + rag_context=rag_context, + history=ai_messages, + new_message=message, ) # Update chat