feat: add Anthropic prompt caching to assistant chat

Cache the static system prompt and conversation history prefix across
turns, reducing input token costs by ~80% on multi-turn conversations.
RAG context is intentionally uncached since it changes per query.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Michael Chihlas
2026-03-05 18:17:55 -05:00
parent c5e1039717
commit 2007dcb990

View File

@@ -2,15 +2,20 @@
Provides persistent conversation history for general IT questions
with semantic search over the team's flow library.
Uses Anthropic prompt caching to reduce cost on multi-turn conversations:
- The static system prompt is cached (ephemeral, 5-min TTL)
- The conversation history prefix is cached via a breakpoint on the
last existing message before the new user input
"""
import logging
from typing import Optional, Any
from typing import Any
from uuid import UUID
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.ai_provider import get_ai_provider
from app.core.config import settings
from app.models.assistant_chat import AssistantChat
from app.services.rag_service import search as rag_search, build_rag_context, extract_suggested_flows
@@ -33,6 +38,118 @@ When answering:
"""
async def _call_ai(
system_base: str,
rag_context: str,
history: list[dict[str, Any]],
new_message: str,
max_tokens: int = 4096,
) -> tuple[str, int, int]:
"""Call the AI with prompt caching when using Anthropic.
Caching strategy:
- System prompt base: cached (stable across all turns)
- RAG context: NOT cached (changes per query)
- Conversation history prefix: cached via breakpoint on last
existing message (stable — only new user message is uncached)
"""
if settings.AI_PROVIDER == "anthropic" and settings.ANTHROPIC_API_KEY:
return await _call_anthropic_cached(
system_base, rag_context, history, new_message, max_tokens
)
# Fallback: generic provider (Gemini, etc.)
from app.core.ai_provider import get_ai_provider
system_prompt = system_base + rag_context
messages = history + [{"role": "user", "content": new_message}]
provider = get_ai_provider()
return await provider.generate_text(
system_prompt=system_prompt,
messages=messages,
max_tokens=max_tokens,
)
async def _call_anthropic_cached(
system_base: str,
rag_context: str,
history: list[dict[str, Any]],
new_message: str,
max_tokens: int,
) -> tuple[str, int, int]:
"""Call Anthropic with prompt caching on system prompt and history.
Uses structured system blocks so the static base prompt is cached
independently from the per-query RAG context.
"""
import anthropic
client = anthropic.AsyncAnthropic(
api_key=settings.ANTHROPIC_API_KEY,
timeout=settings.AI_REQUEST_TIMEOUT_SECONDS,
)
# System prompt as structured blocks:
# Block 1: static base prompt (cached)
# Block 2: RAG context (changes per query, not cached)
system_blocks: list[dict[str, Any]] = [
{
"type": "text",
"text": system_base,
"cache_control": {"type": "ephemeral"},
},
]
if rag_context:
system_blocks.append({"type": "text", "text": rag_context})
# Build messages with cache breakpoint on conversation history
messages: list[dict[str, Any]] = []
for msg in history:
messages.append({"role": msg["role"], "content": msg["content"]})
# Place cache breakpoint on the last history message so the entire
# conversation prefix is cached across turns
if messages:
last = messages[-1]
messages[-1] = {
"role": last["role"],
"content": [
{
"type": "text",
"text": last["content"],
"cache_control": {"type": "ephemeral"},
}
],
}
# Add the new user message (uncached — it's new each turn)
messages.append({"role": "user", "content": new_message})
response = await client.messages.create(
model=settings.AI_MODEL_ANTHROPIC,
max_tokens=max_tokens,
system=system_blocks,
messages=messages,
)
text = response.content[0].text
usage = response.usage
input_tokens = usage.input_tokens
output_tokens = usage.output_tokens
# Log cache performance
cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0
if cache_read or cache_creation:
logger.info(
"Anthropic cache: read=%d creation=%d input=%d output=%d",
cache_read, cache_creation, input_tokens, output_tokens,
)
return text, input_tokens, output_tokens
def _auto_title(message: str) -> str:
"""Generate a short title from the first user message."""
title = message.strip()[:100]
@@ -90,22 +207,20 @@ async def send_message(
limit=8,
)
# Build system prompt
system_prompt = ASSISTANT_SYSTEM_PROMPT + build_rag_context(rag_results)
rag_context = build_rag_context(rag_results)
# Build messages for AI
ai_messages = []
ai_messages: list[dict[str, Any]] = []
for msg in chat.messages:
if msg["role"] in ("user", "assistant"):
ai_messages.append({"role": msg["role"], "content": msg["content"]})
ai_messages.append({"role": "user", "content": message})
# Call AI
provider = get_ai_provider()
ai_content, input_tokens, output_tokens = await provider.generate_text(
system_prompt=system_prompt,
messages=ai_messages,
max_tokens=4096,
# Call AI with prompt caching (Anthropic) or generic provider
ai_content, input_tokens, output_tokens = await _call_ai(
system_base=ASSISTANT_SYSTEM_PROMPT,
rag_context=rag_context,
history=ai_messages,
new_message=message,
)
# Update chat