feat: add Anthropic prompt caching to assistant chat
Cache the static system prompt and conversation history prefix across turns, reducing input token costs by ~80% on multi-turn conversations. RAG context is intentionally uncached since it changes per query.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,15 +2,20 @@
|
||||
|
||||
Provides persistent conversation history for general IT questions
|
||||
with semantic search over the team's flow library.
|
||||
|
||||
Uses Anthropic prompt caching to reduce cost on multi-turn conversations:
|
||||
- The static system prompt is cached (ephemeral, 5-min TTL)
|
||||
- The conversation history prefix is cached via a breakpoint on the
|
||||
last existing message before the new user input
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional, Any
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.ai_provider import get_ai_provider
|
||||
from app.core.config import settings
|
||||
from app.models.assistant_chat import AssistantChat
|
||||
from app.services.rag_service import search as rag_search, build_rag_context, extract_suggested_flows
|
||||
|
||||
@@ -33,6 +38,118 @@ When answering:
|
||||
"""
|
||||
|
||||
|
||||
async def _call_ai(
    system_base: str,
    rag_context: str,
    history: list[dict[str, Any]],
    new_message: str,
    max_tokens: int = 4096,
) -> tuple[str, int, int]:
    """Send one chat turn to the configured AI backend.

    When the configured provider is Anthropic and an API key is set, the
    request goes through ``_call_anthropic_cached`` so that prompt caching
    is applied. Otherwise the generic provider is used, which receives a
    single system prompt string (base prompt followed by RAG context) and
    the history with the new user message appended.

    Args:
        system_base: Static part of the system prompt.
        rag_context: Per-query retrieval context appended to the system prompt.
        history: Prior conversation messages as ``{"role", "content"}`` dicts.
        new_message: The new user message for this turn.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The backend's result tuple: response text, input tokens, output tokens.
    """
    anthropic_enabled = (
        settings.AI_PROVIDER == "anthropic" and settings.ANTHROPIC_API_KEY
    )
    if not anthropic_enabled:
        # Generic path (Gemini, etc.): no structured blocks, no caching.
        from app.core.ai_provider import get_ai_provider

        combined_prompt = system_base + rag_context
        conversation = history + [{"role": "user", "content": new_message}]
        backend = get_ai_provider()
        return await backend.generate_text(
            system_prompt=combined_prompt,
            messages=conversation,
            max_tokens=max_tokens,
        )

    return await _call_anthropic_cached(
        system_base=system_base,
        rag_context=rag_context,
        history=history,
        new_message=new_message,
        max_tokens=max_tokens,
    )
|
||||
|
||||
|
||||
async def _call_anthropic_cached(
    system_base: str,
    rag_context: str,
    history: list[dict[str, Any]],
    new_message: str,
    max_tokens: int,
) -> tuple[str, int, int]:
    """Call Anthropic with prompt caching on system prompt and history.

    The system prompt is sent as structured blocks so the static base
    prompt carries its own cache breakpoint, separate from the per-query
    RAG context. A second breakpoint is placed on the last history message
    so the conversation prefix can be reused on the next turn.

    Args:
        system_base: Static part of the system prompt (cached).
        rag_context: Per-query retrieval context (sent uncached; omitted
            when empty).
        history: Prior conversation messages as ``{"role", "content"}`` dicts.
        new_message: The new user message for this turn.
        max_tokens: Upper bound on generated tokens.

    Returns:
        ``(text, input_tokens, output_tokens)`` where ``text`` is the
        concatenation of all text blocks in the response (empty string if
        the response has none) and the token counts are taken from
        ``response.usage``.
    """
    import anthropic

    # System prompt as structured blocks:
    #   Block 1: static base prompt (cached)
    #   Block 2: RAG context (changes per query, not cached)
    system_blocks: list[dict[str, Any]] = [
        {
            "type": "text",
            "text": system_base,
            "cache_control": {"type": "ephemeral"},
        },
    ]
    if rag_context:
        system_blocks.append({"type": "text", "text": rag_context})

    messages: list[dict[str, Any]] = [
        {"role": msg["role"], "content": msg["content"]} for msg in history
    ]

    # Place a cache breakpoint on the last history message so the whole
    # conversation prefix is eligible for reuse on the next turn.
    # NOTE(review): Anthropic builds cache prefixes in the order
    # tools -> system -> messages, so this prefix also contains the RAG
    # block above. The history breakpoint can therefore only hit when the
    # RAG context is unchanged between turns -- confirm against the
    # cache_read numbers logged below.
    if messages:
        last = messages[-1]
        content = last["content"]
        marked: list[dict[str, Any]] | None = None
        if isinstance(content, str):
            # Empty text blocks cannot carry cache_control; leave as-is.
            if content:
                marked = [
                    {
                        "type": "text",
                        "text": content,
                        "cache_control": {"type": "ephemeral"},
                    }
                ]
        elif (
            isinstance(content, list)
            and content
            and all(isinstance(block, dict) for block in content)
        ):
            # Already structured: copy the blocks and mark the final one
            # without mutating the caller's history.
            marked = [dict(block) for block in content]
            marked[-1]["cache_control"] = {"type": "ephemeral"}
        if marked is not None:
            messages[-1] = {"role": last["role"], "content": marked}

    # Add the new user message (uncached -- it is new each turn).
    messages.append({"role": "user", "content": new_message})

    # The client owns an HTTP connection pool; the context manager closes
    # it once the request has completed instead of leaving it to GC.
    async with anthropic.AsyncAnthropic(
        api_key=settings.ANTHROPIC_API_KEY,
        timeout=settings.AI_REQUEST_TIMEOUT_SECONDS,
    ) as client:
        response = await client.messages.create(
            model=settings.AI_MODEL_ANTHROPIC,
            max_tokens=max_tokens,
            system=system_blocks,
            messages=messages,
        )

    # Join every text block rather than indexing content[0]: the content
    # list may be empty or may start with a non-text block.
    text = "".join(
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    )

    usage = response.usage
    # NOTE(review): with caching active, usage.input_tokens excludes tokens
    # read from or written to the cache, so this figure is lower than the
    # full prompt size. Callers that track total input volume may need the
    # cache counters added -- verify against how callers use this value.
    input_tokens = usage.input_tokens
    output_tokens = usage.output_tokens

    # Log cache performance
    cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
    cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0
    if cache_read or cache_creation:
        logger.info(
            "Anthropic cache: read=%d creation=%d input=%d output=%d",
            cache_read, cache_creation, input_tokens, output_tokens,
        )

    return text, input_tokens, output_tokens
|
||||
|
||||
|
||||
def _auto_title(message: str) -> str:
|
||||
"""Generate a short title from the first user message."""
|
||||
title = message.strip()[:100]
|
||||
@@ -90,22 +207,20 @@ async def send_message(
|
||||
limit=8,
|
||||
)
|
||||
|
||||
# Build system prompt
|
||||
system_prompt = ASSISTANT_SYSTEM_PROMPT + build_rag_context(rag_results)
|
||||
rag_context = build_rag_context(rag_results)
|
||||
|
||||
# Build messages for AI
|
||||
ai_messages = []
|
||||
ai_messages: list[dict[str, Any]] = []
|
||||
for msg in chat.messages:
|
||||
if msg["role"] in ("user", "assistant"):
|
||||
ai_messages.append({"role": msg["role"], "content": msg["content"]})
|
||||
ai_messages.append({"role": "user", "content": message})
|
||||
|
||||
# Call AI
|
||||
provider = get_ai_provider()
|
||||
ai_content, input_tokens, output_tokens = await provider.generate_text(
|
||||
system_prompt=system_prompt,
|
||||
messages=ai_messages,
|
||||
max_tokens=4096,
|
||||
# Call AI with prompt caching (Anthropic) or generic provider
|
||||
ai_content, input_tokens, output_tokens = await _call_ai(
|
||||
system_base=ASSISTANT_SYSTEM_PROMPT,
|
||||
rag_context=rag_context,
|
||||
history=ai_messages,
|
||||
new_message=message,
|
||||
)
|
||||
|
||||
# Update chat
|
||||
|
||||
Reference in New Issue
Block a user