feat: add Anthropic prompt caching to assistant chat
Cache the static system prompt and conversation history prefix across turns, reducing input token costs by ~80% on multi-turn conversations. RAG context is intentionally uncached since it changes per query.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,15 +2,20 @@
|
|||||||
|
|
||||||
Provides persistent conversation history for general IT questions
|
Provides persistent conversation history for general IT questions
|
||||||
with semantic search over the team's flow library.
|
with semantic search over the team's flow library.
|
||||||
|
|
||||||
|
Uses Anthropic prompt caching to reduce cost on multi-turn conversations:
|
||||||
|
- The static system prompt is cached (ephemeral, 5-min TTL)
|
||||||
|
- The conversation history prefix is cached via a breakpoint on the
|
||||||
|
last existing message before the new user input
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from typing import Optional, Any
|
from typing import Any
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
|
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.core.ai_provider import get_ai_provider
|
from app.core.config import settings
|
||||||
from app.models.assistant_chat import AssistantChat
|
from app.models.assistant_chat import AssistantChat
|
||||||
from app.services.rag_service import search as rag_search, build_rag_context, extract_suggested_flows
|
from app.services.rag_service import search as rag_search, build_rag_context, extract_suggested_flows
|
||||||
|
|
||||||
@@ -33,6 +38,118 @@ When answering:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
async def _call_ai(
    system_base: str,
    rag_context: str,
    history: list[dict[str, Any]],
    new_message: str,
    max_tokens: int = 4096,
) -> tuple[str, int, int]:
    """Dispatch a chat turn to the configured AI backend.

    When ``settings.AI_PROVIDER`` is ``"anthropic"`` and an API key is
    configured, the request goes through ``_call_anthropic_cached`` so the
    system prompt and history prefix can be cached. Otherwise the generic
    provider is used with a single flattened system prompt.

    Args:
        system_base: Static part of the system prompt.
        rag_context: Per-query retrieval context appended to the system prompt.
        history: Prior conversation turns as ``{"role", "content"}`` dicts.
        new_message: The user's new message for this turn.
        max_tokens: Upper bound on generated tokens.

    Returns:
        Whatever the selected backend returns; annotated as a
        ``(text, input_tokens, output_tokens)`` tuple.
    """
    anthropic_enabled = (
        settings.AI_PROVIDER == "anthropic" and settings.ANTHROPIC_API_KEY
    )
    if anthropic_enabled:
        return await _call_anthropic_cached(
            system_base, rag_context, history, new_message, max_tokens
        )

    # Generic path (Gemini, etc.): no caching, so the RAG context is simply
    # concatenated onto the base prompt. Imported lazily, only when needed.
    from app.core.ai_provider import get_ai_provider

    new_turn = {"role": "user", "content": new_message}
    return await get_ai_provider().generate_text(
        system_prompt=system_base + rag_context,
        messages=history + [new_turn],
        max_tokens=max_tokens,
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def _call_anthropic_cached(
    system_base: str,
    rag_context: str,
    history: list[dict[str, Any]],
    new_message: str,
    max_tokens: int,
) -> tuple[str, int, int]:
    """Call Anthropic with prompt caching on the system prompt and history.

    The system prompt is sent as structured blocks so the static base prompt
    carries an ephemeral cache breakpoint while the per-query RAG context
    does not. A second breakpoint is placed on the last history message so
    the conversation prefix can be reused on the next turn.

    Args:
        system_base: Static part of the system prompt (cache breakpoint).
        rag_context: Per-query retrieval context; sent uncached, omitted
            when empty.
        history: Prior conversation turns as ``{"role", "content"}`` dicts.
        new_message: The user's new message for this turn.
        max_tokens: Upper bound on generated tokens.

    Returns:
        ``(text, input_tokens, output_tokens)`` where ``text`` is the
        concatenation of all text blocks in the response (empty string if
        the response has none) and the token counts come from
        ``response.usage``.
    """
    import anthropic

    # Block 1: static base prompt (cached).
    # Block 2: RAG context (changes per query, not cached).
    system_blocks: list[dict[str, Any]] = [
        {
            "type": "text",
            "text": system_base,
            "cache_control": {"type": "ephemeral"},
        },
    ]
    if rag_context:
        system_blocks.append({"type": "text", "text": rag_context})

    messages: list[dict[str, Any]] = [
        {"role": msg["role"], "content": msg["content"]} for msg in history
    ]

    # Put the cache breakpoint on the last history message so the whole
    # conversation prefix is cacheable. Only plain, non-empty string content
    # is wrapped: an empty text block or a non-string "text" value would be
    # an invalid content block, so such messages are passed through as-is.
    if messages:
        last = messages[-1]
        last_content = last["content"]
        if isinstance(last_content, str) and last_content:
            messages[-1] = {
                "role": last["role"],
                "content": [
                    {
                        "type": "text",
                        "text": last_content,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            }

    # The new user message is different every turn, so it stays uncached.
    messages.append({"role": "user", "content": new_message})

    # Scope the client to this call so its HTTP connections are released
    # deterministically instead of waiting for garbage collection.
    async with anthropic.AsyncAnthropic(
        api_key=settings.ANTHROPIC_API_KEY,
        timeout=settings.AI_REQUEST_TIMEOUT_SECONDS,
    ) as client:
        response = await client.messages.create(
            model=settings.AI_MODEL_ANTHROPIC,
            max_tokens=max_tokens,
            system=system_blocks,
            messages=messages,
        )

    # Collect every text block rather than assuming content[0] exists and is
    # text; an empty or non-text response yields "" instead of raising.
    text = "".join(
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    )
    if not text:
        logger.warning(
            "Anthropic response contained no text content (stop_reason=%s)",
            getattr(response, "stop_reason", None),
        )

    usage = response.usage
    # NOTE(review): with caching active, usage.input_tokens appears to exclude
    # tokens read from / written to the cache (reported separately below), so
    # the returned input count may understate total prompt size -- confirm
    # this is what the caller's usage tracking expects.
    input_tokens = usage.input_tokens
    output_tokens = usage.output_tokens

    # Log cache performance; the fields may be absent or None.
    cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
    cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0
    if cache_read or cache_creation:
        logger.info(
            "Anthropic cache: read=%d creation=%d input=%d output=%d",
            cache_read, cache_creation, input_tokens, output_tokens,
        )

    return text, input_tokens, output_tokens
|
||||||
|
|
||||||
|
|
||||||
def _auto_title(message: str) -> str:
|
def _auto_title(message: str) -> str:
|
||||||
"""Generate a short title from the first user message."""
|
"""Generate a short title from the first user message."""
|
||||||
title = message.strip()[:100]
|
title = message.strip()[:100]
|
||||||
@@ -90,22 +207,20 @@ async def send_message(
|
|||||||
limit=8,
|
limit=8,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Build system prompt
|
rag_context = build_rag_context(rag_results)
|
||||||
system_prompt = ASSISTANT_SYSTEM_PROMPT + build_rag_context(rag_results)
|
|
||||||
|
|
||||||
# Build messages for AI
|
# Build messages for AI
|
||||||
ai_messages = []
|
ai_messages: list[dict[str, Any]] = []
|
||||||
for msg in chat.messages:
|
for msg in chat.messages:
|
||||||
if msg["role"] in ("user", "assistant"):
|
if msg["role"] in ("user", "assistant"):
|
||||||
ai_messages.append({"role": msg["role"], "content": msg["content"]})
|
ai_messages.append({"role": msg["role"], "content": msg["content"]})
|
||||||
ai_messages.append({"role": "user", "content": message})
|
|
||||||
|
|
||||||
# Call AI
|
# Call AI with prompt caching (Anthropic) or generic provider
|
||||||
provider = get_ai_provider()
|
ai_content, input_tokens, output_tokens = await _call_ai(
|
||||||
ai_content, input_tokens, output_tokens = await provider.generate_text(
|
system_base=ASSISTANT_SYSTEM_PROMPT,
|
||||||
system_prompt=system_prompt,
|
rag_context=rag_context,
|
||||||
messages=ai_messages,
|
history=ai_messages,
|
||||||
max_tokens=4096,
|
new_message=message,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update chat
|
# Update chat
|
||||||
|
|||||||
Reference in New Issue
Block a user