feat: wire image uploads into AI assistant chat (vision support)

- Backend: ChatMessageRequest accepts upload_ids, endpoint fetches
  images from S3, base64-encodes them, passes to Claude as multimodal
  content blocks (vision API)
- Backend: add download_file() to storage_service for fetching from S3
- Frontend: handleSend collects completed upload IDs from pendingUploads
  and includes them in the sendChatMessage API call
- Frontend: prefill handler passes upload IDs from dashboard nav state
- Enables paste-screenshot → AI-sees-it flow end-to-end

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
chihlasm
2026-03-24 04:39:54 +00:00
parent 48f2b3faaf
commit 3b682069d3
6 changed files with 99 additions and 7 deletions

View File

@@ -87,6 +87,7 @@ async def _call_ai(
history: list[dict[str, Any]],
new_message: str,
max_tokens: int = 4096,
images: list[dict[str, Any]] | None = None,
) -> tuple[str, int, int]:
"""Call the AI with prompt caching when using Anthropic.
@@ -95,13 +96,18 @@ async def _call_ai(
- RAG context: NOT cached (changes per query)
- Conversation history prefix: cached via breakpoint on last
existing message (stable — only new user message is uncached)
Args:
images: Optional list of {"media_type": str, "data": str (base64)}
to include alongside the new_message as vision content.
"""
if settings.AI_PROVIDER == "anthropic" and settings.ANTHROPIC_API_KEY:
return await _call_anthropic_cached(
system_base, rag_context, history, new_message, max_tokens
system_base, rag_context, history, new_message, max_tokens,
images=images,
)
# Fallback: generic provider (Gemini, etc.)
# Fallback: generic provider (Gemini, etc.) — images not supported
from app.core.ai_provider import get_ai_provider
system_prompt = system_base + rag_context
@@ -120,6 +126,7 @@ async def _call_anthropic_cached(
history: list[dict[str, Any]],
new_message: str,
max_tokens: int,
images: list[dict[str, Any]] | None = None,
) -> tuple[str, int, int]:
"""Call Anthropic with prompt caching on system prompt and history.
@@ -168,7 +175,22 @@ async def _call_anthropic_cached(
}
# Add the new user message (uncached — it's new each turn)
messages.append({"role": "user", "content": new_message})
# If images are attached, build multimodal content blocks
if images:
content_blocks: list[dict[str, Any]] = []
for img in images:
content_blocks.append({
"type": "image",
"source": {
"type": "base64",
"media_type": img["media_type"],
"data": img["data"],
},
})
content_blocks.append({"type": "text", "text": new_message})
messages.append({"role": "user", "content": content_blocks})
else:
messages.append({"role": "user", "content": new_message})
# MCP server config (optional — controlled by settings)
mcp_servers = anthropic.NOT_GIVEN
@@ -386,9 +408,14 @@ async def send_message(
account_id: UUID,
message: str,
db: AsyncSession,
images: list[dict[str, Any]] | None = None,
) -> tuple[str, list[dict[str, Any]], AssistantChat]:
"""Send a user message and get AI response.
Args:
images: Optional list of {"media_type": str, "data": str (base64)}
for vision content attached to this message.
Returns (ai_content, suggested_flows, chat).
"""
result = await db.execute(
@@ -427,6 +454,7 @@ async def send_message(
rag_context=rag_context,
history=ai_messages,
new_message=message,
images=images,
)
# Update chat

View File

@@ -67,6 +67,14 @@ async def upload_file(
return storage_key
def download_file(storage_key: str) -> bytes:
    """Fetch the object stored under ``storage_key`` from S3 as raw bytes.

    The object is read from the bucket named by
    ``settings.STORAGE_BUCKET_NAME`` into an in-memory buffer, so the
    entire file is held in memory before being returned.
    """
    with BytesIO() as sink:
        # Stream the object into the buffer, then snapshot its contents
        # before the buffer is closed on exiting the ``with`` block.
        _get_client().download_fileobj(
            settings.STORAGE_BUCKET_NAME,
            storage_key,
            sink,
        )
        return sink.getvalue()
def get_presigned_url(storage_key: str) -> str:
"""Generate a time-limited presigned URL for downloading a file."""
client = _get_client()