feat: wire image uploads into AI assistant chat (vision support)

- Backend: ChatMessageRequest accepts upload_ids; the endpoint fetches the images from S3, base64-encodes them, and passes them to Claude as multimodal content blocks (vision API)
- Backend: add download_file() to storage_service for fetching from S3
- Frontend: handleSend collects completed upload IDs from pendingUploads and includes them in the sendChatMessage API call
- Frontend: prefill handler passes upload IDs from dashboard nav state
- Enables paste-screenshot → AI-sees-it flow end-to-end

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 04:39:54 +00:00
parent 48f2b3faaf
commit 3b682069d3
6 changed files with 99 additions and 7 deletions
--- a/backend/app/api/endpoints/assistant_chat.py
+++ b/backend/app/api/endpoints/assistant_chat.py
@@ -10,9 +10,10 @@
  GET    /assistant/retention          — Get account retention settings
  PATCH  /assistant/retention          — Update retention settings (owner only)
 """
+import base64
 import logging
 from datetime import datetime, timezone, timedelta
-from typing import Annotated, Optional
+from typing import Annotated, Any, Optional
 from uuid import UUID

 from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
@@ -26,6 +27,7 @@ from app.core.ai_quota_service import check_ai_quota, record_ai_usage, get_user_
 from app.models.user import User
 from app.models.account import Account
 from app.models.assistant_chat import AssistantChat
+from app.models.file_upload import FileUpload
 from app.schemas.assistant_chat import (
    ChatCreateRequest,
    ChatMessageRequest,
@@ -46,6 +48,42 @@ logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/assistant", tags=["assistant-chat"])


+# Image content types an upload must have to be fetched and sent as vision input.
+VISION_CONTENT_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"}
+
+
+async def _fetch_upload_images(
+    upload_ids: list[UUID],
+    account_id: UUID,
+    db: AsyncSession,
+) -> list[dict[str, Any]]:
+    """Fetch uploaded images from S3 and return them base64-encoded for Claude vision.
+
+    Only uploads that belong to ``account_id`` and whose content type is in
+    ``VISION_CONTENT_TYPES`` are considered; any other requested ID is
+    ignored. Downloads are best-effort: a failed fetch is logged and that
+    image is skipped rather than failing the whole request.
+
+    Args:
+        upload_ids: IDs of the uploads attached to the chat message.
+        account_id: Account that must own each upload.
+        db: Session used to look up the ``FileUpload`` rows.
+
+    Returns:
+        A list of ``{"media_type": str, "data": str}`` dicts, where ``data``
+        is the base64-encoded file content, ordered as the IDs were
+        requested. Empty when no IDs are given or storage is not configured.
+    """
+    if not upload_ids or not settings.STORAGE_ENDPOINT:
+        return []
+
+    import asyncio
+
+    from app.services import storage_service
+
+    result = await db.execute(
+        select(FileUpload).where(
+            FileUpload.id.in_(upload_ids),
+            FileUpload.account_id == account_id,
+            FileUpload.content_type.in_(VISION_CONTENT_TYPES),
+        )
+    )
+    # The query does not guarantee row order; restore the caller's order so
+    # the model sees the images in the sequence they were attached.
+    position = {upload_id: index for index, upload_id in enumerate(upload_ids)}
+    uploads = sorted(
+        result.scalars().all(),
+        key=lambda upload: position.get(upload.id, len(upload_ids)),
+    )
+
+    images: list[dict[str, Any]] = []
+    for upload in uploads:
+        try:
+            # download_file is a blocking call; run it in a worker thread so
+            # the event loop stays free while the object is downloaded.
+            file_data = await asyncio.to_thread(
+                storage_service.download_file, upload.storage_key
+            )
+        except Exception:
+            # Best-effort skip, but keep the traceback so storage failures
+            # can be diagnosed from the logs.
+            logger.warning(
+                "Failed to fetch upload %s from S3", upload.id, exc_info=True
+            )
+            continue
+        images.append({
+            "media_type": upload.content_type,
+            "data": base64.b64encode(file_data).decode("ascii"),
+        })
+    return images
+
+
 def _require_ai_enabled() -> None:
    if not settings.ai_enabled:
        raise HTTPException(
@@ -151,6 +189,9 @@ async def post_message(
    user_id = current_user.id
    account_id = current_user.account_id

+    # Fetch attached images from S3 (if any)
+    images = await _fetch_upload_images(data.upload_ids, account_id, db)
+
    try:
        ai_content, suggested_flows, chat = await assistant_chat_service.send_message(
            chat_id=chat_id,
@@ -158,6 +199,7 @@ async def post_message(
            account_id=account_id,
            message=data.message,
            db=db,
+            images=images or None,
        )
    except ValueError as e:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(e))
--- a/backend/app/schemas/assistant_chat.py
+++ b/backend/app/schemas/assistant_chat.py
@@ -14,6 +14,7 @@ class ChatCreateRequest(BaseModel):

 class ChatMessageRequest(BaseModel):
+    """Request body for posting a user message to an assistant chat."""
     message: str = Field(..., min_length=1, max_length=8000)
+    # IDs of uploads attached to this message; defaults to an empty list.
+    # NOTE(review): max_length=10 presumably caps the number of IDs — confirm
+    # against the pydantic version in use.
+    upload_ids: list[UUID] = Field(default_factory=list, max_length=10)


 class ChatMessageResponse(BaseModel):
--- a/backend/app/services/assistant_chat_service.py
+++ b/backend/app/services/assistant_chat_service.py
@@ -87,6 +87,7 @@ async def _call_ai(
    history: list[dict[str, Any]],
    new_message: str,
    max_tokens: int = 4096,
+    images: list[dict[str, Any]] | None = None,
 ) -> tuple[str, int, int]:
    """Call the AI with prompt caching when using Anthropic.

@@ -95,13 +96,18 @@ async def _call_ai(
    - RAG context: NOT cached (changes per query)
    - Conversation history prefix: cached via breakpoint on last
      existing message (stable — only new user message is uncached)
+
+    Args:
+        images: Optional list of {"media_type": str, "data": str (base64)}
+                to include alongside the new_message as vision content.
    """
    if settings.AI_PROVIDER == "anthropic" and settings.ANTHROPIC_API_KEY:
        return await _call_anthropic_cached(
-            system_base, rag_context, history, new_message, max_tokens
+            system_base, rag_context, history, new_message, max_tokens,
+            images=images,
        )

-    # Fallback: generic provider (Gemini, etc.)
+    # Fallback: generic provider (Gemini, etc.) — images not supported
    from app.core.ai_provider import get_ai_provider

    system_prompt = system_base + rag_context
@@ -120,6 +126,7 @@ async def _call_anthropic_cached(
    history: list[dict[str, Any]],
    new_message: str,
    max_tokens: int,
+    images: list[dict[str, Any]] | None = None,
 ) -> tuple[str, int, int]:
    """Call Anthropic with prompt caching on system prompt and history.

@@ -168,7 +175,22 @@ async def _call_anthropic_cached(
        }

    # Add the new user message (uncached — it's new each turn)
-    messages.append({"role": "user", "content": new_message})
+    # If images are attached, build multimodal content blocks
+    if images:
+        content_blocks: list[dict[str, Any]] = []
+        for img in images:
+            content_blocks.append({
+                "type": "image",
+                "source": {
+                    "type": "base64",
+                    "media_type": img["media_type"],
+                    "data": img["data"],
+                },
+            })
+        content_blocks.append({"type": "text", "text": new_message})
+        messages.append({"role": "user", "content": content_blocks})
+    else:
+        messages.append({"role": "user", "content": new_message})

    # MCP server config (optional — controlled by settings)
    mcp_servers = anthropic.NOT_GIVEN
@@ -386,9 +408,14 @@ async def send_message(
    account_id: UUID,
    message: str,
    db: AsyncSession,
+    images: list[dict[str, Any]] | None = None,
 ) -> tuple[str, list[dict[str, Any]], AssistantChat]:
    """Send a user message and get AI response.

+    Args:
+        images: Optional list of {"media_type": str, "data": str (base64)}
+                for vision content attached to this message.
+
    Returns (ai_content, suggested_flows, chat).
    """
    result = await db.execute(
@@ -427,6 +454,7 @@ async def send_message(
        rag_context=rag_context,
        history=ai_messages,
        new_message=message,
+        images=images,
    )

    # Update chat
--- a/backend/app/services/storage_service.py
+++ b/backend/app/services/storage_service.py
@@ -67,6 +67,14 @@ async def upload_file(
    return storage_key


+def download_file(storage_key: str) -> bytes:
+    """Fetch the object stored under ``storage_key`` from S3 and return its bytes.
+
+    This is a synchronous call: the whole object is written into an
+    in-memory buffer before the bytes are returned.
+    """
+    s3 = _get_client()
+    with BytesIO() as buffer:
+        s3.download_fileobj(settings.STORAGE_BUCKET_NAME, storage_key, buffer)
+        return buffer.getvalue()
+
+
 def get_presigned_url(storage_key: str) -> str:
    """Generate a time-limited presigned URL for downloading a file."""
    client = _get_client()
--- a/frontend/src/pages/AssistantChatPage.tsx
+++ b/frontend/src/pages/AssistantChatPage.tsx
@@ -55,7 +55,9 @@ export default function AssistantChatPage() {

  // Handle prefill from command palette / dashboard handoff
  useEffect(() => {
-    const prefill = (location.state as { prefill?: string } | null)?.prefill
+    const state = location.state as { prefill?: string; uploadIds?: string[] } | null
+    const prefill = state?.prefill
+    const uploadIds = state?.uploadIds
    if (!prefill || prefillHandledRef.current) return
    prefillHandledRef.current = true

@@ -80,7 +82,10 @@ export default function AssistantChatPage() {
        setMessages([{ role: 'user', content: prefill }])
        setLoading(true)

-        const response = await aiSessionsApi.sendChatMessage(session.session_id, { message: prefill })
+        const response = await aiSessionsApi.sendChatMessage(session.session_id, {
+          message: prefill,
+          upload_ids: uploadIds?.length ? uploadIds : undefined,
+        })
        setMessages(prev => [
          ...prev,
          { role: 'assistant', content: response.content, suggestedFlows: response.suggested_flows },
@@ -183,12 +188,19 @@ export default function AssistantChatPage() {
    if (!input.trim() || !activeChatId || loading) return

    const userMessage = input.trim()
+    const completedUploadIds = pendingUploads
+      .filter((u) => u.status === 'done' && u.result?.id)
+      .map((u) => u.result!.id)
    setInput('')
+    setPendingUploads([])
    setMessages(prev => [...prev, { role: 'user', content: userMessage }])
    setLoading(true)

    try {
-      const response = await aiSessionsApi.sendChatMessage(activeChatId, { message: userMessage })
+      const response = await aiSessionsApi.sendChatMessage(activeChatId, {
+        message: userMessage,
+        upload_ids: completedUploadIds.length > 0 ? completedUploadIds : undefined,
+      })
      analytics.aiFeatureUsed({ feature: 'assistant_chat' })
      setMessages(prev => [
        ...prev,
--- a/frontend/src/types/ai-session.ts
+++ b/frontend/src/types/ai-session.ts
@@ -216,6 +216,7 @@ export interface ChatSessionCreateResponse {

 export interface ChatMessageRequest {
   message: string
+  /** IDs of uploads to attach to the message; left out when there are none. */
+  upload_ids?: string[]
 }

 export interface ChatMessageResponse {