perf: resize images server-side before sending to Claude vision

- Resize to 1568px max (Claude's efficient ceiling) via Pillow
- Convert PNG screenshots to JPEG q85 (~5MB → ~200KB typical)
- Cap at 3 images per message (~4,800 token budget max)
- Graceful fallback if Pillow unavailable (Claude auto-resizes)
- Add Pillow + libjpeg/zlib deps to requirements + Dockerfile

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 04:46:02 +00:00
parent 3b682069d3
commit 1c0f912cf6
3 changed files with 77 additions and 4 deletions
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -10,6 +10,8 @@ RUN apt-get update && apt-get install -y \
    libcairo2-dev \
    libgdk-pixbuf-2.0-dev \
    libffi-dev \
+    libjpeg-dev \
+    zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

 # Install Python dependencies
--- a/backend/app/api/endpoints/assistant_chat.py
+++ b/backend/app/api/endpoints/assistant_chat.py
@@ -50,21 +50,86 @@ router = APIRouter(prefix="/assistant", tags=["assistant-chat"])

 VISION_CONTENT_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"}

+# Claude vision costs: (width × height) / 750 tokens per image.
+# Claude auto-resizes images >1568px on the longest edge.
+# We resize server-side to avoid sending multi-MB base64 payloads over the wire.
+MAX_IMAGE_DIMENSION = 1568  # Longest-edge cap in pixels; larger images are downscaled
+MAX_IMAGES_PER_MESSAGE = 3  # Cap to control token budget
+
+
+def _resize_image_for_vision(file_data: bytes, content_type: str) -> tuple[bytes, str]:
+    """Shrink an image for Claude vision, returning (image_bytes, media_type).
+
+    Caps the longest edge at MAX_IMAGE_DIMENSION and turns RGBA/palette PNGs into
+    JPEG; re-encoded bytes always match the returned media type. The input comes
+    back unchanged if the result is not smaller or if Pillow/processing fails.
+    """
+    try:
+        from io import BytesIO
+        from PIL import Image
+
+        img = Image.open(BytesIO(file_data))
+        w, h = img.size
+        if max(w, h) > MAX_IMAGE_DIMENSION:
+            ratio = MAX_IMAGE_DIMENSION / max(w, h)
+            # Clamp to 1px so extreme aspect ratios never round down to zero.
+            new_size = (max(1, int(w * ratio)), max(1, int(h * ratio)))
+            img = img.resize(new_size, Image.LANCZOS)
+
+        out_type = content_type
+        if img.mode in ("RGBA", "P") and content_type == "image/png":
+            # JPEG has no alpha: flatten onto white rather than dropping the alpha
+            # channel, which would expose whatever colour sits under transparency.
+            rgba = img.convert("RGBA")
+            white = Image.new("RGBA", rgba.size, "white")
+            img = Image.alpha_composite(white, rgba).convert("RGB")
+            out_type = "image/jpeg"
+
+        # img.format is None after resize(), so pick the encoder from the media
+        # type; otherwise resized GIF/WebP would be saved as PNG but mislabelled.
+        pil_format = out_type.rsplit("/", 1)[-1].upper()
+        save_kwargs = {"quality": 85} if pil_format == "JPEG" else {}
+        buf = BytesIO()
+        img.save(buf, format=pil_format, optimize=True, **save_kwargs)
+        result = buf.getvalue()
+        if len(result) < len(file_data):
+            return result, out_type
+        return file_data, content_type
+    except ImportError:
+        # Pillow not installed — send original (Claude auto-resizes)
+        logger.debug("Pillow not available, sending original image to Claude")
+        return file_data, content_type
+    except Exception:
+        logger.warning("Image resize failed, sending original", exc_info=True)
+        return file_data, content_type
+

 async def _fetch_upload_images(
    upload_ids: list[UUID],
    account_id: UUID,
    db: AsyncSession,
 ) -> list[dict[str, Any]]:
-    """Fetch uploaded images from S3 and return as base64-encoded dicts for Claude vision."""
+    """Fetch uploaded images from S3 and return as base64-encoded dicts for Claude vision.
+
+    Resizes images server-side to reduce network payload and applies a per-message
+    cap to control token budget (~1,600 tokens per full-res image).
+    """
    if not upload_ids or not settings.STORAGE_ENDPOINT:
        return []

    from app.services import storage_service

+    # Cap the number of images to limit token cost
+    capped_ids = upload_ids[:MAX_IMAGES_PER_MESSAGE]
+    if len(upload_ids) > MAX_IMAGES_PER_MESSAGE:
+        logger.info(
+            "Capped images from %d to %d for token budget",
+            len(upload_ids), MAX_IMAGES_PER_MESSAGE,
+        )
+
    result = await db.execute(
        select(FileUpload).where(
-            FileUpload.id.in_(upload_ids),
+            FileUpload.id.in_(capped_ids),
            FileUpload.account_id == account_id,
            FileUpload.content_type.in_(VISION_CONTENT_TYPES),
        )
@@ -75,9 +140,12 @@ async def _fetch_upload_images(
    for upload in uploads:
        try:
            file_data = storage_service.download_file(upload.storage_key)
+            resized_data, media_type = _resize_image_for_vision(
+                file_data, upload.content_type
+            )
            images.append({
-                "media_type": upload.content_type,
-                "data": base64.b64encode(file_data).decode("ascii"),
+                "media_type": media_type,
+                "data": base64.b64encode(resized_data).decode("ascii"),
            })
        except Exception:
            logger.warning("Failed to fetch upload %s from S3", upload.id)
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -54,3 +54,6 @@ apscheduler>=3.10.4

 # Object Storage
 boto3>=1.34.0
+
+# Image processing (vision upload resize)
+Pillow>=10.0.0