perf: resize images server-side before sending to Claude vision

- Resize to 1568px max (Claude's efficient ceiling) via Pillow
- Convert PNG screenshots to JPEG q85 (~5MB → ~200KB typical)
- Cap at 3 images per message (~4,800 token budget max)
- Graceful fallback if Pillow unavailable (Claude auto-resizes)
- Add Pillow + libjpeg/zlib deps to requirements + Dockerfile

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 04:46:02 +00:00
parent 3b682069d3
commit 1c0f912cf6
3 changed files with 77 additions and 4 deletions
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -10,6 +10,8 @@ RUN apt-get update && apt-get install -y \
    libcairo2-dev \
    libgdk-pixbuf-2.0-dev \
    libffi-dev \
    libjpeg-dev \
    zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies
--- a/backend/app/api/endpoints/assistant_chat.py
+++ b/backend/app/api/endpoints/assistant_chat.py
@@ -50,21 +50,86 @@ router = APIRouter(prefix="/assistant", tags=["assistant-chat"])
 # Upload content types that _fetch_upload_images selects for Claude vision;
 # uploads with any other content type are filtered out by its query.
 VISION_CONTENT_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"}
 # Claude vision costs: (width × height) / 750 tokens per image.
 # Claude auto-resizes images >1568px on the longest edge.
 # We resize server-side to avoid sending multi-MB base64 payloads over the wire.
 MAX_IMAGE_DIMENSION = 1568  # Longest-edge limit in pixels (Claude's max efficient resolution)
 MAX_IMAGES_PER_MESSAGE = 3  # Upload IDs beyond this count are dropped to control token budget
 def _resize_image_for_vision(file_data: bytes, content_type: str) -> tuple[bytes, str]:
    """Shrink and re-encode an image before it is sent to Claude vision.

    The image is scaled so its longest edge is at most ``MAX_IMAGE_DIMENSION``,
    and PNGs in ``RGBA``/``P`` mode (typical screenshots) are flattened onto a
    white background and re-encoded as JPEG at quality 85. The re-encoded
    bytes are only used when they are smaller than the input.

    Args:
        file_data: Raw encoded image bytes.
        content_type: MIME type of ``file_data``.

    Returns:
        ``(image_bytes, media_type)`` where ``media_type`` describes the actual
        encoding of ``image_bytes``. The original ``(file_data, content_type)``
        pair is returned unchanged when Pillow is not installed, processing
        fails, the content type has no known encoder, or re-encoding does not
        save space.
    """
    try:
        from io import BytesIO

        from PIL import Image, ImageOps
    except ImportError:
        # Pillow not installed — send original (Claude auto-resizes)
        logger.debug("Pillow not available, sending original image to Claude")
        return file_data, content_type

    # Pillow encoder for each media type we may return. The encoder is chosen
    # from the media type, not from ``img.format``: Pillow sets ``format`` to
    # None on images produced by resize()/convert(), which previously caused
    # resized GIF/WebP uploads to be written as PNG while still being labelled
    # with their original media type.
    encoders = {
        "image/png": "PNG",
        "image/jpeg": "JPEG",
        "image/gif": "GIF",
        "image/webp": "WEBP",
    }

    try:
        img = Image.open(BytesIO(file_data))
        # Re-encoding drops EXIF metadata, so bake the orientation tag into
        # the pixels first; otherwise rotated photos are sent sideways.
        img = ImageOps.exif_transpose(img)

        w, h = img.size
        longest = max(w, h)
        # Only resize if larger than Claude's max efficient dimension
        if longest > MAX_IMAGE_DIMENSION:
            ratio = MAX_IMAGE_DIMENSION / longest
            # Clamp to 1px so extreme aspect ratios do not round down to a
            # zero-sized (invalid) target.
            new_size = (max(1, int(w * ratio)), max(1, int(h * ratio)))
            img = img.resize(new_size, Image.LANCZOS)

        out_type = content_type
        if content_type == "image/png" and img.mode in ("RGBA", "P"):
            # JPEG has no alpha channel. Composite onto white instead of just
            # discarding alpha, so transparent areas do not turn black.
            rgba = img.convert("RGBA")
            flattened = Image.new("RGB", rgba.size, (255, 255, 255))
            flattened.paste(rgba, mask=rgba.getchannel("A"))
            img = flattened
            out_type = "image/jpeg"

        out_format = encoders.get(out_type)
        if out_format is None:
            # No encoder that is guaranteed to match the declared media type.
            return file_data, content_type

        buf = BytesIO()
        if out_format == "JPEG":
            img.save(buf, format="JPEG", quality=85, optimize=True)
        else:
            img.save(buf, format=out_format, optimize=True)
        result = buf.getvalue()
    except Exception:
        # Best effort: any decode/encode failure falls back to the original
        # bytes, but keep the traceback so failures can be diagnosed.
        logger.warning("Image resize failed, sending original", exc_info=True)
        return file_data, content_type

    # Only use resized version if it's actually smaller
    if len(result) < len(file_data):
        return result, out_type
    return file_data, content_type
 async def _fetch_upload_images(
    upload_ids: list[UUID],
    account_id: UUID,
    db: AsyncSession,
 ) -> list[dict[str, Any]]:
-    """Fetch uploaded images from S3 and return as base64-encoded dicts for Claude vision."""
+    """Fetch uploaded images from S3 and return as base64-encoded dicts for Claude vision.
    Resizes images server-side to reduce network payload and applies a per-message
    cap to control token budget (~1,600 tokens per full-res image).
    """
    if not upload_ids or not settings.STORAGE_ENDPOINT:
        return []
    from app.services import storage_service
    # Cap the number of images to limit token cost
    capped_ids = upload_ids[:MAX_IMAGES_PER_MESSAGE]
    if len(upload_ids) > MAX_IMAGES_PER_MESSAGE:
        logger.info(
            "Capped images from %d to %d for token budget",
            len(upload_ids), MAX_IMAGES_PER_MESSAGE,
        )
    result = await db.execute(
        select(FileUpload).where(
-            FileUpload.id.in_(upload_ids),
+            FileUpload.id.in_(capped_ids),
            FileUpload.account_id == account_id,
            FileUpload.content_type.in_(VISION_CONTENT_TYPES),
        )
@@ -75,9 +140,12 @@ async def _fetch_upload_images(
    for upload in uploads:
        try:
            file_data = storage_service.download_file(upload.storage_key)
            resized_data, media_type = _resize_image_for_vision(
                file_data, upload.content_type
            )
            images.append({
-                "media_type": upload.content_type,
+                "media_type": media_type,
-                "data": base64.b64encode(file_data).decode("ascii"),
+                "data": base64.b64encode(resized_data).decode("ascii"),
            })
        except Exception:
            logger.warning("Failed to fetch upload %s from S3", upload.id)
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -54,3 +54,6 @@ apscheduler>=3.10.4
 # Object Storage
 boto3>=1.34.0
 # Image processing (vision upload resize)
 Pillow>=10.0.0