diff --git a/backend/Dockerfile b/backend/Dockerfile
index 1cbc6ea6..21f5ab52 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -10,6 +10,8 @@ RUN apt-get update && apt-get install -y \
     libcairo2-dev \
     libgdk-pixbuf-2.0-dev \
     libffi-dev \
+    libjpeg-dev \
+    zlib1g-dev \
     && rm -rf /var/lib/apt/lists/*
 
 # Install Python dependencies
diff --git a/backend/app/api/endpoints/assistant_chat.py b/backend/app/api/endpoints/assistant_chat.py
index 0c13d3e2..83188213 100644
--- a/backend/app/api/endpoints/assistant_chat.py
+++ b/backend/app/api/endpoints/assistant_chat.py
@@ -50,21 +50,107 @@ router = APIRouter(prefix="/assistant", tags=["assistant-chat"])
 
 VISION_CONTENT_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"}
 
+# Claude vision costs: (width × height) / 750 tokens per image.
+# Claude auto-resizes images >1568px on the longest edge.
+# We resize server-side to avoid sending multi-MB base64 payloads over the wire.
+MAX_IMAGE_DIMENSION = 1568  # Claude's max efficient resolution
+MAX_IMAGES_PER_MESSAGE = 3  # Cap to control token budget
+
+
+def _resize_image_for_vision(file_data: bytes, content_type: str) -> tuple[bytes, str]:
+    """Resize image to fit within Claude's efficient vision bounds.
+
+    Returns (image_bytes, media_type); the media type always describes the
+    returned bytes. PNGs in RGBA or palette mode (common for screenshots) are
+    flattened onto white and re-encoded as JPEG. The original bytes and
+    content type are returned when re-encoding does not shrink the payload,
+    when Pillow is unavailable, or when processing fails.
+    """
+    try:
+        from PIL import Image
+        from io import BytesIO
+
+        img = Image.open(BytesIO(file_data))
+        w, h = img.size
+
+        # Only resize if larger than Claude's max efficient dimension
+        if max(w, h) > MAX_IMAGE_DIMENSION:
+            ratio = MAX_IMAGE_DIMENSION / max(w, h)
+            # Clamp to 1px so extreme aspect ratios never round down to 0
+            new_w, new_h = max(1, int(w * ratio)), max(1, int(h * ratio))
+            img = img.resize((new_w, new_h), Image.LANCZOS)
+
+        out_type = content_type
+        if img.mode in ("RGBA", "P") and content_type == "image/png":
+            # JPEG has no alpha channel. Composite onto white instead of
+            # dropping alpha, which would expose the (often black) colour
+            # stored under transparent pixels.
+            rgba = img.convert("RGBA")
+            flattened = Image.new("RGB", rgba.size, (255, 255, 255))
+            flattened.paste(rgba, mask=rgba.getchannel("A"))
+            img = flattened
+            out_type = "image/jpeg"
+
+        # Encode in the format named by out_type. img.format is None once the
+        # image has been resized, so relying on it would silently emit PNG
+        # bytes labelled as GIF/WebP.
+        save_format = {
+            "image/jpeg": "JPEG",
+            "image/png": "PNG",
+            "image/gif": "GIF",
+            "image/webp": "WEBP",
+        }.get(out_type)
+        if save_format is None:
+            return file_data, content_type
+
+        buf = BytesIO()
+        if save_format == "JPEG":
+            img.save(buf, format="JPEG", quality=85, optimize=True)
+        else:
+            img.save(buf, format=save_format, optimize=True)
+
+        result = buf.getvalue()
+
+        # Only use resized version if it's actually smaller
+        if len(result) < len(file_data):
+            return result, out_type
+        return file_data, content_type
+
+    except ImportError:
+        # Pillow not installed — send original (Claude auto-resizes)
+        logger.debug("Pillow not available, sending original image to Claude")
+        return file_data, content_type
+    except Exception:
+        logger.warning("Image resize failed, sending original", exc_info=True)
+        return file_data, content_type
+
 
 async def _fetch_upload_images(
     upload_ids: list[UUID],
     account_id: UUID,
     db: AsyncSession,
 ) -> list[dict[str, Any]]:
-    """Fetch uploaded images from S3 and return as base64-encoded dicts for Claude vision."""
+    """Fetch uploaded images from S3 and return as base64-encoded dicts for Claude vision.
+
+    Resizes images server-side to reduce network payload and applies a per-message
+    cap to control token budget (~1,600 tokens per full-res image).
+    """
     if not upload_ids or not settings.STORAGE_ENDPOINT:
         return []
 
     from app.services import storage_service
 
+    # Cap the number of images to limit token cost
+    capped_ids = upload_ids[:MAX_IMAGES_PER_MESSAGE]
+    if len(upload_ids) > MAX_IMAGES_PER_MESSAGE:
+        logger.info(
+            "Capped images from %d to %d for token budget",
+            len(upload_ids), MAX_IMAGES_PER_MESSAGE,
+        )
+
     result = await db.execute(
         select(FileUpload).where(
-            FileUpload.id.in_(upload_ids),
+            FileUpload.id.in_(capped_ids),
             FileUpload.account_id == account_id,
             FileUpload.content_type.in_(VISION_CONTENT_TYPES),
         )
@@ -75,9 +161,12 @@ async def _fetch_upload_images(
     for upload in uploads:
         try:
             file_data = storage_service.download_file(upload.storage_key)
+            resized_data, media_type = _resize_image_for_vision(
+                file_data, upload.content_type
+            )
             images.append({
-                "media_type": upload.content_type,
-                "data": base64.b64encode(file_data).decode("ascii"),
+                "media_type": media_type,
+                "data": base64.b64encode(resized_data).decode("ascii"),
             })
         except Exception:
             logger.warning("Failed to fetch upload %s from S3", upload.id)
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 634f9fee..c52db22e 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -54,3 +54,6 @@ apscheduler>=3.10.4
 
 # Object Storage
 boto3>=1.34.0
+
+# Image processing (vision upload resize)
+Pillow>=10.0.0