perf: resize images server-side before sending to Claude vision
- Resize to 1568px max (Claude's efficient ceiling) via Pillow
- Convert PNG screenshots to JPEG q85 (~5MB → ~200KB typical)
- Cap at 3 images per message (~4,800 token budget max)
- Graceful fallback if Pillow unavailable (Claude auto-resizes)
- Add Pillow + libjpeg/zlib deps to requirements + Dockerfile

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -10,6 +10,8 @@ RUN apt-get update && apt-get install -y \
|
|||||||
libcairo2-dev \
|
libcairo2-dev \
|
||||||
libgdk-pixbuf-2.0-dev \
|
libgdk-pixbuf-2.0-dev \
|
||||||
libffi-dev \
|
libffi-dev \
|
||||||
|
libjpeg-dev \
|
||||||
|
zlib1g-dev \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Install Python dependencies
|
# Install Python dependencies
|
||||||
|
|||||||
@@ -50,21 +50,86 @@ router = APIRouter(prefix="/assistant", tags=["assistant-chat"])
|
|||||||
|
|
||||||
VISION_CONTENT_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"}
|
VISION_CONTENT_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"}
|
||||||
|
|
||||||
|
# Claude vision costs: (width × height) / 750 tokens per image.
|
||||||
|
# Claude auto-resizes images >1568px on the longest edge.
|
||||||
|
# We resize server-side to avoid sending multi-MB base64 payloads over the wire.
|
||||||
|
MAX_IMAGE_DIMENSION = 1568 # Longest-edge limit in pixels; larger images are downscaled before being sent
|
||||||
|
MAX_IMAGES_PER_MESSAGE = 3 # Max uploads forwarded per message, to bound the vision token budget
|
||||||
|
|
||||||
|
|
||||||
|
def _resize_image_for_vision(file_data: bytes, content_type: str) -> tuple[bytes, str]:
    """Shrink an image for Claude vision and return ``(image_bytes, media_type)``.

    The image is downscaled so that its longest edge is at most
    ``MAX_IMAGE_DIMENSION``. PNGs in ``RGBA`` or ``P`` mode (typical for
    screenshots) are flattened onto a white background and re-encoded as
    JPEG at quality 85; every other image is re-encoded in the format that
    matches ``content_type`` so the returned bytes always agree with the
    returned media type.

    This is best-effort. The original ``(file_data, content_type)`` pair is
    returned unchanged when Pillow is not installed, when processing fails
    for any reason, or when the re-encoded result is not smaller than the
    input.
    """
    from io import BytesIO

    try:
        from PIL import Image
    except ImportError:
        # Pillow not installed — send original (Claude auto-resizes)
        logger.debug("Pillow not available, sending original image to Claude")
        return file_data, content_type

    # Encoder to use for each media type we may return.
    encoders = {
        "image/png": "PNG",
        "image/jpeg": "JPEG",
        "image/gif": "GIF",
        "image/webp": "WEBP",
    }

    try:
        img = Image.open(BytesIO(file_data))
        # resize()/convert() return new images whose .format is None, so the
        # decoded format has to be captured before any transformation.
        source_format = img.format

        flatten_to_jpeg = content_type == "image/png" and img.mode in ("RGBA", "P")
        if flatten_to_jpeg and img.mode == "P":
            # Palette images are always resampled with NEAREST; promote to
            # RGBA first so the LANCZOS filter below actually applies.
            img = img.convert("RGBA")

        # Only resize if larger than Claude's max efficient dimension
        w, h = img.size
        longest_edge = max(w, h)
        if longest_edge > MAX_IMAGE_DIMENSION:
            ratio = MAX_IMAGE_DIMENSION / longest_edge
            # Clamp to 1px so extreme aspect ratios cannot produce a 0-sized edge.
            new_size = (max(1, int(w * ratio)), max(1, int(h * ratio)))
            img = img.resize(new_size, Image.LANCZOS)

        out_type = content_type
        if flatten_to_jpeg:
            # JPEG has no alpha channel. Composite onto white instead of just
            # dropping alpha, which would expose the (usually black) RGB
            # values stored under transparent pixels.
            flattened = Image.new("RGB", img.size, (255, 255, 255))
            flattened.paste(img, mask=img.getchannel("A"))
            img = flattened
            out_type = "image/jpeg"

        save_format = encoders.get(out_type) or source_format
        if save_format is None:
            # No encoder known for this media type; keep the original bytes
            # rather than emit data that contradicts the declared type.
            return file_data, content_type

        buf = BytesIO()
        if save_format == "JPEG":
            img.save(buf, format="JPEG", quality=85, optimize=True)
        else:
            img.save(buf, format=save_format, optimize=True)

        result = buf.getvalue()

        # Only use resized version if it's actually smaller
        if len(result) < len(file_data):
            return result, out_type
        return file_data, content_type

    except Exception:
        # Deliberate best-effort boundary: any decode/encode failure falls
        # back to the untouched upload, but keep the traceback for diagnosis.
        logger.warning("Image resize failed, sending original", exc_info=True)
        return file_data, content_type
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_upload_images(
|
async def _fetch_upload_images(
|
||||||
upload_ids: list[UUID],
|
upload_ids: list[UUID],
|
||||||
account_id: UUID,
|
account_id: UUID,
|
||||||
db: AsyncSession,
|
db: AsyncSession,
|
||||||
) -> list[dict[str, Any]]:
|
) -> list[dict[str, Any]]:
|
||||||
"""Fetch uploaded images from S3 and return as base64-encoded dicts for Claude vision."""
|
"""Fetch uploaded images from S3 and return as base64-encoded dicts for Claude vision.
|
||||||
|
|
||||||
|
Resizes images server-side to reduce network payload and applies a per-message
|
||||||
|
cap to control token budget (~1,600 tokens per full-res image).
|
||||||
|
"""
|
||||||
if not upload_ids or not settings.STORAGE_ENDPOINT:
|
if not upload_ids or not settings.STORAGE_ENDPOINT:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
from app.services import storage_service
|
from app.services import storage_service
|
||||||
|
|
||||||
|
# Cap the number of images to limit token cost
|
||||||
|
capped_ids = upload_ids[:MAX_IMAGES_PER_MESSAGE]
|
||||||
|
if len(upload_ids) > MAX_IMAGES_PER_MESSAGE:
|
||||||
|
logger.info(
|
||||||
|
"Capped images from %d to %d for token budget",
|
||||||
|
len(upload_ids), MAX_IMAGES_PER_MESSAGE,
|
||||||
|
)
|
||||||
|
|
||||||
result = await db.execute(
|
result = await db.execute(
|
||||||
select(FileUpload).where(
|
select(FileUpload).where(
|
||||||
FileUpload.id.in_(upload_ids),
|
FileUpload.id.in_(capped_ids),
|
||||||
FileUpload.account_id == account_id,
|
FileUpload.account_id == account_id,
|
||||||
FileUpload.content_type.in_(VISION_CONTENT_TYPES),
|
FileUpload.content_type.in_(VISION_CONTENT_TYPES),
|
||||||
)
|
)
|
||||||
@@ -75,9 +140,12 @@ async def _fetch_upload_images(
|
|||||||
for upload in uploads:
|
for upload in uploads:
|
||||||
try:
|
try:
|
||||||
file_data = storage_service.download_file(upload.storage_key)
|
file_data = storage_service.download_file(upload.storage_key)
|
||||||
|
resized_data, media_type = _resize_image_for_vision(
|
||||||
|
file_data, upload.content_type
|
||||||
|
)
|
||||||
images.append({
|
images.append({
|
||||||
"media_type": upload.content_type,
|
"media_type": media_type,
|
||||||
"data": base64.b64encode(file_data).decode("ascii"),
|
"data": base64.b64encode(resized_data).decode("ascii"),
|
||||||
})
|
})
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.warning("Failed to fetch upload %s from S3", upload.id)
|
logger.warning("Failed to fetch upload %s from S3", upload.id)
|
||||||
|
|||||||
@@ -54,3 +54,6 @@ apscheduler>=3.10.4
|
|||||||
|
|
||||||
# Object Storage
|
# Object Storage
|
||||||
boto3>=1.34.0
|
boto3>=1.34.0
|
||||||
|
|
||||||
|
# Image processing (vision upload resize)
|
||||||
|
Pillow>=10.0.0
|
||||||
|
|||||||
Reference in New Issue
Block a user