Files
resolutionflow/backend/app/services/storage_service.py
chihlasm 217e70cb81 feat: add .docx upload support with text extraction
- Add DOCX MIME type to ALLOWED_DOCUMENT_TYPES in storage_service.py
- Add python-docx text extraction in _generate_ai_description
- Extract shared _store_document_content helper for PDF/DOCX
- Add python-docx>=1.1.0 to requirements.txt
- Add tests for docx upload acceptance and document fetch

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 21:08:12 +00:00

284 lines
9.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""S3-compatible object storage service for file uploads."""
import asyncio
import base64
import logging
import uuid
from io import BytesIO
from typing import Any
from uuid import UUID

import boto3
from botocore.config import Config as BotoConfig
from botocore.exceptions import ClientError

from app.core.config import settings

logger = logging.getLogger(__name__)
# Accepted MIME types, grouped by the size limit that applies to them.
ALLOWED_IMAGE_TYPES = {
    "image/png",
    "image/jpeg",
    "image/gif",
    "image/webp",
}
ALLOWED_TEXT_TYPES = {
    "text/plain",
    "text/csv",
    "application/octet-stream",
}
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
ALLOWED_DOCUMENT_TYPES = {"application/pdf", DOCX_MIME}
ALLOWED_TYPES = ALLOWED_IMAGE_TYPES.union(ALLOWED_TEXT_TYPES, ALLOWED_DOCUMENT_TYPES)

# Per-file size ceilings, in bytes.
MAX_IMAGE_SIZE = 5 * 1024**2  # 5MB
MAX_TEXT_SIZE = 1024**2  # 1MB
MAX_DOCUMENT_SIZE = 10 * 1024**2  # 10MB

# Per-session quotas.
MAX_FILES_PER_SESSION = 20
MAX_BYTES_PER_SESSION = 50 * 1024**2  # 50MB

# Lifetime of generated download URLs, in seconds.
PRESIGNED_URL_EXPIRY = 60 * 60  # 1 hour
def _get_client():
    """Build a boto3 S3 client from the storage settings.

    Returns:
        A new boto3 ``s3`` client pointed at ``settings.STORAGE_ENDPOINT``
        and configured for SigV4 request signing.

    Raises:
        RuntimeError: If ``settings.STORAGE_ENDPOINT`` is empty/unset.
    """
    endpoint = settings.STORAGE_ENDPOINT
    if not endpoint:
        raise RuntimeError("Object storage not configured (STORAGE_ENDPOINT missing)")

    client_kwargs = {
        "endpoint_url": endpoint,
        "aws_access_key_id": settings.STORAGE_ACCESS_KEY,
        "aws_secret_access_key": settings.STORAGE_SECRET_KEY,
        "region_name": settings.STORAGE_REGION,
        "config": BotoConfig(signature_version="s3v4"),
    }
    return boto3.client("s3", **client_kwargs)
def validate_upload(content_type: str, size_bytes: int) -> str | None:
    """Check an upload's MIME type and size against the module limits.

    Args:
        content_type: MIME type declared for the upload.
        size_bytes: Size of the upload in bytes.

    Returns:
        A human-readable error message when the type is not in
        ``ALLOWED_TYPES`` or the size exceeds the limit for its type group;
        ``None`` when the upload is acceptable.
    """
    if content_type not in ALLOWED_TYPES:
        return f"File type {content_type} not allowed"

    # Images and documents have dedicated ceilings; every other allowed type
    # falls back to the text limit.
    limit = MAX_TEXT_SIZE
    for type_group, group_limit in (
        (ALLOWED_IMAGE_TYPES, MAX_IMAGE_SIZE),
        (ALLOWED_DOCUMENT_TYPES, MAX_DOCUMENT_SIZE),
    ):
        if content_type in type_group:
            limit = group_limit
            break

    if size_bytes <= limit:
        return None
    return f"File too large ({size_bytes} bytes, max {limit})"
def _storage_extension(filename: str) -> str:
    """Return a storage-key-safe extension for *filename*, or ``"bin"``."""
    _, dot, ext = filename.rpartition(".")
    # The filename is presumably client-supplied (verify against callers), so
    # only a short alphanumeric suffix is accepted. This keeps "/" and other
    # separators out of the key and avoids keys ending in a bare ".".
    if dot and ext.isalnum() and len(ext) <= 16:
        return ext
    return "bin"


async def upload_file(
    file_data: bytes,
    filename: str,
    content_type: str,
    account_id: str,
) -> str:
    """Upload a file to the storage bucket and return its storage key.

    The key has the form ``uploads/{account_id}/{uuid4}.{ext}``, where ``ext``
    is the filename's extension when it is a short alphanumeric suffix and
    ``"bin"`` otherwise.

    Args:
        file_data: Raw file contents.
        filename: Original filename; only its extension is used.
        content_type: MIME type stored as the object's ``ContentType``.
        account_id: Account identifier used as the key prefix.

    Returns:
        The storage key the object was written under.

    Raises:
        RuntimeError: If object storage is not configured (from ``_get_client``).
    """
    ext = _storage_extension(filename)
    storage_key = f"uploads/{account_id}/{uuid.uuid4()}.{ext}"

    # Create the client on the calling thread, then run the blocking network
    # transfer in a worker thread so the event loop is not stalled.
    client = _get_client()
    await asyncio.to_thread(
        client.upload_fileobj,
        BytesIO(file_data),
        settings.STORAGE_BUCKET_NAME,
        storage_key,
        ExtraArgs={"ContentType": content_type},
    )
    return storage_key
def download_file(storage_key: str) -> bytes:
    """Fetch the object stored under ``storage_key`` and return its bytes.

    This is a synchronous (blocking) call. Errors raised by the client or by
    ``_get_client`` propagate to the caller.
    """
    sink = BytesIO()
    _get_client().download_fileobj(settings.STORAGE_BUCKET_NAME, storage_key, sink)
    return sink.getvalue()
def get_presigned_url(storage_key: str) -> str:
    """Return a presigned ``get_object`` URL for ``storage_key``.

    The URL expires after ``PRESIGNED_URL_EXPIRY`` seconds.
    """
    s3 = _get_client()
    request_params = {
        "Bucket": settings.STORAGE_BUCKET_NAME,
        "Key": storage_key,
    }
    url = s3.generate_presigned_url(
        "get_object",
        Params=request_params,
        ExpiresIn=PRESIGNED_URL_EXPIRY,
    )
    return url
async def delete_file(storage_key: str) -> None:
    """Delete the object stored under ``storage_key`` (best effort).

    A ``ClientError`` from the storage backend is logged, with its traceback,
    and swallowed, so a failed delete does not fail the caller. Other errors
    propagate, including ``RuntimeError`` from ``_get_client`` when storage
    is not configured.
    """
    try:
        client = _get_client()
        # delete_object is a blocking network call; run it in a worker thread
        # so the event loop stays responsive.
        await asyncio.to_thread(
            client.delete_object,
            Bucket=settings.STORAGE_BUCKET_NAME,
            Key=storage_key,
        )
    except ClientError:
        # exc_info keeps the backend's error code/message in the log.
        logger.warning("Failed to delete S3 object: %s", storage_key, exc_info=True)
# ── Vision helpers (resize + fetch for AI) ─────────────────────
# Claude vision costs: (width × height) / 750 tokens per image.
# Claude auto-resizes images >1568px on the longest edge.
# We resize server-side to avoid sending multi-MB base64 payloads over the wire.

# Longest edge, in pixels, that images are scaled down to before being sent
# (Claude's max efficient resolution).
MAX_IMAGE_DIMENSION = 1568
# Maximum number of images attached to one message; caps the token budget.
MAX_IMAGES_PER_MESSAGE = 3
def resize_image_for_vision(file_data: bytes, content_type: str) -> tuple[bytes, str]:
    """Shrink an image for Claude vision and return ``(image_bytes, media_type)``.

    Images whose longest edge exceeds ``MAX_IMAGE_DIMENSION`` are scaled down
    proportionally. PNGs in ``RGBA`` or ``P`` mode are converted to RGB and
    re-encoded as JPEG. The re-encoded image is used only when it is smaller
    than the input; otherwise the original bytes and content type are returned.

    The returned bytes are encoded in the format named by the returned media
    type for the four supported image types.

    Args:
        file_data: Raw encoded image bytes.
        content_type: MIME type declared for ``file_data``.

    Returns:
        ``(bytes, media_type)``. Falls back to ``(file_data, content_type)``
        when Pillow is not installed or any step of the re-encode fails.
    """
    # Pillow encoder name for each media type this function may emit. The
    # encoder is chosen from the media type we return, not from img.format:
    # resize()/convert() return new images whose .format is None, which would
    # otherwise write e.g. a resized GIF as PNG while still labelling it GIF.
    save_formats = {
        "image/jpeg": "JPEG",
        "image/png": "PNG",
        "image/gif": "GIF",
        "image/webp": "WEBP",
    }
    try:
        from PIL import Image

        img = Image.open(BytesIO(file_data))
        source_format = img.format  # capture before resize/convert drop it
        w, h = img.size

        # Only resize if larger than Claude's max efficient dimension
        longest_edge = max(w, h)
        if longest_edge > MAX_IMAGE_DIMENSION:
            ratio = MAX_IMAGE_DIMENSION / longest_edge
            # Clamp to 1px so extreme aspect ratios don't round a side to 0,
            # which Pillow rejects.
            new_size = (max(1, int(w * ratio)), max(1, int(h * ratio)))
            img = img.resize(new_size, Image.LANCZOS)

        # Convert RGBA/palette PNGs (common in screenshots) to RGB for JPEG
        out_type = content_type
        if img.mode in ("RGBA", "P") and content_type == "image/png":
            img = img.convert("RGB")
            out_type = "image/jpeg"

        buf = BytesIO()
        if out_type == "image/jpeg":
            img.save(buf, format="JPEG", quality=85, optimize=True)
        else:
            encoder = save_formats.get(out_type) or source_format or "PNG"
            img.save(buf, format=encoder, optimize=True)
        result = buf.getvalue()

        # Only use the re-encoded version if it's actually smaller
        if len(result) < len(file_data):
            return result, out_type
        return file_data, content_type
    except ImportError:
        # Pillow not installed — send original (Claude auto-resizes)
        logger.debug("Pillow not available, sending original image to Claude")
        return file_data, content_type
    except Exception:
        # Best effort: any decode/encode failure falls back to the original.
        logger.warning("Image resize failed, sending original", exc_info=True)
        return file_data, content_type
async def fetch_upload_images(
    upload_ids: list[UUID],
    account_id: UUID,
    db: Any,
) -> list[dict[str, Any]]:
    """Load image uploads from storage as base64 payloads for Claude vision.

    Only uploads that belong to ``account_id`` and have an image content type
    are considered. Candidates follow the order of ``upload_ids`` and are
    capped at ``MAX_IMAGES_PER_MESSAGE`` *after* that filtering, so non-image
    IDs mixed into ``upload_ids`` cannot crowd images out of the cap. Each
    image is resized server-side to reduce network payload and token cost
    (~1,600 tokens per full-res image).

    Args:
        upload_ids: IDs of uploads to attach; may include non-image uploads.
        account_id: Account that must own the uploads.
        db: Object with an awaitable ``execute()`` whose result supports
            ``.scalars().all()`` (presumably an async SQLAlchemy session).

    Returns:
        A list of ``{"media_type": ..., "data": <base64 str>}`` dicts. Uploads
        that fail to download or encode are skipped with a warning. Empty when
        ``upload_ids`` is empty or storage is not configured.
    """
    if not upload_ids or not settings.STORAGE_ENDPOINT:
        return []

    from sqlalchemy import select

    from app.models.file_upload import FileUpload

    result = await db.execute(
        select(FileUpload).where(
            FileUpload.id.in_(upload_ids),
            FileUpload.account_id == account_id,
            FileUpload.content_type.in_(ALLOWED_IMAGE_TYPES),
        )
    )

    # The query has no ORDER BY, so sort rows by where their ID first appears
    # in the caller's list; this makes the cap below deterministic.
    position: dict[Any, int] = {}
    for index, upload_id in enumerate(upload_ids):
        position.setdefault(upload_id, index)
    uploads = sorted(
        result.scalars().all(),
        key=lambda row: position.get(row.id, len(upload_ids)),
    )

    # Cap the number of images to limit token cost
    if len(uploads) > MAX_IMAGES_PER_MESSAGE:
        logger.info(
            "Capped images from %d to %d for token budget",
            len(uploads), MAX_IMAGES_PER_MESSAGE,
        )
        uploads = uploads[:MAX_IMAGES_PER_MESSAGE]

    images: list[dict[str, Any]] = []
    for upload in uploads:
        try:
            # NOTE: download_file and the resize are synchronous and run on
            # the event loop thread.
            file_data = download_file(upload.storage_key)
            resized_data, media_type = resize_image_for_vision(
                file_data, upload.content_type
            )
            images.append({
                "media_type": media_type,
                "data": base64.b64encode(resized_data).decode("ascii"),
            })
        except Exception:
            # Best effort: one bad image must not drop the others.
            logger.warning(
                "Failed to fetch upload %s from S3", upload.id, exc_info=True
            )
    return images
# Content types handled as text documents: the document types (PDF, DOCX)
# plus the plain-text types.
DOCUMENT_CONTENT_TYPES = ALLOWED_DOCUMENT_TYPES | ALLOWED_TEXT_TYPES
# Cap total injected text to control token cost. Counted in characters and
# summed across all documents returned for one request.
MAX_DOCUMENT_CONTEXT_CHARS = 10_000
async def fetch_upload_documents(
    upload_ids: list[UUID],
    account_id: UUID,
    db: Any,
) -> list[dict[str, str]]:
    """Fetch text content for non-image uploads (PDF, DOCX, text files).

    Text comes from ``FileUpload.extracted_content`` (populated elsewhere, at
    upload time). For text-type uploads with no extracted content, the file
    is downloaded from storage and decoded as UTF-8, falling back to Latin-1.
    Uploads with no text at all are represented by a placeholder note.

    The combined length of real document text is capped at
    ``MAX_DOCUMENT_CONTEXT_CHARS``; the last included document is truncated
    to fit and processing stops once the budget is spent. Placeholder notes
    do not count towards the budget.

    Args:
        upload_ids: IDs of uploads to include; image uploads are ignored.
        account_id: Account that must own the uploads.
        db: Object with an awaitable ``execute()`` whose result supports
            ``.scalars().all()`` (presumably an async SQLAlchemy session).

    Returns:
        A list of dicts with ``filename``, ``content_type`` and ``text`` keys.
        Text uploads whose storage download fails are skipped with a warning.
    """
    if not upload_ids:
        return []

    from sqlalchemy import select

    from app.models.file_upload import FileUpload

    result = await db.execute(
        select(FileUpload).where(
            FileUpload.id.in_(upload_ids),
            FileUpload.account_id == account_id,
            FileUpload.content_type.in_(DOCUMENT_CONTENT_TYPES),
        )
    )
    uploads = result.scalars().all()

    documents: list[dict[str, str]] = []
    total_chars = 0
    for upload in uploads:
        text = upload.extracted_content or ""

        # Fallback: for text files without extracted_content, fetch from S3
        needs_download = (
            not text
            and upload.content_type in ALLOWED_TEXT_TYPES
            and settings.STORAGE_ENDPOINT
        )
        if needs_download:
            try:
                file_data = download_file(upload.storage_key)
                try:
                    text = file_data.decode("utf-8")
                except UnicodeDecodeError:
                    # Latin-1 maps every byte, so this decode cannot fail.
                    text = file_data.decode("latin-1")
            except Exception:
                logger.warning(
                    "Failed to fetch text upload %s from S3",
                    upload.id,
                    exc_info=True,
                )
                continue

        if not text:
            # Document with no extractable text — include a note so AI knows
            documents.append({
                "filename": upload.filename,
                "content_type": upload.content_type,
                "text": f"[Attached file: {upload.filename} — no extractable text content]",
            })
            continue

        # Cap per-document and total to control token budget. The slice below
        # is the only truncation needed: `remaining` never exceeds the cap.
        remaining = MAX_DOCUMENT_CONTEXT_CHARS - total_chars
        if remaining <= 0:
            break
        truncated = text[:remaining]
        total_chars += len(truncated)
        documents.append({
            "filename": upload.filename,
            "content_type": upload.content_type,
            "text": truncated,
        })
    return documents