feat: wire PDF and text file content into AI chat messages
PDF uploads were stored in S3 and had their text extracted during upload, but fetch_upload_images() filtered exclusively for image MIME types, so document content never reached the AI.

- Add fetch_upload_documents() in storage_service.py to retrieve extracted_content for PDFs and text files
- Update the ai_sessions.py chat endpoint to call both fetch_upload_images() and fetch_upload_documents(), injecting document text as context
- Add PDF text extraction in _generate_ai_description (pypdf)
- Add pypdf>=4.0.0 to requirements.txt
- Fix test_db teardown to avoid connection pool issues
- Add 5 tests for fetch_upload_documents

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,10 +16,12 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
# MIME types accepted for upload, grouped by how the file is handled.
ALLOWED_IMAGE_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"}
ALLOWED_TEXT_TYPES = {"text/plain", "text/csv", "application/octet-stream"}
ALLOWED_DOCUMENT_TYPES = {"application/pdf"}
# Union of every accepted type; defined once so there is no stale
# intermediate value that omits the document types.
ALLOWED_TYPES = ALLOWED_IMAGE_TYPES | ALLOWED_TEXT_TYPES | ALLOWED_DOCUMENT_TYPES

# Per-file size limits in bytes, one per content-type group.
MAX_IMAGE_SIZE = 5 * 1024 * 1024  # 5MB
MAX_TEXT_SIZE = 1 * 1024 * 1024  # 1MB
MAX_DOCUMENT_SIZE = 10 * 1024 * 1024  # 10MB

# Per-session limits. NOTE(review): enforcement is not visible in this
# excerpt; the names suggest a file-count cap and a total-bytes cap.
MAX_FILES_PER_SESSION = 20
MAX_BYTES_PER_SESSION = 50 * 1024 * 1024  # 50MB
|
||||
|
||||
def validate_upload(content_type: str, size_bytes: int) -> str | None:
    """Validate file type and size. Returns error message or None.

    Args:
        content_type: MIME type reported for the upload.
        size_bytes: Size of the upload in bytes.

    Returns:
        A human-readable error message when the type is not in
        ALLOWED_TYPES or the size exceeds the limit for its type group;
        None when the upload is acceptable.
    """
    if content_type not in ALLOWED_TYPES:
        return f"File type {content_type} not allowed"

    # Pick the size limit for the type group. Anything that is neither an
    # image nor a document is an allowed text type at this point.
    if content_type in ALLOWED_IMAGE_TYPES:
        max_size = MAX_IMAGE_SIZE
    elif content_type in ALLOWED_DOCUMENT_TYPES:
        max_size = MAX_DOCUMENT_SIZE
    else:
        max_size = MAX_TEXT_SIZE

    if size_bytes > max_size:
        return f"File too large ({size_bytes} bytes, max {max_size})"
    return None
|
||||
@@ -199,3 +206,77 @@ async def fetch_upload_images(
|
||||
except Exception:
|
||||
logger.warning("Failed to fetch upload %s from S3", upload.id)
|
||||
return images
|
||||
|
||||
|
||||
# Content types treated as text-bearing documents: the text-like upload
# types plus PDFs.
DOCUMENT_CONTENT_TYPES = ALLOWED_TEXT_TYPES.union(ALLOWED_DOCUMENT_TYPES)

# Cap total injected text to control token cost
MAX_DOCUMENT_CONTEXT_CHARS = 10 * 1000
|
||||
|
||||
|
||||
async def fetch_upload_documents(
    upload_ids: list[UUID],
    account_id: UUID,
    db: Any,
) -> list[dict[str, str]]:
    """Fetch extracted text content for non-image uploads (PDFs, text files).

    Text is sourced from the FileUpload.extracted_content field. For text
    types with no extracted content, the file is downloaded from storage
    and decoded (UTF-8 first, then latin-1) when a storage endpoint is
    configured.

    Documents are processed in the order their ids appear in ``upload_ids``
    so that the shared character budget is spent deterministically; the
    database query itself carries no ordering.

    Args:
        upload_ids: Ids of the uploads to look up. An empty list
            short-circuits before any import or query.
        account_id: Only uploads belonging to this account are returned.
        db: Async database session exposing ``execute``.

    Returns:
        A list of dicts with 'filename', 'content_type', and 'text' keys.
        The combined length of real document text is capped at
        MAX_DOCUMENT_CONTEXT_CHARS; once the budget is used up, processing
        stops at the next upload that has text. Uploads with no text at
        all are represented by a short placeholder note that does not
        count against the budget. Uploads whose storage download fails
        are skipped.
    """
    if not upload_ids:
        return []

    # Imported lazily, as in the original block.
    from sqlalchemy import select
    from app.models.file_upload import FileUpload

    result = await db.execute(
        select(FileUpload).where(
            FileUpload.id.in_(upload_ids),
            FileUpload.account_id == account_id,
            FileUpload.content_type.in_(DOCUMENT_CONTENT_TYPES),
        )
    )

    # An IN (...) query returns rows in no particular order. Sort them to
    # follow the caller's id order so truncation is reproducible. Rows
    # whose id is not found in the mapping keep their relative order at
    # the end (sorted() is stable).
    position = {upload_id: index for index, upload_id in enumerate(upload_ids)}
    uploads = sorted(
        result.scalars().all(),
        key=lambda upload: position.get(upload.id, len(position)),
    )

    documents: list[dict[str, str]] = []
    total_chars = 0
    for upload in uploads:
        text = upload.extracted_content or ""

        # Fallback: for text files without extracted_content, fetch from S3
        if not text and upload.content_type in ALLOWED_TEXT_TYPES and settings.STORAGE_ENDPOINT:
            try:
                # NOTE(review): download_file is called without await, so it
                # appears to be a blocking call inside this coroutine -
                # confirm whether it should be moved off the event loop.
                file_data = download_file(upload.storage_key)
                try:
                    text = file_data.decode("utf-8")
                except UnicodeDecodeError:
                    # latin-1 maps every byte, so this decode cannot fail.
                    text = file_data.decode("latin-1")
                text = text[:MAX_DOCUMENT_CONTEXT_CHARS]
            except Exception:
                # Best-effort: skip this upload, but keep the traceback so
                # the failure can be diagnosed.
                logger.warning(
                    "Failed to fetch text upload %s from S3",
                    upload.id,
                    exc_info=True,
                )
                continue

        if not text:
            # No text available (e.g. a PDF with no extractable text, or an
            # empty file) - include a note so the AI knows a file was attached.
            documents.append({
                "filename": upload.filename,
                "content_type": upload.content_type,
                "text": f"[Attached file: {upload.filename} — no extractable text content]",
            })
            continue

        # Cap per-document and total to control token budget
        remaining = MAX_DOCUMENT_CONTEXT_CHARS - total_chars
        if remaining <= 0:
            break
        truncated = text[:remaining]
        total_chars += len(truncated)

        documents.append({
            "filename": upload.filename,
            "content_type": upload.content_type,
            "text": truncated,
        })

    return documents
|
||||
|
||||
Reference in New Issue
Block a user