feat: wire PDF and text file content into AI chat messages

PDF uploads were stored in S3 and had text extracted during upload, but fetch_upload_images() filtered exclusively for image MIME types, so document content never reached the AI.

- Add fetch_upload_documents() in storage_service.py to retrieve extracted_content for PDFs and text files
- Update ai_sessions.py chat endpoint to call both fetch_upload_images and fetch_upload_documents, injecting document text as context
- Add PDF text extraction in _generate_ai_description (pypdf)
- Add pypdf>=4.0.0 to requirements.txt
- Fix test_db teardown to avoid connection pool issues
- Add 5 tests for fetch_upload_documents

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 21:02:56 +00:00
parent 3cea949519
commit 11de850054
6 changed files with 324 additions and 12 deletions
--- a/backend/app/api/endpoints/ai_sessions.py
+++ b/backend/app/api/endpoints/ai_sessions.py
@@ -280,18 +280,28 @@ async def send_chat_message(
    user_id = current_user.id
    account_id = current_user.account_id

-    # Fetch attached images from S3 (if any)
+    # Fetch attached uploads from S3 (if any)
    images = None
+    message = data.message
    if data.upload_ids:
-        from app.services.storage_service import fetch_upload_images
+        from app.services.storage_service import fetch_upload_images, fetch_upload_documents
        images = await fetch_upload_images(data.upload_ids, account_id, db) or None

+        # Inject document text (PDFs, text files) as context in the message
+        documents = await fetch_upload_documents(data.upload_ids, account_id, db)
+        if documents:
+            doc_parts = []
+            for doc in documents:
+                doc_parts.append(f"--- Attached file: {doc['filename']} ---\n{doc['text']}")
+            doc_context = "\n\n".join(doc_parts)
+            message = f"{message}\n\n[Attached document content]\n{doc_context}"
+
    try:
        ai_content, suggested_flows, session, fork_metadata, actions_data, questions_data = await unified_chat_service.send_chat_message(
            session_id=session_id,
            user_id=user_id,
            account_id=account_id,
-            message=data.message,
+            message=message,
            db=db,
            images=images,
        )
--- a/backend/app/api/endpoints/uploads.py
+++ b/backend/app/api/endpoints/uploads.py
@@ -61,6 +61,40 @@ async def _generate_ai_description(upload_id: UUID, file_data: bytes, content_ty
                    max_tokens=100,
                )
                upload.ai_description = description
+            elif content_type == "application/pdf":
+                try:
+                    from pypdf import PdfReader
+                    import io as _io
+
+                    reader = PdfReader(_io.BytesIO(file_data))
+                    pages_text = []
+                    for page in reader.pages:
+                        page_text = page.extract_text()
+                        if page_text:
+                            pages_text.append(page_text)
+                    text_content = "\n\n".join(pages_text)
+                except Exception:
+                    logger.warning("PDF text extraction failed for upload %s", upload_id)
+                    text_content = ""
+
+                if text_content:
+                    upload.extracted_content = text_content[:10000]
+
+                    if len(text_content) > 2000:
+                        summary, _, _ = await _call_ai(
+                            system_base="You are a technical document analyst for IT troubleshooting.",
+                            rag_context="",
+                            history=[],
+                            new_message=f"Summarize this PDF content in 2-3 sentences:\n\n{text_content[:5000]}",
+                            max_tokens=200,
+                        )
+                        upload.content_summary = summary
+                        upload.ai_description = summary
+                    else:
+                        upload.ai_description = f"PDF document: {upload.filename}"
+                else:
+                    upload.ai_description = f"PDF document (no extractable text): {upload.filename}"
+
            elif content_type.startswith("text/") or content_type in (
                "application/json", "application/xml", "application/yaml",
            ):
--- a/backend/app/services/storage_service.py
+++ b/backend/app/services/storage_service.py
@@ -16,10 +16,12 @@ logger = logging.getLogger(__name__)

 ALLOWED_IMAGE_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"}
 ALLOWED_TEXT_TYPES = {"text/plain", "text/csv", "application/octet-stream"}
-ALLOWED_TYPES = ALLOWED_IMAGE_TYPES | ALLOWED_TEXT_TYPES
+ALLOWED_DOCUMENT_TYPES = {"application/pdf"}
+ALLOWED_TYPES = ALLOWED_IMAGE_TYPES | ALLOWED_TEXT_TYPES | ALLOWED_DOCUMENT_TYPES

-MAX_IMAGE_SIZE = 5 * 1024 * 1024  # 5MB
-MAX_TEXT_SIZE = 1 * 1024 * 1024   # 1MB
+MAX_IMAGE_SIZE = 5 * 1024 * 1024      # 5MB
+MAX_TEXT_SIZE = 1 * 1024 * 1024        # 1MB
+MAX_DOCUMENT_SIZE = 10 * 1024 * 1024   # 10MB
 MAX_FILES_PER_SESSION = 20
 MAX_BYTES_PER_SESSION = 50 * 1024 * 1024  # 50MB

@@ -44,7 +46,12 @@ def validate_upload(content_type: str, size_bytes: int) -> str | None:
    """Validate file type and size. Returns error message or None."""
    if content_type not in ALLOWED_TYPES:
        return f"File type {content_type} not allowed"
-    max_size = MAX_IMAGE_SIZE if content_type in ALLOWED_IMAGE_TYPES else MAX_TEXT_SIZE
+    if content_type in ALLOWED_IMAGE_TYPES:
+        max_size = MAX_IMAGE_SIZE
+    elif content_type in ALLOWED_DOCUMENT_TYPES:
+        max_size = MAX_DOCUMENT_SIZE
+    else:
+        max_size = MAX_TEXT_SIZE
    if size_bytes > max_size:
        return f"File too large ({size_bytes} bytes, max {max_size})"
    return None
@@ -199,3 +206,77 @@ async def fetch_upload_images(
        except Exception:
            logger.warning("Failed to fetch upload %s from S3", upload.id)
    return images
+
+
+DOCUMENT_CONTENT_TYPES = ALLOWED_DOCUMENT_TYPES | ALLOWED_TEXT_TYPES
+MAX_DOCUMENT_CONTEXT_CHARS = 10_000  # Cap total injected text to control token cost
+
+
+async def fetch_upload_documents(
+    upload_ids: list[UUID],
+    account_id: UUID,
+    db: Any,
+) -> list[dict[str, str]]:
+    """Fetch extracted text content for non-image uploads (PDFs, text files).
+
+    Returns a list of dicts with 'filename', 'content_type', and 'text' keys,
+    ordered like ``upload_ids`` so the shared MAX_DOCUMENT_CONTEXT_CHARS budget
+    is spent deterministically. Text comes from FileUpload.extracted_content;
+    text files without it are downloaded from S3 and decoded (UTF-8, else latin-1).
+    """
+    if not upload_ids:
+        return []
+
+    from sqlalchemy import select
+    from app.models.file_upload import FileUpload
+
+    result = await db.execute(
+        select(FileUpload).where(
+            FileUpload.id.in_(upload_ids),
+            FileUpload.account_id == account_id,
+            FileUpload.content_type.in_(DOCUMENT_CONTENT_TYPES),
+        )
+    )
+    # The query has no ORDER BY, so restore the caller's attachment order.
+    rank = {upload_id: index for index, upload_id in enumerate(upload_ids)}
+    uploads = sorted(result.scalars().all(), key=lambda u: rank.get(u.id, len(rank)))
+
+    documents: list[dict[str, str]] = []
+    total_chars = 0
+    for upload in uploads:
+        text = upload.extracted_content or ""
+        # Fallback: for text files without extracted_content, fetch from S3
+        if not text and upload.content_type in ALLOWED_TEXT_TYPES and settings.STORAGE_ENDPOINT:
+            try:
+                file_data = download_file(upload.storage_key)
+                try:
+                    text = file_data.decode("utf-8")
+                except UnicodeDecodeError:
+                    text = file_data.decode("latin-1")
+            except Exception:
+                logger.warning("Failed to fetch text upload %s from S3", upload.id, exc_info=True)
+                continue
+
+        if not text:
+            # No text available (e.g. image-only PDF, empty file) — include a note so AI knows
+            documents.append({
+                "filename": upload.filename,
+                "content_type": upload.content_type,
+                "text": f"[Attached file: {upload.filename} — no extractable text content]",
+            })
+            continue
+
+        # Cap per-document and total to control token budget
+        remaining = MAX_DOCUMENT_CONTEXT_CHARS - total_chars
+        if remaining <= 0:
+            break
+        truncated = text[:remaining]
+        total_chars += len(truncated)
+
+        documents.append({
+            "filename": upload.filename,
+            "content_type": upload.content_type,
+            "text": truncated,
+        })
+
+    return documents