feat: add .docx upload support with text extraction

- Add DOCX MIME type to ALLOWED_DOCUMENT_TYPES in storage_service.py
- Add python-docx text extraction in _generate_ai_description
- Extract shared _store_document_content helper for PDF/DOCX
- Add python-docx>=1.1.0 to requirements.txt
- Add tests for docx upload acceptance and document fetch

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 21:08:12 +00:00
parent 11de850054
commit 217e70cb81
4 changed files with 102 additions and 17 deletions
--- a/backend/app/api/endpoints/uploads.py
+++ b/backend/app/api/endpoints/uploads.py
@@ -35,6 +35,29 @@ def _check_storage_configured() -> None:
        )


+async def _store_document_content(upload, text_content: str, doc_type: str) -> None:
+    """Store extracted document text and optionally generate an AI summary."""
+    from app.services.assistant_chat_service import _call_ai
+
+    if text_content:
+        upload.extracted_content = text_content[:10000]
+
+        if len(text_content) > 2000:
+            summary, _, _ = await _call_ai(
+                system_base="You are a technical document analyst for IT troubleshooting.",
+                rag_context="",
+                history=[],
+                new_message=f"Summarize this {doc_type} content in 2-3 sentences:\n\n{text_content[:5000]}",
+                max_tokens=200,
+            )
+            upload.content_summary = summary
+            upload.ai_description = summary
+        else:
+            upload.ai_description = f"{doc_type}: {upload.filename}"
+    else:
+        upload.ai_description = f"{doc_type} (no extractable text): {upload.filename}"
+
+
 async def _generate_ai_description(upload_id: UUID, file_data: bytes, content_type: str) -> None:
    """Background task: generate AI description for uploaded file."""
    try:
@@ -77,23 +100,22 @@ async def _generate_ai_description(upload_id: UUID, file_data: bytes, content_ty
                    logger.warning("PDF text extraction failed for upload %s", upload_id)
                    text_content = ""

-                if text_content:
-                    upload.extracted_content = text_content[:10000]
+                await _store_document_content(upload, text_content, "PDF")

-                    if len(text_content) > 2000:
-                        summary, _, _ = await _call_ai(
-                            system_base="You are a technical document analyst for IT troubleshooting.",
-                            rag_context="",
-                            history=[],
-                            new_message=f"Summarize this PDF content in 2-3 sentences:\n\n{text_content[:5000]}",
-                            max_tokens=200,
-                        )
-                        upload.content_summary = summary
-                        upload.ai_description = summary
-                    else:
-                        upload.ai_description = f"PDF document: {upload.filename}"
-                else:
-                    upload.ai_description = f"PDF document (no extractable text): {upload.filename}"
+            elif content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+                try:
+                    from docx import Document as DocxDocument
+                    import io as _io
+
+                    doc = DocxDocument(_io.BytesIO(file_data))
+                    text_content = "\n\n".join(
+                        p.text for p in doc.paragraphs if p.text.strip()
+                    )
+                except Exception:
+                    logger.warning("DOCX text extraction failed for upload %s", upload_id)
+                    text_content = ""
+
+                await _store_document_content(upload, text_content, "Word document")

            elif content_type.startswith("text/") or content_type in (
                "application/json", "application/xml", "application/yaml",
--- a/backend/app/services/storage_service.py
+++ b/backend/app/services/storage_service.py
@@ -16,7 +16,8 @@ logger = logging.getLogger(__name__)

 ALLOWED_IMAGE_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"}
 ALLOWED_TEXT_TYPES = {"text/plain", "text/csv", "application/octet-stream"}
-ALLOWED_DOCUMENT_TYPES = {"application/pdf"}
+DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+ALLOWED_DOCUMENT_TYPES = {"application/pdf", DOCX_MIME}
 ALLOWED_TYPES = ALLOWED_IMAGE_TYPES | ALLOWED_TEXT_TYPES | ALLOWED_DOCUMENT_TYPES

 MAX_IMAGE_SIZE = 5 * 1024 * 1024      # 5MB
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -60,3 +60,6 @@ Pillow>=10.0.0

 # PDF text extraction (upload analysis)
 pypdf>=4.0.0
+
+# DOCX text extraction (upload analysis)
+python-docx>=1.1.0
--- a/backend/tests/test_uploads.py
+++ b/backend/tests/test_uploads.py
@@ -158,6 +158,31 @@ async def test_upload_accepts_pdf(client, auth_headers):
    assert data["content_type"] == "application/pdf"


+@pytest.mark.asyncio
+async def test_upload_accepts_docx(client, auth_headers):
+    """Upload accepts .docx files."""
+    fake_key = f"uploads/acc/{uuid.uuid4()}.docx"
+    fake_url = "https://fake-s3.example.com/presigned?token=docx"
+    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+    with patch("app.api.endpoints.uploads.settings") as mock_settings, \
+         patch("app.api.endpoints.uploads.storage_service") as mock_storage:
+        mock_settings.STORAGE_ENDPOINT = "http://fake-s3"
+        mock_storage.validate_upload.return_value = None
+        mock_storage.MAX_FILES_PER_SESSION = 20
+        mock_storage.MAX_BYTES_PER_SESSION = 50 * 1024 * 1024
+        mock_storage.upload_file = AsyncMock(return_value=fake_key)
+        mock_storage.get_presigned_url.return_value = fake_url
+
+        files = {"file": ("runbook.docx", io.BytesIO(b"PK\x03\x04 fake docx"), docx_mime)}
+        response = await client.post("/api/v1/uploads", files=files, headers=auth_headers)
+
+    assert response.status_code == 201
+    data = response.json()
+    assert data["filename"] == "runbook.docx"
+    assert data["content_type"] == docx_mime
+
+
@pytest.mark.asyncio
 async def test_upload_rejects_oversized_pdf(client, auth_headers):
    """Upload rejects PDF files exceeding 10 MB."""
@@ -464,6 +489,40 @@ async def test_fetch_upload_documents_respects_account_filter(client, auth_heade
    assert len(docs) == 0


+@pytest.mark.asyncio
+async def test_fetch_upload_documents_returns_docx_content(client, auth_headers, test_db):
+    """fetch_upload_documents returns extracted_content for DOCX uploads."""
+    from app.models.file_upload import FileUpload
+    from app.models.user import User
+    from app.services.storage_service import fetch_upload_documents
+    from sqlalchemy import select
+
+    result = await test_db.execute(select(User).where(User.email == "test@example.com"))
+    user = result.scalar_one()
+
+    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    upload = FileUpload(
+        account_id=user.account_id,
+        uploaded_by=user.id,
+        session_id=None,
+        filename="runbook.docx",
+        content_type=docx_mime,
+        size_bytes=8000,
+        storage_key=f"uploads/{user.account_id}/{uuid.uuid4()}.docx",
+        extracted_content="Step 1: Restart the service\n\nStep 2: Verify logs",
+    )
+    test_db.add(upload)
+    await test_db.commit()
+    await test_db.refresh(upload)
+
+    docs = await fetch_upload_documents([upload.id], user.account_id, test_db)
+
+    assert len(docs) == 1
+    assert docs[0]["filename"] == "runbook.docx"
+    assert docs[0]["content_type"] == docx_mime
+    assert "Restart the service" in docs[0]["text"]
+
+
@pytest.mark.asyncio
 async def test_fetch_upload_documents_empty_ids(client, auth_headers, test_db):
    """Empty upload_ids returns empty list without querying DB."""