diff --git a/backend/app/api/endpoints/uploads.py b/backend/app/api/endpoints/uploads.py index e6dcee12..eb9d0e38 100644 --- a/backend/app/api/endpoints/uploads.py +++ b/backend/app/api/endpoints/uploads.py @@ -35,6 +35,29 @@ def _check_storage_configured() -> None: ) +async def _store_document_content(upload, text_content: str, doc_type: str) -> None: + """Store extracted document text and optionally generate an AI summary.""" + from app.services.assistant_chat_service import _call_ai + + if text_content: + upload.extracted_content = text_content[:10000] + + if len(text_content) > 2000: + summary, _, _ = await _call_ai( + system_base="You are a technical document analyst for IT troubleshooting.", + rag_context="", + history=[], + new_message=f"Summarize this {doc_type} content in 2-3 sentences:\n\n{text_content[:5000]}", + max_tokens=200, + ) + upload.content_summary = summary + upload.ai_description = summary + else: + upload.ai_description = f"{doc_type}: {upload.filename}" + else: + upload.ai_description = f"{doc_type} (no extractable text): {upload.filename}" + + async def _generate_ai_description(upload_id: UUID, file_data: bytes, content_type: str) -> None: """Background task: generate AI description for uploaded file.""" try: @@ -77,23 +100,22 @@ async def _generate_ai_description(upload_id: UUID, file_data: bytes, content_ty logger.warning("PDF text extraction failed for upload %s", upload_id) text_content = "" - if text_content: - upload.extracted_content = text_content[:10000] + await _store_document_content(upload, text_content, "PDF") - if len(text_content) > 2000: - summary, _, _ = await _call_ai( - system_base="You are a technical document analyst for IT troubleshooting.", - rag_context="", - history=[], - new_message=f"Summarize this PDF content in 2-3 sentences:\n\n{text_content[:5000]}", - max_tokens=200, - ) - upload.content_summary = summary - upload.ai_description = summary - else: - upload.ai_description = f"PDF document: {upload.filename}" - else: - upload.ai_description = f"PDF document (no extractable text): {upload.filename}" + elif content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document": + try: + from docx import Document as DocxDocument + import io as _io + + doc = DocxDocument(_io.BytesIO(file_data)) + text_content = "\n\n".join( + p.text for p in doc.paragraphs if p.text.strip() + ) + except Exception: + logger.warning("DOCX text extraction failed for upload %s", upload_id) + text_content = "" + + await _store_document_content(upload, text_content, "Word document") elif content_type.startswith("text/") or content_type in ( "application/json", "application/xml", "application/yaml", diff --git a/backend/app/services/storage_service.py b/backend/app/services/storage_service.py index f0aae308..7d6d2fe4 100644 --- a/backend/app/services/storage_service.py +++ b/backend/app/services/storage_service.py @@ -16,7 +16,8 @@ logger = logging.getLogger(__name__) ALLOWED_IMAGE_TYPES = {"image/png", "image/jpeg", "image/gif", "image/webp"} ALLOWED_TEXT_TYPES = {"text/plain", "text/csv", "application/octet-stream"} -ALLOWED_DOCUMENT_TYPES = {"application/pdf"} +DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" +ALLOWED_DOCUMENT_TYPES = {"application/pdf", DOCX_MIME} ALLOWED_TYPES = ALLOWED_IMAGE_TYPES | ALLOWED_TEXT_TYPES | ALLOWED_DOCUMENT_TYPES MAX_IMAGE_SIZE = 5 * 1024 * 1024 # 5MB diff --git a/backend/requirements.txt b/backend/requirements.txt index 7ff10426..75c9b19f 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -60,3 +60,6 @@ Pillow>=10.0.0 # PDF text extraction (upload analysis) pypdf>=4.0.0 + +# DOCX text extraction (upload analysis) +python-docx>=1.1.0 diff --git a/backend/tests/test_uploads.py b/backend/tests/test_uploads.py index 8e3149f9..fa4f54cb 100644 --- a/backend/tests/test_uploads.py +++ b/backend/tests/test_uploads.py @@ -158,6 +158,31 @@ async def test_upload_accepts_pdf(client, auth_headers): assert data["content_type"] == "application/pdf" +@pytest.mark.asyncio +async def test_upload_accepts_docx(client, auth_headers): + """Upload accepts .docx files.""" + fake_key = f"uploads/acc/{uuid.uuid4()}.docx" + fake_url = "https://fake-s3.example.com/presigned?token=docx" + docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + + with patch("app.api.endpoints.uploads.settings") as mock_settings, \ + patch("app.api.endpoints.uploads.storage_service") as mock_storage: + mock_settings.STORAGE_ENDPOINT = "http://fake-s3" + mock_storage.validate_upload.return_value = None + mock_storage.MAX_FILES_PER_SESSION = 20 + mock_storage.MAX_BYTES_PER_SESSION = 50 * 1024 * 1024 + mock_storage.upload_file = AsyncMock(return_value=fake_key) + mock_storage.get_presigned_url.return_value = fake_url + + files = {"file": ("runbook.docx", io.BytesIO(b"PK\x03\x04 fake docx"), docx_mime)} + response = await client.post("/api/v1/uploads", files=files, headers=auth_headers) + + assert response.status_code == 201 + data = response.json() + assert data["filename"] == "runbook.docx" + assert data["content_type"] == docx_mime + + @pytest.mark.asyncio async def test_upload_rejects_oversized_pdf(client, auth_headers): """Upload rejects PDF files exceeding 10 MB.""" @@ -464,6 +489,40 @@ async def test_fetch_upload_documents_respects_account_filter(client, auth_heade assert len(docs) == 0 +@pytest.mark.asyncio +async def test_fetch_upload_documents_returns_docx_content(client, auth_headers, test_db): + """fetch_upload_documents returns extracted_content for DOCX uploads.""" + from app.models.file_upload import FileUpload + from app.models.user import User + from app.services.storage_service import fetch_upload_documents + from sqlalchemy import select + + result = await test_db.execute(select(User).where(User.email == "test@example.com")) + user = result.scalar_one() + + docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + upload = FileUpload( + account_id=user.account_id, + uploaded_by=user.id, + session_id=None, + filename="runbook.docx", + content_type=docx_mime, + size_bytes=8000, + storage_key=f"uploads/{user.account_id}/{uuid.uuid4()}.docx", + extracted_content="Step 1: Restart the service\n\nStep 2: Verify logs", + ) + test_db.add(upload) + await test_db.commit() + await test_db.refresh(upload) + + docs = await fetch_upload_documents([upload.id], user.account_id, test_db) + + assert len(docs) == 1 + assert docs[0]["filename"] == "runbook.docx" + assert docs[0]["content_type"] == docx_mime + assert "Restart the service" in docs[0]["text"] + + @pytest.mark.asyncio async def test_fetch_upload_documents_empty_ids(client, auth_headers, test_db): """Empty upload_ids returns empty list without querying DB."""