feat: add .docx upload support with text extraction

- Add DOCX MIME type to ALLOWED_DOCUMENT_TYPES in storage_service.py
- Add python-docx text extraction in _generate_ai_description
- Extract shared _store_document_content helper for PDF/DOCX
- Add python-docx>=1.1.0 to requirements.txt
- Add tests for docx upload acceptance and document fetch

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
chihlasm
2026-03-27 21:08:12 +00:00
parent 11de850054
commit 217e70cb81
4 changed files with 102 additions and 17 deletions

View File

@@ -35,6 +35,29 @@ def _check_storage_configured() -> None:
)
async def _store_document_content(upload, text_content: str, doc_type: str) -> None:
"""Store extracted document text and optionally generate an AI summary."""
from app.services.assistant_chat_service import _call_ai
if text_content:
upload.extracted_content = text_content[:10000]
if len(text_content) > 2000:
summary, _, _ = await _call_ai(
system_base="You are a technical document analyst for IT troubleshooting.",
rag_context="",
history=[],
new_message=f"Summarize this {doc_type} content in 2-3 sentences:\n\n{text_content[:5000]}",
max_tokens=200,
)
upload.content_summary = summary
upload.ai_description = summary
else:
upload.ai_description = f"{doc_type}: {upload.filename}"
else:
upload.ai_description = f"{doc_type} (no extractable text): {upload.filename}"
async def _generate_ai_description(upload_id: UUID, file_data: bytes, content_type: str) -> None:
"""Background task: generate AI description for uploaded file."""
try:
@@ -77,23 +100,22 @@ async def _generate_ai_description(upload_id: UUID, file_data: bytes, content_ty
logger.warning("PDF text extraction failed for upload %s", upload_id)
text_content = ""
if text_content:
upload.extracted_content = text_content[:10000]
await _store_document_content(upload, text_content, "PDF")
if len(text_content) > 2000:
summary, _, _ = await _call_ai(
system_base="You are a technical document analyst for IT troubleshooting.",
rag_context="",
history=[],
new_message=f"Summarize this PDF content in 2-3 sentences:\n\n{text_content[:5000]}",
max_tokens=200,
)
upload.content_summary = summary
upload.ai_description = summary
else:
upload.ai_description = f"PDF document: {upload.filename}"
else:
upload.ai_description = f"PDF document (no extractable text): {upload.filename}"
elif content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
try:
from docx import Document as DocxDocument
import io as _io
doc = DocxDocument(_io.BytesIO(file_data))
text_content = "\n\n".join(
p.text for p in doc.paragraphs if p.text.strip()
)
except Exception:
logger.warning("DOCX text extraction failed for upload %s", upload_id)
text_content = ""
await _store_document_content(upload, text_content, "Word document")
elif content_type.startswith("text/") or content_type in (
"application/json", "application/xml", "application/yaml",