feat: add .docx upload support with text extraction

- Add DOCX MIME type to ALLOWED_DOCUMENT_TYPES in storage_service.py
- Add python-docx text extraction in _generate_ai_description
- Extract shared _store_document_content helper for PDF/DOCX
- Add python-docx>=1.1.0 to requirements.txt
- Add tests for docx upload acceptance and document fetch

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
chihlasm
2026-03-27 21:08:12 +00:00
parent 11de850054
commit 217e70cb81
4 changed files with 102 additions and 17 deletions

View File

@@ -158,6 +158,31 @@ async def test_upload_accepts_pdf(client, auth_headers):
assert data["content_type"] == "application/pdf"
@pytest.mark.asyncio
async def test_upload_accepts_docx(client, auth_headers):
    """POSTing a .docx to /api/v1/uploads yields 201 and echoes name and MIME type.

    The ``settings`` and ``storage_service`` names in
    ``app.api.endpoints.uploads`` are replaced with mocks for the duration
    of the request.
    """
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    stored_key = f"uploads/acc/{uuid.uuid4()}.docx"
    presigned = "https://fake-s3.example.com/presigned?token=docx"

    settings_patcher = patch("app.api.endpoints.uploads.settings")
    storage_patcher = patch("app.api.endpoints.uploads.storage_service")
    with settings_patcher as mock_settings, storage_patcher as mock_storage:
        mock_settings.STORAGE_ENDPOINT = "http://fake-s3"

        # Session limits and validation stubbed on the mocked storage service.
        mock_storage.MAX_FILES_PER_SESSION = 20
        mock_storage.MAX_BYTES_PER_SESSION = 50 * 1024 * 1024
        mock_storage.validate_upload.return_value = None
        mock_storage.upload_file = AsyncMock(return_value=stored_key)
        mock_storage.get_presigned_url.return_value = presigned

        # Payload starts with the ZIP magic bytes ("PK\x03\x04"); the rest is filler.
        body = io.BytesIO(b"PK\x03\x04 fake docx")
        response = await client.post(
            "/api/v1/uploads",
            files={"file": ("runbook.docx", body, docx_mime)},
            headers=auth_headers,
        )

    assert response.status_code == 201
    payload = response.json()
    assert payload["filename"] == "runbook.docx"
    assert payload["content_type"] == docx_mime
@pytest.mark.asyncio
async def test_upload_rejects_oversized_pdf(client, auth_headers):
"""Upload rejects PDF files exceeding 10 MB."""
@@ -464,6 +489,40 @@ async def test_fetch_upload_documents_respects_account_filter(client, auth_heade
assert len(docs) == 0
@pytest.mark.asyncio
async def test_fetch_upload_documents_returns_docx_content(client, auth_headers, test_db):
    """A stored DOCX upload's extracted_content is surfaced under the "text" key.

    Inserts a FileUpload row owned by the test@example.com user, then checks
    that fetch_upload_documents returns exactly one document carrying the
    original filename, the DOCX content type, and the extracted text.
    """
    from sqlalchemy import select

    from app.models.file_upload import FileUpload
    from app.models.user import User
    from app.services.storage_service import fetch_upload_documents

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    owner_query = select(User).where(User.email == "test@example.com")
    owner = (await test_db.execute(owner_query)).scalar_one()

    docx_upload = FileUpload(
        account_id=owner.account_id,
        uploaded_by=owner.id,
        session_id=None,
        filename="runbook.docx",
        content_type=docx_mime,
        size_bytes=8000,
        storage_key=f"uploads/{owner.account_id}/{uuid.uuid4()}.docx",
        extracted_content="Step 1: Restart the service\n\nStep 2: Verify logs",
    )
    test_db.add(docx_upload)
    await test_db.commit()
    # Refresh after commit so the row's id is loaded before it is used below.
    await test_db.refresh(docx_upload)

    fetched = await fetch_upload_documents([docx_upload.id], owner.account_id, test_db)

    assert len(fetched) == 1
    document = fetched[0]
    assert document["filename"] == "runbook.docx"
    assert document["content_type"] == docx_mime
    assert "Restart the service" in document["text"]
@pytest.mark.asyncio
async def test_fetch_upload_documents_empty_ids(client, auth_headers, test_db):
"""Empty upload_ids returns empty list without querying DB."""