feat: wire PDF and text file content into AI chat messages

PDF uploads were stored in S3 and had text extracted during upload, but fetch_upload_images() filtered exclusively for image MIME types, so document content never reached the AI.

- Add fetch_upload_documents() in storage_service.py to retrieve extracted_content for PDFs and text files
- Update ai_sessions.py chat endpoint to call both fetch_upload_images and fetch_upload_documents, injecting document text as context
- Add PDF text extraction in _generate_ai_description (pypdf)
- Add pypdf>=4.0.0 to requirements.txt
- Fix test_db teardown to avoid connection pool issues
- Add 5 tests for fetch_upload_documents

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-27 21:02:56 +00:00
parent 3cea949519
commit 11de850054
6 changed files with 324 additions and 12 deletions
--- a/backend/tests/test_uploads.py
+++ b/backend/tests/test_uploads.py
@@ -134,6 +134,42 @@ async def test_upload_rejects_oversized_text(client, auth_headers):
    assert "too large" in response.json()["detail"].lower()


+@pytest.mark.asyncio
+async def test_upload_accepts_pdf(client, auth_headers):
+    """Upload accepts application/pdf files (regression: was rejected with 400)."""
+    fake_key = f"uploads/acc/{uuid.uuid4()}.pdf"  # value the mocked upload_file resolves to
+    fake_url = "https://fake-s3.example.com/presigned?token=pdf"  # value the mocked get_presigned_url returns
+
+    with patch("app.api.endpoints.uploads.settings") as mock_settings, \
+         patch("app.api.endpoints.uploads.storage_service") as mock_storage:  # both names are replaced with mocks for the duration of the request
+        mock_settings.STORAGE_ENDPOINT = "http://fake-s3"  # presumably makes the endpoint treat storage as configured — confirm
+        mock_storage.validate_upload.return_value = None  # presumably None means "validation passed" — confirm
+        mock_storage.MAX_FILES_PER_SESSION = 20  # real int; attributes of the patched module would otherwise be mock objects
+        mock_storage.MAX_BYTES_PER_SESSION = 50 * 1024 * 1024  # 50 MiB, also a real int
+        mock_storage.upload_file = AsyncMock(return_value=fake_key)  # calling it yields an awaitable resolving to fake_key
+        mock_storage.get_presigned_url.return_value = fake_url
+
+        files = {"file": ("report.pdf", io.BytesIO(b"%PDF-1.4 test"), "application/pdf")}  # (filename, file object, content type)
+        response = await client.post("/api/v1/uploads", files=files, headers=auth_headers)
+
+    assert response.status_code == 201
+    data = response.json()
+    assert data["filename"] == "report.pdf"  # response reports the uploaded filename
+    assert data["content_type"] == "application/pdf"  # and the PDF content type
+
+
+@pytest.mark.asyncio
+async def test_upload_rejects_oversized_pdf(client, auth_headers):
+    """Upload rejects PDF files exceeding 10 MB."""  # NOTE(review): the 10 MB limit itself is defined outside this test — confirm
+    large_data = b"%PDF-1.4 " + b"\x00" * (11 * 1024 * 1024)  # PDF header followed by 11 MiB of zero bytes
+    with patch("app.api.endpoints.uploads.settings") as mock_settings:  # only settings is patched; storage_service is not mocked in this test
+        mock_settings.STORAGE_ENDPOINT = "http://fake-s3"
+        files = {"file": ("huge.pdf", io.BytesIO(large_data), "application/pdf")}  # (filename, file object, content type)
+        response = await client.post("/api/v1/uploads", files=files, headers=auth_headers)
+    assert response.status_code == 400
+    assert "too large" in response.json()["detail"].lower()  # case-insensitive check of the error detail
+
+
 # ---------------------------------------------------------------------------
 # Happy path tests (storage fully mocked)
 # ---------------------------------------------------------------------------
@@ -299,3 +335,139 @@ async def test_delete_upload_forbidden_for_non_owner(client, auth_headers, test_
        )

    assert response.status_code == 403
+
+
+# ---------------------------------------------------------------------------
+# fetch_upload_documents tests
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_fetch_upload_documents_returns_pdf_content(client, auth_headers, test_db):  # NOTE(review): client and auth_headers are not referenced in the body; presumably requested for fixture setup — confirm
+    """fetch_upload_documents returns extracted_content for PDF uploads."""
+    from app.models.file_upload import FileUpload  # imports are local to the test body
+    from app.models.user import User
+    from app.services.storage_service import fetch_upload_documents
+    from sqlalchemy import select
+
+    result = await test_db.execute(select(User).where(User.email == "test@example.com"))  # presumably the user seeded by a fixture — confirm in conftest
+    user = result.scalar_one()  # requires exactly one matching row
+
+    upload = FileUpload(
+        account_id=user.account_id,
+        uploaded_by=user.id,
+        session_id=None,  # not linked to a session
+        filename="report.pdf",
+        content_type="application/pdf",
+        size_bytes=5000,
+        storage_key=f"uploads/{user.account_id}/{uuid.uuid4()}.pdf",
+        extracted_content="This is the extracted PDF text content.",  # text the call below is expected to return
+    )
+    test_db.add(upload)
+    await test_db.commit()
+    await test_db.refresh(upload)  # reload the persisted row before reading upload.id
+
+    docs = await fetch_upload_documents([upload.id], user.account_id, test_db)  # args: upload ids, account id, DB session
+
+    assert len(docs) == 1
+    assert docs[0]["filename"] == "report.pdf"
+    assert docs[0]["content_type"] == "application/pdf"
+    assert docs[0]["text"] == "This is the extracted PDF text content."  # extracted_content comes back under the "text" key
+
+
+@pytest.mark.asyncio
+async def test_fetch_upload_documents_excludes_images(client, auth_headers, test_db):  # NOTE(review): client and auth_headers are not referenced in the body; presumably requested for fixture setup — confirm
+    """fetch_upload_documents does not return image uploads."""
+    from app.models.file_upload import FileUpload  # imports are local to the test body
+    from app.models.user import User
+    from app.services.storage_service import fetch_upload_documents
+    from sqlalchemy import select
+
+    result = await test_db.execute(select(User).where(User.email == "test@example.com"))  # presumably the user seeded by a fixture — confirm in conftest
+    user = result.scalar_one()  # requires exactly one matching row
+
+    upload = FileUpload(
+        account_id=user.account_id,
+        uploaded_by=user.id,
+        session_id=None,  # not linked to a session
+        filename="screenshot.png",
+        content_type="image/png",  # image MIME type: the upload under test is not a document
+        size_bytes=1024,
+        storage_key=f"uploads/{user.account_id}/{uuid.uuid4()}.png",
+    )
+    test_db.add(upload)
+    await test_db.commit()
+    await test_db.refresh(upload)  # reload the persisted row before reading upload.id
+
+    docs = await fetch_upload_documents([upload.id], user.account_id, test_db)  # same account as the upload, so only the content type can exclude it
+    assert len(docs) == 0
+
+
+@pytest.mark.asyncio
+async def test_fetch_upload_documents_pdf_no_text(client, auth_headers, test_db):  # NOTE(review): client and auth_headers are not referenced in the body; presumably requested for fixture setup — confirm
+    """PDF with no extracted text returns a placeholder note."""
+    from app.models.file_upload import FileUpload  # imports are local to the test body
+    from app.models.user import User
+    from app.services.storage_service import fetch_upload_documents
+    from sqlalchemy import select
+
+    result = await test_db.execute(select(User).where(User.email == "test@example.com"))  # presumably the user seeded by a fixture — confirm in conftest
+    user = result.scalar_one()  # requires exactly one matching row
+
+    upload = FileUpload(
+        account_id=user.account_id,
+        uploaded_by=user.id,
+        session_id=None,  # not linked to a session
+        filename="scanned.pdf",
+        content_type="application/pdf",
+        size_bytes=2000,
+        storage_key=f"uploads/{user.account_id}/{uuid.uuid4()}.pdf",
+        extracted_content=None,  # the case under test: a PDF row with no extracted text
+    )
+    test_db.add(upload)
+    await test_db.commit()
+    await test_db.refresh(upload)  # reload the persisted row before reading upload.id
+
+    docs = await fetch_upload_documents([upload.id], user.account_id, test_db)  # args: upload ids, account id, DB session
+
+    assert len(docs) == 1  # the PDF is still returned even though it has no text
+    assert "no extractable text" in docs[0]["text"]  # only this substring of the placeholder is pinned
+
+
+@pytest.mark.asyncio
+async def test_fetch_upload_documents_respects_account_filter(client, auth_headers, test_db):  # NOTE(review): client and auth_headers are not referenced in the body; presumably requested for fixture setup — confirm
+    """fetch_upload_documents only returns uploads belonging to the given account."""
+    from app.models.file_upload import FileUpload  # imports are local to the test body
+    from app.models.user import User
+    from app.services.storage_service import fetch_upload_documents
+    from sqlalchemy import select
+
+    result = await test_db.execute(select(User).where(User.email == "test@example.com"))  # presumably the user seeded by a fixture — confirm in conftest
+    user = result.scalar_one()  # requires exactly one matching row
+
+    upload = FileUpload(
+        account_id=user.account_id,  # the upload is owned by the test user's account
+        uploaded_by=user.id,
+        session_id=None,  # not linked to a session
+        filename="report.pdf",
+        content_type="application/pdf",
+        size_bytes=5000,
+        storage_key=f"uploads/{user.account_id}/{uuid.uuid4()}.pdf",
+        extracted_content="Secret content",  # must not be returned to a different account
+    )
+    test_db.add(upload)
+    await test_db.commit()
+    await test_db.refresh(upload)  # reload the persisted row before reading upload.id
+
+    # Query with a freshly generated, unrelated account_id — nothing should be returned
+    other_account = uuid.uuid4()
+    docs = await fetch_upload_documents([upload.id], other_account, test_db)  # valid upload id, wrong account
+    assert len(docs) == 0
+
+
+@pytest.mark.asyncio
+async def test_fetch_upload_documents_empty_ids(client, auth_headers, test_db):  # NOTE(review): client and auth_headers are not referenced in the body
+    """Empty upload_ids returns empty list without querying DB."""  # NOTE(review): only the empty result is asserted; "without querying DB" is not checked by this test
+    from app.services.storage_service import fetch_upload_documents  # imported inside the test body
+
+    docs = await fetch_upload_documents([], uuid.uuid4(), test_db)  # empty id list with an arbitrary random account id
+    assert docs == []