feat: wire PDF and text file content into AI chat messages

PDF uploads were stored in S3 and had text extracted during upload, but
fetch_upload_images() filtered exclusively for image MIME types, so
document content never reached the AI.

- Add fetch_upload_documents() in storage_service.py to retrieve
  extracted_content for PDFs and text files
- Update ai_sessions.py chat endpoint to call both fetch_upload_images
  and fetch_upload_documents, injecting document text as context
- Add PDF text extraction in _generate_ai_description (pypdf)
- Add pypdf>=4.0.0 to requirements.txt
- Fix test_db teardown to avoid connection pool issues
- Add 5 tests for fetch_upload_documents

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
chihlasm
2026-03-27 21:02:56 +00:00
parent 3cea949519
commit 11de850054
6 changed files with 324 additions and 12 deletions

View File

@@ -61,6 +61,40 @@ async def _generate_ai_description(upload_id: UUID, file_data: bytes, content_ty
max_tokens=100,
)
upload.ai_description = description
elif content_type == "application/pdf":
try:
from pypdf import PdfReader
import io as _io
reader = PdfReader(_io.BytesIO(file_data))
pages_text = []
for page in reader.pages:
page_text = page.extract_text()
if page_text:
pages_text.append(page_text)
text_content = "\n\n".join(pages_text)
except Exception:
logger.warning("PDF text extraction failed for upload %s", upload_id)
text_content = ""
if text_content:
upload.extracted_content = text_content[:10000]
if len(text_content) > 2000:
summary, _, _ = await _call_ai(
system_base="You are a technical document analyst for IT troubleshooting.",
rag_context="",
history=[],
new_message=f"Summarize this PDF content in 2-3 sentences:\n\n{text_content[:5000]}",
max_tokens=200,
)
upload.content_summary = summary
upload.ai_description = summary
else:
upload.ai_description = f"PDF document: {upload.filename}"
else:
upload.ai_description = f"PDF document (no extractable text): {upload.filename}"
elif content_type.startswith("text/") or content_type in (
"application/json", "application/xml", "application/yaml",
):