feat: wire PDF and text file content into AI chat messages
PDF uploads were stored in S3 and had text extracted during upload, but fetch_upload_images() filtered exclusively for image MIME types, so document content never reached the AI.

- Add fetch_upload_documents() in storage_service.py to retrieve extracted_content for PDFs and text files
- Update ai_sessions.py chat endpoint to call both fetch_upload_images and fetch_upload_documents, injecting document text as context
- Add PDF text extraction in _generate_ai_description (pypdf)
- Add pypdf>=4.0.0 to requirements.txt
- Fix test_db teardown to avoid connection pool issues
- Add 5 tests for fetch_upload_documents

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -280,18 +280,28 @@ async def send_chat_message(
|
||||
user_id = current_user.id
|
||||
account_id = current_user.account_id
|
||||
|
||||
# Fetch attached images from S3 (if any)
|
||||
# Fetch attached uploads from S3 (if any)
|
||||
images = None
|
||||
message = data.message
|
||||
if data.upload_ids:
|
||||
from app.services.storage_service import fetch_upload_images
|
||||
from app.services.storage_service import fetch_upload_images, fetch_upload_documents
|
||||
images = await fetch_upload_images(data.upload_ids, account_id, db) or None
|
||||
|
||||
# Inject document text (PDFs, text files) as context in the message
|
||||
documents = await fetch_upload_documents(data.upload_ids, account_id, db)
|
||||
if documents:
|
||||
doc_parts = []
|
||||
for doc in documents:
|
||||
doc_parts.append(f"--- Attached file: {doc['filename']} ---\n{doc['text']}")
|
||||
doc_context = "\n\n".join(doc_parts)
|
||||
message = f"{message}\n\n[Attached document content]\n{doc_context}"
|
||||
|
||||
try:
|
||||
ai_content, suggested_flows, session, fork_metadata, actions_data, questions_data = await unified_chat_service.send_chat_message(
|
||||
session_id=session_id,
|
||||
user_id=user_id,
|
||||
account_id=account_id,
|
||||
message=data.message,
|
||||
message=message,
|
||||
db=db,
|
||||
images=images,
|
||||
)
|
||||
|
||||
@@ -61,6 +61,40 @@ async def _generate_ai_description(upload_id: UUID, file_data: bytes, content_ty
|
||||
max_tokens=100,
|
||||
)
|
||||
upload.ai_description = description
|
||||
elif content_type == "application/pdf":
|
||||
try:
|
||||
from pypdf import PdfReader
|
||||
import io as _io
|
||||
|
||||
reader = PdfReader(_io.BytesIO(file_data))
|
||||
pages_text = []
|
||||
for page in reader.pages:
|
||||
page_text = page.extract_text()
|
||||
if page_text:
|
||||
pages_text.append(page_text)
|
||||
text_content = "\n\n".join(pages_text)
|
||||
except Exception:
|
||||
logger.warning("PDF text extraction failed for upload %s", upload_id)
|
||||
text_content = ""
|
||||
|
||||
if text_content:
|
||||
upload.extracted_content = text_content[:10000]
|
||||
|
||||
if len(text_content) > 2000:
|
||||
summary, _, _ = await _call_ai(
|
||||
system_base="You are a technical document analyst for IT troubleshooting.",
|
||||
rag_context="",
|
||||
history=[],
|
||||
new_message=f"Summarize this PDF content in 2-3 sentences:\n\n{text_content[:5000]}",
|
||||
max_tokens=200,
|
||||
)
|
||||
upload.content_summary = summary
|
||||
upload.ai_description = summary
|
||||
else:
|
||||
upload.ai_description = f"PDF document: {upload.filename}"
|
||||
else:
|
||||
upload.ai_description = f"PDF document (no extractable text): {upload.filename}"
|
||||
|
||||
elif content_type.startswith("text/") or content_type in (
|
||||
"application/json", "application/xml", "application/yaml",
|
||||
):
|
||||
|
||||
Reference in New Issue
Block a user