"""Handoff management — unified park/escalate with dual-write backward compat. Creates handoff snapshots, AI assessments (for escalations), claim workflow, and queue queries. Dual-writes to ai_sessions.escalation_package for backward compatibility with the existing escalation queue. For intent='escalate', `create_handoff` also runs the legacy enrichment that the deprecated `/escalate` endpoint used to do directly: setting `escalated_to_id`, building the AI-enhanced escalation_package (Sonnet), and recording escalation_reason. `finalize_escalation` then generates the SessionDocumentation and pushes to PSA. `dispatch_escalation_notifications` fans out the bell-icon AppNotification + external channels (Slack/Teams) on top of per-user emails. The `/escalate` endpoint is now a thin shim calling these in sequence. """ import asyncio import json import logging from datetime import datetime, timezone from typing import Any from uuid import UUID, uuid4 from sqlalchemy import select, update from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import selectinload from app.core.ai_provider import get_ai_provider from app.core.config import settings from app.core.email import EmailService from app.core.escalation_bus import bus as escalation_bus from app.models.ai_session import AISession from app.models.session_branch import SessionBranch from app.models.session_handoff import SessionHandoff from app.models.user import User from app.schemas.ai_session import SessionDocumentation from app.services.notification_service import notify logger = logging.getLogger(__name__) class HandoffAlreadyClaimedError(Exception): """Raised when a senior tries to claim a handoff another senior already won. Carries the winning claimer's id, display name, and claim timestamp so the API layer can surface a "Already claimed by {name} {time_ago}" toast on the losing client. The race story is the locked design — without this exception the endpoint would silently overwrite `claimed_by` and both seniors would think they own the session. """ def __init__( self, claimed_by_id: UUID, claimed_by_name: str, claimed_at: datetime, ) -> None: super().__init__( f"Handoff already claimed by {claimed_by_name} at {claimed_at.isoformat()}" ) self.claimed_by_id = claimed_by_id self.claimed_by_name = claimed_by_name self.claimed_at = claimed_at class HandoffManager: """Unified park/escalate handoff management.""" def __init__(self, db: AsyncSession): self.db = db async def create_handoff( self, session_id: UUID, intent: str, engineer_notes: str | None, user_id: UUID, priority: str = "normal", target_user_id: UUID | None = None, ) -> SessionHandoff: """Create a handoff (park or escalate). Generates snapshot, updates session status, dual-writes to escalation_package for backward compat. For intent='escalate' also: sets `session.escalation_reason` and optionally `session.escalated_to_id`, builds the AI-enhanced escalation package (the rich one the legacy `/escalate` path used to produce), and merges the handoff metadata into it. Self-targeting is rejected with ValueError, matching legacy behavior. """ user_id = UUID(str(user_id)) if target_user_id: target_user_id = UUID(str(target_user_id)) # Eager-load steps + user — _build_escalation_package_enhanced and # finalize_escalation iterate over session.steps to compose the # legacy enriched package and the SessionDocumentation, and the # notify() dispatcher reads session.user.name. Without selectinload # the async session raises MissingGreenlet on attribute access. result = await self.db.execute( select(AISession) .options( selectinload(AISession.steps), selectinload(AISession.user), ) .where(AISession.id == session_id) ) session = result.scalar_one_or_none() if not session: raise ValueError(f"Session {session_id} not found") if intent == "escalate": if target_user_id and target_user_id == user_id: raise ValueError( "Cannot escalate a session to yourself. Use pause instead." ) if session.status not in ("active", "paused"): raise ValueError( f"Cannot escalate session in status: {session.status}" ) # Generate snapshot — fast, no AI calls. snapshot = await self._generate_snapshot(session) # AI enrichment (assessment + enhanced escalation_package) is now # deferred to a background task scheduled by the endpoint after # commit — both calls hit Sonnet and together can take 15-25s, # which is too long to block the click path. The handoff row lands # immediately with `ai_assessment=None`; the magic-moment screen # shows "Assessment still computing" until enrich_async finishes # and the senior refreshes (or, eventually, polls). handoff_id = uuid4() handoff = SessionHandoff( id=handoff_id, session_id=session_id, account_id=session.account_id, handed_off_by=user_id, intent=intent, source_branch_id=session.active_branch_id, snapshot=snapshot, ai_assessment=None, ai_assessment_data=None, engineer_notes=engineer_notes, priority=priority, ) self.db.add(handoff) # Update session status if intent == "park": session.status = "paused" elif intent == "escalate": session.status = "escalated" session.escalation_reason = engineer_notes if target_user_id: session.escalated_to_id = target_user_id session.handoff_count = (session.handoff_count or 0) + 1 # Dual-write the minimal escalation_package shape now. The async # enrichment task overwrites this with the AI-enhanced shape # (`steps_tried`, `remaining_hypotheses`, etc.) when it completes — # consumers that read these fields (PSA writeback, legacy # SessionBriefing) tolerate either shape. session.escalation_package = { "snapshot": snapshot, "intent": intent, "engineer_notes": engineer_notes, "handoff_id": str(handoff_id), } await self.db.flush() return handoff async def finalize_escalation( self, handoff: SessionHandoff, session: AISession, user_id: UUID, ) -> tuple[SessionDocumentation | None, dict[str, Any]]: """Post-create enrichment for intent='escalate' handoffs. Generates the SessionDocumentation + pushes documentation to PSA if a ticket is linked. Returns (documentation, psa_result) so the legacy `/escalate` shim can map back to SessionCloseResponse. Safe to call only when handoff.intent == 'escalate' — for park, returns a no-op no-PSA dict. """ if handoff.intent != "escalate": return None, { "psa_push_status": "no_psa", "psa_push_error": None, "member_mapping_warning": None, } # Lazy import to avoid circular dependency: flowpilot_engine imports # plenty of services at module load time and we don't want # handoff_manager pulled into that graph at import. from app.services.flowpilot_engine import ( _generate_documentation, _push_to_psa, ) documentation = _generate_documentation(session) psa_result = await _push_to_psa(session, user_id, self.db) # Bell-icon AppNotification rows + external account-level channels # (Slack/Teams webhooks, shared escalations inboxes). This is the # `notify()` call the legacy /escalate path used to make directly, # and it has to happen BEFORE the endpoint commits so the # AppNotification rows land atomically with the handoff. Per-user # emails come after commit in dispatch_escalation_notifications — # those are pure IO with no persistent state. try: engineer_user = ( await self.db.execute( select(User).where(User.id == user_id) ) ).scalar_one_or_none() engineer_name = ( engineer_user.name if engineer_user and engineer_user.name else "Unknown" ) target_user_ids = ( [session.escalated_to_id] if session.escalated_to_id else None ) await notify( "session.escalated", handoff.account_id, { "session_id": str(handoff.session_id), "engineer_name": engineer_name, "escalation_reason": handoff.engineer_notes or "", "problem_summary": session.problem_summary or "N/A", # Surface the PSA ticket id in the bell-icon title so two # similarly-worded escalations are still distinguishable # at a glance. "psa_ticket_id": session.psa_ticket_id, }, self.db, target_user_ids=target_user_ids, ) except Exception: logger.exception( "notify() dispatch failed for handoff %s", handoff.id ) return documentation, psa_result async def _build_enhanced_escalation_package( self, session: AISession, user_id: UUID, ) -> dict[str, Any]: """Lazy wrapper around the legacy enhanced-package builder. The builder lives in flowpilot_engine; we only need it for the escalate path. Failures are caught here so handoff creation never depends on the optional Sonnet enrichment — return the minimal shape on failure. """ try: from app.services.flowpilot_engine import ( _build_escalation_package_enhanced, ) return await _build_escalation_package_enhanced(session, user_id) except Exception: logger.exception( "Enhanced escalation package build failed for session %s; " "falling back to minimal package", session.id, ) return {} async def dispatch_escalation_notifications( self, handoff: SessionHandoff ) -> int: """Email engineer-or-admin users in the account about a new escalation. Call this AFTER `db.commit()` has succeeded — sending email for a rolled-back handoff is the kind of trust-erosion bug that makes pilot customers stop trusting the tool. Returns the number of recipients successfully emailed (best-effort, not authoritative). Failures are logged but never raise: the wedge demo's reliability story is "handoff creation always succeeds; notification is best-effort," not "handoff creation depends on the email service being up." This is the graceful-degradation regression the eng + codex reviews both flagged as critical. Per-channel delivery records (Codex correction on the dead `notification_sent` boolean) are a v1.x story — for now the application logs are the audit trail. """ if handoff.intent != "escalate": return 0 # Publish to the in-memory bus first so connected senior-tech inboxes # see the new card slide in within ~1s of escalate. This path is # fire-and-forget (no IO, just memory) so it can sit ahead of the # email fan-out. try: await escalation_bus.publish( handoff.account_id, { "type": "handoff_created", "handoff_id": str(handoff.id), "session_id": str(handoff.session_id), "priority": handoff.priority, "engineer_notes": handoff.engineer_notes or "", "created_at": handoff.created_at.isoformat() if handoff.created_at else None, }, ) except Exception: logger.exception( "EscalationBus publish failed for handoff %s", handoff.id ) try: recipients = ( await self.db.execute( select(User).where( User.account_id == handoff.account_id, User.id != handoff.handed_off_by, User.account_role.in_(("owner", "admin", "engineer")), User.is_active.is_(True), User.deleted_at.is_(None), ) ) ).scalars().all() if not recipients: logger.info( "No notification recipients for handoff %s in account %s", handoff.id, handoff.account_id, ) return 0 # Pull session for the email subject. Fall back to a generic title # if the session is gone (e.g. cascade delete mid-dispatch). session_result = await self.db.execute( select(AISession).where(AISession.id == handoff.session_id) ) session = session_result.scalar_one_or_none() problem = ( session.problem_summary if session and session.problem_summary else "an active session" ) title = f"New escalation: {problem}" notes = (handoff.engineer_notes or "").strip() body = ( "A teammate has escalated a session and is asking for help.\n\n" f"Reason: {notes if notes else 'No reason provided.'}\n" f"Priority: {handoff.priority}" ) link_url = ( f"{settings.FRONTEND_URL.rstrip('/')}/escalations" if settings.FRONTEND_URL else None ) results = await asyncio.gather( *[ EmailService.send_notification_email( to_email=r.email, title=title, body=body, link_url=link_url, ) for r in recipients ], return_exceptions=True, ) sent = sum(1 for r in results if r is True) logger.info( "Escalation notifications dispatched for handoff %s: %d/%d recipients", handoff.id, sent, len(recipients), ) return sent except Exception: logger.exception( "Escalation notification dispatch failed for handoff %s", handoff.id, ) return 0 async def _generate_snapshot(self, session: AISession) -> dict[str, Any]: """Generate a snapshot of the session state at handoff time.""" snapshot: dict[str, Any] = { "problem_summary": session.problem_summary, "problem_domain": session.problem_domain, "status": session.status, "step_count": session.step_count, "confidence_tier": session.confidence_tier, } # Add branch map if branching is active if session.is_branching: branches_result = await self.db.execute( select(SessionBranch) .where(SessionBranch.session_id == session.id) .order_by(SessionBranch.branch_order) ) branches = list(branches_result.scalars().all()) branch_map = [] for b in branches: branch_map.append({ "id": str(b.id), "label": b.label, "status": b.status, "status_reason": b.status_reason, "parent_branch_id": str(b.parent_branch_id) if b.parent_branch_id else None, }) snapshot["branch_map"] = branch_map snapshot["active_branch_id"] = str(session.active_branch_id) if session.active_branch_id else None return snapshot async def claim_session( self, handoff_id: UUID, claiming_user_id: UUID, ) -> SessionHandoff: """Claim a handed-off session. If the handoff was already claimed by a *different* user (the race story: two seniors clicking Pick Up simultaneously), raise `HandoffAlreadyClaimedError` with the winning claimer's details so the API can return 409 with the data the loser's toast needs. A re-claim by the same user is idempotent. """ claiming_user_id = UUID(str(claiming_user_id)) claimed_at = datetime.now(timezone.utc) update_result = await self.db.execute( update(SessionHandoff) .where( SessionHandoff.id == handoff_id, SessionHandoff.claimed_by.is_(None), SessionHandoff.handed_off_by != claiming_user_id, ) .values(claimed_by=claiming_user_id, claimed_at=claimed_at) .returning(SessionHandoff.id) ) claimed_now = update_result.scalar_one_or_none() is not None result = await self.db.execute( select(SessionHandoff) .options( selectinload(SessionHandoff.claimed_by_user), selectinload(SessionHandoff.handed_off_by_user), ) .where(SessionHandoff.id == handoff_id) ) handoff = result.scalar_one_or_none() if not handoff: raise ValueError(f"Handoff {handoff_id} not found") handed_off_by = UUID(str(handoff.handed_off_by)) claimed_by = ( UUID(str(handoff.claimed_by)) if handoff.claimed_by is not None else None ) if handed_off_by == claiming_user_id: raise PermissionError("Cannot claim your own handoff") if not claimed_now and claimed_by != claiming_user_id: claimer = handoff.claimed_by_user raise HandoffAlreadyClaimedError( claimed_by_id=claimed_by, claimed_by_name=claimer.name if claimer else "another engineer", claimed_at=handoff.claimed_at or datetime.now(timezone.utc), ) # Reactivate session session_result = await self.db.execute( select(AISession).where(AISession.id == handoff.session_id) ) session = session_result.scalar_one() session.status = "active" # Dual-write session.escalated_to_id = claiming_user_id await self.db.flush() return handoff async def _generate_handoff_summary( self, session: AISession ) -> dict[str, Any] | None: """Single structured AI call for the escalation magic-moment screen. Returns a dict with summary_prose, what_we_know, likely_cause, suggested_steps, and confidence. Returns None on timeout or error. Replaces the old _generate_ai_assessment + _generate_ai_assessment_with_timeout pair, which returned freeform prose with no usable structured fields. """ timeout = settings.ESCALATION_AI_ASSESSMENT_TIMEOUT_SECONDS try: return await asyncio.wait_for( self._generate_handoff_summary_inner(session), timeout=timeout, ) except asyncio.TimeoutError: logger.warning( "Handoff summary timed out after %ss for session %s", timeout, session.id, ) return None except Exception: logger.exception( "Handoff summary failed for session %s", session.id ) return None async def _generate_handoff_summary_inner( self, session: AISession ) -> dict[str, Any]: steps = session.steps or [] steps_tried = [] for step in sorted(steps, key=lambda s: s.step_order): content = step.content or {} text = content.get("text", "").strip() if not text: continue entry = text if step.selected_option: entry += f" → {step.selected_option}" elif step.free_text_input: entry += f" → {step.free_text_input[:100]}" elif step.was_skipped: entry += " (skipped)" steps_tried.append(entry) steps_text = ( "\n".join(f"- {s}" for s in steps_tried[:15]) or "No diagnostic steps recorded." ) msgs = session.conversation_messages or [] recent_msgs = "\n".join( f"[{m.get('role', '?')}]: {m.get('content', '')[:200]}" for m in msgs[-10:] ) prompt = ( "Generate a structured escalation handoff summary.\n\n" f"Problem: {session.problem_summary or 'Unknown'}\n" f"Domain: {session.problem_domain or 'Unknown'}\n" f"Escalation reason: {session.escalation_reason or 'Not provided'}\n\n" f"Diagnostic steps taken:\n{steps_text}\n\n" f"Recent conversation:\n{recent_msgs}\n\n" "Respond with ONLY a valid JSON object matching this schema exactly:\n" '{"summary_prose": "<2-3 sentences suitable for PSA ticket notes>",\n' ' "what_we_know": ["", ""],\n' ' "likely_cause": "",\n' ' "suggested_steps": ["", ""],\n' ' "confidence": ""}' ) provider = get_ai_provider(settings.get_model_for_action("escalation_package")) raw, _, _ = await provider.generate_json( system_prompt=( "You are a diagnostic assessment generator for MSP tech support escalations. " "Always respond with valid JSON and nothing else. " "Be concise and factual." ), messages=[{"role": "user", "content": prompt}], max_tokens=700, ) cleaned = raw.strip() if cleaned.startswith("```"): lines = cleaned.split("\n", 1) cleaned = lines[1] if len(lines) > 1 else cleaned if cleaned.endswith("```"): cleaned = cleaned[:-3].rstrip() result = json.loads(cleaned) if not isinstance(result.get("suggested_steps"), list): result["suggested_steps"] = [] if not isinstance(result.get("what_we_know"), list): result["what_we_know"] = [] if result.get("confidence") not in ("low", "medium", "high"): result["confidence"] = "medium" if not isinstance(result.get("summary_prose"), str) or not result.get("summary_prose"): result["summary_prose"] = result.get("likely_cause", "Assessment generated.") if not isinstance(result.get("likely_cause"), str): result["likely_cause"] = "" return result async def generate_briefing( self, handoff_id: UUID, claiming_user_id: UUID ) -> str: """Generate a natural-language briefing for the engineer claiming the session.""" result = await self.db.execute( select(SessionHandoff).where(SessionHandoff.id == handoff_id) ) handoff = result.scalar_one_or_none() if not handoff: raise ValueError(f"Handoff {handoff_id} not found") session_result = await self.db.execute( select(AISession).where(AISession.id == handoff.session_id) ) session = session_result.scalar_one() from app.services.assistant_chat_service import _call_ai snapshot_text = str(handoff.snapshot)[:2000] briefing, _, _ = await _call_ai( system_base="You are a handoff briefing generator for MSP teams.", rag_context="", history=[], new_message=( f"Generate a concise briefing for an engineer picking up this session.\n" f"Problem: {session.problem_summary}\n" f"Intent: {handoff.intent}\n" f"Engineer notes: {handoff.engineer_notes or 'None'}\n" f"Snapshot: {snapshot_text}\n" f"AI Assessment: {handoff.ai_assessment or 'None'}" ), max_tokens=500, ) return briefing async def push_to_psa(self, handoff_id: UUID) -> SessionHandoff: """Push handoff notes to PSA via existing psa_documentation_service.""" result = await self.db.execute( select(SessionHandoff).where(SessionHandoff.id == handoff_id) ) handoff = result.scalar_one_or_none() if not handoff: raise ValueError(f"Handoff {handoff_id} not found") try: from app.services.psa_documentation_service import push_session_notes session_result = await self.db.execute( select(AISession).where(AISession.id == handoff.session_id) ) session = session_result.scalar_one() if session.psa_ticket_id and session.psa_connection_id: note_id = await push_session_notes( session=session, notes_content=handoff.ai_assessment or str(handoff.snapshot), db=self.db, ) handoff.psa_note_pushed = True handoff.psa_note_id = note_id except Exception: logger.exception(f"Failed to push handoff {handoff_id} to PSA") await self.db.flush() return handoff async def get_queue( self, team_id: UUID | None = None, account_id: UUID | None = None, ) -> list[dict[str, Any]]: """Get team queue of parked + escalated sessions.""" query = ( select(SessionHandoff, AISession) .join(AISession, SessionHandoff.session_id == AISession.id) .where(SessionHandoff.claimed_by.is_(None)) .order_by(SessionHandoff.created_at.desc()) ) if team_id: query = query.where(AISession.team_id == team_id) elif account_id: query = query.where(AISession.account_id == account_id) result = await self.db.execute(query) rows = result.all() queue_items = [] for handoff, session in rows: queue_items.append({ "handoff_id": handoff.id, "session_id": session.id, "intent": handoff.intent, "problem_summary": session.problem_summary, "problem_domain": session.problem_domain, "priority": handoff.priority, "engineer_notes": handoff.engineer_notes, "created_at": handoff.created_at, "claimed_by": handoff.claimed_by, "claimed_at": handoff.claimed_at, }) return queue_items async def enrich_escalation_async(handoff_id: UUID, user_id: UUID) -> None: """Run the AI enrichment for an escalation handoff in the background. Scheduled by `/escalate` and `/handoff` (intent=escalate) endpoints via FastAPI BackgroundTasks. Opens its own DB session because the request session is closed by the time this runs. Generates: 1. The legacy AI-enhanced escalation_package (Sonnet, ~5-10s) — saved to `session.escalation_package`, preserving the `intent` / `engineer_notes` / `handoff_id` keys the dual-write set so legacy consumers keep working. 2. The diagnostic AI assessment (Sonnet, ~4-15s) — saved to `handoff.ai_assessment` and `handoff.ai_assessment_data`. On completion publishes a `handoff_assessment_ready` event on the escalation bus so any connected magic-moment screen can refresh without a manual reload. Failures are logged but never propagated — the click-path-side handoff creation already committed, so worst case the senior sees the "Assessment still computing" placeholder until they refresh manually. """ from app.core.database import async_session_maker from app.core.escalation_bus import bus as escalation_bus async with async_session_maker() as db: try: result = await db.execute( select(SessionHandoff).where(SessionHandoff.id == handoff_id) ) handoff = result.scalar_one_or_none() if not handoff or handoff.intent != "escalate": return session_result = await db.execute( select(AISession) .options(selectinload(AISession.steps), selectinload(AISession.user)) .where(AISession.id == handoff.session_id) ) session = session_result.scalar_one_or_none() if not session: logger.warning( "enrich_escalation_async: session %s gone for handoff %s", handoff.session_id, handoff_id, ) return manager = HandoffManager(db) # Single consolidated AI call — replaces the old # _generate_ai_assessment + _build_enhanced_escalation_package pair. try: summary = await manager._generate_handoff_summary(session) if summary: # ai_assessment (text) holds the PSA prose for backward compat # (push_to_psa reads it; generate_status_update falls back to it). handoff.ai_assessment = summary.get("summary_prose") handoff.ai_assessment_data = summary # Keep suggested_next_steps in escalation_package so # psa_documentation_service can read it without a handoff join. existing_pkg = ( session.escalation_package if isinstance(session.escalation_package, dict) else {} ) session.escalation_package = { **existing_pkg, "suggested_next_steps": summary.get("suggested_steps", []), } except Exception: logger.exception( "enrich_escalation_async: summary generation failed for handoff %s", handoff_id, ) await db.commit() try: await escalation_bus.publish( handoff.account_id, { "type": "handoff_assessment_ready", "handoff_id": str(handoff.id), "session_id": str(handoff.session_id), "has_assessment": handoff.ai_assessment_data is not None, }, ) except Exception: logger.exception( "enrich_escalation_async: bus publish failed for handoff %s", handoff_id, ) except Exception: logger.exception( "enrich_escalation_async failed for handoff %s", handoff_id ) try: await db.rollback() except Exception: pass