From 362c7b1d799e9823985910bd576f4efcf3dad125 Mon Sep 17 00:00:00 2001 From: Michael Chihlas Date: Thu, 23 Apr 2026 22:04:56 -0400 Subject: [PATCH] fix(pilot): outcome-aware Resolve/Escalate previews MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #1 from phase-8-review-issues.md. Cache invalidation alone isn't enough — previews were also omitting outcome fields from the LLM bundle, so a fresh regenerate still couldn't distinguish proposed / failed / partial / success. - PATCH /outcome now bumps ai_sessions.state_version (matches record_decision's existing pattern). - Resolution-note + escalation-package bundles now include status, applied_at, verified_at, partial_notes, failure_reason on the active fix. - Generator prompts prescribe outcome-aware phrasing (closure language for success; what-we've-tried + next-steps for failed/partial). - New end-to-end test asserts the regenerated preview reflects the recorded outcome, not just that the cache key changed. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../api/endpoints/session_suggested_fixes.py | 9 ++ .../services/escalation_package_generator.py | 52 +++++-- .../app/services/resolution_note_generator.py | 32 ++++- backend/tests/test_fix_outcome_endpoint.py | 130 ++++++++++++++++++ 4 files changed, 208 insertions(+), 15 deletions(-) diff --git a/backend/app/api/endpoints/session_suggested_fixes.py b/backend/app/api/endpoints/session_suggested_fixes.py index 60f59674..0804b013 100644 --- a/backend/app/api/endpoints/session_suggested_fixes.py +++ b/backend/app/api/endpoints/session_suggested_fixes.py @@ -276,6 +276,15 @@ async def patch_suggested_fix_outcome( if fix.applied_at is None and body.outcome != "dismissed": fix.applied_at = now + # Outcome changes the bundle that resolution-note/escalation-package + # previews see, so bump state_version inside the same transaction — + # mirrors the pattern in record_decision above. + await db.execute( + update(AISession) + .where(AISession.id == session_id) + .values(state_version=AISession.state_version + 1) + ) + await db.commit() await db.refresh(fix) return SessionSuggestedFixResponse.model_validate(fix) diff --git a/backend/app/services/escalation_package_generator.py b/backend/app/services/escalation_package_generator.py index cb539afe..62978f27 100644 --- a/backend/app/services/escalation_package_generator.py +++ b/backend/app/services/escalation_package_generator.py @@ -55,22 +55,45 @@ header.> If there are no facts, write "Nothing confirmed yet." and continue.> ## What we've tried - + + +- applied_failed: List the fix as a tried path. Include the failure reason if \ +provided. State that it did not resolve the issue. +- applied_partial: Include the fix as a partially tried path. Include partial \ +notes if provided. Indicate it was not fully completed or not verified. +- applied_success: Note that the fix was applied and verified but escalation \ +is still needed for another reason (unusual — reflect this accurately). +- dismissed: Do not mention the fix as a tried path; it was only considered. +- proposed (no outcome yet): Do not list it here; it goes in Current hypothesis. + +If nothing has been tried at all (no checks, no scripts, no applied/partial \ +fix), write "No diagnostic actions run yet." and continue. ## Current hypothesis - + + +- proposed (no outcome yet): State the fix title and confidence. If confidence \ +is below 60% or there is no active fix, say "No leading hypothesis yet — \ +symptoms are still being narrowed." +- applied_failed or dismissed: Say the proposed fix did not hold or was set \ +aside. State any remaining uncertainty. +- applied_partial: Note the partial application and what remains open. +- applied_success: Unusual in an escalate path — state the fix resolved the \ +original symptom but a new or related issue requires escalation. ## Suggested next steps 80%), the first bullet is "Try the \ -suggested fix: ."> +Derive from the gap between confirmed facts and a complete resolution. \ +If the active suggested fix failed (applied_failed), inform the next steps \ +accordingly — e.g. suggest alternatives or deeper investigation paths, \ +drawing on the failure reason if provided. \ +If the fix is partially applied (applied_partial), the first step is typically \ +to complete or verify it. \ +If the fix is still proposed (no outcome), the first step is to try it if \ +confidence is high (>80%).> Strict rules: - Use ONLY the input I provide. Never invent command names, KB articles, or \ @@ -269,6 +292,15 @@ class EscalationPackageGeneratorService: lines.append(f"Title: {active_fix.title}") lines.append(f"Confidence: {active_fix.confidence_pct}%") lines.append(f"Description: {active_fix.description}") + lines.append(f"Outcome status: {active_fix.status}") + if active_fix.applied_at: + lines.append(f"Applied at: {active_fix.applied_at.isoformat()}") + if active_fix.verified_at: + lines.append(f"Verified at: {active_fix.verified_at.isoformat()}") + if active_fix.partial_notes: + lines.append(f"Partial notes: {active_fix.partial_notes}") + if active_fix.failure_reason: + lines.append(f"Failure reason: {active_fix.failure_reason}") lines.append("") lines.append( diff --git a/backend/app/services/resolution_note_generator.py b/backend/app/services/resolution_note_generator.py index 8cdc9206..974840eb 100644 --- a/backend/app/services/resolution_note_generator.py +++ b/backend/app/services/resolution_note_generator.py @@ -69,11 +69,24 @@ say "Root cause not definitively isolated." and explain what is suspected based on facts.> ## Resolution -<one short paragraph describing the resolution applied. If a script ran during \ -the session, mention it (e.g. "Cleared cached credentials via the \ -clear-outlook-credentials script."). If no resolution has been performed yet, \ -write "Resolution not yet applied — fix proposed: <fix title>." Pull verbatim \ -script names and template references when available.> +<The content of this section depends on the outcome recorded for the active \ +suggested fix, as given in the input bundle under "fix.status":> + +- applied_success: Write in past tense using closure language. State that the \ +fix was applied and verified as working. If verified_at is provided, you may \ +reference it as the time resolution was confirmed. Example phrasing: \ +"Applied <fix title>; confirmed working." +- applied_failed: Acknowledge that the proposed fix did not resolve the issue \ +and was discarded. If failure_reason is provided, include it. Then describe \ +the actual resolution path taken (derived from facts and scripts run). This \ +state means the engineer resolved the issue another way; the note should cover \ +that actual resolution, not just the failed attempt. +- applied_partial: Note that the fix was partially applied. If partial_notes \ +are provided, include them. Then describe the final resolution path taken. +- dismissed: Treat the fix as considered and set aside. Do not center the note \ +on it. Describe the resolution based on what was actually confirmed and done. +- proposed (no outcome yet): Write "Resolution not yet applied — fix proposed: \ +<fix title>." Pull verbatim script names and template references when available. Strict rules: - Use ONLY the facts and state I provide. Never invent specifics that are not \ @@ -302,6 +315,15 @@ class ResolutionNoteGeneratorService: lines.append(f"Description: {active_fix.description}") if active_fix.user_decision: lines.append(f"Engineer decision: {active_fix.user_decision}") + lines.append(f"Outcome status: {active_fix.status}") + if active_fix.applied_at: + lines.append(f"Applied at: {active_fix.applied_at.isoformat()}") + if active_fix.verified_at: + lines.append(f"Verified at: {active_fix.verified_at.isoformat()}") + if active_fix.partial_notes: + lines.append(f"Partial notes: {active_fix.partial_notes}") + if active_fix.failure_reason: + lines.append(f"Failure reason: {active_fix.failure_reason}") lines.append("") lines.append("# Scripts run during the session (passwords redacted)") diff --git a/backend/tests/test_fix_outcome_endpoint.py b/backend/tests/test_fix_outcome_endpoint.py index 7b5dd702..b0969417 100644 --- a/backend/tests/test_fix_outcome_endpoint.py +++ b/backend/tests/test_fix_outcome_endpoint.py @@ -5,13 +5,24 @@ Fixture style follows test_session_suggested_fixes_api.py: """ from __future__ import annotations +from unittest.mock import AsyncMock, call, patch + import pytest from httpx import AsyncClient +from sqlalchemy import select +from app.api.endpoints.session_suggested_fixes import _clear_preview_cache_for_tests from app.models.ai_session import AISession from app.models.session_suggested_fix import SessionSuggestedFix +@pytest.fixture(autouse=True) +def _isolate_preview_cache(): + _clear_preview_cache_for_tests() + yield + _clear_preview_cache_for_tests() + + # ── shared helper ──────────────────────────────────────────────────────────── async def _make_session_with_fix(test_db, user) -> tuple[str, str]: @@ -197,3 +208,122 @@ async def test_failed_outcome_stores_notes_as_failure_reason( body = r.json() assert body["failure_reason"] == "user reports no change" assert body["partial_notes"] is None + + +# ── state_version bump ──────────────────────────────────────────────────────── + +@pytest.mark.asyncio +async def test_outcome_patch_bumps_state_version( + client: AsyncClient, test_user, auth_headers, test_db +): + """PATCH /outcome must increment ai_sessions.state_version (like record_decision).""" + session_id, fix_id = await _make_session_with_fix(test_db, test_user) + + # Capture the initial state_version from DB. + from uuid import UUID + result = await test_db.execute( + select(AISession).where(AISession.id == UUID(session_id)) + ) + session_obj = result.scalar_one() + initial_version = session_obj.state_version + + r = await client.patch( + f"/api/v1/ai-sessions/{session_id}/suggested-fixes/{fix_id}/outcome", + json={"outcome": "applied_success"}, + headers=auth_headers, + ) + assert r.status_code == 200 + + await test_db.refresh(session_obj) + assert session_obj.state_version == initial_version + 1, ( + "Outcome patch must bump state_version so preview cache is invalidated" + ) + + +# ── outcome propagation into preview bundle ─────────────────────────────────── + +@pytest.mark.asyncio +async def test_resolution_note_preview_reflects_outcome_after_patch( + client: AsyncClient, test_user, auth_headers, test_db +): + """End-to-end: preview before outcome != preview after outcome; new preview + bundle includes failure_reason; state_version was bumped between the two. + + The LLM is stubbed so the test is deterministic. The stub returns whatever + the user-message content is, which means the captured call args reflect + what the bundle actually contained. + """ + session_id, fix_id = await _make_session_with_fix(test_db, test_user) + + distinct_failure_reason = "DISTINCT-FAILURE-REASON-XYZZY-42" + + calls_made: list[str] = [] + + async def fake_generate_text(system_prompt, messages, max_tokens): + user_content = messages[0]["content"] + calls_made.append(user_content) + # Return markdown that includes the user-message bundle verbatim so we + # can assert the bundle shape without inspecting mock internals. + return ( + f"## Problem\ntest\n\n## What we confirmed\n(none)\n\n" + f"## Root cause\ntest\n\n## Resolution\nBUNDLE_CONTENT={user_content}", + 100, + 50, + ) + + fake_provider = AsyncMock() + fake_provider.generate_text = AsyncMock(side_effect=fake_generate_text) + + with patch( + "app.services.resolution_note_generator.get_ai_provider", + return_value=fake_provider, + ): + # Preview A — before any outcome recorded (status = "proposed"). + r_a = await client.post( + f"/api/v1/ai-sessions/{session_id}/resolution-note/preview", + headers=auth_headers, + ) + assert r_a.status_code == 200 + markdown_a = r_a.json()["markdown"] + version_a = r_a.json()["state_version"] + assert r_a.json()["from_cache"] is False + + # Record an applied_failed outcome with a distinctive reason. + r_patch = await client.patch( + f"/api/v1/ai-sessions/{session_id}/suggested-fixes/{fix_id}/outcome", + json={"outcome": "applied_failed", "notes": distinct_failure_reason}, + headers=auth_headers, + ) + assert r_patch.status_code == 200 + + # Preview B — must be a cache miss because state_version changed. + r_b = await client.post( + f"/api/v1/ai-sessions/{session_id}/resolution-note/preview", + headers=auth_headers, + ) + assert r_b.status_code == 200 + markdown_b = r_b.json()["markdown"] + version_b = r_b.json()["state_version"] + assert r_b.json()["from_cache"] is False, ( + "Preview after outcome patch must be a cache miss (state_version changed)" + ) + + # State version increased between the two previews. + assert version_b > version_a, ( + f"state_version should have increased; got {version_a} → {version_b}" + ) + + # Markdown differs between the two previews. + assert markdown_a != markdown_b, ( + "Regenerated preview after outcome patch should differ from pre-outcome preview" + ) + + # The bundle passed to the LLM for preview B includes the outcome fields. + assert len(calls_made) == 2, f"Expected 2 LLM calls (one per preview); got {len(calls_made)}" + bundle_b = calls_made[1] + assert "applied_failed" in bundle_b, ( + "Bundle for second preview should include 'Outcome status: applied_failed'" + ) + assert distinct_failure_reason in bundle_b, ( + "Bundle for second preview should include the failure_reason text" + )