fix(pilot): outcome-aware Resolve/Escalate previews

Issue #1 from phase-8-review-issues.md. Cache invalidation alone isn't enough — previews were also omitting outcome fields from the LLM bundle, so a fresh regenerate still couldn't distinguish proposed / failed / partial / success.

- PATCH /outcome now bumps ai_sessions.state_version (matches record_decision's existing pattern).
- Resolution-note + escalation-package bundles now include status, applied_at, verified_at, partial_notes, failure_reason on the active fix.
- Generator prompts prescribe outcome-aware phrasing (closure language for success; what-we've-tried + next-steps for failed/partial).
- New end-to-end test asserts the regenerated preview reflects the recorded outcome, not just that the cache key changed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 22:04:56 -04:00
parent ec104dc8de
commit 362c7b1d79
4 changed files with 208 additions and 15 deletions
--- a/backend/tests/test_fix_outcome_endpoint.py
+++ b/backend/tests/test_fix_outcome_endpoint.py
@@ -5,13 +5,24 @@ Fixture style follows test_session_suggested_fixes_api.py:
 """
 from __future__ import annotations

+from unittest.mock import AsyncMock, call, patch
+
 import pytest
 from httpx import AsyncClient
+from sqlalchemy import select

+from app.api.endpoints.session_suggested_fixes import _clear_preview_cache_for_tests
 from app.models.ai_session import AISession
 from app.models.session_suggested_fix import SessionSuggestedFix


+@pytest.fixture(autouse=True)
+def _isolate_preview_cache():
+    """Run ``_clear_preview_cache_for_tests()`` on both sides of every test.
+
+    ``autouse=True`` applies this fixture to each test in the module without
+    the test having to request it. Going by the helper's name, the intent is
+    that a preview cached during one test is never visible to another.
+    """
+    # Setup: runs before the test body.
+    _clear_preview_cache_for_tests()
+    yield
+    # Teardown: runs after the test body has finished.
+    _clear_preview_cache_for_tests()
+
+
 # ── shared helper ────────────────────────────────────────────────────────────

 async def _make_session_with_fix(test_db, user) -> tuple[str, str]:
@@ -197,3 +208,122 @@ async def test_failed_outcome_stores_notes_as_failure_reason(
    body = r.json()
    assert body["failure_reason"] == "user reports no change"
    assert body["partial_notes"] is None
+
+
+# ── state_version bump ────────────────────────────────────────────────────────
+
+@pytest.mark.asyncio
+async def test_outcome_patch_bumps_state_version(
+    client: AsyncClient, test_user, auth_headers, test_db
+):
+    """Recording an outcome advances ai_sessions.state_version by exactly one.
+
+    Reads the session's version from the database, sends PATCH /outcome with
+    ``applied_success``, refreshes the same row and compares the two values
+    (the same bump record_decision is expected to perform).
+    """
+    from uuid import UUID
+
+    session_id, fix_id = await _make_session_with_fix(test_db, test_user)
+
+    # Baseline: the version stored on the row before the request is made.
+    session_query = select(AISession).where(AISession.id == UUID(session_id))
+    session_row = (await test_db.execute(session_query)).scalar_one()
+    version_before = session_row.state_version
+
+    outcome_url = (
+        f"/api/v1/ai-sessions/{session_id}/suggested-fixes/{fix_id}/outcome"
+    )
+    response = await client.patch(
+        outcome_url,
+        json={"outcome": "applied_success"},
+        headers=auth_headers,
+    )
+    assert response.status_code == 200
+
+    # Reload the row so the comparison uses what the endpoint persisted, not
+    # the attribute values loaded before the request.
+    await test_db.refresh(session_row)
+    expected_version = version_before + 1
+    assert session_row.state_version == expected_version, (
+        "Outcome patch must bump state_version so preview cache is invalidated"
+    )
+
+
+# ── outcome propagation into preview bundle ───────────────────────────────────
+
+@pytest.mark.asyncio
+async def test_resolution_note_preview_reflects_outcome_after_patch(
+    client: AsyncClient, test_user, auth_headers, test_db
+):
+    """A preview regenerated after an outcome is recorded carries that outcome.
+
+    Flow: request a resolution-note preview, PATCH an ``applied_failed``
+    outcome with a distinctive note, then request the preview again. The
+    second preview must be a cache miss with a higher state_version and
+    different markdown, and the user message handed to the LLM for it must
+    contain both the outcome and the note.
+
+    The AI provider is replaced by a stub that records each user message and
+    echoes it back inside the returned markdown, which keeps the test
+    deterministic and lets it inspect what the bundle contained.
+    """
+    session_id, fix_id = await _make_session_with_fix(test_db, test_user)
+
+    distinct_failure_reason = "DISTINCT-FAILURE-REASON-XYZZY-42"
+    preview_url = f"/api/v1/ai-sessions/{session_id}/resolution-note/preview"
+    outcome_url = f"/api/v1/ai-sessions/{session_id}/suggested-fixes/{fix_id}/outcome"
+
+    # One entry per stubbed LLM call, in call order.
+    captured_bundles: list[str] = []
+
+    async def fake_generate_text(system_prompt, messages, max_tokens):
+        bundle = messages[0]["content"]
+        captured_bundles.append(bundle)
+        # Echo the bundle inside the markdown so the two previews differ
+        # whenever their bundles differ.
+        markdown = (
+            f"## Problem\ntest\n\n## What we confirmed\n(none)\n\n"
+            f"## Root cause\ntest\n\n## Resolution\nBUNDLE_CONTENT={bundle}"
+        )
+        return markdown, 100, 50
+
+    stub_provider = AsyncMock()
+    stub_provider.generate_text = AsyncMock(side_effect=fake_generate_text)
+    provider_patch = patch(
+        "app.services.resolution_note_generator.get_ai_provider",
+        return_value=stub_provider,
+    )
+
+    with provider_patch:
+        # Preview A: requested before any outcome has been recorded.
+        first = await client.post(preview_url, headers=auth_headers)
+        assert first.status_code == 200
+        first_body = first.json()
+        markdown_a = first_body["markdown"]
+        version_a = first_body["state_version"]
+        assert first_body["from_cache"] is False
+
+        # Record a failed outcome whose note is easy to spot in the bundle.
+        patched = await client.patch(
+            outcome_url,
+            json={"outcome": "applied_failed", "notes": distinct_failure_reason},
+            headers=auth_headers,
+        )
+        assert patched.status_code == 200
+
+        # Preview B: the version change has to force a regeneration.
+        second = await client.post(preview_url, headers=auth_headers)
+        assert second.status_code == 200
+        second_body = second.json()
+        markdown_b = second_body["markdown"]
+        version_b = second_body["state_version"]
+        assert second_body["from_cache"] is False, (
+            "Preview after outcome patch must be a cache miss (state_version changed)"
+        )
+
+    # The version moved forward between the two previews.
+    assert version_b > version_a, (
+        f"state_version should have increased; got {version_a} → {version_b}"
+    )
+
+    # The regenerated text is not the pre-outcome text.
+    assert markdown_a != markdown_b, (
+        "Regenerated preview after outcome patch should differ from pre-outcome preview"
+    )
+
+    # Exactly one LLM call per preview; the second one carries the outcome.
+    assert len(captured_bundles) == 2, (
+        f"Expected 2 LLM calls (one per preview); got {len(captured_bundles)}"
+    )
+    bundle_b = captured_bundles[1]
+    assert "applied_failed" in bundle_b, (
+        "Bundle for second preview should include 'Outcome status: applied_failed'"
+    )
+    assert distinct_failure_reason in bundle_b, (
+        "Bundle for second preview should include the failure_reason text"
+    )