fix(pilot): outcome-aware Resolve/Escalate previews
Issue #1 from phase-8-review-issues.md. Cache invalidation alone isn't enough — previews were also omitting outcome fields from the LLM bundle, so a fresh regenerate still couldn't distinguish proposed / failed / partial / success.

- PATCH /outcome now bumps ai_sessions.state_version (matches record_decision's existing pattern).
- Resolution-note + escalation-package bundles now include status, applied_at, verified_at, partial_notes, failure_reason on the active fix.
- Generator prompts prescribe outcome-aware phrasing (closure language for success; what-we've-tried + next-steps for failed/partial).
- New end-to-end test asserts the regenerated preview reflects the recorded outcome, not just that the cache key changed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,13 +5,24 @@ Fixture style follows test_session_suggested_fixes_api.py:
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, call, patch
|
||||
|
||||
import pytest
|
||||
from httpx import AsyncClient
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.api.endpoints.session_suggested_fixes import _clear_preview_cache_for_tests
|
||||
from app.models.ai_session import AISession
|
||||
from app.models.session_suggested_fix import SessionSuggestedFix
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _isolate_preview_cache():
    """Run ``_clear_preview_cache_for_tests`` before and after every test.

    The fixture is autouse, so each test in this module is wrapped by the
    two calls without having to request it explicitly.
    """
    # Setup: start the test without entries left over from an earlier test.
    _clear_preview_cache_for_tests()
    yield
    # Teardown: do not leak this test's entries into whatever runs next.
    _clear_preview_cache_for_tests()
|
||||
|
||||
|
||||
# ── shared helper ────────────────────────────────────────────────────────────
|
||||
|
||||
async def _make_session_with_fix(test_db, user) -> tuple[str, str]:
|
||||
@@ -197,3 +208,122 @@ async def test_failed_outcome_stores_notes_as_failure_reason(
|
||||
body = r.json()
|
||||
assert body["failure_reason"] == "user reports no change"
|
||||
assert body["partial_notes"] is None
|
||||
|
||||
|
||||
# ── state_version bump ────────────────────────────────────────────────────────
|
||||
|
||||
@pytest.mark.asyncio
async def test_outcome_patch_bumps_state_version(
    client: AsyncClient, test_user, auth_headers, test_db
):
    """Recording an outcome must advance ``ai_sessions.state_version`` by one.

    The version is read straight from the database before the PATCH and
    again (after a refresh) once the PATCH has returned 200, mirroring the
    behaviour the original docstring attributes to ``record_decision``.
    """
    from uuid import UUID

    session_id, fix_id = await _make_session_with_fix(test_db, test_user)

    # Baseline: load the session row so there is a version to compare against.
    session_query = select(AISession).where(AISession.id == UUID(session_id))
    session_row = (await test_db.execute(session_query)).scalar_one()
    version_before = session_row.state_version

    outcome_url = (
        f"/api/v1/ai-sessions/{session_id}/suggested-fixes/{fix_id}/outcome"
    )
    response = await client.patch(
        outcome_url,
        json={"outcome": "applied_success"},
        headers=auth_headers,
    )
    assert response.status_code == 200

    # Re-read the row; the value held in memory predates the PATCH.
    await test_db.refresh(session_row)
    assert session_row.state_version == version_before + 1, (
        "Outcome patch must bump state_version so preview cache is invalidated"
    )
|
||||
|
||||
|
||||
# ── outcome propagation into preview bundle ───────────────────────────────────
|
||||
|
||||
@pytest.mark.asyncio
async def test_resolution_note_preview_reflects_outcome_after_patch(
    client: AsyncClient, test_user, auth_headers, test_db
):
    """End-to-end check that a regenerated preview carries the recorded outcome.

    Flow: request a resolution-note preview, PATCH an ``applied_failed``
    outcome with a distinctive note, then request the preview again. The
    second preview must be a cache miss, report a higher ``state_version``,
    differ from the first, and its LLM bundle must contain both the outcome
    status and the note text.

    The AI provider is replaced with a stub that records the first message's
    content and echoes it back inside the returned markdown, which keeps the
    test deterministic and lets the bundle be inspected directly.
    """
    session_id, fix_id = await _make_session_with_fix(test_db, test_user)

    preview_url = f"/api/v1/ai-sessions/{session_id}/resolution-note/preview"
    outcome_url = (
        f"/api/v1/ai-sessions/{session_id}/suggested-fixes/{fix_id}/outcome"
    )

    distinct_failure_reason = "DISTINCT-FAILURE-REASON-XYZZY-42"
    captured_bundles: list[str] = []

    # Parameter names are kept as-is: the generator may pass them by keyword.
    async def echo_bundle(system_prompt, messages, max_tokens):
        user_content = messages[0]["content"]
        captured_bundles.append(user_content)
        # Embed the bundle verbatim in the markdown so its shape can be
        # asserted without digging through mock internals.
        markdown = (
            "## Problem\ntest\n\n## What we confirmed\n(none)\n\n"
            f"## Root cause\ntest\n\n## Resolution\nBUNDLE_CONTENT={user_content}"
        )
        return markdown, 100, 50

    stub_provider = AsyncMock()
    stub_provider.generate_text = AsyncMock(side_effect=echo_bundle)

    with patch(
        "app.services.resolution_note_generator.get_ai_provider",
        return_value=stub_provider,
    ):
        # Preview A — before any outcome recorded (status = "proposed").
        first_response = await client.post(preview_url, headers=auth_headers)
        assert first_response.status_code == 200
        first_payload = first_response.json()
        markdown_a = first_payload["markdown"]
        version_a = first_payload["state_version"]
        assert first_payload["from_cache"] is False

        # Record an applied_failed outcome with a distinctive reason.
        patch_response = await client.patch(
            outcome_url,
            json={"outcome": "applied_failed", "notes": distinct_failure_reason},
            headers=auth_headers,
        )
        assert patch_response.status_code == 200

        # Preview B — must be a cache miss because state_version changed.
        second_response = await client.post(preview_url, headers=auth_headers)
        assert second_response.status_code == 200
        second_payload = second_response.json()
        markdown_b = second_payload["markdown"]
        version_b = second_payload["state_version"]
        assert second_payload["from_cache"] is False, (
            "Preview after outcome patch must be a cache miss (state_version changed)"
        )

    # State version increased between the two previews.
    assert version_b > version_a, (
        f"state_version should have increased; got {version_a} → {version_b}"
    )

    # Markdown differs between the two previews.
    assert markdown_a != markdown_b, (
        "Regenerated preview after outcome patch should differ from pre-outcome preview"
    )

    # The bundle passed to the LLM for preview B includes the outcome fields.
    llm_call_count = len(captured_bundles)
    assert llm_call_count == 2, f"Expected 2 LLM calls (one per preview); got {llm_call_count}"
    bundle_b = captured_bundles[1]
    assert "applied_failed" in bundle_b, (
        "Bundle for second preview should include 'Outcome status: applied_failed'"
    )
    assert distinct_failure_reason in bundle_b, (
        "Bundle for second preview should include the failure_reason text"
    )
|
||||
|
||||
Reference in New Issue
Block a user