WIP: SSE pub/sub for live escalation arrivals (paused for Codex review)

First half of the WebSocket/SSE push slice. Paused mid-flight to hand
the branch to Codex for outside-voice review before stacking more
commits on top. See .ai/HANDOFF.md for the full pause context + what
to look at.

What's here:
- backend/app/core/escalation_bus.py — module-level singleton in-memory
  pub/sub keyed by account_id. asyncio.Queue per subscriber with
  64-event maxsize and drop-on-full semantics. Designed to be swappable
  for Redis pub/sub when Railway scales past single-replica.
- backend/app/api/endpoints/session_handoffs.py — GET
  /api/v1/ai-sessions/escalations/stream SSE endpoint. Auth via
  require_engineer_or_admin. 25s heartbeat. Account-scoped subscribe
  bound to current_user.account_id.
- backend/app/services/handoff_manager.py — dispatch_escalation_notifications
  now publishes a `handoff_created` event to the bus BEFORE the email
  fan-out, in a try/except so a bus failure can't block email delivery.
- backend/tests/test_escalation_bus.py — 7 unit tests, all green
  standalone (0.14s). Covers cross-tenant isolation, drop-on-full, and
  the no-subscribers case.
- backend/tests/test_handoff_manager.py — +1 dispatcher integration test
  (publishes to bus, payload shape).
- backend/tests/test_session_handoffs_api.py — +2 endpoint tests (viewer
  blocked, ready event handshake).

[gstack-context]
Decisions:
  - SSE over WebSocket (one-way, browser EventSource semantics, fewer
    moving parts behind Railway proxy)
  - In-memory bus over Redis for v1 pilot (3 MSPs, single replica)
  - Drop-on-full subscriber queue rather than back-pressure publishers
  - Bus publish ahead of email send, both wrapped in try/except so
    neither can break handoff creation
  - Frontend will be a fetch-based ReadableStream reader matching the
    existing streamDocumentation pattern, not native EventSource
    (custom-header auth)
Remaining (post-Codex):
  - Frontend SSE subscription in EscalationQueue.tsx (slide-in,
    reconnect, tab-title flash, prefers-reduced-motion)
  - Magic-moment handoff-context screen
  - Re-run the full backend test suite to verify the SSE +
    dispatcher integration tests (bus units already green standalone)
Tried:
  - Running the full test suite repeatedly without xdist: the per-test
    DROP SCHEMA + recreate fixture made wall-clock time prohibitive once
    multiple stale runs collided on the same Postgres test schema.
    Resolution: run with -n auto next time.
[/gstack-context]

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-27 19:29:07 -04:00
parent a283d0d3fd
commit 87bd0b7c56
6 changed files with 408 additions and 4 deletions

View File

@@ -0,0 +1,106 @@
"""Unit tests for the in-memory escalation pub/sub bus."""
import asyncio
from uuid import uuid4
import pytest
from app.core.escalation_bus import EscalationBus
@pytest.mark.asyncio
async def test_publish_with_no_subscribers_returns_zero():
    """Publishing to an account with no subscribers reports zero deliveries."""
    event_bus = EscalationBus()
    unwatched_account = uuid4()
    delivery_count = await event_bus.publish(
        unwatched_account, {"type": "handoff_created"}
    )
    assert delivery_count == 0
@pytest.mark.asyncio
async def test_subscribe_then_publish_delivers_event():
    """A subscriber receives the payload that was published to its account."""
    event_bus = EscalationBus()
    account_id = uuid4()
    inbox = await event_bus.subscribe(account_id)
    try:
        delivery_count = await event_bus.publish(
            account_id, {"type": "handoff_created", "id": "x"}
        )
        assert delivery_count == 1

        # Bounded wait so a missing delivery fails fast instead of hanging.
        received = await asyncio.wait_for(inbox.get(), timeout=1.0)
        assert received == {"type": "handoff_created", "id": "x"}
    finally:
        await event_bus.unsubscribe(account_id, inbox)
@pytest.mark.asyncio
async def test_two_subscribers_same_account_both_receive():
    """Both subscribers of the same account receive a published event."""
    event_bus = EscalationBus()
    account_id = uuid4()
    first_inbox = await event_bus.subscribe(account_id)
    second_inbox = await event_bus.subscribe(account_id)
    try:
        fan_out = await event_bus.publish(account_id, {"type": "x"})
        assert fan_out == 2

        first_event = await asyncio.wait_for(first_inbox.get(), timeout=1.0)
        second_event = await asyncio.wait_for(second_inbox.get(), timeout=1.0)
        assert first_event == second_event == {"type": "x"}
    finally:
        # Same order as subscription: first, then second.
        for inbox in (first_inbox, second_inbox):
            await event_bus.unsubscribe(account_id, inbox)
@pytest.mark.asyncio
async def test_subscriber_in_other_account_does_not_receive():
    """Cross-tenant isolation: an event for one account must not reach a
    subscriber registered under a different account."""
    event_bus = EscalationBus()
    publishing_account, bystander_account = uuid4(), uuid4()
    publisher_inbox = await event_bus.subscribe(publishing_account)
    bystander_inbox = await event_bus.subscribe(bystander_account)
    try:
        delivery_count = await event_bus.publish(publishing_account, {"type": "x"})
        assert delivery_count == 1

        received = await asyncio.wait_for(publisher_inbox.get(), timeout=1.0)
        assert received == {"type": "x"}

        # The other account's queue has to stay empty, so get() must time out.
        with pytest.raises(asyncio.TimeoutError):
            await asyncio.wait_for(bystander_inbox.get(), timeout=0.1)
    finally:
        await event_bus.unsubscribe(publishing_account, publisher_inbox)
        await event_bus.unsubscribe(bystander_account, bystander_inbox)
@pytest.mark.asyncio
async def test_unsubscribe_drops_subscriber_count_to_zero():
    """subscriber_count reads 1 after subscribe and 0 after unsubscribe."""
    event_bus = EscalationBus()
    account_id = uuid4()

    inbox = await event_bus.subscribe(account_id)
    count_while_subscribed = event_bus.subscriber_count(account_id)
    assert count_while_subscribed == 1

    await event_bus.unsubscribe(account_id, inbox)
    count_after_unsubscribe = event_bus.subscriber_count(account_id)
    assert count_after_unsubscribe == 0
@pytest.mark.asyncio
async def test_publish_drops_events_when_subscriber_queue_is_full():
    """A stuck subscriber must not back-pressure publishers.

    Publishes one more event than the subscriber queue can hold without
    ever consuming, then checks that the queue ended up exactly full:
    events up to capacity are retained and the overflow publish neither
    raises nor blocks.
    """
    bus = EscalationBus()
    account = uuid4()
    queue = await bus.subscribe(account)
    try:
        # Stuff the queue past capacity (maxsize is 64) without consuming.
        for _ in range(65):
            await bus.publish(account, {"type": "x"})
        # Exactly maxsize events must be retained. A bare `<= 64` bound would
        # also pass for a bus that silently delivered nothing at all.
        assert queue.qsize() == 64
        # Publishes after capacity didn't raise — they were dropped silently.
    finally:
        await bus.unsubscribe(account, queue)
@pytest.mark.asyncio
async def test_unsubscribe_unknown_queue_is_noop():
    """Unsubscribing an account/queue pair that was never registered must
    not raise; cleanup in finally blocks depends on that."""
    event_bus = EscalationBus()
    unknown_account = uuid4()
    unregistered_queue: asyncio.Queue = asyncio.Queue()

    # The test passes simply by this call returning without an exception.
    await event_bus.unsubscribe(unknown_account, unregistered_queue)

View File

@@ -278,6 +278,58 @@ async def test_dispatch_graceful_degradation_when_email_raises(
assert sent == 0
@pytest.mark.asyncio
async def test_dispatch_publishes_to_escalation_bus(
    client: AsyncClient, test_user, auth_headers, test_db
):
    """dispatch_escalation_notifications emits a `handoff_created` event on
    the in-memory bus so connected SSE subscribers see live arrivals.

    The test subscribes for the test user's account, runs the dispatcher
    with the email sender patched out, and checks the event's payload shape.
    """
    import asyncio
    from uuid import UUID as PyUUID

    from app.core.escalation_bus import bus as escalation_bus

    user_data = test_user["user_data"]

    # Persist a minimal active session that the handoff can point at.
    ai_session = AISession(
        user_id=user_data["id"],
        account_id=user_data["account_id"],
        session_type="guided",
        intake_type="free_text",
        intake_content={"text": "x"},
        problem_summary="VPN down",
        status="active",
        confidence_tier="discovery",
        conversation_messages=[],
    )
    test_db.add(ai_session)
    await test_db.commit()

    handoff_manager = HandoffManager(test_db)
    created_handoff = await handoff_manager.create_handoff(
        session_id=ai_session.id,
        intent="escalate",
        engineer_notes="please help",
        user_id=user_data["id"],
    )
    await test_db.commit()

    subscriber_account = PyUUID(user_data["account_id"])
    inbox = await escalation_bus.subscribe(subscriber_account)
    try:
        email_stub = AsyncMock(return_value=True)
        with patch(
            "app.services.handoff_manager.EmailService.send_notification_email",
            new=email_stub,
        ):
            await handoff_manager.dispatch_escalation_notifications(created_handoff)

        # Bounded wait: a missing publish should fail fast, not hang.
        published = await asyncio.wait_for(inbox.get(), timeout=1.0)
        assert published["type"] == "handoff_created"
        assert published["handoff_id"] == str(created_handoff.id)
        assert published["session_id"] == str(ai_session.id)
        assert published["priority"] == "normal"
    finally:
        await escalation_bus.unsubscribe(subscriber_account, inbox)
@pytest.mark.asyncio
async def test_create_handoff_endpoint_dispatches_on_escalate(
client: AsyncClient, test_user, auth_headers, test_db

View File

@@ -113,6 +113,49 @@ async def test_claim_blocked_for_viewer_role(
assert "engineer" in claim_resp.json()["detail"].lower()
@pytest.mark.asyncio
async def test_escalations_stream_blocked_for_viewer(
    client: AsyncClient, test_user, auth_headers, test_db
):
    """SSE stream is role-gated to engineer-or-admin (matches queue/claim):
    once the test user is demoted to `viewer`, the endpoint answers 403."""
    lookup = select(User).where(User.id == PyUUID(test_user["user_data"]["id"]))
    demoted_user = (await test_db.execute(lookup)).scalar_one()
    demoted_user.account_role = "viewer"
    await test_db.commit()

    stream_url = "/api/v1/ai-sessions/escalations/stream"
    resp = await client.get(stream_url, headers=auth_headers)
    assert resp.status_code == 403
@pytest.mark.asyncio
async def test_escalations_stream_returns_sse_content_type(
    client: AsyncClient, test_user, auth_headers, test_db
):
    """Engineer/owner can open the SSE stream and gets text/event-stream
    plus an initial `ready` event. Read just enough bytes to confirm the
    handshake — the full pub/sub flow is covered by the bus + dispatcher
    tests separately.

    The whole handshake read runs under a deadline: the stream never ends
    on its own, so without one a missing (or transport-buffered) `ready`
    event would hang the test run instead of failing it.
    """
    import asyncio

    async def _read_handshake() -> bytes:
        """Open the stream and return the bytes read up to the ready event."""
        async with client.stream(
            "GET",
            "/api/v1/ai-sessions/escalations/stream",
            headers=auth_headers,
        ) as resp:
            assert resp.status_code == 200
            assert resp.headers["content-type"].startswith("text/event-stream")
            # Accumulate chunks until a complete `ready` event has arrived.
            received = b""
            async for chunk in resp.aiter_bytes():
                received += chunk
                if b"event: ready" in received and b"\n\n" in received:
                    break
            return received

    # On timeout wait_for cancels the reader (closing the stream) and raises,
    # which turns a would-be hang into an ordinary test failure.
    first = await asyncio.wait_for(_read_handshake(), timeout=5.0)
    assert b"event: ready" in first
    assert b'"account_id"' in first
@pytest.mark.asyncio
async def test_claim_allowed_for_engineer_role(
client: AsyncClient, test_user, auth_headers, test_db