Files
resolutionflow/backend/app/services/flow_matching_engine.py
chihlasm 5494816b06 feat(ai-session): add FlowPilot AI-powered troubleshooting sessions
Implements Phase 1 of the FlowPilot-First pivot — the core AI session
experience where engineers describe a problem and FlowPilot guides them
through structured diagnosis with selectable options, free-text escape
hatches, and auto-generated documentation on resolution.

Backend: AISession + AISessionStep models, FlowPilot Engine (LLM
orchestration with structured JSON output), Flow Matching Engine v1
(semantic + keyword + recency scoring), 8 API endpoints with auth,
rate limiting, and AI quota enforcement.

Frontend: Intake screen, conversational session view with sidebar,
step cards with options/actions/resolution suggestions, resolve/escalate
modals, documentation view with rating, session history integration,
and /pilot route with sidebar navigation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 14:27:36 +00:00

279 lines
9.0 KiB
Python

"""Flow Matching Engine v1 — find existing flows relevant to an AI session's intake.
Combines keyword matching, semantic search (via RAG embeddings), and recency
scoring to rank flows. Deliberately simple for v1; v2 (Phase 3) adds deeper
semantic matching.
Scoring weights: semantic 0.5, keyword 0.3, recency 0.2. A category/domain
match raises a flow's keyword score to at least 0.2.
Threshold: only return matches with composite score > 0.5.
"""
import logging
from datetime import datetime, timezone, timedelta
from typing import Any, Optional
from uuid import UUID
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.tree import Tree
from app.services.rag_service import search as rag_search
logger = logging.getLogger(__name__)

# Weights applied to the three per-flow signals when building the composite
# score; together they sum to 1.0.
SEMANTIC_WEIGHT: float = 0.5
KEYWORD_WEIGHT: float = 0.3
RECENCY_WEIGHT: float = 0.2

# A flow is returned only when its composite score is strictly greater than this.
SCORE_THRESHOLD: float = 0.5
def _new_candidate(
    tree_id: str,
    tree_name: Any,
    *,
    semantic_score: float = 0.0,
    keyword_score: float = 0.0,
) -> dict[str, Any]:
    """Build a candidate record with every score field initialised."""
    return {
        "tree_id": tree_id,
        "tree_name": tree_name,
        "semantic_score": semantic_score,
        "keyword_score": keyword_score,
        "recency_score": 0.0,
        "match_reasons": [],
    }


async def find_matches(
    intake_text: str,
    problem_domain: Optional[str],
    account_id: UUID,
    db: AsyncSession,
    limit: int = 5,
) -> list[dict[str, Any]]:
    """Find existing flows that match the intake description.

    Candidates are gathered from semantic search, keyword overlap and (when
    ``problem_domain`` is given) category matching, then given a recency
    score. Each source is best-effort: a failure is logged as a warning and
    the remaining sources still contribute.

    Args:
        intake_text: Free-text problem description to match against.
        problem_domain: Optional domain used for the category match.
        account_id: Account whose flows are searched.
        db: Async database session passed through to every lookup.
        limit: Maximum number of matches to return; values <= 0 yield ``[]``.

    Returns:
        List of dicts sorted by composite score, highest first:
        {tree_id, tree_name, score, match_reason}. Only candidates whose
        composite score is strictly above ``SCORE_THRESHOLD`` are included.
    """
    # A non-positive limit can never yield results; a negative one would
    # otherwise slice from the end of the list and silently drop the tail.
    if limit <= 0:
        return []

    candidates: dict[str, dict[str, Any]] = {}

    # 1. Semantic search via existing RAG embeddings
    try:
        rag_results = await rag_search(
            query=intake_text,
            account_id=account_id,
            db=db,
            limit=10,
        )
        for r in rag_results:
            tree_id = str(r["tree_id"])
            # Treat a missing or None similarity as "no signal" so max() below
            # cannot fail on a None value.
            similarity = r.get("similarity") or 0.0
            candidate = candidates.get(tree_id)
            if candidate is None:
                candidates[tree_id] = _new_candidate(
                    tree_id, r["tree_name"], semantic_score=similarity
                )
            else:
                # Take the best semantic score across chunks
                candidate["semantic_score"] = max(
                    candidate["semantic_score"], similarity
                )
    except Exception as e:
        logger.warning("Semantic search failed during flow matching: %s", e)

    # Record one semantic reason per flow, based on its best score. Adding one
    # per result would let a single flow's duplicates crowd keyword and domain
    # reasons out of the three reasons reported below.
    for candidate in candidates.values():
        best_similarity = candidate["semantic_score"]
        if best_similarity > 0.5:
            candidate["match_reasons"].append(
                f"semantic match ({best_similarity:.0%})"
            )

    # 2. Keyword matching against trees.match_keywords
    try:
        keyword_matches = await _keyword_match(intake_text, account_id, db)
        for km in keyword_matches:
            tree_id = str(km["tree_id"])
            candidate = candidates.get(tree_id)
            if candidate is None:
                candidate = _new_candidate(
                    tree_id, km["tree_name"], keyword_score=km["score"]
                )
                candidates[tree_id] = candidate
            else:
                candidate["keyword_score"] = km["score"]
            if km["score"] > 0.3:
                candidate["match_reasons"].append(
                    f"keyword match: {', '.join(km.get('matched_keywords', []))}"
                )
    except Exception as e:
        logger.warning("Keyword matching failed: %s", e)

    # 3. Category/domain match
    if problem_domain:
        try:
            domain_matches = await _domain_match(problem_domain, account_id, db)
            for dm in domain_matches:
                tree_id = str(dm["tree_id"])
                candidate = candidates.get(tree_id)
                if candidate is None:
                    # Small boost for domain match
                    candidate = _new_candidate(
                        tree_id, dm["tree_name"], keyword_score=0.2
                    )
                    candidates[tree_id] = candidate
                else:
                    # A domain match floors the keyword score at 0.2 without
                    # lowering a stronger keyword score.
                    candidate["keyword_score"] = max(
                        candidate["keyword_score"], 0.2
                    )
                candidate["match_reasons"].append(
                    f"domain match: {problem_domain}"
                )
        except Exception as e:
            logger.warning("Domain matching failed: %s", e)

    # 4. Apply recency boost (candidates keep 0.0 when the lookup fails or
    # returns nothing for them)
    if candidates:
        try:
            recency_data = await _get_recency_scores(list(candidates), db)
            for tree_id, recency_score in recency_data.items():
                if tree_id in candidates:
                    candidates[tree_id]["recency_score"] = recency_score
        except Exception as e:
            logger.warning("Recency scoring failed: %s", e)

    # 5. Compute composite scores and filter
    results = []
    for tree_id, c in candidates.items():
        composite = (
            c["semantic_score"] * SEMANTIC_WEIGHT
            + c["keyword_score"] * KEYWORD_WEIGHT
            + c["recency_score"] * RECENCY_WEIGHT
        )
        if composite > SCORE_THRESHOLD:
            reasons = c["match_reasons"]
            results.append({
                "tree_id": UUID(tree_id),
                "tree_name": c["tree_name"],
                "score": round(composite, 3),
                "match_reason": "; ".join(reasons[:3]) if reasons else "composite match",
            })

    # Sort by score descending, take top N
    results.sort(key=lambda x: x["score"], reverse=True)
    return results[:limit]
def _normalize_words(text: str) -> list[str]:
    """Lowercase *text*, split on whitespace, strip non-alphanumerics per word."""
    words = []
    for raw in text.lower().split():
        cleaned = "".join(ch for ch in raw if ch.isalnum())
        if cleaned:
            words.append(cleaned)
    return words


async def _keyword_match(
    intake_text: str,
    account_id: UUID,
    db: AsyncSession,
) -> list[dict[str, Any]]:
    """Match intake text against trees.match_keywords arrays.

    The intake text and every keyword are normalised the same way (lowercased,
    punctuation stripped per word), so keywords such as "Wi-Fi" or " vpn "
    can match. A multi-word keyword matches only when every one of its words
    is an intake token.

    Args:
        intake_text: Free-text problem description.
        account_id: Account whose published, non-deleted trees are considered.
        db: Async database session.

    Returns:
        One dict per tree with at least one matched keyword:
        {tree_id, tree_name, score, matched_keywords}. ``score`` is the
        fraction of the tree's distinct normalised keywords that matched;
        ``matched_keywords`` holds up to five of them, sorted.
    """
    # Extract meaningful tokens from intake (lowercase, 3+ chars)
    tokens = {word for word in _normalize_words(intake_text) if len(word) >= 3}
    if not tokens:
        return []

    # Find trees with match_keywords set
    result = await db.execute(
        select(Tree.id, Tree.name, Tree.match_keywords)
        .where(
            Tree.account_id == account_id,
            Tree.deleted_at.is_(None),
            Tree.status == "published",
            Tree.match_keywords.isnot(None),
        )
    )

    matches = []
    for row in result.all():
        tree_keywords = row.match_keywords or []
        if not isinstance(tree_keywords, list):
            continue

        # Map each normalised keyword phrase to its words. Keywords that
        # normalise to nothing (e.g. pure punctuation) are ignored.
        normalized: dict[str, list[str]] = {}
        for kw in tree_keywords:
            kw_words = _normalize_words(str(kw))
            if kw_words:
                normalized[" ".join(kw_words)] = kw_words

        # Sorted so the reported keywords are deterministic across runs.
        matched = sorted(
            phrase
            for phrase, kw_words in normalized.items()
            if all(word in tokens for word in kw_words)
        )
        if matched:
            matches.append({
                "tree_id": row.id,
                "tree_name": row.name,
                # matched is a subset of normalized, so this is within (0, 1].
                "score": len(matched) / len(normalized),
                "matched_keywords": matched[:5],
            })
    return matches
async def _domain_match(
    problem_domain: str,
    account_id: UUID,
    db: AsyncSession,
) -> list[dict[str, Any]]:
    """Find trees whose category contains the classified problem domain.

    The comparison is a case-insensitive substring match. LIKE wildcards in
    ``problem_domain`` are escaped so they are matched literally.

    Args:
        problem_domain: Domain text to look for inside ``Tree.category``.
        account_id: Account whose published, non-deleted trees are searched.
        db: Async database session.

    Returns:
        Up to 10 dicts of the form {tree_id, tree_name}; empty when the
        domain is blank.
    """
    domain = problem_domain.strip()
    if not domain:
        # A blank pattern would degenerate to "%%" and match every category.
        return []

    # Escape the escape character first, then the LIKE wildcards, so that a
    # domain such as "50%_cpu" is matched literally instead of as a pattern.
    escaped = (
        domain.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )

    result = await db.execute(
        select(Tree.id, Tree.name)
        .where(
            Tree.account_id == account_id,
            Tree.deleted_at.is_(None),
            Tree.status == "published",
            Tree.category.ilike(f"%{escaped}%", escape="\\"),
        )
        .limit(10)
    )
    return [{"tree_id": row.id, "tree_name": row.name} for row in result.all()]
async def _get_recency_scores(
    tree_ids: list[str],
    db: AsyncSession,
) -> dict[str, float]:
    """Calculate recency scores in [0.0, 1.0] based on last_matched_at.

    Trees matched within the last 7 days score 1.0; between 7 and 30 days the
    score decays linearly to 0.0; older or never-matched trees score 0.0.
    When a tree has a success rate, the score is multiplied by it before
    being clamped to [0.0, 1.0].

    Args:
        tree_ids: Tree ids as UUID strings.
        db: Async database session.

    Returns:
        Mapping of tree id (string) to recency score, for the trees found.

    Raises:
        ValueError: If an entry of ``tree_ids`` is not a valid UUID string.
    """
    if not tree_ids:
        return {}

    result = await db.execute(
        select(Tree.id, Tree.last_matched_at, Tree.success_rate)
        .where(Tree.id.in_([UUID(tid) for tid in tree_ids]))
    )

    now = datetime.now(timezone.utc)
    scores: dict[str, float] = {}
    for row in result.all():
        tree_id = str(row.id)
        last_matched = row.last_matched_at
        if last_matched is None:
            scores[tree_id] = 0.0
            continue

        # Subtracting a naive datetime from an aware one raises TypeError,
        # which would disable recency for every candidate.
        # NOTE(review): naive values are assumed to be UTC — confirm against
        # the column definition.
        if last_matched.utcoffset() is None:
            last_matched = last_matched.replace(tzinfo=timezone.utc)

        days_since = (now - last_matched).days
        if days_since <= 7:
            recency = 1.0
        elif days_since <= 30:
            recency = 1.0 - ((days_since - 7) / 23)  # Linear decay 7-30 days
        else:
            recency = 0.0

        # Factor in success rate if available. float() keeps the arithmetic
        # valid when the column yields a Decimal (float * Decimal raises).
        if row.success_rate is not None:
            recency *= float(row.success_rate)

        scores[tree_id] = max(0.0, min(1.0, recency))
    return scores