resolutionflow/backend/app/core/ai_chat_service.py

"""AI Chat Builder service.

Manages the conversational flow builder: system prompt construction,
message exchange with AI provider, and response parsing (extracting
tree updates, phase transitions, and metadata from structured markers).
"""
import json
import logging
import re
import uuid
from datetime import datetime, timezone, timedelta
from typing import Any, Optional

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.core.ai_provider import get_ai_provider
from app.core.ai_tree_validator import validate_generated_tree
from app.core.config import settings
from app.models.ai_chat_session import AIChatSession

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# ── Cost estimation ──
# Per-token rates expressed as "amount per one million tokens": 1.0 for input
# tokens and 5.0 for output tokens.
# NOTE(review): the currency and the provider/model these rates belong to are
# not stated here, and nothing in the visible part of this module reads them —
# confirm against the caller that does the cost calculation.
COST_PER_INPUT_TOKEN = 1.0 / 1_000_000
COST_PER_OUTPUT_TOKEN = 5.0 / 1_000_000

# ── Max messages per session ──
# Message caps for two tiers (names suggest free vs. paid accounts).
# NOTE(review): the functions visible in this module do not enforce these
# limits; presumably the API layer compares them with session.message_count —
# verify.
MAX_MESSAGES_FREE = 10
MAX_MESSAGES_PAID = 25


# ── System Prompt ──

# Persona and behavioural rules for the model. This is the first section of
# the system prompt assembled by _build_system_prompt and is sent to the AI
# provider verbatim, so any wording change here changes model behaviour.
ROLE_PERSONA = """You are a senior IT engineer embedded in ResolutionFlow, a troubleshooting platform for MSP (Managed Service Provider) engineers. You have 15+ years of hands-on experience across Windows Server, Active Directory, Entra ID/Azure AD, Microsoft 365, networking (DNS, DHCP, routing, VPN, firewalls), virtualization (Hyper-V, VMware), security, backup/DR, and cloud infrastructure.

Your job is to help engineers build troubleshooting decision trees by interviewing them about a problem space. You are NOT a generic assistant. You are a colleague who has seen these issues hundreds of times and knows the optimal diagnostic order.

CRITICAL BEHAVIORS:
- Act as a senior engineer, not a chatbot. Use your domain knowledge to SUGGEST diagnostic steps, not just record what the user says.
- When the user describes a problem area, demonstrate understanding by naming specific sub-categories, common causes, and relevant tools.
- Challenge assumptions constructively: "Before we go down that path, have you considered checking X first? In my experience, that resolves 60% of these cases."
- Capture SPECIFIC commands with exact syntax. Not "check the service" but "Get-Service ADSync | Select-Object Status, StartType".
- Include expected outcomes for every action: what does success look like?
- Surface edge cases proactively: "What about multi-forest environments?" or "Does this change if they have conditional access policies?"
- Explain WHY the diagnostic order matters: "We check connectivity before auth because a network issue masquerades as an auth failure."
- Ask ONE focused question at a time. Do not overwhelm with multiple questions.
- Use plain, collegial language. Sound like a colleague, not a form."""

# Prompt section describing the tree JSON the model must emit: the three node
# types (decision / action / solution), their required and optional fields,
# and the structural rules linking them.
# NOTE(review): send_message only checks that the root is a "decision" node
# with an id; the remaining rules are presumably enforced by
# validate_generated_tree in generate_final_tree — confirm against the
# validator before changing this text.
SCHEMA_CONTEXT = """
TREESTRUCTURE SCHEMA — This is what you are building:

The tree is a recursive JSON structure. Each node has a "type" field:

1. decision — A diagnostic question with branching options
   Required: id (string), type ("decision"), question (string), options (array), children (array)
   Optional: help_text (string)
   Each option: { id (string), label (string), next_node_id (string — must match a child's id) }

2. action — A step the engineer performs
   Required: id (string), type ("action"), title (string), description (string)
   Optional: commands (string array — exact CLI/PowerShell syntax), expected_outcome (string), help_text (string), next_node_id (string — ID of the next node to navigate to)

3. solution — A resolution endpoint
   Required: id (string), type ("solution"), title (string), description (string)
   Optional: resolution_steps (string array)

STRUCTURAL RULES:
- Root node MUST be type "decision"
- Decision nodes contain their children in the "children" array
- Each decision option's next_node_id typically references a child node's id, BUT can also reference ANY other node in the tree for loop-back / re-verification patterns
- Action nodes use next_node_id to chain to the next step — this can point to any node in the tree, including ancestors, for loop-backs (e.g., "remediate → re-verify from earlier checkpoint")
- Solution nodes are terminal — no next_node_id or children
- All IDs must be unique strings (use descriptive slugs like "check-service-status")

CROSS-REFERENCE / LOOP-BACK PATTERN:
When a troubleshooting path needs to loop back (e.g., after remediation, re-verify from an earlier checkpoint), set next_node_id to the target node's ID. Example: an action node "restart-ssh-service" can set next_node_id to "verify-ssh-connection" (an ancestor decision node) to create a re-verification loop.
"""

# Prompt section defining the four interview phases (scoping, discovery,
# enrichment, review) and when the model may emit [TREE_UPDATE] and
# [PHASE:...] markers in each.
# send_message also accepts "generation" as a phase name, which this prompt
# never asks the model to emit; generate_final_tree sets that phase directly.
INTERVIEW_PROTOCOL = """
INTERVIEW PHASES — Follow this progression:

PHASE 1 - SCOPING (current_phase: scoping):
Ask broad questions to understand the problem domain and scope:
- What type of issue is this flow for?
- Who is the target audience? (Tier 1 help desk, Tier 2, Tier 3?)
- What environment assumptions? (On-prem, hybrid, specific vendors?)
Demonstrate domain expertise immediately. If the user says "Azure AD Sync failures," show understanding: "Are you primarily seeing password hash sync issues, object attribute sync failures, or full directory sync errors?"
DO NOT emit [TREE_UPDATE] during scoping. You are still understanding the problem.

PHASE 2 - DISCOVERY (current_phase: discovery):
Work through the troubleshooting logic branch by branch:
- Establish the first diagnostic question (the root decision node)
- For each branch, ask what the engineer would check next
- Suggest checks the user might not have considered
- Capture specific commands, tools, and procedures
EMIT [TREE_UPDATE] ONLY when you and the user have agreed on a concrete node — a decision with clear options, or an action with a specific command. If you are asking a question, you are NOT updating the tree.

PHASE 3 - ENRICHMENT (current_phase: enrichment):
Circle back to enrich existing nodes:
- Add exact PowerShell/CLI commands with syntax
- Add help text with relevant documentation links
- Add expected outcomes for action nodes
- Suggest edge cases needing additional branches
EMIT [TREE_UPDATE] when enriching existing nodes or adding edge case branches.

PHASE 4 - REVIEW (current_phase: review):
Present a summary:
- Total node count by type
- Text outline of the flow structure
- Flag any areas of uncertainty
- Offer chance to add/remove/modify branches
EMIT [TREE_UPDATE] only if the user requests structural changes.

TRANSITION between phases by emitting [PHASE:phase_name] when the conversation naturally moves to the next stage. You decide when enough information has been gathered for each phase.
"""

# Prompt section specifying the marker syntax ([TREE_UPDATE], [PHASE:...],
# [METADATA]) the model embeds in its replies. _parse_ai_response extracts
# these markers and strips them from the text before it is stored or returned,
# so the marker spellings here must stay in sync with that parser's regexes.
RESPONSE_FORMAT = """
RESPONSE FORMAT:

Your response is natural conversational text. When the tree structure changes, include structured markers that will be parsed by the system (the user will NOT see these markers):

1. Tree update (only when structure changes — see phase rules above):
[TREE_UPDATE]
{...valid TreeStructure JSON...}
[/TREE_UPDATE]

2. Phase transition (when moving to next phase):
[PHASE:discovery]

3. Metadata capture (when you learn the flow's name, description, or tags):
[METADATA]
{"name": "...", "description": "...", "tags": ["..."]}
[/METADATA]

IMPORTANT:
- Include [TREE_UPDATE] sparingly. Only when concrete nodes are established or modified.
- The tree update should be the COMPLETE working tree, not a diff.
- Always include conversational text OUTSIDE the markers — never respond with only markers.
"""


def _build_system_prompt(flow_type: str) -> str:
    """Compose the chat builder's system prompt for the given flow type.

    The prompt consists of the persona, a short description of the requested
    flow, the schema, the interview protocol and the response format, in that
    order, separated by blank lines. Any ``flow_type`` other than
    ``"troubleshooting"`` receives the procedural description.
    """
    if flow_type == "troubleshooting":
        flow_context = (
            "The user wants to build a TROUBLESHOOTING flow — a diagnostic decision tree "
            "that guides engineers through symptom identification, diagnostic checks, and "
            "resolution steps."
        )
    else:
        flow_context = (
            "The user wants to build a PROCEDURAL flow — a step-by-step process guide "
            "with phases, checklists, and verification steps."
        )

    sections = (
        ROLE_PERSONA,
        flow_context,
        SCHEMA_CONTEXT,
        INTERVIEW_PROTOCOL,
        RESPONSE_FORMAT,
    )
    return "\n\n".join(sections)


def _strip_markdown_fences(text: str) -> str:
    """Strip markdown code fences if the model wrapped its JSON response."""
    text = text.strip()
    match = re.match(r"^```(?:json)?\s*([\s\S]*?)```$", text)
    if match:
        return match.group(1).strip()
    return text


def _parse_ai_response(raw_response: str) -> dict[str, Any]:
    """Parse structured markers from AI response.

    Returns dict with:
      - content: str (conversational text with markers stripped)
      - tree_update: dict | None (parsed TreeStructure JSON)
      - phase: str | None (new phase name)
      - metadata: dict | None (name, description, tags)
    """
    result: dict[str, Any] = {
        "content": raw_response,
        "tree_update": None,
        "phase": None,
        "metadata": None,
    }

    # Extract [TREE_UPDATE]...[/TREE_UPDATE]
    tree_match = re.search(
        r"\[TREE_UPDATE\]\s*([\s\S]*?)\s*\[/TREE_UPDATE\]", raw_response
    )
    if tree_match:
        try:
            raw_json = _strip_markdown_fences(tree_match.group(1))
            result["tree_update"] = json.loads(raw_json)
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning("Failed to parse tree update JSON: %s", e)
        result["content"] = raw_response[: tree_match.start()] + raw_response[tree_match.end() :]
    else:
        # Handle truncated response — opening tag exists but no closing tag
        # (happens when max_tokens cuts off the JSON block)
        truncated_match = re.search(r"\[TREE_UPDATE\][\s\S]*$", raw_response)
        if truncated_match:
            logger.warning("Truncated [TREE_UPDATE] block detected (no closing tag) — stripping from display")
            result["content"] = raw_response[: truncated_match.start()]

    # Extract [PHASE:name]
    phase_match = re.search(r"\[PHASE:(\w+)\]", result["content"])
    if phase_match:
        result["phase"] = phase_match.group(1)
        result["content"] = result["content"][: phase_match.start()] + result["content"][phase_match.end() :]

    # Extract [METADATA]...[/METADATA]
    meta_match = re.search(
        r"\[METADATA\]\s*([\s\S]*?)\s*\[/METADATA\]", result["content"]
    )
    if meta_match:
        try:
            raw_json = _strip_markdown_fences(meta_match.group(1))
            result["metadata"] = json.loads(raw_json)
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning("Failed to parse metadata JSON: %s", e)
        result["content"] = result["content"][: meta_match.start()] + result["content"][meta_match.end() :]
    else:
        truncated_meta = re.search(r"\[METADATA\][\s\S]*$", result["content"])
        if truncated_meta:
            logger.warning("Truncated [METADATA] block detected — stripping from display")
            result["content"] = result["content"][: truncated_meta.start()]

    # Clean up extra whitespace from marker removal
    result["content"] = re.sub(r"\n{3,}", "\n\n", result["content"]).strip()

    return result


# ── Main Service Functions ──


async def start_chat_session(
    flow_type: str,
    user_id: uuid.UUID,
    account_id: uuid.UUID,
    db: AsyncSession,
) -> tuple[AIChatSession, str]:
    """Open a new chat builder session and fetch the assistant's greeting.

    A session row is added and flushed first; its expiry is "now" plus
    ``settings.AI_CONVERSATION_TTL_HOURS``. A synthetic user primer is then
    sent to the AI provider with the system prompt, and the parsed reply
    becomes the first visible assistant message. The primer is stored in the
    history flagged ``hidden``.

    Any tree update or phase marker in the greeting is stripped from the text
    and otherwise discarded; metadata, if present, is stored on the session.

    Returns:
        ``(session, greeting_text)``.
    """
    ttl = timedelta(hours=settings.AI_CONVERSATION_TTL_HOURS)
    session = AIChatSession(
        user_id=user_id,
        account_id=account_id,
        flow_type=flow_type,
        expires_at=datetime.now(timezone.utc) + ttl,
    )
    db.add(session)
    await db.flush()

    # Ask the provider for an opening message using a canned first user turn.
    system_prompt = _build_system_prompt(flow_type)
    primer = f"I want to build a {flow_type} flow. Help me get started."

    provider = get_ai_provider()
    provider_name = settings.AI_PROVIDER

    reply = await provider.generate_text(
        system_prompt=system_prompt,
        messages=[{"role": "user", "content": primer}],
        max_tokens=1500,
    )
    response_text, input_tokens, output_tokens = reply

    parsed = _parse_ai_response(response_text)
    greeting = parsed["content"]

    # Both opening entries share one timestamp taken after the reply arrived.
    stamp = datetime.now(timezone.utc).isoformat()
    hidden_primer = {"role": "user", "content": primer, "timestamp": stamp, "hidden": True}
    greeting_entry = {"role": "assistant", "content": greeting, "timestamp": stamp}

    session.conversation_history = [hidden_primer, greeting_entry]
    session.provider_used = provider_name
    session.message_count = 1
    session.total_input_tokens = input_tokens
    session.total_output_tokens = output_tokens

    captured_metadata = parsed["metadata"]
    if captured_metadata:
        session.tree_metadata = captured_metadata

    return session, greeting


async def send_message(
    session: AIChatSession,
    user_message: str,
    db: AsyncSession,
) -> tuple[str, Optional[dict], Optional[str], Optional[dict]]:
    """Send a user message and get AI response.

    The full stored history plus the new user message is sent to the provider
    (role and content only). The reply is parsed for markers, lightly
    validated, and the session is updated in memory: history, message count,
    token totals, and — when present and valid — working tree, phase and
    metadata. ``db`` is accepted for interface symmetry but is not used here;
    persisting the session is left to the caller.

    Returns:
        ``(ai_content, working_tree_update, new_phase, metadata_update)``.
        Each of the last three is ``None`` unless the reply carried a value
        that was actually applied to the session — an unknown phase name or
        malformed tree/metadata is logged and reported as ``None``.
    """
    system_prompt = _build_system_prompt(session.flow_type)

    # Work on a copy of the stored history and assign a fresh list back later.
    # NOTE(review): presumably so the ORM registers the attribute as changed —
    # verify against the column type.
    sent_at = datetime.now(timezone.utc).isoformat()
    history = list(session.conversation_history or [])
    history.append({"role": "user", "content": user_message, "timestamp": sent_at})

    # Convert to provider format (just role + content).
    # NOTE(review): stored assistant turns have their markers stripped, so the
    # provider never sees earlier [TREE_UPDATE] payloads — confirm intended.
    provider_messages = [
        {"role": msg["role"], "content": msg["content"]}
        for msg in history
    ]

    provider = get_ai_provider()
    response_text, input_tokens, output_tokens = await provider.generate_text(
        system_prompt=system_prompt,
        messages=provider_messages,
        max_tokens=8000,
    )
    # Stamp the reply when it arrives rather than reusing the request time.
    received_at = datetime.now(timezone.utc).isoformat()

    parsed = _parse_ai_response(response_text)

    # Validate tree update if present (lightweight check for progressive builds —
    # only require valid root structure, not min node counts)
    tree_update = parsed["tree_update"]
    if tree_update:
        if not isinstance(tree_update, dict) or tree_update.get("type") != "decision":
            logger.warning("AI tree update rejected: root must be a decision node")
            tree_update = None
        elif not tree_update.get("id"):
            logger.warning("AI tree update rejected: root node missing id")
            tree_update = None

    # Only report a phase the session will actually adopt.
    valid_phases = {"scoping", "discovery", "enrichment", "review", "generation"}
    new_phase = parsed["phase"] if parsed["phase"] in valid_phases else None
    if parsed["phase"] and new_phase is None:
        logger.warning("AI phase transition ignored: unknown phase %r", parsed["phase"])

    # Metadata must be a JSON object to be merged with dict.update().
    metadata_update = parsed["metadata"]
    if metadata_update is not None and not isinstance(metadata_update, dict):
        logger.warning("AI metadata update rejected: expected a JSON object")
        metadata_update = None

    # Update session state
    history.append({"role": "assistant", "content": parsed["content"], "timestamp": received_at})
    session.conversation_history = history
    session.message_count = session.message_count + 1
    session.total_input_tokens = session.total_input_tokens + input_tokens
    session.total_output_tokens = session.total_output_tokens + output_tokens

    if tree_update:
        session.working_tree = tree_update

    if new_phase:
        session.current_phase = new_phase

    if metadata_update:
        # Tolerate a session whose metadata has never been set.
        merged = dict(session.tree_metadata or {})
        merged.update(metadata_update)
        session.tree_metadata = merged

    session.updated_at = datetime.now(timezone.utc)

    return parsed["content"], tree_update, new_phase, metadata_update


async def generate_final_tree(
    session: AIChatSession,
    db: AsyncSession,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Generate the final validated TreeStructure from the conversation.

    The stored conversation plus a generation instruction is sent to the
    provider. The tree is taken from a ``[TREE_UPDATE]`` block if the reply
    has one, otherwise the marker-stripped reply is parsed as JSON. The tree
    must be a JSON object and must pass ``validate_generated_tree``. On the
    first failure the reply and a corrective user message are appended and
    the request is repeated once.

    Token totals on the session are updated for every attempt, including
    failed ones. On success the session's working tree, metadata, phase
    (``"generation"``) and ``updated_at`` are set in memory. ``db`` is not
    used here; persisting the session is left to the caller.

    Returns:
        ``(tree_structure, metadata)``. Metadata is the session's accumulated
        metadata overlaid with any metadata object in the final reply.

    Raises:
        ValueError: if no valid JSON object is produced, or the tree still
            fails validation, after the retry.
    """
    system_prompt = _build_system_prompt(session.flow_type)

    # Build generation prompt from full conversation
    provider_messages = [
        {"role": msg["role"], "content": msg["content"]}
        for msg in session.conversation_history
    ]

    generation_instruction = """Based on our entire conversation, generate the COMPLETE and FINAL TreeStructure JSON for this flow.

Requirements:
- Include ALL branches, steps, and solutions we discussed
- Use descriptive node IDs (slugs, not UUIDs)
- Root node must be type "decision"
- Every decision option must have a valid next_node_id pointing to a child
- Every action node should have commands with exact syntax where discussed
- Every action node should have expected_outcome where discussed
- Solution nodes should have resolution_steps
- Respond with ONLY the JSON — no conversational text, no markdown fences

Also provide metadata as a separate JSON object after the tree:
[METADATA]
{"name": "...", "description": "...", "tags": ["..."]}
[/METADATA]"""

    provider_messages.append({"role": "user", "content": generation_instruction})

    provider = get_ai_provider()

    for attempt in range(2):  # One try + one retry
        response_text, input_tokens, output_tokens = await provider.generate_text(
            system_prompt=system_prompt,
            messages=provider_messages,
            max_tokens=8000,
        )

        session.total_input_tokens = session.total_input_tokens + input_tokens
        session.total_output_tokens = session.total_output_tokens + output_tokens

        # Extract metadata first. Overlay it on what the conversation already
        # captured (same merge as send_message) so a partial final metadata
        # object does not discard earlier fields.
        parsed = _parse_ai_response(response_text)
        metadata = dict(session.tree_metadata or {})
        if isinstance(parsed["metadata"], dict):
            metadata.update(parsed["metadata"])

        # Parse tree JSON — could be in tree_update marker or raw
        tree = parsed["tree_update"]
        if not tree:
            try:
                raw = _strip_markdown_fences(parsed["content"])
                tree = json.loads(raw)
            except (json.JSONDecodeError, ValueError) as e:
                logger.warning(
                    "Final tree response was not parseable JSON (attempt %d): %s",
                    attempt + 1,
                    e,
                )
                tree = None

        # Valid JSON that is not an object (a list, string, number) cannot be
        # a tree; treat it like a parse failure instead of handing it to the
        # validator.
        if tree and not isinstance(tree, dict):
            logger.warning(
                "Final tree response was JSON but not an object (got %s)",
                type(tree).__name__,
            )
            tree = None

        if not tree:
            if attempt == 0:
                provider_messages.append({"role": "assistant", "content": response_text})
                provider_messages.append({
                    "role": "user",
                    "content": "That response was not valid JSON. Please respond with ONLY the TreeStructure JSON object, starting with { and ending with }. No markdown fences, no explanatory text.",
                })
                continue
            raise ValueError("AI failed to produce valid JSON after retry")

        errors = validate_generated_tree(tree)
        if errors:
            if attempt == 0:
                provider_messages.append({"role": "assistant", "content": response_text})
                correction = (
                    f"The tree has validation errors: {'; '.join(errors)}. "
                    "Please fix these issues and respond with the corrected JSON only."
                )
                provider_messages.append({"role": "user", "content": correction})
                continue
            raise ValueError(f"Generated tree failed validation: {'; '.join(errors)}")

        # Success
        session.working_tree = tree
        session.tree_metadata = metadata
        session.current_phase = "generation"
        session.updated_at = datetime.now(timezone.utc)

        return tree, metadata

    # Defensive: every path through the loop above returns, continues or raises.
    raise ValueError("AI failed to generate a valid tree")


async def get_chat_session(
    session_id: uuid.UUID,
    user_id: uuid.UUID,
    db: AsyncSession,
) -> AIChatSession:
    """Load a chat session by id after checking ownership and expiry.

    Checks run in order: existence, owner match, expiry. An expired session
    is marked ``"abandoned"`` and flushed before the error is raised.

    Raises:
        HTTPException: 404 if no session has this id, 403 if it belongs to a
            different user, 410 if ``expires_at`` is in the past.
    """
    from fastapi import HTTPException, status

    lookup = select(AIChatSession).where(AIChatSession.id == session_id)
    session = (await db.execute(lookup)).scalar_one_or_none()

    if not session:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Chat session not found",
        )

    if session.user_id != user_id:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Access denied",
        )

    now = datetime.now(timezone.utc)
    if session.expires_at < now:
        session.status = "abandoned"
        await db.flush()
        raise HTTPException(
            status_code=status.HTTP_410_GONE,
            detail="Chat session has expired",
        )

    return session