# resolutionflow/backend/app/services/assistant_chat_service.py

"""Shared AI chat infrastructure — system prompt, prompt caching, and AI calling.

Used by unified_chat_service (the active chat backend). The assistant_chat
CRUD endpoints were removed — only retention settings remain on that router.

Uses Anthropic prompt caching to reduce cost on multi-turn conversations:
- The static system prompt is cached (ephemeral, 5-min TTL)
- The conversation history prefix is cached via a breakpoint on the
  last existing message before the new user input

Optionally connects to Microsoft Learn via Anthropic's MCP connector
for real-time documentation lookups (controlled by ENABLE_MCP_MICROSOFT_LEARN).
"""
import logging
from typing import Any

from app.core.config import settings

logger = logging.getLogger(__name__)

# Static system prompt text for the ResolutionFlow assistant. Sections: role,
# answer style, question etiquette, team flow library, Microsoft Learn usage,
# image analysis, and boundaries.
#
# A trailing backslash inside the literal is a line continuation: the following
# source line is joined onto the same prompt line with no newline in between.
#
# NOTE(review): the "Using Microsoft Learn Documentation" section is part of
# this text unconditionally, while _call_anthropic_cached only attaches the
# Microsoft Learn MCP server when settings.ENABLE_MCP_MICROSOFT_LEARN is set —
# confirm whether the section should be omitted when that flag is off.
ASSISTANT_SYSTEM_PROMPT = """\
You are ResolutionFlow Assistant — an expert IT systems engineer embedded in a \
troubleshooting platform built for Managed Service Provider (MSP) teams.

## Your Role
You are a senior peer helping fellow MSP engineers solve problems fast. You have \
deep expertise across the MSP technology stack:
- Windows Server, Active Directory, Group Policy, Hybrid Identity (Entra ID / Azure AD)
- Networking: TCP/IP, DNS, DHCP, VPN, firewalls (Cisco, Fortinet, Meraki, SonicWall)
- Virtualization: VMware vSphere, Hyper-V, Proxmox
- Cloud platforms: Microsoft 365, Azure, AWS
- Endpoint management, RMM tools, and PSA platforms (ConnectWise, Datto, Kaseya, NinjaRMM)
- PowerShell scripting and automation
- Security: MFA, Conditional Access, EDR, backup/DR

## How to Answer
- **Be direct and actionable.** Engineers are mid-ticket — lead with the fix or next \
diagnostic step, then explain why in one sentence if helpful. Skip background unless asked.
- **Include specifics.** Exact commands, registry paths, config values, port numbers. \
Vague advice wastes time.
- **Warn before you wreck.** If a step could cause downtime, data loss, or a lockout, \
say so upfront — before the command.
- **Use structured formatting.** Bullet points for steps, code blocks for commands, \
bold for key terms. Engineers scan, they don't read essays.
- **Say when you're unsure.** If you don't know the exact answer, say so. Suggest \
where to verify (vendor docs, a specific KB article) rather than guessing.

## How to Ask Questions
- **Default to a single focused question.** Ask what you need to know right now to make progress.
- **Use contextual bullets sparingly.** If the question could be ambiguous (e.g., "what error?" \
when there are multiple common patterns), add 2-3 sub-bullets to help the engineer recognize \
what you're asking for — but keep it short.
- **Multiple questions only when blocking.** If you genuinely cannot proceed without knowing \
two things (e.g., both the error message AND which users are affected), preface it clearly: \
"Before continuing troubleshooting, I need to know: 1) [question], 2) [question]." Use this rarely.
- **Avoid interrogation mode.** Don't fire off 5 questions in a row. Get one answer, make \
progress, then ask the next question if needed.

## Using the Team's Flow Library
Your team has built troubleshooting flows in ResolutionFlow. When relevant flows \
appear in the context below, reference them by name so the engineer can launch them \
directly. Prefer the team's proven flows over ad-hoc instructions when they exist.

## Using Microsoft Learn Documentation
You have access to Microsoft's official documentation via Microsoft Learn. Use it when:
- The question involves exact cmdlet syntax, API parameters, or configuration steps
- You need to verify current Microsoft/Azure behavior or requirements
- No team flow covers the topic and vendor-specific detail would help
Do NOT use Microsoft Learn for every question — only when official docs add real value.

## Image Analysis
When an image is attached, analyze it carefully. Screenshots of error messages, \
config panels, event viewer logs, and network diagrams are common in MSP work. \
Describe what you see and use the visual information to inform your troubleshooting advice.

## Boundaries
- Stay focused on IT infrastructure, systems administration, and MSP operations.
- If a question is clearly outside your domain, say so briefly and redirect.
- Never fabricate error codes, KB article numbers, or CLI flags. If unsure, say so.
"""


async def _call_ai(
    system_base: str,
    rag_context: str,
    history: list[dict[str, Any]],
    new_message: str,
    max_tokens: int = 4096,
    images: list[dict[str, Any]] | None = None,
) -> tuple[str, int, int]:
    """Send one chat turn to the configured AI backend.

    When ``settings.AI_PROVIDER`` is ``"anthropic"`` and an API key is set,
    the call goes through ``_call_anthropic_cached`` (prompt caching, image
    support). Otherwise it falls back to the generic provider returned by
    ``get_ai_provider()``.

    Caching strategy (Anthropic path only):
    - System prompt base: cached (stable across all turns)
    - RAG context: NOT cached (changes per query)
    - Conversation history prefix: cached via breakpoint on last
      existing message (stable — only new user message is uncached)

    Args:
        system_base: Static part of the system prompt.
        rag_context: Per-query context appended after ``system_base``.
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts.
        new_message: The new user message for this turn.
        max_tokens: Completion token limit passed to the backend.
        images: Optional list of {"media_type": str, "data": str (base64)}
                to include alongside the new_message as vision content.
                Only the Anthropic path sends them; the fallback path logs
                a warning and drops them.

    Returns:
        The Anthropic path returns ``(text, input_tokens, output_tokens)``.
        The fallback returns whatever ``provider.generate_text`` returns —
        presumably the same triple; not verifiable from this module.
    """
    if settings.AI_PROVIDER == "anthropic" and settings.ANTHROPIC_API_KEY:
        return await _call_anthropic_cached(
            system_base, rag_context, history, new_message, max_tokens,
            images=images,
        )

    # Fallback: generic provider (Gemini, etc.) — images not supported
    from app.core.ai_provider import get_ai_provider

    if images:
        # Make the drop visible instead of silently answering as if no
        # attachment had been sent.
        logger.warning(
            "Dropping %d image attachment(s): the generic provider path "
            "does not support images",
            len(images),
        )

    system_prompt = system_base + rag_context
    messages = history + [{"role": "user", "content": new_message}]
    provider = get_ai_provider()
    return await provider.generate_text(
        system_prompt=system_prompt,
        messages=messages,
        max_tokens=max_tokens,
    )


def _mark_cache_breakpoint(message: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *message* with a cache breakpoint on its last content block."""
    role, content = message["role"], message["content"]
    if isinstance(content, str):
        return {
            "role": role,
            "content": [
                {
                    "type": "text",
                    "text": content,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        }
    if isinstance(content, list) and content and isinstance(content[-1], dict):
        # Content is already block-structured (e.g. an earlier multimodal
        # turn): tag the final block rather than nesting the list inside a
        # text block, which would produce an invalid request.
        tagged = {**content[-1], "cache_control": {"type": "ephemeral"}}
        return {"role": role, "content": [*content[:-1], tagged]}
    # Unknown content shape: pass it through without a breakpoint.
    return {"role": role, "content": content}


def _build_user_turn(
    new_message: str,
    images: list[dict[str, Any]] | None,
) -> dict[str, Any]:
    """Build the new (uncached) user message, with image blocks when attached."""
    if not images:
        return {"role": "user", "content": new_message}

    content_blocks: list[dict[str, Any]] = [
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": img["media_type"],
                "data": img["data"],
            },
        }
        for img in images
    ]
    # The Messages API rejects empty text blocks, so an image-only turn
    # carries just the image blocks.
    if new_message.strip():
        content_blocks.append({"type": "text", "text": new_message})
    return {"role": "user", "content": content_blocks}


async def _call_anthropic_cached(
    system_base: str,
    rag_context: str,
    history: list[dict[str, Any]],
    new_message: str,
    max_tokens: int,
    images: list[dict[str, Any]] | None = None,
) -> tuple[str, int, int]:
    """Call Anthropic with prompt caching on system prompt and history.

    Uses structured system blocks so the static base prompt is cached
    independently from the per-query RAG context. Optionally connects
    to Microsoft Learn via MCP for real-time documentation lookups; if the
    API reports an MCP server error, the request is retried once without MCP.

    Args:
        system_base: Static system prompt; sent as a cached block.
        rag_context: Per-query context; sent as a second, uncached system
            block when non-empty.
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts. The
            last one receives the cache breakpoint.
        new_message: The new user message for this turn.
        max_tokens: Completion token limit.
        images: Optional list of {"media_type": str, "data": str (base64)}
            sent as image blocks ahead of the text.

    Returns:
        ``(text, input_tokens, output_tokens)`` where ``text`` is all text
        blocks of the response joined with newlines, and the token counts
        are taken from ``response.usage``.

    Raises:
        anthropic.BadRequestError: For bad requests that are not MCP server
            errors, or when MCP is not enabled. Other SDK errors propagate
            unchanged.
    """
    import anthropic

    # System prompt as structured blocks:
    # Block 1: static base prompt (cached)
    # Block 2: RAG context (changes per query, not cached)
    system_blocks: list[dict[str, Any]] = [
        {
            "type": "text",
            "text": system_base,
            "cache_control": {"type": "ephemeral"},
        },
    ]
    if rag_context:
        system_blocks.append({"type": "text", "text": rag_context})

    # Copy only role/content from each history entry, then place the cache
    # breakpoint on the last one so the whole conversation prefix is cached
    # across turns.
    messages: list[dict[str, Any]] = [
        {"role": msg["role"], "content": msg["content"]} for msg in history
    ]
    if messages:
        messages[-1] = _mark_cache_breakpoint(messages[-1])

    # The new user message is uncached — it's new each turn.
    messages.append(_build_user_turn(new_message, images))

    request: dict[str, Any] = {
        "model": settings.AI_MODEL_ANTHROPIC,
        "max_tokens": max_tokens,
        "system": system_blocks,
        "messages": messages,
    }

    # MCP server config (optional — controlled by settings). The beta flag is
    # only sent together with the MCP config it enables.
    mcp_options: dict[str, Any] = {}
    if settings.ENABLE_MCP_MICROSOFT_LEARN:
        mcp_options = {
            "mcp_servers": [
                {
                    "type": "url",
                    "url": "https://learn.microsoft.com/api/mcp",
                    "name": "microsoft-learn",
                }
            ],
            "tools": [
                {
                    "type": "mcp_toolset",
                    "mcp_server_name": "microsoft-learn",
                }
            ],
            "betas": ["mcp-client-2025-11-20"],
        }

    # A client is created per call; the context manager closes its HTTP
    # resources when the call is done instead of leaving that to GC.
    async with anthropic.AsyncAnthropic(
        api_key=settings.ANTHROPIC_API_KEY,
        timeout=settings.AI_REQUEST_TIMEOUT_SECONDS,
    ) as client:
        try:
            response = await client.beta.messages.create(**request, **mcp_options)
        except anthropic.BadRequestError as e:
            # MCP server failures (rate limits, connection errors) should not
            # block the assistant entirely — retry without MCP tools.
            if not mcp_options or "MCP server" not in str(e):
                raise
            logger.warning("MCP server error, retrying without MCP: %s", e)
            response = await client.beta.messages.create(**request)

    # Extract text from response — MCP responses can have multiple block
    # types (text, mcp_tool_use, mcp_tool_result). We join all text blocks.
    text_parts: list[str] = []
    mcp_tools_used: list[str] = []
    for block in response.content:
        block_text = getattr(block, "text", None)
        if isinstance(block_text, str):
            text_parts.append(block_text)
        if getattr(block, "type", None) == "mcp_tool_use":
            mcp_tools_used.append(getattr(block, "name", "unknown"))

    text = "\n".join(text_parts)

    usage = response.usage
    # NOTE(review): Anthropic reports cache reads/writes in separate usage
    # fields, so input_tokens presumably excludes cached tokens — confirm
    # callers that meter usage expect that.
    input_tokens = usage.input_tokens
    output_tokens = usage.output_tokens

    # Log MCP tool usage
    if mcp_tools_used:
        logger.info("MCP tools used: %s", ", ".join(mcp_tools_used))

    # Log cache performance
    cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
    cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0
    if cache_read or cache_creation:
        logger.info(
            "Anthropic cache: read=%d creation=%d input=%d output=%d",
            cache_read, cache_creation, input_tokens, output_tokens,
        )

    return text, input_tokens, output_tokens


def _auto_title(message: str) -> str:
    """Generate a short title from the first user message."""
    title = message.strip()[:100]
    if len(message) > 100:
        title = title.rsplit(" ", 1)[0] + "..."
    return title