feat(ai): robust response extraction + structured-output foundation
Harden the Anthropic provider and lay the groundwork for schema-constrained JSON, optimizing the existing claude-sonnet-4-6 / claude-haiku-4-5 usage (no model changes).

ai_provider.py:
- _extract_text_from_response replaces the fragile response.content[0].text: it skips non-text leading blocks (e.g. thinking), returns the first text block, logs an anthropic.stop_reason warning on max_tokens/refusal (so truncation is now observable), and raises ValueError on a no-text response.
- generate_json gains an optional `schema` param. The Anthropic provider wires it to output_config.format (structured outputs); schema=None preserves the exact prior call for every existing caller. The Gemini provider accepts and ignores it.

kb_conversion_service.py:
- TROUBLESHOOTING_SCHEMA / PROCEDURAL_SCHEMA + _schema_for_target_type(), modelled as a strict superset of every field the prompts emit.
- convert_document passes the schema only when the new AI_KB_CONVERT_STRUCTURED_OUTPUT setting is True (default False). The _try_repair_json fallback stays in place as a belt-and-suspenders safeguard.

Tests: 14 provider + 7 schema, written TDD-style (red-green). A live constrained-decoding smoke test is still required before enabling the flag in production.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -147,6 +147,40 @@ def build_anthropic_chat_messages(
|
||||
return messages
|
||||
|
||||
|
||||
def _extract_text_from_response(response: Any, model: str) -> str:
|
||||
"""Return the first text block's text from an Anthropic message response.
|
||||
|
||||
Robustness over the naive ``response.content[0].text``:
|
||||
- Skips non-text leading blocks (e.g. ``thinking``) and returns the first
|
||||
block whose ``type == "text"``. Indexing ``content[0]`` blindly throws or
|
||||
returns garbage the moment a non-text block leads the response.
|
||||
- Surfaces truncation/refusal: when ``stop_reason`` is ``max_tokens`` or
|
||||
``refusal``, emits a structured warning so silent output corruption
|
||||
(truncated JSON, empty refusals) is observable rather than handed
|
||||
downstream to be guessed at.
|
||||
- Raises ``ValueError`` when no text block is present (e.g. a bare refusal)
|
||||
instead of returning a non-text block's attributes.
|
||||
"""
|
||||
stop_reason = getattr(response, "stop_reason", None)
|
||||
if stop_reason in ("max_tokens", "refusal"):
|
||||
logger.warning(
|
||||
"anthropic.stop_reason",
|
||||
extra={
|
||||
"event": "anthropic.stop_reason",
|
||||
"model": model,
|
||||
"stop_reason": stop_reason,
|
||||
},
|
||||
)
|
||||
|
||||
for block in response.content:
|
||||
if getattr(block, "type", None) == "text":
|
||||
return block.text
|
||||
|
||||
raise ValueError(
|
||||
f"Anthropic response contained no text block (stop_reason={stop_reason!r})"
|
||||
)
|
||||
|
||||
|
||||
def _log_anthropic_cache_usage(usage: Any, model: str) -> None:
|
||||
"""Emit a structured log line capturing cache_read / cache_creation tokens."""
|
||||
cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
|
||||
@@ -176,6 +210,7 @@ class AIProvider(ABC):
|
||||
system_prompt: str | list[SystemBlock],
|
||||
messages: list[dict[str, Any]],
|
||||
max_tokens: int = 4096,
|
||||
schema: dict[str, Any] | None = None,
|
||||
) -> tuple[str, int, int]:
|
||||
"""Generate a JSON response from the AI model.
|
||||
|
||||
@@ -185,6 +220,15 @@ class AIProvider(ABC):
|
||||
Anthropic prompt caching per module-docstring policy.
|
||||
messages: List of message dicts with "role" and "content" keys.
|
||||
max_tokens: Maximum output tokens.
|
||||
schema: Optional JSON Schema constraining the response shape.
|
||||
When provided, the Anthropic backend uses structured outputs
|
||||
(`output_config.format`) to guarantee valid, parseable JSON —
|
||||
no markdown fences, no truncated-brace repair. Must satisfy the
|
||||
structured-output schema limits (every object needs
|
||||
`additionalProperties: false`; no recursion; numeric/string
|
||||
constraints are stripped). `None` preserves the legacy
|
||||
prompt-only behavior. The Gemini backend currently ignores this
|
||||
argument (it already requests `application/json`).
|
||||
|
||||
Returns:
|
||||
Tuple of (response_text, input_tokens, output_tokens).
|
||||
@@ -231,7 +275,11 @@ class GeminiProvider(AIProvider):
|
||||
system_prompt: str | list[SystemBlock],
|
||||
messages: list[dict[str, Any]],
|
||||
max_tokens: int = 4096,
|
||||
schema: dict[str, Any] | None = None,
|
||||
) -> tuple[str, int, int]:
|
||||
# `schema` is accepted for interface parity but ignored: Gemini already
|
||||
# constrains output via response_mime_type="application/json" below.
|
||||
# Mapping JSON Schema -> Gemini response_schema is deferred.
|
||||
from google import genai
|
||||
from google.genai import types as genai_types
|
||||
|
||||
@@ -362,18 +410,28 @@ class AnthropicProvider(AIProvider):
|
||||
system_prompt: str | list[SystemBlock],
|
||||
messages: list[dict[str, Any]],
|
||||
max_tokens: int = 4096,
|
||||
schema: dict[str, Any] | None = None,
|
||||
) -> tuple[str, int, int]:
|
||||
client = _get_anthropic_client(self._api_key, self._timeout)
|
||||
normalized_system = _normalize_system_for_anthropic(system_prompt)
|
||||
|
||||
response = await client.messages.create(
|
||||
model=self._model,
|
||||
max_tokens=max_tokens,
|
||||
system=normalized_system,
|
||||
messages=messages,
|
||||
)
|
||||
create_kwargs: dict[str, Any] = {
|
||||
"model": self._model,
|
||||
"max_tokens": max_tokens,
|
||||
"system": normalized_system,
|
||||
"messages": messages,
|
||||
}
|
||||
if schema is not None:
|
||||
# Structured outputs: constrain the response to valid JSON matching
|
||||
# the schema (Sonnet 4.6 / Haiku 4.5). Removes the need for
|
||||
# markdown-fence stripping and truncated-JSON repair downstream.
|
||||
create_kwargs["output_config"] = {
|
||||
"format": {"type": "json_schema", "schema": schema}
|
||||
}
|
||||
|
||||
text = response.content[0].text
|
||||
response = await client.messages.create(**create_kwargs)
|
||||
|
||||
text = _extract_text_from_response(response, self._model)
|
||||
input_tokens = response.usage.input_tokens
|
||||
output_tokens = response.usage.output_tokens
|
||||
|
||||
|
||||
Reference in New Issue
Block a user