fix: KB conversion — increase max_tokens, add JSON repair, improve error handling

- Increase max_tokens from 8192 to 16384 to prevent truncation on long articles
- Add _try_repair_json() that fixes trailing commas and attempts to close unclosed brackets/braces from truncated AI responses
- Log full raw response (first 2000 chars) on parse failure for debugging
- Set status to 'failed' with user-friendly error message instead of leaving imports stuck in 'processing' state

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 02:57:27 -04:00
parent 8b3033ca9d
commit 8c73233dd0
1 changed files with 67 additions and 12 deletions
--- a/backend/app/core/kb_conversion_service.py
+++ b/backend/app/core/kb_conversion_service.py
@@ -33,6 +33,49 @@ def _strip_markdown_fences(text: str) -> str:
    return text


+def _try_repair_json(text: str) -> dict | None:
+    """Attempt to repair common JSON issues from AI responses.
+
+    Handles trailing commas and truncated output (unterminated string, unclosed
+    brackets/braces). String contents are never rewritten and containers are
+    closed innermost-first. Returns the parsed value, or None on failure.
+    """
+    out: list[str] = []  # repaired characters
+    closers: list[str] = []  # closers still owed, innermost last
+    cut = None  # (len(out), closers snapshot) at the last structural comma
+    dangling = None  # index in out of a comma followed only by whitespace
+    in_string = escaped = False
+    for ch in text:
+        if in_string:  # brackets/commas in a string are data; only track its end
+            in_string = escaped or ch != '"'
+            escaped = ch == "\\" and not escaped
+        elif ch == ",":
+            dangling = len(out)
+            cut = (dangling, closers[:])
+        elif not ch.isspace():
+            if ch in "}]":
+                if dangling is not None:
+                    del out[dangling:]  # trailing comma before a closer
+                if closers and closers[-1] == ch:
+                    closers.pop()
+            elif ch in "{[":
+                closers.append("}" if ch == "{" else "]")
+            in_string, dangling = ch == '"', None
+        out.append(ch)
+    if escaped:
+        out.pop()  # a lone backslash would escape the quote added below
+    quote = '"' if in_string else ""
+    candidates = ["".join(out) + quote + "".join(reversed(closers))]
+    if cut is not None and closers:  # truncated: also try dropping the partial tail
+        candidates.append("".join(out[: cut[0]]) + "".join(reversed(cut[1])))
+    for candidate in candidates:
+        try:
+            return json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+    return None
+
+
 def _estimate_cost(input_tokens: int, output_tokens: int) -> float:
    return (input_tokens * COST_PER_INPUT_TOKEN) + (output_tokens * COST_PER_OUTPUT_TOKEN)

@@ -377,7 +420,7 @@ async def convert_document(
        raw_text, input_tokens, output_tokens = await provider.generate_json(
            system_prompt=system_prompt,
            messages=[{"role": "user", "content": user_message}],
-            max_tokens=8192,
+            max_tokens=16384,
        )
    except Exception as e:
        logger.error("AI conversion failed for kb_import=%s: %s", kb_import.id, e)
@@ -410,17 +453,29 @@ async def convert_document(
    try:
        data = json.loads(raw_text)
    except json.JSONDecodeError as e:
-        logger.error(
-            "KB conversion JSON parse failed for kb_import=%s (%d chars): %s",
-            kb_import.id, len(raw_text), raw_text[:500],
-        )
-        kb_import.status = "failed"
-        kb_import.error_message = f"AI returned invalid JSON: {e}"
-        kb_import.processing_time_ms = int((time.monotonic() - start_time) * 1000)
-        kb_import.ai_tokens_input = input_tokens
-        kb_import.ai_tokens_output = output_tokens
-        await db.flush()
-        return []
+        # Attempt JSON repair before giving up
+        data = _try_repair_json(raw_text)
+        if data is None:
+            logger.error(
+                "KB conversion JSON parse failed for kb_import=%s (%d chars). "
+                "Parse error: %s. Raw response (first 2000 chars): %s",
+                kb_import.id, len(raw_text), e, raw_text[:2000],
+            )
+            kb_import.status = "failed"
+            kb_import.error_message = (
+                "AI response could not be parsed as valid JSON. "
+                "This can happen with very long articles — try again or simplify the article."
+            )
+            kb_import.processing_time_ms = int((time.monotonic() - start_time) * 1000)
+            kb_import.ai_tokens_input = input_tokens
+            kb_import.ai_tokens_output = output_tokens
+            await db.flush()
+            return []
+        else:
+            logger.info(
+                "KB conversion JSON repaired for kb_import=%s (%d chars)",
+                kb_import.id, len(raw_text),
+            )

    # Parse into nodes based on target type
    try: