fix: KB conversion — increase max_tokens, add JSON repair, improve error handling
- Increase max_tokens from 8192 to 16384 to prevent truncation on long articles.
- Add _try_repair_json(), which fixes trailing commas and attempts to close unclosed brackets/braces in truncated AI responses.
- Log the raw response (first 2000 chars) on parse failure for debugging.
- Set status to 'failed' with a user-friendly error message instead of leaving imports stuck in the 'processing' state.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -33,6 +33,49 @@ def _strip_markdown_fences(text: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def _try_repair_json(text: str) -> dict | None:
|
||||
"""Attempt to repair common JSON issues from AI responses.
|
||||
|
||||
Handles: trailing commas, unclosed brackets/braces, truncated responses.
|
||||
Returns parsed dict on success, None on failure.
|
||||
"""
|
||||
# Strip trailing commas before closing brackets/braces
|
||||
repaired = re.sub(r",\s*([}\]])", r"\1", text)
|
||||
|
||||
# Try parsing after comma cleanup
|
||||
try:
|
||||
return json.loads(repaired)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Try closing unclosed brackets/braces (truncated response)
|
||||
# Count open vs close brackets
|
||||
open_braces = repaired.count("{") - repaired.count("}")
|
||||
open_brackets = repaired.count("[") - repaired.count("]")
|
||||
|
||||
if open_braces > 0 or open_brackets > 0:
|
||||
# Remove any trailing partial key-value pair or string
|
||||
# Find the last complete value (ends with }, ], ", number, true, false, null)
|
||||
truncated = repaired.rstrip()
|
||||
# Strip trailing partial string or key
|
||||
truncated = re.sub(r',\s*"[^"]*$', "", truncated) # trailing "partial_key
|
||||
truncated = re.sub(r',\s*$', "", truncated) # trailing comma
|
||||
|
||||
# Close remaining brackets/braces
|
||||
truncated += "]" * max(0, open_brackets)
|
||||
truncated += "}" * max(0, open_braces)
|
||||
|
||||
# Re-strip trailing commas that may have appeared
|
||||
truncated = re.sub(r",\s*([}\]])", r"\1", truncated)
|
||||
|
||||
try:
|
||||
return json.loads(truncated)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _estimate_cost(input_tokens: int, output_tokens: int) -> float:
    """Return the estimated cost of one AI call from its token counts.

    Each count is multiplied by its module-level per-token rate
    (``COST_PER_INPUT_TOKEN`` / ``COST_PER_OUTPUT_TOKEN``) and the two
    products are summed.
    """
    input_cost = input_tokens * COST_PER_INPUT_TOKEN
    output_cost = output_tokens * COST_PER_OUTPUT_TOKEN
    return input_cost + output_cost
|
||||
|
||||
@@ -377,7 +420,7 @@ async def convert_document(
|
||||
raw_text, input_tokens, output_tokens = await provider.generate_json(
|
||||
system_prompt=system_prompt,
|
||||
messages=[{"role": "user", "content": user_message}],
|
||||
max_tokens=8192,
|
||||
max_tokens=16384,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("AI conversion failed for kb_import=%s: %s", kb_import.id, e)
|
||||
@@ -410,17 +453,29 @@ async def convert_document(
|
||||
try:
|
||||
data = json.loads(raw_text)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(
|
||||
"KB conversion JSON parse failed for kb_import=%s (%d chars): %s",
|
||||
kb_import.id, len(raw_text), raw_text[:500],
|
||||
)
|
||||
kb_import.status = "failed"
|
||||
kb_import.error_message = f"AI returned invalid JSON: {e}"
|
||||
kb_import.processing_time_ms = int((time.monotonic() - start_time) * 1000)
|
||||
kb_import.ai_tokens_input = input_tokens
|
||||
kb_import.ai_tokens_output = output_tokens
|
||||
await db.flush()
|
||||
return []
|
||||
# Attempt JSON repair before giving up
|
||||
data = _try_repair_json(raw_text)
|
||||
if data is None:
|
||||
logger.error(
|
||||
"KB conversion JSON parse failed for kb_import=%s (%d chars). "
|
||||
"Parse error: %s. Raw response (first 2000 chars): %s",
|
||||
kb_import.id, len(raw_text), e, raw_text[:2000],
|
||||
)
|
||||
kb_import.status = "failed"
|
||||
kb_import.error_message = (
|
||||
"AI response could not be parsed as valid JSON. "
|
||||
"This can happen with very long articles — try again or simplify the article."
|
||||
)
|
||||
kb_import.processing_time_ms = int((time.monotonic() - start_time) * 1000)
|
||||
kb_import.ai_tokens_input = input_tokens
|
||||
kb_import.ai_tokens_output = output_tokens
|
||||
await db.flush()
|
||||
return []
|
||||
else:
|
||||
logger.info(
|
||||
"KB conversion JSON repaired for kb_import=%s (%d chars)",
|
||||
kb_import.id, len(raw_text),
|
||||
)
|
||||
|
||||
# Parse into nodes based on target type
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user