"""Tree chunker — converts tree_structure JSON into embeddable text chunks. Produces three chunk types: - tree_summary: Name + description + tags + type overview - node: Individual node content with breadcrumb path context - solution: Full solution/action text with path context """ import logging from typing import Any logger = logging.getLogger(__name__) def _get_breadcrumb(node: dict, parent_path: str = "") -> str: """Build a breadcrumb path string for a node.""" content = node.get("content", node.get("label", ""))[:80] if parent_path: return f"{parent_path} > {content}" return content def _chunk_node( node: dict, tree_name: str, tree_type: str, tags: list[str], parent_path: str = "", ) -> list[dict[str, Any]]: """Recursively chunk a node and its children.""" chunks = [] node_type = node.get("type", "unknown") node_id = node.get("id", "") content = node.get("content", node.get("label", "")) breadcrumb = _get_breadcrumb(node, parent_path) # Build chunk text based on node type if node_type in ("question", "decision"): options = node.get("children", []) option_labels = [ child.get("label", child.get("content", ""))[:100] for child in options if isinstance(child, dict) ] text_parts = [ f"[{node_type}] {content}", ] if option_labels: text_parts.append(f"Options: {', '.join(option_labels)}") text_parts.append(f"Path: {breadcrumb}") text_parts.append(f"Flow: {tree_name} | Type: {tree_type}") if tags: text_parts.append(f"Tags: {', '.join(tags)}") chunks.append({ "chunk_type": "node", "node_type": node_type, "node_id": node_id, "chunk_text": "\n".join(text_parts), }) elif node_type in ("action", "solution", "info", "warning"): text_parts = [ f"[{node_type}] {content}", f"Path: {breadcrumb}", f"Flow: {tree_name} | Type: {tree_type}", ] if tags: text_parts.append(f"Tags: {', '.join(tags)}") chunk_type = "solution" if node_type == "solution" else "node" chunks.append({ "chunk_type": chunk_type, "node_type": node_type, "node_id": node_id, "chunk_text": "\n".join(text_parts), }) elif node_type in ("step", "section_header"): text_parts = [ f"[{node_type}] {content}", f"Path: {breadcrumb}", f"Flow: {tree_name} | Type: {tree_type}", ] if node.get("description"): text_parts.insert(1, node["description"]) if tags: text_parts.append(f"Tags: {', '.join(tags)}") chunks.append({ "chunk_type": "node", "node_type": node_type, "node_id": node_id, "chunk_text": "\n".join(text_parts), }) # Recurse into children children = node.get("children", []) if isinstance(children, list): for child in children: if isinstance(child, dict): chunks.extend( _chunk_node(child, tree_name, tree_type, tags, breadcrumb) ) # Follow next_node_id linked nodes (action nodes) # These are handled at the tree level, not recursively return chunks def chunk_tree( tree_name: str, tree_type: str, description: str | None, tags: list[str], tree_structure: dict[str, Any], ) -> list[dict[str, Any]]: """Convert a tree into embeddable text chunks. Args: tree_name: Name of the flow. tree_type: troubleshooting | procedural | maintenance. description: Flow description. tags: List of tag names. tree_structure: The tree_structure JSONB content. Returns: List of chunk dicts with keys: chunk_type, node_type, node_id, chunk_text. """ chunks = [] # Tree summary chunk summary_parts = [ f"Flow: {tree_name}", f"Type: {tree_type}", ] if description: summary_parts.append(f"Description: {description}") if tags: summary_parts.append(f"Tags: {', '.join(tags)}") chunks.append({ "chunk_type": "tree_summary", "node_type": None, "node_id": None, "chunk_text": "\n".join(summary_parts), }) # Chunk the tree structure nodes root = tree_structure if isinstance(root, dict): # Handle both flat structure and nested if "children" in root or "type" in root: chunks.extend( _chunk_node(root, tree_name, tree_type, tags) ) # Handle steps array (procedural flows) if "steps" in root and isinstance(root["steps"], list): for step in root["steps"]: if isinstance(step, dict): chunks.extend( _chunk_node(step, tree_name, tree_type, tags) ) return chunks