2026-04-25 06:02:14 +00:00
4 changed files with 35 additions and 6 deletions
--- a/backend/app/core/ai_fix_service.py
+++ b/backend/app/core/ai_fix_service.py
@@ -199,7 +199,10 @@ async def generate_fixes(

        try:
            text, in_tok, out_tok = await provider.generate_json(
-                system_prompt=FIX_SYSTEM_PROMPT,
+                system_prompt=[
+                    {"type": "text", "text": FIX_SYSTEM_PROMPT},
+                    # cacheable: stable constant across all fix attempts
+                ],
                messages=messages,
                max_tokens=2048,
            )
@@ -232,7 +235,11 @@ async def generate_fixes(

            try:
                text2, in_tok2, out_tok2 = await provider.generate_json(
-                    system_prompt=FIX_SYSTEM_PROMPT,
+                    system_prompt=[
+                        {"type": "text", "text": FIX_SYSTEM_PROMPT},
+                        # cacheable: stable constant; the retry can reuse the
+                        # system block if the first attempt above cached it
+                    ],
                    messages=messages,
                    max_tokens=2048,
                )
--- a/backend/app/core/ai_tree_generator_service.py
+++ b/backend/app/core/ai_tree_generator_service.py
@@ -146,7 +146,10 @@ async def scaffold_branches(
        user_message += f"Environment: {', '.join(tags)}\n"

    raw_text, input_tokens, output_tokens = await provider.generate_json(
-        system_prompt=SCAFFOLD_SYSTEM_PROMPT,
+        system_prompt=[
+            {"type": "text", "text": SCAFFOLD_SYSTEM_PROMPT},
+            # cacheable: stable constant across all scaffold calls
+        ],
        messages=[{"role": "user", "content": user_message}],
        max_tokens=2048,
    )
@@ -207,7 +210,13 @@ async def generate_branch_detail(

    for attempt in range(3):
        raw_text, input_tokens, output_tokens = await provider.generate_json(
-            system_prompt=BRANCH_DETAIL_SYSTEM_PROMPT,
+            system_prompt=[
+                {"type": "text", "text": BRANCH_DETAIL_SYSTEM_PROMPT},
+                # cacheable: stable constant. Retries in this loop re-read the
+                # cached system block rather than paying full input cost each
+                # attempt — the ~2.5k-token prompt with few-shot example is
+                # the dominant cost here.
+            ],
            messages=messages,
            max_tokens=8192,
        )
--- a/backend/app/core/kb_conversion_service.py
+++ b/backend/app/core/kb_conversion_service.py
@@ -425,7 +425,12 @@ async def convert_document(

    try:
        raw_text, input_tokens, output_tokens = await provider.generate_json(
-            system_prompt=system_prompt,
+            system_prompt=[
+                {"type": "text", "text": system_prompt},
+                # cacheable: one of two stable constants (TROUBLESHOOTING_SYSTEM_PROMPT
+                # or PROCEDURAL_SYSTEM_PROMPT) selected by target_type. Each
+                # variant caches independently by text content.
+            ],
            messages=[{"role": "user", "content": user_message}],
            max_tokens=16384,
        )
--- a/backend/app/services/script_builder_service.py
+++ b/backend/app/services/script_builder_service.py
@@ -220,7 +220,15 @@ async def send_message(
    model = settings.get_model_for_action("script_build")
    provider = get_ai_provider(model=model)
    ai_text, input_tokens, output_tokens = await provider.generate_text(
-        system_prompt=system_prompt,
+        system_prompt=[
+            {"type": "text", "text": system_prompt},
+            # cacheable: SYSTEM_PROMPT_TEMPLATE with a per-session language
+            # substitution. Two sessions on the same language share a cache
+            # entry; different languages cache independently. Conversation
+            # history (ai_messages) is NOT cached at this layer — if that
+            # becomes a cost driver, route script_builder through the chat
+            # wrapper (0.4) which handles history caching.
+        ],
        messages=ai_messages,
        max_tokens=8192,
    )