From da93ae55c367543a95f28b5dbf4d4a4e979f4949 Mon Sep 17 00:00:00 2001
From: Michael Chihlas <michael@resolutionflow.com>
Date: Fri, 17 Apr 2026 16:29:45 +0000
Subject: [PATCH] feat(ai): opt-in structured-system-block caching for one-shot
 generators (Phase 0.3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wraps each static system prompt in a single-block list so Phase 0.1's
AnthropicProvider applies cache_control: ephemeral automatically (policy α,
first block gets marked when no caller-authored cache_control is present).

Call sites:
- ai_tree_generator.scaffold_branches: SCAFFOLD_SYSTEM_PROMPT (~1k tokens)
- ai_tree_generator.generate_branch_detail: BRANCH_DETAIL_SYSTEM_PROMPT
  (~2.5k tokens with few-shot example); retries inside the same function
  re-read the cached block instead of paying full input cost on each attempt
- kb_conversion.convert_document: TROUBLESHOOTING or PROCEDURAL prompt
  (each caches independently by text content)
- ai_fix.generate_fixes: FIX_SYSTEM_PROMPT on first attempt + corrective retry
- script_builder.send_message: SYSTEM_PROMPT_TEMPLATE (per-session language
  substitution — same-language sessions share cache entries)

Each edit includes an inline comment explaining why the block is cacheable
(stable-constant, retry-reuse, per-language variant) so a future dev can
see the intent at the call site, even though the cache_control marker
itself is applied inside AnthropicProvider rather than by the caller.

script_builder history caching is deliberately deferred: per the Phase
0.1 decision (option i), AnthropicProvider does not automatically cache
the message list. If script_builder's growing 20-message history turns
out to be a visible cost driver in the anthropic.cache telemetry, route
that caller through the 0.4 chat wrapper, which handles history caching.

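No runtime verification from code-server; cache-hit behavior will be
confirmed against the new dev environment when it's up, per the inline
TODO(phase0-verify) in ai_provider.py. That check should also confirm
that the ~1k-token SCAFFOLD_SYSTEM_PROMPT clears the model's minimum
cacheable prompt length (1024 tokens or more, depending on the model);
a shorter prefix is processed without caching even when it is marked.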

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 backend/app/core/ai_fix_service.py             | 11 +++++++++--
 backend/app/core/ai_tree_generator_service.py  | 13 +++++++++++--
 backend/app/core/kb_conversion_service.py      |  7 ++++++-
 backend/app/services/script_builder_service.py | 10 +++++++++-
 4 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/backend/app/core/ai_fix_service.py b/backend/app/core/ai_fix_service.py
index 56325386..53bd3c5d 100644
--- a/backend/app/core/ai_fix_service.py
+++ b/backend/app/core/ai_fix_service.py
@@ -199,7 +199,10 @@ async def generate_fixes(
 
         try:
             text, in_tok, out_tok = await provider.generate_json(
-                system_prompt=FIX_SYSTEM_PROMPT,
+                system_prompt=[
+                    {"type": "text", "text": FIX_SYSTEM_PROMPT},
+                    # cacheable: stable constant across all fix attempts
+                ],
                 messages=messages,
                 max_tokens=2048,
             )
@@ -232,7 +235,11 @@ async def generate_fixes(
 
             try:
                 text2, in_tok2, out_tok2 = await provider.generate_json(
-                    system_prompt=FIX_SYSTEM_PROMPT,
+                    system_prompt=[
+                        {"type": "text", "text": FIX_SYSTEM_PROMPT},
+                        # cacheable: stable constant; retry reads the cached
+                        # system block from the first attempt above
+                    ],
                     messages=messages,
                     max_tokens=2048,
                 )
diff --git a/backend/app/core/ai_tree_generator_service.py b/backend/app/core/ai_tree_generator_service.py
index 2463068f..7c04c3e6 100644
--- a/backend/app/core/ai_tree_generator_service.py
+++ b/backend/app/core/ai_tree_generator_service.py
@@ -146,7 +146,10 @@ async def scaffold_branches(
         user_message += f"Environment: {', '.join(tags)}\n"
 
     raw_text, input_tokens, output_tokens = await provider.generate_json(
-        system_prompt=SCAFFOLD_SYSTEM_PROMPT,
+        system_prompt=[
+            {"type": "text", "text": SCAFFOLD_SYSTEM_PROMPT},
+            # cacheable: stable constant across all scaffold calls
+        ],
         messages=[{"role": "user", "content": user_message}],
         max_tokens=2048,
     )
@@ -207,7 +210,13 @@ async def generate_branch_detail(
 
     for attempt in range(3):
         raw_text, input_tokens, output_tokens = await provider.generate_json(
-            system_prompt=BRANCH_DETAIL_SYSTEM_PROMPT,
+            system_prompt=[
+                {"type": "text", "text": BRANCH_DETAIL_SYSTEM_PROMPT},
+                # cacheable: stable constant. Retries in this loop re-read the
+                # cached system block rather than paying full input cost each
+                # attempt — the ~2.5k-token prompt with few-shot example is
+                # the dominant cost here.
+            ],
             messages=messages,
             max_tokens=8192,
         )
diff --git a/backend/app/core/kb_conversion_service.py b/backend/app/core/kb_conversion_service.py
index 9bb9edf8..da121b3c 100644
--- a/backend/app/core/kb_conversion_service.py
+++ b/backend/app/core/kb_conversion_service.py
@@ -425,7 +425,12 @@ async def convert_document(
 
     try:
         raw_text, input_tokens, output_tokens = await provider.generate_json(
-            system_prompt=system_prompt,
+            system_prompt=[
+                {"type": "text", "text": system_prompt},
+                # cacheable: one of two stable constants (TROUBLESHOOTING_SYSTEM_PROMPT
+                # or PROCEDURAL_SYSTEM_PROMPT) selected by target_type. Each
+                # variant caches independently by text content.
+            ],
             messages=[{"role": "user", "content": user_message}],
             max_tokens=16384,
         )
diff --git a/backend/app/services/script_builder_service.py b/backend/app/services/script_builder_service.py
index 991d9a28..aec7e87a 100644
--- a/backend/app/services/script_builder_service.py
+++ b/backend/app/services/script_builder_service.py
@@ -220,7 +220,15 @@ async def send_message(
     model = settings.get_model_for_action("script_build")
     provider = get_ai_provider(model=model)
     ai_text, input_tokens, output_tokens = await provider.generate_text(
-        system_prompt=system_prompt,
+        system_prompt=[
+            {"type": "text", "text": system_prompt},
+            # cacheable: SYSTEM_PROMPT_TEMPLATE with a per-session language
+            # substitution. Two sessions on the same language share a cache
+            # entry; different languages cache independently. Conversation
+            # history (ai_messages) is NOT cached at this layer — if that
+            # becomes a cost driver, route script_builder through the chat
+            # wrapper (0.4) which handles history caching.
+        ],
         messages=ai_messages,
         max_tokens=8192,
     )