diff --git a/backend/app/core/ai_fix_service.py b/backend/app/core/ai_fix_service.py index 56325386..53bd3c5d 100644 --- a/backend/app/core/ai_fix_service.py +++ b/backend/app/core/ai_fix_service.py @@ -199,7 +199,10 @@ async def generate_fixes( try: text, in_tok, out_tok = await provider.generate_json( - system_prompt=FIX_SYSTEM_PROMPT, + system_prompt=[ + {"type": "text", "text": FIX_SYSTEM_PROMPT}, + # cacheable: stable constant across all fix attempts + ], messages=messages, max_tokens=2048, ) @@ -232,7 +235,11 @@ async def generate_fixes( try: text2, in_tok2, out_tok2 = await provider.generate_json( - system_prompt=FIX_SYSTEM_PROMPT, + system_prompt=[ + {"type": "text", "text": FIX_SYSTEM_PROMPT}, + # cacheable: stable constant; retry reads the cached + # system block from the first attempt above + ], messages=messages, max_tokens=2048, ) diff --git a/backend/app/core/ai_tree_generator_service.py b/backend/app/core/ai_tree_generator_service.py index 2463068f..7c04c3e6 100644 --- a/backend/app/core/ai_tree_generator_service.py +++ b/backend/app/core/ai_tree_generator_service.py @@ -146,7 +146,10 @@ async def scaffold_branches( user_message += f"Environment: {', '.join(tags)}\n" raw_text, input_tokens, output_tokens = await provider.generate_json( - system_prompt=SCAFFOLD_SYSTEM_PROMPT, + system_prompt=[ + {"type": "text", "text": SCAFFOLD_SYSTEM_PROMPT}, + # cacheable: stable constant across all scaffold calls + ], messages=[{"role": "user", "content": user_message}], max_tokens=2048, ) @@ -207,7 +210,13 @@ async def generate_branch_detail( for attempt in range(3): raw_text, input_tokens, output_tokens = await provider.generate_json( - system_prompt=BRANCH_DETAIL_SYSTEM_PROMPT, + system_prompt=[ + {"type": "text", "text": BRANCH_DETAIL_SYSTEM_PROMPT}, + # cacheable: stable constant. Retries in this loop re-read the + # cached system block rather than paying full input cost each + # attempt — the ~2.5k-token prompt with few-shot example is + # the dominant cost here. + ], messages=messages, max_tokens=8192, ) diff --git a/backend/app/core/kb_conversion_service.py b/backend/app/core/kb_conversion_service.py index 9bb9edf8..da121b3c 100644 --- a/backend/app/core/kb_conversion_service.py +++ b/backend/app/core/kb_conversion_service.py @@ -425,7 +425,12 @@ async def convert_document( try: raw_text, input_tokens, output_tokens = await provider.generate_json( - system_prompt=system_prompt, + system_prompt=[ + {"type": "text", "text": system_prompt}, + # cacheable: one of two stable constants (TROUBLESHOOTING_SYSTEM_PROMPT + # or PROCEDURAL_SYSTEM_PROMPT) selected by target_type. Each + # variant caches independently by text content. + ], messages=[{"role": "user", "content": user_message}], max_tokens=16384, ) diff --git a/backend/app/services/script_builder_service.py b/backend/app/services/script_builder_service.py index 991d9a28..aec7e87a 100644 --- a/backend/app/services/script_builder_service.py +++ b/backend/app/services/script_builder_service.py @@ -220,7 +220,15 @@ async def send_message( model = settings.get_model_for_action("script_build") provider = get_ai_provider(model=model) ai_text, input_tokens, output_tokens = await provider.generate_text( - system_prompt=system_prompt, + system_prompt=[ + {"type": "text", "text": system_prompt}, + # cacheable: SYSTEM_PROMPT_TEMPLATE with a per-session language + # substitution. Two sessions on the same language share a cache + # entry; different languages cache independently. Conversation + # history (ai_messages) is NOT cached at this layer — if that + # becomes a cost driver, route script_builder through the chat + # wrapper (0.4) which handles history caching. + ], messages=ai_messages, max_tokens=8192, )