From f0870cf3201c15377222819b2f9918a5a766d445 Mon Sep 17 00:00:00 2001
From: Andy Charlwood <andrew.charlwood@gmail.com>
Date: Mon, 16 Feb 2026 00:42:58 +0000
Subject: [PATCH] feat: US-017 - Improve system prompt instructions and LLM
 parameters

---
 Ralph/prd.json       |  2 +-
 Ralph/progress.txt   | 20 ++++++++++++++++++++
 scripts/benchmark.ts | 16 +++++++---------
 src/lib/llm.ts       | 16 +++++++---------
 4 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/Ralph/prd.json b/Ralph/prd.json
index c5d9ef9..9c6df93 100644
--- a/Ralph/prd.json
+++ b/Ralph/prd.json
@@ -333,7 +333,7 @@
         "Typecheck passes"
       ],
       "priority": 17,
-      "passes": false,
+      "passes": true,
       "notes": "These are behavioral instructions that go in the Rules section of the system prompt. Keep them concise — LLMs follow shorter, clearer rules better than long paragraphs. Consider: '1. Distinguish NHS employment (May 2022–present, ICB) from private sector (Tesco PLC). 2. When asked about tools/skills across career, aggregate from ALL roles. 3. Cite specific numbers, dates, and outcomes — never say approximate when exact figures are available. 4. If the answer isn't in the context, say so clearly.' Temperature and maxTokens are set in the API request config, not the prompt."
     },
     {
diff --git a/Ralph/progress.txt b/Ralph/progress.txt
index bc8a749..d1d5e93 100644
--- a/Ralph/progress.txt
+++ b/Ralph/progress.txt
@@ -396,3 +396,23 @@
   - System prompt no longer depends on `buildEmbeddingTexts()` — the CV context is hardcoded. This means prompt content and embedding texts can diverge (prompt is optimised for Q&A, embeddings for semantic search)
   - When the prompt is close to the 8KB limit, trim verbose connecting phrases and redundant qualifiers first — the specific facts and numbers are what matter for accuracy
 ---
+
+## 2026-02-16 - US-017
+- Improved Response Rules in system prompt (`src/lib/llm.ts`) with numbered, clearer behavioral instructions:
+  1. Explicit "I don't have that information" phrasing for missing data
+  2. Stronger employer distinction instruction with "Never conflate the two"
+  3. Aggregation instruction broadened to include "projects" alongside tools/skills/achievements
+  4. Explicit prohibition on "approximately" and "around" when exact figures exist
+  5. Adaptive length instruction: thorough for list/detail questions, concise for simple ones
+- Lowered temperature from 0.7 to 0.4 for more consistent factual responses
+- Increased max_tokens from 512 to 800 to avoid truncating detailed answers
+- Left the [ITEMS: ...] suffix instruction unchanged
+- Mirrored identical changes in `scripts/benchmark.ts` (prompt, temperature defaults, max_tokens defaults)
+- Typecheck (0 errors), lint (0 errors), and production build all pass
+- Files changed: `src/lib/llm.ts`, `scripts/benchmark.ts`
+- **Learnings for future iterations:**
+  - Numbered rules in system prompts tend to be followed more reliably by LLMs than bullet points
+  - Temperature 0.4 is a good balance for factual Q&A — low enough for consistency, high enough to avoid repetitive phrasing
+  - The benchmark script's `callLLM()` uses default params `temperature = 0.4, maxTokens = 800` — these match production. The scoring call overrides temperature to 0 for deterministic scoring
+  - The adaptive length rule ("thorough for detailed questions, concise for simple ones") replaces the fixed "2-4 sentences" rule — this should improve scores on questions requiring enumeration
+---
diff --git a/scripts/benchmark.ts b/scripts/benchmark.ts
index cc61fda..f289856 100644
--- a/scripts/benchmark.ts
+++ b/scripts/benchmark.ts
@@ -155,13 +155,11 @@ Domain: [skill-medicines-optimisation] Medicines Optimisation (9yr, 95%), [skill
 Leadership: [skill-budget-management] Budget Management (1yr, 90%), [skill-stakeholder-engagement] Stakeholder Engagement (3yr, 88%), [skill-pharma-negotiation] Pharmaceutical Negotiation (1yr, 82%), [skill-team-development] Team Development (8yr, 85%), [skill-change-management] Change Management (7yr, 80%), [skill-financial-modelling] Financial Modelling (1yr, 78%), [skill-executive-comms] Executive Communication (1yr, 85%)
 
 ## Response Rules
-- Answer ONLY from the data above. Never invent facts, roles, dates, or achievements.
-- Distinguish NHS employment (May 2022 onwards, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy).
-- When asked about tools, skills, or achievements across Andy's career, aggregate from ALL roles — do not limit to one role.
-- Cite specific numbers, dates, and outcomes when available. Never say "approximately" when exact figures exist in the data.
-- If the answer is not in the data, say so honestly.
-- Do not fabricate URLs, email addresses, or contact details.
-- Be concise: 2-4 sentences unless the question requires a longer answer.
+1. Answer ONLY from the data above. If the answer is not in the data, say "I don't have that information" — never invent facts, roles, dates, achievements, URLs, or contact details.
+2. Distinguish NHS employment (May 2022–present, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy). Never conflate the two.
+3. When asked broad questions about tools, skills, projects, or achievements across Andy's career, aggregate from ALL roles — do not limit your answer to one position.
+4. Cite exact numbers, dates, percentages, and outcomes. Never say "approximately" or "around" when exact figures exist in the data.
+5. For detailed or list-based questions, give a thorough answer covering all relevant items. For simple questions, be concise (2-4 sentences).
 
 ## Item References
 End your response with a single line listing relevant item IDs from the square-bracketed IDs above:
@@ -176,8 +174,8 @@ function sleep(ms: number): Promise<void> {
 async function callLLM(
   systemPrompt: string,
   userMessage: string,
-  temperature = 0.7,
-  maxTokens = 512,
+  temperature = 0.4,
+  maxTokens = 800,
 ): Promise<string> {
   const apiKey = getApiKey()
   const maxRetries = 5
diff --git a/src/lib/llm.ts b/src/lib/llm.ts
index d4f4391..861b7c3 100644
--- a/src/lib/llm.ts
+++ b/src/lib/llm.ts
@@ -96,13 +96,11 @@ Domain: [skill-medicines-optimisation] Medicines Optimisation (9yr, 95%), [skill
 Leadership: [skill-budget-management] Budget Management (1yr, 90%), [skill-stakeholder-engagement] Stakeholder Engagement (3yr, 88%), [skill-pharma-negotiation] Pharmaceutical Negotiation (1yr, 82%), [skill-team-development] Team Development (8yr, 85%), [skill-change-management] Change Management (7yr, 80%), [skill-financial-modelling] Financial Modelling (1yr, 78%), [skill-executive-comms] Executive Communication (1yr, 85%)
 
 ## Response Rules
-- Answer ONLY from the data above. Never invent facts, roles, dates, or achievements.
-- Distinguish NHS employment (May 2022 onwards, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy).
-- When asked about tools, skills, or achievements across Andy's career, aggregate from ALL roles — do not limit to one role.
-- Cite specific numbers, dates, and outcomes when available. Never say "approximately" when exact figures exist in the data.
-- If the answer is not in the data, say so honestly.
-- Do not fabricate URLs, email addresses, or contact details.
-- Be concise: 2-4 sentences unless the question requires a longer answer.
+1. Answer ONLY from the data above. If the answer is not in the data, say "I don't have that information" — never invent facts, roles, dates, achievements, URLs, or contact details.
+2. Distinguish NHS employment (May 2022–present, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy). Never conflate the two.
+3. When asked broad questions about tools, skills, projects, or achievements across Andy's career, aggregate from ALL roles — do not limit your answer to one position.
+4. Cite exact numbers, dates, percentages, and outcomes. Never say "approximately" or "around" when exact figures exist in the data.
+5. For detailed or list-based questions, give a thorough answer covering all relevant items. For simple questions, be concise (2-4 sentences).
 
 ## Item References
 End your response with a single line listing relevant item IDs from the square-bracketed IDs above:
@@ -117,8 +115,8 @@ function buildRequestBody(
   return {
     model: LLM_MODEL,
     stream: true,
-    temperature: 0.7,
-    max_tokens: 512,
+    temperature: 0.4,
+    max_tokens: 800,
     messages: [
       { role: 'system', content: systemPrompt },
       ...messages.map((msg) => ({