feat: US-015 - Migrate benchmark script to OpenRouter

2026-02-16 00:31:16 +00:00
parent 4bab9b369c
commit 8cc7038942
4 changed files with 163 additions and 46 deletions
@@ -294,7 +294,7 @@
        "Typecheck passes"
      ],
      "priority": 15,
-      "passes": false,
+      "passes": true,
      "notes": "The benchmark uses the non-streaming endpoint (no stream:true needed). OpenRouter non-streaming response format: { choices: [{ message: { content: '...' } }] }. The buildSystemPrompt() function should be imported from the renamed llm.ts (or duplicated if the import path alias doesn't work in tsx scripts — check if @/ alias resolves). Keep the same retry logic structure but update status code handling for OpenRouter. The scoring prompt and question flow are unchanged — only the API transport layer changes."
    },
    {
@@ -37,6 +37,8 @@
 - `handleSubmit(overrideText?)` accepts optional text param — use this when programmatically sending messages (e.g., suggested question chips) to avoid stale `inputValue` state
 - `SUGGESTED_QUESTIONS` const array at top of ChatWidget — edit here to change welcome screen chip text
 - System prompt prefixes each CV entry with `[item-id]` so the model can directly reference IDs in its `[ITEMS: ...]` suffix — more reliable than expecting pattern inference
+- Benchmark script (`scripts/benchmark.ts`) uses OpenRouter non-streaming endpoint — response format: `choices[0].message.content` (not `.delta.content` like streaming). Auth via `Authorization: Bearer` header, API key from `process.env.VITE_OPEN_ROUTER_API_KEY`
+- Cannot import `buildSystemPrompt` from `src/lib/llm.ts` into Node scripts — `llm.ts` uses `import.meta.env` (Vite) and `window.location` (browser). Benchmark keeps its own copy of `buildSystemPrompt` that mirrors production

 ---

@@ -344,3 +346,27 @@
  - `buildSystemPrompt()` is now exported from `llm.ts` — benchmark script (US-015) can import it directly instead of duplicating the logic
  - The benchmark script (`scripts/benchmark.ts`) still uses the old Gemini API — needs separate migration in US-015
 ---
+
+## 2026-02-16 - US-015
+- Migrated `scripts/benchmark.ts` from Gemini API to OpenRouter API
+- Replaced `GEMINI_MODEL` / `GEMINI_API_BASE` with `LLM_MODEL = 'z-ai/glm-5'` and `OPENROUTER_API_URL`
+- Updated `getApiKey()` to read `VITE_OPEN_ROUTER_API_KEY` from `.env`
+- Renamed `callGemini()` → `callLLM()` with OpenRouter request format:
+  - OpenAI-compatible messages array with `role: 'system'` for system prompt
+  - Auth via `Authorization: Bearer` header (not URL param)
+  - Added `HTTP-Referer` and `X-Title` headers per OpenRouter docs
+  - Response parsing: `choices[0].message.content` (non-streaming format)
+  - `max_tokens` (OpenAI format) instead of `maxOutputTokens` (Gemini format)
+- Updated `buildSystemPrompt()` to match production `llm.ts` format: item ID prefixes (`[item-id]`), same rules and instructions
+- Scoring calls also use OpenRouter via `callLLM()` (same model)
+- Rate-limit retry logic kept the same structure; only the error message text was updated for OpenRouter
+- Model name in results output updated to `z-ai/glm-5`
+- Verified end-to-end: `npm run benchmark` runs all 10 questions, scores them, saves results to `scripts/benchmark-results/iteration-0.json`
+- Typecheck passes (0 errors), lint passes (0 new errors/warnings)
+- Files changed: `scripts/benchmark.ts`
+- **Learnings for future iterations:**
+  - Cannot import `buildSystemPrompt` from `src/lib/llm.ts` into Node scripts — `llm.ts` uses `import.meta.env` (Vite-only) and `window.location` (browser-only). Keep a mirrored copy in the benchmark script
+  - OpenRouter non-streaming response format: `{ choices: [{ message: { content: '...' } }] }` — different from streaming which uses `delta.content`
+  - For Node.js scripts, use a static URL for the `HTTP-Referer` header (e.g., `'https://andycharlwood.co.uk'`) since `window.location` isn't available
+  - The benchmark script's `buildSystemPrompt()` should be kept in sync with `llm.ts` manually — if one changes, update the other (US-016/US-017 will modify the production prompt)
+---
@@ -0,0 +1,92 @@
+{
+  "iteration": 0,
+  "timestamp": "2026-02-16T00:30:12.917Z",
+  "model": "z-ai/glm-5",
+  "totalScore": 15,
+  "maxPossibleScore": 20,
+  "passThreshold": 18,
+  "passed": false,
+  "hasZeros": true,
+  "results": [
+    {
+      "id": "Q01",
+      "question": "How many years has Andy been employed by the NHS?",
+      "expectedAnswer": "Approximately 3-4 years. Andy's NHS employment started in May 2022 when he joined NHS Norfolk and Waveney ICB. His previous role at Tesco PLC was in the private sector, not the NHS.",
+      "actualAnswer": "Andy has had continuous NHS service since August 2016, which is approximately 9 years. This spans community pharmacy (2016-2022), high-cost drugs and interface work (2022-2024), and his current population health leadership role at NHS Norfolk & Waveney ICB.\n\n[ITEMS: ach-2]",
+      "score": 0,
+      "justification": "Contains major factual error - claims 9 years NHS service since 2016, but expected answer states NHS employment started May 2022."
+    },
+    {
+      "id": "Q02",
+      "question": "What was Andy's involvement with tirzepatide?",
+      "expectedAnswer": "Andy supported commissioning of NICE TA1026 (tirzepatide). He authored the initial executive paper advocating a primary care delivery model over specialist provider, which drove a system shift to GP-led model.",
+      "actualAnswer": "Andy supported the commissioning of tirzepatide (NICE TA1026) with financial projections during his role as Deputy Head, Population Health & Data Analysis at NHS Norfolk & Waveney ICB. This involved developing financial projections to support the implementation of this NICE Technology Appraisal across the integrated care system.\n\n[ITEMS: exp-deputy-head-2024]",
+      "score": 1,
+      "justification": "Mentions NICE TA1026 correctly but omits key facts: authoring executive paper, primary care model, GP-led delivery."
+    },
+    {
+      "id": "Q03",
+      "question": "What specific tools and software has Andy built?",
+      "expectedAnswer": "Andy has built 5 notable projects: a patient switching algorithm (Python, 14000 patients, £2.6M savings), a Blueteq generator for high-cost drug forms, a controlled drugs monitoring system, a Sankey chart tool for visualising patient flows, and PharMetrics — a Power BI analytics dashboard.",
+      "actualAnswer": "Andy has built five main tools:\n\n1. **Patient Switching Algorithm** (Python/Pandas/SQL) – identifies patients suitable for cost-effective alternatives, finding 14,000 patients and £2.6M savings potential\n2. **Blueteq Generator** (Python/SQL) – automates prior approval form creation, achieving 70% reduction and 200 hours immediate savings\n3. **CD Monitoring System** (Python/SQL) – tracks opioid prescriptions via oral morphine equivalents at population scale\n4. **Sankey Chart Analysis Tool** (Python/Matplotlib/SQL) – visualises patient pathways for compliance auditing\n5. **PharMetrics Interactive Platform** (Power BI/SQL/DAX) – real-time dashboard tracking the £220M prescribing budget for 200+ users\n\n[ITEMS: proj-inv-switching-algorithm, proj-inv-blueteq-gen, proj-inv-cd-monitoring, proj-inv-sankey-tool, proj-inv-pharmetrics]",
+      "score": 2,
+      "justification": "Covers all 5 key projects accurately with correct details and additional useful information."
+    },
+    {
+      "id": "Q04",
+      "question": "What were Andy's A-level subjects and grades?",
+      "expectedAnswer": "Andy achieved Mathematics A*, Chemistry B, and Politics C at Highworth Grammar School between 2009-2011.",
+      "actualAnswer": "Andy achieved A* in Mathematics, B in Chemistry, and C in Politics at A-Level, studied at Highworth Grammar School in Ashford from 2009 to 2011.\n\n[ITEMS: edu-2]",
+      "score": 2,
+      "justification": "Covers all key facts: Mathematics A*, Chemistry B, Politics C, and Highworth Grammar School correctly."
+    },
+    {
+      "id": "Q05",
+      "question": "Was Andy's Tesco role part of the NHS?",
+      "expectedAnswer": "No. Andy's role at Tesco PLC was in the private sector as a community pharmacist. Tesco PLC is a private employer. He was an LPC representative during this time.",
+      "actualAnswer": "No, Andy's Pharmacy Manager role was at Tesco PLC (Nov 2017 — May 2022), which is a private retail company, not the NHS. This was a community pharmacy position where he managed pharmacy operations with full autonomy across a 100-hour contract, leading regional KPI delivery initiatives and serving as a Local Pharmaceutical Committee representative for Norfolk.\n\n[ITEMS: exp-pharmacy-manager-2017]",
+      "score": 2,
+      "justification": "Covers all key facts: Tesco is private/not NHS, community pharmacy, and LPC representative role."
+    },
+    {
+      "id": "Q06",
+      "question": "How did the patient switching algorithm work?",
+      "expectedAnswer": "It was Python-based and used real-world GP prescribing data to auto-identify patients eligible for cost-effective medication alternatives. It compressed months of manual work into 3 days, covered 14,000 patients, and identified £2.6M in savings.",
+      "actualAnswer": "The patient switching algorithm is a Python-based tool that analyses real-world GP prescribing data to automatically identify patients prescribed expensive medicines who could be switched to cost-effective alternatives. It uses Python, Pandas, and SQL to process the data, compressing months of manual analysis into just 3 days. The algorithm identified 14,000 patients suitable for switching, representing £2.6M in annual savings potential, with £2M on target for delivery this financial year.\n\n[ITEMS: proj-inv-switching-algorithm, exp-interim-head-2025]",
+      "score": 2,
+      "justification": "Covers all key facts: Python, GP prescribing data, 14,000 patients, £2.6M savings, and 3 days compression."
+    },
+    {
+      "id": "Q07",
+      "question": "What clinical specialties has Andy worked across?",
+      "expectedAnswer": "Andy has worked across rheumatology, ophthalmology (wet AMD, DMO, RVO), dermatology, gastroenterology, neurology, and migraine through his high-cost drugs role.",
+      "actualAnswer": "Andy has worked across multiple clinical specialties through his high-cost drug pathway development work, including rheumatology, ophthalmology (wet AMD, DMO, RVO), dermatology, gastroenterology, neurology, and migraine. He wrote most of the system's high-cost drug pathways across these specialties while balancing legal requirements, financial costs, and local clinical preferences.\n\n[ITEMS: exp-high-cost-drugs-2022]",
+      "score": 2,
+      "justification": "Covers all six clinical specialties correctly with accurate details about ophthalmology sub-specialties."
+    },
+    {
+      "id": "Q08",
+      "question": "What is Andy's experience with the dm+d?",
+      "expectedAnswer": "Andy created a comprehensive medicines data table integrating all dm+d products with standardised strengths, morphine equivalents, and Anticholinergic Burden scoring, serving as a single source of truth.",
+      "actualAnswer": "Andy created a comprehensive medicines data table integrating all dm+d products as part of his population health analytics work at NHS Norfolk & Waveney ICB. This integration enabled sophisticated prescribing analysis including morphine equivalent calculations and Anticholinergic Burden scoring across the system.\n\n[ITEMS: exp-deputy-head-2024, ach-3]",
+      "score": 1,
+      "justification": "Missing standardised strengths and single source of truth; covers dm+d, morphine equivalents, and Anticholinergic Burden."
+    },
+    {
+      "id": "Q09",
+      "question": "What budget does Andy manage and how?",
+      "expectedAnswer": "Andy manages a £220M prescribing budget using forecasting models, variance analysis, and financial reporting to the executive team, enabling proactive financial planning.",
+      "actualAnswer": "Andy manages the £220M prescribing budget for NHS Norfolk & Waveney ICB, with full analytical accountability to the ICB board. He uses sophisticated forecasting models to identify cost pressures ahead of time, an interactive Power BI dashboard for real-time expenditure tracking, and delivers monthly variance analysis and financial reporting to the executive team.\n\n[ITEMS: exp-deputy-head-2024, ach-1, proj-inv-pharmetrics, skill-budget-management]",
+      "score": 2,
+      "justification": "Covers all key facts: £220M budget, forecasting models, variance analysis, and proactive planning approach."
+    },
+    {
+      "id": "Q10",
+      "question": "What leadership training does Andy have?",
+      "expectedAnswer": "Andy completed the NHS Mary Seacole Programme in 2018 (scoring 78%), plus a national induction programme at Tesco and NVQ3 supervision qualification.",
+      "actualAnswer": "Andy completed the Mary Seacole Programme from the NHS Leadership Academy in 2018, achieving a 78% classification. This NHS leadership qualification covers change management, healthcare leadership, and system-level thinking.\n\n[ITEMS: edu-0]",
+      "score": 1,
+      "justification": "Covers Mary Seacole, 2018, and 78% correctly but omits Tesco induction and NVQ3 supervision."
+    }
+  ]
+}
@@ -62,75 +62,74 @@ interface BenchmarkResults {
  results: QuestionResult[]
 }

-// --- Gemini API ---
+// --- OpenRouter API ---

-const GEMINI_MODEL = 'gemini-3-flash-preview'
-const GEMINI_API_BASE = `https://generativelanguage.googleapis.com/v1beta/models/${GEMINI_MODEL}`
+const LLM_MODEL = 'z-ai/glm-5'
+const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1/chat/completions'

 function getApiKey(): string {
-  const key = process.env.VITE_GEMINI_API_KEY
+  const key = process.env.VITE_OPEN_ROUTER_API_KEY
  if (!key) {
-    throw new Error('VITE_GEMINI_API_KEY not set. Ensure .env file exists with this key.')
+    throw new Error('VITE_OPEN_ROUTER_API_KEY not set. Ensure .env file exists with this key.')
  }
  return key
 }

 function buildSystemPrompt(): string {
  const texts = buildEmbeddingTexts()
-  const cvContent = texts.map((t) => `- ${t.text}`).join('\n')
+  const cvContent = texts.map((t) => `[${t.id}] ${t.text}`).join('\n')

-  return `You are an AI assistant on Andy Charlwood's portfolio website. Answer questions about his experience, skills, projects, and qualifications.
+  return `You are a helpful assistant on Andy Charlwood's portfolio website.

-## Andy's Professional Profile
+## Profile Data
+Each entry is prefixed with its ID in square brackets.

 ${cvContent}

-## Rules
-1. Use ONLY the profile above. Never invent roles, dates, or achievements.
-2. Be concise (2-4 sentences). Be professional but friendly.
-3. If the information isn't in the profile, say so.
+## Response Rules
+- Answer ONLY from the profile data above. Never invent facts, roles, dates, or achievements.
+- Be concise: 2-4 sentences. Professional and friendly tone.
+- If the answer isn't in the profile, say so honestly.
+- Do not fabricate URLs, email addresses, or contact details.

 ## Item References
-After your answer, on a NEW line, list relevant portfolio item IDs:
-[ITEMS: id1, id2, id3]
- IDs match the profile entries above (exp-*, skill-*, proj-*, ach-*, edu-*, action-*).
- Only include IDs directly relevant to your answer.
- If no items are relevant, omit the [ITEMS: ...] line entirely.`
+End your response with a single line listing relevant item IDs:
+[ITEMS: exp-nhs-nwicb, skill-python]
+Only include IDs that directly support your answer. Omit the line if none are relevant.`
 }

 function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms))
 }

-async function callGemini(
+async function callLLM(
  systemPrompt: string,
  userMessage: string,
  temperature = 0.7,
-  maxOutputTokens = 512,
+  maxTokens = 512,
 ): Promise<string> {
  const apiKey = getApiKey()
  const maxRetries = 5

  for (let attempt = 0; attempt < maxRetries; attempt++) {
-    const response = await fetch(
-      `${GEMINI_API_BASE}:generateContent?key=${apiKey}`,
-      {
+    const response = await fetch(OPENROUTER_API_URL, {
      method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
+      headers: {
+        'Content-Type': 'application/json',
+        'Authorization': `Bearer ${apiKey}`,
+        'HTTP-Referer': 'https://andycharlwood.co.uk',
+        'X-Title': 'Andy Charlwood Portfolio',
+      },
      body: JSON.stringify({
-          system_instruction: {
-            parts: [{ text: systemPrompt }],
-          },
-          contents: [
-            { role: 'user', parts: [{ text: userMessage }] },
-          ],
-          generationConfig: {
+        model: LLM_MODEL,
        temperature,
-            maxOutputTokens,
-          },
+        max_tokens: maxTokens,
+        messages: [
+          { role: 'system', content: systemPrompt },
+          { role: 'user', content: userMessage },
+        ],
      }),
-      },
-    )
+    })

    if (response.status === 429 || response.status === 503) {
      const errorBody = await response.text()
@@ -144,13 +143,13 @@ async function callGemini(

    if (!response.ok) {
      const errorBody = await response.text()
-      throw new Error(`Gemini API error ${response.status}: ${errorBody}`)
+      throw new Error(`OpenRouter API error ${response.status}: ${errorBody}`)
    }

    const data = await response.json()
-    const text = data?.candidates?.[0]?.content?.parts?.[0]?.text
+    const text = data?.choices?.[0]?.message?.content
    if (!text) {
-      throw new Error(`No text in Gemini response: ${JSON.stringify(data)}`)
+      throw new Error(`No text in OpenRouter response: ${JSON.stringify(data)}`)
    }
    return text
  }
@@ -220,7 +219,7 @@ EXPECTED ANSWER: ${expectedAnswer}

 ACTUAL ANSWER: ${actualAnswer}`

-  const rawResponse = await callGemini(scoringPrompt, userMessage, 0, 512)
+  const rawResponse = await callLLM(scoringPrompt, userMessage, 0, 512)

  // Extract JSON — handle code fences, preamble text, multiline responses
  const extracted = extractJson(rawResponse)
@@ -311,7 +310,7 @@ async function main() {
  const iteration = getNextIteration(resultsDir)
  console.log(`Running iteration ${iteration}...`)

-  // Build system prompt (same as production)
+  // Build system prompt (same as production llm.ts)
  const systemPrompt = buildSystemPrompt()
  console.log(`System prompt built (${systemPrompt.length} chars).`)

@@ -321,9 +320,9 @@ async function main() {
  for (const q of config.questions) {
    console.log(`\n[${q.id}] ${q.question}`)

-    // Get answer from Gemini
+    // Get answer from LLM
    console.log('  Getting answer...')
-    const actualAnswer = await callGemini(systemPrompt, q.question)
+    const actualAnswer = await callLLM(systemPrompt, q.question)
    console.log(`  Answer: ${actualAnswer.slice(0, 100)}...`)

    // Score the answer
@@ -354,7 +353,7 @@ async function main() {
  const results: BenchmarkResults = {
    iteration,
    timestamp: new Date().toISOString(),
-    model: GEMINI_MODEL,
+    model: LLM_MODEL,
    totalScore,
    maxPossibleScore: config.maxScore,
    passThreshold: config.passThreshold,