import { readFileSync, writeFileSync, readdirSync, mkdirSync, existsSync } from 'node:fs' import { resolve } from 'node:path' // Load .env file manually (avoid adding dotenv dependency) function loadEnvFile(): void { const envPath = resolve(import.meta.dirname, '..', '.env') if (!existsSync(envPath)) return const content = readFileSync(envPath, 'utf-8') for (const line of content.split('\n')) { const trimmed = line.trim() if (!trimmed || trimmed.startsWith('#')) continue const eqIndex = trimmed.indexOf('=') if (eqIndex === -1) continue const key = trimmed.slice(0, eqIndex) const value = trimmed.slice(eqIndex + 1) if (!process.env[key]) { process.env[key] = value } } } loadEnvFile() // --- Types --- interface BenchmarkQuestion { id: string question: string expectedAnswer: string keyFacts: string[] } interface BenchmarkConfig { passThreshold: number maxScore: number questions: BenchmarkQuestion[] } interface ScoringResult { score: 0 | 1 | 2 justification: string } interface QuestionResult { id: string question: string expectedAnswer: string actualAnswer: string score: number justification: string } interface BenchmarkResults { iteration: number timestamp: string model: string totalScore: number maxPossibleScore: number passThreshold: number passed: boolean hasZeros: boolean results: QuestionResult[] } // --- OpenRouter API --- const LLM_MODEL = 'z-ai/glm-5' const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1/chat/completions' function getApiKey(): string { const key = process.env.VITE_OPEN_ROUTER_API_KEY if (!key) { throw new Error('VITE_OPEN_ROUTER_API_KEY not set. Ensure .env file exists with this key.') } return key } // Mirrors buildSystemPrompt() from src/lib/llm.ts — kept in sync manually // because llm.ts uses import.meta.env (Vite) and window.location (browser) function buildSystemPrompt(): string { return `You are a helpful assistant on Andy Charlwood's portfolio website. Answer questions about Andy's professional background using ONLY the information below. ## Profile Andy Charlwood — MPharm, GPhC Registered Pharmacist. Norwich, UK. Healthcare leader combining clinical pharmacy with Python, SQL, and data analytics (self-taught). Leading population health analytics for NHS Norfolk & Waveney ICB, serving 1.2 million people. Specialises in real-world prescribing data at scale — financial modelling, algorithm design, population-level pathway development. Identified and prioritised efficiency programmes worth £14.6M+ through automated analysis. ## Career History ### [exp-interim-head-2025] Interim Head, Population Health & Data Analysis NHS Norfolk & Waveney ICB | May–Nov 2025 Led strategic delivery of population health initiatives and data-driven medicines optimisation, reporting to Associate Director of Pharmacy with accountability to Chief Medical Officer. - Identified £14.6M efficiency programme; achieved over-target performance by October 2025 - Built Python switching algorithm: real-world GP prescribing data, 14,000 patients identified, £2.6M annual savings (£2M on target), compressed months of analysis into 3 days - Automated incentive scheme with novel GP payment system linking rewards to savings; 50% prescribing reduction within 2 months - Presented to CMO bimonthly with evidence-based recommendations - Led transformation to patient-level SQL analytics and self-serve model ### [exp-deputy-head-2024] Deputy Head, Population Health & Data Analysis NHS Norfolk & Waveney ICB | Jul 2024–Present (substantive role) Driving data analytics strategy for medicines optimisation from messy, real-world GP prescribing data. - Managed £220M prescribing budget with forecasting models for proactive financial planning - Created comprehensive dm+d medicines data table: standardised strengths, morphine equivalents, Anticholinergic Burden scoring — single source of truth for all medicines analytics - Led DOAC switching programme financial modelling: interactive dashboard with rebate mechanics, workforce constraints, patent expiry timelines - Renegotiated pharmaceutical rebate terms ahead of patent expiry - Supported tirzepatide commissioning (NICE TA1026): financial projections, eligible cohort identification; authored executive paper advocating primary care model, driving system shift to GP-led delivery - Built Python controlled drug monitoring system: oral morphine equivalents across all opioid prescriptions, patient-level exposure tracking, high-risk identification, diversion detection at population scale - Improved team data fluency through training, documentation, and self-serve tools ### [exp-high-cost-drugs-2022] High-Cost Drugs & Interface Pharmacist NHS Norfolk & Waveney ICB | May 2022–Jul 2024 Led NICE TA implementation and high-cost drug pathways across the ICS. Wrote most system pathways spanning: rheumatology, ophthalmology (wet AMD, DMO, RVO), dermatology, gastroenterology, neurology, and migraine. - Blueteq automation: 70% form reduction, 200 hours immediate savings, 7–8 hours ongoing weekly gains - Integrated Blueteq with secondary care databases for accurate high-cost drug spend tracking - Python Sankey chart tool for patient pathway visualisation and trust compliance auditing ### [exp-pharmacy-manager-2017] Pharmacy Manager Tesco PLC (private sector, NOT NHS) | Nov 2017–May 2022 Community pharmacy with full operational autonomy (100-hour contract). LPC representative for Norfolk. - Asthma screening process adopted nationally (~300 branches): reduced pharmacist time 60→6 hours/store/month, ~£1M revenue - Created national induction training plan and eLearning modules - Supervised two staff through NVQ3 to pharmacy technician registration; full HR responsibilities ## Projects ### [proj-inv-pharmetrics] PharMetrics Interactive Platform (2024, Live) Real-time medicines expenditure dashboard for NHS decision-makers. Tech: Power BI, SQL, DAX. Tracks the £220M prescribing budget with self-serve analytics. ### [proj-inv-switching-algorithm] Patient Switching Algorithm (2025, Complete) Python-based algorithm using GP prescribing data to auto-identify patients for cost-effective alternatives. Tech: Python, Pandas, SQL. Identified 14,000 patients, £2.6M annual savings, novel GP payment system linking rewards to savings. ### [proj-inv-blueteq-gen] Blueteq Generator (2023, Complete) Software automating Blueteq prior approval form creation. Tech: Python, SQL. 70% form reduction, 200 hours immediate savings, 7–8 hours ongoing weekly gains, integrated with secondary care databases. ### [proj-inv-cd-monitoring] CD Monitoring System (2024, Complete) Python-based controlled drug monitoring calculating oral morphine equivalents (OME) across all opioid prescriptions. Tech: Python, SQL. Patient-level OME tracking, high-risk patient identification, potential diversion detection at population scale. ### [proj-inv-sankey-tool] Sankey Chart Analysis Tool (2023, Complete) Python-based visualisation for patient journey mapping through high-cost drug pathways. Tech: Python, Matplotlib, SQL. Trust-level compliance auditing, multi-specialty pathway coverage. ## Education ### [edu-0] NHS Mary Seacole Programme (2018) NHS Leadership Academy. Score: 78%. Covers change management, healthcare leadership, system-level thinking. ### [edu-1] MPharm (Hons) 2:1 — University of East Anglia (2011–2015) 4-year integrated Master's degree. Research project on drug delivery and cocrystals: 75.1% (Distinction). ### [edu-2] A-Levels — Highworth Grammar School (2009–2011) Mathematics A*, Chemistry B, Politics C. ### [edu-3] GPhC Registration — General Pharmaceutical Council (August 2016–Present) Professional registration required to practise as a pharmacist in Great Britain. ## Skills Technical: [skill-data-analysis] Data Analysis (9yr, 95%), [skill-python] Python (6yr, 90%), [skill-sql] SQL (7yr, 88%), [skill-power-bi] Power BI (5yr, 92%), [skill-javascript-typescript] JavaScript/TypeScript (3yr, 70%), [skill-excel] Excel (9yr, 85%), [skill-algorithm-design] Algorithm Design (3yr, 82%), [skill-data-pipelines] Data Pipelines (2yr, 75%) Domain: [skill-medicines-optimisation] Medicines Optimisation (9yr, 95%), [skill-population-health] Population Health (3yr, 90%), [skill-nice-ta] NICE TA Implementation (3yr, 92%), [skill-health-economics] Health Economics (3yr, 80%), [skill-clinical-pathways] Clinical Pathways (3yr, 88%), [skill-controlled-drugs] Controlled Drugs (1yr, 85%) Leadership: [skill-budget-management] Budget Management (1yr, 90%), [skill-stakeholder-engagement] Stakeholder Engagement (3yr, 88%), [skill-pharma-negotiation] Pharmaceutical Negotiation (1yr, 82%), [skill-team-development] Team Development (8yr, 85%), [skill-change-management] Change Management (7yr, 80%), [skill-financial-modelling] Financial Modelling (1yr, 78%), [skill-executive-comms] Executive Communication (1yr, 85%) ## Response Rules 1. Answer ONLY from the data above. If the answer is not in the data, say "I don't have that information" — never invent facts, roles, dates, achievements, URLs, or contact details. 2. Distinguish NHS employment (May 2022–present, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy). Never conflate the two. 3. When asked broad questions about tools, skills, projects, or achievements across Andy's career, aggregate from ALL roles — do not limit your answer to one position. 4. Cite exact numbers, dates, percentages, and outcomes. Never say "approximately" or "around" when exact figures exist in the data. 5. For detailed or list-based questions, give a thorough answer covering all relevant items. For simple questions, be concise (2-4 sentences). ## Item References End your response with a single line listing relevant item IDs from the square-bracketed IDs above: [ITEMS: exp-deputy-head-2024, skill-python] Only include IDs that directly support your answer. Omit the line if none are relevant.` } function sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)) } async function callLLM( systemPrompt: string, userMessage: string, temperature = 0.4, maxTokens = 800, ): Promise { const apiKey = getApiKey() const maxRetries = 5 for (let attempt = 0; attempt < maxRetries; attempt++) { const response = await fetch(OPENROUTER_API_URL, { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${apiKey}`, 'HTTP-Referer': 'https://andycharlwood.co.uk', 'X-Title': 'Andy Charlwood Portfolio', }, body: JSON.stringify({ model: LLM_MODEL, temperature, max_tokens: maxTokens, messages: [ { role: 'system', content: systemPrompt }, { role: 'user', content: userMessage }, ], }), }) if (response.status === 429 || response.status === 503) { const errorBody = await response.text() const retryMatch = errorBody.match(/retry in ([\d.]+)s/) const waitSeconds = retryMatch ? Math.ceil(parseFloat(retryMatch[1])) + 2 : (attempt + 1) * 15 const reason = response.status === 429 ? 'Rate limited' : 'Service unavailable' console.log(` ${reason}. Waiting ${waitSeconds}s (attempt ${attempt + 1}/${maxRetries})...`) await sleep(waitSeconds * 1000) continue } if (!response.ok) { const errorBody = await response.text() throw new Error(`OpenRouter API error ${response.status}: ${errorBody}`) } const data = await response.json() const text = data?.choices?.[0]?.message?.content if (!text) { throw new Error(`No text in OpenRouter response: ${JSON.stringify(data)}`) } return text } throw new Error('Max retries exceeded for rate limiting') } // --- Scoring --- function extractJson(text: string): string | null { // Try parsing directly first try { JSON.parse(text) return text } catch { /* not direct JSON, continue extraction */ } // Strip markdown code fences const fenceMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/) if (fenceMatch) { return fenceMatch[1].trim() } // Find first { ... } block const braceStart = text.indexOf('{') if (braceStart === -1) return null // Find matching closing brace let depth = 0 let inString = false let escaped = false for (let i = braceStart; i < text.length; i++) { const ch = text[i] if (escaped) { escaped = false; continue } if (ch === '\\') { escaped = true; continue } if (ch === '"') { inString = !inString; continue } if (inString) continue if (ch === '{') depth++ if (ch === '}') { depth--; if (depth === 0) return text.slice(braceStart, i + 1) } } return null } async function scoreAnswer( question: string, expectedAnswer: string, keyFacts: string[], actualAnswer: string, ): Promise { const scoringPrompt = `You are a strict evaluator. Compare an ACTUAL answer to an EXPECTED answer about a person's CV. Rubric: - 2 = ACCURATE: Covers key facts correctly. Minor omissions OK if no errors. - 1 = PARTIAL: Some key facts right but misses important details or is vague. - 0 = INCORRECT: Contains factual errors, contradicts expected answer, or misses the point. Key facts for score 2: ${keyFacts.map((f) => `- ${f}`).join('\n')} IMPORTANT: Respond with ONLY a single-line JSON object. No markdown, no code fences, no extra text. Example: {"score":2,"justification":"Covers all key facts accurately"} Keep justification under 30 words.` const userMessage = `QUESTION: ${question} EXPECTED ANSWER: ${expectedAnswer} ACTUAL ANSWER: ${actualAnswer}` const rawResponse = await callLLM(scoringPrompt, userMessage, 0, 512) // Extract JSON — handle code fences, preamble text, multiline responses const extracted = extractJson(rawResponse) if (!extracted) { console.warn(` Warning: Could not extract JSON from scoring response: ${rawResponse.slice(0, 200)}`) return { score: 0, justification: `Failed to parse scoring response` } } try { const parsed = JSON.parse(extracted) as ScoringResult if (![0, 1, 2].includes(parsed.score)) { console.warn(` Warning: Invalid score value: ${parsed.score}`) return { score: 0, justification: `Invalid score value: ${parsed.score}` } } return parsed } catch { console.warn(` Warning: Invalid JSON: ${extracted.slice(0, 150)}`) return { score: 0, justification: `Invalid JSON in response` } } } // --- Iteration Management --- function getNextIteration(resultsDir: string): number { if (!existsSync(resultsDir)) return 0 const files = readdirSync(resultsDir).filter((f) => f.startsWith('iteration-') && f.endsWith('.json')) if (files.length === 0) return 0 const iterations = files.map((f) => { const match = f.match(/iteration-(\d+)\.json/) return match ? parseInt(match[1], 10) : -1 }) return Math.max(...iterations) + 1 } // --- Console Output --- function printSummary(results: BenchmarkResults): void { console.log('\n' + '='.repeat(80)) console.log(`BENCHMARK RESULTS — Iteration ${results.iteration}`) console.log(`Model: ${results.model} | ${results.timestamp}`) console.log('='.repeat(80)) // Table header console.log( 'ID'.padEnd(6) + 'Score'.padEnd(8) + 'Question'.padEnd(50) + 'Justification' ) console.log('-'.repeat(80)) for (const r of results.results) { const scoreLabel = r.score === 2 ? '2 ✓' : r.score === 1 ? '1 ~' : '0 ✗' const questionTruncated = r.question.length > 47 ? r.question.slice(0, 44) + '...' : r.question const justTruncated = r.justification.length > 60 ? r.justification.slice(0, 57) + '...' : r.justification console.log( r.id.padEnd(6) + scoreLabel.padEnd(8) + questionTruncated.padEnd(50) + justTruncated ) } console.log('-'.repeat(80)) console.log( `TOTAL: ${results.totalScore}/${results.maxPossibleScore}` + ` | Threshold: ${results.passThreshold}/${results.maxPossibleScore}` + ` | Has zeros: ${results.hasZeros ? 'YES' : 'No'}` + ` | ${results.passed ? 'PASSED ✓' : 'FAILED ✗'}` ) console.log('='.repeat(80)) } // --- Main --- async function main() { const scriptDir = import.meta.dirname const configPath = resolve(scriptDir, 'benchmark-config.json') const resultsDir = resolve(scriptDir, 'benchmark-results') // Load config const config: BenchmarkConfig = JSON.parse(readFileSync(configPath, 'utf-8')) console.log(`Loaded ${config.questions.length} benchmark questions.`) // Determine iteration number const iteration = getNextIteration(resultsDir) console.log(`Running iteration ${iteration}...`) // Build system prompt (same as production llm.ts) const systemPrompt = buildSystemPrompt() console.log(`System prompt built (${systemPrompt.length} chars).`) // Run each question const questionResults: QuestionResult[] = [] for (const q of config.questions) { console.log(`\n[${q.id}] ${q.question}`) // Get answer from LLM console.log(' Getting answer...') const actualAnswer = await callLLM(systemPrompt, q.question) console.log(` Answer: ${actualAnswer.slice(0, 100)}...`) // Score the answer console.log(' Scoring...') const { score, justification } = await scoreAnswer( q.question, q.expectedAnswer, q.keyFacts, actualAnswer, ) console.log(` Score: ${score}/2 — ${justification}`) questionResults.push({ id: q.id, question: q.question, expectedAnswer: q.expectedAnswer, actualAnswer, score, justification, }) } // Calculate totals const totalScore = questionResults.reduce((sum, r) => sum + r.score, 0) const hasZeros = questionResults.some((r) => r.score === 0) const passed = totalScore >= config.passThreshold && !hasZeros const results: BenchmarkResults = { iteration, timestamp: new Date().toISOString(), model: LLM_MODEL, totalScore, maxPossibleScore: config.maxScore, passThreshold: config.passThreshold, passed, hasZeros, results: questionResults, } // Save results mkdirSync(resultsDir, { recursive: true }) const resultsPath = resolve(resultsDir, `iteration-${iteration}.json`) writeFileSync(resultsPath, JSON.stringify(results, null, 2)) console.log(`\nResults saved to ${resultsPath}`) // Print summary table printSummary(results) // Exit with appropriate code process.exit(passed ? 0 : 1) } main().catch((err) => { console.error('Benchmark failed:', err) process.exit(2) })