import { readFileSync, writeFileSync, readdirSync, mkdirSync, existsSync } from 'node:fs'
import { resolve } from 'node:path'

// --- Environment ---

/** Removes one pair of matching single or double quotes wrapping `value`, if present. */
function stripMatchingQuotes(value: string): string {
  if (value.length < 2) return value
  const first = value[0]
  const last = value[value.length - 1]
  const isQuoted = (first === '"' || first === "'") && first === last
  return isQuoted ? value.slice(1, -1) : value
}

/**
 * Load .env file manually (avoid adding dotenv dependency).
 *
 * Reads `../.env` relative to this script and copies each `KEY=VALUE` line into
 * `process.env`. Blank lines, `#` comments and lines without `=` are skipped.
 * Whitespace around the key and value is trimmed and one pair of surrounding
 * quotes is removed from the value. Variables that already hold a non-empty
 * value in the environment are left untouched. Does nothing when the file is
 * missing.
 */
function loadEnvFile(): void {
  const envPath = resolve(import.meta.dirname, '..', '.env')
  if (!existsSync(envPath)) return

  const content = readFileSync(envPath, 'utf-8')
  for (const line of content.split('\n')) {
    // trim() also drops the trailing '\r' of CRLF files.
    const trimmed = line.trim()
    if (!trimmed || trimmed.startsWith('#')) continue

    const eqIndex = trimmed.indexOf('=')
    if (eqIndex === -1) continue

    const key = trimmed.slice(0, eqIndex).trim()
    if (!key) continue
    const value = stripMatchingQuotes(trimmed.slice(eqIndex + 1).trim())

    if (!process.env[key]) {
      process.env[key] = value
    }
  }
}

loadEnvFile()

// --- Types ---

/** One benchmark item as stored in benchmark-config.json. */
interface BenchmarkQuestion {
  id: string
  question: string
  expectedAnswer: string
  keyFacts: string[]
}

/** Shape of benchmark-config.json. */
interface BenchmarkConfig {
  passThreshold: number
  maxScore: number
  questions: BenchmarkQuestion[]
}

/** Verdict returned by the scoring call for a single answer. */
interface ScoringResult {
  score: 0 | 1 | 2
  justification: string
}

/** A question together with the model's answer and its score. */
interface QuestionResult {
  id: string
  question: string
  expectedAnswer: string
  actualAnswer: string
  score: number
  justification: string
}

/** Everything written to `iteration-N.json` for one benchmark run. */
interface BenchmarkResults {
  iteration: number
  timestamp: string
  model: string
  totalScore: number
  maxPossibleScore: number
  passThreshold: number
  passed: boolean
  hasZeros: boolean
  results: QuestionResult[]
}

/** The subset of the chat-completions response body that this script reads. */
interface ChatCompletionResponse {
  choices?: Array<{ message?: { content?: unknown } }>
}

// --- OpenRouter API ---

const LLM_MODEL = 'z-ai/glm-5'
const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1/chat/completions'

/**
 * Returns the OpenRouter API key from `VITE_OPEN_ROUTER_API_KEY`.
 *
 * @throws Error when the variable is unset or empty.
 */
function getApiKey(): string {
  const key = process.env.VITE_OPEN_ROUTER_API_KEY
  if (!key) {
    throw new Error('VITE_OPEN_ROUTER_API_KEY not set. Ensure .env file exists with this key.')
  }
  return key
}

// Mirrors buildSystemPrompt() from src/lib/llm.ts — kept in sync manually
// because llm.ts uses import.meta.env (Vite) and window.location (browser)
function buildSystemPrompt(): string {
  return `You are a helpful assistant on Andy Charlwood's portfolio website.
Answer questions about Andy's professional background using ONLY the information below.

## Profile
Andy Charlwood — MPharm, GPhC Registered Pharmacist. Norwich, UK.
Healthcare leader combining clinical pharmacy with Python, SQL, and data analytics (self-taught). Leading population health analytics for NHS Norfolk & Waveney ICB, serving 1.2 million people. Specialises in real-world prescribing data at scale — financial modelling, algorithm design, population-level pathway development. Identified and prioritised efficiency programmes worth £14.6M+ through automated analysis.

## Career History

### [exp-interim-head-2025] Interim Head, Population Health & Data Analysis
NHS Norfolk & Waveney ICB | May–Nov 2025
Led strategic delivery of population health initiatives and data-driven medicines optimisation, reporting to Associate Director of Pharmacy with accountability to Chief Medical Officer.
- Identified £14.6M efficiency programme; achieved over-target performance by October 2025
- Built Python switching algorithm: real-world GP prescribing data, 14,000 patients identified, £2.6M annual savings (£2M on target), compressed months of analysis into 3 days
- Automated incentive scheme with novel GP payment system linking rewards to savings; 50% prescribing reduction within 2 months
- Presented to CMO bimonthly with evidence-based recommendations
- Led transformation to patient-level SQL analytics and self-serve model

### [exp-deputy-head-2024] Deputy Head, Population Health & Data Analysis
NHS Norfolk & Waveney ICB | Jul 2024–Present (substantive role)
Driving data analytics strategy for medicines optimisation from messy, real-world GP prescribing data.
- Managed £220M prescribing budget with forecasting models for proactive financial planning
- Created comprehensive dm+d medicines data table: standardised strengths, morphine equivalents, Anticholinergic Burden scoring — single source of truth for all medicines analytics
- Led DOAC switching programme financial modelling: interactive dashboard with rebate mechanics, workforce constraints, patent expiry timelines
- Renegotiated pharmaceutical rebate terms ahead of patent expiry
- Supported tirzepatide commissioning (NICE TA1026): financial projections, eligible cohort identification; authored executive paper advocating primary care model, driving system shift to GP-led delivery
- Built Python controlled drug monitoring system: oral morphine equivalents across all opioid prescriptions, patient-level exposure tracking, high-risk identification, diversion detection at population scale
- Improved team data fluency through training, documentation, and self-serve tools

### [exp-high-cost-drugs-2022] High-Cost Drugs & Interface Pharmacist
NHS Norfolk & Waveney ICB | May 2022–Jul 2024
Led NICE TA implementation and high-cost drug pathways across the ICS. Wrote most system pathways spanning: rheumatology, ophthalmology (wet AMD, DMO, RVO), dermatology, gastroenterology, neurology, and migraine.
- Blueteq automation: 70% form reduction, 200 hours immediate savings, 7–8 hours ongoing weekly gains
- Integrated Blueteq with secondary care databases for accurate high-cost drug spend tracking
- Python Sankey chart tool for patient pathway visualisation and trust compliance auditing

### [exp-pharmacy-manager-2017] Pharmacy Manager
Tesco PLC (private sector, NOT NHS) | Nov 2017–May 2022
Community pharmacy with full operational autonomy (100-hour contract). LPC representative for Norfolk.
- Asthma screening process adopted nationally (~300 branches): reduced pharmacist time 60→6 hours/store/month, ~£1M revenue
- Created national induction training plan and eLearning modules
- Supervised two staff through NVQ3 to pharmacy technician registration; full HR responsibilities

## Projects

### [proj-inv-pharmetrics] PharMetrics Interactive Platform (2024, Live)
Real-time medicines expenditure dashboard for NHS decision-makers. Tech: Power BI, SQL, DAX. Tracks the £220M prescribing budget with self-serve analytics.

### [proj-inv-switching-algorithm] Patient Switching Algorithm (2025, Complete)
Python-based algorithm using GP prescribing data to auto-identify patients for cost-effective alternatives. Tech: Python, Pandas, SQL. Identified 14,000 patients, £2.6M annual savings, novel GP payment system linking rewards to savings.

### [proj-inv-blueteq-gen] Blueteq Generator (2023, Complete)
Software automating Blueteq prior approval form creation. Tech: Python, SQL. 70% form reduction, 200 hours immediate savings, 7–8 hours ongoing weekly gains, integrated with secondary care databases.

### [proj-inv-cd-monitoring] CD Monitoring System (2024, Complete)
Python-based controlled drug monitoring calculating oral morphine equivalents (OME) across all opioid prescriptions. Tech: Python, SQL. Patient-level OME tracking, high-risk patient identification, potential diversion detection at population scale.

### [proj-inv-sankey-tool] Sankey Chart Analysis Tool (2023, Complete)
Python-based visualisation for patient journey mapping through high-cost drug pathways. Tech: Python, Matplotlib, SQL. Trust-level compliance auditing, multi-specialty pathway coverage.

## Education

### [edu-0] NHS Mary Seacole Programme (2018)
NHS Leadership Academy. Score: 78%. Covers change management, healthcare leadership, system-level thinking.

### [edu-1] MPharm (Hons) 2:1 — University of East Anglia (2011–2015)
4-year integrated Master's degree.
Research project on drug delivery and cocrystals: 75.1% (Distinction).

### [edu-2] A-Levels — Highworth Grammar School (2009–2011)
Mathematics A*, Chemistry B, Politics C.

### [edu-3] GPhC Registration — General Pharmaceutical Council (August 2016–Present)
Professional registration required to practise as a pharmacist in Great Britain.

## Skills
Technical: [skill-data-analysis] Data Analysis (9yr, 95%), [skill-python] Python (6yr, 90%), [skill-sql] SQL (7yr, 88%), [skill-power-bi] Power BI (5yr, 92%), [skill-javascript-typescript] JavaScript/TypeScript (3yr, 70%), [skill-excel] Excel (9yr, 85%), [skill-algorithm-design] Algorithm Design (3yr, 82%), [skill-data-pipelines] Data Pipelines (2yr, 75%)
Domain: [skill-medicines-optimisation] Medicines Optimisation (9yr, 95%), [skill-population-health] Population Health (3yr, 90%), [skill-nice-ta] NICE TA Implementation (3yr, 92%), [skill-health-economics] Health Economics (3yr, 80%), [skill-clinical-pathways] Clinical Pathways (3yr, 88%), [skill-controlled-drugs] Controlled Drugs (1yr, 85%)
Leadership: [skill-budget-management] Budget Management (1yr, 90%), [skill-stakeholder-engagement] Stakeholder Engagement (3yr, 88%), [skill-pharma-negotiation] Pharmaceutical Negotiation (1yr, 82%), [skill-team-development] Team Development (8yr, 85%), [skill-change-management] Change Management (7yr, 80%), [skill-financial-modelling] Financial Modelling (1yr, 78%), [skill-executive-comms] Executive Communication (1yr, 85%)

## Response Rules
- Answer ONLY from the data above. Never invent facts, roles, dates, or achievements.
- Distinguish NHS employment (May 2022 onwards, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy).
- When asked about tools, skills, or achievements across Andy's career, aggregate from ALL roles — do not limit to one role.
- Cite specific numbers, dates, and outcomes when available. Never say "approximately" when exact figures exist in the data.
- If the answer is not in the data, say so honestly.
- Do not fabricate URLs, email addresses, or contact details.
- Be concise: 2-4 sentences unless the question requires a longer answer.

## Item References
End your response with a single line listing relevant item IDs from the square-bracketed IDs above:
[ITEMS: exp-deputy-head-2024, skill-python]
Only include IDs that directly support your answer. Omit the line if none are relevant.`
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  // The executor parameter is deliberately not called `resolve`, which would
  // shadow the node:path import of the same name.
  return new Promise((done) => setTimeout(done, ms))
}

/**
 * Picks how long to wait before retrying a 429/503 response.
 *
 * Uses the "retry in <n>s" hint from the error body (rounded up, plus a 2 s
 * margin) when one is present and numeric; otherwise backs off linearly in
 * 15 s steps based on the zero-based `attempt`.
 */
function retryDelaySeconds(errorBody: string, attempt: number): number {
  const hint = /retry in ([\d.]+)s/.exec(errorBody)?.[1]
  const hinted = hint === undefined ? Number.NaN : Math.ceil(parseFloat(hint)) + 2
  // The pattern also matches strings such as "." that parse to NaN.
  return Number.isFinite(hinted) ? hinted : (attempt + 1) * 15
}

/**
 * Sends one system + user message pair to the OpenRouter chat-completions
 * endpoint and returns the text of the first choice.
 *
 * Responses with status 429 or 503 are retried, up to 5 attempts in total,
 * waiting between attempts as computed by `retryDelaySeconds`.
 *
 * @param systemPrompt - Content of the `system` message.
 * @param userMessage - Content of the `user` message.
 * @param temperature - Sampling temperature forwarded to the API.
 * @param maxTokens - Forwarded as `max_tokens`.
 * @returns The non-empty `choices[0].message.content` string.
 * @throws Error on any other non-OK status, when the response carries no text,
 *   or when every attempt was answered with 429/503.
 */
async function callLLM(
  systemPrompt: string,
  userMessage: string,
  temperature = 0.7,
  maxTokens = 512,
): Promise<string> {
  const apiKey = getApiKey()
  const maxRetries = 5

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const response = await fetch(OPENROUTER_API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
        'HTTP-Referer': 'https://andycharlwood.co.uk',
        'X-Title': 'Andy Charlwood Portfolio',
      },
      body: JSON.stringify({
        model: LLM_MODEL,
        temperature,
        max_tokens: maxTokens,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userMessage },
        ],
      }),
    })

    if (response.status === 429 || response.status === 503) {
      const errorBody = await response.text()
      // Nothing follows the final attempt, so waiting again would only delay the failure.
      if (attempt === maxRetries - 1) break

      const waitSeconds = retryDelaySeconds(errorBody, attempt)
      const reason = response.status === 429 ? 'Rate limited' : 'Service unavailable'
      console.log(` ${reason}. Waiting ${waitSeconds}s (attempt ${attempt + 1}/${maxRetries})...`)
      await sleep(waitSeconds * 1000)
      continue
    }

    if (!response.ok) {
      const errorBody = await response.text()
      throw new Error(`OpenRouter API error ${response.status}: ${errorBody}`)
    }

    const data = (await response.json()) as ChatCompletionResponse | null
    const text = data?.choices?.[0]?.message?.content
    // Reject empty and non-string content so callers can rely on the string return type.
    if (typeof text !== 'string' || !text) {
      throw new Error(`No text in OpenRouter response: ${JSON.stringify(data)}`)
    }
    return text
  }

  throw new Error('Max retries exceeded for rate limiting')
}

// --- Scoring ---

/** Reports whether `text` is accepted by `JSON.parse`. */
function isParsableJson(text: string): boolean {
  try {
    JSON.parse(text)
    return true
  } catch {
    return false
  }
}

/**
 * Returns the brace-balanced substring that begins at `text[start]` (expected
 * to be `{`), or `null` when the braces never balance. Braces inside
 * double-quoted strings are ignored, and a backslash escapes the next
 * character only while inside a string.
 */
function findBalancedObject(text: string, start: number): string | null {
  let depth = 0
  let inString = false
  let escaped = false

  for (let i = start; i < text.length; i++) {
    const ch = text[i]
    if (inString) {
      if (escaped) escaped = false
      else if (ch === '\\') escaped = true
      else if (ch === '"') inString = false
      continue
    }
    if (ch === '"') {
      inString = true
    } else if (ch === '{') {
      depth++
    } else if (ch === '}') {
      depth--
      if (depth === 0) return text.slice(start, i + 1)
    }
  }
  return null
}

/**
 * Pulls a JSON candidate out of a model response.
 *
 * Tried in order:
 * 1. the whole text, if it already parses as JSON;
 * 2. the contents of the first Markdown code fence (returned without validation);
 * 3. the first brace-balanced `{ ... }` block that parses as JSON, falling back
 *    to the first balanced block when none of them parses.
 *
 * @returns The candidate string, or `null` when no balanced block is found.
 *   The result is not guaranteed to be valid JSON.
 */
function extractJson(text: string): string | null {
  // Try parsing directly first
  if (isParsableJson(text)) return text

  // Strip markdown code fences
  const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/)?.[1]
  if (fenced !== undefined) return fenced.trim()

  // Scan every '{' so that stray braces in preamble text do not hide a later valid object.
  let firstBalanced: string | null = null
  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
    const candidate = findBalancedObject(text, start)
    if (candidate === null) continue
    if (isParsableJson(candidate)) return candidate
    if (firstBalanced === null) firstBalanced = candidate
  }
  return firstBalanced
}

/** Type guard for the three scores allowed by the rubric. */
function isRubricScore(value: unknown): value is 0 | 1 | 2 {
  return value === 0 || value === 1 || value === 2
}

/**
 * Asks the LLM (temperature 0) to grade `actualAnswer` against the expected
 * answer and key facts on the 0–2 rubric.
 *
 * A reply that cannot be parsed, or whose score is not 0, 1 or 2, is logged as
 * a warning and graded 0 with an explanatory justification; it does not throw.
 *
 * @returns The score together with the grader's justification.
 */
async function scoreAnswer(
  question: string,
  expectedAnswer: string,
  keyFacts: string[],
  actualAnswer: string,
): Promise<ScoringResult> {
  const scoringPrompt = `You are a strict evaluator. Compare an ACTUAL answer to an EXPECTED answer about a person's CV.

Rubric:
- 2 = ACCURATE: Covers key facts correctly. Minor omissions OK if no errors.
- 1 = PARTIAL: Some key facts right but misses important details or is vague.
- 0 = INCORRECT: Contains factual errors, contradicts expected answer, or misses the point.

Key facts for score 2:
${keyFacts.map((f) => `- ${f}`).join('\n')}

IMPORTANT: Respond with ONLY a single-line JSON object. No markdown, no code fences, no extra text.
Example: {"score":2,"justification":"Covers all key facts accurately"}
Keep justification under 30 words.`

  const userMessage = `QUESTION: ${question}

EXPECTED ANSWER: ${expectedAnswer}

ACTUAL ANSWER: ${actualAnswer}`

  const rawResponse = await callLLM(scoringPrompt, userMessage, 0, 512)

  // Extract JSON — handle code fences, preamble text, multiline responses
  const extracted = extractJson(rawResponse)
  if (!extracted) {
    console.warn(` Warning: Could not extract JSON from scoring response: ${rawResponse.slice(0, 200)}`)
    return { score: 0, justification: `Failed to parse scoring response` }
  }

  let parsed: unknown
  try {
    parsed = JSON.parse(extracted)
  } catch {
    parsed = undefined
  }
  // Valid JSON that is not an object (null, a number, a string) is as unusable as a syntax error.
  if (typeof parsed !== 'object' || parsed === null) {
    console.warn(` Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
    return { score: 0, justification: `Invalid JSON in response` }
  }

  const { score, justification } = parsed as { score?: unknown; justification?: unknown }
  if (!isRubricScore(score)) {
    console.warn(` Warning: Invalid score value: ${String(score)}`)
    return { score: 0, justification: `Invalid score value: ${String(score)}` }
  }

  // Later code reads justification.length, so it must always be a string.
  return {
    score,
    justification: typeof justification === 'string' ? justification : '(no justification provided)',
  }
}

// --- Iteration Management ---

/**
 * Returns the next free iteration number for `resultsDir`: one more than the
 * highest `iteration-<n>.json` file found there, or 0 when the directory is
 * missing or holds no such file.
 */
function getNextIteration(resultsDir: string): number {
  if (!existsSync(resultsDir)) return 0

  let highest = -1
  for (const file of readdirSync(resultsDir)) {
    const digits = /^iteration-(\d+)\.json$/.exec(file)?.[1]
    if (digits !== undefined) highest = Math.max(highest, parseInt(digits, 10))
  }
  return highest + 1
}

// --- Console Output ---

/** Shortens `text` to at most `maxLength` characters, ending in '...' when it was cut. */
function truncate(text: string, maxLength: number): string {
  return text.length > maxLength ? text.slice(0, maxLength - 3) + '...' : text
}

/** Prints the per-question result table and the overall pass/fail line to stdout. */
function printSummary(results: BenchmarkResults): void {
  const rule = '='.repeat(80)
  const thinRule = '-'.repeat(80)

  console.log('\n' + rule)
  console.log(`BENCHMARK RESULTS — Iteration ${results.iteration}`)
  console.log(`Model: ${results.model} | ${results.timestamp}`)
  console.log(rule)

  // Table header
  console.log('ID'.padEnd(6) + 'Score'.padEnd(8) + 'Question'.padEnd(50) + 'Justification')
  console.log(thinRule)

  for (const r of results.results) {
    const scoreLabel = r.score === 2 ? '2 ✓' : r.score === 1 ? '1 ~' : '0 ✗'
    console.log(
      r.id.padEnd(6) +
        scoreLabel.padEnd(8) +
        truncate(r.question, 47).padEnd(50) +
        truncate(r.justification, 60),
    )
  }

  console.log(thinRule)
  console.log(
    `TOTAL: ${results.totalScore}/${results.maxPossibleScore}` +
      ` | Threshold: ${results.passThreshold}/${results.maxPossibleScore}` +
      ` | Has zeros: ${results.hasZeros ? 'YES' : 'No'}` +
      ` | ${results.passed ? 'PASSED ✓' : 'FAILED ✗'}`,
  )
  console.log(rule)
}

// --- Main ---

/**
 * Parses the benchmark config and checks the fields whose absence would make
 * the run crash part-way through: `questions` must be an array whose entries
 * have a string `id`, a string `question` and an array `keyFacts`.
 *
 * @throws Error (or SyntaxError from `JSON.parse`) when the config is malformed.
 */
function parseBenchmarkConfig(raw: string): BenchmarkConfig {
  const parsed: unknown = JSON.parse(raw)
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error('benchmark-config.json must contain a JSON object')
  }

  const questions = (parsed as { questions?: unknown }).questions
  if (!Array.isArray(questions)) {
    throw new Error('benchmark-config.json: "questions" must be an array')
  }

  questions.forEach((entry: unknown, index) => {
    const q = entry as Partial<Record<keyof BenchmarkQuestion, unknown>> | null
    const isValid =
      typeof q === 'object' &&
      q !== null &&
      typeof q.id === 'string' &&
      typeof q.question === 'string' &&
      Array.isArray(q.keyFacts)
    if (!isValid) {
      throw new Error(
        `benchmark-config.json: questions[${index}] needs a string "id", a string "question" and a "keyFacts" array`,
      )
    }
  })

  return parsed as BenchmarkConfig
}

/**
 * Runs the benchmark: asks the model every configured question, scores each
 * answer, writes `benchmark-results/iteration-N.json` next to this script,
 * prints the summary table and exits with 0 (passed) or 1 (failed).
 *
 * A run passes when the total score reaches `passThreshold` and no question
 * scored 0.
 */
async function main(): Promise<void> {
  const scriptDir = import.meta.dirname
  const configPath = resolve(scriptDir, 'benchmark-config.json')
  const resultsDir = resolve(scriptDir, 'benchmark-results')

  // Load config
  const config = parseBenchmarkConfig(readFileSync(configPath, 'utf-8'))
  console.log(`Loaded ${config.questions.length} benchmark questions.`)

  // Determine iteration number
  const iteration = getNextIteration(resultsDir)
  console.log(`Running iteration ${iteration}...`)

  // Build system prompt (same as production llm.ts)
  const systemPrompt = buildSystemPrompt()
  console.log(`System prompt built (${systemPrompt.length} chars).`)

  // Run each question. Requests are sequential on purpose: callLLM already has
  // to back off on rate limits.
  const questionResults: QuestionResult[] = []
  for (const q of config.questions) {
    console.log(`\n[${q.id}] ${q.question}`)

    // Get answer from LLM
    console.log(' Getting answer...')
    const actualAnswer = await callLLM(systemPrompt, q.question)
    console.log(` Answer: ${actualAnswer.slice(0, 100)}...`)

    // Score the answer
    console.log(' Scoring...')
    const { score, justification } = await scoreAnswer(
      q.question,
      q.expectedAnswer,
      q.keyFacts,
      actualAnswer,
    )
    console.log(` Score: ${score}/2 — ${justification}`)

    questionResults.push({
      id: q.id,
      question: q.question,
      expectedAnswer: q.expectedAnswer,
      actualAnswer,
      score,
      justification,
    })
  }

  // Calculate totals
  const totalScore = questionResults.reduce((sum, r) => sum + r.score, 0)
  const hasZeros = questionResults.some((r) => r.score === 0)
  const passed = totalScore >= config.passThreshold && !hasZeros

  const results: BenchmarkResults = {
    iteration,
    timestamp: new Date().toISOString(),
    model: LLM_MODEL,
    totalScore,
    maxPossibleScore: config.maxScore,
    passThreshold: config.passThreshold,
    passed,
    hasZeros,
    results: questionResults,
  }

  // Save results
  mkdirSync(resultsDir, { recursive: true })
  const resultsPath = resolve(resultsDir, `iteration-${iteration}.json`)
  writeFileSync(resultsPath, JSON.stringify(results, null, 2))
  console.log(`\nResults saved to ${resultsPath}`)

  // Print summary table
  printSummary(results)

  // Exit with appropriate code
  process.exit(passed ? 0 : 1)
}

// Exit code 2 distinguishes a crashed run from a completed-but-failed benchmark (1).
main().catch((err: unknown) => {
  console.error('Benchmark failed:', err)
  process.exit(2)
})