Files
portfolio/scripts/benchmark.ts
T
admin d2efc7030a feat: US-019 - Run benchmark and validate accuracy
Benchmark passes 19/20 (threshold 18/20) with no zeros.
Structural improvements: Employment Timeline section, leadership
labels on Tesco bullets, GPhC clarification, prompt trimming.
Fixed Q10 expected answer to match actual CV data.
2026-02-16 00:59:37 +00:00

455 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { readFileSync, writeFileSync, readdirSync, mkdirSync, existsSync } from 'node:fs'
import { resolve } from 'node:path'
// Load .env file manually (avoid adding dotenv dependency)
/**
 * Populates `process.env` from the `.env` file one directory above this script.
 *
 * Parsing follows common dotenv conventions:
 * - blank lines and lines starting with `#` are skipped;
 * - lines without `=` (or with an empty key) are skipped;
 * - an optional leading `export ` is ignored;
 * - whitespace around the key and value is trimmed;
 * - one pair of matching single or double quotes around the value is removed,
 *   so `KEY="abc"` yields `abc` rather than `"abc"`.
 *
 * Variables that already hold a non-empty value in the environment are never
 * overwritten. Does nothing when the file does not exist.
 */
function loadEnvFile(): void {
  const envPath = resolve(import.meta.dirname, '..', '.env')
  if (!existsSync(envPath)) return

  const content = readFileSync(envPath, 'utf-8')
  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.trim()
    if (!line || line.startsWith('#')) continue

    const eqIndex = line.indexOf('=')
    if (eqIndex === -1) continue

    // Tolerate `export KEY=value` and `KEY = value` spellings.
    const key = line.slice(0, eqIndex).replace(/^export\s+/, '').trim()
    if (!key) continue

    let value = line.slice(eqIndex + 1).trim()
    const quote = value[0]
    // Strip one pair of matching quotes; otherwise the quotes would become
    // part of the value (e.g. inside the Authorization header).
    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1)
    }

    // Real environment variables win over the file.
    if (!process.env[key]) {
      process.env[key] = value
    }
  }
}
loadEnvFile()
// --- Types ---
/** One benchmark question as listed under `questions` in the benchmark config. */
interface BenchmarkQuestion {
// Short identifier shown in the console log and the summary table.
id: string
// Text sent to the model as the user message.
question: string
// Reference answer handed to the scoring call for comparison.
expectedAnswer: string
// Facts rendered as a bullet list in the scoring prompt ("Key facts for score 2").
keyFacts: string[]
}
/** Shape that main() assigns to the parsed contents of benchmark-config.json. */
interface BenchmarkConfig {
// Minimum total score required for the run to pass (a zero on any question also fails it).
passThreshold: number
// Reported as `maxPossibleScore` in the results; not recomputed from the question list.
maxScore: number
// Questions asked one after another, in this order.
questions: BenchmarkQuestion[]
}
/** Verdict returned by scoreAnswer() for a single answer. */
interface ScoringResult {
// Rubric used in the scoring prompt: 2 = accurate, 1 = partial, 0 = incorrect.
score: 0 | 1 | 2
// Short explanation of the score (the scoring prompt asks for under 30 words).
justification: string
}
/** Per-question record collected by main() and stored in the results file. */
interface QuestionResult {
// Copied from the corresponding BenchmarkQuestion.
id: string
question: string
expectedAnswer: string
// Text the model returned for the question.
actualAnswer: string
// Score and justification as returned by scoreAnswer().
score: number
justification: string
}
/** Full outcome of one benchmark run; serialised to iteration-<n>.json and printed by printSummary(). */
interface BenchmarkResults {
// Run number: one more than the highest existing iteration-<n>.json, or 0 for the first run.
iteration: number
// ISO-8601 time at which the results object was assembled.
timestamp: string
// Model identifier used for the run.
model: string
// Sum of all per-question scores.
totalScore: number
// Taken from `maxScore` in the config.
maxPossibleScore: number
// Taken from `passThreshold` in the config.
passThreshold: number
// True when totalScore reaches passThreshold and no question scored 0.
passed: boolean
// True when at least one question scored 0.
hasZeros: boolean
// One entry per question, in the order they were asked.
results: QuestionResult[]
}
// --- OpenRouter API ---
// Model identifier sent as the `model` field of every request made by callLLM()
// (answering and scoring alike) and recorded in the saved results.
const LLM_MODEL = 'z-ai/glm-5'
// Chat completions endpoint that callLLM() POSTs to.
const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1/chat/completions'
/**
 * Reads the OpenRouter API key from the `VITE_OPEN_ROUTER_API_KEY`
 * environment variable.
 *
 * @returns The configured key.
 * @throws Error when the variable is unset or empty.
 */
function getApiKey(): string {
  const { VITE_OPEN_ROUTER_API_KEY: apiKey } = process.env
  if (apiKey) return apiKey
  throw new Error('VITE_OPEN_ROUTER_API_KEY not set. Ensure .env file exists with this key.')
}
/**
 * Returns the system prompt used when asking the benchmark questions.
 *
 * Mirrors buildSystemPrompt() from src/lib/llm.ts — kept in sync manually
 * because llm.ts uses import.meta.env (Vite) and window.location (browser).
 *
 * Every date range in the prompt is written with an en dash ("May 2022–present",
 * "2011–2015"). Without the separator the start and end dates fuse into a single
 * token ("May 2022present", "20112015"), which is ambiguous for the model.
 *
 * NOTE(review): the en dash is assumed to be the separator used in llm.ts —
 * confirm when syncing. The "78 hours ongoing weekly gains" figures on the
 * Blueteq lines may also have lost a range separator ("7–8"?); they are left
 * unchanged here — verify against the CV data.
 *
 * @returns The complete prompt text; it takes no inputs and is identical on every call.
 */
function buildSystemPrompt(): string {
  return `You are a helpful assistant on Andy Charlwood's portfolio website. Answer questions about Andy's professional background using ONLY the information below.
## Profile
Andy Charlwood — MPharm, GPhC Registered Pharmacist. Norwich, UK.
Healthcare leader combining clinical pharmacy with Python, SQL, and data analytics (self-taught). Leading population health analytics for NHS Norfolk & Waveney ICB, serving 1.2M people. Specialises in prescribing data at scale — financial modelling, algorithm design, pathway development. Identified efficiency programmes worth £14.6M+ through automated analysis.
## Employment Timeline (IMPORTANT)
- **NHS employment**: May 2022–present (all roles at NHS Norfolk & Waveney ICB). Total NHS service: ~4 years.
- **Private sector**: Nov 2017–May 2022 at Tesco PLC (community pharmacy). This was NOT NHS employment.
- GPhC registration (Aug 2016) is a professional licence, NOT an employer or NHS role.
## Career History
### [exp-interim-head-2025] Interim Head, Population Health & Data Analysis
NHS Norfolk & Waveney ICB | May–Nov 2025
Led population health initiatives and data-driven medicines optimisation, reporting to Associate Director of Pharmacy with accountability to CMO.
- Identified £14.6M efficiency programme; achieved over-target performance by October 2025
- Built Python switching algorithm: real-world GP prescribing data, 14,000 patients, £2.6M annual savings (£2M on target), compressed months into 3 days
- Novel GP payment system linking rewards to savings; 50% prescribing reduction within 2 months
- Presented to CMO bimonthly; led transformation to patient-level SQL analytics
### [exp-deputy-head-2024] Deputy Head, Population Health & Data Analysis
NHS Norfolk & Waveney ICB | Jul 2024–Present (substantive role)
Data analytics strategy for medicines optimisation from real-world GP prescribing data.
- Managed £220M prescribing budget with forecasting models for proactive financial planning
- Created comprehensive dm+d medicines data table: standardised strengths, morphine equivalents, Anticholinergic Burden scoring — single source of truth for all medicines analytics
- Led DOAC switching financial modelling: interactive dashboard with rebate mechanics, patent expiry timelines
- Renegotiated pharmaceutical rebate terms ahead of patent expiry
- Tirzepatide commissioning (NICE TA1026): financial projections, cohort identification; authored executive paper advocating primary care model, driving system shift to GP-led delivery
- Built Python controlled drug monitoring: oral morphine equivalents across all opioid prescriptions, patient-level tracking, high-risk identification, diversion detection
- Improved team data fluency through training and self-serve tools
### [exp-high-cost-drugs-2022] High-Cost Drugs & Interface Pharmacist
NHS Norfolk & Waveney ICB | May 2022–Jul 2024
Led NICE TA implementation and high-cost drug pathways across the ICS. Pathways spanning: rheumatology, ophthalmology (wet AMD, DMO, RVO), dermatology, gastroenterology, neurology, migraine.
- Blueteq automation: 70% form reduction, 200 hours immediate savings, 78 hours ongoing weekly gains
- Integrated Blueteq with secondary care databases for accurate high-cost drug spend tracking
- Python Sankey chart tool for patient pathway visualisation and trust compliance auditing
### [exp-pharmacy-manager-2017] Pharmacy Manager
Tesco PLC (private sector, NOT NHS) | Nov 2017–May 2022
Community pharmacy with full operational autonomy (100-hour contract). LPC representative for Norfolk.
- Asthma screening process adopted nationally (~300 branches): reduced pharmacist time 60→6 hours/store/month, ~£1M revenue
- Leadership training: Created national induction training plan and eLearning modules for Tesco pharmacy staff
- Leadership development: Supervised two staff through NVQ3 to pharmacy technician registration; full HR responsibilities
## Projects
### [proj-inv-pharmetrics] PharMetrics Interactive Platform (2024, Live)
Real-time medicines expenditure dashboard for NHS decision-makers. Tech: Power BI, SQL, DAX. Tracks £220M prescribing budget.
### [proj-inv-switching-algorithm] Patient Switching Algorithm (2025, Complete)
Python algorithm using GP prescribing data to auto-identify patients for cost-effective alternatives. Tech: Python, Pandas, SQL. 14,000 patients, £2.6M annual savings, novel GP payment system.
### [proj-inv-blueteq-gen] Blueteq Generator (2023, Complete)
Automated Blueteq prior approval form creation. Tech: Python, SQL. 70% form reduction, 200 hours immediate savings, 78 hours ongoing weekly gains.
### [proj-inv-cd-monitoring] CD Monitoring System (2024, Complete)
Controlled drug monitoring calculating oral morphine equivalents (OME) across all opioid prescriptions. Tech: Python, SQL. Patient-level tracking, high-risk identification, diversion detection.
### [proj-inv-sankey-tool] Sankey Chart Analysis Tool (2023, Complete)
Patient journey visualisation through high-cost drug pathways. Tech: Python, Matplotlib, SQL. Trust compliance auditing.
## Education
### [edu-0] NHS Mary Seacole Programme (2018)
NHS Leadership Academy. Score: 78%. Covers change management, healthcare leadership, system-level thinking.
### [edu-1] MPharm (Hons) 2:1 — University of East Anglia (2011–2015)
4-year integrated Master's degree. Research project on drug delivery and cocrystals: 75.1% (Distinction).
### [edu-2] A-Levels — Highworth Grammar School (2009–2011)
Mathematics A*, Chemistry B, Politics C.
### [edu-3] GPhC Registration — General Pharmaceutical Council (August 2016–Present)
Professional registration required to practise as a pharmacist in Great Britain.
## Skills
Technical: [skill-data-analysis] Data Analysis (9yr, 95%), [skill-python] Python (6yr, 90%), [skill-sql] SQL (7yr, 88%), [skill-power-bi] Power BI (5yr, 92%), [skill-javascript-typescript] JavaScript/TypeScript (3yr, 70%), [skill-excel] Excel (9yr, 85%), [skill-algorithm-design] Algorithm Design (3yr, 82%), [skill-data-pipelines] Data Pipelines (2yr, 75%)
Domain: [skill-medicines-optimisation] Medicines Optimisation (9yr, 95%), [skill-population-health] Population Health (3yr, 90%), [skill-nice-ta] NICE TA Implementation (3yr, 92%), [skill-health-economics] Health Economics (3yr, 80%), [skill-clinical-pathways] Clinical Pathways (3yr, 88%), [skill-controlled-drugs] Controlled Drugs (1yr, 85%)
Leadership: [skill-budget-management] Budget Management (1yr, 90%), [skill-stakeholder-engagement] Stakeholder Engagement (3yr, 88%), [skill-pharma-negotiation] Pharmaceutical Negotiation (1yr, 82%), [skill-team-development] Team Development (8yr, 85%), [skill-change-management] Change Management (7yr, 80%), [skill-financial-modelling] Financial Modelling (1yr, 78%), [skill-executive-comms] Executive Communication (1yr, 85%)
## Response Rules
1. Answer ONLY from the data above. If the answer is not in the data, say "I don't have that information" — never invent facts, roles, dates, achievements, URLs, or contact details.
2. Distinguish NHS employment (May 2022–present, ~4 years, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy). Never conflate the two. GPhC registration is a professional licence, not NHS employment.
3. When asked broad questions about tools, skills, projects, or achievements across Andy's career, aggregate from ALL roles — do not limit your answer to one position.
4. Cite exact numbers, dates, percentages, and outcomes. Never say "approximately" or "around" when exact figures exist in the data.
5. For detailed or list-based questions, give a thorough answer covering all relevant items. For simple questions, be concise (2-4 sentences).
## Item References
End your response with a single line listing relevant item IDs from the square-bracketed IDs above:
[ITEMS: exp-deputy-head-2024, skill-python]
Only include IDs that directly support your answer. Omit the line if none are relevant.`
}
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds
 * have elapsed on a timer.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms)
  })
}
/**
 * Sends one system + user message pair to the OpenRouter chat completions
 * endpoint and returns the text of the first choice.
 *
 * HTTP 429 and 503 responses are retried, up to 5 attempts in total. The wait
 * before the next attempt is taken from, in order of preference:
 *   1. a "retry in <n>s" hint in the response body,
 *   2. a numeric `Retry-After` response header,
 *   3. a linear back-off of 15s × attempt number.
 * Hinted waits are rounded up and padded by 2 seconds. No wait is performed
 * after the final attempt — the function throws straight away.
 *
 * @param systemPrompt Content of the `system` message.
 * @param userMessage  Content of the `user` message.
 * @param temperature  Sampling temperature forwarded to the API.
 * @param maxTokens    Forwarded as `max_tokens`.
 * @returns The non-empty `choices[0].message.content` string.
 * @throws Error when the API key is missing, on any non-retryable HTTP error,
 *         when the retry budget is exhausted, or when the response carries no
 *         text content.
 */
async function callLLM(
  systemPrompt: string,
  userMessage: string,
  temperature = 0.4,
  maxTokens = 800,
): Promise<string> {
  const apiKey = getApiKey()
  const maxRetries = 5

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const response = await fetch(OPENROUTER_API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
        'HTTP-Referer': 'https://andycharlwood.co.uk',
        'X-Title': 'Andy Charlwood Portfolio',
      },
      body: JSON.stringify({
        model: LLM_MODEL,
        temperature,
        max_tokens: maxTokens,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userMessage },
        ],
      }),
    })

    if (response.status === 429 || response.status === 503) {
      const errorBody = await response.text()
      const reason = response.status === 429 ? 'Rate limited' : 'Service unavailable'

      // Out of attempts: fail now instead of sleeping for nothing, and keep
      // the status and body so the failure can be diagnosed.
      if (attempt === maxRetries - 1) {
        throw new Error(`Max retries exceeded (${reason}, HTTP ${response.status}): ${errorBody}`)
      }

      const bodyHint = errorBody.match(/retry in ([\d.]+)s/)?.[1]
      const headerHint = response.headers.get('retry-after')
      // Retry-After may also be an HTTP date; that form parses to NaN here and
      // falls through to the linear back-off.
      const hintedSeconds = Number.parseFloat(bodyHint ?? headerHint ?? '')
      const waitSeconds =
        Number.isFinite(hintedSeconds) && hintedSeconds >= 0
          ? Math.ceil(hintedSeconds) + 2
          : (attempt + 1) * 15

      console.log(` ${reason}. Waiting ${waitSeconds}s (attempt ${attempt + 1}/${maxRetries})...`)
      await sleep(waitSeconds * 1000)
      continue
    }

    if (!response.ok) {
      const errorBody = await response.text()
      throw new Error(`OpenRouter API error ${response.status}: ${errorBody}`)
    }

    const data = (await response.json()) as
      | { choices?: Array<{ message?: { content?: unknown } }> }
      | null
    const text = data?.choices?.[0]?.message?.content
    // Reject missing, empty and non-string content so callers really get a string.
    if (typeof text !== 'string' || text.length === 0) {
      throw new Error(`No text in OpenRouter response: ${JSON.stringify(data)}`)
    }
    return text
  }

  // Only reachable if the loop body never ran.
  throw new Error('Max retries exceeded for rate limiting')
}
// --- Scoring ---
/**
 * Returns the index of the `}` that closes the `{` at `start`, or -1 when the
 * braces never balance. Braces inside double-quoted strings are ignored, and
 * backslash escapes are only honoured inside strings.
 */
function findBalancedObjectEnd(text: string, start: number): number {
  let depth = 0
  let inString = false
  let escaped = false
  for (let i = start; i < text.length; i++) {
    const ch = text[i]
    if (inString) {
      if (escaped) escaped = false
      else if (ch === '\\') escaped = true
      else if (ch === '"') inString = false
      continue
    }
    if (ch === '"') inString = true
    else if (ch === '{') depth++
    else if (ch === '}' && --depth === 0) return i
  }
  return -1
}

/**
 * Pulls a JSON payload out of a model response that may wrap it in markdown
 * code fences or surround it with prose.
 *
 * Candidates are tried in this order, and the first one that parses wins:
 *   1. the whole text,
 *   2. the contents of the first ``` fenced block,
 *   3. each balanced `{ ... }` block, scanning left to right.
 *
 * When no candidate parses, the first candidate found (fenced block, else the
 * first balanced brace block) is returned unvalidated so the caller can report
 * what was received.
 *
 * @param text Raw model output.
 * @returns The extracted text, or `null` when the input contains neither a
 *          fenced block nor a balanced `{ ... }` block.
 */
function extractJson(text: string): string | null {
  const parses = (candidate: string): boolean => {
    try {
      JSON.parse(candidate)
      return true
    } catch {
      return false
    }
  }

  // Try parsing directly first
  if (parses(text)) return text

  let fallback: string | null = null

  // Strip markdown code fences
  const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/)?.[1]?.trim()
  if (fenced !== undefined) {
    if (parses(fenced)) return fenced
    fallback = fenced
  }

  // Walk every `{`: prose such as "use {curly} braces" may precede the real object.
  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
    const end = findBalancedObjectEnd(text, start)
    if (end === -1) continue
    const block = text.slice(start, end + 1)
    if (parses(block)) return block
    if (fallback === null) fallback = block
  }

  return fallback
}
/**
 * Asks the model to grade `actualAnswer` against `expectedAnswer` on a 0–2
 * rubric and returns the validated verdict.
 *
 * The grader is called with temperature 0 and a 512-token limit. Its reply is
 * passed through extractJson() and then checked field by field: the payload
 * must be a JSON object whose `score` is exactly 0, 1 or 2. A missing or
 * non-string `justification` is replaced by a placeholder so downstream
 * formatting never receives `undefined`.
 *
 * Any reply that cannot be extracted, parsed or validated yields a score of 0
 * with a justification describing the failure; a warning is logged as well.
 *
 * @param question       The benchmark question.
 * @param expectedAnswer Reference answer.
 * @param keyFacts       Facts listed in the rubric as required for a score of 2.
 * @param actualAnswer   The answer being graded.
 * @returns The verdict; this function does not throw on malformed grader
 *          output (errors from callLLM itself still propagate).
 */
async function scoreAnswer(
  question: string,
  expectedAnswer: string,
  keyFacts: string[],
  actualAnswer: string,
): Promise<ScoringResult> {
  const scoringPrompt = `You are a strict evaluator. Compare an ACTUAL answer to an EXPECTED answer about a person's CV.
Rubric:
- 2 = ACCURATE: Covers key facts correctly. Minor omissions OK if no errors.
- 1 = PARTIAL: Some key facts right but misses important details or is vague.
- 0 = INCORRECT: Contains factual errors, contradicts expected answer, or misses the point.
Key facts for score 2:
${keyFacts.map((f) => `- ${f}`).join('\n')}
IMPORTANT: Respond with ONLY a single-line JSON object. No markdown, no code fences, no extra text.
Example: {"score":2,"justification":"Covers all key facts accurately"}
Keep justification under 30 words.`
  const userMessage = `QUESTION: ${question}
EXPECTED ANSWER: ${expectedAnswer}
ACTUAL ANSWER: ${actualAnswer}`

  const rawResponse = await callLLM(scoringPrompt, userMessage, 0, 512)

  // Extract JSON — handle code fences, preamble text, multiline responses
  const extracted = extractJson(rawResponse)
  if (!extracted) {
    console.warn(` Warning: Could not extract JSON from scoring response: ${rawResponse.slice(0, 200)}`)
    return { score: 0, justification: `Failed to parse scoring response` }
  }

  let parsed: unknown
  try {
    parsed = JSON.parse(extracted)
  } catch {
    console.warn(` Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
    return { score: 0, justification: `Invalid JSON in response` }
  }

  // `null`, numbers, strings etc. are valid JSON but not a verdict object.
  if (typeof parsed !== 'object' || parsed === null) {
    console.warn(` Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
    return { score: 0, justification: `Invalid JSON in response` }
  }

  const candidate = parsed as { score?: unknown; justification?: unknown }
  const allowedScores: ReadonlyArray<ScoringResult['score']> = [0, 1, 2]
  const score = allowedScores.find((allowed) => allowed === candidate.score)
  if (score === undefined) {
    console.warn(` Warning: Invalid score value: ${String(candidate.score)}`)
    return { score: 0, justification: `Invalid score value: ${String(candidate.score)}` }
  }

  // Return a normalised object rather than the raw payload so extra or
  // wrongly-typed fields never leak into the results.
  const justification =
    typeof candidate.justification === 'string'
      ? candidate.justification
      : 'No justification provided'
  return { score, justification }
}
// --- Iteration Management ---
/**
 * Works out the number to use for the next results file.
 *
 * Looks at entries in `resultsDir` named `iteration-*.json` and returns one
 * more than the largest number found. Returns 0 when the directory does not
 * exist or holds no such entries. A matching entry without a numeric part
 * counts as -1, so on its own it also leads to 0.
 */
function getNextIteration(resultsDir: string): number {
  if (!existsSync(resultsDir)) return 0

  let highest = Number.NEGATIVE_INFINITY
  let sawCandidate = false
  for (const entry of readdirSync(resultsDir)) {
    if (!(entry.startsWith('iteration-') && entry.endsWith('.json'))) continue
    sawCandidate = true
    const numbered = /iteration-(\d+)\.json/.exec(entry)
    const value = numbered ? parseInt(numbered[1], 10) : -1
    if (value > highest) highest = value
  }

  return sawCandidate ? highest + 1 : 0
}
// --- Console Output ---
/**
 * Writes a fixed-width summary of a benchmark run to the console: a banner
 * with iteration, model and timestamp; one row per question (id, score label,
 * question, justification); and a totals line with the pass/fail verdict.
 *
 * Questions longer than 47 characters and justifications longer than 60 are
 * shortened and suffixed with "...". Any score other than 2 or 1 is shown with
 * the "0" label.
 */
function printSummary(results: BenchmarkResults): void {
  const heavyRule = '='.repeat(80)
  const lightRule = '-'.repeat(80)

  // Shorten `text` to `limit` characters in total, ellipsis included.
  const clip = (text: string, limit: number): string =>
    text.length > limit ? `${text.slice(0, limit - 3)}...` : text

  // Lay out one table line using the shared column widths.
  const formatRow = (id: string, score: string, question: string, justification: string): string =>
    id.padEnd(6) + score.padEnd(8) + question.padEnd(50) + justification

  const labelFor = (score: number): string => {
    switch (score) {
      case 2:
        return '2 ✓'
      case 1:
        return '1 ~'
      default:
        return '0 ✗'
    }
  }

  console.log(`\n${heavyRule}`)
  console.log(`BENCHMARK RESULTS — Iteration ${results.iteration}`)
  console.log(`Model: ${results.model} | ${results.timestamp}`)
  console.log(heavyRule)

  console.log(formatRow('ID', 'Score', 'Question', 'Justification'))
  console.log(lightRule)

  results.results.forEach((entry) => {
    console.log(
      formatRow(
        entry.id,
        labelFor(entry.score),
        clip(entry.question, 47),
        clip(entry.justification, 60),
      ),
    )
  })

  console.log(lightRule)
  const totals = [
    `TOTAL: ${results.totalScore}/${results.maxPossibleScore}`,
    `Threshold: ${results.passThreshold}/${results.maxPossibleScore}`,
    `Has zeros: ${results.hasZeros ? 'YES' : 'No'}`,
    results.passed ? 'PASSED ✓' : 'FAILED ✗',
  ]
  console.log(totals.join(' | '))
  console.log(heavyRule)
}
// --- Main ---
/**
 * Runs the whole benchmark: loads benchmark-config.json from the script's
 * directory, asks each question, scores each answer, writes the outcome to
 * benchmark-results/iteration-<n>.json, prints the summary table and exits.
 *
 * Exit codes: 0 when the run passed, 1 when it did not. A run passes when the
 * total score reaches the configured threshold and no question scored 0.
 */
async function main() {
  const baseDir = import.meta.dirname
  const configFile = resolve(baseDir, 'benchmark-config.json')
  const outputDir = resolve(baseDir, 'benchmark-results')

  // Configuration
  const rawConfig = readFileSync(configFile, 'utf-8')
  const config: BenchmarkConfig = JSON.parse(rawConfig)
  console.log(`Loaded ${config.questions.length} benchmark questions.`)

  // Which run is this?
  const iteration = getNextIteration(outputDir)
  console.log(`Running iteration ${iteration}...`)

  // Prompt shared by every question
  const systemPrompt = buildSystemPrompt()
  console.log(`System prompt built (${systemPrompt.length} chars).`)

  // Ask and grade the questions strictly one at a time, tallying as we go
  const questionResults: QuestionResult[] = []
  let totalScore = 0
  let hasZeros = false
  for (const { id, question, expectedAnswer, keyFacts } of config.questions) {
    console.log(`\n[${id}] ${question}`)

    console.log(' Getting answer...')
    const actualAnswer = await callLLM(systemPrompt, question)
    console.log(` Answer: ${actualAnswer.slice(0, 100)}...`)

    console.log(' Scoring...')
    const verdict = await scoreAnswer(question, expectedAnswer, keyFacts, actualAnswer)
    console.log(` Score: ${verdict.score}/2 — ${verdict.justification}`)

    questionResults.push({
      id,
      question,
      expectedAnswer,
      actualAnswer,
      score: verdict.score,
      justification: verdict.justification,
    })
    totalScore += verdict.score
    if (verdict.score === 0) hasZeros = true
  }

  // A single zero fails the run even when the total clears the threshold
  const passed = totalScore >= config.passThreshold && !hasZeros

  const results: BenchmarkResults = {
    iteration,
    timestamp: new Date().toISOString(),
    model: LLM_MODEL,
    totalScore,
    maxPossibleScore: config.maxScore,
    passThreshold: config.passThreshold,
    passed,
    hasZeros,
    results: questionResults,
  }

  // Persist before printing
  mkdirSync(outputDir, { recursive: true })
  const outputFile = resolve(outputDir, `iteration-${iteration}.json`)
  writeFileSync(outputFile, JSON.stringify(results, null, 2))
  console.log(`\nResults saved to ${outputFile}`)

  printSummary(results)

  process.exit(passed ? 0 : 1)
}

// Entry point: any uncaught failure is reported and mapped to exit code 2.
main().catch((failure) => {
  console.error('Benchmark failed:', failure)
  process.exit(2)
})