Files
portfolio/scripts/benchmark.ts
T

453 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { readFileSync, writeFileSync, readdirSync, mkdirSync, existsSync } from 'node:fs'
import { resolve } from 'node:path'
/**
 * Parses one line of a `.env` file.
 *
 * @returns `[key, value]`, or `null` for blank lines, `#` comments, lines
 *          without `=`, and lines whose key is empty.
 */
function parseEnvLine(line: string): [string, string] | null {
  const trimmed = line.trim()
  if (!trimmed || trimmed.startsWith('#')) return null
  const eqIndex = trimmed.indexOf('=')
  if (eqIndex === -1) return null
  // Accept the shell-style `export KEY=value` form and spaces around `=`.
  const key = trimmed.slice(0, eqIndex).replace(/^export\s+/, '').trim()
  if (!key) return null
  let value = trimmed.slice(eqIndex + 1).trim()
  // Strip one pair of matching surrounding quotes so KEY="abc" yields abc,
  // not a value with literal quote characters in it.
  const quote = value[0]
  if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
    value = value.slice(1, -1)
  }
  return [key, value]
}

/**
 * Loads a `.env` file into `process.env` by hand (avoids adding a dotenv dependency).
 *
 * Variables that already have a non-empty value in `process.env` are left
 * untouched. A missing file is not an error.
 *
 * @param envPath File to read. Defaults to `.env` in the parent of this script's directory.
 */
function loadEnvFile(envPath: string = resolve(import.meta.dirname, '..', '.env')): void {
  if (!existsSync(envPath)) return
  const content = readFileSync(envPath, 'utf-8')
  // Splitting on '\n' is enough: parseEnvLine trims, which drops a trailing '\r'.
  for (const line of content.split('\n')) {
    const entry = parseEnvLine(line)
    if (entry === null) continue
    const [key, value] = entry
    if (!process.env[key]) {
      process.env[key] = value
    }
  }
}
// Run at module load so values from .env are in process.env before getApiKey() reads them.
loadEnvFile()
// --- Types ---
/** One benchmark item, taken from the `questions` array of the benchmark config. */
interface BenchmarkQuestion {
/** Identifier shown in the per-question log line and the summary table's ID column. */
id: string
/** Sent to the model as the user message. */
question: string
/** Reference answer passed to the scoring model. */
expectedAnswer: string
/** Listed in the scoring prompt as the facts required for a score of 2. */
keyFacts: string[]
}
/** Shape that main() assumes for benchmark-config.json (the parsed JSON is not validated). */
interface BenchmarkConfig {
/** Minimum total score required to pass; a run with any zero-scored question fails regardless. */
passThreshold: number
/** Copied into the results as `maxPossibleScore` and printed as the denominator; not recomputed from `questions`. */
maxScore: number
/** Questions to run, in order. */
questions: BenchmarkQuestion[]
}
/** Verdict returned by scoreAnswer() for a single answer. */
interface ScoringResult {
/** Rubric score: 2 = accurate, 1 = partial, 0 = incorrect. 0 is also used when the scorer's reply cannot be parsed. */
score: 0 | 1 | 2
/** Short explanation from the scoring model, or a description of the parse failure. */
justification: string
}
/** Outcome of one benchmark question, as stored in the results file. */
interface QuestionResult {
  /** Question identifier copied from the config. */
  id: string
  /** Question text that was sent to the model. */
  question: string
  /** Reference answer from the config. */
  expectedAnswer: string
  /** Raw answer text returned by the model. */
  actualAnswer: string
  /** Rubric score; same 0/1/2 domain as `ScoringResult.score` so out-of-range values fail to compile. */
  score: 0 | 1 | 2
  /** Scorer's explanation for the score. */
  justification: string
}
/** Full record of one benchmark run; written to `iteration-<n>.json` and printed by printSummary(). */
interface BenchmarkResults {
/** Run number from getNextIteration(); also used in the results file name. */
iteration: number
/** ISO-8601 timestamp taken when the results object is assembled (after all questions ran). */
timestamp: string
/** Model identifier used for the run (the value of LLM_MODEL). */
model: string
/** Sum of all per-question scores. */
totalScore: number
/** Taken from the config's `maxScore`. */
maxPossibleScore: number
/** Taken from the config's `passThreshold`. */
passThreshold: number
/** True when `totalScore >= passThreshold` and no question scored 0. */
passed: boolean
/** True when at least one question scored 0. */
hasZeros: boolean
/** Per-question outcomes, in the order the questions were run. */
results: QuestionResult[]
}
// --- OpenRouter API ---
// Model identifier sent as `model` in every request (answering and scoring) and recorded in the results file.
const LLM_MODEL = 'z-ai/glm-5'
// Chat-completions endpoint that callLLM() POSTs to.
const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1/chat/completions'
/**
 * Reads the OpenRouter API key from the environment.
 *
 * @returns The value of `VITE_OPEN_ROUTER_API_KEY`.
 * @throws Error when the variable is unset or empty.
 */
function getApiKey(): string {
  const { VITE_OPEN_ROUTER_API_KEY: apiKey } = process.env
  if (apiKey) return apiKey
  throw new Error('VITE_OPEN_ROUTER_API_KEY not set. Ensure .env file exists with this key.')
}
/**
 * Builds the system prompt used when asking the benchmark questions.
 *
 * Mirrors buildSystemPrompt() from src/lib/llm.ts — kept in sync manually
 * because llm.ts uses import.meta.env (Vite) and window.location (browser).
 *
 * Date ranges use an en dash ("May–Nov 2025", "2011–2015"). Without the
 * separator the ranges fuse into strings such as "MayNov 2025" or "20112015",
 * which the model cannot reliably read as a start and an end date.
 *
 * @returns The complete prompt text (profile data, response rules, item-reference format).
 */
function buildSystemPrompt(): string {
  return `You are a helpful assistant on Andy Charlwood's portfolio website. Answer questions about Andy's professional background using ONLY the information below.
## Profile
Andy Charlwood — MPharm, GPhC Registered Pharmacist. Norwich, UK.
Healthcare leader combining clinical pharmacy with Python, SQL, and data analytics (self-taught). Leading population health analytics for NHS Norfolk & Waveney ICB, serving 1.2 million people. Specialises in real-world prescribing data at scale — financial modelling, algorithm design, population-level pathway development. Identified and prioritised efficiency programmes worth £14.6M+ through automated analysis.
## Career History
### [exp-interim-head-2025] Interim Head, Population Health & Data Analysis
NHS Norfolk & Waveney ICB | May–Nov 2025
Led strategic delivery of population health initiatives and data-driven medicines optimisation, reporting to Associate Director of Pharmacy with accountability to Chief Medical Officer.
- Identified £14.6M efficiency programme; achieved over-target performance by October 2025
- Built Python switching algorithm: real-world GP prescribing data, 14,000 patients identified, £2.6M annual savings (£2M on target), compressed months of analysis into 3 days
- Automated incentive scheme with novel GP payment system linking rewards to savings; 50% prescribing reduction within 2 months
- Presented to CMO bimonthly with evidence-based recommendations
- Led transformation to patient-level SQL analytics and self-serve model
### [exp-deputy-head-2024] Deputy Head, Population Health & Data Analysis
NHS Norfolk & Waveney ICB | Jul 2024–Present (substantive role)
Driving data analytics strategy for medicines optimisation from messy, real-world GP prescribing data.
- Managed £220M prescribing budget with forecasting models for proactive financial planning
- Created comprehensive dm+d medicines data table: standardised strengths, morphine equivalents, Anticholinergic Burden scoring — single source of truth for all medicines analytics
- Led DOAC switching programme financial modelling: interactive dashboard with rebate mechanics, workforce constraints, patent expiry timelines
- Renegotiated pharmaceutical rebate terms ahead of patent expiry
- Supported tirzepatide commissioning (NICE TA1026): financial projections, eligible cohort identification; authored executive paper advocating primary care model, driving system shift to GP-led delivery
- Built Python controlled drug monitoring system: oral morphine equivalents across all opioid prescriptions, patient-level exposure tracking, high-risk identification, diversion detection at population scale
- Improved team data fluency through training, documentation, and self-serve tools
### [exp-high-cost-drugs-2022] High-Cost Drugs & Interface Pharmacist
NHS Norfolk & Waveney ICB | May 2022–Jul 2024
Led NICE TA implementation and high-cost drug pathways across the ICS. Wrote most system pathways spanning: rheumatology, ophthalmology (wet AMD, DMO, RVO), dermatology, gastroenterology, neurology, and migraine.
- Blueteq automation: 70% form reduction, 200 hours immediate savings, 78 hours ongoing weekly gains
- Integrated Blueteq with secondary care databases for accurate high-cost drug spend tracking
- Python Sankey chart tool for patient pathway visualisation and trust compliance auditing
### [exp-pharmacy-manager-2017] Pharmacy Manager
Tesco PLC (private sector, NOT NHS) | Nov 2017–May 2022
Community pharmacy with full operational autonomy (100-hour contract). LPC representative for Norfolk.
- Asthma screening process adopted nationally (~300 branches): reduced pharmacist time 60→6 hours/store/month, ~£1M revenue
- Created national induction training plan and eLearning modules
- Supervised two staff through NVQ3 to pharmacy technician registration; full HR responsibilities
## Projects
### [proj-inv-pharmetrics] PharMetrics Interactive Platform (2024, Live)
Real-time medicines expenditure dashboard for NHS decision-makers. Tech: Power BI, SQL, DAX. Tracks the £220M prescribing budget with self-serve analytics.
### [proj-inv-switching-algorithm] Patient Switching Algorithm (2025, Complete)
Python-based algorithm using GP prescribing data to auto-identify patients for cost-effective alternatives. Tech: Python, Pandas, SQL. Identified 14,000 patients, £2.6M annual savings, novel GP payment system linking rewards to savings.
### [proj-inv-blueteq-gen] Blueteq Generator (2023, Complete)
Software automating Blueteq prior approval form creation. Tech: Python, SQL. 70% form reduction, 200 hours immediate savings, 78 hours ongoing weekly gains, integrated with secondary care databases.
### [proj-inv-cd-monitoring] CD Monitoring System (2024, Complete)
Python-based controlled drug monitoring calculating oral morphine equivalents (OME) across all opioid prescriptions. Tech: Python, SQL. Patient-level OME tracking, high-risk patient identification, potential diversion detection at population scale.
### [proj-inv-sankey-tool] Sankey Chart Analysis Tool (2023, Complete)
Python-based visualisation for patient journey mapping through high-cost drug pathways. Tech: Python, Matplotlib, SQL. Trust-level compliance auditing, multi-specialty pathway coverage.
## Education
### [edu-0] NHS Mary Seacole Programme (2018)
NHS Leadership Academy. Score: 78%. Covers change management, healthcare leadership, system-level thinking.
### [edu-1] MPharm (Hons) 2:1 — University of East Anglia (2011–2015)
4-year integrated Master's degree. Research project on drug delivery and cocrystals: 75.1% (Distinction).
### [edu-2] A-Levels — Highworth Grammar School (2009–2011)
Mathematics A*, Chemistry B, Politics C.
### [edu-3] GPhC Registration — General Pharmaceutical Council (August 2016–Present)
Professional registration required to practise as a pharmacist in Great Britain.
## Skills
Technical: [skill-data-analysis] Data Analysis (9yr, 95%), [skill-python] Python (6yr, 90%), [skill-sql] SQL (7yr, 88%), [skill-power-bi] Power BI (5yr, 92%), [skill-javascript-typescript] JavaScript/TypeScript (3yr, 70%), [skill-excel] Excel (9yr, 85%), [skill-algorithm-design] Algorithm Design (3yr, 82%), [skill-data-pipelines] Data Pipelines (2yr, 75%)
Domain: [skill-medicines-optimisation] Medicines Optimisation (9yr, 95%), [skill-population-health] Population Health (3yr, 90%), [skill-nice-ta] NICE TA Implementation (3yr, 92%), [skill-health-economics] Health Economics (3yr, 80%), [skill-clinical-pathways] Clinical Pathways (3yr, 88%), [skill-controlled-drugs] Controlled Drugs (1yr, 85%)
Leadership: [skill-budget-management] Budget Management (1yr, 90%), [skill-stakeholder-engagement] Stakeholder Engagement (3yr, 88%), [skill-pharma-negotiation] Pharmaceutical Negotiation (1yr, 82%), [skill-team-development] Team Development (8yr, 85%), [skill-change-management] Change Management (7yr, 80%), [skill-financial-modelling] Financial Modelling (1yr, 78%), [skill-executive-comms] Executive Communication (1yr, 85%)
## Response Rules
- Answer ONLY from the data above. Never invent facts, roles, dates, or achievements.
- Distinguish NHS employment (May 2022 onwards, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy).
- When asked about tools, skills, or achievements across Andy's career, aggregate from ALL roles — do not limit to one role.
- Cite specific numbers, dates, and outcomes when available. Never say "approximately" when exact figures exist in the data.
- If the answer is not in the data, say so honestly.
- Do not fabricate URLs, email addresses, or contact details.
- Be concise: 2-4 sentences unless the question requires a longer answer.
## Item References
End your response with a single line listing relevant item IDs from the square-bracketed IDs above:
[ITEMS: exp-deputy-head-2024, skill-python]
Only include IDs that directly support your answer. Omit the line if none are relevant.`
}
/**
 * Returns a promise that settles (with no value) once `ms` milliseconds have
 * elapsed on a `setTimeout` timer.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms)
  })
}
/** Pulls `choices[0].message.content` out of a chat-completions payload; `undefined` unless it is a non-empty string. */
function extractMessageContent(data: unknown): string | undefined {
  const payload = data as
    | { choices?: Array<{ message?: { content?: unknown } | null } | null> }
    | null
    | undefined
  const content = payload?.choices?.[0]?.message?.content
  return typeof content === 'string' && content.length > 0 ? content : undefined
}

/**
 * Sends one system + user message pair to the OpenRouter chat-completions
 * endpoint and returns the assistant's text.
 *
 * HTTP 429 and 503 responses are retried, up to 5 attempts in total. The wait
 * before a retry comes from a "retry in <n>s" hint in the error body when
 * present (rounded up, plus 2 s), otherwise 15 s × attempt number. No wait is
 * performed after the final attempt.
 *
 * @param systemPrompt Content of the `system` message.
 * @param userMessage  Content of the `user` message.
 * @param temperature  Sampling temperature sent with the request.
 * @param maxTokens    Sent as `max_tokens`.
 * @returns The first choice's message content.
 * @throws Error on any other non-OK status, when the response has no text
 *         content, or when every attempt was answered with 429/503.
 */
async function callLLM(
  systemPrompt: string,
  userMessage: string,
  temperature = 0.7,
  maxTokens = 512,
): Promise<string> {
  const apiKey = getApiKey()
  const maxRetries = 5
  let lastStatus = 0
  let lastErrorBody = ''
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const response = await fetch(OPENROUTER_API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
        'HTTP-Referer': 'https://andycharlwood.co.uk',
        'X-Title': 'Andy Charlwood Portfolio',
      },
      body: JSON.stringify({
        model: LLM_MODEL,
        temperature,
        max_tokens: maxTokens,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userMessage },
        ],
      }),
    })
    if (response.status === 429 || response.status === 503) {
      lastStatus = response.status
      lastErrorBody = await response.text()
      // Nothing left to retry: fail now instead of sleeping before the throw.
      if (attempt === maxRetries - 1) break
      // parseFloat('') and parseFloat('.') are NaN, which selects the linear back-off.
      const hintedSeconds = parseFloat(lastErrorBody.match(/retry in ([\d.]+)s/)?.[1] ?? '')
      const waitSeconds = Number.isFinite(hintedSeconds)
        ? Math.ceil(hintedSeconds) + 2
        : (attempt + 1) * 15
      const reason = response.status === 429 ? 'Rate limited' : 'Service unavailable'
      console.log(` ${reason}. Waiting ${waitSeconds}s (attempt ${attempt + 1}/${maxRetries})...`)
      await sleep(waitSeconds * 1000)
      continue
    }
    if (!response.ok) {
      const errorBody = await response.text()
      throw new Error(`OpenRouter API error ${response.status}: ${errorBody}`)
    }
    const data: unknown = await response.json()
    const text = extractMessageContent(data)
    if (text === undefined) {
      throw new Error(`No text in OpenRouter response: ${JSON.stringify(data)}`)
    }
    return text
  }
  // Reached only when every attempt ended in 429/503; report which one.
  throw new Error(`Max retries exceeded (last status ${lastStatus}): ${lastErrorBody.slice(0, 200)}`)
}
// --- Scoring ---
/** True when `candidate` parses as JSON. */
function isValidJson(candidate: string): boolean {
  try {
    JSON.parse(candidate)
    return true
  } catch {
    return false
  }
}

/**
 * Returns the brace-balanced `{ ... }` span that starts at `start`, or `null`
 * when the braces never balance. Braces inside double-quoted strings are ignored.
 */
function sliceBalancedObject(text: string, start: number): string | null {
  let depth = 0
  let inString = false
  let escaped = false
  for (let i = start; i < text.length; i++) {
    const ch = text[i]
    if (inString) {
      // Backslash escapes only have meaning inside a string literal.
      if (escaped) escaped = false
      else if (ch === '\\') escaped = true
      else if (ch === '"') inString = false
      continue
    }
    if (ch === '"') {
      inString = true
    } else if (ch === '{') {
      depth++
    } else if (ch === '}') {
      depth--
      if (depth === 0) return text.slice(start, i + 1)
    }
  }
  return null
}

/**
 * Extracts a JSON document from a model reply that may wrap it in a code fence
 * or surround it with prose.
 *
 * Candidates are tried in order — the whole text, the first fenced block, then
 * the balanced `{ ... }` span starting at each `{` in turn — and the first one
 * that actually parses is returned. An unparsable candidate (a fence holding
 * prose, or braces used in a sentence) no longer hides a valid object that
 * appears later in the text.
 *
 * @returns A string that parses as JSON, or `null` when none is found.
 */
function extractJson(text: string): string | null {
  // The whole reply is already JSON: return it untouched.
  if (isValidJson(text)) return text

  // First markdown code fence, with or without a `json` tag.
  const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/)?.[1]?.trim()
  if (fenced !== undefined && isValidJson(fenced)) return fenced

  // Try every opening brace until one yields a parsable object.
  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
    const candidate = sliceBalancedObject(text, start)
    if (candidate !== null && isValidJson(candidate)) return candidate
  }
  return null
}
/**
 * Asks the model (at temperature 0) to grade an answer against the expected
 * answer on the 0/1/2 rubric.
 *
 * The scorer's reply is validated before use: the score must be exactly 0, 1
 * or 2, and only `score` and `justification` are kept. A reply without a usable
 * justification gets a placeholder, so later formatting code can always treat
 * it as a string.
 *
 * @param question       The benchmark question.
 * @param expectedAnswer Reference answer.
 * @param keyFacts       Facts listed in the prompt as required for a score of 2.
 * @param actualAnswer   The answer being graded.
 * @returns The score and justification; score 0 with an explanatory
 *          justification when the reply cannot be parsed or is out of range.
 */
async function scoreAnswer(
  question: string,
  expectedAnswer: string,
  keyFacts: string[],
  actualAnswer: string,
): Promise<ScoringResult> {
  const scoringPrompt = `You are a strict evaluator. Compare an ACTUAL answer to an EXPECTED answer about a person's CV.
Rubric:
- 2 = ACCURATE: Covers key facts correctly. Minor omissions OK if no errors.
- 1 = PARTIAL: Some key facts right but misses important details or is vague.
- 0 = INCORRECT: Contains factual errors, contradicts expected answer, or misses the point.
Key facts for score 2:
${keyFacts.map((f) => `- ${f}`).join('\n')}
IMPORTANT: Respond with ONLY a single-line JSON object. No markdown, no code fences, no extra text.
Example: {"score":2,"justification":"Covers all key facts accurately"}
Keep justification under 30 words.`
  const userMessage = `QUESTION: ${question}
EXPECTED ANSWER: ${expectedAnswer}
ACTUAL ANSWER: ${actualAnswer}`
  const rawResponse = await callLLM(scoringPrompt, userMessage, 0, 512)
  // Extract JSON — handle code fences, preamble text, multiline responses
  const extracted = extractJson(rawResponse)
  if (!extracted) {
    console.warn(` Warning: Could not extract JSON from scoring response: ${rawResponse.slice(0, 200)}`)
    return { score: 0, justification: `Failed to parse scoring response` }
  }

  let parsed: unknown
  try {
    parsed = JSON.parse(extracted)
  } catch {
    console.warn(` Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
    return { score: 0, justification: `Invalid JSON in response` }
  }
  // Valid JSON that is not an object (null, a number, a string) carries no score.
  if (typeof parsed !== 'object' || parsed === null) {
    console.warn(` Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
    return { score: 0, justification: `Invalid JSON in response` }
  }

  const { score, justification } = parsed as { score?: unknown; justification?: unknown }
  if (score !== 0 && score !== 1 && score !== 2) {
    console.warn(` Warning: Invalid score value: ${String(score)}`)
    return { score: 0, justification: `Invalid score value: ${String(score)}` }
  }
  // Build a fresh object so extra fields in the reply never reach the results file.
  return {
    score,
    justification:
      typeof justification === 'string' && justification.trim() !== ''
        ? justification
        : 'No justification provided',
  }
}
// --- Iteration Management ---
/**
 * Works out the number for the next results file.
 *
 * Looks at entries in `resultsDir` named `iteration-<n>.json` and returns one
 * more than the largest `<n>`. Returns 0 when the directory does not exist or
 * holds no such numbered file.
 *
 * @param resultsDir Directory containing earlier results files.
 */
function getNextIteration(resultsDir: string): number {
  if (!existsSync(resultsDir)) return 0

  // -1 is the floor: with no numbered file present the result is -1 + 1 = 0.
  let highest = -1
  for (const entry of readdirSync(resultsDir)) {
    if (!entry.startsWith('iteration-') || !entry.endsWith('.json')) continue
    const numbered = /iteration-(\d+)\.json/.exec(entry)
    const value = numbered ? parseInt(numbered[1], 10) : -1
    if (value > highest) highest = value
  }
  return highest + 1
}
// --- Console Output ---
/**
 * Prints the run as a fixed-width table on the console: a header, one row per
 * question (ID, score, question, justification), and a totals line with the
 * pass/fail verdict.
 *
 * Questions longer than 47 characters and justifications longer than 60 are
 * cut and end in "...".
 */
function printSummary(results: BenchmarkResults): void {
  const heavyRule = '='.repeat(80)
  const lightRule = '-'.repeat(80)

  // Shorten to `limit` characters, the last three being "...".
  const clip = (text: string, limit: number): string =>
    text.length > limit ? text.slice(0, limit - 3) + '...' : text

  const labelFor = (score: number): string => {
    switch (score) {
      case 2:
        return '2 ✓'
      case 1:
        return '1 ~'
      default:
        return '0 ✗'
    }
  }

  // Column widths: ID 6, Score 8, Question 50; the last column is unpadded.
  const row = (id: string, score: string, question: string, justification: string): string =>
    id.padEnd(6) + score.padEnd(8) + question.padEnd(50) + justification

  console.log(`\n${heavyRule}`)
  console.log(`BENCHMARK RESULTS — Iteration ${results.iteration}`)
  console.log(`Model: ${results.model} | ${results.timestamp}`)
  console.log(heavyRule)
  console.log(row('ID', 'Score', 'Question', 'Justification'))
  console.log(lightRule)

  for (const entry of results.results) {
    console.log(
      row(entry.id, labelFor(entry.score), clip(entry.question, 47), clip(entry.justification, 60)),
    )
  }

  console.log(lightRule)
  const zeros = results.hasZeros ? 'YES' : 'No'
  const verdict = results.passed ? 'PASSED ✓' : 'FAILED ✗'
  console.log(
    [
      `TOTAL: ${results.totalScore}/${results.maxPossibleScore}`,
      `Threshold: ${results.passThreshold}/${results.maxPossibleScore}`,
      `Has zeros: ${zeros}`,
      verdict,
    ].join(' | '),
  )
  console.log(heavyRule)
}
// --- Main ---
/**
 * Runs the benchmark end to end.
 *
 * Reads `benchmark-config.json` from this script's directory, asks the model
 * each question in turn, scores every answer, writes the run to
 * `benchmark-results/iteration-<n>.json`, prints the summary table, and exits
 * the process with 0 when the run passed or 1 when it did not.
 *
 * A run passes when the total score reaches the configured threshold and no
 * question scored 0.
 */
async function main() {
  const baseDir = import.meta.dirname
  const configPath = resolve(baseDir, 'benchmark-config.json')
  const resultsDir = resolve(baseDir, 'benchmark-results')

  // Load config
  const config: BenchmarkConfig = JSON.parse(readFileSync(configPath, 'utf-8'))
  console.log(`Loaded ${config.questions.length} benchmark questions.`)

  // Determine iteration number
  const iteration = getNextIteration(resultsDir)
  console.log(`Running iteration ${iteration}...`)

  // Build system prompt (same as production llm.ts)
  const systemPrompt = buildSystemPrompt()
  console.log(`System prompt built (${systemPrompt.length} chars).`)

  // Questions run one at a time; totals are accumulated along the way.
  const questionResults: QuestionResult[] = []
  let totalScore = 0
  let hasZeros = false
  for (const { id, question, expectedAnswer, keyFacts } of config.questions) {
    console.log(`\n[${id}] ${question}`)

    console.log(' Getting answer...')
    const actualAnswer = await callLLM(systemPrompt, question)
    console.log(` Answer: ${actualAnswer.slice(0, 100)}...`)

    console.log(' Scoring...')
    const verdict = await scoreAnswer(question, expectedAnswer, keyFacts, actualAnswer)
    console.log(` Score: ${verdict.score}/2 — ${verdict.justification}`)

    questionResults.push({
      id,
      question,
      expectedAnswer,
      actualAnswer,
      score: verdict.score,
      justification: verdict.justification,
    })
    totalScore += verdict.score
    if (verdict.score === 0) hasZeros = true
  }

  // A single zero fails the run even when the total clears the threshold.
  const passed = totalScore >= config.passThreshold && !hasZeros
  const results: BenchmarkResults = {
    iteration,
    timestamp: new Date().toISOString(),
    model: LLM_MODEL,
    totalScore,
    maxPossibleScore: config.maxScore,
    passThreshold: config.passThreshold,
    passed,
    hasZeros,
    results: questionResults,
  }

  // Save results
  mkdirSync(resultsDir, { recursive: true })
  const resultsPath = resolve(resultsDir, `iteration-${iteration}.json`)
  writeFileSync(resultsPath, JSON.stringify(results, null, 2))
  console.log(`\nResults saved to ${resultsPath}`)

  // Print summary table
  printSummary(results)

  // Exit with appropriate code
  process.exit(passed ? 0 : 1)
}
// Entry point. main() itself exits with 0 (passed) or 1 (failed); any rejection
// from main() is logged here and the process exits with code 2 instead.
main().catch((err) => {
console.error('Benchmark failed:', err)
process.exit(2)
})