451 lines
18 KiB
TypeScript
451 lines
18 KiB
TypeScript
import { readFileSync, writeFileSync, readdirSync, mkdirSync, existsSync } from 'node:fs'
|
||
import { resolve } from 'node:path'
|
||
// Load .env file manually (avoid adding dotenv dependency)
|
||
function loadEnvFile(): void {
|
||
const envPath = resolve(import.meta.dirname, '..', '.env')
|
||
if (!existsSync(envPath)) return
|
||
const content = readFileSync(envPath, 'utf-8')
|
||
for (const line of content.split('\n')) {
|
||
const trimmed = line.trim()
|
||
if (!trimmed || trimmed.startsWith('#')) continue
|
||
const eqIndex = trimmed.indexOf('=')
|
||
if (eqIndex === -1) continue
|
||
const key = trimmed.slice(0, eqIndex)
|
||
const value = trimmed.slice(eqIndex + 1)
|
||
if (!process.env[key]) {
|
||
process.env[key] = value
|
||
}
|
||
}
|
||
}
|
||
loadEnvFile()
|
||
|
||
// --- Types ---
|
||
|
||
interface BenchmarkQuestion {
|
||
id: string
|
||
question: string
|
||
expectedAnswer: string
|
||
keyFacts: string[]
|
||
}
|
||
|
||
interface BenchmarkConfig {
|
||
passThreshold: number
|
||
maxScore: number
|
||
questions: BenchmarkQuestion[]
|
||
}
|
||
|
||
interface ScoringResult {
|
||
score: 0 | 1 | 2
|
||
justification: string
|
||
}
|
||
|
||
interface QuestionResult {
|
||
id: string
|
||
question: string
|
||
expectedAnswer: string
|
||
actualAnswer: string
|
||
score: number
|
||
justification: string
|
||
}
|
||
|
||
interface BenchmarkResults {
|
||
iteration: number
|
||
timestamp: string
|
||
model: string
|
||
totalScore: number
|
||
maxPossibleScore: number
|
||
passThreshold: number
|
||
passed: boolean
|
||
hasZeros: boolean
|
||
results: QuestionResult[]
|
||
}
|
||
|
||
// --- OpenRouter API ---
|
||
|
||
const LLM_MODEL = 'z-ai/glm-5'
|
||
const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1/chat/completions'
|
||
|
||
function getApiKey(): string {
|
||
const key = process.env.VITE_OPEN_ROUTER_API_KEY
|
||
if (!key) {
|
||
throw new Error('VITE_OPEN_ROUTER_API_KEY not set. Ensure .env file exists with this key.')
|
||
}
|
||
return key
|
||
}
|
||
|
||
// Mirrors buildSystemPrompt() from src/lib/llm.ts — kept in sync manually
|
||
// because llm.ts uses import.meta.env (Vite) and window.location (browser)
|
||
function buildSystemPrompt(): string {
|
||
return `You are a helpful assistant on Andy Charlwood's portfolio website. Answer questions about Andy's professional background using ONLY the information below.
|
||
|
||
## Profile
|
||
Andy Charlwood — MPharm, GPhC Registered Pharmacist. Norwich, UK.
|
||
Healthcare leader combining clinical pharmacy with Python, SQL, and data analytics (self-taught). Leading population health analytics for NHS Norfolk & Waveney ICB, serving 1.2 million people. Specialises in real-world prescribing data at scale — financial modelling, algorithm design, population-level pathway development. Identified and prioritised efficiency programmes worth £14.6M+ through automated analysis.
|
||
|
||
## Career History
|
||
|
||
### [exp-interim-head-2025] Interim Head, Population Health & Data Analysis
|
||
NHS Norfolk & Waveney ICB | May–Nov 2025
|
||
Led strategic delivery of population health initiatives and data-driven medicines optimisation, reporting to Associate Director of Pharmacy with accountability to Chief Medical Officer.
|
||
- Identified £14.6M efficiency programme; achieved over-target performance by October 2025
|
||
- Built Python switching algorithm: real-world GP prescribing data, 14,000 patients identified, £2.6M annual savings (£2M on target), compressed months of analysis into 3 days
|
||
- Automated incentive scheme with novel GP payment system linking rewards to savings; 50% prescribing reduction within 2 months
|
||
- Presented to CMO bimonthly with evidence-based recommendations
|
||
- Led transformation to patient-level SQL analytics and self-serve model
|
||
|
||
### [exp-deputy-head-2024] Deputy Head, Population Health & Data Analysis
|
||
NHS Norfolk & Waveney ICB | Jul 2024–Present (substantive role)
|
||
Driving data analytics strategy for medicines optimisation from messy, real-world GP prescribing data.
|
||
- Managed £220M prescribing budget with forecasting models for proactive financial planning
|
||
- Created comprehensive dm+d medicines data table: standardised strengths, morphine equivalents, Anticholinergic Burden scoring — single source of truth for all medicines analytics
|
||
- Led DOAC switching programme financial modelling: interactive dashboard with rebate mechanics, workforce constraints, patent expiry timelines
|
||
- Renegotiated pharmaceutical rebate terms ahead of patent expiry
|
||
- Supported tirzepatide commissioning (NICE TA1026): financial projections, eligible cohort identification; authored executive paper advocating primary care model, driving system shift to GP-led delivery
|
||
- Built Python controlled drug monitoring system: oral morphine equivalents across all opioid prescriptions, patient-level exposure tracking, high-risk identification, diversion detection at population scale
|
||
- Improved team data fluency through training, documentation, and self-serve tools
|
||
|
||
### [exp-high-cost-drugs-2022] High-Cost Drugs & Interface Pharmacist
|
||
NHS Norfolk & Waveney ICB | May 2022–Jul 2024
|
||
Led NICE TA implementation and high-cost drug pathways across the ICS. Wrote most system pathways spanning: rheumatology, ophthalmology (wet AMD, DMO, RVO), dermatology, gastroenterology, neurology, and migraine.
|
||
- Blueteq automation: 70% form reduction, 200 hours immediate savings, 7–8 hours ongoing weekly gains
|
||
- Integrated Blueteq with secondary care databases for accurate high-cost drug spend tracking
|
||
- Python Sankey chart tool for patient pathway visualisation and trust compliance auditing
|
||
|
||
### [exp-pharmacy-manager-2017] Pharmacy Manager
|
||
Tesco PLC (private sector, NOT NHS) | Nov 2017–May 2022
|
||
Community pharmacy with full operational autonomy (100-hour contract). LPC representative for Norfolk.
|
||
- Asthma screening process adopted nationally (~300 branches): reduced pharmacist time 60→6 hours/store/month, ~£1M revenue
|
||
- Created national induction training plan and eLearning modules
|
||
- Supervised two staff through NVQ3 to pharmacy technician registration; full HR responsibilities
|
||
|
||
## Projects
|
||
|
||
### [proj-inv-pharmetrics] PharMetrics Interactive Platform (2024, Live)
|
||
Real-time medicines expenditure dashboard for NHS decision-makers. Tech: Power BI, SQL, DAX. Tracks the £220M prescribing budget with self-serve analytics.
|
||
|
||
### [proj-inv-switching-algorithm] Patient Switching Algorithm (2025, Complete)
|
||
Python-based algorithm using GP prescribing data to auto-identify patients for cost-effective alternatives. Tech: Python, Pandas, SQL. Identified 14,000 patients, £2.6M annual savings, novel GP payment system linking rewards to savings.
|
||
|
||
### [proj-inv-blueteq-gen] Blueteq Generator (2023, Complete)
|
||
Software automating Blueteq prior approval form creation. Tech: Python, SQL. 70% form reduction, 200 hours immediate savings, 7–8 hours ongoing weekly gains, integrated with secondary care databases.
|
||
|
||
### [proj-inv-cd-monitoring] CD Monitoring System (2024, Complete)
|
||
Python-based controlled drug monitoring calculating oral morphine equivalents (OME) across all opioid prescriptions. Tech: Python, SQL. Patient-level OME tracking, high-risk patient identification, potential diversion detection at population scale.
|
||
|
||
### [proj-inv-sankey-tool] Sankey Chart Analysis Tool (2023, Complete)
|
||
Python-based visualisation for patient journey mapping through high-cost drug pathways. Tech: Python, Matplotlib, SQL. Trust-level compliance auditing, multi-specialty pathway coverage.
|
||
|
||
## Education
|
||
|
||
### [edu-0] NHS Mary Seacole Programme (2018)
|
||
NHS Leadership Academy. Score: 78%. Covers change management, healthcare leadership, system-level thinking.
|
||
|
||
### [edu-1] MPharm (Hons) 2:1 — University of East Anglia (2011–2015)
|
||
4-year integrated Master's degree. Research project on drug delivery and cocrystals: 75.1% (Distinction).
|
||
|
||
### [edu-2] A-Levels — Highworth Grammar School (2009–2011)
|
||
Mathematics A*, Chemistry B, Politics C.
|
||
|
||
### [edu-3] GPhC Registration — General Pharmaceutical Council (August 2016–Present)
|
||
Professional registration required to practise as a pharmacist in Great Britain.
|
||
|
||
## Skills
|
||
Technical: [skill-data-analysis] Data Analysis (9yr, 95%), [skill-python] Python (6yr, 90%), [skill-sql] SQL (7yr, 88%), [skill-power-bi] Power BI (5yr, 92%), [skill-javascript-typescript] JavaScript/TypeScript (3yr, 70%), [skill-excel] Excel (9yr, 85%), [skill-algorithm-design] Algorithm Design (3yr, 82%), [skill-data-pipelines] Data Pipelines (2yr, 75%)
|
||
Domain: [skill-medicines-optimisation] Medicines Optimisation (9yr, 95%), [skill-population-health] Population Health (3yr, 90%), [skill-nice-ta] NICE TA Implementation (3yr, 92%), [skill-health-economics] Health Economics (3yr, 80%), [skill-clinical-pathways] Clinical Pathways (3yr, 88%), [skill-controlled-drugs] Controlled Drugs (1yr, 85%)
|
||
Leadership: [skill-budget-management] Budget Management (1yr, 90%), [skill-stakeholder-engagement] Stakeholder Engagement (3yr, 88%), [skill-pharma-negotiation] Pharmaceutical Negotiation (1yr, 82%), [skill-team-development] Team Development (8yr, 85%), [skill-change-management] Change Management (7yr, 80%), [skill-financial-modelling] Financial Modelling (1yr, 78%), [skill-executive-comms] Executive Communication (1yr, 85%)
|
||
|
||
## Response Rules
|
||
1. Answer ONLY from the data above. If the answer is not in the data, say "I don't have that information" — never invent facts, roles, dates, achievements, URLs, or contact details.
|
||
2. Distinguish NHS employment (May 2022–present, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy). Never conflate the two.
|
||
3. When asked broad questions about tools, skills, projects, or achievements across Andy's career, aggregate from ALL roles — do not limit your answer to one position.
|
||
4. Cite exact numbers, dates, percentages, and outcomes. Never say "approximately" or "around" when exact figures exist in the data.
|
||
5. For detailed or list-based questions, give a thorough answer covering all relevant items. For simple questions, be concise (2-4 sentences).
|
||
|
||
## Item References
|
||
End your response with a single line listing relevant item IDs from the square-bracketed IDs above:
|
||
[ITEMS: exp-deputy-head-2024, skill-python]
|
||
Only include IDs that directly support your answer. Omit the line if none are relevant.`
|
||
}
|
||
|
||
function sleep(ms: number): Promise<void> {
|
||
return new Promise((resolve) => setTimeout(resolve, ms))
|
||
}
|
||
|
||
async function callLLM(
|
||
systemPrompt: string,
|
||
userMessage: string,
|
||
temperature = 0.4,
|
||
maxTokens = 800,
|
||
): Promise<string> {
|
||
const apiKey = getApiKey()
|
||
const maxRetries = 5
|
||
|
||
for (let attempt = 0; attempt < maxRetries; attempt++) {
|
||
const response = await fetch(OPENROUTER_API_URL, {
|
||
method: 'POST',
|
||
headers: {
|
||
'Content-Type': 'application/json',
|
||
'Authorization': `Bearer ${apiKey}`,
|
||
'HTTP-Referer': 'https://andycharlwood.co.uk',
|
||
'X-Title': 'Andy Charlwood Portfolio',
|
||
},
|
||
body: JSON.stringify({
|
||
model: LLM_MODEL,
|
||
temperature,
|
||
max_tokens: maxTokens,
|
||
messages: [
|
||
{ role: 'system', content: systemPrompt },
|
||
{ role: 'user', content: userMessage },
|
||
],
|
||
}),
|
||
})
|
||
|
||
if (response.status === 429 || response.status === 503) {
|
||
const errorBody = await response.text()
|
||
const retryMatch = errorBody.match(/retry in ([\d.]+)s/)
|
||
const waitSeconds = retryMatch ? Math.ceil(parseFloat(retryMatch[1])) + 2 : (attempt + 1) * 15
|
||
const reason = response.status === 429 ? 'Rate limited' : 'Service unavailable'
|
||
console.log(` ${reason}. Waiting ${waitSeconds}s (attempt ${attempt + 1}/${maxRetries})...`)
|
||
await sleep(waitSeconds * 1000)
|
||
continue
|
||
}
|
||
|
||
if (!response.ok) {
|
||
const errorBody = await response.text()
|
||
throw new Error(`OpenRouter API error ${response.status}: ${errorBody}`)
|
||
}
|
||
|
||
const data = await response.json()
|
||
const text = data?.choices?.[0]?.message?.content
|
||
if (!text) {
|
||
throw new Error(`No text in OpenRouter response: ${JSON.stringify(data)}`)
|
||
}
|
||
return text
|
||
}
|
||
|
||
throw new Error('Max retries exceeded for rate limiting')
|
||
}
|
||
|
||
// --- Scoring ---
|
||
|
||
function extractJson(text: string): string | null {
|
||
// Try parsing directly first
|
||
try {
|
||
JSON.parse(text)
|
||
return text
|
||
} catch { /* not direct JSON, continue extraction */ }
|
||
|
||
// Strip markdown code fences
|
||
const fenceMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/)
|
||
if (fenceMatch) {
|
||
return fenceMatch[1].trim()
|
||
}
|
||
|
||
// Find first { ... } block
|
||
const braceStart = text.indexOf('{')
|
||
if (braceStart === -1) return null
|
||
|
||
// Find matching closing brace
|
||
let depth = 0
|
||
let inString = false
|
||
let escaped = false
|
||
for (let i = braceStart; i < text.length; i++) {
|
||
const ch = text[i]
|
||
if (escaped) { escaped = false; continue }
|
||
if (ch === '\\') { escaped = true; continue }
|
||
if (ch === '"') { inString = !inString; continue }
|
||
if (inString) continue
|
||
if (ch === '{') depth++
|
||
if (ch === '}') { depth--; if (depth === 0) return text.slice(braceStart, i + 1) }
|
||
}
|
||
|
||
return null
|
||
}
|
||
|
||
async function scoreAnswer(
|
||
question: string,
|
||
expectedAnswer: string,
|
||
keyFacts: string[],
|
||
actualAnswer: string,
|
||
): Promise<ScoringResult> {
|
||
const scoringPrompt = `You are a strict evaluator. Compare an ACTUAL answer to an EXPECTED answer about a person's CV.
|
||
|
||
Rubric:
|
||
- 2 = ACCURATE: Covers key facts correctly. Minor omissions OK if no errors.
|
||
- 1 = PARTIAL: Some key facts right but misses important details or is vague.
|
||
- 0 = INCORRECT: Contains factual errors, contradicts expected answer, or misses the point.
|
||
|
||
Key facts for score 2:
|
||
${keyFacts.map((f) => `- ${f}`).join('\n')}
|
||
|
||
IMPORTANT: Respond with ONLY a single-line JSON object. No markdown, no code fences, no extra text.
|
||
Example: {"score":2,"justification":"Covers all key facts accurately"}
|
||
Keep justification under 30 words.`
|
||
|
||
const userMessage = `QUESTION: ${question}
|
||
|
||
EXPECTED ANSWER: ${expectedAnswer}
|
||
|
||
ACTUAL ANSWER: ${actualAnswer}`
|
||
|
||
const rawResponse = await callLLM(scoringPrompt, userMessage, 0, 512)
|
||
|
||
// Extract JSON — handle code fences, preamble text, multiline responses
|
||
const extracted = extractJson(rawResponse)
|
||
if (!extracted) {
|
||
console.warn(` Warning: Could not extract JSON from scoring response: ${rawResponse.slice(0, 200)}`)
|
||
return { score: 0, justification: `Failed to parse scoring response` }
|
||
}
|
||
|
||
try {
|
||
const parsed = JSON.parse(extracted) as ScoringResult
|
||
if (![0, 1, 2].includes(parsed.score)) {
|
||
console.warn(` Warning: Invalid score value: ${parsed.score}`)
|
||
return { score: 0, justification: `Invalid score value: ${parsed.score}` }
|
||
}
|
||
return parsed
|
||
} catch {
|
||
console.warn(` Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
|
||
return { score: 0, justification: `Invalid JSON in response` }
|
||
}
|
||
}
|
||
|
||
// --- Iteration Management ---
|
||
|
||
function getNextIteration(resultsDir: string): number {
|
||
if (!existsSync(resultsDir)) return 0
|
||
|
||
const files = readdirSync(resultsDir).filter((f) => f.startsWith('iteration-') && f.endsWith('.json'))
|
||
if (files.length === 0) return 0
|
||
|
||
const iterations = files.map((f) => {
|
||
const match = f.match(/iteration-(\d+)\.json/)
|
||
return match ? parseInt(match[1], 10) : -1
|
||
})
|
||
return Math.max(...iterations) + 1
|
||
}
|
||
|
||
// --- Console Output ---
|
||
|
||
function printSummary(results: BenchmarkResults): void {
|
||
console.log('\n' + '='.repeat(80))
|
||
console.log(`BENCHMARK RESULTS — Iteration ${results.iteration}`)
|
||
console.log(`Model: ${results.model} | ${results.timestamp}`)
|
||
console.log('='.repeat(80))
|
||
|
||
// Table header
|
||
console.log(
|
||
'ID'.padEnd(6) +
|
||
'Score'.padEnd(8) +
|
||
'Question'.padEnd(50) +
|
||
'Justification'
|
||
)
|
||
console.log('-'.repeat(80))
|
||
|
||
for (const r of results.results) {
|
||
const scoreLabel = r.score === 2 ? '2 ✓' : r.score === 1 ? '1 ~' : '0 ✗'
|
||
const questionTruncated = r.question.length > 47 ? r.question.slice(0, 44) + '...' : r.question
|
||
const justTruncated = r.justification.length > 60 ? r.justification.slice(0, 57) + '...' : r.justification
|
||
console.log(
|
||
r.id.padEnd(6) +
|
||
scoreLabel.padEnd(8) +
|
||
questionTruncated.padEnd(50) +
|
||
justTruncated
|
||
)
|
||
}
|
||
|
||
console.log('-'.repeat(80))
|
||
console.log(
|
||
`TOTAL: ${results.totalScore}/${results.maxPossibleScore}` +
|
||
` | Threshold: ${results.passThreshold}/${results.maxPossibleScore}` +
|
||
` | Has zeros: ${results.hasZeros ? 'YES' : 'No'}` +
|
||
` | ${results.passed ? 'PASSED ✓' : 'FAILED ✗'}`
|
||
)
|
||
console.log('='.repeat(80))
|
||
}
|
||
|
||
// --- Main ---
|
||
|
||
async function main() {
|
||
const scriptDir = import.meta.dirname
|
||
const configPath = resolve(scriptDir, 'benchmark-config.json')
|
||
const resultsDir = resolve(scriptDir, 'benchmark-results')
|
||
|
||
// Load config
|
||
const config: BenchmarkConfig = JSON.parse(readFileSync(configPath, 'utf-8'))
|
||
console.log(`Loaded ${config.questions.length} benchmark questions.`)
|
||
|
||
// Determine iteration number
|
||
const iteration = getNextIteration(resultsDir)
|
||
console.log(`Running iteration ${iteration}...`)
|
||
|
||
// Build system prompt (same as production llm.ts)
|
||
const systemPrompt = buildSystemPrompt()
|
||
console.log(`System prompt built (${systemPrompt.length} chars).`)
|
||
|
||
// Run each question
|
||
const questionResults: QuestionResult[] = []
|
||
|
||
for (const q of config.questions) {
|
||
console.log(`\n[${q.id}] ${q.question}`)
|
||
|
||
// Get answer from LLM
|
||
console.log(' Getting answer...')
|
||
const actualAnswer = await callLLM(systemPrompt, q.question)
|
||
console.log(` Answer: ${actualAnswer.slice(0, 100)}...`)
|
||
|
||
// Score the answer
|
||
console.log(' Scoring...')
|
||
const { score, justification } = await scoreAnswer(
|
||
q.question,
|
||
q.expectedAnswer,
|
||
q.keyFacts,
|
||
actualAnswer,
|
||
)
|
||
console.log(` Score: ${score}/2 — ${justification}`)
|
||
|
||
questionResults.push({
|
||
id: q.id,
|
||
question: q.question,
|
||
expectedAnswer: q.expectedAnswer,
|
||
actualAnswer,
|
||
score,
|
||
justification,
|
||
})
|
||
}
|
||
|
||
// Calculate totals
|
||
const totalScore = questionResults.reduce((sum, r) => sum + r.score, 0)
|
||
const hasZeros = questionResults.some((r) => r.score === 0)
|
||
const passed = totalScore >= config.passThreshold && !hasZeros
|
||
|
||
const results: BenchmarkResults = {
|
||
iteration,
|
||
timestamp: new Date().toISOString(),
|
||
model: LLM_MODEL,
|
||
totalScore,
|
||
maxPossibleScore: config.maxScore,
|
||
passThreshold: config.passThreshold,
|
||
passed,
|
||
hasZeros,
|
||
results: questionResults,
|
||
}
|
||
|
||
// Save results
|
||
mkdirSync(resultsDir, { recursive: true })
|
||
const resultsPath = resolve(resultsDir, `iteration-${iteration}.json`)
|
||
writeFileSync(resultsPath, JSON.stringify(results, null, 2))
|
||
console.log(`\nResults saved to ${resultsPath}`)
|
||
|
||
// Print summary table
|
||
printSummary(results)
|
||
|
||
// Exit with appropriate code
|
||
process.exit(passed ? 0 : 1)
|
||
}
|
||
|
||
main().catch((err) => {
|
||
console.error('Benchmark failed:', err)
|
||
process.exit(2)
|
||
})
|