portfolio/scripts/benchmark.ts

import { readFileSync, writeFileSync, readdirSync, mkdirSync, existsSync } from 'node:fs'
import { resolve } from 'node:path'
// Load .env file manually (avoid adding dotenv dependency)
function loadEnvFile(): void {
  const envPath = resolve(import.meta.dirname, '..', '.env')
  if (!existsSync(envPath)) return
  const content = readFileSync(envPath, 'utf-8')
  for (const line of content.split('\n')) {
    const trimmed = line.trim()
    if (!trimmed || trimmed.startsWith('#')) continue
    const eqIndex = trimmed.indexOf('=')
    if (eqIndex === -1) continue
    const key = trimmed.slice(0, eqIndex)
    const value = trimmed.slice(eqIndex + 1)
    if (!process.env[key]) {
      process.env[key] = value
    }
  }
}
loadEnvFile()

// --- Types ---

interface BenchmarkQuestion {
  id: string
  question: string
  expectedAnswer: string
  keyFacts: string[]
}

interface BenchmarkConfig {
  passThreshold: number
  maxScore: number
  questions: BenchmarkQuestion[]
}

interface ScoringResult {
  score: 0 | 1 | 2
  justification: string
}

interface QuestionResult {
  id: string
  question: string
  expectedAnswer: string
  actualAnswer: string
  score: number
  justification: string
}

interface BenchmarkResults {
  iteration: number
  timestamp: string
  model: string
  totalScore: number
  maxPossibleScore: number
  passThreshold: number
  passed: boolean
  hasZeros: boolean
  results: QuestionResult[]
}

// --- OpenRouter API ---

const LLM_MODEL = 'z-ai/glm-5'
const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1/chat/completions'

function getApiKey(): string {
  const key = process.env.VITE_OPEN_ROUTER_API_KEY
  if (!key) {
    throw new Error('VITE_OPEN_ROUTER_API_KEY not set. Ensure .env file exists with this key.')
  }
  return key
}

// Mirrors buildSystemPrompt() from src/lib/llm.ts — kept in sync manually
// because llm.ts uses import.meta.env (Vite) and window.location (browser)
function buildSystemPrompt(): string {
  return `You are a helpful assistant on Andy Charlwood's portfolio website. Answer questions about Andy's professional background using ONLY the information below.

## Profile
Andy Charlwood — MPharm, GPhC Registered Pharmacist. Norwich, UK.
Healthcare leader combining clinical pharmacy with Python, SQL, and data analytics (self-taught). Leading population health analytics for NHS Norfolk & Waveney ICB, serving 1.2 million people. Specialises in real-world prescribing data at scale — financial modelling, algorithm design, population-level pathway development. Identified and prioritised efficiency programmes worth £14.6M+ through automated analysis.

## Career History

### [exp-interim-head-2025] Interim Head, Population Health & Data Analysis
NHS Norfolk & Waveney ICB | May–Nov 2025
Led strategic delivery of population health initiatives and data-driven medicines optimisation, reporting to Associate Director of Pharmacy with accountability to Chief Medical Officer.
- Identified £14.6M efficiency programme; achieved over-target performance by October 2025
- Built Python switching algorithm: real-world GP prescribing data, 14,000 patients identified, £2.6M annual savings (£2M on target), compressed months of analysis into 3 days
- Automated incentive scheme with novel GP payment system linking rewards to savings; 50% prescribing reduction within 2 months
- Presented to CMO bimonthly with evidence-based recommendations
- Led transformation to patient-level SQL analytics and self-serve model

### [exp-deputy-head-2024] Deputy Head, Population Health & Data Analysis
NHS Norfolk & Waveney ICB | Jul 2024–Present (substantive role)
Driving data analytics strategy for medicines optimisation from messy, real-world GP prescribing data.
- Managed £220M prescribing budget with forecasting models for proactive financial planning
- Created comprehensive dm+d medicines data table: standardised strengths, morphine equivalents, Anticholinergic Burden scoring — single source of truth for all medicines analytics
- Led DOAC switching programme financial modelling: interactive dashboard with rebate mechanics, workforce constraints, patent expiry timelines
- Renegotiated pharmaceutical rebate terms ahead of patent expiry
- Supported tirzepatide commissioning (NICE TA1026): financial projections, eligible cohort identification; authored executive paper advocating primary care model, driving system shift to GP-led delivery
- Built Python controlled drug monitoring system: oral morphine equivalents across all opioid prescriptions, patient-level exposure tracking, high-risk identification, diversion detection at population scale
- Improved team data fluency through training, documentation, and self-serve tools

### [exp-high-cost-drugs-2022] High-Cost Drugs & Interface Pharmacist
NHS Norfolk & Waveney ICB | May 2022–Jul 2024
Led NICE TA implementation and high-cost drug pathways across the ICS. Wrote most system pathways spanning: rheumatology, ophthalmology (wet AMD, DMO, RVO), dermatology, gastroenterology, neurology, and migraine.
- Blueteq automation: 70% form reduction, 200 hours immediate savings, 7–8 hours ongoing weekly gains
- Integrated Blueteq with secondary care databases for accurate high-cost drug spend tracking
- Python Sankey chart tool for patient pathway visualisation and trust compliance auditing

### [exp-pharmacy-manager-2017] Pharmacy Manager
Tesco PLC (private sector, NOT NHS) | Nov 2017–May 2022
Community pharmacy with full operational autonomy (100-hour contract). LPC representative for Norfolk.
- Asthma screening process adopted nationally (~300 branches): reduced pharmacist time 60→6 hours/store/month, ~£1M revenue
- Created national induction training plan and eLearning modules
- Supervised two staff through NVQ3 to pharmacy technician registration; full HR responsibilities

## Projects

### [proj-inv-pharmetrics] PharMetrics Interactive Platform (2024, Live)
Real-time medicines expenditure dashboard for NHS decision-makers. Tech: Power BI, SQL, DAX. Tracks the £220M prescribing budget with self-serve analytics.

### [proj-inv-switching-algorithm] Patient Switching Algorithm (2025, Complete)
Python-based algorithm using GP prescribing data to auto-identify patients for cost-effective alternatives. Tech: Python, Pandas, SQL. Identified 14,000 patients, £2.6M annual savings, novel GP payment system linking rewards to savings.

### [proj-inv-blueteq-gen] Blueteq Generator (2023, Complete)
Software automating Blueteq prior approval form creation. Tech: Python, SQL. 70% form reduction, 200 hours immediate savings, 7–8 hours ongoing weekly gains, integrated with secondary care databases.

### [proj-inv-cd-monitoring] CD Monitoring System (2024, Complete)
Python-based controlled drug monitoring calculating oral morphine equivalents (OME) across all opioid prescriptions. Tech: Python, SQL. Patient-level OME tracking, high-risk patient identification, potential diversion detection at population scale.

### [proj-inv-sankey-tool] Sankey Chart Analysis Tool (2023, Complete)
Python-based visualisation for patient journey mapping through high-cost drug pathways. Tech: Python, Matplotlib, SQL. Trust-level compliance auditing, multi-specialty pathway coverage.

## Education

### [edu-0] NHS Mary Seacole Programme (2018)
NHS Leadership Academy. Score: 78%. Covers change management, healthcare leadership, system-level thinking.

### [edu-1] MPharm (Hons) 2:1 — University of East Anglia (2011–2015)
4-year integrated Master's degree. Research project on drug delivery and cocrystals: 75.1% (Distinction).

### [edu-2] A-Levels — Highworth Grammar School (2009–2011)
Mathematics A*, Chemistry B, Politics C.

### [edu-3] GPhC Registration — General Pharmaceutical Council (August 2016–Present)
Professional registration required to practise as a pharmacist in Great Britain.

## Skills
Technical: [skill-data-analysis] Data Analysis (9yr, 95%), [skill-python] Python (6yr, 90%), [skill-sql] SQL (7yr, 88%), [skill-power-bi] Power BI (5yr, 92%), [skill-javascript-typescript] JavaScript/TypeScript (3yr, 70%), [skill-excel] Excel (9yr, 85%), [skill-algorithm-design] Algorithm Design (3yr, 82%), [skill-data-pipelines] Data Pipelines (2yr, 75%)
Domain: [skill-medicines-optimisation] Medicines Optimisation (9yr, 95%), [skill-population-health] Population Health (3yr, 90%), [skill-nice-ta] NICE TA Implementation (3yr, 92%), [skill-health-economics] Health Economics (3yr, 80%), [skill-clinical-pathways] Clinical Pathways (3yr, 88%), [skill-controlled-drugs] Controlled Drugs (1yr, 85%)
Leadership: [skill-budget-management] Budget Management (1yr, 90%), [skill-stakeholder-engagement] Stakeholder Engagement (3yr, 88%), [skill-pharma-negotiation] Pharmaceutical Negotiation (1yr, 82%), [skill-team-development] Team Development (8yr, 85%), [skill-change-management] Change Management (7yr, 80%), [skill-financial-modelling] Financial Modelling (1yr, 78%), [skill-executive-comms] Executive Communication (1yr, 85%)

## Response Rules
- Answer ONLY from the data above. Never invent facts, roles, dates, or achievements.
- Distinguish NHS employment (May 2022 onwards, all at Norfolk & Waveney ICB) from private sector (Tesco PLC, Nov 2017–May 2022, community pharmacy).
- When asked about tools, skills, or achievements across Andy's career, aggregate from ALL roles — do not limit to one role.
- Cite specific numbers, dates, and outcomes when available. Never say "approximately" when exact figures exist in the data.
- If the answer is not in the data, say so honestly.
- Do not fabricate URLs, email addresses, or contact details.
- Be concise: 2-4 sentences unless the question requires a longer answer.

## Item References
End your response with a single line listing relevant item IDs from the square-bracketed IDs above:
[ITEMS: exp-deputy-head-2024, skill-python]
Only include IDs that directly support your answer. Omit the line if none are relevant.`
}

function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms))
}

async function callLLM(
  systemPrompt: string,
  userMessage: string,
  temperature = 0.7,
  maxTokens = 512,
): Promise<string> {
  const apiKey = getApiKey()
  const maxRetries = 5

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const response = await fetch(OPENROUTER_API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
        'HTTP-Referer': 'https://andycharlwood.co.uk',
        'X-Title': 'Andy Charlwood Portfolio',
      },
      body: JSON.stringify({
        model: LLM_MODEL,
        temperature,
        max_tokens: maxTokens,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userMessage },
        ],
      }),
    })

    if (response.status === 429 || response.status === 503) {
      const errorBody = await response.text()
      const retryMatch = errorBody.match(/retry in ([\d.]+)s/)
      const waitSeconds = retryMatch ? Math.ceil(parseFloat(retryMatch[1])) + 2 : (attempt + 1) * 15
      const reason = response.status === 429 ? 'Rate limited' : 'Service unavailable'
      console.log(`  ${reason}. Waiting ${waitSeconds}s (attempt ${attempt + 1}/${maxRetries})...`)
      await sleep(waitSeconds * 1000)
      continue
    }

    if (!response.ok) {
      const errorBody = await response.text()
      throw new Error(`OpenRouter API error ${response.status}: ${errorBody}`)
    }

    const data = await response.json()
    const text = data?.choices?.[0]?.message?.content
    if (!text) {
      throw new Error(`No text in OpenRouter response: ${JSON.stringify(data)}`)
    }
    return text
  }

  throw new Error('Max retries exceeded for rate limiting')
}

// --- Scoring ---

function extractJson(text: string): string | null {
  // Try parsing directly first
  try {
    JSON.parse(text)
    return text
  } catch { /* not direct JSON, continue extraction */ }

  // Strip markdown code fences
  const fenceMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/)
  if (fenceMatch) {
    return fenceMatch[1].trim()
  }

  // Find first { ... } block
  const braceStart = text.indexOf('{')
  if (braceStart === -1) return null

  // Find matching closing brace
  let depth = 0
  let inString = false
  let escaped = false
  for (let i = braceStart; i < text.length; i++) {
    const ch = text[i]
    if (escaped) { escaped = false; continue }
    if (ch === '\\') { escaped = true; continue }
    if (ch === '"') { inString = !inString; continue }
    if (inString) continue
    if (ch === '{') depth++
    if (ch === '}') { depth--; if (depth === 0) return text.slice(braceStart, i + 1) }
  }

  return null
}

async function scoreAnswer(
  question: string,
  expectedAnswer: string,
  keyFacts: string[],
  actualAnswer: string,
): Promise<ScoringResult> {
  const scoringPrompt = `You are a strict evaluator. Compare an ACTUAL answer to an EXPECTED answer about a person's CV.

Rubric:
- 2 = ACCURATE: Covers key facts correctly. Minor omissions OK if no errors.
- 1 = PARTIAL: Some key facts right but misses important details or is vague.
- 0 = INCORRECT: Contains factual errors, contradicts expected answer, or misses the point.

Key facts for score 2:
${keyFacts.map((f) => `- ${f}`).join('\n')}

IMPORTANT: Respond with ONLY a single-line JSON object. No markdown, no code fences, no extra text.
Example: {"score":2,"justification":"Covers all key facts accurately"}
Keep justification under 30 words.`

  const userMessage = `QUESTION: ${question}

EXPECTED ANSWER: ${expectedAnswer}

ACTUAL ANSWER: ${actualAnswer}`

  const rawResponse = await callLLM(scoringPrompt, userMessage, 0, 512)

  // Extract JSON — handle code fences, preamble text, multiline responses
  const extracted = extractJson(rawResponse)
  if (!extracted) {
    console.warn(`  Warning: Could not extract JSON from scoring response: ${rawResponse.slice(0, 200)}`)
    return { score: 0, justification: `Failed to parse scoring response` }
  }

  try {
    const parsed = JSON.parse(extracted) as ScoringResult
    if (![0, 1, 2].includes(parsed.score)) {
      console.warn(`  Warning: Invalid score value: ${parsed.score}`)
      return { score: 0, justification: `Invalid score value: ${parsed.score}` }
    }
    return parsed
  } catch {
    console.warn(`  Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
    return { score: 0, justification: `Invalid JSON in response` }
  }
}

// --- Iteration Management ---

function getNextIteration(resultsDir: string): number {
  if (!existsSync(resultsDir)) return 0

  const files = readdirSync(resultsDir).filter((f) => f.startsWith('iteration-') && f.endsWith('.json'))
  if (files.length === 0) return 0

  const iterations = files.map((f) => {
    const match = f.match(/iteration-(\d+)\.json/)
    return match ? parseInt(match[1], 10) : -1
  })
  return Math.max(...iterations) + 1
}

// --- Console Output ---

function printSummary(results: BenchmarkResults): void {
  console.log('\n' + '='.repeat(80))
  console.log(`BENCHMARK RESULTS — Iteration ${results.iteration}`)
  console.log(`Model: ${results.model} | ${results.timestamp}`)
  console.log('='.repeat(80))

  // Table header
  console.log(
    'ID'.padEnd(6) +
    'Score'.padEnd(8) +
    'Question'.padEnd(50) +
    'Justification'
  )
  console.log('-'.repeat(80))

  for (const r of results.results) {
    const scoreLabel = r.score === 2 ? '2 ✓' : r.score === 1 ? '1 ~' : '0 ✗'
    const questionTruncated = r.question.length > 47 ? r.question.slice(0, 44) + '...' : r.question
    const justTruncated = r.justification.length > 60 ? r.justification.slice(0, 57) + '...' : r.justification
    console.log(
      r.id.padEnd(6) +
      scoreLabel.padEnd(8) +
      questionTruncated.padEnd(50) +
      justTruncated
    )
  }

  console.log('-'.repeat(80))
  console.log(
    `TOTAL: ${results.totalScore}/${results.maxPossibleScore}` +
    ` | Threshold: ${results.passThreshold}/${results.maxPossibleScore}` +
    ` | Has zeros: ${results.hasZeros ? 'YES' : 'No'}` +
    ` | ${results.passed ? 'PASSED ✓' : 'FAILED ✗'}`
  )
  console.log('='.repeat(80))
}

// --- Main ---

async function main() {
  const scriptDir = import.meta.dirname
  const configPath = resolve(scriptDir, 'benchmark-config.json')
  const resultsDir = resolve(scriptDir, 'benchmark-results')

  // Load config
  const config: BenchmarkConfig = JSON.parse(readFileSync(configPath, 'utf-8'))
  console.log(`Loaded ${config.questions.length} benchmark questions.`)

  // Determine iteration number
  const iteration = getNextIteration(resultsDir)
  console.log(`Running iteration ${iteration}...`)

  // Build system prompt (same as production llm.ts)
  const systemPrompt = buildSystemPrompt()
  console.log(`System prompt built (${systemPrompt.length} chars).`)

  // Run each question
  const questionResults: QuestionResult[] = []

  for (const q of config.questions) {
    console.log(`\n[${q.id}] ${q.question}`)

    // Get answer from LLM
    console.log('  Getting answer...')
    const actualAnswer = await callLLM(systemPrompt, q.question)
    console.log(`  Answer: ${actualAnswer.slice(0, 100)}...`)

    // Score the answer
    console.log('  Scoring...')
    const { score, justification } = await scoreAnswer(
      q.question,
      q.expectedAnswer,
      q.keyFacts,
      actualAnswer,
    )
    console.log(`  Score: ${score}/2 — ${justification}`)

    questionResults.push({
      id: q.id,
      question: q.question,
      expectedAnswer: q.expectedAnswer,
      actualAnswer,
      score,
      justification,
    })
  }

  // Calculate totals
  const totalScore = questionResults.reduce((sum, r) => sum + r.score, 0)
  const hasZeros = questionResults.some((r) => r.score === 0)
  const passed = totalScore >= config.passThreshold && !hasZeros

  const results: BenchmarkResults = {
    iteration,
    timestamp: new Date().toISOString(),
    model: LLM_MODEL,
    totalScore,
    maxPossibleScore: config.maxScore,
    passThreshold: config.passThreshold,
    passed,
    hasZeros,
    results: questionResults,
  }

  // Save results
  mkdirSync(resultsDir, { recursive: true })
  const resultsPath = resolve(resultsDir, `iteration-${iteration}.json`)
  writeFileSync(resultsPath, JSON.stringify(results, null, 2))
  console.log(`\nResults saved to ${resultsPath}`)

  // Print summary table
  printSummary(results)

  // Exit with appropriate code
  process.exit(passed ? 0 : 1)
}

main().catch((err) => {
  console.error('Benchmark failed:', err)
  process.exit(2)
})