merge

2026-02-15 23:20:24 +00:00
parent 4580ca9c84
commit 0fbbf9e46f
10 changed files with 1576 additions and 2 deletions
@@ -0,0 +1,120 @@
+{
+  "passThreshold": 18,
+  "maxScore": 20,
+  "questions": [
+    {
+      "id": "Q01",
+      "question": "How many years has Andy been employed by the NHS?",
+      "expectedAnswer": "Approximately 3-4 years. Andy's NHS employment started in May 2022 when he joined NHS Norfolk and Waveney ICB. His previous role at Tesco PLC was in the private sector, not the NHS.",
+      "keyFacts": [
+        "NHS employment started May 2022",
+        "Tesco was private employer",
+        "approximately 3-4 years NHS employment"
+      ]
+    },
+    {
+      "id": "Q02",
+      "question": "What was Andy's involvement with tirzepatide?",
+      "expectedAnswer": "Andy supported commissioning of NICE TA1026 (tirzepatide). He authored the initial executive paper advocating a primary care delivery model over specialist provider, which drove a system shift to GP-led model.",
+      "keyFacts": [
+        "NICE TA1026",
+        "authored executive paper",
+        "primary care model",
+        "GP-led delivery"
+      ]
+    },
+    {
+      "id": "Q03",
+      "question": "What specific tools and software has Andy built?",
+      "expectedAnswer": "Andy has built 5 notable projects: a patient switching algorithm (Python, 14000 patients, £2.6M savings), a Blueteq generator for high-cost drug forms, a controlled drugs monitoring system, a Sankey chart tool for visualising patient flows, and PharMetrics — a Power BI analytics dashboard.",
+      "keyFacts": [
+        "patient switching algorithm",
+        "Blueteq generator",
+        "CD monitoring system",
+        "Sankey chart tool",
+        "PharMetrics dashboard"
+      ]
+    },
+    {
+      "id": "Q04",
+      "question": "What were Andy's A-level subjects and grades?",
+      "expectedAnswer": "Andy achieved Mathematics A*, Chemistry B, and Politics C at Highworth Grammar School between 2009 and 2011.",
+      "keyFacts": [
+        "Mathematics A*",
+        "Chemistry B",
+        "Politics C",
+        "Highworth Grammar School"
+      ]
+    },
+    {
+      "id": "Q05",
+      "question": "Was Andy's Tesco role part of the NHS?",
+      "expectedAnswer": "No. Andy's role at Tesco PLC was in the private sector as a community pharmacist. Tesco PLC is a private employer. He was an LPC representative during this time.",
+      "keyFacts": [
+        "Tesco PLC is private/not NHS",
+        "community pharmacy",
+        "LPC representative"
+      ]
+    },
+    {
+      "id": "Q06",
+      "question": "How did the patient switching algorithm work?",
+      "expectedAnswer": "It was Python-based and used real-world GP prescribing data to auto-identify patients eligible for cost-effective medication alternatives. It compressed months of manual work into 3 days, covered 14,000 patients, and identified £2.6M in savings.",
+      "keyFacts": [
+        "Python",
+        "GP prescribing data",
+        "14000 patients",
+        "£2.6M savings",
+        "compressed months to 3 days"
+      ]
+    },
+    {
+      "id": "Q07",
+      "question": "What clinical specialties has Andy worked across?",
+      "expectedAnswer": "Andy has worked across rheumatology, ophthalmology (wet AMD, DMO, RVO), dermatology, gastroenterology, neurology, and migraine through his high-cost drugs role.",
+      "keyFacts": [
+        "rheumatology",
+        "ophthalmology",
+        "dermatology",
+        "gastroenterology",
+        "neurology",
+        "migraine"
+      ]
+    },
+    {
+      "id": "Q08",
+      "question": "What is Andy's experience with the dm+d?",
+      "expectedAnswer": "Andy created a comprehensive medicines data table integrating all dm+d products with standardised strengths, morphine equivalents, and Anticholinergic Burden scoring, serving as a single source of truth.",
+      "keyFacts": [
+        "dm+d integration",
+        "standardised strengths",
+        "morphine equivalents",
+        "Anticholinergic Burden",
+        "single source of truth"
+      ]
+    },
+    {
+      "id": "Q09",
+      "question": "What budget does Andy manage and how?",
+      "expectedAnswer": "Andy manages a £220M prescribing budget using forecasting models, variance analysis, and financial reporting to the executive team, enabling proactive financial planning.",
+      "keyFacts": [
+        "£220M",
+        "forecasting models",
+        "variance analysis",
+        "proactive financial planning"
+      ]
+    },
+    {
+      "id": "Q10",
+      "question": "What leadership training does Andy have?",
+      "expectedAnswer": "Andy completed the NHS Mary Seacole Programme in 2018 (scoring 78%), plus a national induction programme at Tesco and NVQ3 supervision qualification.",
+      "keyFacts": [
+        "Mary Seacole Programme",
+        "2018",
+        "78%",
+        "national induction training at Tesco",
+        "NVQ3 supervision"
+      ]
+    }
+  ]
+}
@@ -0,0 +1,382 @@
+import { readFileSync, writeFileSync, readdirSync, mkdirSync, existsSync } from 'node:fs'
+import { resolve } from 'node:path'
+import { buildEmbeddingTexts } from '@/lib/search'
+
+// Load .env file manually (avoid adding dotenv dependency)
+
+/**
+ * Parse a single line of a .env file.
+ *
+ * Blank lines, `#` comments, lines without `=`, and lines with an empty key
+ * yield `null`. Whitespace around the key and the value is removed, a leading
+ * `export ` is tolerated, and one pair of matching surrounding quotes is
+ * stripped from the value so that `KEY="abc"` produces `abc`.
+ *
+ * @param line - One raw line of the file.
+ * @returns The `[key, value]` pair, or `null` when the line defines nothing.
+ */
+function parseEnvLine(line: string): [string, string] | null {
+  const trimmed = line.trim()
+  if (!trimmed || trimmed.startsWith('#')) return null
+
+  const eqIndex = trimmed.indexOf('=')
+  if (eqIndex === -1) return null
+
+  const key = trimmed.slice(0, eqIndex).replace(/^export\s+/, '').trim()
+  if (!key) return null
+
+  let value = trimmed.slice(eqIndex + 1).trim()
+  const quote = value[0]
+  if ((quote === '"' || quote === "'") && value.length >= 2 && value.endsWith(quote)) {
+    value = value.slice(1, -1)
+  }
+  return [key, value]
+}
+
+/**
+ * Populate `process.env` from the `.env` file one directory above this script.
+ *
+ * Does nothing when the file is absent. Variables that already hold a
+ * non-empty value in the environment are left untouched, so the real
+ * environment takes precedence over the file.
+ */
+function loadEnvFile(): void {
+  const envPath = resolve(import.meta.dirname, '..', '.env')
+  if (!existsSync(envPath)) return
+  const content = readFileSync(envPath, 'utf-8')
+  for (const line of content.split(/\r?\n/)) {
+    const entry = parseEnvLine(line)
+    if (entry === null) continue
+    const [key, value] = entry
+    if (!process.env[key]) {
+      process.env[key] = value
+    }
+  }
+}
+loadEnvFile()
+
+// --- Types ---
+
+/** One benchmark question as stored in the benchmark config file. */
+interface BenchmarkQuestion {
+  // Identifier shown in logs and in the summary table (e.g. "Q01").
+  id: string
+  // Question text sent to the model as the user message.
+  question: string
+  // Reference answer the model's reply is compared against when scoring.
+  expectedAnswer: string
+  // Facts listed in the scoring prompt as required for a full score.
+  keyFacts: string[]
+}
+
+/** Shape of the benchmark config file loaded by `main`. */
+interface BenchmarkConfig {
+  // Minimum total score required for the run to pass.
+  passThreshold: number
+  // Reported as the maximum possible score; it is not derived from the question count.
+  maxScore: number
+  // Questions to ask, in order.
+  questions: BenchmarkQuestion[]
+}
+
+/** Verdict returned by the scoring step for a single answer. */
+interface ScoringResult {
+  // Rubric score: 2 = accurate, 1 = partial, 0 = incorrect.
+  score: 0 | 1 | 2
+  // Short explanation of the score.
+  justification: string
+}
+
+/** Per-question record written to the results file and shown in the summary. */
+interface QuestionResult {
+  id: string
+  question: string
+  expectedAnswer: string
+  // Reply text returned by the model for this question.
+  actualAnswer: string
+  // Score assigned by the scoring step.
+  score: number
+  // Explanation that accompanied the score.
+  justification: string
+}
+
+/** Complete outcome of one benchmark run, serialised to `iteration-<n>.json`. */
+interface BenchmarkResults {
+  // Run number; one more than the highest iteration already saved.
+  iteration: number
+  // ISO-8601 timestamp taken when the results object is assembled.
+  timestamp: string
+  // Model identifier used for the run.
+  model: string
+  // Sum of the per-question scores.
+  totalScore: number
+  // Copied from the config's `maxScore`.
+  maxPossibleScore: number
+  // Copied from the config's `passThreshold`.
+  passThreshold: number
+  // True when totalScore reaches passThreshold and no question scored 0.
+  passed: boolean
+  // True when at least one question scored 0.
+  hasZeros: boolean
+  results: QuestionResult[]
+}
+
+// --- Gemini API ---
+
+// Model identifier; also recorded in the saved results.
+const GEMINI_MODEL = 'gemini-3-flash-preview'
+// Base REST URL for that model; the method suffix (":generateContent") is appended per request.
+const GEMINI_API_BASE = `https://generativelanguage.googleapis.com/v1beta/models/${GEMINI_MODEL}`
+
+/**
+ * Read the Gemini API key from the `VITE_GEMINI_API_KEY` environment variable.
+ *
+ * @returns The configured key.
+ * @throws Error when the variable is unset or empty.
+ */
+function getApiKey(): string {
+  const { VITE_GEMINI_API_KEY: apiKey } = process.env
+  if (apiKey) return apiKey
+  throw new Error('VITE_GEMINI_API_KEY not set. Ensure .env file exists with this key.')
+}
+
+/**
+ * Assemble the system prompt for the portfolio assistant.
+ *
+ * Each entry returned by `buildEmbeddingTexts()` becomes one `- ` bullet in the
+ * profile section, followed by the fixed answering rules and the
+ * `[ITEMS: ...]` reference convention.
+ *
+ * @returns The full system prompt text.
+ */
+function buildSystemPrompt(): string {
+  const profileLines: string[] = []
+  for (const entry of buildEmbeddingTexts()) {
+    profileLines.push(`- ${entry.text}`)
+  }
+  const cvContent = profileLines.join('\n')
+
+  return `You are an AI assistant on Andy Charlwood's portfolio website. Answer questions about his experience, skills, projects, and qualifications.
+
+## Andy's Professional Profile
+
+${cvContent}
+
+## Rules
+1. Use ONLY the profile above. Never invent roles, dates, or achievements.
+2. Be concise (2-4 sentences). Be professional but friendly.
+3. If the information isn't in the profile, say so.
+
+## Item References
+After your answer, on a NEW line, list relevant portfolio item IDs:
+[ITEMS: id1, id2, id3]
+- IDs match the profile entries above (exp-*, skill-*, proj-*, ach-*, edu-*, action-*).
+- Only include IDs directly relevant to your answer.
+- If no items are relevant, omit the [ITEMS: ...] line entirely.`
+}
+
+/**
+ * Pause for a fixed duration.
+ *
+ * @param ms - Delay in milliseconds.
+ * @returns A promise that resolves once the delay has elapsed.
+ */
+function sleep(ms: number): Promise<void> {
+  return new Promise<void>((wake) => {
+    setTimeout(wake, ms)
+  })
+}
+
+/**
+ * Concatenate the text of every part of the first candidate in a parsed
+ * `generateContent` response body.
+ *
+ * Parts flagged `thought: true` are skipped (presumably reasoning summaries
+ * rather than answer text; confirm against the API reference).
+ *
+ * @param data - Parsed JSON response body of unknown shape.
+ * @returns The combined text, or an empty string when none is present.
+ */
+function extractCandidateText(data: unknown): string {
+  const parts: unknown = (
+    data as { candidates?: Array<{ content?: { parts?: unknown } }> } | null | undefined
+  )?.candidates?.[0]?.content?.parts
+  if (!Array.isArray(parts)) return ''
+
+  let text = ''
+  for (const part of parts as Array<{ text?: unknown; thought?: unknown } | null>) {
+    if (part && typeof part.text === 'string' && part.thought !== true) {
+      text += part.text
+    }
+  }
+  return text
+}
+
+/**
+ * Send a single-turn `generateContent` request to Gemini and return the reply text.
+ *
+ * Responses with status 429 or 503 are retried up to five attempts in total.
+ * The wait is taken from a "retry in Ns" hint in the error body when present
+ * (plus two seconds), otherwise it grows by 15 seconds per attempt.
+ *
+ * @param systemPrompt - Text sent as the system instruction.
+ * @param userMessage - Text sent as the single user turn.
+ * @param temperature - Sampling temperature (default 0.7).
+ * @param maxOutputTokens - Output token cap (default 512).
+ * @returns The text of the first candidate.
+ * @throws Error on any other non-OK status, when the reply contains no text,
+ *   or when every attempt was answered with 429/503.
+ */
+async function callGemini(
+  systemPrompt: string,
+  userMessage: string,
+  temperature = 0.7,
+  maxOutputTokens = 512,
+): Promise<string> {
+  const apiKey = getApiKey()
+  const maxRetries = 5
+  let lastStatus = 0
+
+  for (let attempt = 0; attempt < maxRetries; attempt++) {
+    const response = await fetch(`${GEMINI_API_BASE}:generateContent`, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        // Pass the key as a header instead of a query parameter so it does not
+        // appear in request URLs that may be logged.
+        'x-goog-api-key': apiKey,
+      },
+      body: JSON.stringify({
+        system_instruction: {
+          parts: [{ text: systemPrompt }],
+        },
+        contents: [
+          { role: 'user', parts: [{ text: userMessage }] },
+        ],
+        generationConfig: {
+          temperature,
+          maxOutputTokens,
+        },
+      }),
+    })
+
+    if (response.status === 429 || response.status === 503) {
+      lastStatus = response.status
+      const errorBody = await response.text()
+      // No attempt follows the last one, so waiting would only delay the failure.
+      if (attempt === maxRetries - 1) break
+      const retryMatch = errorBody.match(/retry in ([\d.]+)s/)
+      const waitSeconds = retryMatch ? Math.ceil(parseFloat(retryMatch[1])) + 2 : (attempt + 1) * 15
+      const reason = response.status === 429 ? 'Rate limited' : 'Service unavailable'
+      console.log(`  ${reason}. Waiting ${waitSeconds}s (attempt ${attempt + 1}/${maxRetries})...`)
+      await sleep(waitSeconds * 1000)
+      continue
+    }
+
+    if (!response.ok) {
+      const errorBody = await response.text()
+      throw new Error(`Gemini API error ${response.status}: ${errorBody}`)
+    }
+
+    const data: unknown = await response.json()
+    const text = extractCandidateText(data)
+    if (!text) {
+      throw new Error(`No text in Gemini response: ${JSON.stringify(data)}`)
+    }
+    return text
+  }
+
+  throw new Error(`Max retries (${maxRetries}) exceeded; last response status was ${lastStatus}`)
+}
+
+// --- Scoring ---
+
+/** Report whether `candidate` is accepted by `JSON.parse`. */
+function isParseableJson(candidate: string): boolean {
+  try {
+    JSON.parse(candidate)
+    return true
+  } catch {
+    return false
+  }
+}
+
+/**
+ * Return the brace-balanced substring that begins at `start`, which must be
+ * the index of a `{`. Braces inside double-quoted strings are ignored, and a
+ * backslash is treated as an escape only while inside a string.
+ *
+ * @returns The balanced block, or `null` when the braces never balance.
+ */
+function sliceBalancedObject(text: string, start: number): string | null {
+  let depth = 0
+  let inString = false
+  let escaped = false
+  for (let i = start; i < text.length; i++) {
+    const ch = text[i]
+    if (inString) {
+      if (escaped) escaped = false
+      else if (ch === '\\') escaped = true
+      else if (ch === '"') inString = false
+      continue
+    }
+    if (ch === '"') inString = true
+    else if (ch === '{') depth++
+    else if (ch === '}') {
+      depth--
+      if (depth === 0) return text.slice(start, i + 1)
+    }
+  }
+  return null
+}
+
+/**
+ * Pull a JSON document out of a model reply that may wrap it in prose or a
+ * markdown code fence.
+ *
+ * Tried in order: the whole text, the content of the first code fence, then
+ * every brace-balanced block from left to right. A candidate is returned only
+ * if it actually parses, so a stray `{` in surrounding prose or a fence that
+ * holds something other than JSON no longer masks a valid object elsewhere.
+ *
+ * @param text - Raw reply text.
+ * @returns A string accepted by `JSON.parse`, or `null` when none is found.
+ */
+function extractJson(text: string): string | null {
+  // Already pure JSON
+  if (isParseableJson(text)) return text
+
+  // Content of the first markdown code fence
+  const fenceMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/)
+  const fenced = fenceMatch?.[1]?.trim()
+  if (fenced && isParseableJson(fenced)) return fenced
+
+  // First brace-balanced block that parses
+  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
+    const block = sliceBalancedObject(text, start)
+    if (block !== null && isParseableJson(block)) return block
+  }
+
+  return null
+}
+
+/**
+ * Ask the model to grade an answer against the expected answer on a 0-2 rubric.
+ *
+ * The grader runs at temperature 0 and is told to reply with a one-line JSON
+ * object. The reply is validated field by field: the score must be exactly 0,
+ * 1 or 2, and a missing or non-string justification is replaced with a
+ * placeholder so callers can always treat it as a string. Any reply that
+ * cannot be interpreted is recorded as a score of 0 with the reason.
+ *
+ * @param question - The benchmark question.
+ * @param expectedAnswer - Reference answer.
+ * @param keyFacts - Facts listed in the prompt as required for a score of 2.
+ * @param actualAnswer - The answer being graded.
+ * @returns The score and a short justification.
+ * @throws Whatever `callGemini` throws.
+ */
+async function scoreAnswer(
+  question: string,
+  expectedAnswer: string,
+  keyFacts: string[],
+  actualAnswer: string,
+): Promise<ScoringResult> {
+  const scoringPrompt = `You are a strict evaluator. Compare an ACTUAL answer to an EXPECTED answer about a person's CV.
+
+Rubric:
+- 2 = ACCURATE: Covers key facts correctly. Minor omissions OK if no errors.
+- 1 = PARTIAL: Some key facts right but misses important details or is vague.
+- 0 = INCORRECT: Contains factual errors, contradicts expected answer, or misses the point.
+
+Key facts for score 2:
+${keyFacts.map((f) => `- ${f}`).join('\n')}
+
+IMPORTANT: Respond with ONLY a single-line JSON object. No markdown, no code fences, no extra text.
+Example: {"score":2,"justification":"Covers all key facts accurately"}
+Keep justification under 30 words.`
+
+  const userMessage = `QUESTION: ${question}
+
+EXPECTED ANSWER: ${expectedAnswer}
+
+ACTUAL ANSWER: ${actualAnswer}`
+
+  const rawResponse = await callGemini(scoringPrompt, userMessage, 0, 512)
+
+  // Extract JSON — handle code fences, preamble text, multiline responses
+  const extracted = extractJson(rawResponse)
+  if (!extracted) {
+    console.warn(`  Warning: Could not extract JSON from scoring response: ${rawResponse.slice(0, 200)}`)
+    return { score: 0, justification: `Failed to parse scoring response` }
+  }
+
+  let parsed: unknown
+  try {
+    parsed = JSON.parse(extracted)
+  } catch {
+    console.warn(`  Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
+    return { score: 0, justification: `Invalid JSON in response` }
+  }
+
+  // The payload is untrusted model output: read fields defensively instead of casting.
+  const fields: Record<string, unknown> =
+    typeof parsed === 'object' && parsed !== null ? (parsed as Record<string, unknown>) : {}
+
+  const score = fields.score
+  if (score !== 0 && score !== 1 && score !== 2) {
+    console.warn(`  Warning: Invalid score value: ${String(score)}`)
+    return { score: 0, justification: `Invalid score value: ${String(score)}` }
+  }
+
+  const justification =
+    typeof fields.justification === 'string' ? fields.justification : 'No justification provided'
+  return { score, justification }
+}
+
+// --- Iteration Management ---
+
+/**
+ * Work out the number for the next results file.
+ *
+ * Looks at files named `iteration-*.json` in `resultsDir` and returns one more
+ * than the highest number found. Returns 0 when the directory is missing or
+ * holds no numbered iteration file.
+ *
+ * @param resultsDir - Directory containing earlier results files.
+ * @returns The next iteration number.
+ */
+function getNextIteration(resultsDir: string): number {
+  if (!existsSync(resultsDir)) return 0
+
+  // Start at -1 so that "nothing found" yields iteration 0.
+  let highest = -1
+  for (const fileName of readdirSync(resultsDir)) {
+    if (!fileName.startsWith('iteration-') || !fileName.endsWith('.json')) continue
+    const numbered = fileName.match(/iteration-(\d+)\.json/)
+    if (!numbered) continue
+    const value = parseInt(numbered[1], 10)
+    if (value > highest) highest = value
+  }
+  return highest + 1
+}
+
+// --- Console Output ---
+
+/**
+ * Print the results of a run to the console as a fixed-width table.
+ *
+ * Emits a header with iteration, model and timestamp, one row per question
+ * (long questions and justifications are shortened with "..."), and a footer
+ * with the total, the threshold, the zero-score flag and the pass/fail verdict.
+ *
+ * @param results - The completed benchmark run.
+ */
+function printSummary(results: BenchmarkResults): void {
+  const heavyRule = '='.repeat(80)
+  const lightRule = '-'.repeat(80)
+  const clip = (value: string, limit: number): string =>
+    value.length > limit ? value.slice(0, limit - 3) + '...' : value
+
+  console.log(`\n${heavyRule}`)
+  console.log(`BENCHMARK RESULTS — Iteration ${results.iteration}`)
+  console.log(`Model: ${results.model} | ${results.timestamp}`)
+  console.log(heavyRule)
+
+  // Table header
+  console.log(
+    ['ID'.padEnd(6), 'Score'.padEnd(8), 'Question'.padEnd(50), 'Justification'].join('')
+  )
+  console.log(lightRule)
+
+  for (const row of results.results) {
+    let scoreLabel: string
+    if (row.score === 2) {
+      scoreLabel = '2 ✓'
+    } else if (row.score === 1) {
+      scoreLabel = '1 ~'
+    } else {
+      scoreLabel = '0 ✗'
+    }
+    const questionCell = clip(row.question, 47).padEnd(50)
+    const justificationCell = clip(row.justification, 60)
+    console.log(`${row.id.padEnd(6)}${scoreLabel.padEnd(8)}${questionCell}${justificationCell}`)
+  }
+
+  console.log(lightRule)
+  const footer = [
+    `TOTAL: ${results.totalScore}/${results.maxPossibleScore}`,
+    `Threshold: ${results.passThreshold}/${results.maxPossibleScore}`,
+    `Has zeros: ${results.hasZeros ? 'YES' : 'No'}`,
+    results.passed ? 'PASSED ✓' : 'FAILED ✗',
+  ]
+  console.log(footer.join(' | '))
+  console.log(heavyRule)
+}
+
+// --- Main ---
+
+/**
+ * Run the benchmark end to end.
+ *
+ * Loads `benchmark-config.json` from this script's directory, asks the model
+ * each question, scores each reply, writes the run to
+ * `benchmark-results/iteration-<n>.json`, prints the summary table and exits
+ * with 0 when the run passed or 1 when it did not. A run passes when the total
+ * reaches the configured threshold and no question scored 0.
+ */
+async function main(): Promise<void> {
+  const here = import.meta.dirname
+  const configPath = resolve(here, 'benchmark-config.json')
+  const resultsDir = resolve(here, 'benchmark-results')
+
+  // Load config
+  const config: BenchmarkConfig = JSON.parse(readFileSync(configPath, 'utf-8'))
+  console.log(`Loaded ${config.questions.length} benchmark questions.`)
+
+  // Determine iteration number
+  const iteration = getNextIteration(resultsDir)
+  console.log(`Running iteration ${iteration}...`)
+
+  // Build system prompt (same as production)
+  const systemPrompt = buildSystemPrompt()
+  console.log(`System prompt built (${systemPrompt.length} chars).`)
+
+  // Ask and score every question, keeping running totals as we go
+  const questionResults: QuestionResult[] = []
+  let totalScore = 0
+  let hasZeros = false
+
+  for (const { id, question, expectedAnswer, keyFacts } of config.questions) {
+    console.log(`\n[${id}] ${question}`)
+
+    console.log('  Getting answer...')
+    const actualAnswer = await callGemini(systemPrompt, question)
+    console.log(`  Answer: ${actualAnswer.slice(0, 100)}...`)
+
+    console.log('  Scoring...')
+    const verdict = await scoreAnswer(question, expectedAnswer, keyFacts, actualAnswer)
+    console.log(`  Score: ${verdict.score}/2 — ${verdict.justification}`)
+
+    totalScore += verdict.score
+    if (verdict.score === 0) hasZeros = true
+
+    questionResults.push({
+      id,
+      question,
+      expectedAnswer,
+      actualAnswer,
+      score: verdict.score,
+      justification: verdict.justification,
+    })
+  }
+
+  // A single zero fails the run even if the total clears the threshold
+  const passed = totalScore >= config.passThreshold && !hasZeros
+
+  const results: BenchmarkResults = {
+    iteration,
+    timestamp: new Date().toISOString(),
+    model: GEMINI_MODEL,
+    totalScore,
+    maxPossibleScore: config.maxScore,
+    passThreshold: config.passThreshold,
+    passed,
+    hasZeros,
+    results: questionResults,
+  }
+
+  // Save results
+  mkdirSync(resultsDir, { recursive: true })
+  const resultsPath = resolve(resultsDir, `iteration-${iteration}.json`)
+  writeFileSync(resultsPath, JSON.stringify(results, null, 2))
+  console.log(`\nResults saved to ${resultsPath}`)
+
+  // Print summary table
+  printSummary(results)
+
+  // Exit with appropriate code
+  if (passed) {
+    process.exit(0)
+  }
+  process.exit(1)
+}
+
+// Any unexpected failure (I/O, API error) is reported with exit code 2
+main().catch((err) => {
+  console.error('Benchmark failed:', err)
+  process.exit(2)
+})