import { readFileSync, writeFileSync, readdirSync, mkdirSync, existsSync } from 'node:fs'
import { resolve } from 'node:path'
import { buildEmbeddingTexts } from '@/lib/search'

/**
 * Removes one pair of matching surrounding quotes (single or double) from a
 * `.env` value, so `KEY="abc"` yields `abc` rather than `"abc"`.
 */
function stripMatchingQuotes(value: string): string {
  if (value.length >= 2) {
    const first = value[0]
    if ((first === '"' || first === "'") && value.endsWith(first)) {
      return value.slice(1, -1)
    }
  }
  return value
}

/**
 * Loads `KEY=value` pairs from the `.env` file one directory above this script
 * into `process.env`. Done by hand to avoid adding a dotenv dependency.
 *
 * Blank lines, `#` comments and lines without `=` are skipped. Variables that
 * already have a non-empty value in the environment are left untouched.
 * Does nothing when the file does not exist.
 */
function loadEnvFile(): void {
  const envPath = resolve(import.meta.dirname, '..', '.env')
  if (!existsSync(envPath)) return

  const content = readFileSync(envPath, 'utf-8')
  for (const line of content.split('\n')) {
    // trim() also drops the trailing '\r' of CRLF files
    const trimmed = line.trim()
    if (!trimmed || trimmed.startsWith('#')) continue

    const eqIndex = trimmed.indexOf('=')
    if (eqIndex === -1) continue

    const key = trimmed.slice(0, eqIndex).trim()
    if (!key) continue
    const value = stripMatchingQuotes(trimmed.slice(eqIndex + 1).trim())

    if (!process.env[key]) {
      process.env[key] = value
    }
  }
}

loadEnvFile()

// --- Types ---

/** One benchmark question together with the reference answer used for scoring. */
interface BenchmarkQuestion {
  id: string
  question: string
  expectedAnswer: string
  keyFacts: string[]
}

/** Shape of `benchmark-config.json`. */
interface BenchmarkConfig {
  passThreshold: number
  maxScore: number
  questions: BenchmarkQuestion[]
}

/** Verdict returned by the scoring call: 0 = incorrect, 1 = partial, 2 = accurate. */
interface ScoringResult {
  score: 0 | 1 | 2
  justification: string
}

/** A question, the answer the model gave, and how that answer was scored. */
interface QuestionResult {
  id: string
  question: string
  expectedAnswer: string
  actualAnswer: string
  score: number
  justification: string
}

/** Everything written to `iteration-N.json` for one benchmark run. */
interface BenchmarkResults {
  iteration: number
  timestamp: string
  model: string
  totalScore: number
  maxPossibleScore: number
  passThreshold: number
  passed: boolean
  hasZeros: boolean
  results: QuestionResult[]
}

/** The subset of the chat-completions response body that this script reads. */
interface ChatCompletionResponse {
  choices?: Array<{ message?: { content?: unknown } }>
}

// --- OpenRouter API ---

const LLM_MODEL = 'z-ai/glm-5'
const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1/chat/completions'

/**
 * Returns the OpenRouter API key from `VITE_OPEN_ROUTER_API_KEY`.
 *
 * @throws Error when the variable is unset or empty.
 */
function getApiKey(): string {
  const key = process.env.VITE_OPEN_ROUTER_API_KEY
  if (!key) {
    throw new Error('VITE_OPEN_ROUTER_API_KEY not set. Ensure .env file exists with this key.')
  }
  return key
}

/**
 * Builds the system prompt: every entry from `buildEmbeddingTexts()` rendered
 * as `[id] text`, followed by the response rules and the item-reference format.
 *
 * NOTE(review): the line breaks inside this template were reconstructed from
 * its markdown structure; `main` says the prompt should match production
 * llm.ts, so confirm the exact layout against that file.
 */
function buildSystemPrompt(): string {
  const texts = buildEmbeddingTexts()
  const cvContent = texts.map((t) => `[${t.id}] ${t.text}`).join('\n')

  return `You are a helpful assistant on Andy Charlwood's portfolio website.

## Profile Data
Each entry is prefixed with its ID in square brackets.

${cvContent}

## Response Rules
- Answer ONLY from the profile data above. Never invent facts, roles, dates, or achievements.
- Be concise: 2-4 sentences. Professional and friendly tone.
- If the answer isn't in the profile, say so honestly.
- Do not fabricate URLs, email addresses, or contact details.

## Item References
End your response with a single line listing relevant item IDs:
[ITEMS: exp-nhs-nwicb, skill-python]
Only include IDs that directly support your answer. Omit the line if none are relevant.`
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  // Named `done` so it does not shadow `resolve` imported from node:path.
  return new Promise((done) => setTimeout(done, ms))
}

/**
 * Sends one system + user message pair to OpenRouter and returns the text of
 * the first choice.
 *
 * HTTP 429 and 503 responses are retried, up to 5 attempts in total. The wait
 * is the "retry in Ns" hint from the error body plus 2 seconds when that hint
 * is present and numeric, otherwise 15 seconds multiplied by the attempt number.
 *
 * @param systemPrompt content of the `system` message
 * @param userMessage content of the `user` message
 * @param temperature sampling temperature sent with the request
 * @param maxTokens value sent as `max_tokens`
 * @returns the non-empty message content of the first choice
 * @throws Error on any other non-OK status, when the response carries no text
 *   content, or when every attempt was answered with 429/503.
 */
async function callLLM(
  systemPrompt: string,
  userMessage: string,
  temperature = 0.7,
  maxTokens = 512,
): Promise<string> {
  const apiKey = getApiKey()
  const maxRetries = 5

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const response = await fetch(OPENROUTER_API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
        'HTTP-Referer': 'https://andycharlwood.co.uk',
        'X-Title': 'Andy Charlwood Portfolio',
      },
      body: JSON.stringify({
        model: LLM_MODEL,
        temperature,
        max_tokens: maxTokens,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userMessage },
        ],
      }),
    })

    if (response.status === 429 || response.status === 503) {
      const errorBody = await response.text()
      const retryHint = /retry in ([\d.]+)s/.exec(errorBody)?.[1]
      const hintedSeconds = retryHint === undefined ? Number.NaN : parseFloat(retryHint)
      // A hint such as "retry in .s" parses to NaN, which would make the
      // timer fire immediately; fall back to the linear backoff instead.
      const waitSeconds = Number.isFinite(hintedSeconds)
        ? Math.ceil(hintedSeconds) + 2
        : (attempt + 1) * 15
      const reason = response.status === 429 ? 'Rate limited' : 'Service unavailable'
      console.log(` ${reason}. Waiting ${waitSeconds}s (attempt ${attempt + 1}/${maxRetries})...`)
      await sleep(waitSeconds * 1000)
      continue
    }

    if (!response.ok) {
      const errorBody = await response.text()
      throw new Error(`OpenRouter API error ${response.status}: ${errorBody}`)
    }

    const data = (await response.json()) as ChatCompletionResponse | null
    const text = data?.choices?.[0]?.message?.content
    // Require a real, non-empty string so callers can slice/print it safely.
    if (typeof text !== 'string' || text.length === 0) {
      throw new Error(`No text in OpenRouter response: ${JSON.stringify(data)}`)
    }
    return text
  }

  throw new Error('Max retries exceeded for rate limiting')
}

// --- Scoring ---

/**
 * Pulls a JSON candidate out of a model response.
 *
 * Tried in order:
 * 1. the whole text, if it parses as JSON;
 * 2. the contents of the first markdown code fence (returned trimmed, not validated);
 * 3. the first brace-balanced `{ ... }` block, ignoring braces inside string literals.
 *
 * @returns the candidate text, or `null` when no `{` is found or the first
 *   block never closes. Cases 2 and 3 are not guaranteed to be valid JSON.
 */
function extractJson(text: string): string | null {
  // Try parsing directly first
  try {
    JSON.parse(text)
    return text
  } catch {
    /* not direct JSON, continue extraction */
  }

  // Strip markdown code fences
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(text)?.[1]
  if (fenced !== undefined) {
    return fenced.trim()
  }

  // Find first { ... } block
  const braceStart = text.indexOf('{')
  if (braceStart === -1) return null

  // Find matching closing brace
  let depth = 0
  let inString = false
  let escaped = false
  for (let i = braceStart; i < text.length; i++) {
    const ch = text[i]
    if (inString) {
      // Escapes only have meaning inside a string literal.
      if (escaped) {
        escaped = false
      } else if (ch === '\\') {
        escaped = true
      } else if (ch === '"') {
        inString = false
      }
      continue
    }
    if (ch === '"') {
      inString = true
    } else if (ch === '{') {
      depth++
    } else if (ch === '}') {
      depth--
      if (depth === 0) return text.slice(braceStart, i + 1)
    }
  }

  return null
}

/**
 * Asks the model (temperature 0) to grade `actualAnswer` against
 * `expectedAnswer` on a 0-2 rubric and parses its JSON verdict.
 *
 * Never throws because of a malformed verdict: an unparsable response or a
 * score outside 0/1/2 is logged as a warning and returned as score 0. A
 * missing or non-string justification is replaced with a placeholder so that
 * callers can always treat it as a string. Errors from `callLLM` propagate.
 *
 * NOTE(review): the line breaks inside the two prompt templates were
 * reconstructed from their structure — confirm against the intended wording.
 */
async function scoreAnswer(
  question: string,
  expectedAnswer: string,
  keyFacts: string[],
  actualAnswer: string,
): Promise<ScoringResult> {
  const scoringPrompt = `You are a strict evaluator. Compare an ACTUAL answer to an EXPECTED answer about a person's CV.

Rubric:
- 2 = ACCURATE: Covers key facts correctly. Minor omissions OK if no errors.
- 1 = PARTIAL: Some key facts right but misses important details or is vague.
- 0 = INCORRECT: Contains factual errors, contradicts expected answer, or misses the point.

Key facts for score 2:
${keyFacts.map((f) => `- ${f}`).join('\n')}

IMPORTANT: Respond with ONLY a single-line JSON object. No markdown, no code fences, no extra text.
Example: {"score":2,"justification":"Covers all key facts accurately"}
Keep justification under 30 words.`

  const userMessage = `QUESTION: ${question}

EXPECTED ANSWER: ${expectedAnswer}

ACTUAL ANSWER: ${actualAnswer}`

  const rawResponse = await callLLM(scoringPrompt, userMessage, 0, 512)

  // Extract JSON — handle code fences, preamble text, multiline responses
  const extracted = extractJson(rawResponse)
  if (!extracted) {
    console.warn(` Warning: Could not extract JSON from scoring response: ${rawResponse.slice(0, 200)}`)
    return { score: 0, justification: `Failed to parse scoring response` }
  }

  let parsed: unknown
  try {
    parsed = JSON.parse(extracted)
  } catch {
    console.warn(` Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
    return { score: 0, justification: `Invalid JSON in response` }
  }

  // The verdict comes from a model, so validate its shape instead of trusting it.
  const verdict: { score?: unknown; justification?: unknown } =
    typeof parsed === 'object' && parsed !== null ? parsed : {}
  const { score, justification } = verdict

  if (score !== 0 && score !== 1 && score !== 2) {
    console.warn(` Warning: Invalid score value: ${String(score)}`)
    return { score: 0, justification: `Invalid score value: ${String(score)}` }
  }

  return {
    score,
    justification: typeof justification === 'string' ? justification : 'No justification provided',
  }
}

// --- Iteration Management ---

/**
 * Returns the next free iteration number: one more than the highest `N` among
 * files named exactly `iteration-N.json` in `resultsDir`, or 0 when the
 * directory is missing or holds no such file.
 */
function getNextIteration(resultsDir: string): number {
  if (!existsSync(resultsDir)) return 0

  let highest = -1
  for (const file of readdirSync(resultsDir)) {
    // Anchored so unrelated names that merely contain the pattern are ignored.
    const digits = /^iteration-(\d+)\.json$/.exec(file)?.[1]
    if (digits === undefined) continue
    highest = Math.max(highest, parseInt(digits, 10))
  }
  return highest + 1
}

// --- Console Output ---

/**
 * Prints an 80-column summary: one row per question (id, score, truncated
 * question, truncated justification) followed by the totals and the verdict.
 */
function printSummary(results: BenchmarkResults): void {
  console.log('\n' + '='.repeat(80))
  console.log(`BENCHMARK RESULTS — Iteration ${results.iteration}`)
  console.log(`Model: ${results.model} | ${results.timestamp}`)
  console.log('='.repeat(80))

  // Table header
  console.log(
    'ID'.padEnd(6) +
    'Score'.padEnd(8) +
    'Question'.padEnd(50) +
    'Justification'
  )
  console.log('-'.repeat(80))

  for (const r of results.results) {
    const scoreLabel = r.score === 2 ? '2 ✓' : r.score === 1 ? '1 ~' : '0 ✗'
    const questionTruncated = r.question.length > 47
      ? r.question.slice(0, 44) + '...'
      : r.question
    const justTruncated = r.justification.length > 60
      ? r.justification.slice(0, 57) + '...'
      : r.justification

    console.log(
      r.id.padEnd(6) +
      scoreLabel.padEnd(8) +
      questionTruncated.padEnd(50) +
      justTruncated
    )
  }

  console.log('-'.repeat(80))
  console.log(
    `TOTAL: ${results.totalScore}/${results.maxPossibleScore}` +
    ` | Threshold: ${results.passThreshold}/${results.maxPossibleScore}` +
    ` | Has zeros: ${results.hasZeros ? 'YES' : 'No'}` +
    ` | ${results.passed ? 'PASSED ✓' : 'FAILED ✗'}`
  )
  console.log('='.repeat(80))
}

// --- Main ---

/**
 * Runs one benchmark iteration: loads `benchmark-config.json`, asks the model
 * each question in sequence, scores every answer, writes
 * `benchmark-results/iteration-N.json`, prints the summary and exits.
 *
 * Exit code 0 when the total reaches the pass threshold and no question scored
 * 0, otherwise 1.
 */
async function main(): Promise<void> {
  const scriptDir = import.meta.dirname
  const configPath = resolve(scriptDir, 'benchmark-config.json')
  const resultsDir = resolve(scriptDir, 'benchmark-results')

  // Load config
  const config: BenchmarkConfig = JSON.parse(readFileSync(configPath, 'utf-8'))
  console.log(`Loaded ${config.questions.length} benchmark questions.`)

  // Determine iteration number
  const iteration = getNextIteration(resultsDir)
  console.log(`Running iteration ${iteration}...`)

  // Build system prompt (same as production llm.ts)
  const systemPrompt = buildSystemPrompt()
  console.log(`System prompt built (${systemPrompt.length} chars).`)

  // Run each question, one at a time
  const questionResults: QuestionResult[] = []
  for (const q of config.questions) {
    console.log(`\n[${q.id}] ${q.question}`)

    // Get answer from LLM
    console.log(' Getting answer...')
    const actualAnswer = await callLLM(systemPrompt, q.question)
    console.log(` Answer: ${actualAnswer.slice(0, 100)}...`)

    // Score the answer
    console.log(' Scoring...')
    const { score, justification } = await scoreAnswer(
      q.question,
      q.expectedAnswer,
      q.keyFacts,
      actualAnswer,
    )
    console.log(` Score: ${score}/2 — ${justification}`)

    questionResults.push({
      id: q.id,
      question: q.question,
      expectedAnswer: q.expectedAnswer,
      actualAnswer,
      score,
      justification,
    })
  }

  // Calculate totals
  const totalScore = questionResults.reduce((sum, r) => sum + r.score, 0)
  const hasZeros = questionResults.some((r) => r.score === 0)
  // A single zero fails the run even when the total clears the threshold.
  const passed = totalScore >= config.passThreshold && !hasZeros

  const results: BenchmarkResults = {
    iteration,
    timestamp: new Date().toISOString(),
    model: LLM_MODEL,
    totalScore,
    maxPossibleScore: config.maxScore,
    passThreshold: config.passThreshold,
    passed,
    hasZeros,
    results: questionResults,
  }

  // Save results
  mkdirSync(resultsDir, { recursive: true })
  const resultsPath = resolve(resultsDir, `iteration-${iteration}.json`)
  writeFileSync(resultsPath, JSON.stringify(results, null, 2))
  console.log(`\nResults saved to ${resultsPath}`)

  // Print summary table
  printSummary(results)

  // Exit with appropriate code
  process.exit(passed ? 0 : 1)
}

// Exit code 2 marks a crashed run, as opposed to 1 for a completed but failed one.
main().catch((err: unknown) => {
  console.error('Benchmark failed:', err)
  process.exit(2)
})