383 lines
11 KiB
TypeScript
383 lines
11 KiB
TypeScript
import { readFileSync, writeFileSync, readdirSync, mkdirSync, existsSync } from 'node:fs'
|
|
import { resolve } from 'node:path'
|
|
import { buildEmbeddingTexts } from '@/lib/search'
|
|
|
|
/**
 * Loads `KEY=value` pairs from a `.env` file into `process.env` by hand,
 * so the script does not need the `dotenv` dependency.
 *
 * Parsing rules:
 * - blank lines and lines starting with `#` are skipped;
 * - each line is split on its first `=`, so values may contain `=`;
 * - whitespace around the key and around the value is ignored;
 * - one pair of matching single or double quotes around the value is removed;
 * - a variable that already has a non-empty value in `process.env` is kept.
 *
 * A missing file is not an error: the function returns without doing anything.
 *
 * @param envPath - File to read. Defaults to `.env` in the parent of this
 *   script's directory.
 */
function loadEnvFile(envPath: string = resolve(import.meta.dirname, '..', '.env')): void {
  if (!existsSync(envPath)) return

  const content = readFileSync(envPath, 'utf-8')
  for (const line of content.split('\n')) {
    // trim() also removes the trailing '\r' of CRLF-terminated files.
    const trimmed = line.trim()
    if (!trimmed || trimmed.startsWith('#')) continue

    const eqIndex = trimmed.indexOf('=')
    if (eqIndex === -1) continue

    // Without trimming, `KEY = value` would define a variable named "KEY ".
    const key = trimmed.slice(0, eqIndex).trim()
    if (!key) continue

    let value = trimmed.slice(eqIndex + 1).trim()
    const quote = value.charAt(0)
    const isQuoted = value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)
    if (isQuoted) {
      value = value.slice(1, -1)
    }

    // Values supplied by the real environment take precedence over the file.
    if (!process.env[key]) {
      process.env[key] = value
    }
  }
}
|
|
// Runs at module load, so .env values are in process.env before main() executes.
loadEnvFile()
|
|
|
|
// --- Types ---
|
|
|
|
/** One benchmark entry as read from the `questions` array of the config file. */
interface BenchmarkQuestion {
  /** Identifier echoed in console output and copied into the saved result. */
  id: string
  /** Text sent to the model as the user message. */
  question: string
  /** Reference answer the evaluator compares the model's answer against. */
  expectedAnswer: string
  /** Facts listed in the scoring prompt under "Key facts for score 2". */
  keyFacts: string[]
}
|
|
|
|
/** Shape of `benchmark-config.json`. */
interface BenchmarkConfig {
  /** Minimum total score a run needs in order to pass. */
  passThreshold: number
  /** Highest achievable total; reported as `maxPossibleScore` in the results. */
  maxScore: number
  /** Questions to ask, processed in array order. */
  questions: BenchmarkQuestion[]
}
|
|
|
|
/** Verdict returned by the evaluator for a single answer. */
interface ScoringResult {
  /** Rubric score: 2 = accurate, 1 = partial, 0 = incorrect. */
  score: 0 | 1 | 2
  /** Short explanation of why the score was given. */
  justification: string
}
|
|
|
|
/** Outcome of one benchmark question as stored in the results file. */
interface QuestionResult {
  /** Identifier of the question this result belongs to. */
  id: string
  /** The question that was asked. */
  question: string
  /** Reference answer from the config. */
  expectedAnswer: string
  /** Answer the model actually produced. */
  actualAnswer: string
  /** Score assigned by the evaluator. */
  score: number
  /** Evaluator's explanation for the score. */
  justification: string
}
|
|
|
|
/** Complete record of one benchmark run, written to `iteration-<n>.json`. */
interface BenchmarkResults {
  /** Sequence number of this run. */
  iteration: number
  /** ISO-8601 time at which the results were assembled. */
  timestamp: string
  /** Model identifier the run was executed against. */
  model: string
  /** Sum of all per-question scores. */
  totalScore: number
  /** Highest achievable total, taken from the config. */
  maxPossibleScore: number
  /** Minimum total required to pass, taken from the config. */
  passThreshold: number
  /** True when the total reaches the threshold and no question scored zero. */
  passed: boolean
  /** True when at least one question scored zero. */
  hasZeros: boolean
  /** Per-question outcomes in the order the questions were run. */
  results: QuestionResult[]
}
|
|
|
|
// --- Gemini API ---
|
|
|
|
/** Gemini model identifier; also recorded in the saved results. */
const GEMINI_MODEL = 'gemini-3-flash-preview'

/** Endpoint prefix for the model; the method suffix (`:generateContent`) is appended per request. */
const GEMINI_API_BASE = 'https://generativelanguage.googleapis.com/v1beta/models/' + GEMINI_MODEL
|
|
|
|
/**
 * Reads the Gemini API key from the `VITE_GEMINI_API_KEY` environment variable.
 *
 * @returns The configured key.
 * @throws Error when the variable is unset or empty.
 */
function getApiKey(): string {
  const configuredKey = process.env.VITE_GEMINI_API_KEY
  if (configuredKey) return configuredKey

  throw new Error('VITE_GEMINI_API_KEY not set. Ensure .env file exists with this key.')
}
|
|
|
|
/**
 * Assembles the system prompt for the portfolio assistant.
 *
 * Every entry returned by `buildEmbeddingTexts()` becomes one `- <text>`
 * bullet in the profile section, which is followed by the answering rules
 * and the `[ITEMS: ...]` reference format.
 *
 * @returns The full prompt as a single newline-separated string.
 */
function buildSystemPrompt(): string {
  const profileBullets: string[] = []
  for (const entry of buildEmbeddingTexts()) {
    profileBullets.push(`- ${entry.text}`)
  }

  // One array element per prompt line; '' marks an intentional blank line.
  const promptLines = [
    "You are an AI assistant on Andy Charlwood's portfolio website. Answer questions about his experience, skills, projects, and qualifications.",
    '',
    "## Andy's Professional Profile",
    '',
    profileBullets.join('\n'),
    '',
    '## Rules',
    '1. Use ONLY the profile above. Never invent roles, dates, or achievements.',
    '2. Be concise (2-4 sentences). Be professional but friendly.',
    "3. If the information isn't in the profile, say so.",
    '',
    '## Item References',
    'After your answer, on a NEW line, list relevant portfolio item IDs:',
    '[ITEMS: id1, id2, id3]',
    '- IDs match the profile entries above (exp-*, skill-*, proj-*, ach-*, edu-*, action-*).',
    '- Only include IDs directly relevant to your answer.',
    '- If no items are relevant, omit the [ITEMS: ...] line entirely.',
  ]

  return promptLines.join('\n')
}
|
|
|
|
/**
 * Returns a promise that resolves after a delay.
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms)
  })
}
|
|
|
|
/**
 * Sends one `generateContent` request to Gemini and returns the text of the
 * first part of the first candidate.
 *
 * Responses with status 429 or 503 are retried, up to five attempts in total.
 * The wait before a retry is taken from a `retry in <n>s` hint in the error
 * body (rounded up, plus 2 seconds) when present, otherwise it is
 * `(attempt + 1) * 15` seconds. No wait happens after the final attempt.
 *
 * The API key is sent in the `x-goog-api-key` header rather than in the URL,
 * so it cannot end up in logged or proxied request URLs.
 *
 * @param systemPrompt - Text sent as the system instruction.
 * @param userMessage - Text sent as the single user turn.
 * @param temperature - Sampling temperature (default 0.7).
 * @param maxOutputTokens - Output token cap (default 512).
 * @returns The generated text.
 * @throws Error on any other non-OK status, when the response contains no
 *   text, or when every attempt was answered with 429/503.
 */
async function callGemini(
  systemPrompt: string,
  userMessage: string,
  temperature = 0.7,
  maxOutputTokens = 512,
): Promise<string> {
  const apiKey = getApiKey()
  const maxRetries = 5
  let lastStatus = 0

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const response = await fetch(`${GEMINI_API_BASE}:generateContent`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-goog-api-key': apiKey,
      },
      body: JSON.stringify({
        system_instruction: {
          parts: [{ text: systemPrompt }],
        },
        contents: [
          { role: 'user', parts: [{ text: userMessage }] },
        ],
        generationConfig: {
          temperature,
          maxOutputTokens,
        },
      }),
    })

    if (response.status === 429 || response.status === 503) {
      lastStatus = response.status
      const errorBody = await response.text()
      const reason = response.status === 429 ? 'Rate limited' : 'Service unavailable'

      // Sleeping after the last attempt would only delay the inevitable throw.
      if (attempt === maxRetries - 1) {
        console.log(` ${reason}. Giving up after ${maxRetries} attempts.`)
        break
      }

      const retryMatch = errorBody.match(/retry in ([\d.]+)s/)
      const waitSeconds = retryMatch ? Math.ceil(parseFloat(retryMatch[1])) + 2 : (attempt + 1) * 15
      console.log(` ${reason}. Waiting ${waitSeconds}s (attempt ${attempt + 1}/${maxRetries})...`)
      await sleep(waitSeconds * 1000)
      continue
    }

    if (!response.ok) {
      const errorBody = await response.text()
      throw new Error(`Gemini API error ${response.status}: ${errorBody}`)
    }

    const data = await response.json()
    const text = data?.candidates?.[0]?.content?.parts?.[0]?.text
    if (!text) {
      throw new Error(`No text in Gemini response: ${JSON.stringify(data)}`)
    }
    return text
  }

  throw new Error(`Max retries (${maxRetries}) exceeded; last response status was ${lastStatus}`)
}
|
|
|
|
// --- Scoring ---
|
|
|
|
/** Reports whether `candidate` is accepted by `JSON.parse`. */
function isParseableJson(candidate: string): boolean {
  try {
    JSON.parse(candidate)
    return true
  } catch {
    return false
  }
}

/**
 * Returns the brace-balanced `{...}` span that starts at `start`, or `null`
 * when the braces never balance. Braces inside double-quoted strings are
 * ignored, and backslash escapes are honoured only inside strings.
 */
function sliceBalancedObject(text: string, start: number): string | null {
  let depth = 0
  let inString = false
  for (let i = start; i < text.length; i++) {
    const ch = text[i]
    if (inString) {
      if (ch === '\\') i++ // skip the escaped character
      else if (ch === '"') inString = false
      continue
    }
    if (ch === '"') inString = true
    else if (ch === '{') depth++
    else if (ch === '}' && --depth === 0) return text.slice(start, i + 1)
  }
  return null
}

/**
 * Extracts a JSON document from model output that may wrap it in markdown
 * code fences or surround it with prose.
 *
 * Candidates are tried in this order, and the first one that actually parses
 * is returned:
 * 1. the whole text;
 * 2. the content of the first code fence;
 * 3. each brace-balanced `{...}` span, scanning left to right.
 *
 * Validating every candidate means stray braces in a preamble (for example
 * `{curly}`) or prose inside a fence no longer hide the real object.
 *
 * @param text - Raw model output.
 * @returns The JSON substring, or `null` when no parseable candidate exists.
 */
function extractJson(text: string): string | null {
  if (isParseableJson(text)) return text

  const fenceMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/)
  if (fenceMatch) {
    const fenced = fenceMatch[1].trim()
    if (isParseableJson(fenced)) return fenced
  }

  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
    const candidate = sliceBalancedObject(text, start)
    if (candidate !== null && isParseableJson(candidate)) return candidate
  }

  return null
}
|
|
|
|
/**
 * Asks Gemini, at temperature 0, to grade an answer against the expected
 * answer on a 0-2 rubric.
 *
 * The evaluator is instructed to reply with a single-line JSON object. The
 * reply is passed through `extractJson` and then validated field by field, so
 * the returned object always has a score of 0, 1 or 2 and a string
 * justification. Any reply that cannot be used yields a score of 0 with a
 * justification describing the problem; such replies never throw.
 *
 * @param question - The benchmark question.
 * @param expectedAnswer - Reference answer.
 * @param keyFacts - Facts listed in the prompt as required for a score of 2.
 * @param actualAnswer - The answer being graded.
 * @returns The validated score and justification.
 * @throws Whatever `callGemini` throws when the request itself fails.
 */
async function scoreAnswer(
  question: string,
  expectedAnswer: string,
  keyFacts: string[],
  actualAnswer: string,
): Promise<ScoringResult> {
  const scoringPrompt = `You are a strict evaluator. Compare an ACTUAL answer to an EXPECTED answer about a person's CV.

Rubric:
- 2 = ACCURATE: Covers key facts correctly. Minor omissions OK if no errors.
- 1 = PARTIAL: Some key facts right but misses important details or is vague.
- 0 = INCORRECT: Contains factual errors, contradicts expected answer, or misses the point.

Key facts for score 2:
${keyFacts.map((f) => `- ${f}`).join('\n')}

IMPORTANT: Respond with ONLY a single-line JSON object. No markdown, no code fences, no extra text.
Example: {"score":2,"justification":"Covers all key facts accurately"}
Keep justification under 30 words.`

  const userMessage = `QUESTION: ${question}

EXPECTED ANSWER: ${expectedAnswer}

ACTUAL ANSWER: ${actualAnswer}`

  const rawResponse = await callGemini(scoringPrompt, userMessage, 0, 512)

  // Extract JSON — handle code fences, preamble text, multiline responses
  const extracted = extractJson(rawResponse)
  if (!extracted) {
    console.warn(` Warning: Could not extract JSON from scoring response: ${rawResponse.slice(0, 200)}`)
    return { score: 0, justification: `Failed to parse scoring response` }
  }

  let parsed: unknown
  try {
    parsed = JSON.parse(extracted)
  } catch {
    console.warn(` Warning: Invalid JSON: ${extracted.slice(0, 150)}`)
    return { score: 0, justification: `Invalid JSON in response` }
  }

  // The reply is untrusted model output: check each field instead of casting.
  const fields: { score?: unknown; justification?: unknown } =
    typeof parsed === 'object' && parsed !== null ? parsed : {}

  const { score } = fields
  if (score !== 0 && score !== 1 && score !== 2) {
    console.warn(` Warning: Invalid score value: ${String(score)}`)
    return { score: 0, justification: `Invalid score value: ${String(score)}` }
  }

  // Callers read justification.length, so it must always be a string.
  const justification =
    typeof fields.justification === 'string' ? fields.justification : 'No justification provided'

  return { score, justification }
}
|
|
|
|
// --- Iteration Management ---
|
|
|
|
/**
 * Determines the iteration number for the next results file.
 *
 * Looks at entries in `resultsDir` named `iteration-*.json` and returns one
 * more than the highest number found in them.
 *
 * @param resultsDir - Directory holding earlier `iteration-<n>.json` files.
 * @returns 0 when the directory is missing or has no numbered result files,
 *   otherwise the highest existing number plus one.
 */
function getNextIteration(resultsDir: string): number {
  if (!existsSync(resultsDir)) return 0

  // Start at -1 so that "nothing found" falls out as 0 below.
  let highest = -1
  for (const entry of readdirSync(resultsDir)) {
    if (!entry.startsWith('iteration-') || !entry.endsWith('.json')) continue

    const numbered = /iteration-(\d+)\.json/.exec(entry)
    const value = numbered ? parseInt(numbered[1], 10) : -1
    if (value > highest) highest = value
  }

  return highest + 1
}
|
|
|
|
// --- Console Output ---
|
|
|
|
/**
 * Prints a benchmark run to the console as a fixed-width table: a header
 * with iteration, model and timestamp, one row per question, and a totals
 * line with the pass/fail verdict.
 *
 * @param results - The run to display.
 */
function printSummary(results: BenchmarkResults): void {
  const heavyRule = '='.repeat(80)
  const lightRule = '-'.repeat(80)

  // Shortens text longer than `limit` to `keep` characters plus an ellipsis.
  const clip = (text: string, limit: number, keep: number): string =>
    text.length > limit ? text.slice(0, keep) + '...' : text

  // Column widths: ID 6, score 8, question 50; the last column is unpadded.
  const formatRow = (id: string, score: string, question: string, note: string): string =>
    id.padEnd(6) + score.padEnd(8) + question.padEnd(50) + note

  console.log('\n' + heavyRule)
  console.log(`BENCHMARK RESULTS — Iteration ${results.iteration}`)
  console.log(`Model: ${results.model} | ${results.timestamp}`)
  console.log(heavyRule)

  console.log(formatRow('ID', 'Score', 'Question', 'Justification'))
  console.log(lightRule)

  for (const entry of results.results) {
    let scoreLabel = '0 ✗'
    if (entry.score === 2) {
      scoreLabel = '2 ✓'
    } else if (entry.score === 1) {
      scoreLabel = '1 ~'
    }

    console.log(
      formatRow(
        entry.id,
        scoreLabel,
        clip(entry.question, 47, 44),
        clip(entry.justification, 60, 57),
      ),
    )
  }

  console.log(lightRule)

  const totalsLine = [
    `TOTAL: ${results.totalScore}/${results.maxPossibleScore}`,
    `Threshold: ${results.passThreshold}/${results.maxPossibleScore}`,
    `Has zeros: ${results.hasZeros ? 'YES' : 'No'}`,
    results.passed ? 'PASSED ✓' : 'FAILED ✗',
  ].join(' | ')
  console.log(totalsLine)
  console.log(heavyRule)
}
|
|
|
|
// --- Main ---
|
|
|
|
/**
 * Runs one benchmark iteration end to end.
 *
 * Steps: read `benchmark-config.json` next to this script, pick the next
 * iteration number, build the system prompt, then for each question obtain an
 * answer from Gemini and have it scored. The outcome is written to
 * `benchmark-results/iteration-<n>.json` and printed as a table.
 *
 * A run passes when the total score reaches the configured threshold and no
 * question scored zero. The process exits with code 0 on a pass and 1
 * otherwise.
 */
async function main() {
  const baseDir = import.meta.dirname
  const configFile = resolve(baseDir, 'benchmark-config.json')
  const outputDir = resolve(baseDir, 'benchmark-results')

  // Read the question set and thresholds.
  const rawConfig = readFileSync(configFile, 'utf-8')
  const config: BenchmarkConfig = JSON.parse(rawConfig)
  console.log(`Loaded ${config.questions.length} benchmark questions.`)

  // Number this run after the existing result files.
  const iteration = getNextIteration(outputDir)
  console.log(`Running iteration ${iteration}...`)

  // Build system prompt (same as production)
  const systemPrompt = buildSystemPrompt()
  console.log(`System prompt built (${systemPrompt.length} chars).`)

  // Ask and grade the questions one at a time, tallying as we go.
  const questionResults: QuestionResult[] = []
  let totalScore = 0
  let hasZeros = false

  for (const { id, question, expectedAnswer, keyFacts } of config.questions) {
    console.log(`\n[${id}] ${question}`)

    console.log(' Getting answer...')
    const actualAnswer = await callGemini(systemPrompt, question)
    console.log(` Answer: ${actualAnswer.slice(0, 100)}...`)

    console.log(' Scoring...')
    const graded = await scoreAnswer(question, expectedAnswer, keyFacts, actualAnswer)
    console.log(` Score: ${graded.score}/2 — ${graded.justification}`)

    totalScore += graded.score
    if (graded.score === 0) {
      hasZeros = true
    }

    questionResults.push({
      id,
      question,
      expectedAnswer,
      actualAnswer,
      score: graded.score,
      justification: graded.justification,
    })
  }

  // A single zero fails the run regardless of the total.
  const passed = !hasZeros && totalScore >= config.passThreshold

  const results: BenchmarkResults = {
    iteration,
    timestamp: new Date().toISOString(),
    model: GEMINI_MODEL,
    totalScore,
    maxPossibleScore: config.maxScore,
    passThreshold: config.passThreshold,
    passed,
    hasZeros,
    results: questionResults,
  }

  // Persist the run before printing it.
  mkdirSync(outputDir, { recursive: true })
  const resultsPath = resolve(outputDir, `iteration-${iteration}.json`)
  writeFileSync(resultsPath, JSON.stringify(results, null, 2))
  console.log(`\nResults saved to ${resultsPath}`)

  printSummary(results)

  // Exit code reflects the verdict: 0 = passed, 1 = failed.
  process.exit(passed ? 0 : 1)
}
|
|
|
|
/**
 * Logs an error that escaped `main` and exits with code 2, which keeps
 * crashes distinct from the 0/1 exit codes `main` uses for pass/fail.
 */
function reportFatalError(failure: unknown): void {
  console.error('Benchmark failed:', failure)
  process.exit(2)
}

main().catch(reportFatalError)
|