feat: US-003 - Generate and commit embeddings.json

This commit is contained in:
2026-02-15 17:55:53 +00:00
parent 219a3f04be
commit aa1774320a
4 changed files with 16374 additions and 7 deletions
+1 -1
View File
@@ -53,7 +53,7 @@
"Typecheck passes"
],
"priority": 3,
"passes": false,
"passes": true,
"notes": "The pipeline returns a Tensor — use .tolist() or .data to extract the raw float array. Mean-pool across the token dimension (dim 1) to get a single 384-d vector per input. Process items sequentially to avoid OOM in Node. The output file will be ~200KB for ~40 items with 384 floats each."
},
{
+17
View File
@@ -9,6 +9,7 @@
- Project uses `"type": "module"` in package.json
- Palette item IDs: `exp-{consultation.id}`, `skill-{skill.id}`, `proj-{investigation.id}`, `ach-{0-3}`, `edu-{0-3}`, `action-{0-3}`
- `buildEmbeddingTexts()` in `src/lib/search.ts` returns `Array<{ id: string, text: string }>` with IDs matching PaletteItem IDs — use this for both embedding generation and chat context
- `src/data/embeddings.json` is an array of `{ id: string, embedding: number[] }` — 42 items, 384-d vectors, IDs match PaletteItem IDs. Vite imports JSON natively.
---
@@ -45,3 +46,19 @@
- Quick action items are `action-0` through `action-3`
- `documents.ts` was already imported in `search.ts` but previously unused — it is now used for the education embedding text
---
## 2026-02-15 - US-003
- Updated `scripts/generate-embeddings.ts` to import `buildEmbeddingTexts()` and generate full embeddings
- Script embeds all 42 palette items sequentially using `Xenova/all-MiniLM-L6-v2`
- Outputs `src/data/embeddings.json` as `Array<{ id: string, embedding: number[] }>`
- Each embedding is a 384-dimensional float array
- File is ~453KB (42 items × 384 floats with pretty-printed JSON)
- `npm run generate-embeddings` regenerates the file successfully
- Typecheck and lint pass
- Files changed: `scripts/generate-embeddings.ts`, `src/data/embeddings.json`
- **Learnings for future iterations:**
- `import.meta.dirname` works in tsx/Node ESM scripts (requires Node 20.11+ or 21.2+) — use it instead of `__dirname` (which isn't available in ESM)
- `@/` path alias works in `npx tsx` scripts because tsx resolves tsconfig paths automatically
- The embeddings file is ~450KB with pretty-print; could be reduced with compact JSON but readability is preferred for now
- Processing 42 items takes ~10-15 seconds on first run (model cached after first download)
---
+15 -5
View File
@@ -1,17 +1,27 @@
import { writeFileSync } from 'node:fs'
import { resolve } from 'node:path'
import { pipeline } from '@xenova/transformers'
import { buildEmbeddingTexts } from '@/lib/search'
/**
 * Generates one embedding per palette item and writes them all to
 * `src/data/embeddings.json`.
 *
 * Steps:
 *  1. Collect the `{ id, text }` pairs from `buildEmbeddingTexts()`.
 *  2. Load the `Xenova/all-MiniLM-L6-v2` feature-extraction pipeline.
 *  3. Embed each item's text with mean pooling and normalization enabled.
 *  4. Write the results as pretty-printed JSON, shaped as
 *     `Array<{ id: string, embedding: number[] }>`.
 *
 * Rejections from model loading or embedding, and any exception thrown while
 * writing the file, propagate to the caller.
 */
async function main(): Promise<void> {
  const items = buildEmbeddingTexts()
  console.log(`Found ${items.length} items to embed.`)

  console.log('Loading all-MiniLM-L6-v2 model...')
  const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2')

  const embeddings: Array<{ id: string; embedding: number[] }> = []

  // Items are embedded one at a time (each call is awaited before the next
  // starts) rather than in parallel.
  for (const item of items) {
    const output = await extractor(item.text, { pooling: 'mean', normalize: true })
    // Copy the typed array into a plain number[] so it serializes as a JSON array.
    const vector = Array.from(output.data as Float32Array)
    embeddings.push({ id: item.id, embedding: vector })
    console.log(` [${embeddings.length}/${items.length}] ${item.id} (${vector.length}d)`)
  }
  console.log('Done.')

  // Resolve relative to this script's directory so the output location does
  // not depend on the current working directory.
  const outPath = resolve(import.meta.dirname, '..', 'src', 'data', 'embeddings.json')
  writeFileSync(outPath, JSON.stringify(embeddings, null, 2))
  console.log(`\nWrote ${embeddings.length} embeddings to ${outPath}`)
}
main().catch((err) => {
File diff suppressed because it is too large Load Diff