feat: US-003 - Generate and commit embeddings.json

2026-02-15 17:55:53 +00:00
parent 219a3f04be
commit aa1774320a
4 changed files with 16374 additions and 7 deletions
@@ -53,7 +53,7 @@
        "Typecheck passes"
      ],
      "priority": 3,
-      "passes": false,
+      "passes": true,
      "notes": "The pipeline returns a Tensor — use .tolist() or .data to extract the raw float array. Mean-pool across the token dimension (dim 1) to get a single 384-d vector per input. Process items sequentially to avoid OOM in Node. The output file will be ~200KB for ~40 items with 384 floats each."
    },
    {
@@ -9,6 +9,7 @@
 - Project uses `"type": "module"` in package.json
 - Palette item IDs: `exp-{consultation.id}`, `skill-{skill.id}`, `proj-{investigation.id}`, `ach-{0-3}`, `edu-{0-3}`, `action-{0-3}`
 - `buildEmbeddingTexts()` in `src/lib/search.ts` returns `Array<{ id: string, text: string }>` with IDs matching PaletteItem IDs — use this for both embedding generation and chat context
+- `src/data/embeddings.json` is an array of `{ id: string, embedding: number[] }` — 42 items, 384-d vectors, IDs match PaletteItem IDs. Vite imports JSON natively.

 ---

@@ -45,3 +46,19 @@
  - Quick action items are `action-0` through `action-3`
  - `documents.ts` is imported but wasn't previously used in `search.ts` — now used for education embedding text
 ---
+
+## 2026-02-15 - US-003
+- Updated `scripts/generate-embeddings.ts` to import `buildEmbeddingTexts()` and generate full embeddings
+- Script embeds all 42 palette items sequentially using `Xenova/all-MiniLM-L6-v2`
+- Outputs `src/data/embeddings.json` as `Array<{ id: string, embedding: number[] }>`
+- Each embedding is a 384-dimensional float array
+- File is ~453KB (42 items × 384 floats with pretty-printed JSON)
+- `npm run generate-embeddings` regenerates the file successfully
+- Typecheck and lint pass
+- Files changed: `scripts/generate-embeddings.ts`, `src/data/embeddings.json`
+- **Learnings for future iterations:**
+  - `import.meta.dirname` works in tsx/Node ESM scripts on Node 20.11+ (or 21.2+) — use it instead of `__dirname`, which isn't available in ESM
+  - `@/` path alias works in `npx tsx` scripts because tsx resolves tsconfig paths automatically
+  - The embeddings file is ~450KB with pretty-print; could be reduced with compact JSON but readability is preferred for now
+  - Processing 42 items takes ~10-15 seconds on the first run; the model is downloaded once and cached for subsequent runs
+---
@@ -1,17 +1,27 @@
+import { writeFileSync } from 'node:fs'
+import { resolve } from 'node:path'
 import { pipeline } from '@xenova/transformers'
+import { buildEmbeddingTexts } from '@/lib/search'

 async function main() {
+  const items = buildEmbeddingTexts()
+  console.log(`Found ${items.length} items to embed.`)
+
  console.log('Loading all-MiniLM-L6-v2 model...')
  const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2')

-  const testString = 'This is a test string for embedding generation.'
-  console.log(`Embedding test string: "${testString}"`)
+  const embeddings: Array<{ id: string; embedding: number[] }> = []

-  const output = await extractor(testString, { pooling: 'mean', normalize: true })
+  for (const item of items) {
+    const output = await extractor(item.text, { pooling: 'mean', normalize: true })
    const vector = Array.from(output.data as Float32Array)
+    embeddings.push({ id: item.id, embedding: vector })
+    console.log(`  [${embeddings.length}/${items.length}] ${item.id} (${vector.length}d)`)
+  }

-  console.log(`Vector length: ${vector.length}`)
-  console.log('Done.')
+  const outPath = resolve(import.meta.dirname, '..', 'src', 'data', 'embeddings.json')
+  writeFileSync(outPath, JSON.stringify(embeddings, null, 2))
+  console.log(`\nWrote ${embeddings.length} embeddings to ${outPath}`)
 }

 main().catch((err) => {