feat: US-003 - Generate and commit embeddings.json
This commit is contained in:
+1
-1
@@ -53,7 +53,7 @@
|
|||||||
"Typecheck passes"
|
"Typecheck passes"
|
||||||
],
|
],
|
||||||
"priority": 3,
|
"priority": 3,
|
||||||
"passes": false,
|
"passes": true,
|
||||||
"notes": "The pipeline returns a Tensor — use .tolist() or .data to extract the raw float array. Mean-pool across the token dimension (dim 1) to get a single 384-d vector per input. Process items sequentially to avoid OOM in Node. The output file will be ~200KB for ~40 items with 384 floats each."
|
"notes": "The pipeline returns a Tensor — use .tolist() or .data to extract the raw float array. Mean-pool across the token dimension (dim 1) to get a single 384-d vector per input. Process items sequentially to avoid OOM in Node. The output file will be ~200KB for ~40 items with 384 floats each."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -9,6 +9,7 @@
|
|||||||
- Project uses `"type": "module"` in package.json
|
- Project uses `"type": "module"` in package.json
|
||||||
- Palette item IDs: `exp-{consultation.id}`, `skill-{skill.id}`, `proj-{investigation.id}`, `ach-{0-3}`, `edu-{0-3}`, `action-{0-3}`
|
- Palette item IDs: `exp-{consultation.id}`, `skill-{skill.id}`, `proj-{investigation.id}`, `ach-{0-3}`, `edu-{0-3}`, `action-{0-3}`
|
||||||
- `buildEmbeddingTexts()` in `src/lib/search.ts` returns `Array<{ id: string, text: string }>` with IDs matching PaletteItem IDs — use this for both embedding generation and chat context
|
- `buildEmbeddingTexts()` in `src/lib/search.ts` returns `Array<{ id: string, text: string }>` with IDs matching PaletteItem IDs — use this for both embedding generation and chat context
|
||||||
|
- `src/data/embeddings.json` is an array of `{ id: string, embedding: number[] }` — 42 items, 384-d vectors, IDs match PaletteItem IDs. Vite imports JSON natively.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -45,3 +46,19 @@
|
|||||||
- Quick action items are `action-0` through `action-3`
|
- Quick action items are `action-0` through `action-3`
|
||||||
- `documents.ts` is imported but wasn't previously used in `search.ts` — now used for education embedding text
|
- `documents.ts` is imported but wasn't previously used in `search.ts` — now used for education embedding text
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## 2026-02-15 - US-003
|
||||||
|
- Updated `scripts/generate-embeddings.ts` to import `buildEmbeddingTexts()` and generate full embeddings
|
||||||
|
- Script embeds all 42 palette items sequentially using `Xenova/all-MiniLM-L6-v2`
|
||||||
|
- Outputs `src/data/embeddings.json` as `Array<{ id: string, embedding: number[] }>`
|
||||||
|
- Each embedding is a 384-dimensional float array
|
||||||
|
- File is ~453KB (42 items × 384 floats with pretty-printed JSON)
|
||||||
|
- `npm run generate-embeddings` regenerates the file successfully
|
||||||
|
- Typecheck and lint pass
|
||||||
|
- Files changed: `scripts/generate-embeddings.ts`, `src/data/embeddings.json`
|
||||||
|
- **Learnings for future iterations:**
|
||||||
|
- `import.meta.dirname` works in tsx/Node ESM scripts (requires Node 20.11+ or 21.2+) — use it instead of `__dirname` (which isn't available in ESM)
|
||||||
|
- `@/` path alias works in `npx tsx` scripts because tsx resolves tsconfig paths automatically
|
||||||
|
- The embeddings file is ~450KB with pretty-print; could be reduced with compact JSON but readability is preferred for now
|
||||||
|
- Processing 42 items takes ~10-15 seconds on the first run; the model is cached after the first download, so subsequent runs skip the download
|
||||||
|
---
|
||||||
|
|||||||
@@ -1,17 +1,27 @@
|
|||||||
|
import { writeFileSync } from 'node:fs'
|
||||||
|
import { resolve } from 'node:path'
|
||||||
import { pipeline } from '@xenova/transformers'
|
import { pipeline } from '@xenova/transformers'
|
||||||
|
import { buildEmbeddingTexts } from '@/lib/search'
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
|
const items = buildEmbeddingTexts()
|
||||||
|
console.log(`Found ${items.length} items to embed.`)
|
||||||
|
|
||||||
console.log('Loading all-MiniLM-L6-v2 model...')
|
console.log('Loading all-MiniLM-L6-v2 model...')
|
||||||
const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2')
|
const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2')
|
||||||
|
|
||||||
const testString = 'This is a test string for embedding generation.'
|
const embeddings: Array<{ id: string; embedding: number[] }> = []
|
||||||
console.log(`Embedding test string: "${testString}"`)
|
|
||||||
|
|
||||||
const output = await extractor(testString, { pooling: 'mean', normalize: true })
|
for (const item of items) {
|
||||||
|
const output = await extractor(item.text, { pooling: 'mean', normalize: true })
|
||||||
const vector = Array.from(output.data as Float32Array)
|
const vector = Array.from(output.data as Float32Array)
|
||||||
|
embeddings.push({ id: item.id, embedding: vector })
|
||||||
|
console.log(` [${embeddings.length}/${items.length}] ${item.id} (${vector.length}d)`)
|
||||||
|
}
|
||||||
|
|
||||||
console.log(`Vector length: ${vector.length}`)
|
const outPath = resolve(import.meta.dirname, '..', 'src', 'data', 'embeddings.json')
|
||||||
console.log('Done.')
|
writeFileSync(outPath, JSON.stringify(embeddings, null, 2))
|
||||||
|
console.log(`\nWrote ${embeddings.length} embeddings to ${outPath}`)
|
||||||
}
|
}
|
||||||
|
|
||||||
main().catch((err) => {
|
main().catch((err) => {
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user