feat: US-003 - Generate and commit embeddings.json
This commit is contained in:
@@ -1,17 +1,27 @@
|
||||
import { writeFileSync } from 'node:fs'
|
||||
import { resolve } from 'node:path'
|
||||
import { pipeline } from '@xenova/transformers'
|
||||
import { buildEmbeddingTexts } from '@/lib/search'
|
||||
|
||||
/**
 * Builds the embeddings file used by search.
 *
 * Steps:
 *  1. Collect the items to embed from `buildEmbeddingTexts()`; each item is
 *     read here for its `id` and `text` fields.
 *  2. Load the `Xenova/all-MiniLM-L6-v2` feature-extraction pipeline.
 *  3. Embed every item's text with mean pooling and normalization enabled.
 *  4. Write all `{ id, embedding }` records as pretty-printed JSON to
 *     `../src/data/embeddings.json`, resolved relative to this script's
 *     directory.
 *
 * Progress is reported on stdout. Any failure (model load, inference, file
 * write) rejects the returned promise so the caller can handle it.
 */
async function main(): Promise<void> {
  const items = buildEmbeddingTexts()
  console.log(`Found ${items.length} items to embed.`)

  console.log('Loading all-MiniLM-L6-v2 model...')
  const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2')

  const embeddings: Array<{ id: string; embedding: number[] }> = []

  // Items are embedded one at a time so the progress log stays in input order.
  for (const item of items) {
    const output = await extractor(item.text, { pooling: 'mean', normalize: true })
    // Copy the typed array into a plain number[]: JSON.stringify would
    // otherwise serialize a Float32Array as an index-keyed object.
    const vector = Array.from(output.data as Float32Array)
    embeddings.push({ id: item.id, embedding: vector })
    console.log(` [${embeddings.length}/${items.length}] ${item.id} (${vector.length}d)`)
  }

  const outPath = resolve(import.meta.dirname, '..', 'src', 'data', 'embeddings.json')
  writeFileSync(outPath, JSON.stringify(embeddings, null, 2))
  console.log(`\nWrote ${embeddings.length} embeddings to ${outPath}`)
}
|
||||
|
||||
main().catch((err) => {
|
||||
|
||||
Reference in New Issue
Block a user