feat: US-013 - Self-host ONNX embedding model

Download all-MiniLM-L6-v2 model files to public/models/ and configure
@xenova/transformers to load from local path instead of Hugging Face CDN.
Eliminates external dependency for semantic search embedding model.
This commit is contained in:
2026-02-15 20:59:03 +00:00
parent 9e9dd1ae4b
commit 667e5b249c
8 changed files with 30785 additions and 4 deletions
+2 -2
View File
@@ -232,7 +232,7 @@
"Verify in browser using dev-browser skill" "Verify in browser using dev-browser skill"
], ],
"priority": 12, "priority": 12,
"passes": false, "passes": true,
"notes": "Replace the current empty-state text ('Ask me anything about Andy's experience, skills, or projects.') with the new welcome bubble + chips. The chips should call handleSubmit (or equivalent) with the chip text pre-filled — simplest approach is setInputValue(chipText) then immediately trigger submit. Check that the welcome state reappears if the user hasn't sent a message (messages.length === 0). The suggested questions could live in a const array at the top of ChatWidget for easy future editing." "notes": "Replace the current empty-state text ('Ask me anything about Andy's experience, skills, or projects.') with the new welcome bubble + chips. The chips should call handleSubmit (or equivalent) with the chip text pre-filled — simplest approach is setInputValue(chipText) then immediately trigger submit. Check that the welcome state reappears if the user hasn't sent a message (messages.length === 0). The suggested questions could live in a const array at the top of ChatWidget for easy future editing."
}, },
{ {
@@ -250,7 +250,7 @@
"Typecheck passes" "Typecheck passes"
], ],
"priority": 13, "priority": 13,
"passes": false, "passes": true,
"notes": "Transformers.js uses env.localModelPath or env.remoteHost to control where models are fetched from. Setting env.localModelPath = '/models/' should make it look for files at /models/Xenova/all-MiniLM-L6-v2/onnx/model_quantized.onnx etc. The Vite public/ directory serves files at the root — so public/models/ becomes /models/ at runtime. For the build script (Node.js), use a file:// path or the local filesystem path instead. Download model files from https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main — the quantized ONNX model is ~23MB. Check what files the pipeline actually requests by watching network tab before making this change." "notes": "Transformers.js uses env.localModelPath or env.remoteHost to control where models are fetched from. Setting env.localModelPath = '/models/' should make it look for files at /models/Xenova/all-MiniLM-L6-v2/onnx/model_quantized.onnx etc. The Vite public/ directory serves files at the root — so public/models/ becomes /models/ at runtime. For the build script (Node.js), use a file:// path or the local filesystem path instead. Download model files from https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main — the quantized ONNX model is ~23MB. Check what files the pipeline actually requests by watching network tab before making this change."
}, },
{ {
+46
View File
@@ -12,6 +12,7 @@
- `src/data/embeddings.json` is an array of `{ id: string, embedding: number[] }` — 42 items, 384-d vectors, IDs match PaletteItem IDs. Vite imports JSON natively. - `src/data/embeddings.json` is an array of `{ id: string, embedding: number[] }` — 42 items, 384-d vectors, IDs match PaletteItem IDs. Vite imports JSON natively.
- `src/lib/embedding-model.ts` exports `initModel()`, `embedQuery(text)`, `isModelReady()` — check `isModelReady()` before calling `embedQuery()` - `src/lib/embedding-model.ts` exports `initModel()`, `embedQuery(text)`, `isModelReady()` — check `isModelReady()` before calling `embedQuery()`
- `initModel()` is called fire-and-forget in `App.tsx` on mount — model loads during boot/ECG/login phases - `initModel()` is called fire-and-forget in `App.tsx` on mount — model loads during boot/ECG/login phases
- ONNX model files are self-hosted in `public/models/Xenova/all-MiniLM-L6-v2/` — setting `env.localModelPath = '/models/'`, `env.allowRemoteModels = false`, and `env.useBrowserCache = false` eliminates the HF CDN dependency
- `src/lib/semantic-search.ts` exports `semanticSearch(queryEmbedding, embeddings, threshold?)` and `loadEmbeddings()` — embeddings are normalized so cosine similarity is dot(a,b)/(mag(a)*mag(b)) - `src/lib/semantic-search.ts` exports `semanticSearch(queryEmbedding, embeddings, threshold?)` and `loadEmbeddings()` — embeddings are normalized so cosine similarity is dot(a,b)/(mag(a)*mag(b))
- CommandPalette uses `semanticResults` state + debounced `useEffect` for async semantic search, falling back to Fuse.js when `isModelReady()` returns false or on any error - CommandPalette uses `semanticResults` state + debounced `useEffect` for async semantic search, falling back to Fuse.js when `isModelReady()` returns false or on any error
- `loadEmbeddings()` and `paletteMap` (Map<id, PaletteItem>) are precomputed via `useMemo` — no re-computation on each search - `loadEmbeddings()` and `paletteMap` (Map<id, PaletteItem>) are precomputed via `useMemo` — no re-computation on each search
@@ -31,6 +32,8 @@
- TopBar is `z-index: 100` (fixed), nav is `z-index: 99` (sticky) — mobile full-screen overlays need `z-index > 100` to appear above them - TopBar is `z-index: 100` (fixed), nav is `z-index: 99` (sticky) — mobile full-screen overlays need `z-index > 100` to appear above them
- Inline `style={{ display: 'flex' }}` overrides Tailwind's `hidden` class — use `!important` modifier (`max-md:!hidden`) or move display to Tailwind classes to allow responsive hiding - Inline `style={{ display: 'flex' }}` overrides Tailwind's `hidden` class — use `!important` modifier (`max-md:!hidden`) or move display to Tailwind classes to allow responsive hiding
- ChatWidget mobile breakpoint is `md` (768px) — below this, panel is full-screen; above, it's 380px anchored bottom-right - ChatWidget mobile breakpoint is `md` (768px) — below this, panel is full-screen; above, it's 380px anchored bottom-right
- `handleSubmit(overrideText?)` accepts optional text param — use this when programmatically sending messages (e.g., suggested question chips) to avoid stale `inputValue` state
- `SUGGESTED_QUESTIONS` const array at top of ChatWidget — edit here to change welcome screen chip text
--- ---
@@ -250,3 +253,46 @@
- `100dvh` (dynamic viewport height) is essential for mobile full-screen panels — it accounts for browser chrome (address bar, toolbar) unlike `100vh` - `100dvh` (dynamic viewport height) is essential for mobile full-screen panels — it accounts for browser chrome (address bar, toolbar) unlike `100vh`
- The `[data-chat-panel]` CSS selector in the `<style>` block is the right place for responsive size rules since Tailwind can't conditionally set max-height based on viewport width - The `[data-chat-panel]` CSS selector in the `<style>` block is the right place for responsive size rules since Tailwind can't conditionally set max-height based on viewport width
--- ---
## 2026-02-15 - US-012
- Replaced empty-state centered text with welcome bubble + suggested question chips
- Welcome bubble styled as assistant message (left-aligned, `var(--bg-dashboard)` bg, `var(--border-light)` border)
- Added `SUGGESTED_QUESTIONS` const array at module top for easy future editing
- Three chips: "What's his NHS experience?", "Tell me about his data skills", "What projects has he built?"
- Chips styled: rounded-full, teal accent border, teal hover tint, `font-ui` 12.5px
- Clicking a chip calls `handleSubmit(questionText)` — same codepath as typing + Enter
- Refactored `handleSubmit` to accept optional `overrideText` parameter (avoids stale state issue with `setInputValue` + immediate submit)
- Wrapped send button `onClick` in arrow function to prevent passing MouseEvent as text argument
- Welcome/chips visible when `messages.length === 0`, replaced by conversation once any message is sent
- Typecheck passes (0 errors), lint passes (0 new errors/warnings)
- Browser verified: welcome bubble displays correctly, chips render, clicking chip sends message and replaces welcome state
- Files changed: `src/components/ChatWidget.tsx`
- **Learnings for future iterations:**
- When refactoring a callback to accept optional parameters, wrap `onClick={handler}` as `onClick={() => handler()}` to prevent React from passing the SyntheticEvent as the first argument
- `SUGGESTED_QUESTIONS` as a module-level const is the simplest approach — easily editable, no data file needed for 3 items
- The `handleSubmit(overrideText?)` pattern avoids the stale-state problem: `setInputValue(text)` followed by an immediate `handleSubmit()` would read the old `inputValue`, because the updated state only becomes visible on the next render and the handler closes over the current render's value
---
## 2026-02-15 - US-013
- Downloaded all-MiniLM-L6-v2 model files to `public/models/Xenova/all-MiniLM-L6-v2/`:
- `config.json`, `tokenizer.json`, `tokenizer_config.json`, `onnx/model_quantized.onnx` (~22MB)
- Updated `src/lib/embedding-model.ts`:
- `env.localModelPath = '/models/'` — Vite serves `public/` at root
- `env.allowRemoteModels = false` — prevents any HF CDN fallback
- `env.useBrowserCache = false` — prevents stale Cache API entries from interfering
- Updated `scripts/generate-embeddings.ts`:
- `env.localModelPath = resolve(import.meta.dirname, '..', 'public', 'models')` — absolute path for Node.js
- `env.allowRemoteModels = false`
- Model files committed as static assets (not in .gitignore)
- Browser verified: all 4 model files fetched from `localhost:5173/models/` with 200 OK, zero `huggingface.co` requests
- Semantic search verified working: "data analysis" returns multi-category results (Core Skills, Active Projects, Achievements)
- Build script (`npm run generate-embeddings`) still works with local model files
- Typecheck passes (0 errors), lint passes (0 new errors/warnings)
- Files changed: `src/lib/embedding-model.ts`, `scripts/generate-embeddings.ts`, `public/models/Xenova/all-MiniLM-L6-v2/` (new directory with 4 files)
- **Learnings for future iterations:**
- `@xenova/transformers` env configuration: `env.localModelPath` sets the base path, `env.allowRemoteModels = false` prevents CDN fallback, `env.useBrowserCache = false` bypasses Browser Cache API
- The library constructs paths as `{localModelPath}/{modelId}/{filename}` — so `/models/` + `Xenova/all-MiniLM-L6-v2` + `/onnx/model_quantized.onnx`
- Browser Cache API can retain stale entries from previous HF CDN loads — setting `useBrowserCache = false` forces fresh fetches from the configured local path
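- For Node.js scripts, use an absolute filesystem path for `localModelPath` (not a URL), e.g. `resolve(import.meta.dirname, '..', 'public', 'models')` — note that `import.meta.dirname` requires Node.js 20.11+ (or 21.2+)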
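- The quantized ONNX model (`model_quantized.onnx`) is ~22MB — acceptable for a static asset since the browser's regular HTTP cache can still serve it after first load (subject to the server's cache headers), even though `env.useBrowserCache = false` disables the library's own Cache API layer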
---
@@ -0,0 +1,25 @@
{
"_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
"architectures": [
"BertModel"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.29.2",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,15 @@
{
"clean_up_tokenization_spaces": true,
"cls_token": "[CLS]",
"do_basic_tokenize": true,
"do_lower_case": true,
"mask_token": "[MASK]",
"model_max_length": 512,
"never_split": null,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]"
}
+5 -1
View File
@@ -1,8 +1,12 @@
import { writeFileSync } from 'node:fs' import { writeFileSync } from 'node:fs'
import { resolve } from 'node:path' import { resolve } from 'node:path'
import { pipeline } from '@xenova/transformers' import { env, pipeline } from '@xenova/transformers'
import { buildEmbeddingTexts } from '@/lib/search' import { buildEmbeddingTexts } from '@/lib/search'
// Use local model files from public/models/ (same files the browser uses)
env.localModelPath = resolve(import.meta.dirname, '..', 'public', 'models')
env.allowRemoteModels = false
async function main() { async function main() {
const items = buildEmbeddingTexts() const items = buildEmbeddingTexts()
console.log(`Found ${items.length} items to embed.`) console.log(`Found ${items.length} items to embed.`)
+6 -1
View File
@@ -1,4 +1,9 @@
import { pipeline, type FeatureExtractionPipeline } from '@xenova/transformers' import { env, pipeline, type FeatureExtractionPipeline } from '@xenova/transformers'
// Serve model files from /models/ (Vite serves public/ at root)
env.localModelPath = '/models/'
env.allowRemoteModels = false
env.useBrowserCache = false
let extractor: FeatureExtractionPipeline | null = null let extractor: FeatureExtractionPipeline | null = null
let loading = false let loading = false