diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md index 6e1fc81..d6f0a93 100644 --- a/IMPLEMENTATION_PLAN.md +++ b/IMPLEMENTATION_PLAN.md @@ -1,206 +1,174 @@ -# Implementation Plan - HCD Analysis UI Redesign +# Implementation Plan - Pathway Data Architecture ## Project Overview -Complete frontend redesign of the Patient Pathway Analysis tool. Replace the current multi-page sidebar layout with a modern, single-page dashboard featuring: -- Instant reactive filtering with debounce -- Interactive Plotly icicle chart that updates in real-time -- NHS-inspired but bold, modern visual design -- KPI metrics that respond to filter changes +Pre-compute patient treatment pathways from Snowflake and store in SQLite for fast Reflex filtering. This replaces the current simplified `prepare_chart_data()` with full pathway hierarchy support. -**Design Reference:** See `DESIGN_SYSTEM.md` for color palette, typography, spacing, and component specs. +**Architecture**: Snowflake → Pathway Processing → SQLite (pre-computed) → Reflex (filter & view) -**Source Code:** The existing `pathways_app/pathways_app.py` contains the current implementation. Create a new `pathways_app/app_v2.py` for the redesign, leaving the original intact until verification. +**Key Benefits**: +- Performance: Pathway calculation done once during data refresh, not on every filter +- Simplicity: Reflex filters pre-computed data with simple SQL WHERE clauses +- Full Pathways: Sequential treatment pathways (drug_0 → drug_1 → drug_2...) with statistics + +**Design Reference**: See `PATHWAY_DATA_ARCHITECTURE_PLAN.md` for detailed architecture, schema, and data flow. 
+ +**Source Code**: +- Existing analysis: `analysis/pathway_analyzer.py` +- Existing visualization: `visualization/plotly_generator.py` +- Existing Reflex app: `pathways_app/app_v2.py` ## Quality Checks Run after each task: ```bash -# Syntax check -python -m py_compile pathways_app/app_v2.py +# Syntax check for Python files +python -m py_compile <file.py> # Import verification -python -c "from pathways_app.app_v2 import app" +python -c "from <module> import <name>" -# Reflex compilation test +# For Reflex changes cd pathways_app && timeout 60 python -m reflex run 2>&1 | head -30 - -# If compilation shows errors, fix before marking task complete ``` -## Phase 1: Foundation +## Phase 1: Schema & Data Pipeline Foundation -### 1.1 Design Tokens Module -- [x] Create `pathways_app/styles.py` with design token classes: - - `Colors` class with all palette colors as constants - - `Typography` class with font sizes, weights - - `Spacing` class with spacing scale - - `Shadows` class with shadow values - - `Radii` class with border radius values -- [x] Create helper functions for common style patterns (e.g., `card_style()`, `button_primary_style()`) -- [x] Verify imports work: `from pathways_app.styles import Colors, Spacing` +### 1.1 Extend Database Schema +- [x] Add `pathway_date_filters` table with 6 pre-defined combinations: + - `all_6mo`, `all_12mo`, `1yr_6mo`, `1yr_12mo`, `2yr_6mo`, `2yr_12mo` +- [x] Add `pathway_nodes` table with: + - Hierarchy structure (parents, ids, labels, level) + - Patient counts and costs (value, cost, costpp, cost_pp_pa) + - Date ranges (first_seen, last_seen, first_seen_parent, last_seen_parent) + - Treatment statistics (average_spacing, average_administered, avg_days) + - Denormalized filter columns (trust_name, directory, drug_sequence) + - Foreign key to date_filter_id +- [x] Add `pathway_refresh_log` table for tracking refresh status +- [x] Create indexes for efficient filtering +- [x] Verify schema with: `python -c "from data_processing.schema import *"` -### 1.2 
App Skeleton -- [x] Create `pathways_app/app_v2.py` with basic Reflex app structure -- [x] Define new `AppState` class with minimal state (placeholder for now) -- [x] Create single-page layout structure matching DESIGN_SYSTEM.md -- [x] Verify `reflex run` compiles and shows blank page with correct structure -- [x] Configure Reflex theme with design system colors +### 1.2 Create Pathway Pipeline Module +- [ ] Create `data_processing/pathway_pipeline.py` with: + - `fetch_and_transform_data()` - Snowflake fetch + UPID/drug/directory transformations + - `process_pathway_for_date_filter(df, date_filter_config)` - Single filter processing + - `extract_denormalized_fields(ice_df)` - Extract trust, directory, drug_sequence from ids + - `convert_to_records(ice_df, date_filter_id)` - Convert ice_df to list of dicts for SQLite +- [ ] Integrate with existing `analysis/pathway_analyzer.py` functions +- [ ] Verify: `python -c "from data_processing.pathway_pipeline import *"` -## Phase 2: Layout Components +### 1.3 Create Migration Script +- [ ] Create script to set up new tables in existing `data/pathways.db` +- [ ] Pre-populate `pathway_date_filters` with 6 combinations +- [ ] Verify migration runs cleanly on fresh database -### 2.1 Top Navigation Bar -- [x] Create `top_bar()` component: - - Logo (use existing NHS person logo from assets) - - App title "HCD Analysis" - - Chart type tabs/pills (Icicle active, placeholders for future charts) - - Data freshness indicator (right side): "12,450 records (2d ago)" -- [x] Style with Heritage Blue accents, clean typography -- [x] Fixed height: 64px -- [x] Verify renders correctly +## Phase 2: CLI Refresh Command -### 2.2 Filter Section -- [x] Create `filter_section()` component with card styling -- [x] Add date range pickers: - - "Initiated" range with enable/disable checkbox (default: disabled) - - "Last Seen" range with enable/disable checkbox (default: enabled, last 6 months) - - "To" date defaults to latest date in dataset 
(placeholder — actual data integration in Phase 3) -- [x] Add searchable multi-select dropdowns: - - Drugs dropdown with search, select all, count display - - Indications dropdown with search, select all, count display - - Directorates dropdown with search, select all, count display -- [ ] Implement debounced filter change handlers (300ms) — deferred to Phase 3.3 -- [x] Style according to design system +### 2.1 Create Refresh Command +- [ ] Create `cli/refresh_pathways.py` with: + - DATE_FILTER_CONFIGS constant (6 combinations) + - `compute_date_ranges(config, max_date)` - Calculate actual dates from config + - `refresh_pathways(minimum_patients, provider_codes, ...)` main function +- [ ] Implement refresh flow: + 1. Fetch ALL data from Snowflake (full date range) + 2. Apply transformations (UPID, drug names, directory) + 3. Clear existing pathway_nodes + 4. For each of 6 date filter configs: filter → process → insert + 5. Update pathway_refresh_log +- [ ] Add CLI argument parsing (--minimum-patients, --provider-codes, etc.) 
+- [ ] Verify: `python -m cli.refresh_pathways --help` -### 2.3 KPI Row -- [x] Create `kpi_card()` component: - - Large mono number (32-48px) - - Label below (caption style) - - Subtle background tint -- [x] Create `kpi_row()` component with responsive grid -- [x] Initially show: Unique Patients count -- [x] Leave space for future metrics (Drugs count, Total cost, Match rate) -- [x] KPIs should be reactive to filter state +### 2.2 Test Refresh Pipeline +- [ ] Run refresh with Snowflake data +- [ ] Verify all 6 date_filter_ids populated in pathway_nodes +- [ ] Verify pathway structure matches original `generate_icicle_chart()` output +- [ ] Verify patient counts are correct (compare with original app) +- [ ] Document estimated processing time (expect 6-12 minutes for 440K records) -### 2.4 Chart Container -- [x] Create `chart_section()` component -- [x] Full-width card with appropriate padding -- [x] Placeholder for Plotly chart (integrate in Phase 4) -- [x] Loading state with skeleton/spinner -- [x] Error state with friendly message +## Phase 3: Reflex Integration -## Phase 3: State Management +### 3.1 Update AppState +- [ ] Replace date picker state with dropdown state: + - `selected_initiated: str = "all"` ("all", "1yr", "2yr") + - `selected_last_seen: str = "6mo"` ("6mo", "12mo") +- [ ] Add `date_filter_id` computed property: `f"{selected_initiated}_{selected_last_seen}"` +- [ ] Rewrite `load_pathway_data()` to query `pathway_nodes` table: + - Base filter: `WHERE date_filter_id = ?` + - Trust/directory/drug filters on denormalized columns +- [ ] Add `recalculate_parent_totals()` for filtered hierarchies +- [ ] Update KPI calculations from root node data -### 3.1 Core State Variables -- [x] Define filter state variables in `AppState`: - - `initiated_filter_enabled: bool = False` - - `initiated_from_date: str = ""` (ISO date string) - - `initiated_to_date: str = ""` - - `last_seen_filter_enabled: bool = True` - - `last_seen_from_date: str` (default: 6 months ago, 
computed at class definition) - - `last_seen_to_date: str` (default: today, updated on data load) - - `selected_drugs: list[str] = []` (empty = all) - - `selected_indications: list[str] = []` (empty = all) - - `selected_directorates: list[str] = []` (empty = all) -- [x] Define data state variables: - - `data_loaded: bool = False` - - `total_records: int = 0` - - `last_updated: str = ""` (ISO timestamp) - - `raw_data: list[dict[str, Any]] = []` (list of dicts, Reflex-friendly) - - `latest_date_in_data: str = ""` (for "to" date defaults) -- [x] Define UI state variables: - - `chart_loading: bool = False` - - `error_message: str = ""` - - `current_chart: str = "icicle"` +### 3.2 Update Icicle Figure +- [ ] Update `icicle_figure` computed property to use all pathway_nodes columns +- [ ] Match original 10-field customdata structure: + - values, colours, costs, costpp + - first_seen, last_seen, first_seen_parent, last_seen_parent + - average_spacing, cost_pp_pa +- [ ] Restore full hover/text templates from `visualization/plotly_generator.py` +- [ ] Verify chart renders correctly with treatment statistics -### 3.2 Data Loading -- [x] Create `load_data()` method that reads from SQLite -- [x] Populate available options for dropdowns (drugs, indications, directorates) -- [x] Detect latest date in dataset for "to" date defaults -- [x] Calculate total records and last updated timestamp -- [x] Call on app initialization +### 3.3 Update UI Components +- [ ] Replace date pickers with select dropdowns: + - Initiated: "All years", "Last 2 years", "Last 1 year" + - Last Seen: "Last 6 months", "Last 12 months" +- [ ] Add "Data refreshed: X ago" indicator from pathway_refresh_log +- [ ] Update filter section layout +- [ ] Verify UI compiles and renders correctly -### 3.3 Filter Logic -- [x] Create `apply_filters()` computed method that filters the data based on current state -- [x] Handle initiated date filter (when enabled) -- [x] Handle last seen date filter (when enabled) -- [x] 
Handle drug/indication/directorate multi-select filters -- [x] Return filtered DataFrame +## Phase 4: Testing & Validation -### 3.4 KPI Calculations -- [x] Create computed properties for KPI values: - - `unique_patients: int` — COUNT(DISTINCT patient_id) from filtered data - - `total_drugs: int` — COUNT(DISTINCT drug_name_std) from filtered data - - `total_cost: float` — SUM(price_actual) from filtered data - - (Blocked: indication_match_rate requires Snowflake GP data) -- [x] Ensure KPIs update reactively when filters change -- Note: KPIs implemented in apply_filters() method, called by all filter event handlers +### 4.1 End-to-End Validation +- [ ] **Pathway hierarchy matches original**: Compare specific pathway ids structure +- [ ] **Patient counts match**: Compare root patient count for same date range +- [ ] **Treatment statistics display correctly**: Verify "Average treatment duration" hover data +- [ ] **Drug filtering works**: Filter to FARICIMAB, verify correct pathways shown +- [ ] **Chart renders with all tooltip data**: Verify 10-field customdata structure -## Phase 4: Interactive Chart +### 4.2 Performance Testing +- [ ] Measure filter change response time (target: <500ms) +- [ ] Measure initial page load (target: <2s including data load) +- [ ] Verify chart interaction (zoom, hover) is smooth with no lag +- [ ] Test with full dataset -### 4.1 Chart Data Preparation -- [x] Create `prepare_chart_data()` method that transforms filtered data for Plotly icicle -- [x] Reuse/adapt logic from existing `pathway_analyzer.py` (simplified hierarchy: Trust → Directory → Drug) -- [x] Return data structure compatible with `go.Icicle()` (list of dicts with parents, ids, labels, value, cost, colour) -- [x] Generate chart_title based on current filter state -- [x] Call prepare_chart_data() from apply_filters() for reactive updates - -### 4.2 Reactive Plotly Integration -- [x] Create `generate_icicle_chart()` computed property that returns Plotly figure -- [x] Configure 
chart colors using design system palette -- [x] Configure chart interactivity (zoom, pan, click, hover) -- [x] Set responsive sizing - -### 4.3 Chart Component -- [x] Integrate `rx.plotly()` component in chart_section -- [x] Pass reactive figure from state -- [x] Handle loading states (show skeleton while computing) -- [x] Handle empty data state (friendly message) -- [x] Verify chart updates when filters change - -## Phase 5: Polish & Verification - -### 5.1 Visual Polish -- [x] Review all components against DESIGN_SYSTEM.md -- [x] Ensure consistent spacing throughout -- [x] Ensure consistent typography throughout -- [x] Add hover states and transitions to interactive elements -- [x] Test responsive behavior (resize browser) - -### 5.2 Performance Optimization -- [x] Profile filter + chart update cycle -- [x] Ensure debounce is working correctly (not triggering on every keystroke) -- [x] Optimize any slow computed properties -- [x] Verify smooth 60fps interactions - -### 5.3 Error Handling -- [x] Handle no data loaded state gracefully -- [x] Handle filter resulting in zero records -- [x] Handle any data loading errors -- [x] User-friendly error messages - -### 5.4 Final Verification -- [x] Load real data from SQLite (440K records, 552 drugs, 29 directorates, 32 indications) -- [x] Test all filter combinations (no filter, Last Seen, Drug, Directorate, Combined - all working) -- [x] Verify KPIs update correctly (patients, drugs, cost all compute correctly) -- [x] Verify chart updates correctly (1,887 hierarchical nodes generated correctly) -- [ ] Compare key metrics with original app to ensure correctness -- [ ] Test with large dataset for performance - -### 5.5 Cleanup -- [ ] Remove or comment out old `pathways_app.py` code paths -- [x] Update any imports/references to use new app (updated __init__.py to re-export from app_v2) +### 4.3 Documentation +- [ ] Update CLAUDE.md with new architecture +- [ ] Document CLI usage for `refresh_pathways` - [ ] Update README 
with new run instructions -- [ ] Document any breaking changes +- [ ] Document any breaking changes from original app ## Completion Criteria All tasks marked `[x]` AND: -- [x] App compiles without errors (`reflex run` succeeds) -- [x] All filters work with instant (debounced) updates -- [x] KPIs display correct numbers matching filter state (verified via SQL queries) -- [x] Icicle chart renders and updates reactively (1,887 nodes generated correctly) -- [x] Visual design matches DESIGN_SYSTEM.md (verified in iteration 15) +- [ ] App compiles without errors (`reflex run` succeeds) +- [ ] All 6 date filter combinations work correctly +- [ ] Drug/directory/trust filters work with instant updates +- [ ] KPIs display correct numbers matching filter state +- [ ] Icicle chart renders with full pathway data and statistics +- [ ] Treatment duration and dosing information displays in tooltips - [ ] No console errors during normal operation -- [x] Verified with real patient data from SQLite (440K records tested) +- [ ] Verified with real patient data from Snowflake + +## Reference + +### Date Filter Combinations + +| ID | Initiated | Last Seen | Default | +|----|-----------|-----------|---------| +| `all_6mo` | All years | Last 6 months | Yes | +| `all_12mo` | All years | Last 12 months | No | +| `1yr_6mo` | Last 1 year | Last 6 months | No | +| `1yr_12mo` | Last 1 year | Last 12 months | No | +| `2yr_6mo` | Last 2 years | Last 6 months | No | +| `2yr_12mo` | Last 2 years | Last 12 months | No | + +### Key Files + +| File | Purpose | +|------|---------| +| `data_processing/schema.py` | Database schema definitions | +| `data_processing/pathway_pipeline.py` | New pathway processing pipeline | +| `cli/refresh_pathways.py` | CLI refresh command | +| `analysis/pathway_analyzer.py` | Existing pathway analysis logic | +| `visualization/plotly_generator.py` | Existing chart generation | +| `pathways_app/app_v2.py` | Reflex application | diff --git a/data_processing/schema.py 
b/data_processing/schema.py index ee879c9..e3915fa 100644 --- a/data_processing/schema.py +++ b/data_processing/schema.py @@ -116,6 +116,148 @@ CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_indication ON ref_dr """ +# ============================================================================= +# Pathway Data Architecture Schemas +# ============================================================================= + +PATHWAY_DATE_FILTERS_SCHEMA = """ +-- Stores the 6 pre-computed date filter combinations +-- Each combination represents a different initiated/last_seen date range +-- Used to efficiently query pre-computed pathway data +CREATE TABLE IF NOT EXISTS pathway_date_filters ( + id TEXT PRIMARY KEY, -- e.g., 'all_6mo', '1yr_12mo' + initiated_label TEXT NOT NULL, -- e.g., 'All years', 'Last 1 year', 'Last 2 years' + last_seen_label TEXT NOT NULL, -- e.g., 'Last 6 months', 'Last 12 months' + initiated_years INTEGER, -- NULL for 'All', 1, or 2 + last_seen_months INTEGER NOT NULL, -- 6 or 12 + is_default INTEGER DEFAULT 0, -- 1 for 'all_6mo' (default selection) + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Pre-populate the 6 combinations +INSERT OR REPLACE INTO pathway_date_filters (id, initiated_label, last_seen_label, initiated_years, last_seen_months, is_default) VALUES + ('all_6mo', 'All years', 'Last 6 months', NULL, 6, 1), + ('all_12mo', 'All years', 'Last 12 months', NULL, 12, 0), + ('1yr_6mo', 'Last 1 year', 'Last 6 months', 1, 6, 0), + ('1yr_12mo', 'Last 1 year', 'Last 12 months', 1, 12, 0), + ('2yr_6mo', 'Last 2 years', 'Last 6 months', 2, 6, 0), + ('2yr_12mo', 'Last 2 years', 'Last 12 months', 2, 12, 0); +""" + +PATHWAY_NODES_SCHEMA = """ +-- Main pathway nodes table (one set per date filter combination) +-- Stores pre-computed pathway hierarchy with all visualization data +-- Designed for fast filtering by date_filter_id + trust/directory/drug +CREATE TABLE IF NOT EXISTS pathway_nodes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + 
+ -- Date filter combination this belongs to + date_filter_id TEXT NOT NULL, + + -- Hierarchy structure (for icicle chart) + parents TEXT NOT NULL, -- Parent node identifier + ids TEXT NOT NULL, -- Unique node identifier (hierarchical path) + labels TEXT NOT NULL, -- Display label + level INTEGER NOT NULL, -- Hierarchy depth (0=root, 1=trust, 2=directory, 3+=drugs) + + -- Patient counts (accurate for this date filter combination) + value INTEGER NOT NULL DEFAULT 0, -- Patient count + + -- Cost metrics + cost REAL NOT NULL DEFAULT 0.0, -- Total cost + costpp REAL, -- Cost per patient + cost_pp_pa TEXT, -- Cost per patient per annum (formatted string) + + -- Visualization + colour REAL NOT NULL DEFAULT 0.0, -- Color value (proportion of parent) + + -- Date ranges (for this node) + first_seen TEXT, -- First intervention date (ISO format) + last_seen TEXT, -- Last intervention date (ISO format) + first_seen_parent TEXT, -- Earliest date in parent group + last_seen_parent TEXT, -- Latest date in parent group + + -- Treatment statistics + average_spacing TEXT, -- Formatted treatment duration string + average_administered TEXT, -- JSON array of average doses per drug + avg_days REAL, -- Average treatment duration in days + + -- Denormalized filter columns (for efficient WHERE clause filtering) + trust_name TEXT, -- Extracted trust name from ids + directory TEXT, -- Extracted directory from ids + drug_sequence TEXT, -- Pipe-separated drug sequence from pathway + + -- Metadata + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + data_refresh_id TEXT, -- Links to pathway_refresh_log + + -- Unique per date filter + pathway + UNIQUE(date_filter_id, ids), + FOREIGN KEY (date_filter_id) REFERENCES pathway_date_filters(id) +); + +-- Indexes for efficient filtering +-- Primary filter: select by date_filter_id +CREATE INDEX IF NOT EXISTS idx_pathway_nodes_date_filter ON pathway_nodes(date_filter_id); + +-- Level filter: often used with date_filter_id +CREATE INDEX IF NOT EXISTS 
idx_pathway_nodes_level ON pathway_nodes(date_filter_id, level); + +-- Trust filter: for Trust dropdown filtering +CREATE INDEX IF NOT EXISTS idx_pathway_nodes_trust ON pathway_nodes(date_filter_id, trust_name); + +-- Directory filter: for Directory dropdown filtering +CREATE INDEX IF NOT EXISTS idx_pathway_nodes_directory ON pathway_nodes(date_filter_id, directory); + +-- Drug sequence filter: for drug filtering (uses LIKE '%DRUG%') +CREATE INDEX IF NOT EXISTS idx_pathway_nodes_drug_seq ON pathway_nodes(drug_sequence); + +-- Parents filter: for finding children of a node +CREATE INDEX IF NOT EXISTS idx_pathway_nodes_parents ON pathway_nodes(date_filter_id, parents); + +-- Composite index for common filter combination +CREATE INDEX IF NOT EXISTS idx_pathway_nodes_filter_composite + ON pathway_nodes(date_filter_id, trust_name, directory); +""" + +PATHWAY_REFRESH_LOG_SCHEMA = """ +-- Metadata table for tracking refresh status +-- Tracks when pathway data was last refreshed from Snowflake +CREATE TABLE IF NOT EXISTS pathway_refresh_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + refresh_id TEXT NOT NULL, -- Unique identifier for this refresh run + started_at TEXT NOT NULL, -- ISO timestamp when refresh started + completed_at TEXT, -- ISO timestamp when refresh completed (NULL if still running) + status TEXT DEFAULT 'running', -- 'running', 'completed', 'failed' + record_count INTEGER, -- Total pathway_nodes records created + date_filter_counts TEXT, -- JSON: {"all_6mo": 1234, "all_12mo": 1567, ...} + error_message TEXT, -- Error details if status='failed' + snowflake_query_date_from TEXT, -- Start date of Snowflake query + snowflake_query_date_to TEXT, -- End date of Snowflake query + processing_duration_seconds REAL, -- How long the refresh took + created_at TEXT DEFAULT CURRENT_TIMESTAMP +); + +-- Index for finding latest refresh +CREATE INDEX IF NOT EXISTS idx_pathway_refresh_log_started ON pathway_refresh_log(started_at DESC); + +-- Index for finding by status 
+CREATE INDEX IF NOT EXISTS idx_pathway_refresh_log_status ON pathway_refresh_log(status); +""" + +# Combined pathway schema +PATHWAY_TABLES_SCHEMA = f""" +-- Pathway Data Architecture Tables +-- Pre-computed pathway data for fast Reflex filtering + +{PATHWAY_DATE_FILTERS_SCHEMA} + +{PATHWAY_NODES_SCHEMA} + +{PATHWAY_REFRESH_LOG_SCHEMA} +""" + + # ============================================================================= # Fact Table Schemas # ============================================================================= @@ -346,7 +488,7 @@ FACT_TABLES_SCHEMA = f""" ALL_TABLES_SCHEMA = f""" -- Complete Database Schema --- Reference tables + Fact tables + Materialized views + File tracking +-- Reference tables + Fact tables + Materialized views + File tracking + Pathway tables {REFERENCE_TABLES_SCHEMA} @@ -355,6 +497,8 @@ ALL_TABLES_SCHEMA = f""" {MATERIALIZED_VIEWS_SCHEMA} {FILE_TRACKING_SCHEMA} + +{PATHWAY_TABLES_SCHEMA} """ @@ -598,6 +742,157 @@ def verify_file_tracking_tables_exist(conn: sqlite3.Connection) -> list[str]: return missing +# ============================================================================= +# Pathway Table Helper Functions +# ============================================================================= + +def create_pathway_tables(conn: sqlite3.Connection) -> None: + """ + Create pathway data architecture tables in the database. + + Creates: + - pathway_date_filters: 6 pre-defined date filter combinations + - pathway_nodes: Pre-computed pathway hierarchy data + - pathway_refresh_log: Refresh tracking metadata + + Args: + conn: SQLite database connection. + """ + logger.info("Creating pathway tables...") + conn.executescript(PATHWAY_TABLES_SCHEMA) + logger.info("Pathway tables created successfully") + + +def drop_pathway_tables(conn: sqlite3.Connection) -> None: + """ + Drop pathway data architecture tables from the database. + + Args: + conn: SQLite database connection. + + Warning: + This will delete all pre-computed pathway data. 
+ """ + logger.warning("Dropping pathway tables...") + conn.executescript(""" + DROP TABLE IF EXISTS pathway_nodes; + DROP TABLE IF EXISTS pathway_refresh_log; + DROP TABLE IF EXISTS pathway_date_filters; + """) + logger.info("Pathway tables dropped") + + +def get_pathway_table_counts(conn: sqlite3.Connection) -> dict[str, int]: + """ + Get row counts for pathway tables. + + Args: + conn: SQLite database connection. + + Returns: + Dictionary mapping table name to row count. + """ + tables = ["pathway_date_filters", "pathway_nodes", "pathway_refresh_log"] + counts = {} + + for table in tables: + try: + cursor = conn.execute(f"SELECT COUNT(*) FROM {table}") + result = cursor.fetchone() + counts[table] = result[0] if result else 0 + except sqlite3.OperationalError: + # Table doesn't exist yet + counts[table] = 0 + + return counts + + +def verify_pathway_tables_exist(conn: sqlite3.Connection) -> list[str]: + """ + Verify that pathway tables exist. + + Args: + conn: SQLite database connection. + + Returns: + List of missing table names. Empty list means all tables exist. + """ + required_tables = ["pathway_date_filters", "pathway_nodes", "pathway_refresh_log"] + missing = [] + + for table in required_tables: + cursor = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name=?", + (table,) + ) + if cursor.fetchone() is None: + missing.append(table) + + return missing + + +def clear_pathway_nodes(conn: sqlite3.Connection, date_filter_id: str | None = None) -> int: + """ + Clear pathway nodes, optionally for a specific date filter. + + Args: + conn: SQLite database connection. + date_filter_id: If provided, only clear nodes for this date filter. + If None, clear all pathway nodes (an empty string is a filter value, not "clear all"). + + Returns: + Number of rows deleted. 
+ """ + if date_filter_id is not None:  # explicit None check: "" must not fall through to the delete-all branch + cursor = conn.execute( + "DELETE FROM pathway_nodes WHERE date_filter_id = ?", + (date_filter_id,) + ) + else: + cursor = conn.execute("DELETE FROM pathway_nodes") + + deleted_count = cursor.rowcount + conn.commit() + logger.info(f"Cleared {deleted_count} pathway nodes") + return deleted_count + + +def get_pathway_refresh_status(conn: sqlite3.Connection) -> dict | None: + """ + Get the status of the most recent pathway refresh. + + Args: + conn: SQLite database connection. + + Returns: + Dictionary with refresh status, or None if no refresh has been done. + """ + try: + cursor = conn.execute(""" + SELECT refresh_id, started_at, completed_at, status, record_count, + date_filter_counts, error_message, processing_duration_seconds + FROM pathway_refresh_log + ORDER BY started_at DESC + LIMIT 1 + """) + row = cursor.fetchone() + if row: + return { + "refresh_id": row[0], + "started_at": row[1], + "completed_at": row[2], + "status": row[3], + "record_count": row[4], + "date_filter_counts": row[5], + "error_message": row[6], + "processing_duration_seconds": row[7], + } + return None + except sqlite3.OperationalError: + # Table doesn't exist yet + return None + + # ============================================================================= # Combined Helper Functions # ============================================================================= @@ -625,6 +920,7 @@ def drop_all_tables(conn: sqlite3.Connection) -> None: This will delete all data. Use with extreme caution. 
""" logger.warning("Dropping all tables...") + drop_pathway_tables(conn) drop_file_tracking_tables(conn) drop_fact_tables(conn) drop_reference_tables(conn) @@ -645,6 +941,7 @@ def get_all_table_counts(conn: sqlite3.Connection) -> dict[str, int]: counts.update(get_reference_table_counts(conn)) counts.update(get_fact_table_counts(conn)) counts.update(get_file_tracking_counts(conn)) + counts.update(get_pathway_table_counts(conn)) return counts @@ -662,4 +959,5 @@ def verify_all_tables_exist(conn: sqlite3.Connection) -> list[str]: missing.extend(verify_reference_tables_exist(conn)) missing.extend(verify_fact_tables_exist(conn)) missing.extend(verify_file_tracking_tables_exist(conn)) + missing.extend(verify_pathway_tables_exist(conn)) return missing