refactor: slim pathways.db from 351 MB to 3.5 MB by removing unused tables

Drop fact_interventions (440K rows), mv_patient_treatment_summary (35K rows), ref_drug_snomed_mapping (144K rows), and processed_files — all unused since the app moved to pre-computed pathway_nodes.

Key changes:
- Rewrite load_data() to source from pathway_nodes + pathway_refresh_log
- Remove 7 dead methods and 8 dead state vars from pathways_app.py
- Delete patient_data.py, load_snomed_mapping.py, test_large_dataset_performance.py
- Remove SQLiteDataLoader (depended on fact_interventions)
- Remove file tracking schema (processed_files tracked fact_interventions loads)
- Remove legacy diagnosis functions from diagnosis_lookup.py
- Add source_row_count migration for pathway_refresh_log
- Clean all cross-references in __init__.py, data_source.py, migrate.py
2026-02-06 08:51:03 +00:00
parent bb93c1673e
commit 778ed99ef6
11 changed files with 95 additions and 3653 deletions
@@ -115,43 +115,6 @@ CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_cluster ON ref_drug_
 CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_indication ON ref_drug_indication_clusters(indication);
 """

-REF_DRUG_SNOMED_MAPPING_SCHEMA = """
-- Direct SNOMED code mapping from drug to indication to GP diagnosis codes
-- Source: data/drug_snomed_mapping_enriched.csv (163K rows)
-- Used for direct GP record matching to assign diagnosis-based directorates
-- and to support indication-based pathway hierarchy (Trust → Search_Term → Drug → Pathway)
-CREATE TABLE IF NOT EXISTS ref_drug_snomed_mapping (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-    drug_name TEXT NOT NULL,                -- Original drug name from mapping
-    indication TEXT NOT NULL,               -- Specific indication (603 unique values)
-    ta_id TEXT,                             -- NICE TA reference (e.g., TA568)
-    search_term TEXT NOT NULL,              -- Simplified grouping (187 unique values)
-    snomed_code TEXT NOT NULL,              -- SNOMED CT code for GP record matching
-    snomed_description TEXT,                -- SNOMED code description
-    cleaned_drug_name TEXT NOT NULL,        -- Standardized drug name for matching
-    primary_directorate TEXT,               -- Primary directorate for this indication
-    all_directorates TEXT,                  -- Pipe-separated list of valid directorates
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    UNIQUE(drug_name, indication, snomed_code)
-);
-
-- Index for looking up SNOMED codes by drug name (most common access pattern)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug ON ref_drug_snomed_mapping(drug_name);
-
-- Index for looking up by cleaned drug name (standardized matching)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_cleaned ON ref_drug_snomed_mapping(cleaned_drug_name);
-
-- Index for looking up by SNOMED code (reverse lookup from GP record)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_snomed ON ref_drug_snomed_mapping(snomed_code);
-
-- Index for grouping by search_term (indication-based hierarchy)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_search_term ON ref_drug_snomed_mapping(search_term);
-
-- Composite index for drug + snomed code (common lookup pattern)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug_snomed
-    ON ref_drug_snomed_mapping(cleaned_drug_name, snomed_code);
-"""
-

 # =============================================================================
 # Pathway Data Architecture Schemas
@@ -278,6 +241,7 @@ CREATE TABLE IF NOT EXISTS pathway_refresh_log (
    snowflake_query_date_from TEXT,         -- Start date of Snowflake query
    snowflake_query_date_to TEXT,           -- End date of Snowflake query
    processing_duration_seconds REAL,       -- How long the refresh took
+    source_row_count INTEGER,               -- Number of Snowflake rows fetched
    created_at TEXT DEFAULT CURRENT_TIMESTAMP
 );

@@ -301,208 +265,6 @@ PATHWAY_TABLES_SCHEMA = f"""
 """


-# =============================================================================
-# Fact Table Schemas
-# =============================================================================
-
-FACT_INTERVENTIONS_SCHEMA = """
-- Patient intervention records (fact table)
-- Source: HCD activity data (CSV/Parquet files or Snowflake)
-- This is the main fact table storing all patient intervention events
-CREATE TABLE IF NOT EXISTS fact_interventions (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-
-    -- Patient identification
-    upid TEXT NOT NULL,                     -- Unique Patient ID (Provider Code[:3] + PersonKey)
-    provider_code TEXT NOT NULL,            -- Original provider code (3-5 chars)
-    person_key TEXT NOT NULL,               -- Patient key from source system
-
-    -- Intervention details
-    drug_name_raw TEXT,                     -- Original drug name from source
-    drug_name_std TEXT NOT NULL,            -- Standardized drug name (via ref_drug_names)
-    intervention_date DATE NOT NULL,        -- Date of intervention
-    price_actual REAL NOT NULL DEFAULT 0,   -- Cost of intervention in GBP
-
-    -- Organization and directory
-    org_name TEXT,                          -- Organization name (cleaned, no commas)
-    directory TEXT,                         -- Medical directory/specialty (may be "Undefined")
-
-    -- Source tracking
-    source_file TEXT,                       -- Original file this record came from
-    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-
-    -- Additional clinical fields (optional, used in directory fallback logic)
-    treatment_function_code INTEGER,
-    additional_detail_1 TEXT,
-    additional_detail_2 TEXT,
-    additional_detail_3 TEXT,
-    additional_detail_4 TEXT,
-    additional_detail_5 TEXT
-);
-
-- Primary indexes for common filter patterns used in generate_graph()
-- UPID: Used for patient grouping, pathway analysis
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_upid ON fact_interventions(upid);
-
-- Drug name (standardized): Used for drug filtering
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_drug ON fact_interventions(drug_name_std);
-
-- Intervention date: Used for date range filtering (start_date, end_date, last_seen)
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_date ON fact_interventions(intervention_date);
-
-- Directory: Used for directory/specialty filtering
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_directory ON fact_interventions(directory);
-
-- Organization: Used for trust filtering (Provider Code maps to org_name)
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_org ON fact_interventions(org_name);
-
-- Composite index for common filter combination (trust + drug + directory)
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_composite
-    ON fact_interventions(org_name, drug_name_std, directory);
-
-- Composite index for date-based patient analysis
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_upid_date
-    ON fact_interventions(upid, intervention_date);
-"""
-
-
-# =============================================================================
-# Materialized View Schemas (Cached Aggregations)
-# =============================================================================
-
-MV_PATIENT_TREATMENT_SUMMARY_SCHEMA = """
-- Materialized view of patient treatment summaries
-- Pre-computed aggregations per patient for faster pathway analysis
-- Refreshed when fact_interventions data changes
-CREATE TABLE IF NOT EXISTS mv_patient_treatment_summary (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-
-    -- Patient identification
-    upid TEXT NOT NULL UNIQUE,              -- Unique Patient ID
-
-    -- Organization and directory (for filtering)
-    org_name TEXT,                          -- Organization name (first org seen)
-    directory TEXT,                         -- Primary directory (first directory assigned)
-
-    -- Date range
-    first_seen_date DATE NOT NULL,          -- First intervention date
-    last_seen_date DATE NOT NULL,           -- Last intervention date
-    days_treated INTEGER NOT NULL DEFAULT 0, -- Duration: last_seen - first_seen
-
-    -- Cost aggregations
-    total_cost REAL NOT NULL DEFAULT 0,     -- Sum of all intervention costs
-    avg_cost_per_intervention REAL,         -- Average cost per intervention
-
-    -- Treatment summary
-    intervention_count INTEGER NOT NULL DEFAULT 0,  -- Total number of interventions
-    unique_drug_count INTEGER NOT NULL DEFAULT 0,   -- Number of distinct drugs
-
-    -- Drug sequence (pipe-separated standardized drug names in chronological order)
-    -- Example: "ADALIMUMAB|ETANERCEPT|INFLIXIMAB"
-    drug_sequence TEXT,
-
-    -- Drug frequency counts (JSON: {"ADALIMUMAB": 5, "ETANERCEPT": 3})
-    -- Stores count of each drug for this patient
-    drug_counts_json TEXT,
-
-    -- Drug cost totals (JSON: {"ADALIMUMAB": 15000.00, "ETANERCEPT": 8000.00})
-    -- Stores total cost per drug for this patient
-    drug_costs_json TEXT,
-
-    -- Per-drug date ranges (JSON: {"ADALIMUMAB": {"first": "2023-01-01", "last": "2023-06-15"}, ...})
-    -- Stores first/last date for each drug
-    drug_date_ranges_json TEXT,
-
-    -- Metadata
-    computed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    source_row_count INTEGER               -- Number of fact_interventions rows used
-);
-
-- Index for fast patient lookup
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_upid ON mv_patient_treatment_summary(upid);
-
-- Indexes for common filter patterns
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_org ON mv_patient_treatment_summary(org_name);
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_directory ON mv_patient_treatment_summary(directory);
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_first_seen ON mv_patient_treatment_summary(first_seen_date);
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_last_seen ON mv_patient_treatment_summary(last_seen_date);
-
-- Composite index for date range filtering (common in generate_graph)
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_date_range
-    ON mv_patient_treatment_summary(first_seen_date, last_seen_date);
-
-- Composite index for org + directory + dates (full filter pattern)
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_filter_composite
-    ON mv_patient_treatment_summary(org_name, directory, first_seen_date, last_seen_date);
-
-- Index for drug sequence pattern matching
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_drug_seq ON mv_patient_treatment_summary(drug_sequence);
-"""
-
-MATERIALIZED_VIEWS_SCHEMA = f"""
-- Materialized Views Schema
-- Pre-computed aggregations for performance
-
-{MV_PATIENT_TREATMENT_SUMMARY_SCHEMA}
-"""
-
-
-# =============================================================================
-# File Tracking Schemas (Incremental Updates)
-# =============================================================================
-
-PROCESSED_FILES_SCHEMA = """
-- Tracks processed data files for incremental updates
-- Enables detecting changed files by comparing hashes
-- Stores processing status and statistics
-CREATE TABLE IF NOT EXISTS processed_files (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-
-    -- File identification
-    file_path TEXT NOT NULL,                -- Full path to the file
-    file_name TEXT NOT NULL,                -- Just the filename (for display)
-    file_hash TEXT NOT NULL,                -- SHA256 hash of file contents
-
-    -- File metadata
-    file_size_bytes INTEGER,                -- Size of file in bytes
-    file_modified_at TIMESTAMP,             -- File's last modification timestamp
-
-    -- Processing results
-    row_count INTEGER DEFAULT 0,            -- Number of rows processed from this file
-    status TEXT NOT NULL DEFAULT 'pending', -- pending, processing, success, error
-    error_message TEXT,                     -- Error details if status='error'
-
-    -- Timestamps
-    first_processed_at TIMESTAMP,           -- When first processed
-    last_processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    processing_duration_seconds REAL,       -- How long processing took
-
-    -- Uniqueness: only one record per file path
-    -- Hash changes indicate file content changed (needs reprocessing)
-    UNIQUE(file_path)
-);
-
-- Index for fast lookup by file path
-CREATE INDEX IF NOT EXISTS idx_processed_files_path ON processed_files(file_path);
-
-- Index for finding files by status (e.g., find all pending or errored files)
-CREATE INDEX IF NOT EXISTS idx_processed_files_status ON processed_files(status);
-
-- Index for finding files by hash (detect if same file appears at different paths)
-CREATE INDEX IF NOT EXISTS idx_processed_files_hash ON processed_files(file_hash);
-
-- Index for finding recently processed files
-CREATE INDEX IF NOT EXISTS idx_processed_files_last_processed ON processed_files(last_processed_at);
-"""
-
-FILE_TRACKING_SCHEMA = f"""
-- File Tracking Schema
-- Supports incremental data loading
-
-{PROCESSED_FILES_SCHEMA}
-"""
-
-
 # =============================================================================
 # Combined Schemas
 # =============================================================================
@@ -520,29 +282,14 @@ REFERENCE_TABLES_SCHEMA = f"""
 {REF_DRUG_DIRECTORY_MAP_SCHEMA}

 {REF_DRUG_INDICATION_CLUSTERS_SCHEMA}
-
-{REF_DRUG_SNOMED_MAPPING_SCHEMA}
-"""
-
-FACT_TABLES_SCHEMA = f"""
-- Fact Tables Schema
-- Contains patient intervention data
-
-{FACT_INTERVENTIONS_SCHEMA}
 """

 ALL_TABLES_SCHEMA = f"""
 -- Complete Database Schema
-- Reference tables + Fact tables + Materialized views + File tracking + Pathway tables
+-- Reference tables + Pathway tables

 {REFERENCE_TABLES_SCHEMA}

-{FACT_TABLES_SCHEMA}
-
-{MATERIALIZED_VIEWS_SCHEMA}
-
-{FILE_TRACKING_SCHEMA}
-
 {PATHWAY_TABLES_SCHEMA}
 """

@@ -580,26 +327,10 @@ def drop_reference_tables(conn: sqlite3.Connection) -> None:
        DROP TABLE IF EXISTS ref_directories;
        DROP TABLE IF EXISTS ref_drug_directory_map;
        DROP TABLE IF EXISTS ref_drug_indication_clusters;
-        DROP TABLE IF EXISTS ref_drug_snomed_mapping;
    """)
    logger.info("Reference tables dropped")


-def create_drug_snomed_mapping_table(conn: sqlite3.Connection) -> None:
-    """
-    Create the ref_drug_snomed_mapping table for direct SNOMED code mapping.
-
-    This table stores mappings from drugs to SNOMED codes for GP record matching,
-    enabling diagnosis-based directorate assignment and indication-based pathways.
-
-    Args:
-        conn: SQLite database connection.
-    """
-    logger.info("Creating ref_drug_snomed_mapping table...")
-    conn.executescript(REF_DRUG_SNOMED_MAPPING_SCHEMA)
-    logger.info("ref_drug_snomed_mapping table created successfully")
-
-
 def get_reference_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    Get row counts for all reference tables.
@@ -616,7 +347,6 @@ def get_reference_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
        "ref_directories",
        "ref_drug_directory_map",
        "ref_drug_indication_clusters",
-        "ref_drug_snomed_mapping",
    ]
    counts = {}

@@ -647,7 +377,6 @@ def verify_reference_tables_exist(conn: sqlite3.Connection) -> list[str]:
        "ref_directories",
        "ref_drug_directory_map",
        "ref_drug_indication_clusters",
-        "ref_drug_snomed_mapping",
    ]
    missing = []

@@ -662,164 +391,6 @@ def verify_reference_tables_exist(conn: sqlite3.Connection) -> list[str]:
    return missing


-# =============================================================================
-# Fact Table Helper Functions
-# =============================================================================
-
-def create_fact_tables(conn: sqlite3.Connection) -> None:
-    """
-    Create all fact tables in the database (including materialized views).
-
-    Args:
-        conn: SQLite database connection.
-    """
-    logger.info("Creating fact tables...")
-    conn.executescript(FACT_TABLES_SCHEMA)
-    conn.executescript(MATERIALIZED_VIEWS_SCHEMA)
-    logger.info("Fact tables created successfully")
-
-
-def drop_fact_tables(conn: sqlite3.Connection) -> None:
-    """
-    Drop all fact tables from the database.
-
-    Args:
-        conn: SQLite database connection.
-
-    Warning:
-        This will delete all patient intervention data. Use with caution.
-    """
-    logger.warning("Dropping fact tables...")
-    conn.executescript("""
-        DROP TABLE IF EXISTS fact_interventions;
-        DROP TABLE IF EXISTS mv_patient_treatment_summary;
-    """)
-    logger.info("Fact tables dropped")
-
-
-def get_fact_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
-    """
-    Get row counts for all fact tables (including materialized views).
-
-    Args:
-        conn: SQLite database connection.
-
-    Returns:
-        Dictionary mapping table name to row count.
-    """
-    tables = ["fact_interventions", "mv_patient_treatment_summary"]
-    counts = {}
-
-    for table in tables:
-        cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
-        result = cursor.fetchone()
-        counts[table] = result[0] if result else 0
-
-    return counts
-
-
-def verify_fact_tables_exist(conn: sqlite3.Connection) -> list[str]:
-    """
-    Verify that all fact tables exist (including materialized views).
-
-    Args:
-        conn: SQLite database connection.
-
-    Returns:
-        List of missing table names. Empty list means all tables exist.
-    """
-    required_tables = ["fact_interventions", "mv_patient_treatment_summary"]
-    missing = []
-
-    for table in required_tables:
-        cursor = conn.execute(
-            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
-            (table,)
-        )
-        if cursor.fetchone() is None:
-            missing.append(table)
-
-    return missing
-
-
-# =============================================================================
-# File Tracking Helper Functions
-# =============================================================================
-
-def create_file_tracking_tables(conn: sqlite3.Connection) -> None:
-    """
-    Create file tracking tables in the database.
-
-    Args:
-        conn: SQLite database connection.
-    """
-    logger.info("Creating file tracking tables...")
-    conn.executescript(FILE_TRACKING_SCHEMA)
-    logger.info("File tracking tables created successfully")
-
-
-def drop_file_tracking_tables(conn: sqlite3.Connection) -> None:
-    """
-    Drop file tracking tables from the database.
-
-    Args:
-        conn: SQLite database connection.
-
-    Warning:
-        This will delete all file tracking history.
-    """
-    logger.warning("Dropping file tracking tables...")
-    conn.executescript("""
-        DROP TABLE IF EXISTS processed_files;
-    """)
-    logger.info("File tracking tables dropped")
-
-
-def get_file_tracking_counts(conn: sqlite3.Connection) -> dict[str, int]:
-    """
-    Get row counts for file tracking tables.
-
-    Args:
-        conn: SQLite database connection.
-
-    Returns:
-        Dictionary mapping table name to row count.
-    """
-    tables = ["processed_files"]
-    counts = {}
-
-    for table in tables:
-        cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
-        result = cursor.fetchone()
-        counts[table] = result[0] if result else 0
-
-    return counts
-
-
-def verify_file_tracking_tables_exist(conn: sqlite3.Connection) -> list[str]:
-    """
-    Verify that file tracking tables exist.
-
-    Args:
-        conn: SQLite database connection.
-
-    Returns:
-        List of missing table names. Empty list means all tables exist.
-    """
-    required_tables = ["processed_files"]
-    missing = []
-
-    for table in required_tables:
-        cursor = conn.execute(
-            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
-            (table,)
-        )
-        if cursor.fetchone() is None:
-            missing.append(table)
-
-    return missing
-
-
 # =============================================================================
 # Pathway Table Helper Functions
 # =============================================================================
@@ -1050,13 +621,37 @@ def migrate_pathway_nodes_chart_type(conn: sqlite3.Connection) -> tuple[bool, st
        return False, f"Migration failed: {e}"


+def migrate_refresh_log_source_row_count(conn: sqlite3.Connection) -> tuple[bool, str]:
+    """Add source_row_count column to pathway_refresh_log if it doesn't exist.
+
+    This column stores the Snowflake row count for display in the UI footer.
+    """
+    cursor = conn.execute("PRAGMA table_info(pathway_refresh_log)")
+    columns = [row[1] for row in cursor.fetchall()]
+
+    if "source_row_count" in columns:
+        return True, "source_row_count column already exists"
+
+    logger.info("Adding source_row_count column to pathway_refresh_log...")
+    try:
+        conn.execute("""
+            ALTER TABLE pathway_refresh_log
+            ADD COLUMN source_row_count INTEGER
+        """)
+        conn.commit()
+        return True, "Added source_row_count column"
+    except Exception as e:
+        logger.error(f"Failed to add source_row_count column: {e}")
+        return False, f"Migration failed: {e}"
+
+
 # =============================================================================
 # Combined Helper Functions
 # =============================================================================

 def create_all_tables(conn: sqlite3.Connection) -> None:
    """
-    Create all tables (reference + fact) in the database.
+    Create all tables (reference + pathway) in the database.

    Args:
        conn: SQLite database connection.
@@ -1078,8 +673,6 @@ def drop_all_tables(conn: sqlite3.Connection) -> None:
    """
    logger.warning("Dropping all tables...")
    drop_pathway_tables(conn)
-    drop_file_tracking_tables(conn)
-    drop_fact_tables(conn)
    drop_reference_tables(conn)
    logger.info("All tables dropped")

@@ -1096,8 +689,6 @@ def get_all_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    counts = {}
    counts.update(get_reference_table_counts(conn))
-    counts.update(get_fact_table_counts(conn))
-    counts.update(get_file_tracking_counts(conn))
    counts.update(get_pathway_table_counts(conn))
    return counts

@@ -1114,7 +705,5 @@ def verify_all_tables_exist(conn: sqlite3.Connection) -> list[str]:
    """
    missing = []
    missing.extend(verify_reference_tables_exist(conn))
-    missing.extend(verify_fact_tables_exist(conn))
-    missing.extend(verify_file_tracking_tables_exist(conn))
    missing.extend(verify_pathway_tables_exist(conn))
    return missing