refactor: slim pathways.db from 351 MB to 3.5 MB by removing unused tables

Drop fact_interventions (440K rows), mv_patient_treatment_summary (35K rows), ref_drug_snomed_mapping (144K rows), and processed_files — all unused since the app moved to pre-computed pathway_nodes. Key changes: - Rewrite load_data() to source from pathway_nodes + pathway_refresh_log - Remove 7 dead methods and 8 dead state vars from pathways_app.py - Delete patient_data.py, load_snomed_mapping.py, test_large_dataset_performance.py - Remove SQLiteDataLoader (depended on fact_interventions) - Remove file tracking schema (processed_files tracked fact_interventions loads) - Remove legacy diagnosis functions from diagnosis_lookup.py - Add source_row_count migration for pathway_refresh_log - Clean all cross-references in __init__.py, data_source.py, migrate.py
2026-02-06 08:51:03 +00:00
parent bb93c1673e
commit 778ed99ef6
11 changed files with 95 additions and 3653 deletions
@@ -24,15 +24,6 @@ from data_processing.schema import (
    REF_DRUG_DIRECTORY_MAP_SCHEMA,
    REF_DRUG_INDICATION_CLUSTERS_SCHEMA,
    REFERENCE_TABLES_SCHEMA,
-    # Fact table schemas
-    FACT_INTERVENTIONS_SCHEMA,
-    FACT_TABLES_SCHEMA,
-    # Materialized view schemas
-    MV_PATIENT_TREATMENT_SUMMARY_SCHEMA,
-    MATERIALIZED_VIEWS_SCHEMA,
-    # File tracking schemas
-    PROCESSED_FILES_SCHEMA,
-    FILE_TRACKING_SCHEMA,
    # Combined schema
    ALL_TABLES_SCHEMA,
    # Reference table functions
@@ -40,16 +31,6 @@ from data_processing.schema import (
    drop_reference_tables,
    get_reference_table_counts,
    verify_reference_tables_exist,
-    # Fact table functions
-    create_fact_tables,
-    drop_fact_tables,
-    get_fact_table_counts,
-    verify_fact_tables_exist,
-    # File tracking functions
-    create_file_tracking_tables,
-    drop_file_tracking_tables,
-    get_file_tracking_counts,
-    verify_file_tracking_tables_exist,
    # Combined functions
    create_all_tables,
    drop_all_tables,
@@ -81,27 +62,12 @@ from data_processing.reference_data import (
 from data_processing.loader import (
    DataLoader,
    FileDataLoader,
-    SQLiteDataLoader,
    LoadResult,
    get_loader,
    REQUIRED_COLUMNS,
    OPTIONAL_COLUMNS,
 )

-# Patient data migration functions
-from data_processing.patient_data import (
-    PatientDataLoadResult,
-    load_patient_data,
-    get_patient_data_stats,
-    list_processed_files,
-    calculate_file_hash,
-    # Materialized view functions
-    MVRefreshResult,
-    refresh_patient_treatment_summary,
-    get_patient_summary_stats,
-    verify_mv_consistency,
-)
-
 # Snowflake connector
 from data_processing.snowflake_connector import (
    SnowflakeConnector,
@@ -165,15 +131,6 @@ __all__ = [
    "REF_DRUG_DIRECTORY_MAP_SCHEMA",
    "REF_DRUG_INDICATION_CLUSTERS_SCHEMA",
    "REFERENCE_TABLES_SCHEMA",
-    # Fact table schemas
-    "FACT_INTERVENTIONS_SCHEMA",
-    "FACT_TABLES_SCHEMA",
-    # Materialized view schemas
-    "MV_PATIENT_TREATMENT_SUMMARY_SCHEMA",
-    "MATERIALIZED_VIEWS_SCHEMA",
-    # File tracking schemas
-    "PROCESSED_FILES_SCHEMA",
-    "FILE_TRACKING_SCHEMA",
    # Combined schema
    "ALL_TABLES_SCHEMA",
    # Reference table functions
@@ -181,16 +138,6 @@ __all__ = [
    "drop_reference_tables",
    "get_reference_table_counts",
    "verify_reference_tables_exist",
-    # Fact table functions
-    "create_fact_tables",
-    "drop_fact_tables",
-    "get_fact_table_counts",
-    "verify_fact_tables_exist",
-    # File tracking functions
-    "create_file_tracking_tables",
-    "drop_file_tracking_tables",
-    "get_file_tracking_counts",
-    "verify_file_tracking_tables_exist",
    # Combined functions
    "create_all_tables",
    "drop_all_tables",
@@ -216,22 +163,10 @@ __all__ = [
    # Data loader abstractions
    "DataLoader",
    "FileDataLoader",
-    "SQLiteDataLoader",
    "LoadResult",
    "get_loader",
    "REQUIRED_COLUMNS",
    "OPTIONAL_COLUMNS",
-    # Patient data migration
-    "PatientDataLoadResult",
-    "load_patient_data",
-    "get_patient_data_stats",
-    "list_processed_files",
-    "calculate_file_hash",
-    # Materialized view functions
-    "MVRefreshResult",
-    "refresh_patient_treatment_summary",
-    "get_patient_summary_stats",
-    "verify_mv_consistency",
    # Snowflake connector
    "SnowflakeConnector",
    "SnowflakeConnectionError",
@@ -232,9 +232,9 @@ class DataSourceManager:
            )

    def _check_sqlite_status(self) -> SourceStatus:
-        """Check if SQLite database is available with data."""
+        """Check if SQLite database is available with pathway data."""
        try:
-            from data_processing.database import default_db_manager, default_db_config
+            from data_processing.database import default_db_config

            db_path = self._sqlite_db_path or Path(default_db_config.db_path)

@@ -252,22 +252,22 @@ class DataSourceManager:
            config = DatabaseConfig(db_path=db_path)
            manager = DatabaseManager(config)

-            if not manager.table_exists("fact_interventions"):
+            if not manager.table_exists("pathway_nodes"):
                return SourceStatus(
                    source_type=DataSourceType.SQLITE,
                    available=False,
                    configured=True,
-                    message="fact_interventions table not found",
+                    message="pathway_nodes table not found",
                    last_checked=datetime.now(),
                )

-            count = manager.get_table_count("fact_interventions")
+            count = manager.get_table_count("pathway_nodes")
            if count == 0:
                return SourceStatus(
                    source_type=DataSourceType.SQLITE,
                    available=False,
                    configured=True,
-                    message="fact_interventions table is empty",
+                    message="pathway_nodes table is empty",
                    last_checked=datetime.now(),
                )

@@ -275,7 +275,7 @@ class DataSourceManager:
                source_type=DataSourceType.SQLITE,
                available=True,
                configured=True,
-                message=f"SQLite database ready ({count:,} rows)",
+                message=f"SQLite database ready ({count:,} pathway nodes)",
                last_checked=datetime.now(),
            )
        except Exception as e:
@@ -535,50 +535,14 @@ class DataSourceManager:
        drugs: Optional[list[str]],
        directories: Optional[list[str]],
    ) -> Optional[DataSourceResult]:
-        """Try to get data from SQLite."""
-        import time
+        """Try to get data from SQLite.

-        try:
-            from data_processing.loader import SQLiteDataLoader
-
-            # Determine database path
-            db_path = self._sqlite_db_path
-            if db_path is None:
-                from data_processing.database import default_db_config
-                db_path = Path(default_db_config.db_path)
-
-            loader = SQLiteDataLoader(
-                db_path=db_path,
-                date_range=(start_date, end_date) if start_date and end_date else None,
-                trusts=trusts,
-                drugs=drugs,
-                directories=directories,
-            )
-
-            # Check if source is valid
-            is_valid, msg = loader.validate_source()
-            if not is_valid:
-                logger.debug(f"SQLite not available: {msg}")
-                return None
-
-            start_time = time.time()
-            result = loader.load()
-            load_time = time.time() - start_time
-
-            logger.info(f"SQLite loaded {result.row_count} rows in {load_time:.2f}s")
-
-            return DataSourceResult(
-                df=result.df,
-                source_type=DataSourceType.SQLITE,
-                source_detail=str(db_path),
-                row_count=result.row_count,
-                cached=False,
-                from_fallback=False,
-                load_time_seconds=load_time,
-            )
-        except Exception as e:
-            logger.warning(f"SQLite query failed: {e}")
-            return None
+        Note: Raw intervention data is no longer stored in SQLite.
+        The app now uses pre-computed pathway_nodes via load_pathway_data().
+        This fallback is retained for interface compatibility but always returns None.
+        """
+        logger.debug("SQLite raw data fallback skipped (fact_interventions removed)")
+        return None

    def _try_file(
        self,
@@ -78,42 +78,6 @@ class DrugIndicationMatchRate:
    sample_unmatched: list[str] = field(default_factory=list)  # Sample patient IDs


-@dataclass
-class DrugSnomedMapping:
-    """SNOMED code mapping for a drug from ref_drug_snomed_mapping."""
-    snomed_code: str
-    snomed_description: str
-    search_term: str
-    primary_directorate: str
-    indication: str = ""
-    ta_id: str = ""
-
-
-@dataclass
-class DirectSnomedMatchResult:
-    """Result of direct SNOMED code lookup in GP records."""
-    patient_pseudonym: str
-    matched: bool
-    snomed_code: Optional[str] = None
-    snomed_description: Optional[str] = None
-    search_term: Optional[str] = None
-    primary_directorate: Optional[str] = None
-    event_date: Optional[datetime] = None
-    source: str = "DIRECT_SNOMED"  # DIRECT_SNOMED | NONE
-
-
-@dataclass
-class DirectorateAssignment:
-    """Result of directorate assignment for a patient-drug combination."""
-    upid: str
-    drug_name: str
-    directorate: Optional[str]
-    search_term: Optional[str] = None
-    source: str = "FALLBACK"  # DIAGNOSIS | FALLBACK
-    snomed_code: Optional[str] = None
-    event_date: Optional[datetime] = None
-
-
 def get_drug_clusters(
    drug_name: str,
    db_manager: Optional[DatabaseManager] = None
@@ -180,266 +144,6 @@ def get_drug_cluster_ids(
    return list(set(c["cluster_id"] for c in clusters))


-def get_drug_snomed_codes(
-    drug_name: str,
-    db_manager: Optional[DatabaseManager] = None
-) -> list[DrugSnomedMapping]:
-    """
-    Get all SNOMED codes for a drug from local ref_drug_snomed_mapping table.
-
-    This uses the enriched mapping CSV data loaded into SQLite, which provides
-    direct SNOMED-to-drug mappings with Search_Term and PrimaryDirectorate.
-
-    Args:
-        drug_name: Drug name to look up (case-insensitive, matches cleaned_drug_name)
-        db_manager: Optional DatabaseManager (defaults to default_db_manager)
-
-    Returns:
-        List of DrugSnomedMapping with snomed_code, snomed_description,
-        search_term, primary_directorate, indication, ta_id
-    """
-    if db_manager is None:
-        db_manager = default_db_manager
-
-    query = """
-        SELECT DISTINCT
-            snomed_code,
-            snomed_description,
-            search_term,
-            primary_directorate,
-            indication,
-            ta_id
-        FROM ref_drug_snomed_mapping
-        WHERE UPPER(cleaned_drug_name) = UPPER(?)
-           OR UPPER(drug_name) = UPPER(?)
-        ORDER BY search_term, snomed_code
-    """
-
-    try:
-        with db_manager.get_connection() as conn:
-            cursor = conn.execute(query, (drug_name, drug_name))
-            rows = cursor.fetchall()
-
-            results = []
-            for row in rows:
-                results.append(DrugSnomedMapping(
-                    snomed_code=row["snomed_code"],
-                    snomed_description=row["snomed_description"] or "",
-                    search_term=row["search_term"] or "",
-                    primary_directorate=row["primary_directorate"] or "",
-                    indication=row["indication"] or "",
-                    ta_id=row["ta_id"] or "",
-                ))
-
-            logger.debug(f"Found {len(results)} SNOMED mappings for drug '{drug_name}'")
-            return results
-
-    except Exception as e:
-        logger.error(f"Error getting SNOMED codes for drug '{drug_name}': {e}")
-        return []
-
-
-def patient_has_indication_direct(
-    patient_pseudonym: str,
-    drug_snomed_mappings: list[DrugSnomedMapping],
-    connector: Optional[SnowflakeConnector] = None,
-    before_date: Optional[date] = None,
-) -> DirectSnomedMatchResult:
-    """
-    Check if patient has any of the SNOMED codes in their GP records.
-
-    This is the direct SNOMED lookup - it queries PrimaryCareClinicalCoding
-    for exact SNOMED code matches (not via cluster). Returns the most recent
-    match by EventDateTime if multiple matches exist.
-
-    Args:
-        patient_pseudonym: Patient's pseudonymised NHS number
-        drug_snomed_mappings: List of DrugSnomedMapping from get_drug_snomed_codes()
-        connector: Optional SnowflakeConnector (defaults to singleton)
-        before_date: Optional date - only check diagnoses before this date
-
-    Returns:
-        DirectSnomedMatchResult with match details (most recent by EventDateTime)
-    """
-    result = DirectSnomedMatchResult(
-        patient_pseudonym=patient_pseudonym,
-        matched=False,
-        source="NONE",
-    )
-
-    if not drug_snomed_mappings:
-        return result
-
-    if not SNOWFLAKE_AVAILABLE:
-        logger.warning("Snowflake connector not available")
-        return result
-
-    if not is_snowflake_configured():
-        logger.warning("Snowflake not configured - cannot check GP records")
-        return result
-
-    if connector is None:
-        connector = get_connector()
-
-    # Build lookup dict for mapping snomed_code -> (search_term, primary_directorate, snomed_description)
-    snomed_lookup = {
-        m.snomed_code: (m.search_term, m.primary_directorate, m.snomed_description)
-        for m in drug_snomed_mappings
-    }
-
-    # Get unique SNOMED codes
-    snomed_codes = list(snomed_lookup.keys())
-
-    # Build placeholders for SNOMED codes
-    placeholders = ", ".join(["%s"] * len(snomed_codes))
-
-    # Query to find most recent matching SNOMED code in GP records
-    query = f'''
-        SELECT
-            "SNOMEDCode",
-            "EventDateTime"
-        FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
-        WHERE "PatientPseudonym" = %s
-            AND "SNOMEDCode" IN ({placeholders})
-    '''
-
-    params: list = [patient_pseudonym] + snomed_codes
-
-    if before_date:
-        query += ' AND "EventDateTime" < %s'
-        params.append(before_date.isoformat())
-
-    query += ' ORDER BY "EventDateTime" DESC LIMIT 1'
-
-    try:
-        results = connector.execute_dict(query, tuple(params))
-
-        if results:
-            row = results[0]
-            matched_code = row.get("SNOMEDCode")
-            event_dt = row.get("EventDateTime")
-
-            if matched_code and matched_code in snomed_lookup:
-                search_term, primary_dir, snomed_desc = snomed_lookup[matched_code]
-
-                return DirectSnomedMatchResult(
-                    patient_pseudonym=patient_pseudonym,
-                    matched=True,
-                    snomed_code=matched_code,
-                    snomed_description=snomed_desc,
-                    search_term=search_term,
-                    primary_directorate=primary_dir,
-                    event_date=event_dt,
-                    source="DIRECT_SNOMED",
-                )
-
-        return result
-
-    except Exception as e:
-        logger.error(f"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}")
-        return result
-
-
-def get_directorate_from_diagnosis(
-    upid: str,
-    drug_name: str,
-    connector: Optional[SnowflakeConnector] = None,
-    db_manager: Optional[DatabaseManager] = None,
-    before_date: Optional[date] = None,
-) -> DirectorateAssignment:
-    """
-    Get directorate assignment for a patient-drug combination using diagnosis-based lookup.
-
-    This function attempts to assign a directorate based on the patient's GP records
-    (direct SNOMED code matching). If no match is found, it returns a FALLBACK result
-    indicating that the caller should use alternative assignment methods (e.g.,
-    department_identification() from tools/data.py).
-
-    Workflow:
-    1. Get all SNOMED codes for the drug from ref_drug_snomed_mapping
-    2. Query patient's GP records for matching SNOMED codes
-    3. If match found → return diagnosis-based directorate and search_term
-    4. If no match → return FALLBACK result (caller handles fallback logic)
-
-    Args:
-        upid: Patient's unique patient ID (Provider Code[:3] + PersonKey)
-        drug_name: Drug name to look up
-        connector: Optional SnowflakeConnector (defaults to singleton)
-        db_manager: Optional DatabaseManager (defaults to default_db_manager)
-        before_date: Optional date - only check diagnoses before this date
-
-    Returns:
-        DirectorateAssignment with directorate, search_term, and source
-    """
-    result = DirectorateAssignment(
-        upid=upid,
-        drug_name=drug_name,
-        directorate=None,
-        source="FALLBACK",
-    )
-
-    # Step 1: Get SNOMED codes for the drug
-    drug_snomed_mappings = get_drug_snomed_codes(drug_name, db_manager)
-
-    if not drug_snomed_mappings:
-        logger.debug(f"No SNOMED mappings found for drug '{drug_name}' - using fallback")
-        return result
-
-    # Step 2: Check Snowflake availability
-    if not SNOWFLAKE_AVAILABLE:
-        logger.debug("Snowflake not available - using fallback")
-        return result
-
-    if not is_snowflake_configured():
-        logger.debug("Snowflake not configured - using fallback")
-        return result
-
-    # Step 3: Get patient pseudonym from UPID
-    # UPID format is Provider Code (3 chars) + PersonKey
-    # We need to query Snowflake to get the PatientPseudonym for this PersonKey
-    # However, patient_has_indication_direct expects PatientPseudonym, not UPID
-    # For now, we'll use UPID as the identifier - the actual integration
-    # will need to happen at the DataFrame level where we have PersonKey
-    #
-    # NOTE: This function will be called from the pipeline where we have
-    # access to PatientPseudonym. The UPID is passed for logging/tracking.
-
-    # Actually, looking at the pipeline, we need PatientPseudonym, not UPID.
-    # The caller should pass the PatientPseudonym or we need to look it up.
-    # For now, let's assume the caller will use this in a batch context
-    # where they can map UPID -> PatientPseudonym.
-
-    # Let me reconsider: the function signature takes UPID but we need
-    # PatientPseudonym for Snowflake. In the pipeline context (fetch_and_transform_data),
-    # we'll have the PersonKey column which IS the PatientPseudonym.
-    # So UPID = ProviderCode[:3] + PersonKey, and PersonKey = PatientPseudonym.
-    #
-    # We can extract PatientPseudonym from UPID by removing the first 3 chars.
-    patient_pseudonym = upid[3:] if len(upid) > 3 else upid
-
-    # Step 4: Check patient's GP records for matching SNOMED codes
-    match_result = patient_has_indication_direct(
-        patient_pseudonym=patient_pseudonym,
-        drug_snomed_mappings=drug_snomed_mappings,
-        connector=connector,
-        before_date=before_date,
-    )
-
-    if match_result.matched and match_result.primary_directorate:
-        return DirectorateAssignment(
-            upid=upid,
-            drug_name=drug_name,
-            directorate=match_result.primary_directorate,
-            search_term=match_result.search_term,
-            source="DIAGNOSIS",
-            snomed_code=match_result.snomed_code,
-            event_date=match_result.event_date,
-        )
-
-    # No match found - return fallback result
-    return result
-
-
 def get_cluster_snomed_codes(
    cluster_id: str,
    connector: Optional[SnowflakeConnector] = None,
@@ -864,229 +568,6 @@ def get_available_clusters(
        return []


-def batch_lookup_indication_groups(
-    df: "pd.DataFrame",
-    connector: Optional[SnowflakeConnector] = None,
-    db_manager: Optional[DatabaseManager] = None,
-    batch_size: int = 500,
-) -> "pd.DataFrame":
-    """
-    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.
-
-    This is the efficient batch version of get_directorate_from_diagnosis().
-    Instead of querying Snowflake per patient, it batches the lookups for performance.
-
-    Strategy:
-    1. Get all unique (PersonKey, Drug Name) pairs from DataFrame
-    2. For each unique drug, get all SNOMED codes from local SQLite
-    3. Build batched Snowflake queries to check GP records
-    4. Return indication_df mapping UPID → Indication_Group
-
-    For unmatched patients, Indication_Group will be their Directory (with suffix).
-
-    Args:
-        df: DataFrame with columns: UPID, Drug Name, Directory, PersonKey
-        connector: Optional SnowflakeConnector (defaults to singleton)
-        db_manager: Optional DatabaseManager (defaults to default_db_manager)
-        batch_size: Number of patients per Snowflake query batch
-
-    Returns:
-        DataFrame with columns: UPID, Indication_Group, Source
-        - Indication_Group: Search_Term (if matched) or "Directory (no GP dx)" (if not)
-        - Source: "DIAGNOSIS" or "FALLBACK"
-    """
-    import pandas as pd
-
-    if db_manager is None:
-        db_manager = default_db_manager
-
-    logger.info(f"Starting batch indication lookup for {len(df)} records...")
-
-    # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations
-    # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records
-    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records
-    if 'PseudoNHSNoLinked' not in df.columns:
-        logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records")
-        # Return fallback for all patients
-        result_df = df[['UPID', 'Directory']].drop_duplicates().copy()
-        result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
-        result_df['Source'] = "FALLBACK"
-        return result_df[['UPID', 'Indication_Group', 'Source']]
-
-    # Get unique patient-drug combinations (we need one lookup per patient-drug pair)
-    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()
-    logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")
-
-    # Step 2: Get all unique drugs and their SNOMED codes
-    unique_drugs = unique_pairs['Drug Name'].unique()
-    logger.info(f"Building SNOMED lookup for {len(unique_drugs)} unique drugs...")
-
-    # Build drug -> list of DrugSnomedMapping dict
-    drug_snomed_map: dict[str, list[DrugSnomedMapping]] = {}
-    all_snomed_codes: set[str] = set()
-    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}  # snomed -> [(drug, search_term, primary_dir), ...]
-
-    for drug_name in unique_drugs:
-        mappings = get_drug_snomed_codes(drug_name, db_manager)
-        drug_snomed_map[drug_name] = mappings
-
-        for m in mappings:
-            all_snomed_codes.add(m.snomed_code)
-            if m.snomed_code not in snomed_to_drug_searchterm:
-                snomed_to_drug_searchterm[m.snomed_code] = []
-            snomed_to_drug_searchterm[m.snomed_code].append(
-                (drug_name, m.search_term, m.primary_directorate)
-            )
-
-    logger.info(f"Total SNOMED codes to check: {len(all_snomed_codes)}")
-
-    # Step 3: Check Snowflake availability
-    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():
-        logger.warning("Snowflake not available - returning fallback for all patients")
-        result_df = unique_pairs[['UPID', 'Directory']].copy()
-        result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
-        result_df['Source'] = "FALLBACK"
-        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])
-
-    if connector is None:
-        connector = get_connector()
-
-    # Step 4: Query GP records for all patients in batches
-    # The query finds the most recent matching SNOMED code for each patient
-
-    # Get unique PseudoNHSNoLinked values (each = one patient in GP records)
-    unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])
-    patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()
-
-    logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...")
-
-    # Results dict: PersonKey -> (snomed_code, event_date)
-    gp_matches: dict[str, tuple[str, Any]] = {}
-
-    # Convert SNOMED codes to list for query
-    snomed_list = list(all_snomed_codes)
-
-    if not snomed_list:
-        logger.warning("No SNOMED codes to check - returning fallback for all patients")
-        result_df = unique_pairs[['UPID', 'Directory']].copy()
-        result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
-        result_df['Source'] = "FALLBACK"
-        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])
-
-    # Build SNOMED IN clause (reused across batches)
-    snomed_placeholders = ", ".join(["%s"] * len(snomed_list))
-
-    # Process patients in batches
-    for batch_start in range(0, len(patient_pseudonyms), batch_size):
-        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))
-        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]
-
-        logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}")
-
-        # Build patient IN clause
-        patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))
-
-        # Query to find all matching SNOMED codes for these patients
-        # We'll get all matches and pick the most recent per patient in Python
-        query = f'''
-            SELECT
-                "PatientPseudonym",
-                "SNOMEDCode",
-                "EventDateTime"
-            FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
-            WHERE "PatientPseudonym" IN ({patient_placeholders})
-              AND "SNOMEDCode" IN ({snomed_placeholders})
-            ORDER BY "PatientPseudonym", "EventDateTime" DESC
-        '''
-
-        params = tuple(batch_pseudonyms) + tuple(snomed_list)
-
-        try:
-            results = connector.execute_dict(query, params)
-
-            # Process results - pick most recent per patient
-            for row in results:
-                person_key = row.get("PatientPseudonym")
-                snomed_code = row.get("SNOMEDCode")
-                event_date = row.get("EventDateTime")
-
-                if person_key and snomed_code:
-                    # Keep only if we haven't seen this patient yet (first = most recent due to ORDER BY)
-                    if person_key not in gp_matches:
-                        gp_matches[person_key] = (snomed_code, event_date)
-
-        except Exception as e:
-            logger.error(f"Error querying GP records for batch: {e}")
-            # Continue with other batches
-
-    logger.info(f"Found GP matches for {len(gp_matches)} patients")
-
-    # Step 5: Build result DataFrame
-    # For each unique_pair, determine Indication_Group based on match status
-    results_list = []
-
-    # We need to dedupe by UPID - a patient might be on multiple drugs
-    # Strategy: For each UPID, use the most recent match (if any)
-    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)
-
-    for _, row in unique_pairs.iterrows():
-        upid = row['UPID']
-        drug_name = row['Drug Name']
-        patient_pseudonym = row['PseudoNHSNoLinked']
-        directory = row['Directory']
-
-        # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)
-        if patient_pseudonym in gp_matches:
-            matched_snomed, event_date = gp_matches[patient_pseudonym]
-
-            # Find the search_term for this SNOMED code and drug
-            # (A SNOMED code might map to multiple drugs with different search_terms)
-            if matched_snomed in snomed_to_drug_searchterm:
-                # Look for match with current drug first
-                search_term = None
-                for drug, st, pd in snomed_to_drug_searchterm[matched_snomed]:
-                    if drug.upper() == drug_name.upper():
-                        search_term = st
-                        break
-                # If no drug-specific match, use any match
-                if search_term is None:
-                    search_term = snomed_to_drug_searchterm[matched_snomed][0][1]
-
-                # Only update if we don't have a match for this UPID yet
-                if upid not in upid_to_match:
-                    upid_to_match[upid] = (search_term, "DIAGNOSIS")
-            else:
-                # Shouldn't happen but fallback just in case
-                if upid not in upid_to_match:
-                    upid_to_match[upid] = (directory + " (no GP dx)", "FALLBACK")
-        else:
-            # No GP match - use fallback
-            if upid not in upid_to_match:
-                upid_to_match[upid] = (directory + " (no GP dx)", "FALLBACK")
-
-    # Build result DataFrame
-    for upid, (indication_group, source) in upid_to_match.items():
-        results_list.append({
-            'UPID': upid,
-            'Indication_Group': indication_group,
-            'Source': source,
-        })
-
-    result_df = pd.DataFrame(results_list)
-
-    # Log statistics
-    diagnosis_count = len([s for s in result_df['Source'] if s == "DIAGNOSIS"])
-    fallback_count = len([s for s in result_df['Source'] if s == "FALLBACK"])
-    total = len(result_df)
-
-    logger.info(f"Indication lookup complete:")
-    logger.info(f"  Total unique patients: {total}")
-    logger.info(f"  DIAGNOSIS matches: {diagnosis_count} ({100*diagnosis_count/total:.1f}%)")
-    logger.info(f"  FALLBACK (no GP match): {fallback_count} ({100*fallback_count/total:.1f}%)")
-
-    return result_df
-
-
 # === Drug-to-indication mapping from DimSearchTerm.csv ===


@@ -1713,10 +1194,7 @@ __all__ = [
    "ClusterSnomedCodes",
    "IndicationValidationResult",
    "DrugIndicationMatchRate",
-    "DrugSnomedMapping",
-    "DirectSnomedMatchResult",
-    "DirectorateAssignment",
-    # Cluster-based lookup functions (existing)
+    # Cluster-based lookup functions
    "get_drug_clusters",
    "get_drug_cluster_ids",
    "get_cluster_snomed_codes",
@@ -1725,20 +1203,13 @@ __all__ = [
    "get_indication_match_rate",
    "batch_validate_indications",
    "get_available_clusters",
-    # Direct SNOMED lookup functions (new)
-    "get_drug_snomed_codes",
-    "patient_has_indication_direct",
-    # Diagnosis-based directorate assignment
-    "get_directorate_from_diagnosis",
-    # Batch lookup for indication groups
-    "batch_lookup_indication_groups",
    # Drug-indication mapping from DimSearchTerm.csv
    "SEARCH_TERM_MERGE_MAP",
    "load_drug_indication_mapping",
    "get_search_terms_for_drug",
    # Drug-aware indication assignment
    "assign_drug_indications",
-    # Snowflake-direct indication lookup (new approach)
+    # Snowflake-direct indication lookup
    "get_patient_indication_groups",
    "CLUSTER_MAPPING_SQL",
 ]
@@ -1,401 +0,0 @@
-"""
-Load enriched SNOMED mapping data into SQLite database.
-
-This module loads the drug_snomed_mapping_enriched.csv file into the
-ref_drug_snomed_mapping table for direct GP record matching.
-
-Source file: data/drug_snomed_mapping_enriched.csv (163K rows)
-Target table: ref_drug_snomed_mapping
-
-Usage:
-    python -m data_processing.load_snomed_mapping
-
-Columns mapped:
-    Drug -> drug_name
-    Indication -> indication
-    TA_ID -> ta_id
-    Search_Term -> search_term
-    SNOMEDCode -> snomed_code (cleaned: removes trailing .0)
-    SNOMEDDescription -> snomed_description
-    CleanedDrugName -> cleaned_drug_name
-    PrimaryDirectorate -> primary_directorate
-    AllDirectorates -> all_directorates
-"""
-
-from pathlib import Path
-from typing import Optional
-
-from core.logging_config import get_logger
-from data_processing.database import DatabaseManager
-from data_processing.reference_data import MigrationResult, _read_csv_with_fallback_encoding
-
-logger = get_logger(__name__)
-
-DEFAULT_CSV_PATH = Path("./data/drug_snomed_mapping_enriched.csv")
-
-
-def clean_snomed_code(snomed_code: str) -> str:
-    """
-    Clean SNOMED code by removing trailing .0 suffix and handling scientific notation.
-
-    The enriched CSV has SNOMED codes that may be in decimal notation (e.g., "156370009.0")
-    or scientific notation (e.g., "1.0629311000119108e+16") due to pandas/Excel export.
-    These need to be converted to clean integer strings.
-
-    Args:
-        snomed_code: Raw SNOMED code from CSV.
-
-    Returns:
-        Cleaned SNOMED code as string (e.g., "156370009" or "10629311000119108").
-    """
-    if not snomed_code:
-        return ""
-
-    code = snomed_code.strip()
-
-    # Handle scientific notation (e.g., "1.0629311000119108e+16")
-    if 'e' in code.lower():
-        try:
-            # Convert to float first, then to int, then to string
-            # Using int() directly on the float preserves precision for SNOMED codes
-            value = float(code)
-            # Check if it's a whole number (no decimal part)
-            if value == int(value):
-                return str(int(value))
-            else:
-                # Has decimal part - return as cleaned float
-                return str(value).replace('.0', '')
-        except (ValueError, OverflowError):
-            # If conversion fails, return as-is but cleaned
-            return code
-
-    # Remove trailing .0 if present (for non-scientific notation)
-    if code.endswith(".0"):
-        code = code[:-2]
-
-    return code
-
-
-def migrate_drug_snomed_mapping(
-    db_manager: Optional[DatabaseManager] = None,
-    csv_path: Optional[Path] = None
-) -> MigrationResult:
-    """
-    Migrate drug SNOMED mappings from CSV to SQLite ref_drug_snomed_mapping table.
-
-    Source file format (with header):
-        Drug,Indication,TA_ID,Search_Term,SNOMEDCode,SNOMEDDescription,
-        CleanedDrugName,PrimaryDirectorate,AllDirectorates
-
-    Example rows:
-        ABATACEPT,Psoriatic arthritis after DMARDs,TA568,psoriatic arthritis,
-        156370009.0,Psoriatic arthritis,ABATACEPT,RHEUMATOLOGY,RHEUMATOLOGY|DERMATOLOGY
-
-    Args:
-        db_manager: DatabaseManager instance. Uses default if not provided.
-        csv_path: Path to the CSV file. Defaults to data/drug_snomed_mapping_enriched.csv.
-
-    Returns:
-        MigrationResult with statistics about the migration.
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-    if csv_path is None:
-        csv_path = DEFAULT_CSV_PATH
-
-    table_name = "ref_drug_snomed_mapping"
-
-    logger.info(f"Migrating drug SNOMED mappings from {csv_path} to {table_name}")
-
-    if not csv_path.exists():
-        error_msg = f"Source file not found: {csv_path}"
-        logger.error(error_msg)
-        return MigrationResult(
-            table_name=table_name,
-            source_file=str(csv_path),
-            rows_read=0,
-            rows_inserted=0,
-            rows_skipped=0,
-            success=False,
-            error_message=error_msg
-        )
-
-    rows_read = 0
-    rows_inserted = 0
-    rows_skipped = 0
-
-    try:
-        with db_manager.get_transaction() as conn:
-            rows = _read_csv_with_fallback_encoding(csv_path)
-
-            for i, row in enumerate(rows):
-                # Skip header row
-                if i == 0 and len(row) >= 5 and row[0].strip().lower() == "drug":
-                    logger.debug("Skipping header row")
-                    continue
-
-                rows_read += 1
-
-                # Validate row format (need at least: Drug, Indication, TA_ID, Search_Term, SNOMEDCode)
-                if len(row) < 5:
-                    logger.warning(f"Skipping malformed row {rows_read}: {row}")
-                    rows_skipped += 1
-                    continue
-
-                drug_name = row[0].strip()
-                indication = row[1].strip()
-                ta_id = row[2].strip() if len(row) > 2 else ""
-                search_term = row[3].strip()
-                snomed_code_raw = row[4].strip() if len(row) > 4 else ""
-                snomed_description = row[5].strip() if len(row) > 5 else ""
-                cleaned_drug_name = row[6].strip() if len(row) > 6 else drug_name.upper()
-                primary_directorate = row[7].strip() if len(row) > 7 else ""
-                all_directorates = row[8].strip() if len(row) > 8 else ""
-
-                # Skip if required fields are empty
-                if not drug_name or not indication or not search_term or not snomed_code_raw:
-                    logger.warning(f"Skipping row {rows_read} with empty required fields")
-                    rows_skipped += 1
-                    continue
-
-                # Clean SNOMED code (remove trailing .0)
-                snomed_code = clean_snomed_code(snomed_code_raw)
-
-                if not snomed_code:
-                    logger.warning(f"Skipping row {rows_read} with invalid SNOMED code: {snomed_code_raw}")
-                    rows_skipped += 1
-                    continue
-
-                cursor = conn.execute(
-                    """
-                    INSERT OR IGNORE INTO ref_drug_snomed_mapping
-                    (drug_name, indication, ta_id, search_term, snomed_code, snomed_description,
-                     cleaned_drug_name, primary_directorate, all_directorates)
-                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
-                    """,
-                    (
-                        drug_name,
-                        indication,
-                        ta_id,
-                        search_term,
-                        snomed_code,
-                        snomed_description,
-                        cleaned_drug_name,
-                        primary_directorate,
-                        all_directorates,
-                    )
-                )
-
-                if cursor.rowcount > 0:
-                    rows_inserted += 1
-                else:
-                    rows_skipped += 1
-
-                # Log progress every 10000 rows
-                if rows_read % 10000 == 0:
-                    logger.info(f"Processed {rows_read} rows, inserted {rows_inserted}")
-
-        logger.info(
-            f"Drug SNOMED mapping migration complete: {rows_read} rows read, "
-            f"{rows_inserted} inserted, {rows_skipped} skipped"
-        )
-
-        return MigrationResult(
-            table_name=table_name,
-            source_file=str(csv_path),
-            rows_read=rows_read,
-            rows_inserted=rows_inserted,
-            rows_skipped=rows_skipped,
-            success=True
-        )
-
-    except Exception as e:
-        error_msg = f"Migration failed: {e}"
-        logger.error(error_msg)
-        return MigrationResult(
-            table_name=table_name,
-            source_file=str(csv_path),
-            rows_read=rows_read,
-            rows_inserted=0,
-            rows_skipped=0,
-            success=False,
-            error_message=error_msg
-        )
-
-
-def get_drug_snomed_mapping_counts(db_manager: Optional[DatabaseManager] = None) -> dict:
-    """
-    Get statistics about the ref_drug_snomed_mapping table.
-
-    Args:
-        db_manager: DatabaseManager instance. Uses default if not provided.
-
-    Returns:
-        Dictionary with:
-        - total_mappings: Total rows in table
-        - unique_drugs: Count of distinct drug names
-        - unique_search_terms: Count of distinct search terms
-        - unique_snomed_codes: Count of distinct SNOMED codes
-        - unique_indications: Count of distinct indications
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-
-    with db_manager.get_connection() as conn:
-        cursor = conn.execute("SELECT COUNT(*) FROM ref_drug_snomed_mapping")
-        total = cursor.fetchone()[0]
-
-        cursor = conn.execute("SELECT COUNT(DISTINCT drug_name) FROM ref_drug_snomed_mapping")
-        unique_drugs = cursor.fetchone()[0]
-
-        cursor = conn.execute("SELECT COUNT(DISTINCT search_term) FROM ref_drug_snomed_mapping")
-        unique_search_terms = cursor.fetchone()[0]
-
-        cursor = conn.execute("SELECT COUNT(DISTINCT snomed_code) FROM ref_drug_snomed_mapping")
-        unique_snomed_codes = cursor.fetchone()[0]
-
-        cursor = conn.execute("SELECT COUNT(DISTINCT indication) FROM ref_drug_snomed_mapping")
-        unique_indications = cursor.fetchone()[0]
-
-        return {
-            "total_mappings": total,
-            "unique_drugs": unique_drugs,
-            "unique_search_terms": unique_search_terms,
-            "unique_snomed_codes": unique_snomed_codes,
-            "unique_indications": unique_indications,
-        }
-
-
-def verify_drug_snomed_mapping_migration(
-    db_manager: Optional[DatabaseManager] = None,
-    csv_path: Optional[Path] = None
-) -> tuple[bool, str]:
-    """
-    Verify that drug SNOMED mappings were migrated correctly.
-
-    Checks:
-    - Row count is reasonable (163K+ expected)
-    - Unique search terms is reasonable (187 expected)
-    - Sample lookups return expected values
-
-    Args:
-        db_manager: DatabaseManager instance. Uses default if not provided.
-        csv_path: Path to the CSV file. Defaults to data/drug_snomed_mapping_enriched.csv.
-
-    Returns:
-        Tuple of (success: bool, message: str)
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-    if csv_path is None:
-        csv_path = DEFAULT_CSV_PATH
-
-    stats = get_drug_snomed_mapping_counts(db_manager)
-
-    # Basic sanity checks
-    if stats["total_mappings"] < 100000:
-        return False, f"Too few rows: expected 163K+, got {stats['total_mappings']}"
-
-    if stats["unique_search_terms"] < 100:
-        return False, f"Too few search terms: expected ~187, got {stats['unique_search_terms']}"
-
-    # Sample lookup verification
-    with db_manager.get_connection() as conn:
-        # Check that ABATACEPT exists (from sample data)
-        cursor = conn.execute(
-            "SELECT COUNT(*) FROM ref_drug_snomed_mapping WHERE drug_name = 'ABATACEPT'"
-        )
-        abatacept_count = cursor.fetchone()[0]
-        if abatacept_count == 0:
-            return False, "Sample drug ABATACEPT not found in table"
-
-        # Check that SNOMED codes were cleaned (no .0 suffix)
-        cursor = conn.execute(
-            "SELECT COUNT(*) FROM ref_drug_snomed_mapping WHERE snomed_code LIKE '%.0'"
-        )
-        dirty_codes = cursor.fetchone()[0]
-        if dirty_codes > 0:
-            return False, f"Found {dirty_codes} SNOMED codes with uncleaned .0 suffix"
-
-    return True, (
-        f"Verified {stats['total_mappings']:,} mappings: "
-        f"{stats['unique_drugs']} drugs, "
-        f"{stats['unique_search_terms']} search terms, "
-        f"{stats['unique_snomed_codes']:,} SNOMED codes"
-    )
-
-
-def main():
-    """CLI entry point for loading SNOMED mapping data."""
-    import argparse
-
-    parser = argparse.ArgumentParser(
-        description="Load drug SNOMED mapping data into SQLite database"
-    )
-    parser.add_argument(
-        "--csv",
-        type=Path,
-        default=DEFAULT_CSV_PATH,
-        help=f"Path to CSV file (default: {DEFAULT_CSV_PATH})"
-    )
-    parser.add_argument(
-        "--verify-only",
-        action="store_true",
-        help="Only verify existing data, don't migrate"
-    )
-    parser.add_argument(
-        "-v", "--verbose",
-        action="store_true",
-        help="Enable verbose logging"
-    )
-
-    args = parser.parse_args()
-
-    # Configure logging
-    import logging
-    if args.verbose:
-        logging.basicConfig(level=logging.DEBUG)
-    else:
-        logging.basicConfig(level=logging.INFO)
-
-    if args.verify_only:
-        print("Verifying existing data...")
-        success, message = verify_drug_snomed_mapping_migration(csv_path=args.csv)
-        if success:
-            print(f"[OK] Verification passed: {message}")
-        else:
-            print(f"[FAILED] Verification failed: {message}")
-        return 0 if success else 1
-
-    # Run migration
-    print(f"Loading SNOMED mapping from {args.csv}...")
-    result = migrate_drug_snomed_mapping(csv_path=args.csv)
-
-    if result.success:
-        print(f"[OK] {result}")
-
-        # Show statistics
-        stats = get_drug_snomed_mapping_counts()
-        print(f"\nTable statistics:")
-        print(f"  Total mappings: {stats['total_mappings']:,}")
-        print(f"  Unique drugs: {stats['unique_drugs']}")
-        print(f"  Unique search terms: {stats['unique_search_terms']}")
-        print(f"  Unique SNOMED codes: {stats['unique_snomed_codes']:,}")
-        print(f"  Unique indications: {stats['unique_indications']}")
-
-        # Verify
-        success, message = verify_drug_snomed_mapping_migration(csv_path=args.csv)
-        if success:
-            print(f"\n[OK] Verification: {message}")
-        else:
-            print(f"\n[WARNING] Verification: {message}")
-            return 1
-    else:
-        print(f"[FAILED] {result}")
-        return 1
-
-    return 0
-
-
-if __name__ == "__main__":
-    exit(main())
@@ -11,7 +11,6 @@ The DataLoader ABC defines the contract for all loader implementations.

 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from datetime import date
 from pathlib import Path
 from typing import Optional

@@ -29,7 +28,7 @@ class LoadResult:

    Attributes:
        df: The loaded DataFrame with processed patient intervention data
-        source: Description of the data source (e.g., "csv:/path/to/file.csv", "sqlite:fact_interventions")
+        source: Description of the data source (e.g., "file:/path/to/file.csv")
        row_count: Number of rows loaded
        columns: List of column names in the DataFrame
        load_time_seconds: Time taken to load the data
@@ -224,150 +223,6 @@ class FileDataLoader(DataLoader):
        )


-class SQLiteDataLoader(DataLoader):
-    """Loads data from SQLite fact_interventions table.
-
-    This provides faster loading by reading pre-processed data from SQLite
-    instead of re-processing CSV files each time.
-
-    The SQLite database must have been populated by the migration scripts.
-
-    Args:
-        db_path: Path to the SQLite database (uses default if None)
-        date_range: Optional tuple of (start_date, end_date) to filter data
-        trusts: Optional list of trust names to filter
-        drugs: Optional list of drug names to filter
-        directories: Optional list of directories to filter
-    """
-
-    def __init__(
-        self,
-        db_path: Optional[Path | str] = None,
-        date_range: Optional[tuple[date, date]] = None,
-        trusts: Optional[list[str]] = None,
-        drugs: Optional[list[str]] = None,
-        directories: Optional[list[str]] = None,
-    ):
-        from data_processing.database import default_db_config
-
-        self.db_path = Path(db_path) if db_path else Path(default_db_config.db_path)
-        self.date_range = date_range
-        self.trusts = trusts
-        self.drugs = drugs
-        self.directories = directories
-
-    def validate_source(self) -> tuple[bool, str]:
-        """Check if the database exists and has the fact_interventions table."""
-        if not self.db_path.exists():
-            return False, f"Database not found: {self.db_path}"
-
-        # Check if fact_interventions table exists
-        from data_processing.database import DatabaseManager, DatabaseConfig
-
-        config = DatabaseConfig(db_path=self.db_path)
-        manager = DatabaseManager(config)
-
-        if not manager.table_exists("fact_interventions"):
-            return False, "fact_interventions table not found in database"
-
-        count = manager.get_table_count("fact_interventions")
-        if count == 0:
-            return False, "fact_interventions table is empty"
-
-        return True, f"OK ({count:,} rows available)"
-
-    @property
-    def source_description(self) -> str:
-        return f"sqlite:{self.db_path}"
-
-    def load(self) -> LoadResult:
-        """Load data from SQLite fact_interventions table.
-
-        Maps SQLite column names to the expected DataFrame column names.
-        Applies optional filters for date range, trusts, drugs, directories.
-        """
-        import time
-        from data_processing.database import DatabaseManager, DatabaseConfig
-
-        start_time = time.time()
-
-        # Validate source
-        is_valid, msg = self.validate_source()
-        if not is_valid:
-            raise FileNotFoundError(msg)
-
-        logger.info(f"Loading data from SQLite: {self.db_path}")
-
-        # Build query with optional filters
-        query = """
-            SELECT
-                upid AS "UPID",
-                provider_code AS "Provider Code",
-                person_key AS "PersonKey",
-                drug_name_std AS "Drug Name",
-                intervention_date AS "Intervention Date",
-                price_actual AS "Price Actual",
-                org_name AS "OrganisationName",
-                directory AS "Directory",
-                treatment_function_code AS "Treatment Function Code",
-                additional_detail_1 AS "Additional Detail 1",
-                additional_detail_2 AS "Additional Detail 2",
-                additional_detail_3 AS "Additional Detail 3",
-                additional_detail_4 AS "Additional Detail 4",
-                additional_detail_5 AS "Additional Detail 5"
-            FROM fact_interventions
-            WHERE 1=1
-        """
-        params = []
-
-        if self.date_range:
-            start, end = self.date_range
-            query += " AND intervention_date >= ? AND intervention_date < ?"
-            params.extend([str(start), str(end)])
-
-        if self.trusts:
-            placeholders = ','.join('?' * len(self.trusts))
-            query += f" AND org_name IN ({placeholders})"
-            params.extend(self.trusts)
-
-        if self.drugs:
-            placeholders = ','.join('?' * len(self.drugs))
-            query += f" AND drug_name_std IN ({placeholders})"
-            params.extend(self.drugs)
-
-        if self.directories:
-            placeholders = ','.join('?' * len(self.directories))
-            query += f" AND directory IN ({placeholders})"
-            params.extend(self.directories)
-
-        # Execute query
-        config = DatabaseConfig(db_path=self.db_path)
-        manager = DatabaseManager(config)
-
-        with manager.get_connection() as conn:
-            df = pd.read_sql_query(query, conn, params=params)
-
-        # Convert intervention_date to datetime
-        df['Intervention Date'] = pd.to_datetime(df['Intervention Date'])
-
-        logger.info(f"Loaded {len(df)} rows from SQLite")
-
-        # Validate result
-        is_valid, missing = self.validate_dataframe(df)
-        if not is_valid:
-            raise ValueError(f"SQLite data missing required columns: {missing}")
-
-        load_time = time.time() - start_time
-        logger.info(f"SQLite data loading complete. {len(df)} rows in {load_time:.2f}s")
-
-        return LoadResult(
-            df=df,
-            source=self.source_description,
-            row_count=len(df),
-            load_time_seconds=load_time,
-        )
-
-
 def get_loader(
    source: str | Path,
    paths: Optional[PathConfig] = None,
@@ -376,7 +231,7 @@ def get_loader(
    """Factory function to create the appropriate DataLoader.

    Args:
-        source: Either a file path (CSV/Parquet) or "sqlite" for database
+        source: File path (CSV/Parquet)
        paths: PathConfig for reference data (used by FileDataLoader)
        **kwargs: Additional arguments passed to the loader constructor

@@ -386,14 +241,6 @@ def get_loader(
    Examples:
        >>> loader = get_loader("data/activity.csv")
        >>> loader = get_loader("data/activity.parquet")
-        >>> loader = get_loader("sqlite")
-        >>> loader = get_loader("sqlite", date_range=(date(2024, 1, 1), date(2024, 12, 31)))
    """
-    source_str = str(source).lower()
-
-    if source_str == "sqlite":
-        return SQLiteDataLoader(**kwargs)
-
-    # Assume it's a file path
    path = Path(source)
    return FileDataLoader(file_path=path, paths=paths)
@@ -35,6 +35,7 @@ from data_processing.schema import (
    verify_all_tables_exist,
    get_all_table_counts,
    migrate_pathway_nodes_chart_type,
+    migrate_refresh_log_source_row_count,
 )
 from data_processing.reference_data import (
    MigrationResult,
@@ -49,12 +50,6 @@ from data_processing.reference_data import (
    verify_drug_directory_map_migration,
    verify_drug_indication_clusters_migration,
 )
-from data_processing.patient_data import (
-    load_patient_data,
-    refresh_patient_treatment_summary,
-    get_patient_data_stats,
-    verify_mv_consistency,
-)

 logger = get_logger(__name__)

@@ -67,9 +62,8 @@ def initialize_database(
    """
    Initialize the database with all required tables.

-    Creates all tables defined in the schema (reference tables, fact tables,
-    materialized views, and file tracking tables). Uses IF NOT EXISTS so
-    safe to run multiple times.
+    Creates all tables defined in the schema (reference tables and pathway
+    tables). Uses IF NOT EXISTS so safe to run multiple times.

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.
@@ -122,6 +116,14 @@ def initialize_database(
            else:
                logger.error(f"pathway_nodes migration failed: {msg}")
                return False
+
+            # Add source_row_count column to pathway_refresh_log if it doesn't exist
+            success, msg = migrate_refresh_log_source_row_count(conn)
+            if success:
+                logger.info(f"pathway_refresh_log migration: {msg}")
+            else:
+                logger.error(f"pathway_refresh_log migration failed: {msg}")
+                return False
    except Exception as e:
        logger.error(f"Migration failed: {e}")
        return False
@@ -274,107 +276,6 @@ def create_progress_reporter(description: str = "Loading", width: int = 40):
    return report_progress


-def load_patient_data_cli(
-    file_path: Path,
-    db_manager: Optional[DatabaseManager] = None,
-    paths: Optional[PathConfig] = None,
-    force: bool = False,
-    refresh_mv: bool = True
-) -> bool:
-    """
-    Load patient data from file with CLI progress reporting.
-
-    Args:
-        file_path: Path to CSV or Parquet file.
-        db_manager: DatabaseManager instance. Uses default if not provided.
-        paths: PathConfig for reference data. Uses default if not provided.
-        force: If True, re-process even if file hash matches.
-        refresh_mv: If True, refresh the materialized view after loading.
-
-    Returns:
-        True if loading succeeded, False otherwise.
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-    if paths is None:
-        paths = default_paths
-
-    print(f"\n=== Loading Patient Data ===\n")
-    print(f"File: {file_path}")
-
-    # Check file exists
-    if not file_path.exists():
-        print(f"ERROR: File not found: {file_path}")
-        return False
-
-    # Calculate and display file info
-    file_size_mb = file_path.stat().st_size / (1024 * 1024)
-    print(f"Size: {file_size_mb:.1f} MB")
-    print()
-
-    # Create progress callback
-    progress_callback = create_progress_reporter("Loading rows", width=40)
-
-    # Load the data
-    result = load_patient_data(
-        file_path=file_path,
-        db_manager=db_manager,
-        paths=paths,
-        batch_size=5000,
-        force=force,
-        progress_callback=progress_callback
-    )
-
-    # Print result
-    print()
-    if result.was_already_processed:
-        print("File already processed (same hash). Skipping.")
-        print(f"Use --force to re-process.")
-    elif result.success:
-        print(f"Loaded {result.rows_inserted:,} rows in {result.load_time_seconds:.1f}s")
-        if result.rows_skipped > 0:
-            print(f"Skipped {result.rows_skipped:,} rows (missing UPID or date)")
-    else:
-        print(f"FAILED: {result.error_message}")
-        return False
-
-    # Refresh materialized view if requested
-    if refresh_mv and result.success and not result.was_already_processed:
-        print()
-        print("Refreshing materialized view...")
-        mv_progress = create_progress_reporter("Processing patients", width=40)
-        mv_result = refresh_patient_treatment_summary(
-            db_manager=db_manager,
-            progress_callback=mv_progress
-        )
-
-        if mv_result.success:
-            print(f"MV refreshed: {mv_result.patients_processed:,} patients in {mv_result.refresh_time_seconds:.1f}s")
-
-            # Verify consistency
-            consistent, msg = verify_mv_consistency(db_manager)
-            if consistent:
-                print(f"MV verification: OK")
-            else:
-                print(f"MV verification: FAILED - {msg}")
-        else:
-            print(f"MV refresh FAILED: {mv_result.error_message}")
-
-    # Print summary statistics
-    print()
-    print("=== Patient Data Summary ===")
-    stats = get_patient_data_stats(db_manager)
-    print(f"  Total rows: {stats['total_rows']:,}")
-    print(f"  Unique patients: {stats['unique_patients']:,}")
-    print(f"  Unique drugs: {stats['unique_drugs']:,}")
-    print(f"  Unique organizations: {stats['unique_organizations']:,}")
-    if stats['date_range'][0] and stats['date_range'][1]:
-        print(f"  Date range: {stats['date_range'][0]} to {stats['date_range'][1]}")
-    print()
-
-    return result.success
-
-
 def get_database_status(db_manager: Optional[DatabaseManager] = None) -> dict:
    """
    Get the current status of the database.
@@ -452,8 +353,6 @@ Examples:
  python -m data_processing.migrate --drop-existing  # Reset database
  python -m data_processing.migrate --reference-data  # Migrate reference data
  python -m data_processing.migrate --reference-data --verify  # With verification
-  python -m data_processing.migrate --load-patient-data data.parquet  # Load patient data
-  python -m data_processing.migrate --load-patient-data data.csv --force  # Force reload
  python -m data_processing.migrate --db-path ./data/test.db  # Custom path
        """
    )
@@ -493,23 +392,6 @@ Examples:
        action="store_true",
        help="Enable verbose logging"
    )
-    parser.add_argument(
-        "--load-patient-data",
-        type=Path,
-        metavar="FILE",
-        help="Load patient data from CSV or Parquet file with progress reporting"
-    )
-    parser.add_argument(
-        "--force",
-        action="store_true",
-        help="Force re-processing even if file hash matches (use with --load-patient-data)"
-    )
-    parser.add_argument(
-        "--no-refresh-mv",
-        action="store_true",
-        help="Skip materialized view refresh after loading (use with --load-patient-data)"
-    )
-
    args = parser.parse_args()

    # Set up logging
@@ -562,32 +444,6 @@ Examples:
            print("Reference data migration completed with errors. Check logs for details.")
            return 1

-    # Handle --load-patient-data (load patient data from CSV/Parquet)
-    if args.load_patient_data:
-        # Ensure database exists with tables first
-        if not db_manager.exists:
-            print("Database does not exist. Initializing schema first...")
-            success = initialize_database(db_manager=db_manager)
-            if not success:
-                print("\nDatabase initialization failed. Check logs for details.")
-                return 1
-
-        # Load patient data with progress reporting
-        success = load_patient_data_cli(
-            file_path=args.load_patient_data,
-            db_manager=db_manager,
-            paths=default_paths,
-            force=args.force,
-            refresh_mv=not args.no_refresh_mv
-        )
-
-        if success:
-            print("Patient data load completed successfully.")
-            return 0
-        else:
-            print("Patient data load failed. Check logs for details.")
-            return 1
-
    # Run schema migration (default behavior)
    success = initialize_database(
        db_manager=db_manager,
@@ -1,890 +0,0 @@
-"""
-Patient data migration functions for NHS High-Cost Drug Patient Pathway Analysis Tool.
-
-Provides functions to load patient intervention data from CSV/Parquet files
-into the SQLite fact_interventions table. Supports:
- Batch processing for large files
- File hash tracking for incremental updates
- Progress reporting during loading
-"""
-
-import hashlib
-import os
-import sqlite3
-import time
-from dataclasses import dataclass
-from datetime import datetime
-from pathlib import Path
-from typing import Callable, Optional
-
-import pandas as pd
-
-from core import PathConfig, default_paths
-from core.logging_config import get_logger
-from data_processing.database import DatabaseManager
-
-logger = get_logger(__name__)
-
-
-@dataclass
-class PatientDataLoadResult:
-    """Results from a patient data load operation."""
-    file_path: str
-    file_hash: str
-    rows_read: int
-    rows_inserted: int
-    rows_skipped: int
-    success: bool
-    error_message: Optional[str] = None
-    load_time_seconds: float = 0.0
-    was_already_processed: bool = False
-
-    def __str__(self) -> str:
-        if self.was_already_processed:
-            return f"{self.file_path}: Already processed (same hash)"
-        elif self.success:
-            return (
-                f"{self.file_path}: Loaded {self.rows_inserted:,} rows "
-                f"in {self.load_time_seconds:.1f}s"
-            )
-        else:
-            return f"{self.file_path}: FAILED - {self.error_message}"
-
-
-def calculate_file_hash(file_path: Path) -> str:
-    """
-    Calculate SHA256 hash of a file.
-
-    Uses chunked reading to handle large files efficiently.
-
-    Args:
-        file_path: Path to the file.
-
-    Returns:
-        Hex string of SHA256 hash.
-    """
-    sha256_hash = hashlib.sha256()
-    with open(file_path, "rb") as f:
-        for chunk in iter(lambda: f.read(8192), b""):
-            sha256_hash.update(chunk)
-    return sha256_hash.hexdigest()
-
-
-def check_file_processed(
-    conn: sqlite3.Connection,
-    file_path: str,
-    file_hash: str
-) -> tuple[bool, Optional[str]]:
-    """
-    Check if a file has already been processed with the same hash.
-
-    Args:
-        conn: Database connection.
-        file_path: Full path to the file.
-        file_hash: SHA256 hash of the file.
-
-    Returns:
-        Tuple of (is_processed, old_hash).
-        - If is_processed is True and old_hash == file_hash, file is unchanged.
-        - If is_processed is True and old_hash != file_hash, file has changed.
-        - If is_processed is False, file is new.
-    """
-    cursor = conn.execute(
-        "SELECT file_hash, status FROM processed_files WHERE file_path = ?",
-        (file_path,)
-    )
-    result = cursor.fetchone()
-
-    if result is None:
-        return False, None
-
-    old_hash = result["file_hash"]
-    status = result["status"]
-
-    # Only consider it processed if status is success and hash matches
-    if status == "success" and old_hash == file_hash:
-        return True, old_hash
-
-    return False, old_hash
-
-
-def record_file_processing_start(
-    conn: sqlite3.Connection,
-    file_path: str,
-    file_hash: str,
-    file_size: int,
-    file_modified: datetime
-) -> None:
-    """
-    Record that we're starting to process a file.
-
-    Args:
-        conn: Database connection.
-        file_path: Full path to the file.
-        file_hash: SHA256 hash of the file.
-        file_size: File size in bytes.
-        file_modified: File modification timestamp.
-    """
-    file_name = Path(file_path).name
-    now = datetime.now().isoformat()
-
-    conn.execute("""
-        INSERT INTO processed_files (
-            file_path, file_name, file_hash, file_size_bytes,
-            file_modified_at, status, first_processed_at, last_processed_at
-        ) VALUES (?, ?, ?, ?, ?, 'processing', ?, ?)
-        ON CONFLICT(file_path) DO UPDATE SET
-            file_hash = excluded.file_hash,
-            file_size_bytes = excluded.file_size_bytes,
-            file_modified_at = excluded.file_modified_at,
-            status = 'processing',
-            last_processed_at = excluded.last_processed_at,
-            error_message = NULL
-    """, (file_path, file_name, file_hash, file_size, file_modified.isoformat(), now, now))
-
-
-def record_file_processing_complete(
-    conn: sqlite3.Connection,
-    file_path: str,
-    row_count: int,
-    duration_seconds: float,
-    success: bool,
-    error_message: Optional[str] = None
-) -> None:
-    """
-    Record that file processing has completed.
-
-    Args:
-        conn: Database connection.
-        file_path: Full path to the file.
-        row_count: Number of rows processed.
-        duration_seconds: Time taken to process.
-        success: Whether processing was successful.
-        error_message: Error message if failed.
-    """
-    status = "success" if success else "error"
-
-    conn.execute("""
-        UPDATE processed_files
-        SET status = ?,
-            row_count = ?,
-            processing_duration_seconds = ?,
-            error_message = ?,
-            last_processed_at = ?
-        WHERE file_path = ?
-    """, (status, row_count, duration_seconds, error_message, datetime.now().isoformat(), file_path))
-
-
-def load_dataframe_to_sqlite(
-    df: pd.DataFrame,
-    conn: sqlite3.Connection,
-    source_file: str,
-    batch_size: int = 5000,
-    progress_callback: Optional[Callable[[int, int], None]] = None
-) -> int:
-    """
-    Load a processed DataFrame into fact_interventions table.
-
-    Args:
-        df: Processed DataFrame with required columns (from FileDataLoader).
-        conn: Database connection.
-        source_file: Source file path for tracking.
-        batch_size: Number of rows to insert per batch.
-        progress_callback: Optional callback(rows_inserted, total_rows) for progress updates.
-
-    Returns:
-        Number of rows inserted.
-    """
-    # Store the original drug names before processing (for rows where mapping doesn't exist)
-    # The drug_names() transformation sets Drug Name to NULL when no mapping exists.
-    # We need to preserve the original for those cases.
-
-    # Insert SQL columns - always include drug_name_raw
-    insert_columns = [
-        "upid", "provider_code", "person_key",
-        "drug_name_raw", "drug_name_std",
-        "intervention_date", "price_actual",
-        "org_name", "directory",
-        "treatment_function_code",
-        "additional_detail_1", "additional_detail_2", "additional_detail_3",
-        "additional_detail_4", "additional_detail_5",
-        "source_file"
-    ]
-    placeholders = ",".join(["?"] * len(insert_columns))
-    insert_sql = f"""
-        INSERT INTO fact_interventions ({",".join(insert_columns)})
-        VALUES ({placeholders})
-    """
-
-    rows_inserted = 0
-    rows_skipped = 0
-    total_rows = len(df)
-
-    # Process in batches
-    for batch_start in range(0, total_rows, batch_size):
-        batch_end = min(batch_start + batch_size, total_rows)
-        batch_df = df.iloc[batch_start:batch_end]
-
-        # Prepare batch data
-        batch_data = []
-        for _, row in batch_df.iterrows():
-            # Skip rows missing required fields
-            if pd.isna(row.get("UPID")) or pd.isna(row.get("Intervention Date")):
-                rows_skipped += 1
-                continue
-            # Get drug names - raw and standardized
-            drug_name_raw = row.get("Drug Name Raw") if "Drug Name Raw" in df.columns else None
-            drug_name_std = row.get("Drug Name")
-
-            # If drug_name_std is NULL, use the raw drug name (uppercase)
-            # This handles cases where the drug isn't in the drugnames.csv mapping
-            if pd.isna(drug_name_std):
-                if drug_name_raw is not None and not pd.isna(drug_name_raw):
-                    drug_name_std = str(drug_name_raw).upper().strip()
-                else:
-                    drug_name_std = "UNKNOWN"
-
-            # Also clean up raw drug name for storage
-            if drug_name_raw is not None and not pd.isna(drug_name_raw):
-                drug_name_raw = str(drug_name_raw).strip()
-
-            # Get other values with null handling
-            def get_value(col_name):
-                if col_name not in df.columns:
-                    return None
-                val = row[col_name]
-                if pd.isna(val):
-                    return None
-                elif hasattr(val, "strftime"):
-                    return val.strftime("%Y-%m-%d")
-                return val
-
-            row_data = (
-                get_value("UPID"),
-                get_value("Provider Code"),
-                get_value("PersonKey"),
-                drug_name_raw,
-                drug_name_std,
-                get_value("Intervention Date"),
-                get_value("Price Actual") or 0,
-                get_value("OrganisationName"),
-                get_value("Directory"),
-                get_value("Treatment Function Code"),
-                get_value("Additional Detail 1"),
-                get_value("Additional Detail 2"),
-                get_value("Additional Detail 3"),
-                get_value("Additional Detail 4"),
-                get_value("Additional Detail 5"),
-                source_file
-            )
-            batch_data.append(row_data)
-
-        # Execute batch insert
-        conn.executemany(insert_sql, batch_data)
-        rows_inserted += len(batch_data)
-
-        # Report progress
-        if progress_callback:
-            progress_callback(rows_inserted, total_rows)
-
-    if rows_skipped > 0:
-        logger.info(f"Skipped {rows_skipped:,} rows with missing UPID or Intervention Date")
-
-    return rows_inserted
-
-
-def delete_file_data(conn: sqlite3.Connection, source_file: str) -> int:
-    """
-    Delete all data from a specific source file.
-
-    Used when re-processing a changed file.
-
-    Args:
-        conn: Database connection.
-        source_file: Source file path.
-
-    Returns:
-        Number of rows deleted.
-    """
-    cursor = conn.execute(
-        "DELETE FROM fact_interventions WHERE source_file = ?",
-        (source_file,)
-    )
-    return cursor.rowcount
-
-
-def load_patient_data(
-    file_path: Path | str,
-    db_manager: Optional[DatabaseManager] = None,
-    paths: Optional[PathConfig] = None,
-    batch_size: int = 5000,
-    force: bool = False,
-    progress_callback: Optional[Callable[[int, int], None]] = None
-) -> PatientDataLoadResult:
-    """
-    Load patient data from CSV/Parquet file into fact_interventions table.
-
-    This is the main entry point for loading patient data. It:
-    1. Calculates file hash to detect changes
-    2. Checks if file was already processed (skip if unchanged)
-    3. Loads and transforms data using FileDataLoader
-    4. Inserts data into SQLite in batches
-    5. Records processing status in processed_files table
-
-    Args:
-        file_path: Path to CSV or Parquet file.
-        db_manager: DatabaseManager instance. Uses default if not provided.
-        paths: PathConfig for reference data. Uses default if not provided.
-        batch_size: Number of rows to insert per batch (default: 5000).
-        force: If True, re-process even if file hash matches.
-        progress_callback: Optional callback(rows_inserted, total_rows) for progress.
-
-    Returns:
-        PatientDataLoadResult with loading statistics.
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-    if paths is None:
-        paths = default_paths
-
-    file_path = Path(file_path)
-    file_path_str = str(file_path.absolute())
-
-    logger.info(f"Starting patient data load from {file_path}")
-    start_time = time.time()
-
-    # Check file exists
-    if not file_path.exists():
-        error_msg = f"File not found: {file_path}"
-        logger.error(error_msg)
-        return PatientDataLoadResult(
-            file_path=file_path_str,
-            file_hash="",
-            rows_read=0,
-            rows_inserted=0,
-            rows_skipped=0,
-            success=False,
-            error_message=error_msg
-        )
-
-    # Calculate file hash
-    logger.info("Calculating file hash...")
-    file_hash = calculate_file_hash(file_path)
-    file_size = file_path.stat().st_size
-    file_modified = datetime.fromtimestamp(file_path.stat().st_mtime)
-
-    logger.info(f"File hash: {file_hash[:16]}... Size: {file_size:,} bytes")
-
-    # Check if already processed
-    if not force:
-        with db_manager.get_connection() as conn:
-            is_processed, old_hash = check_file_processed(conn, file_path_str, file_hash)
-            if is_processed:
-                logger.info(f"File already processed with same hash, skipping")
-                return PatientDataLoadResult(
-                    file_path=file_path_str,
-                    file_hash=file_hash,
-                    rows_read=0,
-                    rows_inserted=0,
-                    rows_skipped=0,
-                    success=True,
-                    was_already_processed=True
-                )
-            elif old_hash is not None:
-                logger.info(f"File hash changed, will re-process (old: {old_hash[:16]}...)")
-
-    try:
-        # Use FileDataLoader to load and transform data
-        from data_processing.loader import FileDataLoader
-
-        loader = FileDataLoader(file_path, paths)
-        logger.info("Loading and transforming data...")
-        result = loader.load()
-        df = result.df
-        rows_read = result.row_count
-
-        logger.info(f"Loaded {rows_read:,} rows, starting SQLite insert...")
-
-        # Load into SQLite
-        with db_manager.get_transaction() as conn:
-            # Record that we're starting
-            record_file_processing_start(conn, file_path_str, file_hash, file_size, file_modified)
-
-            # Delete any existing data from this file (for re-processing)
-            deleted = delete_file_data(conn, file_path_str)
-            if deleted > 0:
-                logger.info(f"Deleted {deleted:,} existing rows from previous load")
-
-            # Insert new data
-            rows_inserted = load_dataframe_to_sqlite(
-                df, conn, file_path_str, batch_size, progress_callback
-            )
-
-            # Record success
-            load_time = time.time() - start_time
-            record_file_processing_complete(
-                conn, file_path_str, rows_inserted, load_time, True
-            )
-
-        logger.info(f"Successfully loaded {rows_inserted:,} rows in {load_time:.1f}s")
-
-        return PatientDataLoadResult(
-            file_path=file_path_str,
-            file_hash=file_hash,
-            rows_read=rows_read,
-            rows_inserted=rows_inserted,
-            rows_skipped=rows_read - rows_inserted,
-            success=True,
-            load_time_seconds=load_time
-        )
-
-    except Exception as e:
-        load_time = time.time() - start_time
-        error_msg = str(e)
-        logger.error(f"Failed to load patient data: {error_msg}")
-
-        # Record failure
-        try:
-            with db_manager.get_connection() as conn:
-                record_file_processing_complete(
-                    conn, file_path_str, 0, load_time, False, error_msg
-                )
-        except Exception:
-            pass  # Don't fail on failure to record failure
-
-        return PatientDataLoadResult(
-            file_path=file_path_str,
-            file_hash=file_hash if 'file_hash' in dir() else "",
-            rows_read=0,
-            rows_inserted=0,
-            rows_skipped=0,
-            success=False,
-            error_message=error_msg,
-            load_time_seconds=load_time
-        )
-
-
-def get_patient_data_stats(db_manager: Optional[DatabaseManager] = None) -> dict:
-    """
-    Get statistics about patient data in fact_interventions.
-
-    Returns:
-        Dictionary with statistics about the loaded data.
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-
-    stats = {}
-
-    with db_manager.get_connection() as conn:
-        # Total rows
-        cursor = conn.execute("SELECT COUNT(*) FROM fact_interventions")
-        stats["total_rows"] = cursor.fetchone()[0]
-
-        # Unique patients
-        cursor = conn.execute("SELECT COUNT(DISTINCT upid) FROM fact_interventions")
-        stats["unique_patients"] = cursor.fetchone()[0]
-
-        # Unique drugs
-        cursor = conn.execute("SELECT COUNT(DISTINCT drug_name_std) FROM fact_interventions")
-        stats["unique_drugs"] = cursor.fetchone()[0]
-
-        # Unique organizations
-        cursor = conn.execute("SELECT COUNT(DISTINCT org_name) FROM fact_interventions")
-        stats["unique_organizations"] = cursor.fetchone()[0]
-
-        # Date range
-        cursor = conn.execute("""
-            SELECT MIN(intervention_date), MAX(intervention_date)
-            FROM fact_interventions
-        """)
-        result = cursor.fetchone()
-        stats["date_range"] = (result[0], result[1]) if result else (None, None)
-
-        # Processed files
-        cursor = conn.execute("""
-            SELECT COUNT(*), SUM(row_count)
-            FROM processed_files WHERE status = 'success'
-        """)
-        result = cursor.fetchone()
-        stats["processed_files"] = result[0] if result else 0
-        stats["processed_rows"] = result[1] if result and result[1] else 0
-
-    return stats
-
-
-def list_processed_files(db_manager: Optional[DatabaseManager] = None) -> list[dict]:
-    """
-    List all processed files and their status.
-
-    Returns:
-        List of dictionaries with file processing information.
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-
-    files = []
-
-    with db_manager.get_connection() as conn:
-        cursor = conn.execute("""
-            SELECT file_path, file_name, file_hash, file_size_bytes,
-                   row_count, status, error_message,
-                   first_processed_at, last_processed_at, processing_duration_seconds
-            FROM processed_files
-            ORDER BY last_processed_at DESC
-        """)
-
-        for row in cursor.fetchall():
-            files.append({
-                "file_path": row["file_path"],
-                "file_name": row["file_name"],
-                "file_hash": row["file_hash"],
-                "file_size_bytes": row["file_size_bytes"],
-                "row_count": row["row_count"],
-                "status": row["status"],
-                "error_message": row["error_message"],
-                "first_processed_at": row["first_processed_at"],
-                "last_processed_at": row["last_processed_at"],
-                "processing_duration_seconds": row["processing_duration_seconds"],
-            })
-
-    return files
-
-
-# =============================================================================
-# Materialized View Refresh Functions
-# =============================================================================
-
-@dataclass
-class MVRefreshResult:
-    """Results from refreshing the patient treatment summary materialized view."""
-    patients_processed: int
-    rows_inserted: int
-    refresh_time_seconds: float
-    success: bool
-    error_message: Optional[str] = None
-
-    def __str__(self) -> str:
-        if self.success:
-            return (
-                f"Refreshed MV: {self.patients_processed:,} patients "
-                f"in {self.refresh_time_seconds:.1f}s"
-            )
-        else:
-            return f"MV refresh FAILED: {self.error_message}"
-
-
-def refresh_patient_treatment_summary(
-    db_manager: Optional[DatabaseManager] = None,
-    progress_callback: Optional[Callable[[int, int], None]] = None
-) -> MVRefreshResult:
-    """
-    Refresh the mv_patient_treatment_summary materialized view.
-
-    This computes per-patient aggregations from fact_interventions:
-    - First/last seen dates
-    - Total cost, average cost per intervention
-    - Intervention count, unique drug count
-    - Drug sequence (chronological, pipe-separated)
-    - Drug counts, costs, and date ranges (as JSON)
-
-    The MV is fully rebuilt (truncate and re-insert) for simplicity.
-    This typically takes 30-60 seconds for ~35,000 patients.
-
-    Args:
-        db_manager: DatabaseManager instance. Uses default if not provided.
-        progress_callback: Optional callback(patients_done, total_patients).
-
-    Returns:
-        MVRefreshResult with refresh statistics.
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-
-    logger.info("Starting materialized view refresh...")
-    start_time = time.time()
-
-    try:
-        with db_manager.get_transaction() as conn:
-            # Step 1: Get total patient count for progress reporting
-            cursor = conn.execute("SELECT COUNT(DISTINCT upid) FROM fact_interventions")
-            total_patients = cursor.fetchone()[0]
-            logger.info(f"Processing {total_patients:,} unique patients")
-
-            if total_patients == 0:
-                logger.warning("No patient data in fact_interventions, MV will be empty")
-                return MVRefreshResult(
-                    patients_processed=0,
-                    rows_inserted=0,
-                    refresh_time_seconds=time.time() - start_time,
-                    success=True
-                )
-
-            # Step 2: Clear existing MV data
-            conn.execute("DELETE FROM mv_patient_treatment_summary")
-            logger.info("Cleared existing MV data")
-
-            # Step 3: Compute aggregations using SQL CTEs
-            # This is more efficient than processing row-by-row in Python
-            refresh_sql = """
-            WITH patient_aggs AS (
-                -- Basic aggregations per patient
-                SELECT
-                    upid,
-                    MIN(org_name) as org_name,
-                    MIN(directory) as directory,
-                    MIN(intervention_date) as first_seen_date,
-                    MAX(intervention_date) as last_seen_date,
-                    JULIANDAY(MAX(intervention_date)) - JULIANDAY(MIN(intervention_date)) as days_treated,
-                    SUM(price_actual) as total_cost,
-                    AVG(price_actual) as avg_cost_per_intervention,
-                    COUNT(*) as intervention_count,
-                    COUNT(DISTINCT drug_name_std) as unique_drug_count,
-                    COUNT(*) as source_row_count
-                FROM fact_interventions
-                GROUP BY upid
-            ),
-            drug_sequences AS (
-                -- Drug sequence per patient (chronological order, pipe-separated)
-                SELECT
-                    upid,
-                    GROUP_CONCAT(drug_name_std, '|') as drug_sequence
-                FROM (
-                    SELECT DISTINCT
-                        upid,
-                        drug_name_std,
-                        MIN(intervention_date) as first_date
-                    FROM fact_interventions
-                    GROUP BY upid, drug_name_std
-                    ORDER BY upid, first_date
-                )
-                GROUP BY upid
-            ),
-            drug_counts AS (
-                -- JSON object of drug counts per patient
-                SELECT
-                    upid,
-                    '{' || GROUP_CONCAT('"' || drug_name_std || '": ' || cnt, ', ') || '}' as drug_counts_json
-                FROM (
-                    SELECT
-                        upid,
-                        drug_name_std,
-                        COUNT(*) as cnt
-                    FROM fact_interventions
-                    GROUP BY upid, drug_name_std
-                )
-                GROUP BY upid
-            ),
-            drug_costs AS (
-                -- JSON object of drug costs per patient
-                SELECT
-                    upid,
-                    '{' || GROUP_CONCAT('"' || drug_name_std || '": ' || ROUND(total_cost, 2), ', ') || '}' as drug_costs_json
-                FROM (
-                    SELECT
-                        upid,
-                        drug_name_std,
-                        SUM(price_actual) as total_cost
-                    FROM fact_interventions
-                    GROUP BY upid, drug_name_std
-                )
-                GROUP BY upid
-            ),
-            drug_dates AS (
-                -- JSON object of drug date ranges per patient
-                SELECT
-                    upid,
-                    '{' || GROUP_CONCAT('"' || drug_name_std || '": {"first": "' || first_date || '", "last": "' || last_date || '"}', ', ') || '}' as drug_date_ranges_json
-                FROM (
-                    SELECT
-                        upid,
-                        drug_name_std,
-                        MIN(intervention_date) as first_date,
-                        MAX(intervention_date) as last_date
-                    FROM fact_interventions
-                    GROUP BY upid, drug_name_std
-                )
-                GROUP BY upid
-            )
-            INSERT INTO mv_patient_treatment_summary (
-                upid, org_name, directory,
-                first_seen_date, last_seen_date, days_treated,
-                total_cost, avg_cost_per_intervention,
-                intervention_count, unique_drug_count,
-                drug_sequence, drug_counts_json, drug_costs_json, drug_date_ranges_json,
-                source_row_count, computed_at
-            )
-            SELECT
-                pa.upid,
-                pa.org_name,
-                pa.directory,
-                pa.first_seen_date,
-                pa.last_seen_date,
-                CAST(pa.days_treated AS INTEGER),
-                pa.total_cost,
-                pa.avg_cost_per_intervention,
-                pa.intervention_count,
-                pa.unique_drug_count,
-                ds.drug_sequence,
-                dc.drug_counts_json,
-                dco.drug_costs_json,
-                dd.drug_date_ranges_json,
-                pa.source_row_count,
-                CURRENT_TIMESTAMP
-            FROM patient_aggs pa
-            LEFT JOIN drug_sequences ds ON pa.upid = ds.upid
-            LEFT JOIN drug_counts dc ON pa.upid = dc.upid
-            LEFT JOIN drug_costs dco ON pa.upid = dco.upid
-            LEFT JOIN drug_dates dd ON pa.upid = dd.upid
-            """
-
-            logger.info("Executing MV refresh query...")
-            conn.execute(refresh_sql)
-
-            # Get actual rows inserted
-            cursor = conn.execute("SELECT COUNT(*) FROM mv_patient_treatment_summary")
-            rows_inserted = cursor.fetchone()[0]
-
-            refresh_time = time.time() - start_time
-            logger.info(f"MV refresh complete: {rows_inserted:,} rows in {refresh_time:.1f}s")
-
-            # Report progress if callback provided
-            if progress_callback:
-                progress_callback(rows_inserted, total_patients)
-
-            return MVRefreshResult(
-                patients_processed=total_patients,
-                rows_inserted=rows_inserted,
-                refresh_time_seconds=refresh_time,
-                success=True
-            )
-
-    except Exception as e:
-        refresh_time = time.time() - start_time
-        error_msg = str(e)
-        logger.error(f"MV refresh failed: {error_msg}")
-        return MVRefreshResult(
-            patients_processed=0,
-            rows_inserted=0,
-            refresh_time_seconds=refresh_time,
-            success=False,
-            error_message=error_msg
-        )
-
-
-def get_patient_summary_stats(db_manager: Optional[DatabaseManager] = None) -> dict:
-    """
-    Get statistics about the patient treatment summary MV.
-
-    Returns:
-        Dictionary with MV statistics.
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-
-    stats = {}
-
-    with db_manager.get_connection() as conn:
-        # Total rows
-        cursor = conn.execute("SELECT COUNT(*) FROM mv_patient_treatment_summary")
-        stats["total_patients"] = cursor.fetchone()[0]
-
-        if stats["total_patients"] == 0:
-            return stats
-
-        # Aggregated statistics
-        cursor = conn.execute("""
-            SELECT
-                SUM(total_cost) as total_cost_all,
-                AVG(total_cost) as avg_cost_per_patient,
-                SUM(intervention_count) as total_interventions,
-                AVG(intervention_count) as avg_interventions_per_patient,
-                AVG(unique_drug_count) as avg_drugs_per_patient,
-                AVG(days_treated) as avg_days_treated,
-                MIN(first_seen_date) as earliest_date,
-                MAX(last_seen_date) as latest_date,
-                MAX(computed_at) as last_refresh
-            FROM mv_patient_treatment_summary
-        """)
-        result = cursor.fetchone()
-
-        stats["total_cost"] = result[0] if result[0] else 0
-        stats["avg_cost_per_patient"] = result[1] if result[1] else 0
-        stats["total_interventions"] = result[2] if result[2] else 0
-        stats["avg_interventions_per_patient"] = result[3] if result[3] else 0
-        stats["avg_drugs_per_patient"] = result[4] if result[4] else 0
-        stats["avg_days_treated"] = result[5] if result[5] else 0
-        stats["date_range"] = (result[6], result[7])
-        stats["last_refresh"] = result[8]
-
-        # Unique directories in MV
-        cursor = conn.execute("SELECT COUNT(DISTINCT directory) FROM mv_patient_treatment_summary")
-        stats["unique_directories"] = cursor.fetchone()[0]
-
-        # Unique organizations in MV
-        cursor = conn.execute("SELECT COUNT(DISTINCT org_name) FROM mv_patient_treatment_summary")
-        stats["unique_organizations"] = cursor.fetchone()[0]
-
-    return stats
-
-
-def verify_mv_consistency(db_manager: Optional[DatabaseManager] = None) -> tuple[bool, str]:
-    """
-    Verify that the MV is consistent with fact_interventions.
-
-    Checks that:
-    - Patient counts match
-    - Total cost sums match
-    - Intervention counts match
-
-    Returns:
-        Tuple of (is_consistent, message).
-    """
-    if db_manager is None:
-        db_manager = DatabaseManager()
-
-    with db_manager.get_connection() as conn:
-        # Get fact table counts
-        cursor = conn.execute("""
-            SELECT
-                COUNT(DISTINCT upid) as patients,
-                SUM(price_actual) as total_cost,
-                COUNT(*) as interventions
-            FROM fact_interventions
-        """)
-        fact_row = cursor.fetchone()
-        fact_patients = fact_row[0] or 0
-        fact_cost = fact_row[1] or 0
-        fact_interventions = fact_row[2] or 0
-
-        # Get MV counts
-        cursor = conn.execute("""
-            SELECT
-                COUNT(*) as patients,
-                SUM(total_cost) as total_cost,
-                SUM(intervention_count) as interventions
-            FROM mv_patient_treatment_summary
-        """)
-        mv_row = cursor.fetchone()
-        mv_patients = mv_row[0] or 0
-        mv_cost = mv_row[1] or 0
-        mv_interventions = mv_row[2] or 0
-
-        # Compare
-        issues = []
-
-        if fact_patients != mv_patients:
-            issues.append(f"Patient count mismatch: fact={fact_patients:,}, mv={mv_patients:,}")
-
-        if mv_interventions != fact_interventions:
-            issues.append(f"Intervention count mismatch: fact={fact_interventions:,}, mv={mv_interventions:,}")
-
-        # Allow small floating point differences in cost
-        cost_diff = abs(fact_cost - mv_cost)
-        if cost_diff > 0.01:
-            issues.append(f"Cost mismatch: fact={fact_cost:,.2f}, mv={mv_cost:,.2f}, diff={cost_diff:.2f}")
-
-        if issues:
-            return False, "; ".join(issues)
-
-        return True, f"MV consistent: {mv_patients:,} patients, {mv_interventions:,} interventions, £{mv_cost:,.2f} total"
@@ -115,43 +115,6 @@ CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_cluster ON ref_drug_
 CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_indication ON ref_drug_indication_clusters(indication);
 """

-REF_DRUG_SNOMED_MAPPING_SCHEMA = """
-- Direct SNOMED code mapping from drug to indication to GP diagnosis codes
-- Source: data/drug_snomed_mapping_enriched.csv (163K rows)
-- Used for direct GP record matching to assign diagnosis-based directorates
-- and to support indication-based pathway hierarchy (Trust → Search_Term → Drug → Pathway)
-CREATE TABLE IF NOT EXISTS ref_drug_snomed_mapping (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-    drug_name TEXT NOT NULL,                -- Original drug name from mapping
-    indication TEXT NOT NULL,               -- Specific indication (603 unique values)
-    ta_id TEXT,                             -- NICE TA reference (e.g., TA568)
-    search_term TEXT NOT NULL,              -- Simplified grouping (187 unique values)
-    snomed_code TEXT NOT NULL,              -- SNOMED CT code for GP record matching
-    snomed_description TEXT,                -- SNOMED code description
-    cleaned_drug_name TEXT NOT NULL,        -- Standardized drug name for matching
-    primary_directorate TEXT,               -- Primary directorate for this indication
-    all_directorates TEXT,                  -- Pipe-separated list of valid directorates
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    UNIQUE(drug_name, indication, snomed_code)
-);
-
-- Index for looking up SNOMED codes by drug name (most common access pattern)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug ON ref_drug_snomed_mapping(drug_name);
-
-- Index for looking up by cleaned drug name (standardized matching)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_cleaned ON ref_drug_snomed_mapping(cleaned_drug_name);
-
-- Index for looking up by SNOMED code (reverse lookup from GP record)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_snomed ON ref_drug_snomed_mapping(snomed_code);
-
-- Index for grouping by search_term (indication-based hierarchy)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_search_term ON ref_drug_snomed_mapping(search_term);
-
-- Composite index for drug + snomed code (common lookup pattern)
-CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug_snomed
-    ON ref_drug_snomed_mapping(cleaned_drug_name, snomed_code);
-"""
-

 # =============================================================================
 # Pathway Data Architecture Schemas
@@ -278,6 +241,7 @@ CREATE TABLE IF NOT EXISTS pathway_refresh_log (
    snowflake_query_date_from TEXT,         -- Start date of Snowflake query
    snowflake_query_date_to TEXT,           -- End date of Snowflake query
    processing_duration_seconds REAL,       -- How long the refresh took
+    source_row_count INTEGER,               -- Number of Snowflake rows fetched
    created_at TEXT DEFAULT CURRENT_TIMESTAMP
 );

@@ -301,208 +265,6 @@ PATHWAY_TABLES_SCHEMA = f"""
 """


-# =============================================================================
-# Fact Table Schemas
-# =============================================================================
-
-FACT_INTERVENTIONS_SCHEMA = """
-- Patient intervention records (fact table)
-- Source: HCD activity data (CSV/Parquet files or Snowflake)
-- This is the main fact table storing all patient intervention events
-CREATE TABLE IF NOT EXISTS fact_interventions (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-
-    -- Patient identification
-    upid TEXT NOT NULL,                     -- Unique Patient ID (Provider Code[:3] + PersonKey)
-    provider_code TEXT NOT NULL,            -- Original provider code (3-5 chars)
-    person_key TEXT NOT NULL,               -- Patient key from source system
-
-    -- Intervention details
-    drug_name_raw TEXT,                     -- Original drug name from source
-    drug_name_std TEXT NOT NULL,            -- Standardized drug name (via ref_drug_names)
-    intervention_date DATE NOT NULL,        -- Date of intervention
-    price_actual REAL NOT NULL DEFAULT 0,   -- Cost of intervention in GBP
-
-    -- Organization and directory
-    org_name TEXT,                          -- Organization name (cleaned, no commas)
-    directory TEXT,                         -- Medical directory/specialty (may be "Undefined")
-
-    -- Source tracking
-    source_file TEXT,                       -- Original file this record came from
-    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-
-    -- Additional clinical fields (optional, used in directory fallback logic)
-    treatment_function_code INTEGER,
-    additional_detail_1 TEXT,
-    additional_detail_2 TEXT,
-    additional_detail_3 TEXT,
-    additional_detail_4 TEXT,
-    additional_detail_5 TEXT
-);
-
-- Primary indexes for common filter patterns used in generate_graph()
-- UPID: Used for patient grouping, pathway analysis
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_upid ON fact_interventions(upid);
-
-- Drug name (standardized): Used for drug filtering
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_drug ON fact_interventions(drug_name_std);
-
-- Intervention date: Used for date range filtering (start_date, end_date, last_seen)
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_date ON fact_interventions(intervention_date);
-
-- Directory: Used for directory/specialty filtering
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_directory ON fact_interventions(directory);
-
-- Organization: Used for trust filtering (Provider Code maps to org_name)
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_org ON fact_interventions(org_name);
-
-- Composite index for common filter combination (trust + drug + directory)
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_composite
-    ON fact_interventions(org_name, drug_name_std, directory);
-
-- Composite index for date-based patient analysis
-CREATE INDEX IF NOT EXISTS idx_fact_interventions_upid_date
-    ON fact_interventions(upid, intervention_date);
-"""
-
-
-# =============================================================================
-# Materialized View Schemas (Cached Aggregations)
-# =============================================================================
-
-MV_PATIENT_TREATMENT_SUMMARY_SCHEMA = """
-- Materialized view of patient treatment summaries
-- Pre-computed aggregations per patient for faster pathway analysis
-- Refreshed when fact_interventions data changes
-CREATE TABLE IF NOT EXISTS mv_patient_treatment_summary (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-
-    -- Patient identification
-    upid TEXT NOT NULL UNIQUE,              -- Unique Patient ID
-
-    -- Organization and directory (for filtering)
-    org_name TEXT,                          -- Organization name (first org seen)
-    directory TEXT,                         -- Primary directory (first directory assigned)
-
-    -- Date range
-    first_seen_date DATE NOT NULL,          -- First intervention date
-    last_seen_date DATE NOT NULL,           -- Last intervention date
-    days_treated INTEGER NOT NULL DEFAULT 0, -- Duration: last_seen - first_seen
-
-    -- Cost aggregations
-    total_cost REAL NOT NULL DEFAULT 0,     -- Sum of all intervention costs
-    avg_cost_per_intervention REAL,         -- Average cost per intervention
-
-    -- Treatment summary
-    intervention_count INTEGER NOT NULL DEFAULT 0,  -- Total number of interventions
-    unique_drug_count INTEGER NOT NULL DEFAULT 0,   -- Number of distinct drugs
-
-    -- Drug sequence (pipe-separated standardized drug names in chronological order)
-    -- Example: "ADALIMUMAB|ETANERCEPT|INFLIXIMAB"
-    drug_sequence TEXT,
-
-    -- Drug frequency counts (JSON: {"ADALIMUMAB": 5, "ETANERCEPT": 3})
-    -- Stores count of each drug for this patient
-    drug_counts_json TEXT,
-
-    -- Drug cost totals (JSON: {"ADALIMUMAB": 15000.00, "ETANERCEPT": 8000.00})
-    -- Stores total cost per drug for this patient
-    drug_costs_json TEXT,
-
-    -- Per-drug date ranges (JSON: {"ADALIMUMAB": {"first": "2023-01-01", "last": "2023-06-15"}, ...})
-    -- Stores first/last date for each drug
-    drug_date_ranges_json TEXT,
-
-    -- Metadata
-    computed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    source_row_count INTEGER               -- Number of fact_interventions rows used
-);
-
-- Index for fast patient lookup
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_upid ON mv_patient_treatment_summary(upid);
-
-- Indexes for common filter patterns
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_org ON mv_patient_treatment_summary(org_name);
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_directory ON mv_patient_treatment_summary(directory);
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_first_seen ON mv_patient_treatment_summary(first_seen_date);
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_last_seen ON mv_patient_treatment_summary(last_seen_date);
-
-- Composite index for date range filtering (common in generate_graph)
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_date_range
-    ON mv_patient_treatment_summary(first_seen_date, last_seen_date);
-
-- Composite index for org + directory + dates (full filter pattern)
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_filter_composite
-    ON mv_patient_treatment_summary(org_name, directory, first_seen_date, last_seen_date);
-
-- Index for drug sequence pattern matching
-CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_drug_seq ON mv_patient_treatment_summary(drug_sequence);
-"""
-
-MATERIALIZED_VIEWS_SCHEMA = f"""
-- Materialized Views Schema
-- Pre-computed aggregations for performance
-
-{MV_PATIENT_TREATMENT_SUMMARY_SCHEMA}
-"""
-
-
-# =============================================================================
-# File Tracking Schemas (Incremental Updates)
-# =============================================================================
-
-PROCESSED_FILES_SCHEMA = """
-- Tracks processed data files for incremental updates
-- Enables detecting changed files by comparing hashes
-- Stores processing status and statistics
-CREATE TABLE IF NOT EXISTS processed_files (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-
-    -- File identification
-    file_path TEXT NOT NULL,                -- Full path to the file
-    file_name TEXT NOT NULL,                -- Just the filename (for display)
-    file_hash TEXT NOT NULL,                -- SHA256 hash of file contents
-
-    -- File metadata
-    file_size_bytes INTEGER,                -- Size of file in bytes
-    file_modified_at TIMESTAMP,             -- File's last modification timestamp
-
-    -- Processing results
-    row_count INTEGER DEFAULT 0,            -- Number of rows processed from this file
-    status TEXT NOT NULL DEFAULT 'pending', -- pending, processing, success, error
-    error_message TEXT,                     -- Error details if status='error'
-
-    -- Timestamps
-    first_processed_at TIMESTAMP,           -- When first processed
-    last_processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    processing_duration_seconds REAL,       -- How long processing took
-
-    -- Uniqueness: only one record per file path
-    -- Hash changes indicate file content changed (needs reprocessing)
-    UNIQUE(file_path)
-);
-
-- Index for fast lookup by file path
-CREATE INDEX IF NOT EXISTS idx_processed_files_path ON processed_files(file_path);
-
-- Index for finding files by status (e.g., find all pending or errored files)
-CREATE INDEX IF NOT EXISTS idx_processed_files_status ON processed_files(status);
-
-- Index for finding files by hash (detect if same file appears at different paths)
-CREATE INDEX IF NOT EXISTS idx_processed_files_hash ON processed_files(file_hash);
-
-- Index for finding recently processed files
-CREATE INDEX IF NOT EXISTS idx_processed_files_last_processed ON processed_files(last_processed_at);
-"""
-
-FILE_TRACKING_SCHEMA = f"""
-- File Tracking Schema
-- Supports incremental data loading
-
-{PROCESSED_FILES_SCHEMA}
-"""
-
-
 # =============================================================================
 # Combined Schemas
 # =============================================================================
@@ -520,29 +282,14 @@ REFERENCE_TABLES_SCHEMA = f"""
 {REF_DRUG_DIRECTORY_MAP_SCHEMA}

 {REF_DRUG_INDICATION_CLUSTERS_SCHEMA}
-
-{REF_DRUG_SNOMED_MAPPING_SCHEMA}
-"""
-
-FACT_TABLES_SCHEMA = f"""
-- Fact Tables Schema
-- Contains patient intervention data
-
-{FACT_INTERVENTIONS_SCHEMA}
 """

 ALL_TABLES_SCHEMA = f"""
 -- Complete Database Schema
-- Reference tables + Fact tables + Materialized views + File tracking + Pathway tables
+-- Reference tables + Pathway tables

 {REFERENCE_TABLES_SCHEMA}

-{FACT_TABLES_SCHEMA}
-
-{MATERIALIZED_VIEWS_SCHEMA}
-
-{FILE_TRACKING_SCHEMA}
-
 {PATHWAY_TABLES_SCHEMA}
 """

@@ -580,26 +327,10 @@ def drop_reference_tables(conn: sqlite3.Connection) -> None:
        DROP TABLE IF EXISTS ref_directories;
        DROP TABLE IF EXISTS ref_drug_directory_map;
        DROP TABLE IF EXISTS ref_drug_indication_clusters;
-        DROP TABLE IF EXISTS ref_drug_snomed_mapping;
    """)
    logger.info("Reference tables dropped")


-def create_drug_snomed_mapping_table(conn: sqlite3.Connection) -> None:
-    """
-    Create the ref_drug_snomed_mapping table for direct SNOMED code mapping.
-
-    This table stores mappings from drugs to SNOMED codes for GP record matching,
-    enabling diagnosis-based directorate assignment and indication-based pathways.
-
-    Args:
-        conn: SQLite database connection.
-    """
-    logger.info("Creating ref_drug_snomed_mapping table...")
-    conn.executescript(REF_DRUG_SNOMED_MAPPING_SCHEMA)
-    logger.info("ref_drug_snomed_mapping table created successfully")
-
-
 def get_reference_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    Get row counts for all reference tables.
@@ -616,7 +347,6 @@ def get_reference_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
        "ref_directories",
        "ref_drug_directory_map",
        "ref_drug_indication_clusters",
-        "ref_drug_snomed_mapping",
    ]
    counts = {}

@@ -647,7 +377,6 @@ def verify_reference_tables_exist(conn: sqlite3.Connection) -> list[str]:
        "ref_directories",
        "ref_drug_directory_map",
        "ref_drug_indication_clusters",
-        "ref_drug_snomed_mapping",
    ]
    missing = []

@@ -662,164 +391,6 @@ def verify_reference_tables_exist(conn: sqlite3.Connection) -> list[str]:
    return missing


-# =============================================================================
-# Fact Table Helper Functions
-# =============================================================================
-
-def create_fact_tables(conn: sqlite3.Connection) -> None:
-    """
-    Create all fact tables in the database (including materialized views).
-
-    Args:
-        conn: SQLite database connection.
-    """
-    logger.info("Creating fact tables...")
-    conn.executescript(FACT_TABLES_SCHEMA)
-    conn.executescript(MATERIALIZED_VIEWS_SCHEMA)
-    logger.info("Fact tables created successfully")
-
-
-def drop_fact_tables(conn: sqlite3.Connection) -> None:
-    """
-    Drop all fact tables from the database.
-
-    Args:
-        conn: SQLite database connection.
-
-    Warning:
-        This will delete all patient intervention data. Use with caution.
-    """
-    logger.warning("Dropping fact tables...")
-    conn.executescript("""
-        DROP TABLE IF EXISTS fact_interventions;
-        DROP TABLE IF EXISTS mv_patient_treatment_summary;
-    """)
-    logger.info("Fact tables dropped")
-
-
-def get_fact_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
-    """
-    Get row counts for all fact tables (including materialized views).
-
-    Args:
-        conn: SQLite database connection.
-
-    Returns:
-        Dictionary mapping table name to row count.
-    """
-    tables = ["fact_interventions", "mv_patient_treatment_summary"]
-    counts = {}
-
-    for table in tables:
-        cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
-        result = cursor.fetchone()
-        counts[table] = result[0] if result else 0
-
-    return counts
-
-
-def verify_fact_tables_exist(conn: sqlite3.Connection) -> list[str]:
-    """
-    Verify that all fact tables exist (including materialized views).
-
-    Args:
-        conn: SQLite database connection.
-
-    Returns:
-        List of missing table names. Empty list means all tables exist.
-    """
-    required_tables = ["fact_interventions", "mv_patient_treatment_summary"]
-    missing = []
-
-    for table in required_tables:
-        cursor = conn.execute(
-            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
-            (table,)
-        )
-        if cursor.fetchone() is None:
-            missing.append(table)
-
-    return missing
-
-
-# =============================================================================
-# File Tracking Helper Functions
-# =============================================================================
-
-def create_file_tracking_tables(conn: sqlite3.Connection) -> None:
-    """
-    Create file tracking tables in the database.
-
-    Args:
-        conn: SQLite database connection.
-    """
-    logger.info("Creating file tracking tables...")
-    conn.executescript(FILE_TRACKING_SCHEMA)
-    logger.info("File tracking tables created successfully")
-
-
-def drop_file_tracking_tables(conn: sqlite3.Connection) -> None:
-    """
-    Drop file tracking tables from the database.
-
-    Args:
-        conn: SQLite database connection.
-
-    Warning:
-        This will delete all file tracking history.
-    """
-    logger.warning("Dropping file tracking tables...")
-    conn.executescript("""
-        DROP TABLE IF EXISTS processed_files;
-    """)
-    logger.info("File tracking tables dropped")
-
-
-def get_file_tracking_counts(conn: sqlite3.Connection) -> dict[str, int]:
-    """
-    Get row counts for file tracking tables.
-
-    Args:
-        conn: SQLite database connection.
-
-    Returns:
-        Dictionary mapping table name to row count.
-    """
-    tables = ["processed_files"]
-    counts = {}
-
-    for table in tables:
-        cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
-        result = cursor.fetchone()
-        counts[table] = result[0] if result else 0
-
-    return counts
-
-
-def verify_file_tracking_tables_exist(conn: sqlite3.Connection) -> list[str]:
-    """
-    Verify that file tracking tables exist.
-
-    Args:
-        conn: SQLite database connection.
-
-    Returns:
-        List of missing table names. Empty list means all tables exist.
-    """
-    required_tables = ["processed_files"]
-    missing = []
-
-    for table in required_tables:
-        cursor = conn.execute(
-            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
-            (table,)
-        )
-        if cursor.fetchone() is None:
-            missing.append(table)
-
-    return missing
-
-
 # =============================================================================
 # Pathway Table Helper Functions
 # =============================================================================
@@ -1050,13 +621,37 @@ def migrate_pathway_nodes_chart_type(conn: sqlite3.Connection) -> tuple[bool, st
        return False, f"Migration failed: {e}"


+def migrate_refresh_log_source_row_count(conn: sqlite3.Connection) -> tuple[bool, str]:
+    """Add source_row_count column to pathway_refresh_log if it doesn't exist.
+
+    This column stores the Snowflake row count for display in the UI footer.
+    """
+    cursor = conn.execute("PRAGMA table_info(pathway_refresh_log)")
+    columns = [row[1] for row in cursor.fetchall()]
+
+    if "source_row_count" in columns:
+        return True, "source_row_count column already exists"
+
+    logger.info("Adding source_row_count column to pathway_refresh_log...")
+    try:
+        conn.execute("""
+            ALTER TABLE pathway_refresh_log
+            ADD COLUMN source_row_count INTEGER
+        """)
+        conn.commit()
+        return True, "Added source_row_count column"
+    except Exception as e:
+        logger.error(f"Failed to add source_row_count column: {e}")
+        return False, f"Migration failed: {e}"
+
+
 # =============================================================================
 # Combined Helper Functions
 # =============================================================================

 def create_all_tables(conn: sqlite3.Connection) -> None:
    """
-    Create all tables (reference + fact) in the database.
+    Create all tables (reference + pathway) in the database.

    Args:
        conn: SQLite database connection.
@@ -1078,8 +673,6 @@ def drop_all_tables(conn: sqlite3.Connection) -> None:
    """
    logger.warning("Dropping all tables...")
    drop_pathway_tables(conn)
-    drop_file_tracking_tables(conn)
-    drop_fact_tables(conn)
    drop_reference_tables(conn)
    logger.info("All tables dropped")

@@ -1096,8 +689,6 @@ def get_all_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    counts = {}
    counts.update(get_reference_table_counts(conn))
-    counts.update(get_fact_table_counts(conn))
-    counts.update(get_file_tracking_counts(conn))
    counts.update(get_pathway_table_counts(conn))
    return counts

@@ -1114,7 +705,5 @@ def verify_all_tables_exist(conn: sqlite3.Connection) -> list[str]:
    """
    missing = []
    missing.extend(verify_reference_tables_exist(conn))
-    missing.extend(verify_fact_tables_exist(conn))
-    missing.extend(verify_file_tracking_tables_exist(conn))
    missing.extend(verify_pathway_tables_exist(conn))
    return missing