refactor: slim pathways.db from 351 MB to 3.5 MB by removing unused tables

Drop fact_interventions (440K rows), mv_patient_treatment_summary (35K rows), ref_drug_snomed_mapping (144K rows), and processed_files — all unused since the app moved to pre-computed pathway_nodes.

Key changes:
- Rewrite load_data() to source from pathway_nodes + pathway_refresh_log
- Remove 7 dead methods and 8 dead state vars from pathways_app.py
- Delete patient_data.py, load_snomed_mapping.py, test_large_dataset_performance.py
- Remove SQLiteDataLoader (depended on fact_interventions)
- Remove file tracking schema (processed_files tracked fact_interventions loads)
- Remove legacy diagnosis functions from diagnosis_lookup.py
- Add source_row_count migration for pathway_refresh_log
- Clean all cross-references in __init__.py, data_source.py, migrate.py
2026-02-06 08:51:03 +00:00
parent bb93c1673e
commit 778ed99ef6
11 changed files with 95 additions and 3653 deletions
@@ -78,42 +78,6 @@ class DrugIndicationMatchRate:
    sample_unmatched: list[str] = field(default_factory=list)  # Sample patient IDs


-@dataclass
-class DrugSnomedMapping:
-    """SNOMED code mapping for a drug from ref_drug_snomed_mapping."""
-    snomed_code: str
-    snomed_description: str
-    search_term: str
-    primary_directorate: str
-    indication: str = ""
-    ta_id: str = ""
-
-
-@dataclass
-class DirectSnomedMatchResult:
-    """Result of direct SNOMED code lookup in GP records."""
-    patient_pseudonym: str
-    matched: bool
-    snomed_code: Optional[str] = None
-    snomed_description: Optional[str] = None
-    search_term: Optional[str] = None
-    primary_directorate: Optional[str] = None
-    event_date: Optional[datetime] = None
-    source: str = "DIRECT_SNOMED"  # DIRECT_SNOMED | NONE
-
-
-@dataclass
-class DirectorateAssignment:
-    """Result of directorate assignment for a patient-drug combination."""
-    upid: str
-    drug_name: str
-    directorate: Optional[str]
-    search_term: Optional[str] = None
-    source: str = "FALLBACK"  # DIAGNOSIS | FALLBACK
-    snomed_code: Optional[str] = None
-    event_date: Optional[datetime] = None
-
-
 def get_drug_clusters(
    drug_name: str,
    db_manager: Optional[DatabaseManager] = None
@@ -180,266 +144,6 @@ def get_drug_cluster_ids(
    return list(set(c["cluster_id"] for c in clusters))


-def get_drug_snomed_codes(
-    drug_name: str,
-    db_manager: Optional[DatabaseManager] = None
-) -> list[DrugSnomedMapping]:
-    """
-    Get all SNOMED codes for a drug from local ref_drug_snomed_mapping table.
-
-    This uses the enriched mapping CSV data loaded into SQLite, which provides
-    direct SNOMED-to-drug mappings with Search_Term and PrimaryDirectorate.
-
-    Args:
-        drug_name: Drug name to look up (case-insensitive, matches cleaned_drug_name)
-        db_manager: Optional DatabaseManager (defaults to default_db_manager)
-
-    Returns:
-        List of DrugSnomedMapping with snomed_code, snomed_description,
-        search_term, primary_directorate, indication, ta_id
-    """
-    if db_manager is None:
-        db_manager = default_db_manager
-
-    query = """
-        SELECT DISTINCT
-            snomed_code,
-            snomed_description,
-            search_term,
-            primary_directorate,
-            indication,
-            ta_id
-        FROM ref_drug_snomed_mapping
-        WHERE UPPER(cleaned_drug_name) = UPPER(?)
-           OR UPPER(drug_name) = UPPER(?)
-        ORDER BY search_term, snomed_code
-    """
-
-    try:
-        with db_manager.get_connection() as conn:
-            cursor = conn.execute(query, (drug_name, drug_name))
-            rows = cursor.fetchall()
-
-            results = []
-            for row in rows:
-                results.append(DrugSnomedMapping(
-                    snomed_code=row["snomed_code"],
-                    snomed_description=row["snomed_description"] or "",
-                    search_term=row["search_term"] or "",
-                    primary_directorate=row["primary_directorate"] or "",
-                    indication=row["indication"] or "",
-                    ta_id=row["ta_id"] or "",
-                ))
-
-            logger.debug(f"Found {len(results)} SNOMED mappings for drug '{drug_name}'")
-            return results
-
-    except Exception as e:
-        logger.error(f"Error getting SNOMED codes for drug '{drug_name}': {e}")
-        return []
-
-
-def patient_has_indication_direct(
-    patient_pseudonym: str,
-    drug_snomed_mappings: list[DrugSnomedMapping],
-    connector: Optional[SnowflakeConnector] = None,
-    before_date: Optional[date] = None,
-) -> DirectSnomedMatchResult:
-    """
-    Check if patient has any of the SNOMED codes in their GP records.
-
-    This is the direct SNOMED lookup - it queries PrimaryCareClinicalCoding
-    for exact SNOMED code matches (not via cluster). Returns the most recent
-    match by EventDateTime if multiple matches exist.
-
-    Args:
-        patient_pseudonym: Patient's pseudonymised NHS number
-        drug_snomed_mappings: List of DrugSnomedMapping from get_drug_snomed_codes()
-        connector: Optional SnowflakeConnector (defaults to singleton)
-        before_date: Optional date - only check diagnoses before this date
-
-    Returns:
-        DirectSnomedMatchResult with match details (most recent by EventDateTime)
-    """
-    result = DirectSnomedMatchResult(
-        patient_pseudonym=patient_pseudonym,
-        matched=False,
-        source="NONE",
-    )
-
-    if not drug_snomed_mappings:
-        return result
-
-    if not SNOWFLAKE_AVAILABLE:
-        logger.warning("Snowflake connector not available")
-        return result
-
-    if not is_snowflake_configured():
-        logger.warning("Snowflake not configured - cannot check GP records")
-        return result
-
-    if connector is None:
-        connector = get_connector()
-
-    # Build lookup dict for mapping snomed_code -> (search_term, primary_directorate, snomed_description)
-    snomed_lookup = {
-        m.snomed_code: (m.search_term, m.primary_directorate, m.snomed_description)
-        for m in drug_snomed_mappings
-    }
-
-    # Get unique SNOMED codes
-    snomed_codes = list(snomed_lookup.keys())
-
-    # Build placeholders for SNOMED codes
-    placeholders = ", ".join(["%s"] * len(snomed_codes))
-
-    # Query to find most recent matching SNOMED code in GP records
-    query = f'''
-        SELECT
-            "SNOMEDCode",
-            "EventDateTime"
-        FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
-        WHERE "PatientPseudonym" = %s
-            AND "SNOMEDCode" IN ({placeholders})
-    '''
-
-    params: list = [patient_pseudonym] + snomed_codes
-
-    if before_date:
-        query += ' AND "EventDateTime" < %s'
-        params.append(before_date.isoformat())
-
-    query += ' ORDER BY "EventDateTime" DESC LIMIT 1'
-
-    try:
-        results = connector.execute_dict(query, tuple(params))
-
-        if results:
-            row = results[0]
-            matched_code = row.get("SNOMEDCode")
-            event_dt = row.get("EventDateTime")
-
-            if matched_code and matched_code in snomed_lookup:
-                search_term, primary_dir, snomed_desc = snomed_lookup[matched_code]
-
-                return DirectSnomedMatchResult(
-                    patient_pseudonym=patient_pseudonym,
-                    matched=True,
-                    snomed_code=matched_code,
-                    snomed_description=snomed_desc,
-                    search_term=search_term,
-                    primary_directorate=primary_dir,
-                    event_date=event_dt,
-                    source="DIRECT_SNOMED",
-                )
-
-        return result
-
-    except Exception as e:
-        logger.error(f"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}")
-        return result
-
-
-def get_directorate_from_diagnosis(
-    upid: str,
-    drug_name: str,
-    connector: Optional[SnowflakeConnector] = None,
-    db_manager: Optional[DatabaseManager] = None,
-    before_date: Optional[date] = None,
-) -> DirectorateAssignment:
-    """
-    Get directorate assignment for a patient-drug combination using diagnosis-based lookup.
-
-    This function attempts to assign a directorate based on the patient's GP records
-    (direct SNOMED code matching). If no match is found, it returns a FALLBACK result
-    indicating that the caller should use alternative assignment methods (e.g.,
-    department_identification() from tools/data.py).
-
-    Workflow:
-    1. Get all SNOMED codes for the drug from ref_drug_snomed_mapping
-    2. Query patient's GP records for matching SNOMED codes
-    3. If match found → return diagnosis-based directorate and search_term
-    4. If no match → return FALLBACK result (caller handles fallback logic)
-
-    Args:
-        upid: Patient's unique patient ID (Provider Code[:3] + PersonKey)
-        drug_name: Drug name to look up
-        connector: Optional SnowflakeConnector (defaults to singleton)
-        db_manager: Optional DatabaseManager (defaults to default_db_manager)
-        before_date: Optional date - only check diagnoses before this date
-
-    Returns:
-        DirectorateAssignment with directorate, search_term, and source
-    """
-    result = DirectorateAssignment(
-        upid=upid,
-        drug_name=drug_name,
-        directorate=None,
-        source="FALLBACK",
-    )
-
-    # Step 1: Get SNOMED codes for the drug
-    drug_snomed_mappings = get_drug_snomed_codes(drug_name, db_manager)
-
-    if not drug_snomed_mappings:
-        logger.debug(f"No SNOMED mappings found for drug '{drug_name}' - using fallback")
-        return result
-
-    # Step 2: Check Snowflake availability
-    if not SNOWFLAKE_AVAILABLE:
-        logger.debug("Snowflake not available - using fallback")
-        return result
-
-    if not is_snowflake_configured():
-        logger.debug("Snowflake not configured - using fallback")
-        return result
-
-    # Step 3: Get patient pseudonym from UPID
-    # UPID format is Provider Code (3 chars) + PersonKey
-    # We need to query Snowflake to get the PatientPseudonym for this PersonKey
-    # However, patient_has_indication_direct expects PatientPseudonym, not UPID
-    # For now, we'll use UPID as the identifier - the actual integration
-    # will need to happen at the DataFrame level where we have PersonKey
-    #
-    # NOTE: This function will be called from the pipeline where we have
-    # access to PatientPseudonym. The UPID is passed for logging/tracking.
-
-    # Actually, looking at the pipeline, we need PatientPseudonym, not UPID.
-    # The caller should pass the PatientPseudonym or we need to look it up.
-    # For now, let's assume the caller will use this in a batch context
-    # where they can map UPID -> PatientPseudonym.
-
-    # Let me reconsider: the function signature takes UPID but we need
-    # PatientPseudonym for Snowflake. In the pipeline context (fetch_and_transform_data),
-    # we'll have the PersonKey column which IS the PatientPseudonym.
-    # So UPID = ProviderCode[:3] + PersonKey, and PersonKey = PatientPseudonym.
-    #
-    # We can extract PatientPseudonym from UPID by removing the first 3 chars.
-    patient_pseudonym = upid[3:] if len(upid) > 3 else upid
-
-    # Step 4: Check patient's GP records for matching SNOMED codes
-    match_result = patient_has_indication_direct(
-        patient_pseudonym=patient_pseudonym,
-        drug_snomed_mappings=drug_snomed_mappings,
-        connector=connector,
-        before_date=before_date,
-    )
-
-    if match_result.matched and match_result.primary_directorate:
-        return DirectorateAssignment(
-            upid=upid,
-            drug_name=drug_name,
-            directorate=match_result.primary_directorate,
-            search_term=match_result.search_term,
-            source="DIAGNOSIS",
-            snomed_code=match_result.snomed_code,
-            event_date=match_result.event_date,
-        )
-
-    # No match found - return fallback result
-    return result
-
-
 def get_cluster_snomed_codes(
    cluster_id: str,
    connector: Optional[SnowflakeConnector] = None,
@@ -864,229 +568,6 @@ def get_available_clusters(
        return []


-def batch_lookup_indication_groups(
-    df: "pd.DataFrame",
-    connector: Optional[SnowflakeConnector] = None,
-    db_manager: Optional[DatabaseManager] = None,
-    batch_size: int = 500,
-) -> "pd.DataFrame":
-    """
-    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.
-
-    This is the efficient batch version of get_directorate_from_diagnosis().
-    Instead of querying Snowflake per patient, it batches the lookups for performance.
-
-    Strategy:
-    1. Get all unique (PersonKey, Drug Name) pairs from DataFrame
-    2. For each unique drug, get all SNOMED codes from local SQLite
-    3. Build batched Snowflake queries to check GP records
-    4. Return indication_df mapping UPID → Indication_Group
-
-    For unmatched patients, Indication_Group will be their Directory (with suffix).
-
-    Args:
-        df: DataFrame with columns: UPID, Drug Name, Directory, PersonKey
-        connector: Optional SnowflakeConnector (defaults to singleton)
-        db_manager: Optional DatabaseManager (defaults to default_db_manager)
-        batch_size: Number of patients per Snowflake query batch
-
-    Returns:
-        DataFrame with columns: UPID, Indication_Group, Source
-        - Indication_Group: Search_Term (if matched) or "Directory (no GP dx)" (if not)
-        - Source: "DIAGNOSIS" or "FALLBACK"
-    """
-    import pandas as pd
-
-    if db_manager is None:
-        db_manager = default_db_manager
-
-    logger.info(f"Starting batch indication lookup for {len(df)} records...")
-
-    # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations
-    # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records
-    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records
-    if 'PseudoNHSNoLinked' not in df.columns:
-        logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records")
-        # Return fallback for all patients
-        result_df = df[['UPID', 'Directory']].drop_duplicates().copy()
-        result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
-        result_df['Source'] = "FALLBACK"
-        return result_df[['UPID', 'Indication_Group', 'Source']]
-
-    # Get unique patient-drug combinations (we need one lookup per patient-drug pair)
-    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()
-    logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")
-
-    # Step 2: Get all unique drugs and their SNOMED codes
-    unique_drugs = unique_pairs['Drug Name'].unique()
-    logger.info(f"Building SNOMED lookup for {len(unique_drugs)} unique drugs...")
-
-    # Build drug -> list of DrugSnomedMapping dict
-    drug_snomed_map: dict[str, list[DrugSnomedMapping]] = {}
-    all_snomed_codes: set[str] = set()
-    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}  # snomed -> [(drug, search_term, primary_dir), ...]
-
-    for drug_name in unique_drugs:
-        mappings = get_drug_snomed_codes(drug_name, db_manager)
-        drug_snomed_map[drug_name] = mappings
-
-        for m in mappings:
-            all_snomed_codes.add(m.snomed_code)
-            if m.snomed_code not in snomed_to_drug_searchterm:
-                snomed_to_drug_searchterm[m.snomed_code] = []
-            snomed_to_drug_searchterm[m.snomed_code].append(
-                (drug_name, m.search_term, m.primary_directorate)
-            )
-
-    logger.info(f"Total SNOMED codes to check: {len(all_snomed_codes)}")
-
-    # Step 3: Check Snowflake availability
-    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():
-        logger.warning("Snowflake not available - returning fallback for all patients")
-        result_df = unique_pairs[['UPID', 'Directory']].copy()
-        result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
-        result_df['Source'] = "FALLBACK"
-        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])
-
-    if connector is None:
-        connector = get_connector()
-
-    # Step 4: Query GP records for all patients in batches
-    # The query finds the most recent matching SNOMED code for each patient
-
-    # Get unique PseudoNHSNoLinked values (each = one patient in GP records)
-    unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])
-    patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()
-
-    logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...")
-
-    # Results dict: PersonKey -> (snomed_code, event_date)
-    gp_matches: dict[str, tuple[str, Any]] = {}
-
-    # Convert SNOMED codes to list for query
-    snomed_list = list(all_snomed_codes)
-
-    if not snomed_list:
-        logger.warning("No SNOMED codes to check - returning fallback for all patients")
-        result_df = unique_pairs[['UPID', 'Directory']].copy()
-        result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
-        result_df['Source'] = "FALLBACK"
-        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])
-
-    # Build SNOMED IN clause (reused across batches)
-    snomed_placeholders = ", ".join(["%s"] * len(snomed_list))
-
-    # Process patients in batches
-    for batch_start in range(0, len(patient_pseudonyms), batch_size):
-        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))
-        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]
-
-        logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}")
-
-        # Build patient IN clause
-        patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))
-
-        # Query to find all matching SNOMED codes for these patients
-        # We'll get all matches and pick the most recent per patient in Python
-        query = f'''
-            SELECT
-                "PatientPseudonym",
-                "SNOMEDCode",
-                "EventDateTime"
-            FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
-            WHERE "PatientPseudonym" IN ({patient_placeholders})
-              AND "SNOMEDCode" IN ({snomed_placeholders})
-            ORDER BY "PatientPseudonym", "EventDateTime" DESC
-        '''
-
-        params = tuple(batch_pseudonyms) + tuple(snomed_list)
-
-        try:
-            results = connector.execute_dict(query, params)
-
-            # Process results - pick most recent per patient
-            for row in results:
-                person_key = row.get("PatientPseudonym")
-                snomed_code = row.get("SNOMEDCode")
-                event_date = row.get("EventDateTime")
-
-                if person_key and snomed_code:
-                    # Keep only if we haven't seen this patient yet (first = most recent due to ORDER BY)
-                    if person_key not in gp_matches:
-                        gp_matches[person_key] = (snomed_code, event_date)
-
-        except Exception as e:
-            logger.error(f"Error querying GP records for batch: {e}")
-            # Continue with other batches
-
-    logger.info(f"Found GP matches for {len(gp_matches)} patients")
-
-    # Step 5: Build result DataFrame
-    # For each unique_pair, determine Indication_Group based on match status
-    results_list = []
-
-    # We need to dedupe by UPID - a patient might be on multiple drugs
-    # Strategy: For each UPID, use the most recent match (if any)
-    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)
-
-    for _, row in unique_pairs.iterrows():
-        upid = row['UPID']
-        drug_name = row['Drug Name']
-        patient_pseudonym = row['PseudoNHSNoLinked']
-        directory = row['Directory']
-
-        # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)
-        if patient_pseudonym in gp_matches:
-            matched_snomed, event_date = gp_matches[patient_pseudonym]
-
-            # Find the search_term for this SNOMED code and drug
-            # (A SNOMED code might map to multiple drugs with different search_terms)
-            if matched_snomed in snomed_to_drug_searchterm:
-                # Look for match with current drug first
-                search_term = None
-                for drug, st, pd in snomed_to_drug_searchterm[matched_snomed]:
-                    if drug.upper() == drug_name.upper():
-                        search_term = st
-                        break
-                # If no drug-specific match, use any match
-                if search_term is None:
-                    search_term = snomed_to_drug_searchterm[matched_snomed][0][1]
-
-                # Only update if we don't have a match for this UPID yet
-                if upid not in upid_to_match:
-                    upid_to_match[upid] = (search_term, "DIAGNOSIS")
-            else:
-                # Shouldn't happen but fallback just in case
-                if upid not in upid_to_match:
-                    upid_to_match[upid] = (directory + " (no GP dx)", "FALLBACK")
-        else:
-            # No GP match - use fallback
-            if upid not in upid_to_match:
-                upid_to_match[upid] = (directory + " (no GP dx)", "FALLBACK")
-
-    # Build result DataFrame
-    for upid, (indication_group, source) in upid_to_match.items():
-        results_list.append({
-            'UPID': upid,
-            'Indication_Group': indication_group,
-            'Source': source,
-        })
-
-    result_df = pd.DataFrame(results_list)
-
-    # Log statistics
-    diagnosis_count = len([s for s in result_df['Source'] if s == "DIAGNOSIS"])
-    fallback_count = len([s for s in result_df['Source'] if s == "FALLBACK"])
-    total = len(result_df)
-
-    logger.info(f"Indication lookup complete:")
-    logger.info(f"  Total unique patients: {total}")
-    logger.info(f"  DIAGNOSIS matches: {diagnosis_count} ({100*diagnosis_count/total:.1f}%)")
-    logger.info(f"  FALLBACK (no GP match): {fallback_count} ({100*fallback_count/total:.1f}%)")
-
-    return result_df
-
-
 # === Drug-to-indication mapping from DimSearchTerm.csv ===


@@ -1713,10 +1194,7 @@ __all__ = [
    "ClusterSnomedCodes",
    "IndicationValidationResult",
    "DrugIndicationMatchRate",
-    "DrugSnomedMapping",
-    "DirectSnomedMatchResult",
-    "DirectorateAssignment",
-    # Cluster-based lookup functions (existing)
+    # Cluster-based lookup functions
    "get_drug_clusters",
    "get_drug_cluster_ids",
    "get_cluster_snomed_codes",
@@ -1725,20 +1203,13 @@ __all__ = [
    "get_indication_match_rate",
    "batch_validate_indications",
    "get_available_clusters",
-    # Direct SNOMED lookup functions (new)
-    "get_drug_snomed_codes",
-    "patient_has_indication_direct",
-    # Diagnosis-based directorate assignment
-    "get_directorate_from_diagnosis",
-    # Batch lookup for indication groups
-    "batch_lookup_indication_groups",
    # Drug-indication mapping from DimSearchTerm.csv
    "SEARCH_TERM_MERGE_MAP",
    "load_drug_indication_mapping",
    "get_search_terms_for_drug",
    # Drug-aware indication assignment
    "assign_drug_indications",
-    # Snowflake-direct indication lookup (new approach)
+    # Snowflake-direct indication lookup
    "get_patient_indication_groups",
    "CLUSTER_MAPPING_SQL",
 ]