refactor: slim pathways.db from 351 MB to 3.5 MB by removing unused tables

Drop fact_interventions (440K rows), mv_patient_treatment_summary (35K rows),
ref_drug_snomed_mapping (144K rows), and processed_files — all unused since
the app moved to pre-computed pathway_nodes.

Key changes:
- Rewrite load_data() to source from pathway_nodes + pathway_refresh_log
- Remove 7 dead methods and 8 dead state vars from pathways_app.py
- Delete patient_data.py, load_snomed_mapping.py, test_large_dataset_performance.py
- Remove SQLiteDataLoader (depended on fact_interventions)
- Remove file tracking schema (processed_files tracked fact_interventions loads)
- Remove legacy diagnosis functions from diagnosis_lookup.py
- Add source_row_count migration for pathway_refresh_log
- Clean all cross-references in __init__.py, data_source.py, migrate.py
This commit is contained in:
Andrew Charlwood
2026-02-06 08:51:03 +00:00
parent bb93c1673e
commit 778ed99ef6
11 changed files with 95 additions and 3653 deletions
+2 -531
View File
@@ -78,42 +78,6 @@ class DrugIndicationMatchRate:
sample_unmatched: list[str] = field(default_factory=list) # Sample patient IDs
@dataclass
class DrugSnomedMapping:
    """One drug-to-SNOMED mapping row read from ref_drug_snomed_mapping.

    The four leading fields are required; ``indication`` and ``ta_id``
    fall back to an empty string when not supplied.
    """

    # SNOMED concept code and its human-readable description.
    snomed_code: str
    snomed_description: str
    # Search term and primary directorate recorded alongside the code.
    search_term: str
    primary_directorate: str
    # Optional extras; empty string means "not provided".
    indication: str = ""
    ta_id: str = ""
@dataclass
class DirectSnomedMatchResult:
    """Outcome of looking up a patient's GP records by exact SNOMED code.

    Only ``patient_pseudonym`` and ``matched`` are required. The detail
    fields default to ``None`` and are populated when a match is found.
    """

    patient_pseudonym: str
    matched: bool
    # Details of the matched code (None when nothing matched).
    snomed_code: Optional[str] = None
    snomed_description: Optional[str] = None
    search_term: Optional[str] = None
    primary_directorate: Optional[str] = None
    event_date: Optional[datetime] = None
    # Origin of the result: "DIRECT_SNOMED" or "NONE".
    source: str = "DIRECT_SNOMED"
@dataclass
class DirectorateAssignment:
    """Directorate chosen for one patient-drug pair, plus how it was chosen.

    ``upid``, ``drug_name`` and ``directorate`` are required (the last may
    be ``None``). The remaining fields are optional and default to the
    fallback state.
    """

    upid: str
    drug_name: str
    directorate: Optional[str]
    search_term: Optional[str] = None
    # How the assignment was made: "DIAGNOSIS" or "FALLBACK".
    source: str = "FALLBACK"
    # Matched SNOMED code and its event date, when a diagnosis was found.
    snomed_code: Optional[str] = None
    event_date: Optional[datetime] = None
def get_drug_clusters(
drug_name: str,
db_manager: Optional[DatabaseManager] = None
@@ -180,266 +144,6 @@ def get_drug_cluster_ids(
return list(set(c["cluster_id"] for c in clusters))
def get_drug_snomed_codes(
    drug_name: str,
    db_manager: Optional[DatabaseManager] = None
) -> list[DrugSnomedMapping]:
    """
    Fetch the SNOMED mappings stored for a drug in ref_drug_snomed_mapping.

    The drug is matched case-insensitively against both the
    ``cleaned_drug_name`` and ``drug_name`` columns. NULL text columns are
    returned as empty strings.

    Args:
        drug_name: Drug name to look up.
        db_manager: DatabaseManager to query; ``default_db_manager`` is used
            when omitted.

    Returns:
        A list of DrugSnomedMapping ordered by search_term then snomed_code.
        An empty list is returned when nothing matches or when the lookup
        raises (the error is logged, not propagated).
    """
    manager = default_db_manager if db_manager is None else db_manager

    query = """
        SELECT DISTINCT
            snomed_code,
            snomed_description,
            search_term,
            primary_directorate,
            indication,
            ta_id
        FROM ref_drug_snomed_mapping
        WHERE UPPER(cleaned_drug_name) = UPPER(?)
           OR UPPER(drug_name) = UPPER(?)
        ORDER BY search_term, snomed_code
    """

    try:
        with manager.get_connection() as conn:
            # The drug name is bound twice: once per column being compared.
            fetched = conn.execute(query, (drug_name, drug_name)).fetchall()

            mappings = [
                DrugSnomedMapping(
                    snomed_code=record["snomed_code"],
                    snomed_description=record["snomed_description"] or "",
                    search_term=record["search_term"] or "",
                    primary_directorate=record["primary_directorate"] or "",
                    indication=record["indication"] or "",
                    ta_id=record["ta_id"] or "",
                )
                for record in fetched
            ]

            logger.debug(f"Found {len(mappings)} SNOMED mappings for drug '{drug_name}'")
            return mappings

    except Exception as e:
        logger.error(f"Error getting SNOMED codes for drug '{drug_name}': {e}")
        return []
def patient_has_indication_direct(
    patient_pseudonym: str,
    drug_snomed_mappings: list[DrugSnomedMapping],
    connector: Optional[SnowflakeConnector] = None,
    before_date: Optional[date] = None,
) -> DirectSnomedMatchResult:
    """
    Look for any of the given SNOMED codes in one patient's GP records.

    Queries DATA_HUB.PHM."PrimaryCareClinicalCoding" for exact code matches
    (no cluster indirection) and keeps only the row with the latest
    EventDateTime.

    Args:
        patient_pseudonym: Value compared against "PatientPseudonym".
        drug_snomed_mappings: Candidate mappings, e.g. from
            get_drug_snomed_codes().
        connector: SnowflakeConnector to use; get_connector() is called when
            omitted.
        before_date: When given, only events strictly before this date count.

    Returns:
        DirectSnomedMatchResult. ``matched`` is False with source "NONE" when
        there are no mappings, Snowflake is unavailable or unconfigured, no
        row matches, or the query raises (the error is logged).
    """
    no_match = DirectSnomedMatchResult(
        patient_pseudonym=patient_pseudonym,
        matched=False,
        source="NONE",
    )

    # Guard clauses: nothing to look for, or nowhere to look.
    if not drug_snomed_mappings:
        return no_match
    if not SNOWFLAKE_AVAILABLE:
        logger.warning("Snowflake connector not available")
        return no_match
    if not is_snowflake_configured():
        logger.warning("Snowflake not configured - cannot check GP records")
        return no_match

    if connector is None:
        connector = get_connector()

    # snomed_code -> (search_term, primary_directorate, snomed_description).
    # When a code appears more than once, the last mapping wins.
    details_by_code = {}
    for mapping in drug_snomed_mappings:
        details_by_code[mapping.snomed_code] = (
            mapping.search_term,
            mapping.primary_directorate,
            mapping.snomed_description,
        )

    codes = list(details_by_code)
    code_placeholders = ", ".join("%s" for _ in codes)

    query = f'''
        SELECT
            "SNOMEDCode",
            "EventDateTime"
        FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
        WHERE "PatientPseudonym" = %s
          AND "SNOMEDCode" IN ({code_placeholders})
    '''
    params: list = [patient_pseudonym, *codes]

    if before_date:
        query += ' AND "EventDateTime" < %s'
        params.append(before_date.isoformat())

    # Newest event first; a single row is all that is needed.
    query += ' ORDER BY "EventDateTime" DESC LIMIT 1'

    try:
        rows = connector.execute_dict(query, tuple(params))
        if not rows:
            return no_match

        latest = rows[0]
        code = latest.get("SNOMEDCode")
        event_dt = latest.get("EventDateTime")
        if not code or code not in details_by_code:
            return no_match

        search_term, primary_dir, snomed_desc = details_by_code[code]
        return DirectSnomedMatchResult(
            patient_pseudonym=patient_pseudonym,
            matched=True,
            snomed_code=code,
            snomed_description=snomed_desc,
            search_term=search_term,
            primary_directorate=primary_dir,
            event_date=event_dt,
            source="DIRECT_SNOMED",
        )

    except Exception as e:
        logger.error(f"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}")
        return no_match
def get_directorate_from_diagnosis(
    upid: str,
    drug_name: str,
    connector: Optional[SnowflakeConnector] = None,
    db_manager: Optional[DatabaseManager] = None,
    before_date: Optional[date] = None,
) -> DirectorateAssignment:
    """
    Assign a directorate to a patient-drug pair from the patient's GP diagnoses.

    Steps:
        1. Load the drug's SNOMED mappings via get_drug_snomed_codes().
        2. Bail out to FALLBACK if Snowflake is unavailable or unconfigured.
        3. Derive the patient pseudonym from the UPID.
        4. Ask patient_has_indication_direct() for the most recent match.

    A FALLBACK result (directorate None) tells the caller to use another
    assignment method, e.g. department_identification() from tools/data.py.

    Args:
        upid: Unique patient ID (Provider Code[:3] + PersonKey).
        drug_name: Drug name to look up.
        connector: Optional SnowflakeConnector, forwarded to the GP lookup.
        db_manager: Optional DatabaseManager, forwarded to the mapping lookup.
        before_date: Optional date; only diagnoses before it are considered.

    Returns:
        DirectorateAssignment with source "DIAGNOSIS" when a match carrying a
        primary directorate was found, otherwise source "FALLBACK".
    """
    fallback = DirectorateAssignment(
        upid=upid,
        drug_name=drug_name,
        directorate=None,
        source="FALLBACK",
    )

    # Step 1: SNOMED codes known for this drug.
    mappings = get_drug_snomed_codes(drug_name, db_manager)
    if not mappings:
        logger.debug(f"No SNOMED mappings found for drug '{drug_name}' - using fallback")
        return fallback

    # Step 2: Snowflake must be both importable and configured.
    if not SNOWFLAKE_AVAILABLE:
        logger.debug("Snowflake not available - using fallback")
        return fallback
    if not is_snowflake_configured():
        logger.debug("Snowflake not configured - using fallback")
        return fallback

    # Step 3: UPID is ProviderCode[:3] + PersonKey, so dropping the first three
    # characters yields PersonKey, which is used here as the PatientPseudonym.
    # UPIDs of three characters or fewer are passed through unchanged.
    # NOTE(review): this assumes PersonKey equals the GP PatientPseudonym; the
    # comments in batch_lookup_indication_groups say PersonKey is a
    # provider-specific LocalPatientID that does NOT match GP records - confirm.
    patient_pseudonym = upid if len(upid) <= 3 else upid[3:]

    # Step 4: direct SNOMED lookup in the patient's GP records.
    match = patient_has_indication_direct(
        patient_pseudonym=patient_pseudonym,
        drug_snomed_mappings=mappings,
        connector=connector,
        before_date=before_date,
    )

    # A match without a primary directorate is treated the same as no match.
    if not (match.matched and match.primary_directorate):
        return fallback

    return DirectorateAssignment(
        upid=upid,
        drug_name=drug_name,
        directorate=match.primary_directorate,
        search_term=match.search_term,
        source="DIAGNOSIS",
        snomed_code=match.snomed_code,
        event_date=match.event_date,
    )
def get_cluster_snomed_codes(
cluster_id: str,
connector: Optional[SnowflakeConnector] = None,
@@ -864,229 +568,6 @@ def get_available_clusters(
return []
def batch_lookup_indication_groups(
    df: "pd.DataFrame",
    connector: Optional[SnowflakeConnector] = None,
    db_manager: Optional[DatabaseManager] = None,
    batch_size: int = 500,
) -> "pd.DataFrame":
    """
    Batch lookup of GP diagnosis-based indication groups for a DataFrame of patients.

    This is the batch counterpart of get_directorate_from_diagnosis(): GP
    records are queried for many patients per Snowflake round-trip instead
    of one query per patient.

    Strategy:
        1. Collect the unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) rows.
        2. For each unique drug, load its SNOMED codes from local SQLite.
        3. Query GP records in batches, keeping the most recent match per patient.
        4. Map each UPID to a single Indication_Group (first row per UPID wins).

    Args:
        df: DataFrame with columns UPID, Drug Name, Directory and
            PseudoNHSNoLinked. PseudoNHSNoLinked is the value compared against
            "PatientPseudonym" in the GP table; when the column is missing,
            every row is returned as FALLBACK.
        connector: Optional SnowflakeConnector (defaults to get_connector()).
        db_manager: Optional DatabaseManager (defaults to default_db_manager).
        batch_size: Number of patients per Snowflake query batch.

    Returns:
        DataFrame with columns UPID, Indication_Group, Source:
        - Indication_Group: the search term (if matched) or
          "<Directory> (no GP dx)" (if not).
        - Source: "DIAGNOSIS" or "FALLBACK".
    """
    import pandas as pd

    if db_manager is None:
        db_manager = default_db_manager

    logger.info(f"Starting batch indication lookup for {len(df)} records...")

    # PseudoNHSNoLinked is required to query Snowflake - it is what matches
    # PatientPseudonym in GP records. PersonKey is a provider-specific
    # LocalPatientID and does NOT match GP records.
    if 'PseudoNHSNoLinked' not in df.columns:
        logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records")
        return _fallback_indication_groups(df, one_row_per_upid=False)

    # Step 1: one lookup candidate per patient-drug pair.
    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()
    logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")

    # Step 2: snomed_code -> [(drug, search_term, primary_directorate), ...]
    unique_drugs = unique_pairs['Drug Name'].unique()
    logger.info(f"Building SNOMED lookup for {len(unique_drugs)} unique drugs...")

    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}
    for drug_name in unique_drugs:
        for m in get_drug_snomed_codes(drug_name, db_manager):
            snomed_to_drug_searchterm.setdefault(m.snomed_code, []).append(
                (drug_name, m.search_term, m.primary_directorate)
            )

    snomed_list = list(snomed_to_drug_searchterm)
    logger.info(f"Total SNOMED codes to check: {len(snomed_list)}")

    # Step 3: check Snowflake availability.
    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():
        logger.warning("Snowflake not available - returning fallback for all patients")
        return _fallback_indication_groups(unique_pairs, one_row_per_upid=True)

    # With no codes there is nothing to query, so skip creating a connector.
    if not snomed_list:
        logger.warning("No SNOMED codes to check - returning fallback for all patients")
        return _fallback_indication_groups(unique_pairs, one_row_per_upid=True)

    if connector is None:
        connector = get_connector()

    # Step 4: query GP records for all patients in batches.
    # Null pseudonyms can never match a GP record, so they are not sent.
    patient_pseudonyms = (
        unique_pairs['PseudoNHSNoLinked'].dropna().drop_duplicates().tolist()
    )
    logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...")

    # PatientPseudonym -> (snomed_code, event_date) of the most recent match.
    gp_matches: dict[str, tuple[str, Any]] = {}

    # SNOMED IN clause is identical for every batch.
    snomed_placeholders = ", ".join(["%s"] * len(snomed_list))

    for batch_start in range(0, len(patient_pseudonyms), batch_size):
        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))
        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]

        logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}")

        patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))

        # All matches come back newest-first per patient; the most recent
        # one is picked in Python below.
        query = f'''
            SELECT
                "PatientPseudonym",
                "SNOMEDCode",
                "EventDateTime"
            FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
            WHERE "PatientPseudonym" IN ({patient_placeholders})
              AND "SNOMEDCode" IN ({snomed_placeholders})
            ORDER BY "PatientPseudonym", "EventDateTime" DESC
        '''
        params = tuple(batch_pseudonyms) + tuple(snomed_list)

        try:
            results = connector.execute_dict(query, params)
            for row in results:
                person_key = row.get("PatientPseudonym")
                snomed_code = row.get("SNOMEDCode")
                # First row seen for a patient is the most recent (ORDER BY).
                if person_key and snomed_code and person_key not in gp_matches:
                    gp_matches[person_key] = (snomed_code, row.get("EventDateTime"))
        except Exception as e:
            # Best effort: a failed batch leaves its patients as FALLBACK.
            logger.error(f"Error querying GP records for batch: {e}")

    logger.info(f"Found GP matches for {len(gp_matches)} patients")

    # Step 5: one result per UPID. A patient may be on several drugs; the
    # first patient-drug row encountered decides the outcome for that UPID.
    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)

    for upid, drug_name, patient_pseudonym, directory in zip(
        unique_pairs['UPID'],
        unique_pairs['Drug Name'],
        unique_pairs['PseudoNHSNoLinked'],
        unique_pairs['Directory'],
    ):
        if upid in upid_to_match:
            continue

        gp_match = gp_matches.get(patient_pseudonym)
        candidates = snomed_to_drug_searchterm.get(gp_match[0]) if gp_match else None

        if not candidates:
            # No GP match (or an unrecognised code) - use fallback.
            upid_to_match[upid] = (directory + " (no GP dx)", "FALLBACK")
            continue

        # A SNOMED code may map to several drugs with different search terms:
        # prefer the entry for the current drug, else take the first entry.
        # The tuple's third element is deliberately bound to `_`: naming it
        # `pd` would rebind the pandas alias used to build the result below.
        search_term = next(
            (st for drug, st, _ in candidates if drug.upper() == drug_name.upper()),
            None,
        )
        if search_term is None:
            search_term = candidates[0][1]
        upid_to_match[upid] = (search_term, "DIAGNOSIS")

    result_df = pd.DataFrame(
        [
            {'UPID': upid, 'Indication_Group': indication_group, 'Source': source}
            for upid, (indication_group, source) in upid_to_match.items()
        ],
        columns=['UPID', 'Indication_Group', 'Source'],
    )

    # Log statistics (guarding the percentage against an empty result).
    total = len(result_df)
    diagnosis_count = int((result_df['Source'] == "DIAGNOSIS").sum())
    fallback_count = int((result_df['Source'] == "FALLBACK").sum())
    diagnosis_pct = 100 * diagnosis_count / total if total else 0.0
    fallback_pct = 100 * fallback_count / total if total else 0.0

    logger.info("Indication lookup complete:")
    logger.info(f" Total unique patients: {total}")
    logger.info(f" DIAGNOSIS matches: {diagnosis_count} ({diagnosis_pct:.1f}%)")
    logger.info(f" FALLBACK (no GP match): {fallback_count} ({fallback_pct:.1f}%)")

    return result_df


def _fallback_indication_groups(
    pairs: "pd.DataFrame",
    *,
    one_row_per_upid: bool,
) -> "pd.DataFrame":
    """Build the all-FALLBACK result: Indication_Group = "<Directory> (no GP dx)".

    Args:
        pairs: DataFrame containing at least UPID and Directory columns.
        one_row_per_upid: When True, keep only the first row for each UPID;
            when False, keep one row per distinct (UPID, Directory) pair.
    """
    out = pairs[['UPID', 'Directory']].drop_duplicates().copy()
    out['Indication_Group'] = out['Directory'] + " (no GP dx)"
    out['Source'] = "FALLBACK"
    out = out[['UPID', 'Indication_Group', 'Source']]
    if one_row_per_upid:
        out = out.drop_duplicates(subset=['UPID'])
    return out
# === Drug-to-indication mapping from DimSearchTerm.csv ===
@@ -1713,10 +1194,7 @@ __all__ = [
"ClusterSnomedCodes",
"IndicationValidationResult",
"DrugIndicationMatchRate",
"DrugSnomedMapping",
"DirectSnomedMatchResult",
"DirectorateAssignment",
# Cluster-based lookup functions (existing)
# Cluster-based lookup functions
"get_drug_clusters",
"get_drug_cluster_ids",
"get_cluster_snomed_codes",
@@ -1725,20 +1203,13 @@ __all__ = [
"get_indication_match_rate",
"batch_validate_indications",
"get_available_clusters",
# Direct SNOMED lookup functions (new)
"get_drug_snomed_codes",
"patient_has_indication_direct",
# Diagnosis-based directorate assignment
"get_directorate_from_diagnosis",
# Batch lookup for indication groups
"batch_lookup_indication_groups",
# Drug-indication mapping from DimSearchTerm.csv
"SEARCH_TERM_MERGE_MAP",
"load_drug_indication_mapping",
"get_search_terms_for_drug",
# Drug-aware indication assignment
"assign_drug_indications",
# Snowflake-direct indication lookup (new approach)
# Snowflake-direct indication lookup
"get_patient_indication_groups",
"CLUSTER_MAPPING_SQL",
]