refactor: slim pathways.db from 351 MB to 3.5 MB by removing unused tables
Drop fact_interventions (440K rows), mv_patient_treatment_summary (35K rows), ref_drug_snomed_mapping (144K rows), and processed_files — all unused since the app moved to pre-computed pathway_nodes.

Key changes:
- Rewrite load_data() to source from pathway_nodes + pathway_refresh_log
- Remove 7 dead methods and 8 dead state vars from pathways_app.py
- Delete patient_data.py, load_snomed_mapping.py, test_large_dataset_performance.py
- Remove SQLiteDataLoader (depended on fact_interventions)
- Remove file tracking schema (processed_files tracked fact_interventions loads)
- Remove legacy diagnosis functions from diagnosis_lookup.py
- Add source_row_count migration for pathway_refresh_log
- Clean all cross-references in __init__.py, data_source.py, migrate.py
This commit is contained in:
@@ -78,42 +78,6 @@ class DrugIndicationMatchRate:
|
||||
sample_unmatched: list[str] = field(default_factory=list) # Sample patient IDs
|
||||
|
||||
|
||||
@dataclass
class DrugSnomedMapping:
    """A single drug-to-SNOMED mapping row read from ref_drug_snomed_mapping.

    Attributes:
        snomed_code: SNOMED code mapped to the drug.
        snomed_description: Description stored alongside the SNOMED code.
        search_term: Search term stored on the mapping row.
        primary_directorate: Primary directorate stored on the mapping row.
        indication: Value of the ``indication`` column; defaults to "".
        ta_id: Value of the ``ta_id`` column; defaults to "".
    """

    # Required fields come first; their order is part of the positional
    # constructor signature and must not change.
    snomed_code: str
    snomed_description: str
    search_term: str
    primary_directorate: str

    # Optional fields default to the empty string rather than None.
    indication: str = ""
    ta_id: str = ""
|
||||
|
||||
|
||||
@dataclass
class DirectSnomedMatchResult:
    """Outcome of looking up a patient's GP records by exact SNOMED code.

    Attributes:
        patient_pseudonym: Identifier of the patient that was checked.
        matched: True when a matching SNOMED code was found.
        snomed_code: The matched SNOMED code, if any.
        snomed_description: Description of the matched code, if any.
        search_term: Search term mapped to the matched code, if any.
        primary_directorate: Directorate mapped to the matched code, if any.
        event_date: Event timestamp of the matched GP record, if any.
        source: Either "DIRECT_SNOMED" or "NONE"; defaults to "DIRECT_SNOMED".
    """

    patient_pseudonym: str
    matched: bool

    # Everything below is only populated when a match exists.
    snomed_code: Optional[str] = None
    snomed_description: Optional[str] = None
    search_term: Optional[str] = None
    primary_directorate: Optional[str] = None
    event_date: Optional[datetime] = None

    source: str = "DIRECT_SNOMED"
|
||||
|
||||
|
||||
@dataclass
class DirectorateAssignment:
    """Directorate chosen for one patient-drug combination.

    Attributes:
        upid: Unique patient ID the assignment belongs to.
        drug_name: Drug the assignment belongs to.
        directorate: Assigned directorate, or None when nothing was assigned.
        search_term: Search term of the matching mapping, if any.
        source: Either "DIAGNOSIS" or "FALLBACK"; defaults to "FALLBACK".
        snomed_code: SNOMED code that produced the assignment, if any.
        event_date: Event timestamp of the matching record, if any.
    """

    upid: str
    drug_name: str
    directorate: Optional[str]

    search_term: Optional[str] = None
    source: str = "FALLBACK"
    snomed_code: Optional[str] = None
    event_date: Optional[datetime] = None
|
||||
|
||||
|
||||
def get_drug_clusters(
|
||||
drug_name: str,
|
||||
db_manager: Optional[DatabaseManager] = None
|
||||
@@ -180,266 +144,6 @@ def get_drug_cluster_ids(
|
||||
return list(set(c["cluster_id"] for c in clusters))
|
||||
|
||||
|
||||
def get_drug_snomed_codes(
    drug_name: str,
    db_manager: Optional[DatabaseManager] = None
) -> list[DrugSnomedMapping]:
    """
    Fetch every SNOMED mapping stored for a drug in ref_drug_snomed_mapping.

    The drug is matched case-insensitively (via UPPER) against both the
    cleaned_drug_name and the drug_name columns of the local table.

    Args:
        drug_name: Drug name to look up.
        db_manager: Optional DatabaseManager (defaults to default_db_manager).

    Returns:
        One DrugSnomedMapping per distinct row, ordered by search_term and
        snomed_code. NULL text columns are returned as empty strings. An
        empty list is returned when nothing matches or when the lookup
        raises (the error is logged, not propagated).
    """
    manager = default_db_manager if db_manager is None else db_manager

    sql = """
        SELECT DISTINCT
            snomed_code,
            snomed_description,
            search_term,
            primary_directorate,
            indication,
            ta_id
        FROM ref_drug_snomed_mapping
        WHERE UPPER(cleaned_drug_name) = UPPER(?)
           OR UPPER(drug_name) = UPPER(?)
        ORDER BY search_term, snomed_code
    """

    try:
        with manager.get_connection() as conn:
            # The same name is bound twice: once per column being compared.
            fetched = conn.execute(sql, (drug_name, drug_name)).fetchall()

        mappings = [
            DrugSnomedMapping(
                snomed_code=record["snomed_code"],
                snomed_description=record["snomed_description"] or "",
                search_term=record["search_term"] or "",
                primary_directorate=record["primary_directorate"] or "",
                indication=record["indication"] or "",
                ta_id=record["ta_id"] or "",
            )
            for record in fetched
        ]
    except Exception as e:
        # Best-effort lookup: callers treat "no mappings" and "lookup failed"
        # the same way, so report the failure and hand back an empty list.
        logger.error(f"Error getting SNOMED codes for drug '{drug_name}': {e}")
        return []

    logger.debug(f"Found {len(mappings)} SNOMED mappings for drug '{drug_name}'")
    return mappings
|
||||
|
||||
|
||||
def patient_has_indication_direct(
    patient_pseudonym: str,
    drug_snomed_mappings: list[DrugSnomedMapping],
    connector: Optional[SnowflakeConnector] = None,
    before_date: Optional[date] = None,
) -> DirectSnomedMatchResult:
    """
    Look for any of a drug's SNOMED codes in one patient's GP records.

    Queries DATA_HUB.PHM."PrimaryCareClinicalCoding" for exact SNOMED code
    matches (no cluster indirection) and keeps only the row with the latest
    EventDateTime.

    Args:
        patient_pseudonym: Value compared against the PatientPseudonym column.
        drug_snomed_mappings: Mappings as returned by get_drug_snomed_codes().
        connector: Optional SnowflakeConnector (defaults to get_connector()).
        before_date: When given, only records with EventDateTime strictly
            before this date are considered.

    Returns:
        A DirectSnomedMatchResult. It is a non-match with source "NONE" when
        there are no mappings, Snowflake is unavailable or unconfigured, no
        row matches, or the query raises (the error is logged).
    """
    no_match = DirectSnomedMatchResult(
        patient_pseudonym=patient_pseudonym,
        matched=False,
        source="NONE",
    )

    if not drug_snomed_mappings:
        return no_match

    if not SNOWFLAKE_AVAILABLE:
        logger.warning("Snowflake connector not available")
        return no_match

    if not is_snowflake_configured():
        logger.warning("Snowflake not configured - cannot check GP records")
        return no_match

    if connector is None:
        connector = get_connector()

    # snomed_code -> (search_term, primary_directorate, snomed_description);
    # when a code appears more than once the last mapping wins.
    details_by_code = {}
    for mapping in drug_snomed_mappings:
        details_by_code[mapping.snomed_code] = (
            mapping.search_term,
            mapping.primary_directorate,
            mapping.snomed_description,
        )

    codes = list(details_by_code)
    placeholders = ", ".join(["%s"] * len(codes))

    query = f'''
        SELECT
            "SNOMEDCode",
            "EventDateTime"
        FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
        WHERE "PatientPseudonym" = %s
          AND "SNOMEDCode" IN ({placeholders})
    '''
    params: list = [patient_pseudonym, *codes]

    if before_date:
        query += ' AND "EventDateTime" < %s'
        params.append(before_date.isoformat())

    # Newest record first, and only that one is needed.
    query += ' ORDER BY "EventDateTime" DESC LIMIT 1'

    try:
        rows = connector.execute_dict(query, tuple(params))
        if not rows:
            return no_match

        newest = rows[0]
        code = newest.get("SNOMEDCode")
        when = newest.get("EventDateTime")

        if not code or code not in details_by_code:
            return no_match

        term, directorate, description = details_by_code[code]
        return DirectSnomedMatchResult(
            patient_pseudonym=patient_pseudonym,
            matched=True,
            snomed_code=code,
            snomed_description=description,
            search_term=term,
            primary_directorate=directorate,
            event_date=when,
            source="DIRECT_SNOMED",
        )

    except Exception as e:
        logger.error(f"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}")
        return no_match
|
||||
|
||||
|
||||
def get_directorate_from_diagnosis(
    upid: str,
    drug_name: str,
    connector: Optional[SnowflakeConnector] = None,
    db_manager: Optional[DatabaseManager] = None,
    before_date: Optional[date] = None,
) -> DirectorateAssignment:
    """
    Assign a directorate to a patient-drug pair from the patient's GP diagnoses.

    Steps:
        1. Load the drug's SNOMED codes via get_drug_snomed_codes().
        2. Check the patient's GP records via patient_has_indication_direct().
        3. On a match that carries a primary directorate, return a
           "DIAGNOSIS" assignment with that directorate and search term.
        4. Otherwise return a "FALLBACK" assignment with directorate=None;
           the caller is expected to apply its own fallback logic.

    Args:
        upid: Unique patient ID (Provider Code[:3] + PersonKey).
        drug_name: Drug name to look up.
        connector: Optional SnowflakeConnector (defaults to singleton).
        db_manager: Optional DatabaseManager (defaults to default_db_manager).
        before_date: Optional date - only check diagnoses before this date.

    Returns:
        DirectorateAssignment with directorate, search_term, and source.
    """
    fallback = DirectorateAssignment(
        upid=upid,
        drug_name=drug_name,
        directorate=None,
        source="FALLBACK",
    )

    mappings = get_drug_snomed_codes(drug_name, db_manager)
    if not mappings:
        logger.debug(f"No SNOMED mappings found for drug '{drug_name}' - using fallback")
        return fallback

    if not SNOWFLAKE_AVAILABLE:
        logger.debug("Snowflake not available - using fallback")
        return fallback

    if not is_snowflake_configured():
        logger.debug("Snowflake not configured - using fallback")
        return fallback

    # The GP lookup needs a PatientPseudonym, not a UPID. This function treats
    # everything after the 3-character provider prefix as that pseudonym; a
    # UPID of 3 characters or fewer is passed through unchanged.
    # NOTE(review): this relies on PersonKey == PatientPseudonym, whereas
    # batch_lookup_indication_groups states that PersonKey does NOT match GP
    # records and uses PseudoNHSNoLinked instead - confirm before relying on
    # this code path.
    patient_pseudonym = upid if len(upid) <= 3 else upid[3:]

    lookup = patient_has_indication_direct(
        patient_pseudonym=patient_pseudonym,
        drug_snomed_mappings=mappings,
        connector=connector,
        before_date=before_date,
    )

    # A match without a directorate is no use for assignment: fall back.
    if not (lookup.matched and lookup.primary_directorate):
        return fallback

    return DirectorateAssignment(
        upid=upid,
        drug_name=drug_name,
        directorate=lookup.primary_directorate,
        search_term=lookup.search_term,
        source="DIAGNOSIS",
        snomed_code=lookup.snomed_code,
        event_date=lookup.event_date,
    )
|
||||
|
||||
|
||||
def get_cluster_snomed_codes(
|
||||
cluster_id: str,
|
||||
connector: Optional[SnowflakeConnector] = None,
|
||||
@@ -864,229 +568,6 @@ def get_available_clusters(
|
||||
return []
|
||||
|
||||
|
||||
def batch_lookup_indication_groups(
    df: "pd.DataFrame",
    connector: Optional[SnowflakeConnector] = None,
    db_manager: Optional[DatabaseManager] = None,
    batch_size: int = 500,
) -> "pd.DataFrame":
    """
    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.

    This is the batch counterpart of get_directorate_from_diagnosis(): instead
    of one Snowflake query per patient, patients are queried in groups of
    ``batch_size``.

    Strategy:
        1. Collect the unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) rows.
        2. For each unique drug, load its SNOMED codes from local SQLite.
        3. Query GP records in batches for any of those codes, keeping the
           most recent matching record per patient.
        4. Map each UPID to one Indication_Group (the first row seen for a
           UPID decides).

    Args:
        df: DataFrame with columns UPID, Drug Name, Directory and
            PseudoNHSNoLinked. PseudoNHSNoLinked is the value compared with
            PatientPseudonym in the GP records; if the column is missing,
            every patient is returned as FALLBACK.
        connector: Optional SnowflakeConnector (defaults to get_connector()).
        db_manager: Optional DatabaseManager (defaults to default_db_manager).
        batch_size: Number of patients per Snowflake query batch.

    Returns:
        DataFrame with columns: UPID, Indication_Group, Source
        - Indication_Group: Search_Term (if matched) or "Directory (no GP dx)" (if not)
        - Source: "DIAGNOSIS" or "FALLBACK"

    Note:
        A failing batch query is logged and skipped; patients in that batch
        end up as FALLBACK.
    """
    import pandas as pd

    no_gp_suffix = " (no GP dx)"
    output_columns = ['UPID', 'Indication_Group', 'Source']

    def _all_fallback(pairs):
        """Label every (UPID, Directory) row of *pairs* as FALLBACK."""
        frame = pairs[['UPID', 'Directory']].copy()
        frame['Indication_Group'] = frame['Directory'] + no_gp_suffix
        frame['Source'] = "FALLBACK"
        return frame[output_columns]

    if db_manager is None:
        db_manager = default_db_manager

    logger.info(f"Starting batch indication lookup for {len(df)} records...")

    # Step 1: unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations.
    # PseudoNHSNoLinked is what matches PatientPseudonym in GP records.
    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records
    if 'PseudoNHSNoLinked' not in df.columns:
        logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records")
        return _all_fallback(df[['UPID', 'Directory']].drop_duplicates())

    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()
    logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")

    # Step 2: SNOMED codes for every unique drug.
    unique_drugs = unique_pairs['Drug Name'].unique()
    logger.info(f"Building SNOMED lookup for {len(unique_drugs)} unique drugs...")

    # snomed -> [(drug, search_term, primary_dir), ...]
    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}
    for drug_name in unique_drugs:
        for m in get_drug_snomed_codes(drug_name, db_manager):
            snomed_to_drug_searchterm.setdefault(m.snomed_code, []).append(
                (drug_name, m.search_term, m.primary_directorate)
            )

    snomed_list = list(snomed_to_drug_searchterm)
    logger.info(f"Total SNOMED codes to check: {len(snomed_list)}")

    # Step 3: bail out early when GP records cannot or need not be queried.
    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():
        logger.warning("Snowflake not available - returning fallback for all patients")
        return _all_fallback(unique_pairs).drop_duplicates(subset=['UPID'])

    # Checked before a connector is obtained so that no connection is opened
    # when there is nothing to look up.
    if not snomed_list:
        logger.warning("No SNOMED codes to check - returning fallback for all patients")
        return _all_fallback(unique_pairs).drop_duplicates(subset=['UPID'])

    if connector is None:
        connector = get_connector()

    # Step 4: query GP records in batches.
    # Missing pseudonyms can never match a GP record, so they are not sent.
    patient_pseudonyms = (
        unique_pairs['PseudoNHSNoLinked'].dropna().drop_duplicates().tolist()
    )
    logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...")

    # PatientPseudonym -> (snomed_code, event_date) of the most recent match
    gp_matches: dict[str, tuple[str, Any]] = {}

    # The SNOMED IN clause is identical for every batch.
    snomed_placeholders = ", ".join(["%s"] * len(snomed_list))
    snomed_params = tuple(snomed_list)

    for batch_start in range(0, len(patient_pseudonyms), batch_size):
        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))
        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]

        logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}")

        patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))

        # All matches are fetched; the most recent per patient is picked below.
        query = f'''
            SELECT
                "PatientPseudonym",
                "SNOMEDCode",
                "EventDateTime"
            FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
            WHERE "PatientPseudonym" IN ({patient_placeholders})
              AND "SNOMEDCode" IN ({snomed_placeholders})
            ORDER BY "PatientPseudonym", "EventDateTime" DESC
        '''
        params = tuple(batch_pseudonyms) + snomed_params

        try:
            results = connector.execute_dict(query, params)
        except Exception as e:
            logger.error(f"Error querying GP records for batch: {e}")
            # Continue with other batches
            continue

        for row in results:
            pseudonym = row.get("PatientPseudonym")
            snomed_code = row.get("SNOMEDCode")
            # Rows arrive newest-first per patient, so the first row seen for
            # a patient is the one to keep.
            if pseudonym and snomed_code and pseudonym not in gp_matches:
                gp_matches[pseudonym] = (snomed_code, row.get("EventDateTime"))

    logger.info(f"Found GP matches for {len(gp_matches)} patients")

    # Step 5: one (Indication_Group, Source) per UPID. A patient may be on
    # several drugs; the first row encountered for a UPID decides.
    upid_to_match: dict[str, tuple[str, str]] = {}

    rows_iter = zip(
        unique_pairs['UPID'],
        unique_pairs['Drug Name'],
        unique_pairs['PseudoNHSNoLinked'],
        unique_pairs['Directory'],
    )
    for upid, drug_name, patient_pseudonym, directory in rows_iter:
        if upid in upid_to_match:
            continue

        match = gp_matches.get(patient_pseudonym)
        candidates = snomed_to_drug_searchterm.get(match[0]) if match else None

        if not candidates:
            # No GP match (or, unexpectedly, an unmapped code) - use fallback
            upid_to_match[upid] = (directory + no_gp_suffix, "FALLBACK")
            continue

        # A SNOMED code may map to several drugs with different search terms:
        # prefer the entry for this row's drug, otherwise take the first one.
        # The third tuple element must NOT be unpacked into a variable named
        # `pd`: that would rebind the local pandas alias and break
        # pd.DataFrame() below.
        wanted = str(drug_name).upper()
        search_term = next(
            (term for drug, term, _primary_dir in candidates if drug.upper() == wanted),
            candidates[0][1],
        )
        upid_to_match[upid] = (search_term, "DIAGNOSIS")

    result_df = pd.DataFrame(
        [(upid, group, source) for upid, (group, source) in upid_to_match.items()],
        columns=output_columns,
    )

    # Log statistics
    total = len(result_df)
    diagnosis_count = int((result_df['Source'] == "DIAGNOSIS").sum())
    fallback_count = int((result_df['Source'] == "FALLBACK").sum())
    # Guard the percentages so an empty result cannot divide by zero.
    diagnosis_pct = 100 * diagnosis_count / total if total else 0.0
    fallback_pct = 100 * fallback_count / total if total else 0.0

    logger.info("Indication lookup complete:")
    logger.info(f"  Total unique patients: {total}")
    logger.info(f"  DIAGNOSIS matches: {diagnosis_count} ({diagnosis_pct:.1f}%)")
    logger.info(f"  FALLBACK (no GP match): {fallback_count} ({fallback_pct:.1f}%)")

    return result_df
|
||||
|
||||
|
||||
# === Drug-to-indication mapping from DimSearchTerm.csv ===
|
||||
|
||||
|
||||
@@ -1713,10 +1194,7 @@ __all__ = [
|
||||
"ClusterSnomedCodes",
|
||||
"IndicationValidationResult",
|
||||
"DrugIndicationMatchRate",
|
||||
"DrugSnomedMapping",
|
||||
"DirectSnomedMatchResult",
|
||||
"DirectorateAssignment",
|
||||
# Cluster-based lookup functions (existing)
|
||||
# Cluster-based lookup functions
|
||||
"get_drug_clusters",
|
||||
"get_drug_cluster_ids",
|
||||
"get_cluster_snomed_codes",
|
||||
@@ -1725,20 +1203,13 @@ __all__ = [
|
||||
"get_indication_match_rate",
|
||||
"batch_validate_indications",
|
||||
"get_available_clusters",
|
||||
# Direct SNOMED lookup functions (new)
|
||||
"get_drug_snomed_codes",
|
||||
"patient_has_indication_direct",
|
||||
# Diagnosis-based directorate assignment
|
||||
"get_directorate_from_diagnosis",
|
||||
# Batch lookup for indication groups
|
||||
"batch_lookup_indication_groups",
|
||||
# Drug-indication mapping from DimSearchTerm.csv
|
||||
"SEARCH_TERM_MERGE_MAP",
|
||||
"load_drug_indication_mapping",
|
||||
"get_search_terms_for_drug",
|
||||
# Drug-aware indication assignment
|
||||
"assign_drug_indications",
|
||||
# Snowflake-direct indication lookup (new approach)
|
||||
# Snowflake-direct indication lookup
|
||||
"get_patient_indication_groups",
|
||||
"CLUSTER_MAPPING_SQL",
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user