From 5b1569ed5cb932448cf88fc7ffd72a8d1d65fb3f Mon Sep 17 00:00:00 2001 From: Andrew Charlwood Date: Thu, 5 Feb 2026 15:49:24 +0000 Subject: [PATCH] fix: correct patient identifier for GP diagnosis lookup (Task 3.3) Two critical fixes for the indication-based pathway feature: 1. clean_snomed_code() now handles scientific notation (e.g., "1.06e+16") - CSV export from pandas/Excel converts large SNOMED codes to scientific notation - Without this fix, codes like "10629311000119108" were stored as "1.06e+16" - Now converts scientific notation back to an integer string; this is exact only if the CSV kept every digit (e.g., "1.0629311000119108e+16") - a value already truncated to "1.06e+16" cannot be restored and needs a re-export with codes as strings 2. batch_lookup_indication_groups() now uses PseudoNHSNoLinked instead of PersonKey - PersonKey is LocalPatientID (provider-specific like "J188448") - PseudoNHSNoLinked is the pseudonymised NHS number that matches PatientPseudonym in GP records - Without this fix, 0% of patients matched GP records - Test shows ~20% match rate for ADALIMUMAB patients with correct identifier --- IMPLEMENTATION_PLAN.md | 2 +- data_processing/diagnosis_lookup.py | 37 +++++++++++++------------- data_processing/load_snomed_mapping.py | 27 +++++++++++++++---- 3 files changed, 42 insertions(+), 24 deletions(-) diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md index 368748f..d0ad397 100644 --- a/IMPLEMENTATION_PLAN.md +++ b/IMPLEMENTATION_PLAN.md @@ -130,7 +130,7 @@ python -m reflex compile - [ ] Verify: Test refresh with --dry-run, check coverage stats ### 3.3 Test Full Refresh Pipeline -- [ ] Run `python -m cli.refresh_pathways` with real data +- [~] Run `python -m cli.refresh_pathways` with real data - [ ] Verify pathway_nodes table has both chart_type values - [ ] Verify indication chart has expected hierarchy (Trust → SearchTerm → Drug) - [ ] Verify unmatched patients appear with directorate fallback label diff --git a/data_processing/diagnosis_lookup.py b/data_processing/diagnosis_lookup.py index 047eee9..824f1cd 100644 --- a/data_processing/diagnosis_lookup.py +++ b/data_processing/diagnosis_lookup.py @@ -902,10 +902,11 @@ def 
batch_lookup_indication_groups( logger.info(f"Starting batch indication lookup for {len(df)} records...") - # Step 1: Get unique (UPID, Drug Name, PersonKey, Directory) combinations - # We need PersonKey to query Snowflake (it's the PatientPseudonym) - if 'PersonKey' not in df.columns: - logger.error("DataFrame missing 'PersonKey' column - cannot lookup GP records") + # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations + # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records + # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records + if 'PseudoNHSNoLinked' not in df.columns: + logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records") # Return fallback for all patients result_df = df[['UPID', 'Directory']].drop_duplicates().copy() result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)" @@ -913,7 +914,7 @@ def batch_lookup_indication_groups( return result_df[['UPID', 'Indication_Group', 'Source']] # Get unique patient-drug combinations (we need one lookup per patient-drug pair) - unique_pairs = df[['UPID', 'Drug Name', 'PersonKey', 'Directory']].drop_duplicates() + unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates() logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations") # Step 2: Get all unique drugs and their SNOMED codes @@ -953,11 +954,11 @@ def batch_lookup_indication_groups( # Step 4: Query GP records for all patients in batches # The query finds the most recent matching SNOMED code for each patient - # Get unique PersonKeys (each PersonKey = one patient) - unique_patients = unique_pairs[['PersonKey', 'UPID', 'Directory']].drop_duplicates(subset=['PersonKey']) - person_keys = unique_patients['PersonKey'].tolist() + # Get unique PseudoNHSNoLinked values (each = one patient in GP records) + unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 
'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked']) + patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist() - logger.info(f"Querying GP records for {len(person_keys)} unique patients in batches of {batch_size}...") + logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...") # Results dict: PersonKey -> (snomed_code, event_date) gp_matches: dict[str, tuple[str, Any]] = {} @@ -976,14 +977,14 @@ def batch_lookup_indication_groups( snomed_placeholders = ", ".join(["%s"] * len(snomed_list)) # Process patients in batches - for batch_start in range(0, len(person_keys), batch_size): - batch_end = min(batch_start + batch_size, len(person_keys)) - batch_person_keys = person_keys[batch_start:batch_end] + for batch_start in range(0, len(patient_pseudonyms), batch_size): + batch_end = min(batch_start + batch_size, len(patient_pseudonyms)) + batch_pseudonyms = patient_pseudonyms[batch_start:batch_end] logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}") # Build patient IN clause - patient_placeholders = ", ".join(["%s"] * len(batch_person_keys)) + patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms)) # Query to find all matching SNOMED codes for these patients # We'll get all matches and pick the most recent per patient in Python @@ -998,7 +999,7 @@ def batch_lookup_indication_groups( ORDER BY "PatientPseudonym", "EventDateTime" DESC ''' - params = tuple(batch_person_keys) + tuple(snomed_list) + params = tuple(batch_pseudonyms) + tuple(snomed_list) try: results = connector.execute_dict(query, params) @@ -1031,12 +1032,12 @@ def batch_lookup_indication_groups( for _, row in unique_pairs.iterrows(): upid = row['UPID'] drug_name = row['Drug Name'] - person_key = row['PersonKey'] + patient_pseudonym = row['PseudoNHSNoLinked'] directory = row['Directory'] - # Check if patient has GP match - if person_key in gp_matches: - matched_snomed, event_date = 
gp_matches[person_key] + # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP) + if patient_pseudonym in gp_matches: + matched_snomed, event_date = gp_matches[patient_pseudonym] # Find the search_term for this SNOMED code and drug # (A SNOMED code might map to multiple drugs with different search_terms) diff --git a/data_processing/load_snomed_mapping.py b/data_processing/load_snomed_mapping.py index 9011160..c10821c 100644 --- a/data_processing/load_snomed_mapping.py +++ b/data_processing/load_snomed_mapping.py @@ -36,23 +36,40 @@ DEFAULT_CSV_PATH = Path("./data/drug_snomed_mapping_enriched.csv") def clean_snomed_code(snomed_code: str) -> str: """ - Clean SNOMED code by removing trailing .0 suffix. + Clean SNOMED code by removing trailing .0 suffix and handling scientific notation. - The enriched CSV has SNOMED codes with decimal notation (e.g., "156370009.0") - that need to be converted to clean integer strings. + The enriched CSV has SNOMED codes that may be in decimal notation (e.g., "156370009.0") + or scientific notation (e.g., "1.0629311000119108e+16") due to pandas/Excel export. + These need to be converted to clean integer strings. Args: snomed_code: Raw SNOMED code from CSV. Returns: - Cleaned SNOMED code as string (e.g., "156370009"). + Cleaned SNOMED code as string (e.g., "156370009" or "10629311000119108"). 
""" if not snomed_code: return "" code = snomed_code.strip() - # Remove trailing .0 if present + # Handle scientific notation (e.g., "1.0629311000119108e+16") + if 'e' in code.lower(): + try: + # Convert to float first, then to int, then to string + # NOTE: float64 holds only ~15-17 significant digits, so this is exact only if the text kept every digit of the code + value = float(code) + # Check if it's a whole number (no decimal part) + if value == int(value): + return str(int(value)) + else: + # Has decimal part - return as cleaned float + return str(value).replace('.0', '') + except (ValueError, OverflowError): + # If conversion fails, return as-is but cleaned + return code + + # Remove trailing .0 if present (for non-scientific notation) if code.endswith(".0"): code = code[:-2]