From 5b1569ed5cb932448cf88fc7ffd72a8d1d65fb3f Mon Sep 17 00:00:00 2001
From: Andrew Charlwood <andrew.charlwood@nhs.net>
Date: Thu, 5 Feb 2026 15:49:24 +0000
Subject: [PATCH] fix: correct patient identifier for GP diagnosis lookup (Task
 3.3)

Two critical fixes for the indication-based pathway feature:

1. clean_snomed_code() now handles scientific notation (e.g., "1.06e+16")
   - CSV export from pandas/Excel converts large SNOMED codes to scientific notation
   - Without this fix, codes like "10629311000119108" were stored as "1.06e+16"
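   - Exponent-form values are now expanded to plain integer strings; digits already
     dropped from a truncated value (e.g. "1.06e+16") cannot be recovered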

2. batch_lookup_indication_groups() now uses PseudoNHSNoLinked instead of PersonKey
   - PersonKey is LocalPatientID (provider-specific like "J188448")
   - PseudoNHSNoLinked is the pseudonymised NHS number that matches PatientPseudonym in GP records
   - Without this fix, 0% of patients matched GP records
   - Test shows ~20% match rate for ADALIMUMAB patients with correct identifier
---
 IMPLEMENTATION_PLAN.md                 |  2 +-
 data_processing/diagnosis_lookup.py    | 37 +++++++++++++-------------
 data_processing/load_snomed_mapping.py | 27 +++++++++++++++----
 3 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md
index 368748f..d0ad397 100644
--- a/IMPLEMENTATION_PLAN.md
+++ b/IMPLEMENTATION_PLAN.md
@@ -130,7 +130,7 @@ python -m reflex compile
 - [ ] Verify: Test refresh with --dry-run, check coverage stats
 
 ### 3.3 Test Full Refresh Pipeline
-- [ ] Run `python -m cli.refresh_pathways` with real data
+- [~] Run `python -m cli.refresh_pathways` with real data
 - [ ] Verify pathway_nodes table has both chart_type values
 - [ ] Verify indication chart has expected hierarchy (Trust → SearchTerm → Drug)
 - [ ] Verify unmatched patients appear with directorate fallback label
diff --git a/data_processing/diagnosis_lookup.py b/data_processing/diagnosis_lookup.py
index 047eee9..824f1cd 100644
--- a/data_processing/diagnosis_lookup.py
+++ b/data_processing/diagnosis_lookup.py
@@ -902,10 +902,11 @@ def batch_lookup_indication_groups(
 
     logger.info(f"Starting batch indication lookup for {len(df)} records...")
 
-    # Step 1: Get unique (UPID, Drug Name, PersonKey, Directory) combinations
-    # We need PersonKey to query Snowflake (it's the PatientPseudonym)
-    if 'PersonKey' not in df.columns:
-        logger.error("DataFrame missing 'PersonKey' column - cannot lookup GP records")
+    # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations
+    # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records
+    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records
+    if 'PseudoNHSNoLinked' not in df.columns:
+        logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records")
         # Return fallback for all patients
         result_df = df[['UPID', 'Directory']].drop_duplicates().copy()
         result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
@@ -913,7 +914,7 @@ def batch_lookup_indication_groups(
         return result_df[['UPID', 'Indication_Group', 'Source']]
 
     # Get unique patient-drug combinations (we need one lookup per patient-drug pair)
-    unique_pairs = df[['UPID', 'Drug Name', 'PersonKey', 'Directory']].drop_duplicates()
+    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()
     logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")
 
     # Step 2: Get all unique drugs and their SNOMED codes
@@ -953,11 +954,11 @@ def batch_lookup_indication_groups(
     # Step 4: Query GP records for all patients in batches
     # The query finds the most recent matching SNOMED code for each patient
 
-    # Get unique PersonKeys (each PersonKey = one patient)
-    unique_patients = unique_pairs[['PersonKey', 'UPID', 'Directory']].drop_duplicates(subset=['PersonKey'])
-    person_keys = unique_patients['PersonKey'].tolist()
+    # Get unique PseudoNHSNoLinked values (each = one patient in GP records)
+    unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])
+    patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()
 
-    logger.info(f"Querying GP records for {len(person_keys)} unique patients in batches of {batch_size}...")
+    logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...")
 
     # Results dict: PersonKey -> (snomed_code, event_date)
     gp_matches: dict[str, tuple[str, Any]] = {}
@@ -976,14 +977,14 @@ def batch_lookup_indication_groups(
     snomed_placeholders = ", ".join(["%s"] * len(snomed_list))
 
     # Process patients in batches
-    for batch_start in range(0, len(person_keys), batch_size):
-        batch_end = min(batch_start + batch_size, len(person_keys))
-        batch_person_keys = person_keys[batch_start:batch_end]
+    for batch_start in range(0, len(patient_pseudonyms), batch_size):
+        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))
+        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]
 
         logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}")
 
         # Build patient IN clause
-        patient_placeholders = ", ".join(["%s"] * len(batch_person_keys))
+        patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))
 
         # Query to find all matching SNOMED codes for these patients
         # We'll get all matches and pick the most recent per patient in Python
@@ -998,7 +999,7 @@ def batch_lookup_indication_groups(
             ORDER BY "PatientPseudonym", "EventDateTime" DESC
         '''
 
-        params = tuple(batch_person_keys) + tuple(snomed_list)
+        params = tuple(batch_pseudonyms) + tuple(snomed_list)
 
         try:
             results = connector.execute_dict(query, params)
@@ -1031,12 +1032,12 @@ def batch_lookup_indication_groups(
     for _, row in unique_pairs.iterrows():
         upid = row['UPID']
         drug_name = row['Drug Name']
-        person_key = row['PersonKey']
+        patient_pseudonym = row['PseudoNHSNoLinked']
         directory = row['Directory']
 
-        # Check if patient has GP match
-        if person_key in gp_matches:
-            matched_snomed, event_date = gp_matches[person_key]
+        # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)
+        if patient_pseudonym in gp_matches:
+            matched_snomed, event_date = gp_matches[patient_pseudonym]
 
             # Find the search_term for this SNOMED code and drug
             # (A SNOMED code might map to multiple drugs with different search_terms)
diff --git a/data_processing/load_snomed_mapping.py b/data_processing/load_snomed_mapping.py
index 9011160..c10821c 100644
--- a/data_processing/load_snomed_mapping.py
+++ b/data_processing/load_snomed_mapping.py
@@ -36,23 +36,40 @@ DEFAULT_CSV_PATH = Path("./data/drug_snomed_mapping_enriched.csv")
 
 def clean_snomed_code(snomed_code: str) -> str:
     """
-    Clean SNOMED code by removing trailing .0 suffix.
+    Clean SNOMED code by removing trailing .0 suffix and handling scientific notation.
 
-    The enriched CSV has SNOMED codes with decimal notation (e.g., "156370009.0")
-    that need to be converted to clean integer strings.
+    The enriched CSV has SNOMED codes that may be in decimal notation (e.g., "156370009.0")
+    or scientific notation (e.g., "1.0629311000119108e+16") due to pandas/Excel export.
+    These need to be converted to clean integer strings.
 
     Args:
         snomed_code: Raw SNOMED code from CSV.
 
     Returns:
-        Cleaned SNOMED code as string (e.g., "156370009").
+        Cleaned SNOMED code as string (e.g., "156370009" or "10629311000119108").
     """
     if not snomed_code:
         return ""
 
     code = snomed_code.strip()
 
-    # Remove trailing .0 if present
+    # Handle scientific notation (e.g., "1.0629311000119108e+16")
+    if 'e' in code.lower():
+        try:
+            # Parse as float, then int, to expand the exponent into plain digits.
+            # NOTE: floats are exact only up to 2**53; longer codes may be rounded.
+            value = float(code)
+            # Check if it's a whole number (no decimal part)
+            if value == int(value):
+                return str(int(value))
+            else:
+                # Has decimal part - return as cleaned float
+                return str(value).replace('.0', '')
+        except (ValueError, OverflowError):
+            # If conversion fails, return as-is but cleaned
+            return code
+
+    # Remove trailing .0 if present (for non-scientific notation)
     if code.endswith(".0"):
         code = code[:-2]