fix: correct patient identifier for GP diagnosis lookup (Task 3.3)

Two critical fixes for the indication-based pathway feature:

1. clean_snomed_code() now handles scientific notation (e.g., "1.06e+16")
   - CSV export from pandas/Excel converts large SNOMED codes to scientific notation
   - Without this fix, codes like "10629311000119108" were stored as "1.06e+16"
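   - Now converts such values back to integer strings; the full code is recovered only when
     the exported text keeps every significant digit (e.g., "1.0629311000119108e+16") -
     a truncated value such as "1.06e+16" would become "10600000000000000"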

2. batch_lookup_indication_groups() now uses PseudoNHSNoLinked instead of PersonKey
   - PersonKey is LocalPatientID (provider-specific like "J188448")
   - PseudoNHSNoLinked is the pseudonymised NHS number that matches PatientPseudonym in GP records
   - Without this fix, 0% of patients matched GP records
   - Test shows a ~20% match rate for ADALIMUMAB patients with the correct identifier
This commit is contained in:
Andrew Charlwood
2026-02-05 15:49:24 +00:00
parent b9f4041670
commit 5b1569ed5c
3 changed files with 42 additions and 24 deletions
+1 -1
View File
@@ -130,7 +130,7 @@ python -m reflex compile
- [ ] Verify: Test refresh with --dry-run, check coverage stats - [ ] Verify: Test refresh with --dry-run, check coverage stats
### 3.3 Test Full Refresh Pipeline ### 3.3 Test Full Refresh Pipeline
- [ ] Run `python -m cli.refresh_pathways` with real data - [~] Run `python -m cli.refresh_pathways` with real data
- [ ] Verify pathway_nodes table has both chart_type values - [ ] Verify pathway_nodes table has both chart_type values
- [ ] Verify indication chart has expected hierarchy (Trust → SearchTerm → Drug) - [ ] Verify indication chart has expected hierarchy (Trust → SearchTerm → Drug)
- [ ] Verify unmatched patients appear with directorate fallback label - [ ] Verify unmatched patients appear with directorate fallback label
+19 -18
View File
@@ -902,10 +902,11 @@ def batch_lookup_indication_groups(
logger.info(f"Starting batch indication lookup for {len(df)} records...") logger.info(f"Starting batch indication lookup for {len(df)} records...")
# Step 1: Get unique (UPID, Drug Name, PersonKey, Directory) combinations # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations
# We need PersonKey to query Snowflake (it's the PatientPseudonym) # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records
if 'PersonKey' not in df.columns: # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records
logger.error("DataFrame missing 'PersonKey' column - cannot lookup GP records") if 'PseudoNHSNoLinked' not in df.columns:
logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records")
# Return fallback for all patients # Return fallback for all patients
result_df = df[['UPID', 'Directory']].drop_duplicates().copy() result_df = df[['UPID', 'Directory']].drop_duplicates().copy()
result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)" result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
@@ -913,7 +914,7 @@ def batch_lookup_indication_groups(
return result_df[['UPID', 'Indication_Group', 'Source']] return result_df[['UPID', 'Indication_Group', 'Source']]
# Get unique patient-drug combinations (we need one lookup per patient-drug pair) # Get unique patient-drug combinations (we need one lookup per patient-drug pair)
unique_pairs = df[['UPID', 'Drug Name', 'PersonKey', 'Directory']].drop_duplicates() unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()
logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations") logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")
# Step 2: Get all unique drugs and their SNOMED codes # Step 2: Get all unique drugs and their SNOMED codes
@@ -953,11 +954,11 @@ def batch_lookup_indication_groups(
# Step 4: Query GP records for all patients in batches # Step 4: Query GP records for all patients in batches
# The query finds the most recent matching SNOMED code for each patient # The query finds the most recent matching SNOMED code for each patient
# Get unique PersonKeys (each PersonKey = one patient) # Get unique PseudoNHSNoLinked values (each = one patient in GP records)
unique_patients = unique_pairs[['PersonKey', 'UPID', 'Directory']].drop_duplicates(subset=['PersonKey']) unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])
person_keys = unique_patients['PersonKey'].tolist() patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()
logger.info(f"Querying GP records for {len(person_keys)} unique patients in batches of {batch_size}...") logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...")
# Results dict: PersonKey -> (snomed_code, event_date) # Results dict: PseudoNHSNoLinked -> (snomed_code, event_date)
gp_matches: dict[str, tuple[str, Any]] = {} gp_matches: dict[str, tuple[str, Any]] = {}
@@ -976,14 +977,14 @@ def batch_lookup_indication_groups(
snomed_placeholders = ", ".join(["%s"] * len(snomed_list)) snomed_placeholders = ", ".join(["%s"] * len(snomed_list))
# Process patients in batches # Process patients in batches
for batch_start in range(0, len(person_keys), batch_size): for batch_start in range(0, len(patient_pseudonyms), batch_size):
batch_end = min(batch_start + batch_size, len(person_keys)) batch_end = min(batch_start + batch_size, len(patient_pseudonyms))
batch_person_keys = person_keys[batch_start:batch_end] batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]
logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}") logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}")
# Build patient IN clause # Build patient IN clause
patient_placeholders = ", ".join(["%s"] * len(batch_person_keys)) patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))
# Query to find all matching SNOMED codes for these patients # Query to find all matching SNOMED codes for these patients
# We'll get all matches and pick the most recent per patient in Python # We'll get all matches and pick the most recent per patient in Python
@@ -998,7 +999,7 @@ def batch_lookup_indication_groups(
ORDER BY "PatientPseudonym", "EventDateTime" DESC ORDER BY "PatientPseudonym", "EventDateTime" DESC
''' '''
params = tuple(batch_person_keys) + tuple(snomed_list) params = tuple(batch_pseudonyms) + tuple(snomed_list)
try: try:
results = connector.execute_dict(query, params) results = connector.execute_dict(query, params)
@@ -1031,12 +1032,12 @@ def batch_lookup_indication_groups(
for _, row in unique_pairs.iterrows(): for _, row in unique_pairs.iterrows():
upid = row['UPID'] upid = row['UPID']
drug_name = row['Drug Name'] drug_name = row['Drug Name']
person_key = row['PersonKey'] patient_pseudonym = row['PseudoNHSNoLinked']
directory = row['Directory'] directory = row['Directory']
# Check if patient has GP match # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)
if person_key in gp_matches: if patient_pseudonym in gp_matches:
matched_snomed, event_date = gp_matches[person_key] matched_snomed, event_date = gp_matches[patient_pseudonym]
# Find the search_term for this SNOMED code and drug # Find the search_term for this SNOMED code and drug
# (A SNOMED code might map to multiple drugs with different search_terms) # (A SNOMED code might map to multiple drugs with different search_terms)
+22 -5
View File
@@ -36,23 +36,40 @@ DEFAULT_CSV_PATH = Path("./data/drug_snomed_mapping_enriched.csv")
def clean_snomed_code(snomed_code: str) -> str: def clean_snomed_code(snomed_code: str) -> str:
""" """
Clean SNOMED code by removing trailing .0 suffix. Clean SNOMED code by removing trailing .0 suffix and handling scientific notation.
The enriched CSV has SNOMED codes with decimal notation (e.g., "156370009.0") The enriched CSV has SNOMED codes that may be in decimal notation (e.g., "156370009.0")
that need to be converted to clean integer strings. or scientific notation (e.g., "1.0629311000119108e+16") due to pandas/Excel export.
These need to be converted to clean integer strings.
Args: Args:
snomed_code: Raw SNOMED code from CSV. snomed_code: Raw SNOMED code from CSV.
Returns: Returns:
Cleaned SNOMED code as string (e.g., "156370009"). Cleaned SNOMED code as string (e.g., "156370009" or "10629311000119108").
""" """
if not snomed_code: if not snomed_code:
return "" return ""
code = snomed_code.strip() code = snomed_code.strip()
# Remove trailing .0 if present # Handle scientific notation (e.g., "1.0629311000119108e+16")
if 'e' in code.lower():
try:
# Convert to float first, then to int, then to string
# NOTE: a float represents integers exactly only up to 2**53, so the original code is recovered only when the value round-trips exactly
value = float(code)
# Check if it's a whole number (no decimal part)
if value == int(value):
return str(int(value))
else:
# Has decimal part - return as cleaned float
return str(value).replace('.0', '')
except (ValueError, OverflowError):
# If conversion fails, return as-is but cleaned
return code
# Remove trailing .0 if present (for non-scientific notation)
if code.endswith(".0"): if code.endswith(".0"):
code = code[:-2] code = code[:-2]