fix: correct patient identifier for GP diagnosis lookup (Task 3.3)

Two critical fixes for the indication-based pathway feature:

1. clean_snomed_code() now handles scientific notation (e.g., "1.06e+16")
   - CSV export from pandas/Excel converts large SNOMED codes to scientific notation
   - Without this fix, codes like "10629311000119108" were stored as "1.06e+16"
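   - Now converts scientific-notation values back to plain integer strings
     (digits already truncated by the export, as in "1.06e+16", cannot be recovered)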

2. batch_lookup_indication_groups() now uses PseudoNHSNoLinked instead of PersonKey
   - PersonKey is LocalPatientID (provider-specific like "J188448")
   - PseudoNHSNoLinked is the pseudonymised NHS number that matches PatientPseudonym in GP records
   - Without this fix, 0% of patients matched GP records
   - Test shows a ~20% match rate for ADALIMUMAB patients with the correct identifier
This commit is contained in:
Andrew Charlwood
2026-02-05 15:49:24 +00:00
parent b9f4041670
commit 5b1569ed5c
3 changed files with 42 additions and 24 deletions
+1 -1
View File
@@ -130,7 +130,7 @@ python -m reflex compile
- [ ] Verify: Test refresh with --dry-run, check coverage stats
### 3.3 Test Full Refresh Pipeline
- [ ] Run `python -m cli.refresh_pathways` with real data
- [~] Run `python -m cli.refresh_pathways` with real data
- [ ] Verify pathway_nodes table has both chart_type values
- [ ] Verify indication chart has expected hierarchy (Trust → SearchTerm → Drug)
- [ ] Verify unmatched patients appear with directorate fallback label
+19 -18
View File
@@ -902,10 +902,11 @@ def batch_lookup_indication_groups(
logger.info(f"Starting batch indication lookup for {len(df)} records...")
# Step 1: Get unique (UPID, Drug Name, PersonKey, Directory) combinations
# We need PersonKey to query Snowflake (it's the PatientPseudonym)
if 'PersonKey' not in df.columns:
logger.error("DataFrame missing 'PersonKey' column - cannot lookup GP records")
# Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations
# We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records
# Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records
if 'PseudoNHSNoLinked' not in df.columns:
logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records")
# Return fallback for all patients
result_df = df[['UPID', 'Directory']].drop_duplicates().copy()
result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
@@ -913,7 +914,7 @@ def batch_lookup_indication_groups(
return result_df[['UPID', 'Indication_Group', 'Source']]
# Get unique patient-drug combinations (we need one lookup per patient-drug pair)
unique_pairs = df[['UPID', 'Drug Name', 'PersonKey', 'Directory']].drop_duplicates()
unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()
logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")
# Step 2: Get all unique drugs and their SNOMED codes
@@ -953,11 +954,11 @@ def batch_lookup_indication_groups(
# Step 4: Query GP records for all patients in batches
# The query finds the most recent matching SNOMED code for each patient
# Get unique PersonKeys (each PersonKey = one patient)
unique_patients = unique_pairs[['PersonKey', 'UPID', 'Directory']].drop_duplicates(subset=['PersonKey'])
person_keys = unique_patients['PersonKey'].tolist()
# Get unique PseudoNHSNoLinked values (each = one patient in GP records)
unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])
patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()
logger.info(f"Querying GP records for {len(person_keys)} unique patients in batches of {batch_size}...")
logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...")
# Results dict: PatientPseudonym (PseudoNHSNoLinked) -> (snomed_code, event_date)
gp_matches: dict[str, tuple[str, Any]] = {}
@@ -976,14 +977,14 @@ def batch_lookup_indication_groups(
snomed_placeholders = ", ".join(["%s"] * len(snomed_list))
# Process patients in batches
for batch_start in range(0, len(person_keys), batch_size):
batch_end = min(batch_start + batch_size, len(person_keys))
batch_person_keys = person_keys[batch_start:batch_end]
for batch_start in range(0, len(patient_pseudonyms), batch_size):
batch_end = min(batch_start + batch_size, len(patient_pseudonyms))
batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]
logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}")
# Build patient IN clause
patient_placeholders = ", ".join(["%s"] * len(batch_person_keys))
patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))
# Query to find all matching SNOMED codes for these patients
# We'll get all matches and pick the most recent per patient in Python
@@ -998,7 +999,7 @@ def batch_lookup_indication_groups(
ORDER BY "PatientPseudonym", "EventDateTime" DESC
'''
params = tuple(batch_person_keys) + tuple(snomed_list)
params = tuple(batch_pseudonyms) + tuple(snomed_list)
try:
results = connector.execute_dict(query, params)
@@ -1031,12 +1032,12 @@ def batch_lookup_indication_groups(
for _, row in unique_pairs.iterrows():
upid = row['UPID']
drug_name = row['Drug Name']
person_key = row['PersonKey']
patient_pseudonym = row['PseudoNHSNoLinked']
directory = row['Directory']
# Check if patient has GP match
if person_key in gp_matches:
matched_snomed, event_date = gp_matches[person_key]
# Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)
if patient_pseudonym in gp_matches:
matched_snomed, event_date = gp_matches[patient_pseudonym]
# Find the search_term for this SNOMED code and drug
# (A SNOMED code might map to multiple drugs with different search_terms)
+22 -5
View File
@@ -36,23 +36,40 @@ DEFAULT_CSV_PATH = Path("./data/drug_snomed_mapping_enriched.csv")
def clean_snomed_code(snomed_code: str) -> str:
"""
Clean SNOMED code by removing trailing .0 suffix.
Clean SNOMED code by removing trailing .0 suffix and handling scientific notation.
The enriched CSV has SNOMED codes with decimal notation (e.g., "156370009.0")
that need to be converted to clean integer strings.
The enriched CSV has SNOMED codes that may be in decimal notation (e.g., "156370009.0")
or scientific notation (e.g., "1.0629311000119108e+16") due to pandas/Excel export.
These need to be converted to clean integer strings.
Args:
snomed_code: Raw SNOMED code from CSV.
Returns:
Cleaned SNOMED code as string (e.g., "156370009").
Cleaned SNOMED code as string (e.g., "156370009" or "10629311000119108").
"""
if not snomed_code:
return ""
code = snomed_code.strip()
# Remove trailing .0 if present
# Handle scientific notation (e.g., "1.0629311000119108e+16")
if 'e' in code.lower():
try:
# Convert to float first, then to int, then to string
# NOTE: int() of a float is exact, but float() keeps only 53 bits of mantissa,
# so codes above 2**53 (~9.0e15) may not round-trip to their original digits
value = float(code)
# Check if it's a whole number (no decimal part)
if value == int(value):
return str(int(value))
else:
# Has decimal part - return as cleaned float
return str(value).replace('.0', '')
except (ValueError, OverflowError):
# If conversion fails, return as-is but cleaned
return code
# Remove trailing .0 if present (for non-scientific notation)
if code.endswith(".0"):
code = code[:-2]