fix: correct patient identifier for GP diagnosis lookup (Task 3.3)
Two critical fixes for the indication-based pathway feature: 1. clean_snomed_code() now handles scientific notation (e.g., "1.06e+16") - CSV export from pandas/Excel converts large SNOMED codes to scientific notation - Without this fix, codes like "10629311000119108" were stored as "1.06e+16" - Now properly converts to full integer strings 2. batch_lookup_indication_groups() now uses PseudoNHSNoLinked instead of PersonKey - PersonKey is LocalPatientID (provider-specific like "J188448") - PseudoNHSNoLinked is the pseudonymised NHS number that matches PatientPseudonym in GP records - Without this fix, 0% of patients matched GP records - Test shows ~20% match rate for ADALIMUMAB patients with correct identifier
This commit is contained in:
@@ -130,7 +130,7 @@ python -m reflex compile
|
|||||||
- [ ] Verify: Test refresh with --dry-run, check coverage stats
|
- [ ] Verify: Test refresh with --dry-run, check coverage stats
|
||||||
|
|
||||||
### 3.3 Test Full Refresh Pipeline
|
### 3.3 Test Full Refresh Pipeline
|
||||||
- [ ] Run `python -m cli.refresh_pathways` with real data
|
- [~] Run `python -m cli.refresh_pathways` with real data
|
||||||
- [ ] Verify pathway_nodes table has both chart_type values
|
- [ ] Verify pathway_nodes table has both chart_type values
|
||||||
- [ ] Verify indication chart has expected hierarchy (Trust → SearchTerm → Drug)
|
- [ ] Verify indication chart has expected hierarchy (Trust → SearchTerm → Drug)
|
||||||
- [ ] Verify unmatched patients appear with directorate fallback label
|
- [ ] Verify unmatched patients appear with directorate fallback label
|
||||||
|
|||||||
@@ -902,10 +902,11 @@ def batch_lookup_indication_groups(
|
|||||||
|
|
||||||
logger.info(f"Starting batch indication lookup for {len(df)} records...")
|
logger.info(f"Starting batch indication lookup for {len(df)} records...")
|
||||||
|
|
||||||
# Step 1: Get unique (UPID, Drug Name, PersonKey, Directory) combinations
|
# Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations
|
||||||
# We need PersonKey to query Snowflake (it's the PatientPseudonym)
|
# We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records
|
||||||
if 'PersonKey' not in df.columns:
|
# Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records
|
||||||
logger.error("DataFrame missing 'PersonKey' column - cannot lookup GP records")
|
if 'PseudoNHSNoLinked' not in df.columns:
|
||||||
|
logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records")
|
||||||
# Return fallback for all patients
|
# Return fallback for all patients
|
||||||
result_df = df[['UPID', 'Directory']].drop_duplicates().copy()
|
result_df = df[['UPID', 'Directory']].drop_duplicates().copy()
|
||||||
result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
|
result_df['Indication_Group'] = result_df['Directory'] + " (no GP dx)"
|
||||||
@@ -913,7 +914,7 @@ def batch_lookup_indication_groups(
|
|||||||
return result_df[['UPID', 'Indication_Group', 'Source']]
|
return result_df[['UPID', 'Indication_Group', 'Source']]
|
||||||
|
|
||||||
# Get unique patient-drug combinations (we need one lookup per patient-drug pair)
|
# Get unique patient-drug combinations (we need one lookup per patient-drug pair)
|
||||||
unique_pairs = df[['UPID', 'Drug Name', 'PersonKey', 'Directory']].drop_duplicates()
|
unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()
|
||||||
logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")
|
logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")
|
||||||
|
|
||||||
# Step 2: Get all unique drugs and their SNOMED codes
|
# Step 2: Get all unique drugs and their SNOMED codes
|
||||||
@@ -953,11 +954,11 @@ def batch_lookup_indication_groups(
|
|||||||
# Step 4: Query GP records for all patients in batches
|
# Step 4: Query GP records for all patients in batches
|
||||||
# The query finds the most recent matching SNOMED code for each patient
|
# The query finds the most recent matching SNOMED code for each patient
|
||||||
|
|
||||||
# Get unique PersonKeys (each PersonKey = one patient)
|
# Get unique PseudoNHSNoLinked values (each = one patient in GP records)
|
||||||
unique_patients = unique_pairs[['PersonKey', 'UPID', 'Directory']].drop_duplicates(subset=['PersonKey'])
|
unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])
|
||||||
person_keys = unique_patients['PersonKey'].tolist()
|
patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()
|
||||||
|
|
||||||
logger.info(f"Querying GP records for {len(person_keys)} unique patients in batches of {batch_size}...")
|
logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...")
|
||||||
|
|
||||||
# Results dict: PersonKey -> (snomed_code, event_date)
|
# Results dict: PersonKey -> (snomed_code, event_date)
|
||||||
gp_matches: dict[str, tuple[str, Any]] = {}
|
gp_matches: dict[str, tuple[str, Any]] = {}
|
||||||
@@ -976,14 +977,14 @@ def batch_lookup_indication_groups(
|
|||||||
snomed_placeholders = ", ".join(["%s"] * len(snomed_list))
|
snomed_placeholders = ", ".join(["%s"] * len(snomed_list))
|
||||||
|
|
||||||
# Process patients in batches
|
# Process patients in batches
|
||||||
for batch_start in range(0, len(person_keys), batch_size):
|
for batch_start in range(0, len(patient_pseudonyms), batch_size):
|
||||||
batch_end = min(batch_start + batch_size, len(person_keys))
|
batch_end = min(batch_start + batch_size, len(patient_pseudonyms))
|
||||||
batch_person_keys = person_keys[batch_start:batch_end]
|
batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]
|
||||||
|
|
||||||
logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}")
|
logger.info(f"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}")
|
||||||
|
|
||||||
# Build patient IN clause
|
# Build patient IN clause
|
||||||
patient_placeholders = ", ".join(["%s"] * len(batch_person_keys))
|
patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))
|
||||||
|
|
||||||
# Query to find all matching SNOMED codes for these patients
|
# Query to find all matching SNOMED codes for these patients
|
||||||
# We'll get all matches and pick the most recent per patient in Python
|
# We'll get all matches and pick the most recent per patient in Python
|
||||||
@@ -998,7 +999,7 @@ def batch_lookup_indication_groups(
|
|||||||
ORDER BY "PatientPseudonym", "EventDateTime" DESC
|
ORDER BY "PatientPseudonym", "EventDateTime" DESC
|
||||||
'''
|
'''
|
||||||
|
|
||||||
params = tuple(batch_person_keys) + tuple(snomed_list)
|
params = tuple(batch_pseudonyms) + tuple(snomed_list)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
results = connector.execute_dict(query, params)
|
results = connector.execute_dict(query, params)
|
||||||
@@ -1031,12 +1032,12 @@ def batch_lookup_indication_groups(
|
|||||||
for _, row in unique_pairs.iterrows():
|
for _, row in unique_pairs.iterrows():
|
||||||
upid = row['UPID']
|
upid = row['UPID']
|
||||||
drug_name = row['Drug Name']
|
drug_name = row['Drug Name']
|
||||||
person_key = row['PersonKey']
|
patient_pseudonym = row['PseudoNHSNoLinked']
|
||||||
directory = row['Directory']
|
directory = row['Directory']
|
||||||
|
|
||||||
# Check if patient has GP match
|
# Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)
|
||||||
if person_key in gp_matches:
|
if patient_pseudonym in gp_matches:
|
||||||
matched_snomed, event_date = gp_matches[person_key]
|
matched_snomed, event_date = gp_matches[patient_pseudonym]
|
||||||
|
|
||||||
# Find the search_term for this SNOMED code and drug
|
# Find the search_term for this SNOMED code and drug
|
||||||
# (A SNOMED code might map to multiple drugs with different search_terms)
|
# (A SNOMED code might map to multiple drugs with different search_terms)
|
||||||
|
|||||||
@@ -36,23 +36,40 @@ DEFAULT_CSV_PATH = Path("./data/drug_snomed_mapping_enriched.csv")
|
|||||||
|
|
||||||
def clean_snomed_code(snomed_code: str) -> str:
|
def clean_snomed_code(snomed_code: str) -> str:
|
||||||
"""
|
"""
|
||||||
Clean SNOMED code by removing trailing .0 suffix.
|
Clean SNOMED code by removing trailing .0 suffix and handling scientific notation.
|
||||||
|
|
||||||
The enriched CSV has SNOMED codes with decimal notation (e.g., "156370009.0")
|
The enriched CSV has SNOMED codes that may be in decimal notation (e.g., "156370009.0")
|
||||||
that need to be converted to clean integer strings.
|
or scientific notation (e.g., "1.0629311000119108e+16") due to pandas/Excel export.
|
||||||
|
These need to be converted to clean integer strings.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
snomed_code: Raw SNOMED code from CSV.
|
snomed_code: Raw SNOMED code from CSV.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Cleaned SNOMED code as string (e.g., "156370009").
|
Cleaned SNOMED code as string (e.g., "156370009" or "10629311000119108").
|
||||||
"""
|
"""
|
||||||
if not snomed_code:
|
if not snomed_code:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
code = snomed_code.strip()
|
code = snomed_code.strip()
|
||||||
|
|
||||||
# Remove trailing .0 if present
|
# Handle scientific notation (e.g., "1.0629311000119108e+16")
|
||||||
|
if 'e' in code.lower():
|
||||||
|
try:
|
||||||
|
# Convert to float first, then to int, then to string
|
||||||
|
# Using int() directly on the float preserves precision for SNOMED codes
|
||||||
|
value = float(code)
|
||||||
|
# Check if it's a whole number (no decimal part)
|
||||||
|
if value == int(value):
|
||||||
|
return str(int(value))
|
||||||
|
else:
|
||||||
|
# Has decimal part - return as cleaned float
|
||||||
|
return str(value).replace('.0', '')
|
||||||
|
except (ValueError, OverflowError):
|
||||||
|
# If conversion fails, return as-is but cleaned
|
||||||
|
return code
|
||||||
|
|
||||||
|
# Remove trailing .0 if present (for non-scientific notation)
|
||||||
if code.endswith(".0"):
|
if code.endswith(".0"):
|
||||||
code = code[:-2]
|
code = code[:-2]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user