fix: resolve Snowflake column casing and UPID mapping issues (Task 3.1)

Three issues identified and fixed during Task 3.1 testing:

1. Snowflake column name casing:
   - Unquoted columns in Snowflake are returned as UPPERCASE
   - Fixed by aliasing columns with quoted names: AS "Search_Term"
   - Now correctly populates 139 unique Search_Terms (was 0)

2. Duplicate UPID index error:
   - indication_df_for_chart could have duplicate UPIDs
   - Added drop_duplicates(subset=['UPID']) before set_index()
   - Keeps first occurrence (DIAGNOSIS over FALLBACK)

3. Missing UPIDs in indication lookup:
   - Old code: built indication_df from unique PseudoNHSNoLinked only
   - Problem: patients with multiple UPIDs (multi-provider) were missing
   - Fixed: now builds indication_df from ALL unique UPIDs in df
   - Also handles NaN values in Directory column safely

Validation results from test run:
- 36,628 patients queried
- 34,006 (92.8%) had GP diagnosis matches
- 139 unique Search_Terms found
- Top 5: drug misuse (8602), influenza (6239), diabetes (2476)

Still to verify: full pathway processing after these fixes.
This commit is contained in:
Andrew Charlwood
2026-02-05 18:30:23 +00:00
parent f7166b38c8
commit 22222fe9ca
3 changed files with 27 additions and 15 deletions
+1 -1
View File
@@ -90,7 +90,7 @@ python -m reflex compile
## Phase 3: Test Full Pipeline ## Phase 3: Test Full Pipeline
### 3.1 Test Refresh with Real Data ### 3.1 Test Refresh with Real Data
- [ ] Run `python -m cli.refresh_pathways --chart-type all` with Snowflake - [~] Run `python -m cli.refresh_pathways --chart-type all` with Snowflake
- [ ] Verify pathway_nodes table has both chart_type values: - [ ] Verify pathway_nodes table has both chart_type values:
- `SELECT chart_type, COUNT(*) FROM pathway_nodes GROUP BY chart_type` - `SELECT chart_type, COUNT(*) FROM pathway_nodes GROUP BY chart_type`
- [ ] Verify indication hierarchy: Trust → Search_Term → Drug → Pathway - [ ] Verify indication hierarchy: Trust → Search_Term → Drug → Pathway
+22 -11
View File
@@ -384,11 +384,8 @@ def refresh_pathways(
results[f"{config.id}:indication"] = [] results[f"{config.id}:indication"] = []
continue continue
# Get unique patient pseudonyms and their corresponding UPID/Directory # Get unique patient pseudonyms for GP lookup (avoid redundant queries)
patient_lookup = df[['UPID', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates( patient_pseudonyms = df['PseudoNHSNoLinked'].dropna().unique().tolist()
subset=['PseudoNHSNoLinked']
).copy()
patient_pseudonyms = patient_lookup['PseudoNHSNoLinked'].dropna().unique().tolist()
logger.info(f"Looking up GP diagnoses for {len(patient_pseudonyms)} unique patients...") logger.info(f"Looking up GP diagnoses for {len(patient_pseudonyms)} unique patients...")
@@ -402,15 +399,24 @@ def refresh_pathways(
# Step 3: Build indication_df mapping UPID -> Indication_Group # Step 3: Build indication_df mapping UPID -> Indication_Group
# For matched patients: Indication_Group = Search_Term # For matched patients: Indication_Group = Search_Term
# For unmatched patients: Indication_Group = Directory + " (no GP dx)" # For unmatched patients: Indication_Group = Directory + " (no GP dx)"
#
# IMPORTANT: We need ALL unique UPIDs, not just unique PseudoNHSNoLinked.
# A patient can have multiple UPIDs if they visited multiple providers.
# Get all unique UPID records with their PseudoNHSNoLinked and Directory
upid_lookup = df[['UPID', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates(
subset=['UPID']
).copy()
if gp_matches_df.empty: if gp_matches_df.empty:
logger.warning("No GP matches found - all patients will use fallback directory") logger.warning("No GP matches found - all patients will use fallback directory")
# All patients use fallback # All patients use fallback
indication_records = [] indication_records = []
for _, row in patient_lookup.iterrows(): for _, row in upid_lookup.iterrows():
directory = row['Directory']
indication_records.append({ indication_records.append({
'UPID': row['UPID'], 'UPID': row['UPID'],
'Indication_Group': str(row['Directory']) + " (no GP dx)", 'Indication_Group': str(directory) + " (no GP dx)" if pd.notna(directory) else "UNKNOWN (no GP dx)",
'Source': 'FALLBACK', 'Source': 'FALLBACK',
}) })
indication_df = pd.DataFrame(indication_records) indication_df = pd.DataFrame(indication_records)
@@ -421,23 +427,25 @@ def refresh_pathways(
gp_matches_df['Search_Term'] gp_matches_df['Search_Term']
)) ))
# Build indication records for each unique patient # Build indication records for each unique UPID
indication_records = [] indication_records = []
for _, row in patient_lookup.iterrows(): for _, row in upid_lookup.iterrows():
pseudo = row['PseudoNHSNoLinked'] pseudo = row['PseudoNHSNoLinked']
upid = row['UPID'] upid = row['UPID']
directory = row['Directory'] directory = row['Directory']
if pseudo in match_lookup: if pd.notna(pseudo) and pseudo in match_lookup:
indication_records.append({ indication_records.append({
'UPID': upid, 'UPID': upid,
'Indication_Group': match_lookup[pseudo], 'Indication_Group': match_lookup[pseudo],
'Source': 'DIAGNOSIS', 'Source': 'DIAGNOSIS',
}) })
else: else:
# Use fallback: Directory + " (no GP dx)"
fallback_label = str(directory) + " (no GP dx)" if pd.notna(directory) else "UNKNOWN (no GP dx)"
indication_records.append({ indication_records.append({
'UPID': upid, 'UPID': upid,
'Indication_Group': str(directory) + " (no GP dx)", 'Indication_Group': fallback_label,
'Source': 'FALLBACK', 'Source': 'FALLBACK',
}) })
@@ -465,6 +473,9 @@ def refresh_pathways(
# It expects indication_df to have 'Directory' column (mapped from Indication_Group) # It expects indication_df to have 'Directory' column (mapped from Indication_Group)
indication_df_for_chart = indication_df[['UPID', 'Indication_Group']].copy() indication_df_for_chart = indication_df[['UPID', 'Indication_Group']].copy()
indication_df_for_chart = indication_df_for_chart.rename(columns={'Indication_Group': 'Directory'}) indication_df_for_chart = indication_df_for_chart.rename(columns={'Indication_Group': 'Directory'})
# Ensure unique UPID index (build_hierarchy requires uniquely valued Index)
# Keep first occurrence - DIAGNOSIS entries should come before FALLBACK
indication_df_for_chart = indication_df_for_chart.drop_duplicates(subset=['UPID'], keep='first')
indication_df_for_chart = indication_df_for_chart.set_index('UPID') indication_df_for_chart = indication_df_for_chart.set_index('UPID')
# Process each date filter with indication grouping # Process each date filter with indication grouping
+4 -3
View File
@@ -1348,12 +1348,13 @@ def get_patient_indication_groups(
# Build the full query with cluster CTE # Build the full query with cluster CTE
# This finds the most recent matching diagnosis for each patient # This finds the most recent matching diagnosis for each patient
# Note: Column names must be aliased to ensure consistent casing in results
query = f""" query = f"""
{CLUSTER_MAPPING_SQL} {CLUSTER_MAPPING_SQL}
SELECT SELECT
pc."PatientPseudonym", pc."PatientPseudonym" AS "PatientPseudonym",
aic.Search_Term, aic.Search_Term AS "Search_Term",
pc."EventDateTime" pc."EventDateTime" AS "EventDateTime"
FROM DATA_HUB.PHM."PrimaryCareClinicalCoding" pc FROM DATA_HUB.PHM."PrimaryCareClinicalCoding" pc
INNER JOIN AllIndicationCodes aic INNER JOIN AllIndicationCodes aic
ON pc."SNOMEDCode" = aic.SNOMEDCode ON pc."SNOMEDCode" = aic.SNOMEDCode