Files
HighCostDrugsDemo/data_processing/load_snomed_mapping.py
T
Andrew Charlwood 5b1569ed5c fix: correct patient identifier for GP diagnosis lookup (Task 3.3)
Two critical fixes for the indication-based pathway feature:

1. clean_snomed_code() now handles scientific notation (e.g., "1.06e+16")
   - CSV export from pandas/Excel converts large SNOMED codes to scientific notation
   - Without this fix, codes like "10629311000119108" were stored as "1.06e+16"
   - Now properly converts to full integer strings

2. batch_lookup_indication_groups() now uses PseudoNHSNoLinked instead of PersonKey
   - PersonKey is LocalPatientID (provider-specific like "J188448")
   - PseudoNHSNoLinked is the pseudonymised NHS number that matches PatientPseudonym in GP records
   - Without this fix, 0% of patients matched GP records
   - Test shows ~20% match rate for ADALIMUMAB patients with correct identifier
2026-02-05 15:49:24 +00:00

402 lines
14 KiB
Python

"""
Load enriched SNOMED mapping data into SQLite database.
This module loads the drug_snomed_mapping_enriched.csv file into the
ref_drug_snomed_mapping table for direct GP record matching.
Source file: data/drug_snomed_mapping_enriched.csv (163K rows)
Target table: ref_drug_snomed_mapping
Usage:
python -m data_processing.load_snomed_mapping
Columns mapped:
Drug -> drug_name
Indication -> indication
TA_ID -> ta_id
Search_Term -> search_term
SNOMEDCode -> snomed_code (cleaned: removes trailing .0 and expands scientific notation)
SNOMEDDescription -> snomed_description
CleanedDrugName -> cleaned_drug_name
PrimaryDirectorate -> primary_directorate
AllDirectorates -> all_directorates
"""
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import Optional

from core.logging_config import get_logger
from data_processing.database import DatabaseManager
from data_processing.reference_data import MigrationResult, _read_csv_with_fallback_encoding
# Module-level logger, obtained from the project logging helper using this module's name.
logger = get_logger(__name__)
# Default CSV location used when no explicit path is supplied; the path is
# relative, so it resolves against the current working directory.
DEFAULT_CSV_PATH = Path("./data/drug_snomed_mapping_enriched.csv")
def clean_snomed_code(snomed_code: str) -> str:
    """
    Normalise a raw SNOMED code from the CSV into a plain digit string.

    The enriched CSV may hold SNOMED codes in decimal notation
    (e.g., "156370009.0") or scientific notation
    (e.g., "1.0629311000119108e+16") as a side effect of pandas/Excel export.
    Both forms are converted to a clean integer string.

    Scientific notation is parsed with ``Decimal`` rather than ``float``:
    a float cannot represent every integer above 2**53, so 17-18 digit codes
    could silently change value when round-tripped through ``float``.

    Args:
        snomed_code: Raw SNOMED code from CSV. Falsy values (``None``, "")
            yield an empty string.

    Returns:
        Cleaned SNOMED code as string (e.g., "156370009" or
        "10629311000119108"). Text that looks like scientific notation but
        cannot be parsed, or whose magnitude is far outside the range of a
        SNOMED identifier, is returned stripped but otherwise unchanged.
    """
    if not snomed_code:
        return ""

    code = snomed_code.strip()

    # SNOMED CT identifiers are at most 18 digits long. Bounding the magnitude
    # also stops an absurd exponent (e.g. "1e999999") from being expanded into
    # an enormous integer or string.
    max_digits = 18

    # Handle scientific notation (e.g., "1.0629311000119108e+16").
    if "e" in code.lower():
        try:
            value = Decimal(code)
        except (InvalidOperation, ValueError):
            # Contains an "e" but is not a number - return as-is but stripped.
            return code

        if not -max_digits < value.adjusted() < max_digits:
            return code

        if value == value.to_integral_value():
            # Whole number: int() on a Decimal is exact, so no digits are lost.
            return str(int(value))

        # Has a fractional part - expand to plain positional notation without
        # touching any of its digits.
        return format(value, "f")

    # Remove trailing .0 if present (for non-scientific notation).
    if code.endswith(".0"):
        code = code[:-2]
    return code
def migrate_drug_snomed_mapping(
    db_manager: Optional[DatabaseManager] = None,
    csv_path: Optional[Path] = None
) -> MigrationResult:
    """
    Migrate drug SNOMED mappings from CSV to SQLite ref_drug_snomed_mapping table.

    Source file format (with header):
        Drug,Indication,TA_ID,Search_Term,SNOMEDCode,SNOMEDDescription,
        CleanedDrugName,PrimaryDirectorate,AllDirectorates

    Example rows:
        ABATACEPT,Psoriatic arthritis after DMARDs,TA568,psoriatic arthritis,
        156370009.0,Psoriatic arthritis,ABATACEPT,RHEUMATOLOGY,RHEUMATOLOGY|DERMATOLOGY

    Rows are inserted with ``INSERT OR IGNORE``; a row the database ignores is
    counted as skipped, as are rows with fewer than five columns, rows with an
    empty Drug/Indication/Search_Term/SNOMEDCode, and rows whose SNOMED code
    is empty after cleaning.

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.
        csv_path: Path to the CSV file. Defaults to data/drug_snomed_mapping_enriched.csv.

    Returns:
        MigrationResult with statistics about the migration. If the source
        file is missing or any exception is raised while loading, the result
        has success=False, an error message, and rows_inserted reported as 0.
    """
    if db_manager is None:
        db_manager = DatabaseManager()
    if csv_path is None:
        csv_path = DEFAULT_CSV_PATH

    table_name = "ref_drug_snomed_mapping"
    logger.info(f"Migrating drug SNOMED mappings from {csv_path} to {table_name}")

    if not csv_path.exists():
        error_msg = f"Source file not found: {csv_path}"
        logger.error(error_msg)
        return MigrationResult(
            table_name=table_name,
            source_file=str(csv_path),
            rows_read=0,
            rows_inserted=0,
            rows_skipped=0,
            success=False,
            error_message=error_msg
        )

    rows_read = 0
    rows_inserted = 0
    rows_skipped = 0

    try:
        with db_manager.get_transaction() as conn:
            rows = _read_csv_with_fallback_encoding(csv_path)

            for i, row in enumerate(rows):
                # Skip header row
                if i == 0 and len(row) >= 5 and row[0].strip().lower() == "drug":
                    logger.debug("Skipping header row")
                    continue

                rows_read += 1

                # Validate row format (need at least: Drug, Indication, TA_ID, Search_Term, SNOMEDCode)
                if len(row) < 5:
                    logger.warning(f"Skipping malformed row {rows_read}: {row}")
                    rows_skipped += 1
                    continue

                # The first five columns are guaranteed by the length check
                # above; only the trailing columns are optional.
                drug_name = row[0].strip()
                indication = row[1].strip()
                ta_id = row[2].strip()
                search_term = row[3].strip()
                snomed_code_raw = row[4].strip()
                snomed_description = row[5].strip() if len(row) > 5 else ""
                cleaned_drug_name = row[6].strip() if len(row) > 6 else drug_name.upper()
                primary_directorate = row[7].strip() if len(row) > 7 else ""
                all_directorates = row[8].strip() if len(row) > 8 else ""

                # Skip if required fields are empty (TA_ID is allowed to be blank)
                if not drug_name or not indication or not search_term or not snomed_code_raw:
                    logger.warning(f"Skipping row {rows_read} with empty required fields")
                    rows_skipped += 1
                    continue

                # Clean SNOMED code (trailing .0 / scientific notation)
                snomed_code = clean_snomed_code(snomed_code_raw)
                if not snomed_code:
                    logger.warning(f"Skipping row {rows_read} with invalid SNOMED code: {snomed_code_raw}")
                    rows_skipped += 1
                    continue

                cursor = conn.execute(
                    """
                    INSERT OR IGNORE INTO ref_drug_snomed_mapping
                    (drug_name, indication, ta_id, search_term, snomed_code, snomed_description,
                    cleaned_drug_name, primary_directorate, all_directorates)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """,
                    (
                        drug_name,
                        indication,
                        ta_id,
                        search_term,
                        snomed_code,
                        snomed_description,
                        cleaned_drug_name,
                        primary_directorate,
                        all_directorates,
                    )
                )

                # rowcount is 0 when OR IGNORE suppressed the insert.
                if cursor.rowcount > 0:
                    rows_inserted += 1
                else:
                    rows_skipped += 1

                # Log progress every 10000 rows
                if rows_read % 10000 == 0:
                    logger.info(f"Processed {rows_read} rows, inserted {rows_inserted}")

        logger.info(
            f"Drug SNOMED mapping migration complete: {rows_read} rows read, "
            f"{rows_inserted} inserted, {rows_skipped} skipped"
        )
        return MigrationResult(
            table_name=table_name,
            source_file=str(csv_path),
            rows_read=rows_read,
            rows_inserted=rows_inserted,
            rows_skipped=rows_skipped,
            success=True
        )
    except Exception as e:
        error_msg = f"Migration failed: {e}"
        # logger.exception records the traceback as well as the message, so
        # the failing row or statement can be diagnosed from the log.
        logger.exception(error_msg)
        # Inserted/skipped counts are reported as zero on failure
        # (presumably because the transaction context discards the partial
        # work - confirm against DatabaseManager.get_transaction).
        return MigrationResult(
            table_name=table_name,
            source_file=str(csv_path),
            rows_read=rows_read,
            rows_inserted=0,
            rows_skipped=0,
            success=False,
            error_message=error_msg
        )
def get_drug_snomed_mapping_counts(db_manager: Optional[DatabaseManager] = None) -> dict:
    """
    Get statistics about the ref_drug_snomed_mapping table.

    All counts are computed by a single aggregate query, so the table is
    scanned once and every figure reflects the same state of the table.

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.

    Returns:
        Dictionary with:
        - total_mappings: Total rows in table
        - unique_drugs: Count of distinct drug names
        - unique_search_terms: Count of distinct search terms
        - unique_snomed_codes: Count of distinct SNOMED codes
        - unique_indications: Count of distinct indications
    """
    if db_manager is None:
        db_manager = DatabaseManager()

    with db_manager.get_connection() as conn:
        # An aggregate SELECT without GROUP BY always yields exactly one row,
        # even when the table is empty (all counts are then 0).
        counts = conn.execute(
            """
            SELECT
                COUNT(*),
                COUNT(DISTINCT drug_name),
                COUNT(DISTINCT search_term),
                COUNT(DISTINCT snomed_code),
                COUNT(DISTINCT indication)
            FROM ref_drug_snomed_mapping
            """
        ).fetchone()

    return {
        "total_mappings": counts[0],
        "unique_drugs": counts[1],
        "unique_search_terms": counts[2],
        "unique_snomed_codes": counts[3],
        "unique_indications": counts[4],
    }
def verify_drug_snomed_mapping_migration(
    db_manager: Optional[DatabaseManager] = None,
    csv_path: Optional[Path] = None
) -> tuple[bool, str]:
    """
    Verify that drug SNOMED mappings were migrated correctly.

    Checks, in order (the first failing check determines the message):
    - Row count is reasonable (163K+ expected, fails below 100,000)
    - Unique search terms is reasonable (~187 expected, fails below 100)
    - Sample drug ABATACEPT is present
    - No SNOMED code still carries a ".0" suffix
    - No SNOMED code is still in scientific notation (e.g. "1.06e+16")

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.
        csv_path: Path to the CSV file. Accepted for interface compatibility;
            none of the current checks read the CSV.

    Returns:
        Tuple of (success: bool, message: str)
    """
    if db_manager is None:
        db_manager = DatabaseManager()

    stats = get_drug_snomed_mapping_counts(db_manager)

    # Basic sanity checks
    if stats["total_mappings"] < 100000:
        return False, f"Too few rows: expected 163K+, got {stats['total_mappings']}"
    if stats["unique_search_terms"] < 100:
        return False, f"Too few search terms: expected ~187, got {stats['unique_search_terms']}"

    # Sample lookup verification
    with db_manager.get_connection() as conn:
        # Check that ABATACEPT exists (from sample data)
        cursor = conn.execute(
            "SELECT COUNT(*) FROM ref_drug_snomed_mapping WHERE drug_name = 'ABATACEPT'"
        )
        abatacept_count = cursor.fetchone()[0]
        if abatacept_count == 0:
            return False, "Sample drug ABATACEPT not found in table"

        # Check that SNOMED codes were cleaned (no .0 suffix)
        cursor = conn.execute(
            "SELECT COUNT(*) FROM ref_drug_snomed_mapping WHERE snomed_code LIKE '%.0'"
        )
        dirty_codes = cursor.fetchone()[0]
        if dirty_codes > 0:
            return False, f"Found {dirty_codes} SNOMED codes with uncleaned .0 suffix"

        # Check that no code was stored in scientific notation. SQLite's LIKE
        # is case-insensitive for ASCII by default, so this also matches "E+".
        cursor = conn.execute(
            "SELECT COUNT(*) FROM ref_drug_snomed_mapping WHERE snomed_code LIKE '%e+%'"
        )
        scientific_codes = cursor.fetchone()[0]
        if scientific_codes > 0:
            return False, f"Found {scientific_codes} SNOMED codes still in scientific notation"

    return True, (
        f"Verified {stats['total_mappings']:,} mappings: "
        f"{stats['unique_drugs']} drugs, "
        f"{stats['unique_search_terms']} search terms, "
        f"{stats['unique_snomed_codes']:,} SNOMED codes"
    )
def main():
    """
    Command-line entry point for loading SNOMED mapping data.

    Parses --csv, --verify-only and -v/--verbose. With --verify-only the
    existing table is only checked; otherwise the CSV is migrated, table
    statistics are printed, and the result is verified.

    Returns:
        0 when the requested operation (and verification) succeeded, 1 otherwise.
    """
    import argparse
    import logging

    cli = argparse.ArgumentParser(
        description="Load drug SNOMED mapping data into SQLite database"
    )
    cli.add_argument(
        "--csv",
        type=Path,
        default=DEFAULT_CSV_PATH,
        help=f"Path to CSV file (default: {DEFAULT_CSV_PATH})",
    )
    cli.add_argument(
        "--verify-only",
        action="store_true",
        help="Only verify existing data, don't migrate",
    )
    cli.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )
    options = cli.parse_args()

    # Verbose mode lowers the root logging threshold to DEBUG.
    logging.basicConfig(level=logging.DEBUG if options.verbose else logging.INFO)

    # Verification-only mode never touches the CSV contents.
    if options.verify_only:
        print("Verifying existing data...")
        ok, detail = verify_drug_snomed_mapping_migration(csv_path=options.csv)
        if not ok:
            print(f"[FAILED] Verification failed: {detail}")
            return 1
        print(f"[OK] Verification passed: {detail}")
        return 0

    # Full run: migrate, report, then verify.
    print(f"Loading SNOMED mapping from {options.csv}...")
    outcome = migrate_drug_snomed_mapping(csv_path=options.csv)
    if not outcome.success:
        print(f"[FAILED] {outcome}")
        return 1

    print(f"[OK] {outcome}")

    counts = get_drug_snomed_mapping_counts()
    print("\nTable statistics:")
    print(f" Total mappings: {counts['total_mappings']:,}")
    print(f" Unique drugs: {counts['unique_drugs']}")
    print(f" Unique search terms: {counts['unique_search_terms']}")
    print(f" Unique SNOMED codes: {counts['unique_snomed_codes']:,}")
    print(f" Unique indications: {counts['unique_indications']}")

    ok, detail = verify_drug_snomed_mapping_migration(csv_path=options.csv)
    if not ok:
        # A failed post-load verification is reported as a warning but still
        # produces a non-zero exit status.
        print(f"\n[WARNING] Verification: {detail}")
        return 1

    print(f"\n[OK] Verification: {detail}")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status. SystemExit is
    # raised directly because the bare exit() helper is added by the site
    # module for interactive use and is not guaranteed to exist in every
    # runtime (e.g. under `python -S`).
    raise SystemExit(main())