""" Load enriched SNOMED mapping data into SQLite database. This module loads the drug_snomed_mapping_enriched.csv file into the ref_drug_snomed_mapping table for direct GP record matching. Source file: data/drug_snomed_mapping_enriched.csv (163K rows) Target table: ref_drug_snomed_mapping Usage: python -m data_processing.load_snomed_mapping Columns mapped: Drug -> drug_name Indication -> indication TA_ID -> ta_id Search_Term -> search_term SNOMEDCode -> snomed_code (cleaned: removes trailing .0) SNOMEDDescription -> snomed_description CleanedDrugName -> cleaned_drug_name PrimaryDirectorate -> primary_directorate AllDirectorates -> all_directorates """ from pathlib import Path from typing import Optional from core.logging_config import get_logger from data_processing.database import DatabaseManager from data_processing.reference_data import MigrationResult, _read_csv_with_fallback_encoding logger = get_logger(__name__) DEFAULT_CSV_PATH = Path("./data/drug_snomed_mapping_enriched.csv") def clean_snomed_code(snomed_code: str) -> str: """ Clean SNOMED code by removing trailing .0 suffix. The enriched CSV has SNOMED codes with decimal notation (e.g., "156370009.0") that need to be converted to clean integer strings. Args: snomed_code: Raw SNOMED code from CSV. Returns: Cleaned SNOMED code as string (e.g., "156370009"). """ if not snomed_code: return "" code = snomed_code.strip() # Remove trailing .0 if present if code.endswith(".0"): code = code[:-2] return code def migrate_drug_snomed_mapping( db_manager: Optional[DatabaseManager] = None, csv_path: Optional[Path] = None ) -> MigrationResult: """ Migrate drug SNOMED mappings from CSV to SQLite ref_drug_snomed_mapping table. Source file format (with header): Drug,Indication,TA_ID,Search_Term,SNOMEDCode,SNOMEDDescription, CleanedDrugName,PrimaryDirectorate,AllDirectorates Example rows: ABATACEPT,Psoriatic arthritis after DMARDs,TA568,psoriatic arthritis, 156370009.0,Psoriatic arthritis,ABATACEPT,RHEUMATOLOGY,RHEUMATOLOGY|DERMATOLOGY Args: db_manager: DatabaseManager instance. Uses default if not provided. csv_path: Path to the CSV file. Defaults to data/drug_snomed_mapping_enriched.csv. Returns: MigrationResult with statistics about the migration. """ if db_manager is None: db_manager = DatabaseManager() if csv_path is None: csv_path = DEFAULT_CSV_PATH table_name = "ref_drug_snomed_mapping" logger.info(f"Migrating drug SNOMED mappings from {csv_path} to {table_name}") if not csv_path.exists(): error_msg = f"Source file not found: {csv_path}" logger.error(error_msg) return MigrationResult( table_name=table_name, source_file=str(csv_path), rows_read=0, rows_inserted=0, rows_skipped=0, success=False, error_message=error_msg ) rows_read = 0 rows_inserted = 0 rows_skipped = 0 try: with db_manager.get_transaction() as conn: rows = _read_csv_with_fallback_encoding(csv_path) for i, row in enumerate(rows): # Skip header row if i == 0 and len(row) >= 5 and row[0].strip().lower() == "drug": logger.debug("Skipping header row") continue rows_read += 1 # Validate row format (need at least: Drug, Indication, TA_ID, Search_Term, SNOMEDCode) if len(row) < 5: logger.warning(f"Skipping malformed row {rows_read}: {row}") rows_skipped += 1 continue drug_name = row[0].strip() indication = row[1].strip() ta_id = row[2].strip() if len(row) > 2 else "" search_term = row[3].strip() snomed_code_raw = row[4].strip() if len(row) > 4 else "" snomed_description = row[5].strip() if len(row) > 5 else "" cleaned_drug_name = row[6].strip() if len(row) > 6 else drug_name.upper() primary_directorate = row[7].strip() if len(row) > 7 else "" all_directorates = row[8].strip() if len(row) > 8 else "" # Skip if required fields are empty if not drug_name or not indication or not search_term or not snomed_code_raw: logger.warning(f"Skipping row {rows_read} with empty required fields") rows_skipped += 1 continue # Clean SNOMED code (remove trailing .0) snomed_code = clean_snomed_code(snomed_code_raw) if not snomed_code: logger.warning(f"Skipping row {rows_read} with invalid SNOMED code: {snomed_code_raw}") rows_skipped += 1 continue cursor = conn.execute( """ INSERT OR IGNORE INTO ref_drug_snomed_mapping (drug_name, indication, ta_id, search_term, snomed_code, snomed_description, cleaned_drug_name, primary_directorate, all_directorates) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( drug_name, indication, ta_id, search_term, snomed_code, snomed_description, cleaned_drug_name, primary_directorate, all_directorates, ) ) if cursor.rowcount > 0: rows_inserted += 1 else: rows_skipped += 1 # Log progress every 10000 rows if rows_read % 10000 == 0: logger.info(f"Processed {rows_read} rows, inserted {rows_inserted}") logger.info( f"Drug SNOMED mapping migration complete: {rows_read} rows read, " f"{rows_inserted} inserted, {rows_skipped} skipped" ) return MigrationResult( table_name=table_name, source_file=str(csv_path), rows_read=rows_read, rows_inserted=rows_inserted, rows_skipped=rows_skipped, success=True ) except Exception as e: error_msg = f"Migration failed: {e}" logger.error(error_msg) return MigrationResult( table_name=table_name, source_file=str(csv_path), rows_read=rows_read, rows_inserted=0, rows_skipped=0, success=False, error_message=error_msg ) def get_drug_snomed_mapping_counts(db_manager: Optional[DatabaseManager] = None) -> dict: """ Get statistics about the ref_drug_snomed_mapping table. Args: db_manager: DatabaseManager instance. Uses default if not provided. Returns: Dictionary with: - total_mappings: Total rows in table - unique_drugs: Count of distinct drug names - unique_search_terms: Count of distinct search terms - unique_snomed_codes: Count of distinct SNOMED codes - unique_indications: Count of distinct indications """ if db_manager is None: db_manager = DatabaseManager() with db_manager.get_connection() as conn: cursor = conn.execute("SELECT COUNT(*) FROM ref_drug_snomed_mapping") total = cursor.fetchone()[0] cursor = conn.execute("SELECT COUNT(DISTINCT drug_name) FROM ref_drug_snomed_mapping") unique_drugs = cursor.fetchone()[0] cursor = conn.execute("SELECT COUNT(DISTINCT search_term) FROM ref_drug_snomed_mapping") unique_search_terms = cursor.fetchone()[0] cursor = conn.execute("SELECT COUNT(DISTINCT snomed_code) FROM ref_drug_snomed_mapping") unique_snomed_codes = cursor.fetchone()[0] cursor = conn.execute("SELECT COUNT(DISTINCT indication) FROM ref_drug_snomed_mapping") unique_indications = cursor.fetchone()[0] return { "total_mappings": total, "unique_drugs": unique_drugs, "unique_search_terms": unique_search_terms, "unique_snomed_codes": unique_snomed_codes, "unique_indications": unique_indications, } def verify_drug_snomed_mapping_migration( db_manager: Optional[DatabaseManager] = None, csv_path: Optional[Path] = None ) -> tuple[bool, str]: """ Verify that drug SNOMED mappings were migrated correctly. Checks: - Row count is reasonable (163K+ expected) - Unique search terms is reasonable (187 expected) - Sample lookups return expected values Args: db_manager: DatabaseManager instance. Uses default if not provided. csv_path: Path to the CSV file. Defaults to data/drug_snomed_mapping_enriched.csv. Returns: Tuple of (success: bool, message: str) """ if db_manager is None: db_manager = DatabaseManager() if csv_path is None: csv_path = DEFAULT_CSV_PATH stats = get_drug_snomed_mapping_counts(db_manager) # Basic sanity checks if stats["total_mappings"] < 100000: return False, f"Too few rows: expected 163K+, got {stats['total_mappings']}" if stats["unique_search_terms"] < 100: return False, f"Too few search terms: expected ~187, got {stats['unique_search_terms']}" # Sample lookup verification with db_manager.get_connection() as conn: # Check that ABATACEPT exists (from sample data) cursor = conn.execute( "SELECT COUNT(*) FROM ref_drug_snomed_mapping WHERE drug_name = 'ABATACEPT'" ) abatacept_count = cursor.fetchone()[0] if abatacept_count == 0: return False, "Sample drug ABATACEPT not found in table" # Check that SNOMED codes were cleaned (no .0 suffix) cursor = conn.execute( "SELECT COUNT(*) FROM ref_drug_snomed_mapping WHERE snomed_code LIKE '%.0'" ) dirty_codes = cursor.fetchone()[0] if dirty_codes > 0: return False, f"Found {dirty_codes} SNOMED codes with uncleaned .0 suffix" return True, ( f"Verified {stats['total_mappings']:,} mappings: " f"{stats['unique_drugs']} drugs, " f"{stats['unique_search_terms']} search terms, " f"{stats['unique_snomed_codes']:,} SNOMED codes" ) def main(): """CLI entry point for loading SNOMED mapping data.""" import argparse parser = argparse.ArgumentParser( description="Load drug SNOMED mapping data into SQLite database" ) parser.add_argument( "--csv", type=Path, default=DEFAULT_CSV_PATH, help=f"Path to CSV file (default: {DEFAULT_CSV_PATH})" ) parser.add_argument( "--verify-only", action="store_true", help="Only verify existing data, don't migrate" ) parser.add_argument( "-v", "--verbose", action="store_true", help="Enable verbose logging" ) args = parser.parse_args() # Configure logging import logging if args.verbose: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) if args.verify_only: print("Verifying existing data...") success, message = verify_drug_snomed_mapping_migration(csv_path=args.csv) if success: print(f"[OK] Verification passed: {message}") else: print(f"[FAILED] Verification failed: {message}") return 0 if success else 1 # Run migration print(f"Loading SNOMED mapping from {args.csv}...") result = migrate_drug_snomed_mapping(csv_path=args.csv) if result.success: print(f"[OK] {result}") # Show statistics stats = get_drug_snomed_mapping_counts() print(f"\nTable statistics:") print(f" Total mappings: {stats['total_mappings']:,}") print(f" Unique drugs: {stats['unique_drugs']}") print(f" Unique search terms: {stats['unique_search_terms']}") print(f" Unique SNOMED codes: {stats['unique_snomed_codes']:,}") print(f" Unique indications: {stats['unique_indications']}") # Verify success, message = verify_drug_snomed_mapping_migration(csv_path=args.csv) if success: print(f"\n[OK] Verification: {message}") else: print(f"\n[WARNING] Verification: {message}") return 1 else: print(f"[FAILED] {result}") return 1 return 0 if __name__ == "__main__": exit(main())