feat: add ref_drug_snomed_mapping schema (Task 1.1)

- Add REF_DRUG_SNOMED_MAPPING_SCHEMA with 11 columns for direct SNOMED mapping
- Add 5 indexes for lookup performance (drug, cleaned_drug, snomed, search_term, composite)
- Add create_drug_snomed_mapping_table() helper function
- Update helper functions (drop, get_counts, verify_exists) to include new table
- Table is included in REFERENCE_TABLES_SCHEMA and created by migration
This commit is contained in:
Andrew Charlwood
2026-02-05 14:06:16 +00:00
parent fa72fb3098
commit 9943e85761
3 changed files with 346 additions and 1271 deletions
+77 -5
View File
@@ -115,6 +115,43 @@ CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_cluster ON ref_drug_
CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_indication ON ref_drug_indication_clusters(indication);
"""
REF_DRUG_SNOMED_MAPPING_SCHEMA = """
-- Direct SNOMED code mapping from drug to indication to GP diagnosis codes
-- Source: data/drug_snomed_mapping_enriched.csv (163K rows)
-- Used for direct GP record matching to assign diagnosis-based directorates
-- and to support indication-based pathway hierarchy (Trust → Search_Term → Drug → Pathway)
CREATE TABLE IF NOT EXISTS ref_drug_snomed_mapping (
id INTEGER PRIMARY KEY AUTOINCREMENT,
drug_name TEXT NOT NULL, -- Original drug name from mapping
indication TEXT NOT NULL, -- Specific indication (603 unique values)
ta_id TEXT, -- NICE TA reference (e.g., TA568)
search_term TEXT NOT NULL, -- Simplified grouping (187 unique values)
snomed_code TEXT NOT NULL, -- SNOMED CT code for GP record matching
snomed_description TEXT, -- SNOMED code description
cleaned_drug_name TEXT NOT NULL, -- Standardized drug name for matching
primary_directorate TEXT, -- Primary directorate for this indication
all_directorates TEXT, -- Pipe-separated list of valid directorates
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(drug_name, indication, snomed_code)
);
-- Index for looking up SNOMED codes by drug name (most common access pattern)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug ON ref_drug_snomed_mapping(drug_name);
-- Index for looking up by cleaned drug name (standardized matching)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_cleaned ON ref_drug_snomed_mapping(cleaned_drug_name);
-- Index for looking up by SNOMED code (reverse lookup from GP record)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_snomed ON ref_drug_snomed_mapping(snomed_code);
-- Index for grouping by search_term (indication-based hierarchy)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_search_term ON ref_drug_snomed_mapping(search_term);
-- Composite index for drug + snomed code (common lookup pattern)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug_snomed
ON ref_drug_snomed_mapping(cleaned_drug_name, snomed_code);
"""
# =============================================================================
# Pathway Data Architecture Schemas
@@ -477,6 +514,8 @@ REFERENCE_TABLES_SCHEMA = f"""
{REF_DRUG_DIRECTORY_MAP_SCHEMA}
{REF_DRUG_INDICATION_CLUSTERS_SCHEMA}
{REF_DRUG_SNOMED_MAPPING_SCHEMA}
"""
FACT_TABLES_SCHEMA = f"""
@@ -535,10 +574,26 @@ def drop_reference_tables(conn: sqlite3.Connection) -> None:
DROP TABLE IF EXISTS ref_directories;
DROP TABLE IF EXISTS ref_drug_directory_map;
DROP TABLE IF EXISTS ref_drug_indication_clusters;
DROP TABLE IF EXISTS ref_drug_snomed_mapping;
""")
logger.info("Reference tables dropped")
def create_drug_snomed_mapping_table(conn: sqlite3.Connection) -> None:
"""
Create the ref_drug_snomed_mapping table for direct SNOMED code mapping.
This table stores mappings from drugs to SNOMED codes for GP record matching,
enabling diagnosis-based directorate assignment and indication-based pathways.
Args:
conn: SQLite database connection.
"""
logger.info("Creating ref_drug_snomed_mapping table...")
conn.executescript(REF_DRUG_SNOMED_MAPPING_SCHEMA)
logger.info("ref_drug_snomed_mapping table created successfully")
def get_reference_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
"""
Get row counts for all reference tables.
@@ -549,13 +604,23 @@ def get_reference_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
Returns:
Dictionary mapping table name to row count.
"""
tables = ["ref_drug_names", "ref_organizations", "ref_directories", "ref_drug_directory_map", "ref_drug_indication_clusters"]
tables = [
"ref_drug_names",
"ref_organizations",
"ref_directories",
"ref_drug_directory_map",
"ref_drug_indication_clusters",
"ref_drug_snomed_mapping",
]
counts = {}
for table in tables:
cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
result = cursor.fetchone()
counts[table] = result[0] if result else 0
try:
cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
result = cursor.fetchone()
counts[table] = result[0] if result else 0
except sqlite3.OperationalError:
counts[table] = 0
return counts
@@ -570,7 +635,14 @@ def verify_reference_tables_exist(conn: sqlite3.Connection) -> list[str]:
Returns:
List of missing table names. Empty list means all tables exist.
"""
required_tables = ["ref_drug_names", "ref_organizations", "ref_directories", "ref_drug_directory_map", "ref_drug_indication_clusters"]
required_tables = [
"ref_drug_names",
"ref_organizations",
"ref_directories",
"ref_drug_directory_map",
"ref_drug_indication_clusters",
"ref_drug_snomed_mapping",
]
missing = []
for table in required_tables: