refactor: slim pathways.db from 351 MB to 3.5 MB by removing unused tables
Drop fact_interventions (440K rows), mv_patient_treatment_summary (35K rows), ref_drug_snomed_mapping (144K rows), and processed_files — all unused since the app moved to pre-computed pathway_nodes.

Key changes:
- Rewrite load_data() to source from pathway_nodes + pathway_refresh_log
- Remove 7 dead methods and 8 dead state vars from pathways_app.py
- Delete patient_data.py, load_snomed_mapping.py, test_large_dataset_performance.py
- Remove SQLiteDataLoader (depended on fact_interventions)
- Remove file tracking schema (processed_files tracked fact_interventions loads)
- Remove legacy diagnosis functions from diagnosis_lookup.py
- Add source_row_count migration for pathway_refresh_log
- Clean all cross-references in __init__.py, data_source.py, migrate.py
This commit is contained in:
@@ -24,15 +24,6 @@ from data_processing.schema import (
|
||||
REF_DRUG_DIRECTORY_MAP_SCHEMA,
|
||||
REF_DRUG_INDICATION_CLUSTERS_SCHEMA,
|
||||
REFERENCE_TABLES_SCHEMA,
|
||||
# Fact table schemas
|
||||
FACT_INTERVENTIONS_SCHEMA,
|
||||
FACT_TABLES_SCHEMA,
|
||||
# Materialized view schemas
|
||||
MV_PATIENT_TREATMENT_SUMMARY_SCHEMA,
|
||||
MATERIALIZED_VIEWS_SCHEMA,
|
||||
# File tracking schemas
|
||||
PROCESSED_FILES_SCHEMA,
|
||||
FILE_TRACKING_SCHEMA,
|
||||
# Combined schema
|
||||
ALL_TABLES_SCHEMA,
|
||||
# Reference table functions
|
||||
@@ -40,16 +31,6 @@ from data_processing.schema import (
|
||||
drop_reference_tables,
|
||||
get_reference_table_counts,
|
||||
verify_reference_tables_exist,
|
||||
# Fact table functions
|
||||
create_fact_tables,
|
||||
drop_fact_tables,
|
||||
get_fact_table_counts,
|
||||
verify_fact_tables_exist,
|
||||
# File tracking functions
|
||||
create_file_tracking_tables,
|
||||
drop_file_tracking_tables,
|
||||
get_file_tracking_counts,
|
||||
verify_file_tracking_tables_exist,
|
||||
# Combined functions
|
||||
create_all_tables,
|
||||
drop_all_tables,
|
||||
@@ -81,27 +62,12 @@ from data_processing.reference_data import (
|
||||
from data_processing.loader import (
|
||||
DataLoader,
|
||||
FileDataLoader,
|
||||
SQLiteDataLoader,
|
||||
LoadResult,
|
||||
get_loader,
|
||||
REQUIRED_COLUMNS,
|
||||
OPTIONAL_COLUMNS,
|
||||
)
|
||||
|
||||
# Patient data migration functions
|
||||
from data_processing.patient_data import (
|
||||
PatientDataLoadResult,
|
||||
load_patient_data,
|
||||
get_patient_data_stats,
|
||||
list_processed_files,
|
||||
calculate_file_hash,
|
||||
# Materialized view functions
|
||||
MVRefreshResult,
|
||||
refresh_patient_treatment_summary,
|
||||
get_patient_summary_stats,
|
||||
verify_mv_consistency,
|
||||
)
|
||||
|
||||
# Snowflake connector
|
||||
from data_processing.snowflake_connector import (
|
||||
SnowflakeConnector,
|
||||
@@ -165,15 +131,6 @@ __all__ = [
|
||||
"REF_DRUG_DIRECTORY_MAP_SCHEMA",
|
||||
"REF_DRUG_INDICATION_CLUSTERS_SCHEMA",
|
||||
"REFERENCE_TABLES_SCHEMA",
|
||||
# Fact table schemas
|
||||
"FACT_INTERVENTIONS_SCHEMA",
|
||||
"FACT_TABLES_SCHEMA",
|
||||
# Materialized view schemas
|
||||
"MV_PATIENT_TREATMENT_SUMMARY_SCHEMA",
|
||||
"MATERIALIZED_VIEWS_SCHEMA",
|
||||
# File tracking schemas
|
||||
"PROCESSED_FILES_SCHEMA",
|
||||
"FILE_TRACKING_SCHEMA",
|
||||
# Combined schema
|
||||
"ALL_TABLES_SCHEMA",
|
||||
# Reference table functions
|
||||
@@ -181,16 +138,6 @@ __all__ = [
|
||||
"drop_reference_tables",
|
||||
"get_reference_table_counts",
|
||||
"verify_reference_tables_exist",
|
||||
# Fact table functions
|
||||
"create_fact_tables",
|
||||
"drop_fact_tables",
|
||||
"get_fact_table_counts",
|
||||
"verify_fact_tables_exist",
|
||||
# File tracking functions
|
||||
"create_file_tracking_tables",
|
||||
"drop_file_tracking_tables",
|
||||
"get_file_tracking_counts",
|
||||
"verify_file_tracking_tables_exist",
|
||||
# Combined functions
|
||||
"create_all_tables",
|
||||
"drop_all_tables",
|
||||
@@ -216,22 +163,10 @@ __all__ = [
|
||||
# Data loader abstractions
|
||||
"DataLoader",
|
||||
"FileDataLoader",
|
||||
"SQLiteDataLoader",
|
||||
"LoadResult",
|
||||
"get_loader",
|
||||
"REQUIRED_COLUMNS",
|
||||
"OPTIONAL_COLUMNS",
|
||||
# Patient data migration
|
||||
"PatientDataLoadResult",
|
||||
"load_patient_data",
|
||||
"get_patient_data_stats",
|
||||
"list_processed_files",
|
||||
"calculate_file_hash",
|
||||
# Materialized view functions
|
||||
"MVRefreshResult",
|
||||
"refresh_patient_treatment_summary",
|
||||
"get_patient_summary_stats",
|
||||
"verify_mv_consistency",
|
||||
# Snowflake connector
|
||||
"SnowflakeConnector",
|
||||
"SnowflakeConnectionError",
|
||||
|
||||
@@ -232,9 +232,9 @@ class DataSourceManager:
|
||||
)
|
||||
|
||||
def _check_sqlite_status(self) -> SourceStatus:
|
||||
"""Check if SQLite database is available with data."""
|
||||
"""Check if SQLite database is available with pathway data."""
|
||||
try:
|
||||
from data_processing.database import default_db_manager, default_db_config
|
||||
from data_processing.database import default_db_config
|
||||
|
||||
db_path = self._sqlite_db_path or Path(default_db_config.db_path)
|
||||
|
||||
@@ -252,22 +252,22 @@ class DataSourceManager:
|
||||
config = DatabaseConfig(db_path=db_path)
|
||||
manager = DatabaseManager(config)
|
||||
|
||||
if not manager.table_exists("fact_interventions"):
|
||||
if not manager.table_exists("pathway_nodes"):
|
||||
return SourceStatus(
|
||||
source_type=DataSourceType.SQLITE,
|
||||
available=False,
|
||||
configured=True,
|
||||
message="fact_interventions table not found",
|
||||
message="pathway_nodes table not found",
|
||||
last_checked=datetime.now(),
|
||||
)
|
||||
|
||||
count = manager.get_table_count("fact_interventions")
|
||||
count = manager.get_table_count("pathway_nodes")
|
||||
if count == 0:
|
||||
return SourceStatus(
|
||||
source_type=DataSourceType.SQLITE,
|
||||
available=False,
|
||||
configured=True,
|
||||
message="fact_interventions table is empty",
|
||||
message="pathway_nodes table is empty",
|
||||
last_checked=datetime.now(),
|
||||
)
|
||||
|
||||
@@ -275,7 +275,7 @@ class DataSourceManager:
|
||||
source_type=DataSourceType.SQLITE,
|
||||
available=True,
|
||||
configured=True,
|
||||
message=f"SQLite database ready ({count:,} rows)",
|
||||
message=f"SQLite database ready ({count:,} pathway nodes)",
|
||||
last_checked=datetime.now(),
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -535,50 +535,14 @@ class DataSourceManager:
|
||||
drugs: Optional[list[str]],
|
||||
directories: Optional[list[str]],
|
||||
) -> Optional[DataSourceResult]:
|
||||
"""Try to get data from SQLite."""
|
||||
import time
|
||||
"""Try to get data from SQLite.
|
||||
|
||||
try:
|
||||
from data_processing.loader import SQLiteDataLoader
|
||||
|
||||
# Determine database path
|
||||
db_path = self._sqlite_db_path
|
||||
if db_path is None:
|
||||
from data_processing.database import default_db_config
|
||||
db_path = Path(default_db_config.db_path)
|
||||
|
||||
loader = SQLiteDataLoader(
|
||||
db_path=db_path,
|
||||
date_range=(start_date, end_date) if start_date and end_date else None,
|
||||
trusts=trusts,
|
||||
drugs=drugs,
|
||||
directories=directories,
|
||||
)
|
||||
|
||||
# Check if source is valid
|
||||
is_valid, msg = loader.validate_source()
|
||||
if not is_valid:
|
||||
logger.debug(f"SQLite not available: {msg}")
|
||||
return None
|
||||
|
||||
start_time = time.time()
|
||||
result = loader.load()
|
||||
load_time = time.time() - start_time
|
||||
|
||||
logger.info(f"SQLite loaded {result.row_count} rows in {load_time:.2f}s")
|
||||
|
||||
return DataSourceResult(
|
||||
df=result.df,
|
||||
source_type=DataSourceType.SQLITE,
|
||||
source_detail=str(db_path),
|
||||
row_count=result.row_count,
|
||||
cached=False,
|
||||
from_fallback=False,
|
||||
load_time_seconds=load_time,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"SQLite query failed: {e}")
|
||||
return None
|
||||
Note: Raw intervention data is no longer stored in SQLite.
|
||||
The app now uses pre-computed pathway_nodes via load_pathway_data().
|
||||
This fallback is retained for interface compatibility but always returns None.
|
||||
"""
|
||||
logger.debug("SQLite raw data fallback skipped (fact_interventions removed)")
|
||||
return None
|
||||
|
||||
def _try_file(
|
||||
self,
|
||||
|
||||
@@ -78,42 +78,6 @@ class DrugIndicationMatchRate:
|
||||
sample_unmatched: list[str] = field(default_factory=list) # Sample patient IDs
|
||||
|
||||
|
||||
@dataclass
class DrugSnomedMapping:
    """SNOMED code mapping for a drug from ref_drug_snomed_mapping.

    Instances are built by get_drug_snomed_codes(), one per distinct row
    selected from the table. Text columns that are NULL in the database are
    stored as empty strings by that function.
    """

    snomed_code: str
    snomed_description: str
    search_term: str
    primary_directorate: str
    indication: str = ""
    ta_id: str = ""
|
||||
|
||||
|
||||
@dataclass
class DirectSnomedMatchResult:
    """Result of direct SNOMED code lookup in GP records.

    When ``matched`` is False the optional fields keep their ``None``
    defaults; patient_has_indication_direct() sets ``source`` to "NONE" in
    that case and to "DIRECT_SNOMED" on a successful match.
    """

    patient_pseudonym: str
    matched: bool
    snomed_code: Optional[str] = None
    snomed_description: Optional[str] = None
    search_term: Optional[str] = None
    primary_directorate: Optional[str] = None
    event_date: Optional[datetime] = None
    source: str = "DIRECT_SNOMED"  # DIRECT_SNOMED | NONE
|
||||
|
||||
|
||||
@dataclass
class DirectorateAssignment:
    """Result of directorate assignment for a patient-drug combination.

    ``source`` records how the directorate was obtained: "DIAGNOSIS" when it
    came from a GP-record match, "FALLBACK" when no match was found and the
    caller is expected to assign the directorate some other way (in which
    case ``directorate`` is ``None``).
    """

    upid: str
    drug_name: str
    directorate: Optional[str]
    search_term: Optional[str] = None
    source: str = "FALLBACK"  # DIAGNOSIS | FALLBACK
    snomed_code: Optional[str] = None
    event_date: Optional[datetime] = None
|
||||
|
||||
|
||||
def get_drug_clusters(
|
||||
drug_name: str,
|
||||
db_manager: Optional[DatabaseManager] = None
|
||||
@@ -180,266 +144,6 @@ def get_drug_cluster_ids(
|
||||
return list(set(c["cluster_id"] for c in clusters))
|
||||
|
||||
|
||||
def get_drug_snomed_codes(
    drug_name: str,
    db_manager: Optional[DatabaseManager] = None
) -> list[DrugSnomedMapping]:
    """
    Get all SNOMED codes for a drug from the local ref_drug_snomed_mapping table.

    The drug is matched case-insensitively against either the
    ``cleaned_drug_name`` or the ``drug_name`` column.

    Args:
        drug_name: Drug name to look up (case-insensitive).
        db_manager: Optional DatabaseManager (defaults to default_db_manager).

    Returns:
        List of DrugSnomedMapping ordered by search_term then snomed_code.
        NULL text columns are converted to empty strings. An empty list is
        returned when nothing matches or when the query fails (the error is
        logged, not raised).
    """
    if db_manager is None:
        db_manager = default_db_manager

    query = """
        SELECT DISTINCT
            snomed_code,
            snomed_description,
            search_term,
            primary_directorate,
            indication,
            ta_id
        FROM ref_drug_snomed_mapping
        WHERE UPPER(cleaned_drug_name) = UPPER(?)
           OR UPPER(drug_name) = UPPER(?)
        ORDER BY search_term, snomed_code
    """

    try:
        with db_manager.get_connection() as conn:
            # The same name is bound twice: once per column in the WHERE clause.
            rows = conn.execute(query, (drug_name, drug_name)).fetchall()

            results = [
                DrugSnomedMapping(
                    snomed_code=row["snomed_code"],
                    snomed_description=row["snomed_description"] or "",
                    search_term=row["search_term"] or "",
                    primary_directorate=row["primary_directorate"] or "",
                    indication=row["indication"] or "",
                    ta_id=row["ta_id"] or "",
                )
                for row in rows
            ]

        logger.debug(f"Found {len(results)} SNOMED mappings for drug '{drug_name}'")
        return results

    except Exception as e:
        # Best-effort lookup: callers treat "no mappings" as "use fallback".
        logger.error(f"Error getting SNOMED codes for drug '{drug_name}': {e}")
        return []
|
||||
|
||||
|
||||
def patient_has_indication_direct(
    patient_pseudonym: str,
    drug_snomed_mappings: list[DrugSnomedMapping],
    connector: Optional[SnowflakeConnector] = None,
    before_date: Optional[date] = None,
) -> DirectSnomedMatchResult:
    """
    Check if patient has any of the SNOMED codes in their GP records.

    This is the direct SNOMED lookup: it queries PrimaryCareClinicalCoding
    for exact SNOMED code matches (not via cluster). The query is ordered by
    EventDateTime descending and limited to one row, so the most recent
    matching event is the one reported.

    Args:
        patient_pseudonym: Patient identifier compared against the
            "PatientPseudonym" column.
        drug_snomed_mappings: List of DrugSnomedMapping from get_drug_snomed_codes().
        connector: Optional SnowflakeConnector (defaults to get_connector()).
        before_date: Optional date - only events strictly before this date
            are considered.

    Returns:
        DirectSnomedMatchResult. ``matched`` is False with source "NONE" when
        there are no mappings, Snowflake is unavailable or unconfigured, no
        row matches, or the query raises (the error is logged).
    """
    no_match = DirectSnomedMatchResult(
        patient_pseudonym=patient_pseudonym,
        matched=False,
        source="NONE",
    )

    if not drug_snomed_mappings:
        return no_match

    if not SNOWFLAKE_AVAILABLE:
        logger.warning("Snowflake connector not available")
        return no_match

    if not is_snowflake_configured():
        logger.warning("Snowflake not configured - cannot check GP records")
        return no_match

    if connector is None:
        connector = get_connector()

    # snomed_code -> (search_term, primary_directorate, snomed_description).
    # If a code appears in several mappings, the last one wins.
    snomed_lookup = {
        m.snomed_code: (m.search_term, m.primary_directorate, m.snomed_description)
        for m in drug_snomed_mappings
    }
    snomed_codes = list(snomed_lookup)

    # Only the placeholder list is interpolated; all values are bound.
    placeholders = ", ".join(["%s"] * len(snomed_codes))

    query = f'''
        SELECT
            "SNOMEDCode",
            "EventDateTime"
        FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
        WHERE "PatientPseudonym" = %s
          AND "SNOMEDCode" IN ({placeholders})
    '''

    params: list = [patient_pseudonym, *snomed_codes]

    if before_date:
        query += ' AND "EventDateTime" < %s'
        params.append(before_date.isoformat())

    query += ' ORDER BY "EventDateTime" DESC LIMIT 1'

    try:
        results = connector.execute_dict(query, tuple(params))

        if results:
            row = results[0]
            matched_code = row.get("SNOMEDCode")
            event_dt = row.get("EventDateTime")

            if matched_code and matched_code in snomed_lookup:
                search_term, primary_dir, snomed_desc = snomed_lookup[matched_code]

                return DirectSnomedMatchResult(
                    patient_pseudonym=patient_pseudonym,
                    matched=True,
                    snomed_code=matched_code,
                    snomed_description=snomed_desc,
                    search_term=search_term,
                    primary_directorate=primary_dir,
                    event_date=event_dt,
                    source="DIRECT_SNOMED",
                )

        return no_match

    except Exception as e:
        logger.error(f"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}")
        return no_match
|
||||
|
||||
|
||||
def get_directorate_from_diagnosis(
    upid: str,
    drug_name: str,
    connector: Optional[SnowflakeConnector] = None,
    db_manager: Optional[DatabaseManager] = None,
    before_date: Optional[date] = None,
    patient_pseudonym: Optional[str] = None,
) -> DirectorateAssignment:
    """
    Get directorate assignment for a patient-drug combination using diagnosis-based lookup.

    Attempts to assign a directorate from the patient's GP records (direct
    SNOMED code matching). If no match is found, a FALLBACK result is
    returned and the caller is expected to use an alternative assignment
    method.

    Workflow:
        1. Get all SNOMED codes for the drug from ref_drug_snomed_mapping
        2. Query patient's GP records for matching SNOMED codes
        3. If match found -> return diagnosis-based directorate and search_term
        4. If no match -> return FALLBACK result (caller handles fallback logic)

    Args:
        upid: Patient's unique patient ID (Provider Code[:3] + PersonKey).
        drug_name: Drug name to look up.
        connector: Optional SnowflakeConnector (defaults to singleton).
        db_manager: Optional DatabaseManager (defaults to default_db_manager).
        before_date: Optional date - only check diagnoses before this date.
        patient_pseudonym: Optional identifier to match against GP records.
            When omitted, it is derived from ``upid`` by dropping the first
            three characters (see the review note in the body).

    Returns:
        DirectorateAssignment with directorate, search_term, and source
        ("DIAGNOSIS" on a match that has a primary directorate, otherwise
        "FALLBACK" with directorate None).
    """
    fallback = DirectorateAssignment(
        upid=upid,
        drug_name=drug_name,
        directorate=None,
        source="FALLBACK",
    )

    # Step 1: Get SNOMED codes for the drug
    drug_snomed_mappings = get_drug_snomed_codes(drug_name, db_manager)

    if not drug_snomed_mappings:
        logger.debug(f"No SNOMED mappings found for drug '{drug_name}' - using fallback")
        return fallback

    # Step 2: Check Snowflake availability
    if not SNOWFLAKE_AVAILABLE:
        logger.debug("Snowflake not available - using fallback")
        return fallback

    if not is_snowflake_configured():
        logger.debug("Snowflake not configured - using fallback")
        return fallback

    # Step 3: Resolve the identifier used to query GP records.
    # UPID is Provider Code (3 chars) + PersonKey, so stripping the prefix
    # yields PersonKey.
    # NOTE(review): batch_lookup_indication_groups() states that PersonKey is
    # a provider-specific local ID that does NOT match PatientPseudonym in GP
    # records (PseudoNHSNoLinked does). If that is right, the derived value
    # can never match - callers should pass patient_pseudonym explicitly.
    if patient_pseudonym is None:
        patient_pseudonym = upid[3:] if len(upid) > 3 else upid

    # Step 4: Check patient's GP records for matching SNOMED codes
    match_result = patient_has_indication_direct(
        patient_pseudonym=patient_pseudonym,
        drug_snomed_mappings=drug_snomed_mappings,
        connector=connector,
        before_date=before_date,
    )

    if match_result.matched and match_result.primary_directorate:
        return DirectorateAssignment(
            upid=upid,
            drug_name=drug_name,
            directorate=match_result.primary_directorate,
            search_term=match_result.search_term,
            source="DIAGNOSIS",
            snomed_code=match_result.snomed_code,
            event_date=match_result.event_date,
        )

    # No match found (or match without a directorate) - return fallback result
    return fallback
|
||||
|
||||
|
||||
def get_cluster_snomed_codes(
|
||||
cluster_id: str,
|
||||
connector: Optional[SnowflakeConnector] = None,
|
||||
@@ -864,229 +568,6 @@ def get_available_clusters(
|
||||
return []
|
||||
|
||||
|
||||
def _fallback_indication_groups(frame: "pd.DataFrame") -> "pd.DataFrame":
    """Build one FALLBACK row per UPID labelled "<Directory> (no GP dx)"."""
    fallback = frame[['UPID', 'Directory']].drop_duplicates().copy()
    fallback['Indication_Group'] = fallback['Directory'] + " (no GP dx)"
    fallback['Source'] = "FALLBACK"
    # A UPID seen with several directories keeps its first one, so every
    # return path of the batch lookup yields at most one row per UPID.
    return fallback[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])


def batch_lookup_indication_groups(
    df: "pd.DataFrame",
    connector: Optional[SnowflakeConnector] = None,
    db_manager: Optional[DatabaseManager] = None,
    batch_size: int = 500,
) -> "pd.DataFrame":
    """
    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.

    This is the batch version of get_directorate_from_diagnosis(): instead of
    querying Snowflake once per patient, patients are queried in groups of
    ``batch_size``.

    Strategy:
        1. Get all unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) rows
        2. For each unique drug, get all SNOMED codes from local SQLite
        3. Query GP records in batches for any of those codes
        4. Return a frame mapping UPID -> Indication_Group

    For each patient the most recent matching GP event (by EventDateTime) is
    used. When a UPID appears with several drugs, the first row encountered
    for that UPID decides its Indication_Group.

    Args:
        df: DataFrame with columns UPID, Drug Name, Directory and
            PseudoNHSNoLinked. If PseudoNHSNoLinked is missing, every
            patient gets a fallback row.
        connector: Optional SnowflakeConnector (defaults to get_connector()).
        db_manager: Optional DatabaseManager (defaults to default_db_manager).
        batch_size: Number of patients per Snowflake query batch (>= 1).

    Returns:
        DataFrame with columns: UPID, Indication_Group, Source
        - Indication_Group: Search_Term (if matched) or "Directory (no GP dx)" (if not)
        - Source: "DIAGNOSIS" or "FALLBACK"

    Raises:
        ValueError: If batch_size is less than 1 when batching is reached.
    """
    import pandas as pd

    if db_manager is None:
        db_manager = default_db_manager

    logger.info(f"Starting batch indication lookup for {len(df)} records...")

    # Step 1: Unique patient-drug rows.
    # PseudoNHSNoLinked is what matches PatientPseudonym in GP records.
    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records
    if 'PseudoNHSNoLinked' not in df.columns:
        logger.error("DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records")
        return _fallback_indication_groups(df)

    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()
    logger.info(f"Found {len(unique_pairs)} unique patient-drug combinations")

    # Step 2: SNOMED codes for every drug in the frame.
    unique_drugs = unique_pairs['Drug Name'].unique()
    logger.info(f"Building SNOMED lookup for {len(unique_drugs)} unique drugs...")

    # snomed -> [(drug, search_term, primary_dir), ...]
    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}
    for drug_name in unique_drugs:
        for m in get_drug_snomed_codes(drug_name, db_manager):
            snomed_to_drug_searchterm.setdefault(m.snomed_code, []).append(
                (drug_name, m.search_term, m.primary_directorate)
            )

    snomed_list = list(snomed_to_drug_searchterm)
    logger.info(f"Total SNOMED codes to check: {len(snomed_list)}")

    # Step 3: Check Snowflake availability
    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():
        logger.warning("Snowflake not available - returning fallback for all patients")
        return _fallback_indication_groups(unique_pairs)

    if not snomed_list:
        logger.warning("No SNOMED codes to check - returning fallback for all patients")
        return _fallback_indication_groups(unique_pairs)

    if connector is None:
        connector = get_connector()

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Step 4: Query GP records for all patients in batches.
    # Null pseudonyms can never match an IN (...) list, so they are not sent.
    patient_pseudonyms = unique_pairs['PseudoNHSNoLinked'].dropna().unique().tolist()

    logger.info(f"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...")

    # PatientPseudonym -> (snomed_code, event_date) of the most recent match
    gp_matches: dict[str, tuple[str, Any]] = {}

    # SNOMED IN clause is identical for every batch
    snomed_placeholders = ", ".join(["%s"] * len(snomed_list))
    snomed_params = tuple(snomed_list)

    batch_starts = range(0, len(patient_pseudonyms), batch_size)
    for batch_number, batch_start in enumerate(batch_starts, start=1):
        batch_pseudonyms = patient_pseudonyms[batch_start:batch_start + batch_size]
        batch_end = batch_start + len(batch_pseudonyms)

        logger.info(f"Batch {batch_number}: patients {batch_start} to {batch_end}")

        patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))

        # Fetch every match; the most recent per patient is picked below.
        query = f'''
            SELECT
                "PatientPseudonym",
                "SNOMEDCode",
                "EventDateTime"
            FROM DATA_HUB.PHM."PrimaryCareClinicalCoding"
            WHERE "PatientPseudonym" IN ({patient_placeholders})
              AND "SNOMEDCode" IN ({snomed_placeholders})
            ORDER BY "PatientPseudonym", "EventDateTime" DESC
        '''

        params = tuple(batch_pseudonyms) + snomed_params

        try:
            results = connector.execute_dict(query, params)

            for row in results:
                person_key = row.get("PatientPseudonym")
                snomed_code = row.get("SNOMEDCode")

                # First row per patient is the most recent (ORDER BY ... DESC)
                if person_key and snomed_code and person_key not in gp_matches:
                    gp_matches[person_key] = (snomed_code, row.get("EventDateTime"))

        except Exception as e:
            # A failed batch leaves its patients unmatched (they fall back);
            # the remaining batches are still attempted.
            logger.error(f"Error querying GP records for batch: {e}")

    logger.info(f"Found GP matches for {len(gp_matches)} patients")

    # Step 5: One result per UPID; the first row seen for a UPID wins.
    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)

    for _, row in unique_pairs.iterrows():
        upid = row['UPID']
        if upid in upid_to_match:
            continue

        match = gp_matches.get(row['PseudoNHSNoLinked'])
        candidates = snomed_to_drug_searchterm.get(match[0]) if match else None

        if candidates:
            # Prefer the search term registered for this row's drug; a SNOMED
            # code may map to several drugs with different search terms.
            drug_upper = row['Drug Name'].upper()
            search_term = next(
                (term for drug, term, _ in candidates if drug.upper() == drug_upper),
                candidates[0][1],
            )
            upid_to_match[upid] = (search_term, "DIAGNOSIS")
        else:
            upid_to_match[upid] = (row['Directory'] + " (no GP dx)", "FALLBACK")

    # Explicit columns keep the schema stable even when there are no rows.
    result_df = pd.DataFrame(
        [
            {'UPID': upid, 'Indication_Group': indication_group, 'Source': source}
            for upid, (indication_group, source) in upid_to_match.items()
        ],
        columns=['UPID', 'Indication_Group', 'Source'],
    )

    # Log statistics
    total = len(result_df)
    diagnosis_count = int((result_df['Source'] == "DIAGNOSIS").sum())
    fallback_count = int((result_df['Source'] == "FALLBACK").sum())

    def _pct(count: int) -> float:
        # Avoid ZeroDivisionError on an empty input frame.
        return 100 * count / total if total else 0.0

    logger.info("Indication lookup complete:")
    logger.info(f"  Total unique patients: {total}")
    logger.info(f"  DIAGNOSIS matches: {diagnosis_count} ({_pct(diagnosis_count):.1f}%)")
    logger.info(f"  FALLBACK (no GP match): {fallback_count} ({_pct(fallback_count):.1f}%)")

    return result_df
|
||||
|
||||
|
||||
# === Drug-to-indication mapping from DimSearchTerm.csv ===
|
||||
|
||||
|
||||
@@ -1713,10 +1194,7 @@ __all__ = [
|
||||
"ClusterSnomedCodes",
|
||||
"IndicationValidationResult",
|
||||
"DrugIndicationMatchRate",
|
||||
"DrugSnomedMapping",
|
||||
"DirectSnomedMatchResult",
|
||||
"DirectorateAssignment",
|
||||
# Cluster-based lookup functions (existing)
|
||||
# Cluster-based lookup functions
|
||||
"get_drug_clusters",
|
||||
"get_drug_cluster_ids",
|
||||
"get_cluster_snomed_codes",
|
||||
@@ -1725,20 +1203,13 @@ __all__ = [
|
||||
"get_indication_match_rate",
|
||||
"batch_validate_indications",
|
||||
"get_available_clusters",
|
||||
# Direct SNOMED lookup functions (new)
|
||||
"get_drug_snomed_codes",
|
||||
"patient_has_indication_direct",
|
||||
# Diagnosis-based directorate assignment
|
||||
"get_directorate_from_diagnosis",
|
||||
# Batch lookup for indication groups
|
||||
"batch_lookup_indication_groups",
|
||||
# Drug-indication mapping from DimSearchTerm.csv
|
||||
"SEARCH_TERM_MERGE_MAP",
|
||||
"load_drug_indication_mapping",
|
||||
"get_search_terms_for_drug",
|
||||
# Drug-aware indication assignment
|
||||
"assign_drug_indications",
|
||||
# Snowflake-direct indication lookup (new approach)
|
||||
# Snowflake-direct indication lookup
|
||||
"get_patient_indication_groups",
|
||||
"CLUSTER_MAPPING_SQL",
|
||||
]
|
||||
|
||||
@@ -1,401 +0,0 @@
|
||||
"""
|
||||
Load enriched SNOMED mapping data into SQLite database.
|
||||
|
||||
This module loads the drug_snomed_mapping_enriched.csv file into the
|
||||
ref_drug_snomed_mapping table for direct GP record matching.
|
||||
|
||||
Source file: data/drug_snomed_mapping_enriched.csv (163K rows)
|
||||
Target table: ref_drug_snomed_mapping
|
||||
|
||||
Usage:
|
||||
python -m data_processing.load_snomed_mapping
|
||||
|
||||
Columns mapped:
|
||||
Drug -> drug_name
|
||||
Indication -> indication
|
||||
TA_ID -> ta_id
|
||||
Search_Term -> search_term
|
||||
SNOMEDCode -> snomed_code (cleaned: removes trailing .0)
|
||||
SNOMEDDescription -> snomed_description
|
||||
CleanedDrugName -> cleaned_drug_name
|
||||
PrimaryDirectorate -> primary_directorate
|
||||
AllDirectorates -> all_directorates
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from core.logging_config import get_logger
|
||||
from data_processing.database import DatabaseManager
|
||||
from data_processing.reference_data import MigrationResult, _read_csv_with_fallback_encoding
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
DEFAULT_CSV_PATH = Path("./data/drug_snomed_mapping_enriched.csv")
|
||||
|
||||
|
||||
def clean_snomed_code(snomed_code: str) -> str:
    """
    Normalise a raw SNOMED code from the CSV into a plain digit string.

    The enriched CSV may hold codes in decimal notation (e.g. "156370009.0")
    or scientific notation (e.g. "1.0629311000119108e+16") as a result of a
    pandas/Excel export. Scientific notation is parsed with ``Decimal`` so
    every digit that is present in the text survives the conversion; going
    through ``float`` cannot represent all integers above 2**53 and would
    silently alter long codes.

    Args:
        snomed_code: Raw SNOMED code from CSV.

    Returns:
        Cleaned SNOMED code as string (e.g., "156370009" or
        "10629311000119108"). An empty/None input yields "". Text that
        contains an "e" but is not a usable number is returned stripped but
        otherwise unchanged.
    """
    from decimal import Decimal, InvalidOperation

    if not snomed_code:
        return ""

    code = snomed_code.strip()

    # Handle scientific notation (e.g., "1.0629311000119108e+16")
    if "e" in code.lower():
        try:
            value = Decimal(code)
        except InvalidOperation:
            # Not a number at all - hand the stripped text back unchanged.
            return code

        # Refuse absurd magnitudes instead of expanding them into a
        # gigantic digit string (e.g. "1e999999").
        if not value.is_finite() or abs(value.adjusted()) >= 64:
            return code

        if value == value.to_integral_value():
            return str(int(value))

        # Has a fractional part: render in plain positional notation rather
        # than deleting ".0" substrings, which corrupted values like 1.05.
        return format(value, "f")

    # Remove trailing .0 if present (for non-scientific notation)
    if code.endswith(".0"):
        code = code[:-2]

    return code
|
||||
|
||||
|
||||
def migrate_drug_snomed_mapping(
    db_manager: Optional[DatabaseManager] = None,
    csv_path: Optional[Path] = None
) -> MigrationResult:
    """
    Load drug SNOMED mappings from the enriched CSV into ref_drug_snomed_mapping.

    Expected CSV columns (a header row starting with "Drug" is skipped):
        Drug,Indication,TA_ID,Search_Term,SNOMEDCode,SNOMEDDescription,
        CleanedDrugName,PrimaryDirectorate,AllDirectorates

    Rows with fewer than five columns, with an empty drug / indication /
    search term / SNOMED code, or whose SNOMED code cleans to an empty
    string are counted as skipped. Rows the database ignores
    (``INSERT OR IGNORE`` reporting no inserted row) are counted as skipped
    as well.

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.
        csv_path: Path to the CSV file. Defaults to DEFAULT_CSV_PATH.

    Returns:
        MigrationResult with statistics about the migration. On failure
        ``success`` is False and ``error_message`` describes the problem.
    """
    db_manager = DatabaseManager() if db_manager is None else db_manager
    csv_path = DEFAULT_CSV_PATH if csv_path is None else csv_path

    table_name = "ref_drug_snomed_mapping"

    def build_result(read, inserted, skipped, **extra):
        # Every outcome shares the table/source identification.
        return MigrationResult(
            table_name=table_name,
            source_file=str(csv_path),
            rows_read=read,
            rows_inserted=inserted,
            rows_skipped=skipped,
            **extra,
        )

    logger.info(f"Migrating drug SNOMED mappings from {csv_path} to {table_name}")

    if not csv_path.exists():
        error_msg = f"Source file not found: {csv_path}"
        logger.error(error_msg)
        return build_result(0, 0, 0, success=False, error_message=error_msg)

    insert_sql = """
        INSERT OR IGNORE INTO ref_drug_snomed_mapping
        (drug_name, indication, ta_id, search_term, snomed_code, snomed_description,
        cleaned_drug_name, primary_directorate, all_directorates)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """

    read_count = 0
    inserted_count = 0
    skipped_count = 0

    try:
        with db_manager.get_transaction() as conn:
            for index, row in enumerate(_read_csv_with_fallback_encoding(csv_path)):
                # Skip header row
                if index == 0 and len(row) >= 5 and row[0].strip().lower() == "drug":
                    logger.debug("Skipping header row")
                    continue

                read_count += 1

                # Need at least: Drug, Indication, TA_ID, Search_Term, SNOMEDCode
                if len(row) < 5:
                    logger.warning(f"Skipping malformed row {read_count}: {row}")
                    skipped_count += 1
                    continue

                cells = [cell.strip() for cell in row[:9]]
                drug_name, indication, ta_id, search_term, raw_code = cells[:5]
                snomed_description = cells[5] if len(cells) > 5 else ""
                # Fall back to the upper-cased drug name when the column is absent.
                cleaned_drug_name = cells[6] if len(cells) > 6 else drug_name.upper()
                primary_directorate = cells[7] if len(cells) > 7 else ""
                all_directorates = cells[8] if len(cells) > 8 else ""

                # Skip if required fields are empty
                if not (drug_name and indication and search_term and raw_code):
                    logger.warning(f"Skipping row {read_count} with empty required fields")
                    skipped_count += 1
                    continue

                snomed_code = clean_snomed_code(raw_code)
                if not snomed_code:
                    logger.warning(f"Skipping row {read_count} with invalid SNOMED code: {raw_code}")
                    skipped_count += 1
                    continue

                cursor = conn.execute(
                    insert_sql,
                    (
                        drug_name,
                        indication,
                        ta_id,
                        search_term,
                        snomed_code,
                        snomed_description,
                        cleaned_drug_name,
                        primary_directorate,
                        all_directorates,
                    ),
                )

                # rowcount of 0 means INSERT OR IGNORE did not add the row.
                if cursor.rowcount > 0:
                    inserted_count += 1
                else:
                    skipped_count += 1

                # Log progress every 10000 rows
                if read_count % 10000 == 0:
                    logger.info(f"Processed {read_count} rows, inserted {inserted_count}")

        logger.info(
            f"Drug SNOMED mapping migration complete: {read_count} rows read, "
            f"{inserted_count} inserted, {skipped_count} skipped"
        )
        return build_result(read_count, inserted_count, skipped_count, success=True)

    except Exception as e:
        error_msg = f"Migration failed: {e}"
        logger.error(error_msg)
        # Inserted/skipped are reported as zero on failure; only the read
        # counter is carried over.
        return build_result(read_count, 0, 0, success=False, error_message=error_msg)
|
||||
|
||||
|
||||
def get_drug_snomed_mapping_counts(db_manager: Optional[DatabaseManager] = None) -> dict:
    """
    Collect summary counts for the ref_drug_snomed_mapping table.

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.

    Returns:
        Dictionary with:
        - total_mappings: Total rows in table
        - unique_drugs: Count of distinct drug names
        - unique_search_terms: Count of distinct search terms
        - unique_snomed_codes: Count of distinct SNOMED codes
        - unique_indications: Count of distinct indications
    """
    if db_manager is None:
        db_manager = DatabaseManager()

    # Result key -> column whose distinct values are counted.
    distinct_columns = {
        "unique_drugs": "drug_name",
        "unique_search_terms": "search_term",
        "unique_snomed_codes": "snomed_code",
        "unique_indications": "indication",
    }

    with db_manager.get_connection() as conn:
        total_row = conn.execute("SELECT COUNT(*) FROM ref_drug_snomed_mapping").fetchone()
        counts = {"total_mappings": total_row[0]}

        for key, column in distinct_columns.items():
            distinct_row = conn.execute(
                f"SELECT COUNT(DISTINCT {column}) FROM ref_drug_snomed_mapping"
            ).fetchone()
            counts[key] = distinct_row[0]

    return counts
|
||||
|
||||
|
||||
def verify_drug_snomed_mapping_migration(
    db_manager: Optional[DatabaseManager] = None,
    csv_path: Optional[Path] = None
) -> tuple[bool, str]:
    """
    Sanity-check the contents of ref_drug_snomed_mapping after a migration.

    Checks, in order:
    - at least 100,000 rows are present (163K+ expected)
    - at least 100 distinct search terms are present (~187 expected)
    - the sample drug ABATACEPT exists
    - no SNOMED code still ends in ".0"

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.
        csv_path: Path to the CSV file. Defaults to DEFAULT_CSV_PATH. The
            value is not consulted by any of the checks.

    Returns:
        Tuple of (success: bool, message: str). The message names the first
        failed check, or summarises the table when all checks pass.
    """
    db_manager = DatabaseManager() if db_manager is None else db_manager
    csv_path = DEFAULT_CSV_PATH if csv_path is None else csv_path

    stats = get_drug_snomed_mapping_counts(db_manager)
    total = stats["total_mappings"]
    term_count = stats["unique_search_terms"]

    # Basic sanity checks
    if total < 100000:
        return False, f"Too few rows: expected 163K+, got {total}"
    if term_count < 100:
        return False, f"Too few search terms: expected ~187, got {term_count}"

    # Sample lookup verification
    with db_manager.get_connection() as conn:
        sample_hits = conn.execute(
            "SELECT COUNT(*) FROM ref_drug_snomed_mapping WHERE drug_name = 'ABATACEPT'"
        ).fetchone()[0]
        if sample_hits == 0:
            return False, "Sample drug ABATACEPT not found in table"

        # Codes must have been cleaned (no .0 suffix)
        uncleaned = conn.execute(
            "SELECT COUNT(*) FROM ref_drug_snomed_mapping WHERE snomed_code LIKE '%.0'"
        ).fetchone()[0]
        if uncleaned > 0:
            return False, f"Found {uncleaned} SNOMED codes with uncleaned .0 suffix"

    summary = (
        f"Verified {total:,} mappings: "
        f"{stats['unique_drugs']} drugs, "
        f"{term_count} search terms, "
        f"{stats['unique_snomed_codes']:,} SNOMED codes"
    )
    return True, summary
|
||||
|
||||
|
||||
def main():
    """CLI entry point: load (or only verify) the drug SNOMED mapping table.

    Returns:
        0 on success, 1 when the migration or its verification fails.
    """
    import argparse
    import logging

    parser = argparse.ArgumentParser(
        description="Load drug SNOMED mapping data into SQLite database"
    )
    parser.add_argument(
        "--csv",
        type=Path,
        default=DEFAULT_CSV_PATH,
        help=f"Path to CSV file (default: {DEFAULT_CSV_PATH})"
    )
    parser.add_argument(
        "--verify-only",
        action="store_true",
        help="Only verify existing data, don't migrate"
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )
    args = parser.parse_args()

    # Configure logging
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    if args.verify_only:
        print("Verifying existing data...")
        verified, detail = verify_drug_snomed_mapping_migration(csv_path=args.csv)
        if verified:
            print(f"[OK] Verification passed: {detail}")
            return 0
        print(f"[FAILED] Verification failed: {detail}")
        return 1

    # Run migration
    print(f"Loading SNOMED mapping from {args.csv}...")
    result = migrate_drug_snomed_mapping(csv_path=args.csv)

    if not result.success:
        print(f"[FAILED] {result}")
        return 1

    print(f"[OK] {result}")

    # Show statistics
    stats = get_drug_snomed_mapping_counts()
    print("\nTable statistics:")
    print(f" Total mappings: {stats['total_mappings']:,}")
    print(f" Unique drugs: {stats['unique_drugs']}")
    print(f" Unique search terms: {stats['unique_search_terms']}")
    print(f" Unique SNOMED codes: {stats['unique_snomed_codes']:,}")
    print(f" Unique indications: {stats['unique_indications']}")

    # A load that fails verification is reported as a warning but still
    # produces a non-zero exit status.
    verified, detail = verify_drug_snomed_mapping_migration(csv_path=args.csv)
    if not verified:
        print(f"\n[WARNING] Verification: {detail}")
        return 1

    print(f"\n[OK] Verification: {detail}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status. SystemExit
    # is raised directly because the exit() helper is injected by the site
    # module for interactive use and is not guaranteed to be defined.
    raise SystemExit(main())
|
||||
+2
-155
@@ -11,7 +11,6 @@ The DataLoader ABC defines the contract for all loader implementations.
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
@@ -29,7 +28,7 @@ class LoadResult:
|
||||
|
||||
Attributes:
|
||||
df: The loaded DataFrame with processed patient intervention data
|
||||
source: Description of the data source (e.g., "csv:/path/to/file.csv", "sqlite:fact_interventions")
|
||||
source: Description of the data source (e.g., "file:/path/to/file.csv")
|
||||
row_count: Number of rows loaded
|
||||
columns: List of column names in the DataFrame
|
||||
load_time_seconds: Time taken to load the data
|
||||
@@ -224,150 +223,6 @@ class FileDataLoader(DataLoader):
|
||||
)
|
||||
|
||||
|
||||
class SQLiteDataLoader(DataLoader):
    """DataLoader that reads rows from the SQLite fact_interventions table.

    Reading pre-processed rows from SQLite avoids re-processing CSV files on
    every load. The database must already contain a populated
    fact_interventions table.

    Args:
        db_path: Path to the SQLite database (uses default if None)
        date_range: Optional tuple of (start_date, end_date); start is
            inclusive, end is exclusive
        trusts: Optional list of trust names to filter
        drugs: Optional list of drug names to filter
        directories: Optional list of directories to filter
    """

    def __init__(
        self,
        db_path: Optional[Path | str] = None,
        date_range: Optional[tuple[date, date]] = None,
        trusts: Optional[list[str]] = None,
        drugs: Optional[list[str]] = None,
        directories: Optional[list[str]] = None,
    ):
        from data_processing.database import default_db_config

        # Any falsy db_path (None, "") selects the configured default.
        self.db_path = Path(db_path or default_db_config.db_path)
        self.date_range = date_range
        self.trusts = trusts
        self.drugs = drugs
        self.directories = directories

    def _open_manager(self):
        """Build a DatabaseManager bound to this loader's database file."""
        from data_processing.database import DatabaseManager, DatabaseConfig

        return DatabaseManager(DatabaseConfig(db_path=self.db_path))

    def validate_source(self) -> tuple[bool, str]:
        """Check that the database file exists and fact_interventions has rows.

        Returns:
            (True, "OK (...)") when usable, otherwise (False, reason).
        """
        if not self.db_path.exists():
            return False, f"Database not found: {self.db_path}"

        manager = self._open_manager()

        if not manager.table_exists("fact_interventions"):
            return False, "fact_interventions table not found in database"

        available = manager.get_table_count("fact_interventions")
        if available == 0:
            return False, "fact_interventions table is empty"

        return True, f"OK ({available:,} rows available)"

    @property
    def source_description(self) -> str:
        return f"sqlite:{self.db_path}"

    def load(self) -> LoadResult:
        """Read fact_interventions into a DataFrame using the expected column names.

        SQLite column names are aliased to the DataFrame column names, and
        the optional date-range / trust / drug / directory filters are
        applied as bound parameters.

        Raises:
            FileNotFoundError: If validate_source() reports the source unusable.
            ValueError: If the loaded frame is missing required columns.
        """
        import time

        started = time.time()

        # Validate source
        usable, reason = self.validate_source()
        if not usable:
            raise FileNotFoundError(reason)

        logger.info(f"Loading data from SQLite: {self.db_path}")

        base_query = """
            SELECT
                upid AS "UPID",
                provider_code AS "Provider Code",
                person_key AS "PersonKey",
                drug_name_std AS "Drug Name",
                intervention_date AS "Intervention Date",
                price_actual AS "Price Actual",
                org_name AS "OrganisationName",
                directory AS "Directory",
                treatment_function_code AS "Treatment Function Code",
                additional_detail_1 AS "Additional Detail 1",
                additional_detail_2 AS "Additional Detail 2",
                additional_detail_3 AS "Additional Detail 3",
                additional_detail_4 AS "Additional Detail 4",
                additional_detail_5 AS "Additional Detail 5"
            FROM fact_interventions
            WHERE 1=1
        """
        clauses = []
        params = []

        if self.date_range:
            start, end = self.date_range
            clauses.append(" AND intervention_date >= ? AND intervention_date < ?")
            params += [str(start), str(end)]

        # Each non-empty list filter becomes "column IN (?, ?, ...)".
        list_filters = (
            ("org_name", self.trusts),
            ("drug_name_std", self.drugs),
            ("directory", self.directories),
        )
        for column, wanted in list_filters:
            if wanted:
                placeholders = ','.join('?' for _ in wanted)
                clauses.append(f" AND {column} IN ({placeholders})")
                params.extend(wanted)

        query = base_query + "".join(clauses)

        # Execute query
        with self._open_manager().get_connection() as conn:
            df = pd.read_sql_query(query, conn, params=params)

        # Convert intervention_date to datetime
        df['Intervention Date'] = pd.to_datetime(df['Intervention Date'])

        logger.info(f"Loaded {len(df)} rows from SQLite")

        # Validate result
        columns_ok, missing = self.validate_dataframe(df)
        if not columns_ok:
            raise ValueError(f"SQLite data missing required columns: {missing}")

        elapsed = time.time() - started
        logger.info(f"SQLite data loading complete. {len(df)} rows in {elapsed:.2f}s")

        return LoadResult(
            df=df,
            source=self.source_description,
            row_count=len(df),
            load_time_seconds=elapsed,
        )
|
||||
|
||||
|
||||
def get_loader(
|
||||
source: str | Path,
|
||||
paths: Optional[PathConfig] = None,
|
||||
@@ -376,7 +231,7 @@ def get_loader(
|
||||
"""Factory function to create the appropriate DataLoader.
|
||||
|
||||
Args:
|
||||
source: Either a file path (CSV/Parquet) or "sqlite" for database
|
||||
source: File path (CSV/Parquet)
|
||||
paths: PathConfig for reference data (used by FileDataLoader)
|
||||
**kwargs: Additional arguments passed to the loader constructor
|
||||
|
||||
@@ -386,14 +241,6 @@ def get_loader(
|
||||
Examples:
|
||||
>>> loader = get_loader("data/activity.csv")
|
||||
>>> loader = get_loader("data/activity.parquet")
|
||||
>>> loader = get_loader("sqlite")
|
||||
>>> loader = get_loader("sqlite", date_range=(date(2024, 1, 1), date(2024, 12, 31)))
|
||||
"""
|
||||
source_str = str(source).lower()
|
||||
|
||||
if source_str == "sqlite":
|
||||
return SQLiteDataLoader(**kwargs)
|
||||
|
||||
# Assume it's a file path
|
||||
path = Path(source)
|
||||
return FileDataLoader(file_path=path, paths=paths)
|
||||
|
||||
+11
-155
@@ -35,6 +35,7 @@ from data_processing.schema import (
|
||||
verify_all_tables_exist,
|
||||
get_all_table_counts,
|
||||
migrate_pathway_nodes_chart_type,
|
||||
migrate_refresh_log_source_row_count,
|
||||
)
|
||||
from data_processing.reference_data import (
|
||||
MigrationResult,
|
||||
@@ -49,12 +50,6 @@ from data_processing.reference_data import (
|
||||
verify_drug_directory_map_migration,
|
||||
verify_drug_indication_clusters_migration,
|
||||
)
|
||||
from data_processing.patient_data import (
|
||||
load_patient_data,
|
||||
refresh_patient_treatment_summary,
|
||||
get_patient_data_stats,
|
||||
verify_mv_consistency,
|
||||
)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -67,9 +62,8 @@ def initialize_database(
|
||||
"""
|
||||
Initialize the database with all required tables.
|
||||
|
||||
Creates all tables defined in the schema (reference tables, fact tables,
|
||||
materialized views, and file tracking tables). Uses IF NOT EXISTS so
|
||||
safe to run multiple times.
|
||||
Creates all tables defined in the schema (reference tables and pathway
|
||||
tables). Uses IF NOT EXISTS so safe to run multiple times.
|
||||
|
||||
Args:
|
||||
db_manager: DatabaseManager instance. Uses default if not provided.
|
||||
@@ -122,6 +116,14 @@ def initialize_database(
|
||||
else:
|
||||
logger.error(f"pathway_nodes migration failed: {msg}")
|
||||
return False
|
||||
|
||||
# Add source_row_count column to pathway_refresh_log if it doesn't exist
|
||||
success, msg = migrate_refresh_log_source_row_count(conn)
|
||||
if success:
|
||||
logger.info(f"pathway_refresh_log migration: {msg}")
|
||||
else:
|
||||
logger.error(f"pathway_refresh_log migration failed: {msg}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"Migration failed: {e}")
|
||||
return False
|
||||
@@ -274,107 +276,6 @@ def create_progress_reporter(description: str = "Loading", width: int = 40):
|
||||
return report_progress
|
||||
|
||||
|
||||
def load_patient_data_cli(
    file_path: Path,
    db_manager: Optional[DatabaseManager] = None,
    paths: Optional[PathConfig] = None,
    force: bool = False,
    refresh_mv: bool = True
) -> bool:
    """
    Load a patient data file and report progress and results on stdout.

    After a fresh, successful load the materialized view is optionally
    refreshed and checked for consistency; a summary of the stored patient
    data is printed at the end.

    Args:
        file_path: Path to CSV or Parquet file.
        db_manager: DatabaseManager instance. Uses default if not provided.
        paths: PathConfig for reference data. Uses default if not provided.
        force: If True, re-process even if file hash matches.
        refresh_mv: If True, refresh the materialized view after loading.

    Returns:
        True if loading succeeded, False otherwise. A failed materialized
        view refresh is printed but does not change the return value.
    """
    db_manager = DatabaseManager() if db_manager is None else db_manager
    paths = default_paths if paths is None else paths

    print("\n=== Loading Patient Data ===\n")
    print(f"File: {file_path}")

    # Check file exists
    if not file_path.exists():
        print(f"ERROR: File not found: {file_path}")
        return False

    # Calculate and display file info
    size_mb = file_path.stat().st_size / (1024 * 1024)
    print(f"Size: {size_mb:.1f} MB")
    print()

    # Load the data, reporting row progress as it goes
    result = load_patient_data(
        file_path=file_path,
        db_manager=db_manager,
        paths=paths,
        batch_size=5000,
        force=force,
        progress_callback=create_progress_reporter("Loading rows", width=40)
    )

    # Print result
    print()
    already_done = result.was_already_processed
    if not already_done and not result.success:
        print(f"FAILED: {result.error_message}")
        return False

    if already_done:
        print("File already processed (same hash). Skipping.")
        print("Use --force to re-process.")
    else:
        print(f"Loaded {result.rows_inserted:,} rows in {result.load_time_seconds:.1f}s")
        if result.rows_skipped > 0:
            print(f"Skipped {result.rows_skipped:,} rows (missing UPID or date)")

    # Refresh only when new rows were actually loaded in this run.
    if refresh_mv and result.success and not already_done:
        print()
        print("Refreshing materialized view...")
        mv_result = refresh_patient_treatment_summary(
            db_manager=db_manager,
            progress_callback=create_progress_reporter("Processing patients", width=40)
        )

        if not mv_result.success:
            print(f"MV refresh FAILED: {mv_result.error_message}")
        else:
            print(f"MV refreshed: {mv_result.patients_processed:,} patients in {mv_result.refresh_time_seconds:.1f}s")

            # Verify consistency
            consistent, msg = verify_mv_consistency(db_manager)
            if consistent:
                print("MV verification: OK")
            else:
                print(f"MV verification: FAILED - {msg}")

    # Print summary statistics
    print()
    print("=== Patient Data Summary ===")
    stats = get_patient_data_stats(db_manager)
    print(f" Total rows: {stats['total_rows']:,}")
    print(f" Unique patients: {stats['unique_patients']:,}")
    print(f" Unique drugs: {stats['unique_drugs']:,}")
    print(f" Unique organizations: {stats['unique_organizations']:,}")
    earliest, latest = stats['date_range'][0], stats['date_range'][1]
    if earliest and latest:
        print(f" Date range: {earliest} to {latest}")
    print()

    return result.success
|
||||
|
||||
|
||||
def get_database_status(db_manager: Optional[DatabaseManager] = None) -> dict:
|
||||
"""
|
||||
Get the current status of the database.
|
||||
@@ -452,8 +353,6 @@ Examples:
|
||||
python -m data_processing.migrate --drop-existing # Reset database
|
||||
python -m data_processing.migrate --reference-data # Migrate reference data
|
||||
python -m data_processing.migrate --reference-data --verify # With verification
|
||||
python -m data_processing.migrate --load-patient-data data.parquet # Load patient data
|
||||
python -m data_processing.migrate --load-patient-data data.csv --force # Force reload
|
||||
python -m data_processing.migrate --db-path ./data/test.db # Custom path
|
||||
"""
|
||||
)
|
||||
@@ -493,23 +392,6 @@ Examples:
|
||||
action="store_true",
|
||||
help="Enable verbose logging"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--load-patient-data",
|
||||
type=Path,
|
||||
metavar="FILE",
|
||||
help="Load patient data from CSV or Parquet file with progress reporting"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force",
|
||||
action="store_true",
|
||||
help="Force re-processing even if file hash matches (use with --load-patient-data)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-refresh-mv",
|
||||
action="store_true",
|
||||
help="Skip materialized view refresh after loading (use with --load-patient-data)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set up logging
|
||||
@@ -562,32 +444,6 @@ Examples:
|
||||
print("Reference data migration completed with errors. Check logs for details.")
|
||||
return 1
|
||||
|
||||
# Handle --load-patient-data (load patient data from CSV/Parquet)
|
||||
if args.load_patient_data:
|
||||
# Ensure database exists with tables first
|
||||
if not db_manager.exists:
|
||||
print("Database does not exist. Initializing schema first...")
|
||||
success = initialize_database(db_manager=db_manager)
|
||||
if not success:
|
||||
print("\nDatabase initialization failed. Check logs for details.")
|
||||
return 1
|
||||
|
||||
# Load patient data with progress reporting
|
||||
success = load_patient_data_cli(
|
||||
file_path=args.load_patient_data,
|
||||
db_manager=db_manager,
|
||||
paths=default_paths,
|
||||
force=args.force,
|
||||
refresh_mv=not args.no_refresh_mv
|
||||
)
|
||||
|
||||
if success:
|
||||
print("Patient data load completed successfully.")
|
||||
return 0
|
||||
else:
|
||||
print("Patient data load failed. Check logs for details.")
|
||||
return 1
|
||||
|
||||
# Run schema migration (default behavior)
|
||||
success = initialize_database(
|
||||
db_manager=db_manager,
|
||||
|
||||
@@ -1,890 +0,0 @@
|
||||
"""
|
||||
Patient data migration functions for NHS High-Cost Drug Patient Pathway Analysis Tool.
|
||||
|
||||
Provides functions to load patient intervention data from CSV/Parquet files
|
||||
into the SQLite fact_interventions table. Supports:
|
||||
- Batch processing for large files
|
||||
- File hash tracking for incremental updates
|
||||
- Progress reporting during loading
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import sqlite3
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from core import PathConfig, default_paths
|
||||
from core.logging_config import get_logger
|
||||
from data_processing.database import DatabaseManager
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PatientDataLoadResult:
    """Outcome of one patient data load attempt."""
    # Path of the file that was (or would have been) loaded.
    file_path: str
    # Hash of the file contents.
    file_hash: str
    rows_read: int
    rows_inserted: int
    rows_skipped: int
    success: bool
    # Populated when the load failed.
    error_message: Optional[str] = None
    load_time_seconds: float = 0.0
    # True when the load was skipped because the same hash was already loaded.
    was_already_processed: bool = False

    def __str__(self) -> str:
        """Render a one-line, human-readable summary of the outcome."""
        if self.was_already_processed:
            return f"{self.file_path}: Already processed (same hash)"
        if not self.success:
            return f"{self.file_path}: FAILED - {self.error_message}"
        return (
            f"{self.file_path}: Loaded {self.rows_inserted:,} rows "
            f"in {self.load_time_seconds:.1f}s"
        )
|
||||
|
||||
|
||||
def calculate_file_hash(file_path: Path) -> str:
    """
    Compute the SHA256 digest of a file's contents.

    The file is consumed in 8192-byte blocks so the whole file never has to
    be held in memory at once.

    Args:
        file_path: Path to the file.

    Returns:
        Hex string of SHA256 hash.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # read() returns b"" at end of file, which ends the loop.
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def check_file_processed(
    conn: sqlite3.Connection,
    file_path: str,
    file_hash: str
) -> tuple[bool, Optional[str]]:
    """
    Report whether a file was already loaded successfully with this hash.

    Args:
        conn: Database connection. Rows are read by column name, so the
            connection must return name-addressable rows (e.g. sqlite3.Row).
        file_path: Full path to the file.
        file_hash: SHA256 hash of the file.

    Returns:
        Tuple of (is_processed, old_hash).
        - (True, old_hash): a "success" record with the same hash exists.
        - (False, old_hash): a record exists but the hash differs or the
          previous run did not succeed.
        - (False, None): no record for this path.
    """
    record = conn.execute(
        "SELECT file_hash, status FROM processed_files WHERE file_path = ?",
        (file_path,)
    ).fetchone()

    if record is None:
        return False, None

    previous_hash = record["file_hash"]
    # Only a successful earlier run with an identical hash counts as processed.
    unchanged = record["status"] == "success" and previous_hash == file_hash
    return unchanged, previous_hash
|
||||
|
||||
|
||||
def record_file_processing_start(
    conn: sqlite3.Connection,
    file_path: str,
    file_hash: str,
    file_size: int,
    file_modified: datetime
) -> None:
    """
    Mark a file as 'processing' in the processed_files table.

    Inserts a new row for the file, or, when a row for the same file_path
    already exists, refreshes its hash, size, modification time and
    last_processed_at, sets status back to 'processing' and clears any
    previous error_message. first_processed_at is only written on insert.

    Args:
        conn: Database connection.
        file_path: Full path to the file.
        file_hash: SHA256 hash of the file.
        file_size: File size in bytes.
        file_modified: File modification timestamp.
    """
    upsert_sql = """
        INSERT INTO processed_files (
            file_path, file_name, file_hash, file_size_bytes,
            file_modified_at, status, first_processed_at, last_processed_at
        ) VALUES (?, ?, ?, ?, ?, 'processing', ?, ?)
        ON CONFLICT(file_path) DO UPDATE SET
            file_hash = excluded.file_hash,
            file_size_bytes = excluded.file_size_bytes,
            file_modified_at = excluded.file_modified_at,
            status = 'processing',
            last_processed_at = excluded.last_processed_at,
            error_message = NULL
    """
    started_at = datetime.now().isoformat()
    params = (
        file_path,
        Path(file_path).name,          # file_name: final path component
        file_hash,
        file_size,
        file_modified.isoformat(),
        started_at,                    # first_processed_at (insert only)
        started_at,                    # last_processed_at
    )
    conn.execute(upsert_sql, params)
|
||||
|
||||
|
||||
def record_file_processing_complete(
    conn: sqlite3.Connection,
    file_path: str,
    row_count: int,
    duration_seconds: float,
    success: bool,
    error_message: Optional[str] = None
) -> None:
    """
    Store the final outcome of a file load in processed_files.

    Updates the existing row for file_path with the final status
    ('success' or 'error'), the row count, the duration, the error message
    and a fresh last_processed_at timestamp. This is a plain UPDATE: if no
    row exists for file_path, nothing is written.

    Args:
        conn: Database connection.
        file_path: Full path to the file.
        row_count: Number of rows processed.
        duration_seconds: Time taken to process.
        success: Whether processing was successful.
        error_message: Error message if failed.
    """
    if success:
        final_status = "success"
    else:
        final_status = "error"

    update_sql = """
        UPDATE processed_files
        SET status = ?,
            row_count = ?,
            processing_duration_seconds = ?,
            error_message = ?,
            last_processed_at = ?
        WHERE file_path = ?
    """
    finished_at = datetime.now().isoformat()
    conn.execute(
        update_sql,
        (final_status, row_count, duration_seconds, error_message, finished_at, file_path),
    )
|
||||
|
||||
|
||||
def load_dataframe_to_sqlite(
    df: pd.DataFrame,
    conn: sqlite3.Connection,
    source_file: str,
    batch_size: int = 5000,
    progress_callback: Optional[Callable[[int, int], None]] = None
) -> int:
    """
    Load a processed DataFrame into fact_interventions table.

    Rows are inserted in batches with executemany. Rows whose "UPID" or
    "Intervention Date" is missing are skipped. When the standardized
    "Drug Name" is missing, the upper-cased, stripped "Drug Name Raw" is used
    instead, or "UNKNOWN" if the raw name is missing too. A missing
    "Price Actual" is stored as 0. Values exposing strftime (dates and
    timestamps) are stored as "YYYY-MM-DD" text. Columns absent from the
    DataFrame are stored as NULL.

    Args:
        df: Processed DataFrame with required columns (from FileDataLoader).
        conn: Database connection.
        source_file: Source file path for tracking.
        batch_size: Number of rows to insert per batch. Must be positive.
        progress_callback: Optional callback(rows_inserted, total_rows) for
            progress updates, invoked once after every batch.

    Returns:
        Number of rows inserted.

    Raises:
        ValueError: If batch_size is not positive.
    """
    if batch_size <= 0:
        # A negative step makes range() empty, which would silently insert
        # nothing; zero would fail with an unrelated range() error message.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Insert SQL columns - always include drug_name_raw
    insert_columns = [
        "upid", "provider_code", "person_key",
        "drug_name_raw", "drug_name_std",
        "intervention_date", "price_actual",
        "org_name", "directory",
        "treatment_function_code",
        "additional_detail_1", "additional_detail_2", "additional_detail_3",
        "additional_detail_4", "additional_detail_5",
        "source_file"
    ]
    placeholders = ",".join(["?"] * len(insert_columns))
    insert_sql = f"""
        INSERT INTO fact_interventions ({",".join(insert_columns)})
        VALUES ({placeholders})
    """

    # Loop-invariant lookups, computed once instead of once per row.
    available_columns = set(df.columns)
    has_raw_drug_name = "Drug Name Raw" in available_columns

    def get_value(row, col_name):
        """Return a SQLite-bindable value for one cell, or None when missing."""
        if col_name not in available_columns:
            return None
        val = row[col_name]
        if pd.isna(val):
            return None
        if hasattr(val, "strftime"):
            return val.strftime("%Y-%m-%d")
        return val

    rows_inserted = 0
    rows_skipped = 0
    total_rows = len(df)

    # Process in batches
    for batch_start in range(0, total_rows, batch_size):
        batch_end = min(batch_start + batch_size, total_rows)
        batch_df = df.iloc[batch_start:batch_end]

        # Prepare batch data
        batch_data = []
        for _, row in batch_df.iterrows():
            # Skip rows missing required fields
            if pd.isna(row.get("UPID")) or pd.isna(row.get("Intervention Date")):
                rows_skipped += 1
                continue

            # Get drug names - raw and standardized
            drug_name_raw = row.get("Drug Name Raw") if has_raw_drug_name else None
            drug_name_std = row.get("Drug Name")

            if drug_name_raw is None or pd.isna(drug_name_raw):
                # Normalise every "missing" marker to None so it binds as NULL.
                drug_name_raw = None
            else:
                # Clean up raw drug name for storage
                drug_name_raw = str(drug_name_raw).strip()

            # If drug_name_std is missing, fall back to the raw name (uppercase).
            # This handles drugs that have no standardized name.
            if pd.isna(drug_name_std):
                if drug_name_raw is not None:
                    drug_name_std = drug_name_raw.upper()
                else:
                    drug_name_std = "UNKNOWN"

            batch_data.append((
                get_value(row, "UPID"),
                get_value(row, "Provider Code"),
                get_value(row, "PersonKey"),
                drug_name_raw,
                drug_name_std,
                get_value(row, "Intervention Date"),
                get_value(row, "Price Actual") or 0,
                get_value(row, "OrganisationName"),
                get_value(row, "Directory"),
                get_value(row, "Treatment Function Code"),
                get_value(row, "Additional Detail 1"),
                get_value(row, "Additional Detail 2"),
                get_value(row, "Additional Detail 3"),
                get_value(row, "Additional Detail 4"),
                get_value(row, "Additional Detail 5"),
                source_file
            ))

        # Execute batch insert
        conn.executemany(insert_sql, batch_data)
        rows_inserted += len(batch_data)

        # Report progress
        if progress_callback:
            progress_callback(rows_inserted, total_rows)

    if rows_skipped > 0:
        logger.info(f"Skipped {rows_skipped:,} rows with missing UPID or Intervention Date")

    return rows_inserted
|
||||
|
||||
|
||||
def delete_file_data(conn: sqlite3.Connection, source_file: str) -> int:
    """
    Remove every fact_interventions row that came from one source file.

    Intended for re-processing: rows from a previous load of the same file
    are cleared before the new rows are inserted.

    Args:
        conn: Database connection.
        source_file: Source file path.

    Returns:
        Number of rows deleted.
    """
    delete_sql = "DELETE FROM fact_interventions WHERE source_file = ?"
    return conn.execute(delete_sql, (source_file,)).rowcount
|
||||
|
||||
|
||||
def load_patient_data(
    file_path: Path | str,
    db_manager: Optional[DatabaseManager] = None,
    paths: Optional[PathConfig] = None,
    batch_size: int = 5000,
    force: bool = False,
    progress_callback: Optional[Callable[[int, int], None]] = None
) -> PatientDataLoadResult:
    """
    Load patient data from CSV/Parquet file into fact_interventions table.

    This is the main entry point for loading patient data. It:
    1. Calculates file hash to detect changes
    2. Checks if file was already processed (skip if unchanged)
    3. Loads and transforms data using FileDataLoader
    4. Inserts data into SQLite in batches
    5. Records processing status in processed_files table

    Errors raised while loading, transforming or inserting are caught and
    reported through the returned result (success=False). Errors raised while
    hashing the file or while checking its processed state are not caught
    here and propagate to the caller.

    Args:
        file_path: Path to CSV or Parquet file.
        db_manager: DatabaseManager instance. Uses default if not provided.
        paths: PathConfig for reference data. Uses default if not provided.
        batch_size: Number of rows to insert per batch (default: 5000).
        force: If True, re-process even if file hash matches.
        progress_callback: Optional callback(rows_inserted, total_rows) for progress.

    Returns:
        PatientDataLoadResult with loading statistics.
    """
    if db_manager is None:
        db_manager = DatabaseManager()
    if paths is None:
        paths = default_paths

    file_path = Path(file_path)
    file_path_str = str(file_path.absolute())

    logger.info(f"Starting patient data load from {file_path}")
    start_time = time.time()

    # Check file exists
    if not file_path.exists():
        error_msg = f"File not found: {file_path}"
        logger.error(error_msg)
        return PatientDataLoadResult(
            file_path=file_path_str,
            file_hash="",
            rows_read=0,
            rows_inserted=0,
            rows_skipped=0,
            success=False,
            error_message=error_msg
        )

    # Calculate file hash
    logger.info("Calculating file hash...")
    file_hash = calculate_file_hash(file_path)
    # One stat() call so size and mtime describe the same file state.
    file_stat = file_path.stat()
    file_size = file_stat.st_size
    file_modified = datetime.fromtimestamp(file_stat.st_mtime)

    logger.info(f"File hash: {file_hash[:16]}... Size: {file_size:,} bytes")

    # Check if already processed
    if not force:
        with db_manager.get_connection() as conn:
            is_processed, old_hash = check_file_processed(conn, file_path_str, file_hash)
            if is_processed:
                logger.info("File already processed with same hash, skipping")
                return PatientDataLoadResult(
                    file_path=file_path_str,
                    file_hash=file_hash,
                    rows_read=0,
                    rows_inserted=0,
                    rows_skipped=0,
                    success=True,
                    was_already_processed=True
                )
            elif old_hash is not None:
                logger.info(f"File hash changed, will re-process (old: {old_hash[:16]}...)")

    try:
        # Use FileDataLoader to load and transform data
        from data_processing.loader import FileDataLoader

        loader = FileDataLoader(file_path, paths)
        logger.info("Loading and transforming data...")
        result = loader.load()
        df = result.df
        rows_read = result.row_count

        logger.info(f"Loaded {rows_read:,} rows, starting SQLite insert...")

        # Load into SQLite
        with db_manager.get_transaction() as conn:
            # Record that we're starting
            record_file_processing_start(conn, file_path_str, file_hash, file_size, file_modified)

            # Delete any existing data from this file (for re-processing)
            deleted = delete_file_data(conn, file_path_str)
            if deleted > 0:
                logger.info(f"Deleted {deleted:,} existing rows from previous load")

            # Insert new data
            rows_inserted = load_dataframe_to_sqlite(
                df, conn, file_path_str, batch_size, progress_callback
            )

            # Record success
            load_time = time.time() - start_time
            record_file_processing_complete(
                conn, file_path_str, rows_inserted, load_time, True
            )

        logger.info(f"Successfully loaded {rows_inserted:,} rows in {load_time:.1f}s")

        return PatientDataLoadResult(
            file_path=file_path_str,
            file_hash=file_hash,
            rows_read=rows_read,
            rows_inserted=rows_inserted,
            rows_skipped=rows_read - rows_inserted,
            success=True,
            load_time_seconds=load_time
        )

    except Exception as e:
        load_time = time.time() - start_time
        error_msg = str(e)
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Failed to load patient data: {error_msg}")

        # Record failure (best effort: a problem here must not mask the
        # original error, but it is logged rather than silently dropped).
        # NOTE(review): this is an UPDATE, so it only lands if a
        # processed_files row for this path already exists. Presumably the
        # row written by record_file_processing_start is rolled back with the
        # failed transaction; confirm against DatabaseManager.
        try:
            with db_manager.get_connection() as conn:
                record_file_processing_complete(
                    conn, file_path_str, 0, load_time, False, error_msg
                )
        except Exception as record_error:
            logger.warning(
                f"Could not record failed load for {file_path_str}: {record_error}"
            )

        # file_hash is always bound here: it is computed before the try block.
        return PatientDataLoadResult(
            file_path=file_path_str,
            file_hash=file_hash,
            rows_read=0,
            rows_inserted=0,
            rows_skipped=0,
            success=False,
            error_message=error_msg,
            load_time_seconds=load_time
        )
|
||||
|
||||
|
||||
def get_patient_data_stats(db_manager: Optional[DatabaseManager] = None) -> dict:
    """
    Summarise the contents of fact_interventions and processed_files.

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.

    Returns:
        Dictionary with the keys total_rows, unique_patients, unique_drugs,
        unique_organizations, date_range (min, max intervention date),
        processed_files (count of files with status 'success') and
        processed_rows (sum of their row counts, 0 when there are none).
    """
    manager = DatabaseManager() if db_manager is None else db_manager

    # Each entry pairs a stats key with the single-value query that fills it.
    scalar_queries = (
        ("total_rows", "SELECT COUNT(*) FROM fact_interventions"),
        ("unique_patients", "SELECT COUNT(DISTINCT upid) FROM fact_interventions"),
        ("unique_drugs", "SELECT COUNT(DISTINCT drug_name_std) FROM fact_interventions"),
        ("unique_organizations", "SELECT COUNT(DISTINCT org_name) FROM fact_interventions"),
    )

    stats = {}
    with manager.get_connection() as conn:
        for key, sql in scalar_queries:
            stats[key] = conn.execute(sql).fetchone()[0]

        # Date range
        date_row = conn.execute("""
            SELECT MIN(intervention_date), MAX(intervention_date)
            FROM fact_interventions
        """).fetchone()
        if date_row:
            stats["date_range"] = (date_row[0], date_row[1])
        else:
            stats["date_range"] = (None, None)

        # Processed files
        files_row = conn.execute("""
            SELECT COUNT(*), SUM(row_count)
            FROM processed_files WHERE status = 'success'
        """).fetchone()
        stats["processed_files"] = files_row[0] if files_row else 0
        # SUM() yields NULL when no rows match; report that as 0.
        stats["processed_rows"] = files_row[1] if files_row and files_row[1] else 0

    return stats
|
||||
|
||||
|
||||
def list_processed_files(db_manager: Optional[DatabaseManager] = None) -> list[dict]:
    """
    Return one dictionary per processed_files row, most recently processed first.

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.

    Returns:
        List of dictionaries with file processing information (path, name,
        hash, size, row count, status, error message, first/last processed
        timestamps and processing duration).
    """
    manager = DatabaseManager() if db_manager is None else db_manager

    # Keys of every returned dictionary, in output order. Rows are read by
    # column name, so the connection must return name-addressable rows.
    fields = (
        "file_path",
        "file_name",
        "file_hash",
        "file_size_bytes",
        "row_count",
        "status",
        "error_message",
        "first_processed_at",
        "last_processed_at",
        "processing_duration_seconds",
    )

    with manager.get_connection() as conn:
        records = conn.execute("""
            SELECT file_path, file_name, file_hash, file_size_bytes,
                   row_count, status, error_message,
                   first_processed_at, last_processed_at, processing_duration_seconds
            FROM processed_files
            ORDER BY last_processed_at DESC
        """).fetchall()
        files = [{name: record[name] for name in fields} for record in records]

    return files
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Materialized View Refresh Functions
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class MVRefreshResult:
    """Outcome of one refresh of the patient treatment summary materialized view."""
    # Number of distinct patients found in the source data.
    patients_processed: int
    # Number of rows present in the materialized view after the refresh.
    rows_inserted: int
    # Wall-clock duration of the refresh, in seconds.
    refresh_time_seconds: float
    # True when the refresh finished without raising.
    success: bool
    # Text of the failure; None on success.
    error_message: Optional[str] = None

    def __str__(self) -> str:
        """Return a one-line, human-readable summary of the refresh."""
        if not self.success:
            return f"MV refresh FAILED: {self.error_message}"
        patients = f"{self.patients_processed:,}"
        elapsed = f"{self.refresh_time_seconds:.1f}"
        return f"Refreshed MV: {patients} patients in {elapsed}s"
|
||||
|
||||
|
||||
def refresh_patient_treatment_summary(
    db_manager: Optional[DatabaseManager] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None
) -> MVRefreshResult:
    """
    Refresh the mv_patient_treatment_summary materialized view.

    This computes per-patient aggregations from fact_interventions:
    - First/last seen dates
    - Total cost, average cost per intervention
    - Intervention count, unique drug count
    - Drug sequence (chronological, pipe-separated)
    - Drug counts, costs, and date ranges (as JSON)

    The MV is fully rebuilt (truncate and re-insert) for simplicity. The
    existing rows are cleared even when fact_interventions is empty, so the
    MV never keeps summaries for patients that no longer exist.

    Any exception is caught and reported through the returned result
    (success=False) rather than raised.

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.
        progress_callback: Optional callback(patients_done, total_patients).
            Called once, after the rebuild, with the number of rows written
            and the number of distinct patients.

    Returns:
        MVRefreshResult with refresh statistics.
    """
    if db_manager is None:
        db_manager = DatabaseManager()

    # Compute aggregations using SQL CTEs.
    # This is more efficient than processing row-by-row in Python.
    #
    # NOTE(review): the JSON columns are assembled by string concatenation, so
    # a drug name containing a double quote or backslash would produce invalid
    # JSON. drug_sequence also relies on the ordering of the inner subquery,
    # which SQLite documents as not guaranteed for GROUP_CONCAT. Both are left
    # unchanged to keep the stored format stable.
    refresh_sql = """
        WITH patient_aggs AS (
            -- Basic aggregations per patient
            SELECT
                upid,
                MIN(org_name) as org_name,
                MIN(directory) as directory,
                MIN(intervention_date) as first_seen_date,
                MAX(intervention_date) as last_seen_date,
                JULIANDAY(MAX(intervention_date)) - JULIANDAY(MIN(intervention_date)) as days_treated,
                SUM(price_actual) as total_cost,
                AVG(price_actual) as avg_cost_per_intervention,
                COUNT(*) as intervention_count,
                COUNT(DISTINCT drug_name_std) as unique_drug_count,
                COUNT(*) as source_row_count
            FROM fact_interventions
            GROUP BY upid
        ),
        drug_sequences AS (
            -- Drug sequence per patient (chronological order, pipe-separated)
            SELECT
                upid,
                GROUP_CONCAT(drug_name_std, '|') as drug_sequence
            FROM (
                SELECT DISTINCT
                    upid,
                    drug_name_std,
                    MIN(intervention_date) as first_date
                FROM fact_interventions
                GROUP BY upid, drug_name_std
                ORDER BY upid, first_date
            )
            GROUP BY upid
        ),
        drug_counts AS (
            -- JSON object of drug counts per patient
            SELECT
                upid,
                '{' || GROUP_CONCAT('"' || drug_name_std || '": ' || cnt, ', ') || '}' as drug_counts_json
            FROM (
                SELECT
                    upid,
                    drug_name_std,
                    COUNT(*) as cnt
                FROM fact_interventions
                GROUP BY upid, drug_name_std
            )
            GROUP BY upid
        ),
        drug_costs AS (
            -- JSON object of drug costs per patient
            SELECT
                upid,
                '{' || GROUP_CONCAT('"' || drug_name_std || '": ' || ROUND(total_cost, 2), ', ') || '}' as drug_costs_json
            FROM (
                SELECT
                    upid,
                    drug_name_std,
                    SUM(price_actual) as total_cost
                FROM fact_interventions
                GROUP BY upid, drug_name_std
            )
            GROUP BY upid
        ),
        drug_dates AS (
            -- JSON object of drug date ranges per patient
            SELECT
                upid,
                '{' || GROUP_CONCAT('"' || drug_name_std || '": {"first": "' || first_date || '", "last": "' || last_date || '"}', ', ') || '}' as drug_date_ranges_json
            FROM (
                SELECT
                    upid,
                    drug_name_std,
                    MIN(intervention_date) as first_date,
                    MAX(intervention_date) as last_date
                FROM fact_interventions
                GROUP BY upid, drug_name_std
            )
            GROUP BY upid
        )
        INSERT INTO mv_patient_treatment_summary (
            upid, org_name, directory,
            first_seen_date, last_seen_date, days_treated,
            total_cost, avg_cost_per_intervention,
            intervention_count, unique_drug_count,
            drug_sequence, drug_counts_json, drug_costs_json, drug_date_ranges_json,
            source_row_count, computed_at
        )
        SELECT
            pa.upid,
            pa.org_name,
            pa.directory,
            pa.first_seen_date,
            pa.last_seen_date,
            CAST(pa.days_treated AS INTEGER),
            pa.total_cost,
            pa.avg_cost_per_intervention,
            pa.intervention_count,
            pa.unique_drug_count,
            ds.drug_sequence,
            dc.drug_counts_json,
            dco.drug_costs_json,
            dd.drug_date_ranges_json,
            pa.source_row_count,
            CURRENT_TIMESTAMP
        FROM patient_aggs pa
        LEFT JOIN drug_sequences ds ON pa.upid = ds.upid
        LEFT JOIN drug_counts dc ON pa.upid = dc.upid
        LEFT JOIN drug_costs dco ON pa.upid = dco.upid
        LEFT JOIN drug_dates dd ON pa.upid = dd.upid
    """

    logger.info("Starting materialized view refresh...")
    start_time = time.time()

    try:
        with db_manager.get_transaction() as conn:
            # Step 1: Get total patient count for progress reporting
            cursor = conn.execute("SELECT COUNT(DISTINCT upid) FROM fact_interventions")
            total_patients = cursor.fetchone()[0]
            logger.info(f"Processing {total_patients:,} unique patients")

            # Step 2: Clear existing MV data. This runs before the empty-source
            # check so that an emptied fact table leaves an empty MV instead
            # of stale rows from an earlier refresh.
            conn.execute("DELETE FROM mv_patient_treatment_summary")
            logger.info("Cleared existing MV data")

            if total_patients == 0:
                logger.warning("No patient data in fact_interventions, MV will be empty")
                return MVRefreshResult(
                    patients_processed=0,
                    rows_inserted=0,
                    refresh_time_seconds=time.time() - start_time,
                    success=True
                )

            # Step 3: Rebuild the MV in a single INSERT ... SELECT
            logger.info("Executing MV refresh query...")
            conn.execute(refresh_sql)

            # Get actual rows inserted
            cursor = conn.execute("SELECT COUNT(*) FROM mv_patient_treatment_summary")
            rows_inserted = cursor.fetchone()[0]

        refresh_time = time.time() - start_time
        logger.info(f"MV refresh complete: {rows_inserted:,} rows in {refresh_time:.1f}s")

        # Report progress if callback provided
        if progress_callback:
            progress_callback(rows_inserted, total_patients)

        return MVRefreshResult(
            patients_processed=total_patients,
            rows_inserted=rows_inserted,
            refresh_time_seconds=refresh_time,
            success=True
        )

    except Exception as e:
        refresh_time = time.time() - start_time
        error_msg = str(e)
        logger.error(f"MV refresh failed: {error_msg}")
        return MVRefreshResult(
            patients_processed=0,
            rows_inserted=0,
            refresh_time_seconds=refresh_time,
            success=False,
            error_message=error_msg
        )
|
||||
|
||||
|
||||
def get_patient_summary_stats(db_manager: Optional[DatabaseManager] = None) -> dict:
    """
    Summarise the contents of mv_patient_treatment_summary.

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.

    Returns:
        Dictionary with MV statistics. When the MV is empty only
        total_patients (0) is present; otherwise it also holds the cost,
        intervention, drug and duration aggregates, the overall date_range,
        last_refresh, unique_directories and unique_organizations.
    """
    manager = DatabaseManager() if db_manager is None else db_manager

    # Keys for the first six columns of the aggregate query, in column order.
    # A NULL (or zero) aggregate is reported as 0.
    numeric_keys = (
        "total_cost",
        "avg_cost_per_patient",
        "total_interventions",
        "avg_interventions_per_patient",
        "avg_drugs_per_patient",
        "avg_days_treated",
    )

    stats = {}
    with manager.get_connection() as conn:
        # Total rows
        patient_total = conn.execute(
            "SELECT COUNT(*) FROM mv_patient_treatment_summary"
        ).fetchone()[0]
        stats["total_patients"] = patient_total
        if patient_total == 0:
            return stats

        # Aggregated statistics
        agg = conn.execute("""
            SELECT
                SUM(total_cost) as total_cost_all,
                AVG(total_cost) as avg_cost_per_patient,
                SUM(intervention_count) as total_interventions,
                AVG(intervention_count) as avg_interventions_per_patient,
                AVG(unique_drug_count) as avg_drugs_per_patient,
                AVG(days_treated) as avg_days_treated,
                MIN(first_seen_date) as earliest_date,
                MAX(last_seen_date) as latest_date,
                MAX(computed_at) as last_refresh
            FROM mv_patient_treatment_summary
        """).fetchone()

        for position, key in enumerate(numeric_keys):
            stats[key] = agg[position] or 0
        stats["date_range"] = (agg[6], agg[7])
        stats["last_refresh"] = agg[8]

        # Unique directories in MV
        stats["unique_directories"] = conn.execute(
            "SELECT COUNT(DISTINCT directory) FROM mv_patient_treatment_summary"
        ).fetchone()[0]

        # Unique organizations in MV
        stats["unique_organizations"] = conn.execute(
            "SELECT COUNT(DISTINCT org_name) FROM mv_patient_treatment_summary"
        ).fetchone()[0]

    return stats
|
||||
|
||||
|
||||
def verify_mv_consistency(db_manager: Optional[DatabaseManager] = None) -> tuple[bool, str]:
    """
    Compare mv_patient_treatment_summary totals against fact_interventions.

    Three figures are compared: the number of patients, the number of
    interventions, and the total cost (differences of up to 0.01 are
    tolerated to absorb floating point noise).

    Args:
        db_manager: DatabaseManager instance. Uses default if not provided.

    Returns:
        Tuple of (is_consistent, message). On mismatch the message lists
        every differing figure, separated by "; ".
    """
    manager = DatabaseManager() if db_manager is None else db_manager

    with manager.get_connection() as conn:
        # Totals straight from the fact table
        source = conn.execute("""
            SELECT
                COUNT(DISTINCT upid) as patients,
                SUM(price_actual) as total_cost,
                COUNT(*) as interventions
            FROM fact_interventions
        """).fetchone()

        # The same totals as stored in the MV
        summary = conn.execute("""
            SELECT
                COUNT(*) as patients,
                SUM(total_cost) as total_cost,
                SUM(intervention_count) as interventions
            FROM mv_patient_treatment_summary
        """).fetchone()

    # SUM() over an empty table is NULL; treat that as zero.
    src_patients, src_cost, src_events = (source[i] or 0 for i in range(3))
    mv_patients, mv_cost, mv_events = (summary[i] or 0 for i in range(3))

    problems = []

    if src_patients != mv_patients:
        problems.append(f"Patient count mismatch: fact={src_patients:,}, mv={mv_patients:,}")

    if mv_events != src_events:
        problems.append(f"Intervention count mismatch: fact={src_events:,}, mv={mv_events:,}")

    # Allow small floating point differences in cost
    cost_gap = abs(src_cost - mv_cost)
    if cost_gap > 0.01:
        problems.append(f"Cost mismatch: fact={src_cost:,.2f}, mv={mv_cost:,.2f}, diff={cost_gap:.2f}")

    if problems:
        return False, "; ".join(problems)

    return True, f"MV consistent: {mv_patients:,} patients, {mv_events:,} interventions, £{mv_cost:,.2f} total"
|
||||
+27
-438
@@ -115,43 +115,6 @@ CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_cluster ON ref_drug_
|
||||
CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_indication ON ref_drug_indication_clusters(indication);
|
||||
"""
|
||||
|
||||
REF_DRUG_SNOMED_MAPPING_SCHEMA = """
|
||||
-- Direct SNOMED code mapping from drug to indication to GP diagnosis codes
|
||||
-- Source: data/drug_snomed_mapping_enriched.csv (163K rows)
|
||||
-- Used for direct GP record matching to assign diagnosis-based directorates
|
||||
-- and to support indication-based pathway hierarchy (Trust → Search_Term → Drug → Pathway)
|
||||
CREATE TABLE IF NOT EXISTS ref_drug_snomed_mapping (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
drug_name TEXT NOT NULL, -- Original drug name from mapping
|
||||
indication TEXT NOT NULL, -- Specific indication (603 unique values)
|
||||
ta_id TEXT, -- NICE TA reference (e.g., TA568)
|
||||
search_term TEXT NOT NULL, -- Simplified grouping (187 unique values)
|
||||
snomed_code TEXT NOT NULL, -- SNOMED CT code for GP record matching
|
||||
snomed_description TEXT, -- SNOMED code description
|
||||
cleaned_drug_name TEXT NOT NULL, -- Standardized drug name for matching
|
||||
primary_directorate TEXT, -- Primary directorate for this indication
|
||||
all_directorates TEXT, -- Pipe-separated list of valid directorates
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
UNIQUE(drug_name, indication, snomed_code)
|
||||
);
|
||||
|
||||
-- Index for looking up SNOMED codes by drug name (most common access pattern)
|
||||
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug ON ref_drug_snomed_mapping(drug_name);
|
||||
|
||||
-- Index for looking up by cleaned drug name (standardized matching)
|
||||
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_cleaned ON ref_drug_snomed_mapping(cleaned_drug_name);
|
||||
|
||||
-- Index for looking up by SNOMED code (reverse lookup from GP record)
|
||||
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_snomed ON ref_drug_snomed_mapping(snomed_code);
|
||||
|
||||
-- Index for grouping by search_term (indication-based hierarchy)
|
||||
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_search_term ON ref_drug_snomed_mapping(search_term);
|
||||
|
||||
-- Composite index for drug + snomed code (common lookup pattern)
|
||||
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug_snomed
|
||||
ON ref_drug_snomed_mapping(cleaned_drug_name, snomed_code);
|
||||
"""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pathway Data Architecture Schemas
|
||||
@@ -278,6 +241,7 @@ CREATE TABLE IF NOT EXISTS pathway_refresh_log (
|
||||
snowflake_query_date_from TEXT, -- Start date of Snowflake query
|
||||
snowflake_query_date_to TEXT, -- End date of Snowflake query
|
||||
processing_duration_seconds REAL, -- How long the refresh took
|
||||
source_row_count INTEGER, -- Number of Snowflake rows fetched
|
||||
created_at TEXT DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
@@ -301,208 +265,6 @@ PATHWAY_TABLES_SCHEMA = f"""
|
||||
"""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Fact Table Schemas
|
||||
# =============================================================================
|
||||
|
||||
# DDL for the patient-intervention fact table and its indexes.
# No standalone indexes are declared on upid or org_name: each is the leftmost
# column of a composite index below, which SQLite can already use for lookups
# on that column alone, so separate single-column indexes would only add
# insert cost and database size.
FACT_INTERVENTIONS_SCHEMA = """
-- Patient intervention records (fact table)
-- Source: HCD activity data (CSV/Parquet files or Snowflake)
-- This is the main fact table storing all patient intervention events
CREATE TABLE IF NOT EXISTS fact_interventions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- Patient identification
    upid TEXT NOT NULL,                    -- Unique Patient ID (Provider Code[:3] + PersonKey)
    provider_code TEXT NOT NULL,           -- Original provider code (3-5 chars)
    person_key TEXT NOT NULL,              -- Patient key from source system

    -- Intervention details
    drug_name_raw TEXT,                    -- Original drug name from source
    drug_name_std TEXT NOT NULL,           -- Standardized drug name (via ref_drug_names)
    intervention_date DATE NOT NULL,       -- Date of intervention
    price_actual REAL NOT NULL DEFAULT 0,  -- Cost of intervention in GBP

    -- Organization and directory
    org_name TEXT,                         -- Organization name (cleaned, no commas)
    directory TEXT,                        -- Medical directory/specialty (may be "Undefined")

    -- Source tracking
    source_file TEXT,                      -- Original file this record came from
    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Additional clinical fields (optional, used in directory fallback logic)
    treatment_function_code INTEGER,
    additional_detail_1 TEXT,
    additional_detail_2 TEXT,
    additional_detail_3 TEXT,
    additional_detail_4 TEXT,
    additional_detail_5 TEXT
);

-- Primary indexes for common filter patterns used in generate_graph()

-- Drug name (standardized): Used for drug filtering
CREATE INDEX IF NOT EXISTS idx_fact_interventions_drug ON fact_interventions(drug_name_std);

-- Intervention date: Used for date range filtering (start_date, end_date, last_seen)
CREATE INDEX IF NOT EXISTS idx_fact_interventions_date ON fact_interventions(intervention_date);

-- Directory: Used for directory/specialty filtering
CREATE INDEX IF NOT EXISTS idx_fact_interventions_directory ON fact_interventions(directory);

-- Composite index for common filter combination (trust + drug + directory)
-- Also serves org_name-only (trust) filtering via its leftmost column
CREATE INDEX IF NOT EXISTS idx_fact_interventions_composite
    ON fact_interventions(org_name, drug_name_std, directory);

-- Composite index for date-based patient analysis
-- Also serves upid-only lookups (patient grouping) via its leftmost column
CREATE INDEX IF NOT EXISTS idx_fact_interventions_upid_date
    ON fact_interventions(upid, intervention_date);
"""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Materialized View Schemas (Cached Aggregations)
|
||||
# =============================================================================
|
||||
|
||||
# DDL for the per-patient summary table (a regular table used as a cached
# aggregation) and its indexes.
# Deliberately NOT declared here:
#   * an explicit index on upid - the UNIQUE constraint already creates one;
#   * single-column indexes on org_name and first_seen_date - each is the
#     leftmost column of a composite index below.
MV_PATIENT_TREATMENT_SUMMARY_SCHEMA = """
-- Materialized view of patient treatment summaries
-- Pre-computed aggregations per patient for faster pathway analysis
-- Refreshed when fact_interventions data changes
CREATE TABLE IF NOT EXISTS mv_patient_treatment_summary (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- Patient identification
    -- (UNIQUE creates the index used for fast patient lookup)
    upid TEXT NOT NULL UNIQUE,             -- Unique Patient ID

    -- Organization and directory (for filtering)
    org_name TEXT,                         -- Organization name (first org seen)
    directory TEXT,                        -- Primary directory (first directory assigned)

    -- Date range
    first_seen_date DATE NOT NULL,         -- First intervention date
    last_seen_date DATE NOT NULL,          -- Last intervention date
    days_treated INTEGER NOT NULL DEFAULT 0,  -- Duration: last_seen - first_seen

    -- Cost aggregations
    total_cost REAL NOT NULL DEFAULT 0,    -- Sum of all intervention costs
    avg_cost_per_intervention REAL,        -- Average cost per intervention

    -- Treatment summary
    intervention_count INTEGER NOT NULL DEFAULT 0,  -- Total number of interventions
    unique_drug_count INTEGER NOT NULL DEFAULT 0,   -- Number of distinct drugs

    -- Drug sequence (pipe-separated standardized drug names in chronological order)
    -- Example: "ADALIMUMAB|ETANERCEPT|INFLIXIMAB"
    drug_sequence TEXT,

    -- Drug frequency counts (JSON: {"ADALIMUMAB": 5, "ETANERCEPT": 3})
    -- Stores count of each drug for this patient
    drug_counts_json TEXT,

    -- Drug cost totals (JSON: {"ADALIMUMAB": 15000.00, "ETANERCEPT": 8000.00})
    -- Stores total cost per drug for this patient
    drug_costs_json TEXT,

    -- Per-drug date ranges (JSON: {"ADALIMUMAB": {"first": "2023-01-01", "last": "2023-06-15"}, ...})
    -- Stores first/last date for each drug
    drug_date_ranges_json TEXT,

    -- Metadata
    computed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    source_row_count INTEGER               -- Number of fact_interventions rows used
);

-- Indexes for common filter patterns
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_directory ON mv_patient_treatment_summary(directory);
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_last_seen ON mv_patient_treatment_summary(last_seen_date);

-- Composite index for date range filtering (common in generate_graph)
-- Also serves first_seen_date-only filtering via its leftmost column
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_date_range
    ON mv_patient_treatment_summary(first_seen_date, last_seen_date);

-- Composite index for org + directory + dates (full filter pattern)
-- Also serves org_name-only filtering via its leftmost column
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_filter_composite
    ON mv_patient_treatment_summary(org_name, directory, first_seen_date, last_seen_date);

-- Index for drug sequence pattern matching
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_drug_seq ON mv_patient_treatment_summary(drug_sequence);
"""

# Aggregate script for all materialized-view style tables (currently one).
MATERIALIZED_VIEWS_SCHEMA = f"""
-- Materialized Views Schema
-- Pre-computed aggregations for performance

{MV_PATIENT_TREATMENT_SUMMARY_SCHEMA}
"""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# File Tracking Schemas (Incremental Updates)
|
||||
# =============================================================================
|
||||
|
||||
# DDL for the file-tracking table and its indexes.
# No explicit index on file_path is declared: the UNIQUE(file_path) constraint
# already creates one, so a second index on the same column would only add
# write cost and database size.
PROCESSED_FILES_SCHEMA = """
-- Tracks processed data files for incremental updates
-- Enables detecting changed files by comparing hashes
-- Stores processing status and statistics
CREATE TABLE IF NOT EXISTS processed_files (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- File identification
    file_path TEXT NOT NULL,               -- Full path to the file
    file_name TEXT NOT NULL,               -- Just the filename (for display)
    file_hash TEXT NOT NULL,               -- SHA256 hash of file contents

    -- File metadata
    file_size_bytes INTEGER,               -- Size of file in bytes
    file_modified_at TIMESTAMP,            -- File's last modification timestamp

    -- Processing results
    row_count INTEGER DEFAULT 0,           -- Number of rows processed from this file
    status TEXT NOT NULL DEFAULT 'pending',  -- pending, processing, success, error
    error_message TEXT,                    -- Error details if status='error'

    -- Timestamps
    first_processed_at TIMESTAMP,          -- When first processed
    last_processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    processing_duration_seconds REAL,      -- How long processing took

    -- Uniqueness: only one record per file path
    -- Hash changes indicate file content changed (needs reprocessing)
    -- This constraint also provides the index for fast lookup by file path
    UNIQUE(file_path)
);

-- Index for finding files by status (e.g., find all pending or errored files)
CREATE INDEX IF NOT EXISTS idx_processed_files_status ON processed_files(status);

-- Index for finding files by hash (detect if same file appears at different paths)
CREATE INDEX IF NOT EXISTS idx_processed_files_hash ON processed_files(file_hash);

-- Index for finding recently processed files
CREATE INDEX IF NOT EXISTS idx_processed_files_last_processed ON processed_files(last_processed_at);
"""

# Aggregate script for all file-tracking tables (currently one).
FILE_TRACKING_SCHEMA = f"""
-- File Tracking Schema
-- Supports incremental data loading

{PROCESSED_FILES_SCHEMA}
"""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Combined Schemas
|
||||
# =============================================================================
|
||||
@@ -520,29 +282,14 @@ REFERENCE_TABLES_SCHEMA = f"""
|
||||
{REF_DRUG_DIRECTORY_MAP_SCHEMA}
|
||||
|
||||
{REF_DRUG_INDICATION_CLUSTERS_SCHEMA}
|
||||
|
||||
{REF_DRUG_SNOMED_MAPPING_SCHEMA}
|
||||
"""
|
||||
|
||||
FACT_TABLES_SCHEMA = f"""
|
||||
-- Fact Tables Schema
|
||||
-- Contains patient intervention data
|
||||
|
||||
{FACT_INTERVENTIONS_SCHEMA}
|
||||
"""
|
||||
|
||||
ALL_TABLES_SCHEMA = f"""
|
||||
-- Complete Database Schema
|
||||
-- Reference tables + Fact tables + Materialized views + File tracking + Pathway tables
|
||||
-- Reference tables + Pathway tables
|
||||
|
||||
{REFERENCE_TABLES_SCHEMA}
|
||||
|
||||
{FACT_TABLES_SCHEMA}
|
||||
|
||||
{MATERIALIZED_VIEWS_SCHEMA}
|
||||
|
||||
{FILE_TRACKING_SCHEMA}
|
||||
|
||||
{PATHWAY_TABLES_SCHEMA}
|
||||
"""
|
||||
|
||||
@@ -580,26 +327,10 @@ def drop_reference_tables(conn: sqlite3.Connection) -> None:
|
||||
DROP TABLE IF EXISTS ref_directories;
|
||||
DROP TABLE IF EXISTS ref_drug_directory_map;
|
||||
DROP TABLE IF EXISTS ref_drug_indication_clusters;
|
||||
DROP TABLE IF EXISTS ref_drug_snomed_mapping;
|
||||
""")
|
||||
logger.info("Reference tables dropped")
|
||||
|
||||
|
||||
def create_drug_snomed_mapping_table(conn: sqlite3.Connection) -> None:
    """
    Run the ref_drug_snomed_mapping DDL script against the given connection.

    The script (REF_DRUG_SNOMED_MAPPING_SCHEMA) creates the drug-to-SNOMED
    mapping table and its lookup indexes; progress is logged before and after.

    Args:
        conn: SQLite database connection.
    """
    logger.info("Creating ref_drug_snomed_mapping table...")
    ddl_script = REF_DRUG_SNOMED_MAPPING_SCHEMA
    conn.executescript(ddl_script)
    logger.info("ref_drug_snomed_mapping table created successfully")
|
||||
|
||||
|
||||
def get_reference_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
|
||||
"""
|
||||
Get row counts for all reference tables.
|
||||
@@ -616,7 +347,6 @@ def get_reference_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
|
||||
"ref_directories",
|
||||
"ref_drug_directory_map",
|
||||
"ref_drug_indication_clusters",
|
||||
"ref_drug_snomed_mapping",
|
||||
]
|
||||
counts = {}
|
||||
|
||||
@@ -647,7 +377,6 @@ def verify_reference_tables_exist(conn: sqlite3.Connection) -> list[str]:
|
||||
"ref_directories",
|
||||
"ref_drug_directory_map",
|
||||
"ref_drug_indication_clusters",
|
||||
"ref_drug_snomed_mapping",
|
||||
]
|
||||
missing = []
|
||||
|
||||
@@ -662,164 +391,6 @@ def verify_reference_tables_exist(conn: sqlite3.Connection) -> list[str]:
|
||||
return missing
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Fact Table Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def create_fact_tables(conn: sqlite3.Connection) -> None:
    """
    Create the fact tables and the materialized-view tables.

    Executes FACT_TABLES_SCHEMA followed by MATERIALIZED_VIEWS_SCHEMA on the
    connection, logging before and after.

    Args:
        conn: SQLite database connection.
    """
    logger.info("Creating fact tables...")
    # Order matters only for readability of logs; both scripts are run in turn.
    for ddl_script in (FACT_TABLES_SCHEMA, MATERIALIZED_VIEWS_SCHEMA):
        conn.executescript(ddl_script)
    logger.info("Fact tables created successfully")
|
||||
|
||||
|
||||
def drop_fact_tables(conn: sqlite3.Connection) -> None:
    """
    Remove fact_interventions and mv_patient_treatment_summary if present.

    Args:
        conn: SQLite database connection.

    Warning:
        All patient intervention data is deleted. Use with caution.
    """
    drop_script = """
        DROP TABLE IF EXISTS fact_interventions;
        DROP TABLE IF EXISTS mv_patient_treatment_summary;
    """
    logger.warning("Dropping fact tables...")
    conn.executescript(drop_script)
    logger.info("Fact tables dropped")
|
||||
|
||||
|
||||
def get_fact_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    Count the rows in each fact table (materialized views included).

    Args:
        conn: SQLite database connection.

    Returns:
        Mapping of table name to its row count, in the order
        fact_interventions, mv_patient_treatment_summary.
    """
    row_counts: dict[str, int] = {}
    # Table names come from this fixed tuple, never from caller input.
    for name in ("fact_interventions", "mv_patient_treatment_summary"):
        first_row = conn.execute(f"SELECT COUNT(*) FROM {name}").fetchone()
        row_counts[name] = 0 if not first_row else first_row[0]
    return row_counts
|
||||
|
||||
|
||||
def verify_fact_tables_exist(conn: sqlite3.Connection) -> list[str]:
    """
    Report which fact tables (materialized views included) are absent.

    Args:
        conn: SQLite database connection.

    Returns:
        Names of the expected tables not found in sqlite_master, in the order
        checked. An empty list means every table exists.
    """
    lookup_sql = "SELECT name FROM sqlite_master WHERE type='table' AND name=?"
    expected = ("fact_interventions", "mv_patient_treatment_summary")
    return [
        name
        for name in expected
        if conn.execute(lookup_sql, (name,)).fetchone() is None
    ]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# File Tracking Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def create_file_tracking_tables(conn: sqlite3.Connection) -> None:
    """
    Run the file-tracking DDL script (FILE_TRACKING_SCHEMA) on the connection.

    Progress is logged before and after the script executes.

    Args:
        conn: SQLite database connection.
    """
    logger.info("Creating file tracking tables...")
    ddl_script = FILE_TRACKING_SCHEMA
    conn.executescript(ddl_script)
    logger.info("File tracking tables created successfully")
|
||||
|
||||
|
||||
def drop_file_tracking_tables(conn: sqlite3.Connection) -> None:
    """
    Remove the processed_files table if it is present.

    Args:
        conn: SQLite database connection.

    Warning:
        All file tracking history is deleted.
    """
    drop_script = """
        DROP TABLE IF EXISTS processed_files;
    """
    logger.warning("Dropping file tracking tables...")
    conn.executescript(drop_script)
    logger.info("File tracking tables dropped")
|
||||
|
||||
|
||||
def get_file_tracking_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    Count the rows in each file tracking table.

    Args:
        conn: SQLite database connection.

    Returns:
        Mapping of table name to its row count.
    """
    row_counts: dict[str, int] = {}
    # Table names come from this fixed tuple, never from caller input.
    for name in ("processed_files",):
        first_row = conn.execute(f"SELECT COUNT(*) FROM {name}").fetchone()
        row_counts[name] = 0 if not first_row else first_row[0]
    return row_counts
|
||||
|
||||
|
||||
def verify_file_tracking_tables_exist(conn: sqlite3.Connection) -> list[str]:
    """
    Report which file tracking tables are absent.

    Args:
        conn: SQLite database connection.

    Returns:
        Names of the expected tables not found in sqlite_master. An empty
        list means every table exists.
    """
    lookup_sql = "SELECT name FROM sqlite_master WHERE type='table' AND name=?"
    expected = ("processed_files",)
    return [
        name
        for name in expected
        if conn.execute(lookup_sql, (name,)).fetchone() is None
    ]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pathway Table Helper Functions
|
||||
# =============================================================================
|
||||
@@ -1050,13 +621,37 @@ def migrate_pathway_nodes_chart_type(conn: sqlite3.Connection) -> tuple[bool, st
|
||||
return False, f"Migration failed: {e}"
|
||||
|
||||
|
||||
def migrate_refresh_log_source_row_count(conn: sqlite3.Connection) -> tuple[bool, str]:
    """Ensure pathway_refresh_log has a source_row_count column.

    This column stores the Snowflake row count for display in the UI footer.
    The migration is a no-op when the column is already present.

    Args:
        conn: SQLite database connection.

    Returns:
        ``(success, message)``: ``True`` when the column already existed or
        was added; ``False`` with the error text when the ALTER TABLE or the
        commit raised.
    """
    table_info = conn.execute("PRAGMA table_info(pathway_refresh_log)").fetchall()
    # The column name is the second field of each PRAGMA table_info row.
    existing_columns = [info_row[1] for info_row in table_info]
    if "source_row_count" in existing_columns:
        return True, "source_row_count column already exists"

    logger.info("Adding source_row_count column to pathway_refresh_log...")
    try:
        conn.execute("""
            ALTER TABLE pathway_refresh_log
            ADD COLUMN source_row_count INTEGER
        """)
        conn.commit()
    except Exception as e:
        logger.error(f"Failed to add source_row_count column: {e}")
        return False, f"Migration failed: {e}"
    else:
        return True, "Added source_row_count column"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Combined Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def create_all_tables(conn: sqlite3.Connection) -> None:
|
||||
"""
|
||||
Create all tables (reference + fact) in the database.
|
||||
Create all tables (reference + pathway) in the database.
|
||||
|
||||
Args:
|
||||
conn: SQLite database connection.
|
||||
@@ -1078,8 +673,6 @@ def drop_all_tables(conn: sqlite3.Connection) -> None:
|
||||
"""
|
||||
logger.warning("Dropping all tables...")
|
||||
drop_pathway_tables(conn)
|
||||
drop_file_tracking_tables(conn)
|
||||
drop_fact_tables(conn)
|
||||
drop_reference_tables(conn)
|
||||
logger.info("All tables dropped")
|
||||
|
||||
@@ -1096,8 +689,6 @@ def get_all_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
|
||||
"""
|
||||
counts = {}
|
||||
counts.update(get_reference_table_counts(conn))
|
||||
counts.update(get_fact_table_counts(conn))
|
||||
counts.update(get_file_tracking_counts(conn))
|
||||
counts.update(get_pathway_table_counts(conn))
|
||||
return counts
|
||||
|
||||
@@ -1114,7 +705,5 @@ def verify_all_tables_exist(conn: sqlite3.Connection) -> list[str]:
|
||||
"""
|
||||
missing = []
|
||||
missing.extend(verify_reference_tables_exist(conn))
|
||||
missing.extend(verify_fact_tables_exist(conn))
|
||||
missing.extend(verify_file_tracking_tables_exist(conn))
|
||||
missing.extend(verify_pathway_tables_exist(conn))
|
||||
return missing
|
||||
|
||||
Reference in New Issue
Block a user