HighCostDrugsDemo/data_processing/schema.py

"""
SQLite schema definitions for NHS High-Cost Drug Patient Pathway Analysis Tool.

Contains SQL strings for creating reference tables, fact tables, and indexes.
Schema design supports:
- Reference data from CSV files (drug names, organizations, directories)
- Drug-directory mappings with single-valid-directory flag
- Patient intervention facts with proper indexing
- Cached aggregations for performance
- File tracking for incremental updates
"""

from typing import Optional
import sqlite3

from core.logging_config import get_logger

logger = get_logger(__name__)


# =============================================================================
# Reference Table Schemas
# =============================================================================

REF_DRUG_NAMES_SCHEMA = """
-- Mapping from raw drug names (as they appear in source data) to standardized names
-- Source: data/drugnames.csv
CREATE TABLE IF NOT EXISTS ref_drug_names (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    raw_name TEXT NOT NULL UNIQUE,
    standard_name TEXT NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Index for fast lookups during data transformation
CREATE INDEX IF NOT EXISTS idx_ref_drug_names_raw ON ref_drug_names(raw_name);
CREATE INDEX IF NOT EXISTS idx_ref_drug_names_standard ON ref_drug_names(standard_name);
"""

REF_ORGANIZATIONS_SCHEMA = """
-- NHS organization codes and names
-- Source: data/org_codes.csv
CREATE TABLE IF NOT EXISTS ref_organizations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    org_code TEXT NOT NULL UNIQUE,
    org_name TEXT NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Index for fast lookups by organization code
CREATE INDEX IF NOT EXISTS idx_ref_organizations_code ON ref_organizations(org_code);
"""

REF_DIRECTORIES_SCHEMA = """
-- Medical directories/specialties
-- Source: data/directory_list.csv
CREATE TABLE IF NOT EXISTS ref_directories (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    directory_name TEXT NOT NULL UNIQUE,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Index for fast lookups by directory name
CREATE INDEX IF NOT EXISTS idx_ref_directories_name ON ref_directories(directory_name);
"""

REF_DRUG_DIRECTORY_MAP_SCHEMA = """
-- Mapping from drug names to valid directories
-- Source: data/drug_directory_list.csv
-- A drug may map to multiple directories (one row per drug-directory pair)
-- The is_single_valid flag indicates drugs with exactly ONE valid directory,
-- which enables automatic directory assignment in department_identification()
CREATE TABLE IF NOT EXISTS ref_drug_directory_map (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    drug_name TEXT NOT NULL,
    directory_name TEXT NOT NULL,
    is_single_valid BOOLEAN NOT NULL DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(drug_name, directory_name)
);

-- Index for looking up directories by drug name (most common access pattern)
CREATE INDEX IF NOT EXISTS idx_ref_drug_directory_map_drug ON ref_drug_directory_map(drug_name);

-- Index for reverse lookup (find drugs by directory)
CREATE INDEX IF NOT EXISTS idx_ref_drug_directory_map_directory ON ref_drug_directory_map(directory_name);

-- Index for quick filtering of single-valid drugs
CREATE INDEX IF NOT EXISTS idx_ref_drug_directory_map_single ON ref_drug_directory_map(is_single_valid);
"""

REF_DRUG_INDICATION_CLUSTERS_SCHEMA = """
-- Mapping from drugs to SNOMED clusters for indication validation
-- Source: data/drug_indication_clusters.csv
-- Used to validate that patients have appropriate GP diagnoses for their prescribed drugs
-- A drug may map to multiple clusters (one row per drug-indication-cluster combination)
CREATE TABLE IF NOT EXISTS ref_drug_indication_clusters (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    drug_name TEXT NOT NULL,
    indication TEXT NOT NULL,
    cluster_id TEXT NOT NULL,
    cluster_description TEXT,
    nice_ta_reference TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(drug_name, indication, cluster_id)
);

-- Index for looking up clusters by drug name (most common access pattern)
CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_drug ON ref_drug_indication_clusters(drug_name);

-- Index for looking up drugs by cluster (for finding all drugs treating a condition)
CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_cluster ON ref_drug_indication_clusters(cluster_id);

-- Index for looking up by indication text
CREATE INDEX IF NOT EXISTS idx_ref_drug_indication_clusters_indication ON ref_drug_indication_clusters(indication);
"""

REF_DRUG_SNOMED_MAPPING_SCHEMA = """
-- Direct SNOMED code mapping from drug to indication to GP diagnosis codes
-- Source: data/drug_snomed_mapping_enriched.csv (163K rows)
-- Used for direct GP record matching to assign diagnosis-based directorates
-- and to support indication-based pathway hierarchy (Trust → Search_Term → Drug → Pathway)
CREATE TABLE IF NOT EXISTS ref_drug_snomed_mapping (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    drug_name TEXT NOT NULL,                -- Original drug name from mapping
    indication TEXT NOT NULL,               -- Specific indication (603 unique values)
    ta_id TEXT,                             -- NICE TA reference (e.g., TA568)
    search_term TEXT NOT NULL,              -- Simplified grouping (187 unique values)
    snomed_code TEXT NOT NULL,              -- SNOMED CT code for GP record matching
    snomed_description TEXT,                -- SNOMED code description
    cleaned_drug_name TEXT NOT NULL,        -- Standardized drug name for matching
    primary_directorate TEXT,               -- Primary directorate for this indication
    all_directorates TEXT,                  -- Pipe-separated list of valid directorates
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(drug_name, indication, snomed_code)
);

-- Index for looking up SNOMED codes by drug name (most common access pattern)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug ON ref_drug_snomed_mapping(drug_name);

-- Index for looking up by cleaned drug name (standardized matching)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_cleaned ON ref_drug_snomed_mapping(cleaned_drug_name);

-- Index for looking up by SNOMED code (reverse lookup from GP record)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_snomed ON ref_drug_snomed_mapping(snomed_code);

-- Index for grouping by search_term (indication-based hierarchy)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_search_term ON ref_drug_snomed_mapping(search_term);

-- Composite index for drug + snomed code (common lookup pattern)
CREATE INDEX IF NOT EXISTS idx_ref_drug_snomed_mapping_drug_snomed
    ON ref_drug_snomed_mapping(cleaned_drug_name, snomed_code);
"""


# =============================================================================
# Pathway Data Architecture Schemas
# =============================================================================

PATHWAY_DATE_FILTERS_SCHEMA = """
-- Stores the 6 pre-computed date filter combinations
-- Each combination represents a different initiated/last_seen date range
-- Used to efficiently query pre-computed pathway data
CREATE TABLE IF NOT EXISTS pathway_date_filters (
    id TEXT PRIMARY KEY,                    -- e.g., 'all_6mo', '1yr_12mo'
    initiated_label TEXT NOT NULL,          -- e.g., 'All years', 'Last 1 year', 'Last 2 years'
    last_seen_label TEXT NOT NULL,          -- e.g., 'Last 6 months', 'Last 12 months'
    initiated_years INTEGER,                -- NULL for 'All', 1, or 2
    last_seen_months INTEGER NOT NULL,      -- 6 or 12
    is_default INTEGER DEFAULT 0,           -- 1 for 'all_6mo' (default selection)
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Pre-populate the 6 combinations
INSERT OR REPLACE INTO pathway_date_filters (id, initiated_label, last_seen_label, initiated_years, last_seen_months, is_default) VALUES
    ('all_6mo', 'All years', 'Last 6 months', NULL, 6, 1),
    ('all_12mo', 'All years', 'Last 12 months', NULL, 12, 0),
    ('1yr_6mo', 'Last 1 year', 'Last 6 months', 1, 6, 0),
    ('1yr_12mo', 'Last 1 year', 'Last 12 months', 1, 12, 0),
    ('2yr_6mo', 'Last 2 years', 'Last 6 months', 2, 6, 0),
    ('2yr_12mo', 'Last 2 years', 'Last 12 months', 2, 12, 0);
"""

PATHWAY_NODES_SCHEMA = """
-- Main pathway nodes table (one set per date filter combination)
-- Stores pre-computed pathway hierarchy with all visualization data
-- Designed for fast filtering by date_filter_id + trust/directory/drug
CREATE TABLE IF NOT EXISTS pathway_nodes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- Date filter combination this belongs to
    date_filter_id TEXT NOT NULL,

    -- Hierarchy structure (for icicle chart)
    parents TEXT NOT NULL,                  -- Parent node identifier
    ids TEXT NOT NULL,                      -- Unique node identifier (hierarchical path)
    labels TEXT NOT NULL,                   -- Display label
    level INTEGER NOT NULL,                 -- Hierarchy depth (0=root, 1=trust, 2=directory, 3+=drugs)

    -- Patient counts (accurate for this date filter combination)
    value INTEGER NOT NULL DEFAULT 0,       -- Patient count

    -- Cost metrics
    cost REAL NOT NULL DEFAULT 0.0,         -- Total cost
    costpp REAL,                            -- Cost per patient
    cost_pp_pa TEXT,                        -- Cost per patient per annum (formatted string)

    -- Visualization
    colour REAL NOT NULL DEFAULT 0.0,       -- Color value (proportion of parent)

    -- Date ranges (for this node)
    first_seen TEXT,                        -- First intervention date (ISO format)
    last_seen TEXT,                         -- Last intervention date (ISO format)
    first_seen_parent TEXT,                 -- Earliest date in parent group
    last_seen_parent TEXT,                  -- Latest date in parent group

    -- Treatment statistics
    average_spacing TEXT,                   -- Formatted treatment duration string
    average_administered TEXT,              -- JSON array of average doses per drug
    avg_days REAL,                          -- Average treatment duration in days

    -- Denormalized filter columns (for efficient WHERE clause filtering)
    trust_name TEXT,                        -- Extracted trust name from ids
    directory TEXT,                         -- Extracted directory from ids
    drug_sequence TEXT,                     -- Pipe-separated drug sequence from pathway

    -- Metadata
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    data_refresh_id TEXT,                   -- Links to pathway_refresh_log

    -- Unique per date filter + pathway
    UNIQUE(date_filter_id, ids),
    FOREIGN KEY (date_filter_id) REFERENCES pathway_date_filters(id)
);

-- Indexes for efficient filtering
-- Primary filter: select by date_filter_id
CREATE INDEX IF NOT EXISTS idx_pathway_nodes_date_filter ON pathway_nodes(date_filter_id);

-- Level filter: often used with date_filter_id
CREATE INDEX IF NOT EXISTS idx_pathway_nodes_level ON pathway_nodes(date_filter_id, level);

-- Trust filter: for Trust dropdown filtering
CREATE INDEX IF NOT EXISTS idx_pathway_nodes_trust ON pathway_nodes(date_filter_id, trust_name);

-- Directory filter: for Directory dropdown filtering
CREATE INDEX IF NOT EXISTS idx_pathway_nodes_directory ON pathway_nodes(date_filter_id, directory);

-- Drug sequence filter: for drug filtering (uses LIKE '%DRUG%')
CREATE INDEX IF NOT EXISTS idx_pathway_nodes_drug_seq ON pathway_nodes(drug_sequence);

-- Parents filter: for finding children of a node
CREATE INDEX IF NOT EXISTS idx_pathway_nodes_parents ON pathway_nodes(date_filter_id, parents);

-- Composite index for common filter combination
CREATE INDEX IF NOT EXISTS idx_pathway_nodes_filter_composite
    ON pathway_nodes(date_filter_id, trust_name, directory);
"""

PATHWAY_REFRESH_LOG_SCHEMA = """
-- Metadata table for tracking refresh status
-- Tracks when pathway data was last refreshed from Snowflake
CREATE TABLE IF NOT EXISTS pathway_refresh_log (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    refresh_id TEXT NOT NULL,               -- Unique identifier for this refresh run
    started_at TEXT NOT NULL,               -- ISO timestamp when refresh started
    completed_at TEXT,                      -- ISO timestamp when refresh completed (NULL if still running)
    status TEXT DEFAULT 'running',          -- 'running', 'completed', 'failed'
    record_count INTEGER,                   -- Total pathway_nodes records created
    date_filter_counts TEXT,                -- JSON: {"all_6mo": 1234, "all_12mo": 1567, ...}
    error_message TEXT,                     -- Error details if status='failed'
    snowflake_query_date_from TEXT,         -- Start date of Snowflake query
    snowflake_query_date_to TEXT,           -- End date of Snowflake query
    processing_duration_seconds REAL,       -- How long the refresh took
    created_at TEXT DEFAULT CURRENT_TIMESTAMP
);

-- Index for finding latest refresh
CREATE INDEX IF NOT EXISTS idx_pathway_refresh_log_started ON pathway_refresh_log(started_at DESC);

-- Index for finding by status
CREATE INDEX IF NOT EXISTS idx_pathway_refresh_log_status ON pathway_refresh_log(status);
"""

# Combined pathway schema
PATHWAY_TABLES_SCHEMA = f"""
-- Pathway Data Architecture Tables
-- Pre-computed pathway data for fast Reflex filtering

{PATHWAY_DATE_FILTERS_SCHEMA}

{PATHWAY_NODES_SCHEMA}

{PATHWAY_REFRESH_LOG_SCHEMA}
"""


# =============================================================================
# Fact Table Schemas
# =============================================================================

FACT_INTERVENTIONS_SCHEMA = """
-- Patient intervention records (fact table)
-- Source: HCD activity data (CSV/Parquet files or Snowflake)
-- This is the main fact table storing all patient intervention events
CREATE TABLE IF NOT EXISTS fact_interventions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- Patient identification
    upid TEXT NOT NULL,                     -- Unique Patient ID (Provider Code[:3] + PersonKey)
    provider_code TEXT NOT NULL,            -- Original provider code (3-5 chars)
    person_key TEXT NOT NULL,               -- Patient key from source system

    -- Intervention details
    drug_name_raw TEXT,                     -- Original drug name from source
    drug_name_std TEXT NOT NULL,            -- Standardized drug name (via ref_drug_names)
    intervention_date DATE NOT NULL,        -- Date of intervention
    price_actual REAL NOT NULL DEFAULT 0,   -- Cost of intervention in GBP

    -- Organization and directory
    org_name TEXT,                          -- Organization name (cleaned, no commas)
    directory TEXT,                         -- Medical directory/specialty (may be "Undefined")

    -- Source tracking
    source_file TEXT,                       -- Original file this record came from
    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Additional clinical fields (optional, used in directory fallback logic)
    treatment_function_code INTEGER,
    additional_detail_1 TEXT,
    additional_detail_2 TEXT,
    additional_detail_3 TEXT,
    additional_detail_4 TEXT,
    additional_detail_5 TEXT
);

-- Primary indexes for common filter patterns used in generate_graph()
-- UPID: Used for patient grouping, pathway analysis
CREATE INDEX IF NOT EXISTS idx_fact_interventions_upid ON fact_interventions(upid);

-- Drug name (standardized): Used for drug filtering
CREATE INDEX IF NOT EXISTS idx_fact_interventions_drug ON fact_interventions(drug_name_std);

-- Intervention date: Used for date range filtering (start_date, end_date, last_seen)
CREATE INDEX IF NOT EXISTS idx_fact_interventions_date ON fact_interventions(intervention_date);

-- Directory: Used for directory/specialty filtering
CREATE INDEX IF NOT EXISTS idx_fact_interventions_directory ON fact_interventions(directory);

-- Organization: Used for trust filtering (Provider Code maps to org_name)
CREATE INDEX IF NOT EXISTS idx_fact_interventions_org ON fact_interventions(org_name);

-- Composite index for common filter combination (trust + drug + directory)
CREATE INDEX IF NOT EXISTS idx_fact_interventions_composite
    ON fact_interventions(org_name, drug_name_std, directory);

-- Composite index for date-based patient analysis
CREATE INDEX IF NOT EXISTS idx_fact_interventions_upid_date
    ON fact_interventions(upid, intervention_date);
"""


# =============================================================================
# Materialized View Schemas (Cached Aggregations)
# =============================================================================

MV_PATIENT_TREATMENT_SUMMARY_SCHEMA = """
-- Materialized view of patient treatment summaries
-- Pre-computed aggregations per patient for faster pathway analysis
-- Refreshed when fact_interventions data changes
CREATE TABLE IF NOT EXISTS mv_patient_treatment_summary (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- Patient identification
    upid TEXT NOT NULL UNIQUE,              -- Unique Patient ID

    -- Organization and directory (for filtering)
    org_name TEXT,                          -- Organization name (first org seen)
    directory TEXT,                         -- Primary directory (first directory assigned)

    -- Date range
    first_seen_date DATE NOT NULL,          -- First intervention date
    last_seen_date DATE NOT NULL,           -- Last intervention date
    days_treated INTEGER NOT NULL DEFAULT 0, -- Duration: last_seen - first_seen

    -- Cost aggregations
    total_cost REAL NOT NULL DEFAULT 0,     -- Sum of all intervention costs
    avg_cost_per_intervention REAL,         -- Average cost per intervention

    -- Treatment summary
    intervention_count INTEGER NOT NULL DEFAULT 0,  -- Total number of interventions
    unique_drug_count INTEGER NOT NULL DEFAULT 0,   -- Number of distinct drugs

    -- Drug sequence (pipe-separated standardized drug names in chronological order)
    -- Example: "ADALIMUMAB|ETANERCEPT|INFLIXIMAB"
    drug_sequence TEXT,

    -- Drug frequency counts (JSON: {"ADALIMUMAB": 5, "ETANERCEPT": 3})
    -- Stores count of each drug for this patient
    drug_counts_json TEXT,

    -- Drug cost totals (JSON: {"ADALIMUMAB": 15000.00, "ETANERCEPT": 8000.00})
    -- Stores total cost per drug for this patient
    drug_costs_json TEXT,

    -- Per-drug date ranges (JSON: {"ADALIMUMAB": {"first": "2023-01-01", "last": "2023-06-15"}, ...})
    -- Stores first/last date for each drug
    drug_date_ranges_json TEXT,

    -- Metadata
    computed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    source_row_count INTEGER               -- Number of fact_interventions rows used
);

-- Index for fast patient lookup
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_upid ON mv_patient_treatment_summary(upid);

-- Indexes for common filter patterns
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_org ON mv_patient_treatment_summary(org_name);
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_directory ON mv_patient_treatment_summary(directory);
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_first_seen ON mv_patient_treatment_summary(first_seen_date);
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_last_seen ON mv_patient_treatment_summary(last_seen_date);

-- Composite index for date range filtering (common in generate_graph)
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_date_range
    ON mv_patient_treatment_summary(first_seen_date, last_seen_date);

-- Composite index for org + directory + dates (full filter pattern)
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_filter_composite
    ON mv_patient_treatment_summary(org_name, directory, first_seen_date, last_seen_date);

-- Index for drug sequence pattern matching
CREATE INDEX IF NOT EXISTS idx_mv_patient_summary_drug_seq ON mv_patient_treatment_summary(drug_sequence);
"""

MATERIALIZED_VIEWS_SCHEMA = f"""
-- Materialized Views Schema
-- Pre-computed aggregations for performance

{MV_PATIENT_TREATMENT_SUMMARY_SCHEMA}
"""


# =============================================================================
# File Tracking Schemas (Incremental Updates)
# =============================================================================

PROCESSED_FILES_SCHEMA = """
-- Tracks processed data files for incremental updates
-- Enables detecting changed files by comparing hashes
-- Stores processing status and statistics
CREATE TABLE IF NOT EXISTS processed_files (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- File identification
    file_path TEXT NOT NULL,                -- Full path to the file
    file_name TEXT NOT NULL,                -- Just the filename (for display)
    file_hash TEXT NOT NULL,                -- SHA256 hash of file contents

    -- File metadata
    file_size_bytes INTEGER,                -- Size of file in bytes
    file_modified_at TIMESTAMP,             -- File's last modification timestamp

    -- Processing results
    row_count INTEGER DEFAULT 0,            -- Number of rows processed from this file
    status TEXT NOT NULL DEFAULT 'pending', -- pending, processing, success, error
    error_message TEXT,                     -- Error details if status='error'

    -- Timestamps
    first_processed_at TIMESTAMP,           -- When first processed
    last_processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    processing_duration_seconds REAL,       -- How long processing took

    -- Uniqueness: only one record per file path
    -- Hash changes indicate file content changed (needs reprocessing)
    UNIQUE(file_path)
);

-- Index for fast lookup by file path
CREATE INDEX IF NOT EXISTS idx_processed_files_path ON processed_files(file_path);

-- Index for finding files by status (e.g., find all pending or errored files)
CREATE INDEX IF NOT EXISTS idx_processed_files_status ON processed_files(status);

-- Index for finding files by hash (detect if same file appears at different paths)
CREATE INDEX IF NOT EXISTS idx_processed_files_hash ON processed_files(file_hash);

-- Index for finding recently processed files
CREATE INDEX IF NOT EXISTS idx_processed_files_last_processed ON processed_files(last_processed_at);
"""

FILE_TRACKING_SCHEMA = f"""
-- File Tracking Schema
-- Supports incremental data loading

{PROCESSED_FILES_SCHEMA}
"""


# =============================================================================
# Combined Schemas
# =============================================================================

REFERENCE_TABLES_SCHEMA = f"""
-- Reference Tables Schema
-- Contains lookup data migrated from CSV files

{REF_DRUG_NAMES_SCHEMA}

{REF_ORGANIZATIONS_SCHEMA}

{REF_DIRECTORIES_SCHEMA}

{REF_DRUG_DIRECTORY_MAP_SCHEMA}

{REF_DRUG_INDICATION_CLUSTERS_SCHEMA}

{REF_DRUG_SNOMED_MAPPING_SCHEMA}
"""

FACT_TABLES_SCHEMA = f"""
-- Fact Tables Schema
-- Contains patient intervention data

{FACT_INTERVENTIONS_SCHEMA}
"""

ALL_TABLES_SCHEMA = f"""
-- Complete Database Schema
-- Reference tables + Fact tables + Materialized views + File tracking + Pathway tables

{REFERENCE_TABLES_SCHEMA}

{FACT_TABLES_SCHEMA}

{MATERIALIZED_VIEWS_SCHEMA}

{FILE_TRACKING_SCHEMA}

{PATHWAY_TABLES_SCHEMA}
"""


# =============================================================================
# Schema Helper Functions
# =============================================================================

def create_reference_tables(conn: sqlite3.Connection) -> None:
    """
    Create all reference tables in the database.

    Args:
        conn: SQLite database connection.
    """
    logger.info("Creating reference tables...")
    conn.executescript(REFERENCE_TABLES_SCHEMA)
    logger.info("Reference tables created successfully")


def drop_reference_tables(conn: sqlite3.Connection) -> None:
    """
    Drop all reference tables from the database.

    Args:
        conn: SQLite database connection.

    Warning:
        This will delete all reference data. Use with caution.
    """
    logger.warning("Dropping reference tables...")
    conn.executescript("""
        DROP TABLE IF EXISTS ref_drug_names;
        DROP TABLE IF EXISTS ref_organizations;
        DROP TABLE IF EXISTS ref_directories;
        DROP TABLE IF EXISTS ref_drug_directory_map;
        DROP TABLE IF EXISTS ref_drug_indication_clusters;
        DROP TABLE IF EXISTS ref_drug_snomed_mapping;
    """)
    logger.info("Reference tables dropped")


def create_drug_snomed_mapping_table(conn: sqlite3.Connection) -> None:
    """
    Create the ref_drug_snomed_mapping table for direct SNOMED code mapping.

    This table stores mappings from drugs to SNOMED codes for GP record matching,
    enabling diagnosis-based directorate assignment and indication-based pathways.

    Args:
        conn: SQLite database connection.
    """
    logger.info("Creating ref_drug_snomed_mapping table...")
    conn.executescript(REF_DRUG_SNOMED_MAPPING_SCHEMA)
    logger.info("ref_drug_snomed_mapping table created successfully")


def get_reference_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    Get row counts for all reference tables.

    Args:
        conn: SQLite database connection.

    Returns:
        Dictionary mapping table name to row count.
    """
    tables = [
        "ref_drug_names",
        "ref_organizations",
        "ref_directories",
        "ref_drug_directory_map",
        "ref_drug_indication_clusters",
        "ref_drug_snomed_mapping",
    ]
    counts = {}

    for table in tables:
        try:
            cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
            result = cursor.fetchone()
            counts[table] = result[0] if result else 0
        except sqlite3.OperationalError:
            counts[table] = 0

    return counts


def verify_reference_tables_exist(conn: sqlite3.Connection) -> list[str]:
    """
    Verify that all reference tables exist.

    Args:
        conn: SQLite database connection.

    Returns:
        List of missing table names. Empty list means all tables exist.
    """
    required_tables = [
        "ref_drug_names",
        "ref_organizations",
        "ref_directories",
        "ref_drug_directory_map",
        "ref_drug_indication_clusters",
        "ref_drug_snomed_mapping",
    ]
    missing = []

    for table in required_tables:
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table,)
        )
        if cursor.fetchone() is None:
            missing.append(table)

    return missing


# =============================================================================
# Fact Table Helper Functions
# =============================================================================

def create_fact_tables(conn: sqlite3.Connection) -> None:
    """
    Create all fact tables in the database (including materialized views).

    Args:
        conn: SQLite database connection.
    """
    logger.info("Creating fact tables...")
    conn.executescript(FACT_TABLES_SCHEMA)
    conn.executescript(MATERIALIZED_VIEWS_SCHEMA)
    logger.info("Fact tables created successfully")


def drop_fact_tables(conn: sqlite3.Connection) -> None:
    """
    Drop all fact tables from the database.

    Args:
        conn: SQLite database connection.

    Warning:
        This will delete all patient intervention data. Use with caution.
    """
    logger.warning("Dropping fact tables...")
    conn.executescript("""
        DROP TABLE IF EXISTS fact_interventions;
        DROP TABLE IF EXISTS mv_patient_treatment_summary;
    """)
    logger.info("Fact tables dropped")


def get_fact_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    Get row counts for all fact tables (including materialized views).

    Args:
        conn: SQLite database connection.

    Returns:
        Dictionary mapping table name to row count.
    """
    tables = ["fact_interventions", "mv_patient_treatment_summary"]
    counts = {}

    for table in tables:
        cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
        result = cursor.fetchone()
        counts[table] = result[0] if result else 0

    return counts


def verify_fact_tables_exist(conn: sqlite3.Connection) -> list[str]:
    """
    Verify that all fact tables exist (including materialized views).

    Args:
        conn: SQLite database connection.

    Returns:
        List of missing table names. Empty list means all tables exist.
    """
    required_tables = ["fact_interventions", "mv_patient_treatment_summary"]
    missing = []

    for table in required_tables:
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table,)
        )
        if cursor.fetchone() is None:
            missing.append(table)

    return missing


# =============================================================================
# File Tracking Helper Functions
# =============================================================================

def create_file_tracking_tables(conn: sqlite3.Connection) -> None:
    """
    Create file tracking tables in the database.

    Args:
        conn: SQLite database connection.
    """
    logger.info("Creating file tracking tables...")
    conn.executescript(FILE_TRACKING_SCHEMA)
    logger.info("File tracking tables created successfully")


def drop_file_tracking_tables(conn: sqlite3.Connection) -> None:
    """
    Drop file tracking tables from the database.

    Args:
        conn: SQLite database connection.

    Warning:
        This will delete all file tracking history.
    """
    logger.warning("Dropping file tracking tables...")
    conn.executescript("""
        DROP TABLE IF EXISTS processed_files;
    """)
    logger.info("File tracking tables dropped")


def get_file_tracking_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    Get row counts for file tracking tables.

    Args:
        conn: SQLite database connection.

    Returns:
        Dictionary mapping table name to row count.
    """
    tables = ["processed_files"]
    counts = {}

    for table in tables:
        cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
        result = cursor.fetchone()
        counts[table] = result[0] if result else 0

    return counts


def verify_file_tracking_tables_exist(conn: sqlite3.Connection) -> list[str]:
    """
    Verify that file tracking tables exist.

    Args:
        conn: SQLite database connection.

    Returns:
        List of missing table names. Empty list means all tables exist.
    """
    required_tables = ["processed_files"]
    missing = []

    for table in required_tables:
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table,)
        )
        if cursor.fetchone() is None:
            missing.append(table)

    return missing


# =============================================================================
# Pathway Table Helper Functions
# =============================================================================

def create_pathway_tables(conn: sqlite3.Connection) -> None:
    """
    Create pathway data architecture tables in the database.

    Creates:
    - pathway_date_filters: 6 pre-defined date filter combinations
    - pathway_nodes: Pre-computed pathway hierarchy data
    - pathway_refresh_log: Refresh tracking metadata

    Args:
        conn: SQLite database connection.
    """
    logger.info("Creating pathway tables...")
    conn.executescript(PATHWAY_TABLES_SCHEMA)
    logger.info("Pathway tables created successfully")


def drop_pathway_tables(conn: sqlite3.Connection) -> None:
    """
    Drop pathway data architecture tables from the database.

    Args:
        conn: SQLite database connection.

    Warning:
        This will delete all pre-computed pathway data.
    """
    logger.warning("Dropping pathway tables...")
    conn.executescript("""
        DROP TABLE IF EXISTS pathway_nodes;
        DROP TABLE IF EXISTS pathway_refresh_log;
        DROP TABLE IF EXISTS pathway_date_filters;
    """)
    logger.info("Pathway tables dropped")


def get_pathway_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    Get row counts for pathway tables.

    Args:
        conn: SQLite database connection.

    Returns:
        Dictionary mapping table name to row count.
    """
    tables = ["pathway_date_filters", "pathway_nodes", "pathway_refresh_log"]
    counts = {}

    for table in tables:
        try:
            cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
            result = cursor.fetchone()
            counts[table] = result[0] if result else 0
        except sqlite3.OperationalError:
            # Table doesn't exist yet
            counts[table] = 0

    return counts


def verify_pathway_tables_exist(conn: sqlite3.Connection) -> list[str]:
    """
    Verify that pathway tables exist.

    Args:
        conn: SQLite database connection.

    Returns:
        List of missing table names. Empty list means all tables exist.
    """
    required_tables = ["pathway_date_filters", "pathway_nodes", "pathway_refresh_log"]
    missing = []

    for table in required_tables:
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table,)
        )
        if cursor.fetchone() is None:
            missing.append(table)

    return missing


def clear_pathway_nodes(conn: sqlite3.Connection, date_filter_id: str | None = None) -> int:
    """
    Clear pathway nodes, optionally for a specific date filter.

    Args:
        conn: SQLite database connection.
        date_filter_id: If provided, only clear nodes for this date filter.
                        If None, clear all pathway nodes.

    Returns:
        Number of rows deleted.
    """
    if date_filter_id:
        cursor = conn.execute(
            "DELETE FROM pathway_nodes WHERE date_filter_id = ?",
            (date_filter_id,)
        )
    else:
        cursor = conn.execute("DELETE FROM pathway_nodes")

    deleted_count = cursor.rowcount
    conn.commit()
    logger.info(f"Cleared {deleted_count} pathway nodes")
    return deleted_count


def get_pathway_refresh_status(conn: sqlite3.Connection) -> dict | None:
    """
    Get the status of the most recent pathway refresh.

    Args:
        conn: SQLite database connection.

    Returns:
        Dictionary with refresh status, or None if no refresh has been done.
    """
    try:
        cursor = conn.execute("""
            SELECT refresh_id, started_at, completed_at, status, record_count,
                   date_filter_counts, error_message, processing_duration_seconds
            FROM pathway_refresh_log
            ORDER BY started_at DESC
            LIMIT 1
        """)
        row = cursor.fetchone()
        if row:
            return {
                "refresh_id": row[0],
                "started_at": row[1],
                "completed_at": row[2],
                "status": row[3],
                "record_count": row[4],
                "date_filter_counts": row[5],
                "error_message": row[6],
                "processing_duration_seconds": row[7],
            }
        return None
    except sqlite3.OperationalError:
        # Table doesn't exist yet
        return None


# =============================================================================
# Combined Helper Functions
# =============================================================================

def create_all_tables(conn: sqlite3.Connection) -> None:
    """
    Create all tables (reference + fact) in the database.

    Args:
        conn: SQLite database connection.
    """
    logger.info("Creating all database tables...")
    conn.executescript(ALL_TABLES_SCHEMA)
    logger.info("All tables created successfully")


def drop_all_tables(conn: sqlite3.Connection) -> None:
    """
    Drop all tables from the database.

    Args:
        conn: SQLite database connection.

    Warning:
        This will delete all data. Use with extreme caution.
    """
    logger.warning("Dropping all tables...")
    drop_pathway_tables(conn)
    drop_file_tracking_tables(conn)
    drop_fact_tables(conn)
    drop_reference_tables(conn)
    logger.info("All tables dropped")


def get_all_table_counts(conn: sqlite3.Connection) -> dict[str, int]:
    """
    Get row counts for all tables.

    Args:
        conn: SQLite database connection.

    Returns:
        Dictionary mapping table name to row count.
    """
    counts = {}
    counts.update(get_reference_table_counts(conn))
    counts.update(get_fact_table_counts(conn))
    counts.update(get_file_tracking_counts(conn))
    counts.update(get_pathway_table_counts(conn))
    return counts


def verify_all_tables_exist(conn: sqlite3.Connection) -> list[str]:
    """
    Verify that all tables exist.

    Args:
        conn: SQLite database connection.

    Returns:
        List of missing table names. Empty list means all tables exist.
    """
    missing = []
    missing.extend(verify_reference_tables_exist(conn))
    missing.extend(verify_fact_tables_exist(conn))
    missing.extend(verify_file_tracking_tables_exist(conn))
    missing.extend(verify_pathway_tables_exist(conn))
    return missing