HighCostDrugsDemo/data_processing/diagnosis_lookup.py

"""
Diagnosis lookup module for NHS Patient Pathway Analysis.

Provides functions to validate patient indications by checking GP diagnosis records
against SNOMED cluster codes. Uses the drug-to-cluster mapping from
drug_indication_clusters.csv and queries Snowflake for SNOMED codes and GP records.

Key workflow:
1. Get drug's valid indication clusters from local mapping
2. Get all SNOMED codes for those clusters from Snowflake
3. Check if patient has any of those SNOMED codes in GP records
4. Report indication validation status

IMPORTANT: HCD activity data indication codes are UNRELIABLE. This module uses
GP/Primary Care data (PrimaryCareClinicalCoding) as the authoritative source.
"""

from dataclasses import dataclass, field
from datetime import date, datetime
from pathlib import Path
from typing import Optional, Callable, Any, cast, TYPE_CHECKING
import csv

if TYPE_CHECKING:
    import pandas as pd

from core.logging_config import get_logger
from data_processing.database import DatabaseManager, default_db_manager
from data_processing.snowflake_connector import (
    SnowflakeConnector,
    get_connector,
    is_snowflake_available,
    is_snowflake_configured,
    SNOWFLAKE_AVAILABLE,
)
from data_processing.cache import get_cache, is_cache_enabled

logger = get_logger(__name__)


@dataclass
class ClusterSnomedCodes:
    """SNOMED codes for a clinical coding cluster.

    Returned by get_cluster_snomed_codes(). When that lookup cannot run or
    finds nothing, an instance with an empty description and no codes is
    returned instead of raising.
    """
    # Cluster identifier, e.g. 'RARTH_COD' or 'PSORIASIS_COD'.
    cluster_id: str
    # Human-readable description of the cluster ("" when unknown).
    cluster_description: str
    # SNOMED codes belonging to the cluster.
    snomed_codes: list[str] = field(default_factory=list)
    # Maps each SNOMED code to its description.
    snomed_descriptions: dict[str, str] = field(default_factory=dict)

    @property
    def code_count(self) -> int:
        """Return the number of entries in ``snomed_codes``."""
        return len(self.snomed_codes)


@dataclass
class IndicationValidationResult:
    """Result of validating a patient's indication for a drug.

    Produced by validate_indication() and batch_validate_indications().
    The ``matched_*`` fields are only populated when a GP record matched.
    """
    # Patient's pseudonymised NHS number.
    patient_pseudonym: str
    # Drug name the indication was validated for.
    drug_name: str
    # True when a GP record matched one of the drug's clusters.
    has_valid_indication: bool
    # Cluster, SNOMED code and description of the matching GP record, if any.
    matched_cluster_id: Optional[str] = None
    matched_snomed_code: Optional[str] = None
    matched_snomed_description: Optional[str] = None
    # Cluster IDs that were checked for this drug.
    checked_clusters: list[str] = field(default_factory=list)
    # NOTE(review): not populated by the validation functions visible in
    # this module, so it stays 0 - confirm whether any caller sets it.
    total_codes_checked: int = 0
    # "GP_SNOMED" when a GP match was found, "NONE" otherwise.
    source: str = "GP_SNOMED"  # GP_SNOMED | NONE
    # Reason validation could not run (e.g. no cluster mappings, Snowflake
    # unavailable); None otherwise.
    error_message: Optional[str] = None


@dataclass
class DrugIndicationMatchRate:
    """Match rate statistics for a drug's indication validation.

    Produced by get_indication_match_rate() for one drug across a list of
    patients.
    """
    # Drug the statistics refer to.
    drug_name: str
    # Number of patients that were checked.
    total_patients: int
    # Patients with / without a matching GP diagnosis.
    patients_with_indication: int
    patients_without_indication: int
    # patients_with_indication / total_patients (0.0 when there are no patients).
    match_rate: float  # 0.0 to 1.0
    # Cluster IDs the patients were checked against.
    clusters_checked: list[str] = field(default_factory=list)
    # Capped sample of pseudonyms for patients that did not match.
    sample_unmatched: list[str] = field(default_factory=list)  # Sample patient IDs


def get_drug_clusters(
    drug_name: str,
    db_manager: Optional[DatabaseManager] = None
) -> list[dict]:
    """
    Look up the SNOMED cluster mappings recorded for a drug in local SQLite.

    The drug name is compared case-insensitively against the
    ref_drug_indication_clusters table. Rows are returned ordered by
    indication and then cluster_id.

    Args:
        drug_name: Drug name to look up (case-insensitive)
        db_manager: Optional DatabaseManager (defaults to default_db_manager)

    Returns:
        List of dicts with keys: drug_name, indication, cluster_id,
        cluster_description, nice_ta_reference. An empty list is returned
        when nothing matches or when the lookup raises (the error is logged).
    """
    manager = default_db_manager if db_manager is None else db_manager

    # Column names double as the keys of every returned dict.
    columns = (
        "drug_name",
        "indication",
        "cluster_id",
        "cluster_description",
        "nice_ta_reference",
    )

    query = """
        SELECT drug_name, indication, cluster_id, cluster_description, nice_ta_reference
        FROM ref_drug_indication_clusters
        WHERE UPPER(drug_name) = UPPER(?)
        ORDER BY indication, cluster_id
    """

    try:
        with manager.get_connection() as conn:
            fetched = conn.execute(query, (drug_name,)).fetchall()
            mappings = [
                {column: record[column] for column in columns}
                for record in fetched
            ]

            logger.debug(f"Found {len(mappings)} cluster mappings for drug '{drug_name}'")
            return mappings

    except Exception as e:
        # Best-effort lookup: report the failure and behave as "no mappings".
        logger.error(f"Error getting clusters for drug '{drug_name}': {e}")
        return []


def get_drug_cluster_ids(
    drug_name: str,
    db_manager: Optional[DatabaseManager] = None
) -> list[str]:
    """
    Get unique cluster IDs for a drug.

    Duplicates are removed while keeping the order in which
    get_drug_clusters() returned the rows, so the result is deterministic
    from run to run (a plain ``set`` would yield an arbitrary order).

    Args:
        drug_name: Drug name to look up
        db_manager: Optional DatabaseManager

    Returns:
        List of unique cluster IDs, in first-seen order
    """
    clusters = get_drug_clusters(drug_name, db_manager)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(c["cluster_id"] for c in clusters))


def get_cluster_snomed_codes(
    cluster_id: str,
    connector: Optional[SnowflakeConnector] = None,
    use_cache: bool = True,
) -> ClusterSnomedCodes:
    """
    Fetch every SNOMED code belonging to a cluster from Snowflake.

    Reads the ClinicalCodingClusterSnomedCodes table for the given cluster.
    Results are served from / stored in the module cache when caching is
    requested and enabled.

    Args:
        cluster_id: Cluster ID to look up (e.g., 'RARTH_COD', 'PSORIASIS_COD')
        connector: Optional SnowflakeConnector (defaults to singleton)
        use_cache: Whether to use cached results (default True)

    Returns:
        ClusterSnomedCodes with list of SNOMED codes and descriptions. An
        instance with no codes and an empty description is returned when
        Snowflake is unavailable or unconfigured, when the cluster has no
        rows, or when the query raises (the error is logged).
    """
    def _no_codes() -> ClusterSnomedCodes:
        # Shared "nothing found" result for every failure path below.
        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description="")

    if not SNOWFLAKE_AVAILABLE:
        logger.warning("Snowflake connector not available")
        return _no_codes()

    if not is_snowflake_configured():
        logger.warning("Snowflake not configured - cannot get cluster codes")
        return _no_codes()

    # Serve from cache when possible; the payload is a one-element list
    # wrapping a dict with "description", "codes" and "descriptions".
    cache_key = f"cluster_snomed_{cluster_id}"
    if use_cache and is_cache_enabled():
        hit = get_cache().get(cache_key)
        if hit is not None and len(hit) > 0:
            logger.debug(f"Using cached SNOMED codes for cluster '{cluster_id}'")
            payload = hit[0]
            return ClusterSnomedCodes(
                cluster_id=cluster_id,
                cluster_description=str(payload.get("description", "")),
                snomed_codes=list(payload.get("codes", [])),
                snomed_descriptions=dict(payload.get("descriptions", {})),
            )

    if connector is None:
        connector = get_connector()

    query = '''
        SELECT DISTINCT
            "Cluster_ID",
            "Cluster_Description",
            "SNOMEDCode",
            "SNOMEDDescription"
        FROM DATA_HUB.PHM."ClinicalCodingClusterSnomedCodes"
        WHERE "Cluster_ID" = %s
        ORDER BY "SNOMEDCode"
    '''

    try:
        rows = connector.execute_dict(query, (cluster_id,))

        if not rows:
            logger.warning(f"No SNOMED codes found for cluster '{cluster_id}'")
            return _no_codes()

        # All rows share one cluster, so the first row's description is used.
        description = rows[0].get("Cluster_Description", "")

        # Rows without a code are ignored.
        coded = [
            (record.get("SNOMEDCode"), record.get("SNOMEDDescription", ""))
            for record in rows
        ]
        codes = [code for code, _ in coded if code]
        descriptions = {code: text for code, text in coded if code}

        logger.info(f"Found {len(codes)} SNOMED codes for cluster '{cluster_id}'")

        # Store in the query-oriented cache; no real query params exist, so
        # None is passed in their place.
        if use_cache and is_cache_enabled():
            payload = {
                "description": description,
                "codes": codes,
                "descriptions": descriptions,
            }
            get_cache().set(cache_key, None, [payload])  # type: ignore[arg-type]

        return ClusterSnomedCodes(
            cluster_id=cluster_id,
            cluster_description=description,
            snomed_codes=codes,
            snomed_descriptions=descriptions,
        )

    except Exception as e:
        logger.error(f"Error getting SNOMED codes for cluster '{cluster_id}': {e}")
        return _no_codes()


def patient_has_indication(
    patient_pseudonym: str,
    cluster_ids: list[str],
    connector: Optional[SnowflakeConnector] = None,
    before_date: Optional[date] = None,
) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:
    """
    Report whether a patient's GP records contain a SNOMED code from any of
    the given clusters.

    Args:
        patient_pseudonym: Patient's pseudonymised NHS number
        cluster_ids: List of cluster IDs to check against
        connector: Optional SnowflakeConnector
        before_date: Optional date - only check diagnoses before this date

    Returns:
        Tuple of (has_indication, matched_cluster_id, matched_snomed_code, matched_description).
        (False, None, None, None) is returned when Snowflake is unavailable
        or unconfigured, when no cluster IDs are given, when nothing matches,
        or when the query raises (the error is logged).
    """
    no_match = (False, None, None, None)

    if not (SNOWFLAKE_AVAILABLE and is_snowflake_configured()):
        return no_match

    if not cluster_ids:
        return no_match

    if connector is None:
        connector = get_connector()

    # One bind placeholder per cluster ID for the IN (...) list.
    placeholders = ", ".join("%s" for _ in cluster_ids)

    # Join GP coding rows to cluster membership; one hit is sufficient.
    query = f'''
        SELECT
            pc."SNOMEDCode",
            cc."Cluster_ID",
            cc."SNOMEDDescription"
        FROM DATA_HUB.PHM."PrimaryCareClinicalCoding" pc
        INNER JOIN DATA_HUB.PHM."ClinicalCodingClusterSnomedCodes" cc
            ON pc."SNOMEDCode" = cc."SNOMEDCode"
        WHERE pc."PatientPseudonym" = %s
            AND cc."Cluster_ID" IN ({placeholders})
    '''

    # Bind values follow placeholder order: patient first, then clusters.
    params = [patient_pseudonym, *cluster_ids]

    if before_date:
        query += ' AND pc."EventDateTime" < %s'
        params.append(before_date.isoformat())

    query += ' LIMIT 1'

    try:
        rows = connector.execute_dict(query, tuple(params))

        if not rows:
            return no_match

        hit = rows[0]
        return (
            True,
            hit.get("Cluster_ID"),
            hit.get("SNOMEDCode"),
            hit.get("SNOMEDDescription"),
        )

    except Exception as e:
        logger.error(f"Error checking indication for patient '{patient_pseudonym}': {e}")
        return no_match


def validate_indication(
    patient_pseudonym: str,
    drug_name: str,
    connector: Optional[SnowflakeConnector] = None,
    db_manager: Optional[DatabaseManager] = None,
    before_date: Optional[date] = None,
) -> IndicationValidationResult:
    """
    Check whether a patient has an appropriate GP-recorded indication for a drug.

    Workflow:
    1. Resolve the drug's indication clusters from the local mapping
    2. Confirm Snowflake is installed and configured
    3. Look for a matching SNOMED code in the patient's GP records

    Args:
        patient_pseudonym: Patient's pseudonymised NHS number
        drug_name: Drug name to validate indication for
        connector: Optional SnowflakeConnector
        db_manager: Optional DatabaseManager
        before_date: Optional date - only check diagnoses before this date

    Returns:
        IndicationValidationResult with validation details. When validation
        cannot run, ``error_message`` explains why and ``source`` is "NONE".
    """
    outcome = IndicationValidationResult(
        patient_pseudonym=patient_pseudonym,
        drug_name=drug_name,
        has_valid_indication=False,
    )

    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)

    # Work out whether anything prevents the GP lookup from running.
    blocker: Optional[str] = None
    if not cluster_ids:
        blocker = f"No cluster mappings found for drug '{drug_name}'"
    else:
        outcome.checked_clusters = cluster_ids
        if not SNOWFLAKE_AVAILABLE:
            blocker = "Snowflake connector not installed"
        elif not is_snowflake_configured():
            blocker = "Snowflake not configured"

    if blocker is not None:
        outcome.error_message = blocker
        outcome.source = "NONE"
        return outcome

    # GP records are the authoritative source for the indication.
    found, cluster, code, description = patient_has_indication(
        patient_pseudonym=patient_pseudonym,
        cluster_ids=cluster_ids,
        connector=connector,
        before_date=before_date,
    )

    outcome.has_valid_indication = found
    outcome.matched_cluster_id = cluster
    outcome.matched_snomed_code = code
    outcome.matched_snomed_description = description
    outcome.source = "GP_SNOMED" if found else "NONE"

    return outcome


def get_indication_match_rate(
    drug_name: str,
    patient_pseudonyms: list[str],
    connector: Optional[SnowflakeConnector] = None,
    db_manager: Optional[DatabaseManager] = None,
    sample_unmatched_count: int = 10,
) -> DrugIndicationMatchRate:
    """
    Compute how many of the given patients have a GP-recorded indication for a drug.

    Each patient is checked individually via patient_has_indication();
    progress is logged every 100 patients.

    Args:
        drug_name: Drug name to check
        patient_pseudonyms: List of patient pseudonymised NHS numbers
        connector: Optional SnowflakeConnector
        db_manager: Optional DatabaseManager
        sample_unmatched_count: Number of unmatched patient IDs to include in sample

    Returns:
        DrugIndicationMatchRate with match statistics. When the drug has no
        cluster mappings every patient is reported as unmatched.
    """
    # Resolve the connector once so every per-patient check reuses it.
    if connector is None and SNOWFLAKE_AVAILABLE and is_snowflake_configured():
        connector = get_connector()

    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)
    total = len(patient_pseudonyms)

    if not cluster_ids:
        logger.warning(f"No cluster mappings for drug '{drug_name}' - all patients will be unmatched")
        return DrugIndicationMatchRate(
            drug_name=drug_name,
            total_patients=total,
            patients_with_indication=0,
            patients_without_indication=total,
            match_rate=0.0,
            clusters_checked=[],
            sample_unmatched=patient_pseudonyms[:sample_unmatched_count],
        )

    matched = 0
    sample_unmatched: list[str] = []

    for position, pseudonym in enumerate(patient_pseudonyms):
        if position and position % 100 == 0:
            logger.info(f"Validating indications: {position}/{total} ({100*position/total:.1f}%)")

        found = patient_has_indication(
            patient_pseudonym=pseudonym,
            cluster_ids=cluster_ids,
            connector=connector,
        )[0]

        if found:
            matched += 1
            continue

        # Keep only a bounded sample of the unmatched patients.
        if len(sample_unmatched) < sample_unmatched_count:
            sample_unmatched.append(pseudonym)

    # Every patient is either matched or unmatched.
    unmatched = total - matched
    match_rate = 0.0 if total <= 0 else matched / total

    logger.info(f"Indication match rate for '{drug_name}': {100*match_rate:.1f}% ({matched}/{total})")

    return DrugIndicationMatchRate(
        drug_name=drug_name,
        total_patients=total,
        patients_with_indication=matched,
        patients_without_indication=unmatched,
        match_rate=match_rate,
        clusters_checked=cluster_ids,
        sample_unmatched=sample_unmatched,
    )


def batch_validate_indications(
    patient_drug_pairs: list[tuple[str, str]],
    connector: Optional[SnowflakeConnector] = None,
    db_manager: Optional[DatabaseManager] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> list[IndicationValidationResult]:
    """
    Validate indications for multiple patient-drug pairs.

    Cluster lookups are cached per drug (case-insensitively) so each drug's
    mapping is read only once; each pair still triggers its own GP-record
    check via patient_has_indication().

    Args:
        patient_drug_pairs: List of (patient_pseudonym, drug_name) tuples
        connector: Optional SnowflakeConnector
        db_manager: Optional DatabaseManager
        progress_callback: Optional callback(current, total) for progress updates

    Returns:
        List of IndicationValidationResult for each pair, in input order.
        An empty input yields an empty list.
    """
    results: list[IndicationValidationResult] = []
    total = len(patient_drug_pairs)

    # Cache cluster lookups by upper-cased drug name
    drug_clusters_cache: dict[str, list[str]] = {}

    for i, (pseudonym, drug_name) in enumerate(patient_drug_pairs):
        if progress_callback:
            progress_callback(i + 1, total)

        # Get clusters from cache or lookup
        drug_upper = drug_name.upper()
        if drug_upper not in drug_clusters_cache:
            drug_clusters_cache[drug_upper] = get_drug_cluster_ids(drug_name, db_manager)

        cluster_ids = drug_clusters_cache[drug_upper]

        if not cluster_ids:
            # Without a mapping there is nothing to check against.
            results.append(IndicationValidationResult(
                patient_pseudonym=pseudonym,
                drug_name=drug_name,
                has_valid_indication=False,
                source="NONE",
                error_message=f"No cluster mappings for drug '{drug_name}'",
            ))
            continue

        # Check patient indication
        has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(
            patient_pseudonym=pseudonym,
            cluster_ids=cluster_ids,
            connector=connector,
        )

        results.append(IndicationValidationResult(
            patient_pseudonym=pseudonym,
            drug_name=drug_name,
            has_valid_indication=has_indication,
            matched_cluster_id=matched_cluster,
            matched_snomed_code=matched_code,
            matched_snomed_description=matched_desc,
            checked_clusters=cluster_ids,
            source="GP_SNOMED" if has_indication else "NONE",
        ))

    matched_count = sum(1 for r in results if r.has_valid_indication)
    # Guard the percentage: an empty batch must not raise ZeroDivisionError.
    matched_pct = 100 * matched_count / total if total > 0 else 0.0
    logger.info(f"Batch validation complete: {matched_count}/{total} ({matched_pct:.1f}%) with valid indications")

    return results


def get_available_clusters(
    connector: Optional[SnowflakeConnector] = None,
) -> list[dict]:
    """
    List every SNOMED cluster available in Snowflake with its code count.

    Args:
        connector: Optional SnowflakeConnector (defaults to get_connector())

    Returns:
        List of dicts with cluster_id, cluster_description, code_count.
        An empty list is returned when Snowflake is unavailable or
        unconfigured, or when the query raises (the error is logged).
    """
    if not (SNOWFLAKE_AVAILABLE and is_snowflake_configured()):
        logger.warning("Snowflake not available - cannot list clusters")
        return []

    if connector is None:
        connector = get_connector()

    query = '''
        SELECT
            "Cluster_ID",
            "Cluster_Description",
            COUNT(DISTINCT "SNOMEDCode") as code_count
        FROM DATA_HUB.PHM."ClinicalCodingClusterSnomedCodes"
        GROUP BY "Cluster_ID", "Cluster_Description"
        ORDER BY "Cluster_ID"
    '''

    try:
        # Re-key each result row to the snake_case names callers expect.
        clusters = [
            {
                "cluster_id": record.get("Cluster_ID"),
                "cluster_description": record.get("Cluster_Description"),
                "code_count": record.get("code_count", 0),
            }
            for record in connector.execute_dict(query)
        ]

        logger.info(f"Found {len(clusters)} available SNOMED clusters")
        return clusters

    except Exception as e:
        logger.error(f"Error getting available clusters: {e}")
        return []


# === Drug-to-indication mapping from DimSearchTerm.csv ===


# Maps variant Search_Terms to the canonical name they are merged into.
# load_drug_indication_mapping() applies this to each stripped Search_Term
# read from the CSV; keys are matched exactly (case-sensitive), and terms
# not listed here pass through unchanged.
# Asthma variants are clinically the same condition at different severity levels.
# Urticaria is a separate condition — do NOT merge with asthma.
SEARCH_TERM_MERGE_MAP: dict[str, str] = {
    "allergic asthma": "asthma",
    "severe persistent allergic asthma": "asthma",
}


def load_drug_indication_mapping(
    csv_path: Optional[str] = None,
) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
    """
    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.

    Builds two lookup dicts:
    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it
    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)

    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate
    CleanedDrugName is pipe-separated (e.g., "ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB").

    Note: A Search_Term can appear multiple times with different PrimaryDirectorates
    (e.g., "diabetes" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).
    Drug fragments from all rows for the same Search_Term are combined.

    Asthma-related Search_Terms ("allergic asthma", "severe persistent allergic asthma")
    are merged into "asthma" to match the CLUSTER_MAPPING_SQL normalization.
    "urticaria" stays separate.

    Rows with a missing or blank Search_Term / CleanedDrugName are skipped.
    A UTF-8 byte-order mark at the start of the file is tolerated.

    Args:
        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.

    Returns:
        Tuple of (fragment_to_search_terms, search_term_to_fragments).
        If the file is missing or reading fails, the error is logged and
        whatever was loaded up to that point (possibly nothing) is returned.
    """
    if csv_path is None:
        csv_path = str(Path(__file__).parent.parent / "data" / "DimSearchTerm.csv")

    fragment_to_search_terms: dict[str, list[str]] = {}
    search_term_to_fragments: dict[str, list[str]] = {}

    try:
        # "utf-8-sig" strips a leading BOM (otherwise the first header would
        # read "\ufeffSearch_Term" and every row would be skipped) and reads
        # BOM-less UTF-8 unchanged. newline="" is what the csv module expects.
        with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
            for row in csv.DictReader(f):
                # DictReader fills missing trailing columns with None, so
                # coalesce before stripping rather than relying on a default.
                search_term = (row.get("Search_Term") or "").strip()
                drug_names_raw = (row.get("CleanedDrugName") or "").strip()

                # Normalize asthma variants to canonical "asthma"
                search_term = SEARCH_TERM_MERGE_MAP.get(search_term, search_term)

                if not search_term or not drug_names_raw:
                    continue

                fragments = [frag.strip().upper() for frag in drug_names_raw.split("|") if frag.strip()]

                # Accumulate across duplicate Search_Terms, keeping first-seen
                # order and avoiding duplicates in both directions.
                term_fragments = search_term_to_fragments.setdefault(search_term, [])
                for frag in fragments:
                    if frag not in term_fragments:
                        term_fragments.append(frag)

                    fragment_terms = fragment_to_search_terms.setdefault(frag, [])
                    if search_term not in fragment_terms:
                        fragment_terms.append(search_term)

        logger.info(
            f"Loaded drug-indication mapping: {len(search_term_to_fragments)} Search_Terms, "
            f"{len(fragment_to_search_terms)} drug fragments"
        )

    except FileNotFoundError:
        logger.error(f"DimSearchTerm.csv not found at {csv_path}")
    except Exception as e:
        logger.error(f"Error loading DimSearchTerm.csv: {e}")

    return fragment_to_search_terms, search_term_to_fragments


def get_search_terms_for_drug(
    drug_name: str,
    search_term_to_fragments: dict[str, list[str]],
) -> list[str]:
    """
    Return the Search_Terms whose drug fragments occur inside a drug name.

    A Search_Term matches when at least one of its fragments is a SUBSTRING
    of the upper-cased drug name. This covers exact names (ADALIMUMAB) as
    well as partial fragments (PEGYLATED, INHALED). Fragments are expected
    to be upper-case already.

    Args:
        drug_name: HCD drug name (e.g., "ADALIMUMAB 40MG", "PEGYLATED LIPOSOMAL DOXORUBICIN")
        search_term_to_fragments: Mapping of search_term -> list of drug fragments

    Returns:
        List of matching Search_Terms, in the mapping's iteration order,
        each listed at most once
    """
    haystack = drug_name.upper()

    # any() stops at the first matching fragment for each Search_Term.
    return [
        term
        for term, fragments in search_term_to_fragments.items()
        if any(fragment in haystack for fragment in fragments)
    ]


def assign_drug_indications(
    df: "pd.DataFrame",
    gp_matches_df: "pd.DataFrame",
    search_term_to_fragments: dict[str, list[str]],
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Assign drug-aware indications by cross-referencing GP diagnoses with drug mappings.

    For each UPID + Drug Name pair in the HCD data:
    1. Look up patient's GP-matched Search_Terms (via PseudoNHSNoLinked → gp_matches_df)
    2. Look up which Search_Terms list this drug (via search_term_to_fragments)
    3. Intersect = valid indications for this drug-patient pair
    4. If 1 match: use it
    5. If multiple: pick highest code_frequency (most GP coding = most likely indication)
    6. If tied frequency: alphabetical Search_Term for determinism
    7. If 0 matches: fallback to "{Directory} (no GP dx)"

    Modifies UPID to include indication: "{UPID}|{search_term}" or "{UPID}|{Directory} (no GP dx)".
    This makes drugs under different indications appear as separate pathways.

    Args:
        df: HCD DataFrame with columns: UPID, Drug Name, PseudoNHSNoLinked, Directory.
            May be empty.
        gp_matches_df: GP matches from get_patient_indication_groups() with columns:
            PatientPseudonym, Search_Term, code_frequency
            (multiple rows per patient, one per matched Search_Term).
            A missing code_frequency column or a missing (NaN/None) value is
            treated as a frequency of 0.
        search_term_to_fragments: From load_drug_indication_mapping()[1].
            Maps search_term -> list of drug fragments (UPPERCASE).

    Returns:
        Tuple of (modified_df, indication_df):
        - modified_df: Copy of df with UPID replaced by "{UPID}|{indication}"
        - indication_df: DataFrame indexed by modified UPID with 'Directory' column
            containing the Search_Term (or fallback label) for pathway hierarchy.
            It has this shape (with zero rows) even when df is empty.
    """
    import pandas as pd

    modified_df = df.copy()

    # Build GP match lookup: PseudoNHSNoLinked -> {Search_Term: code_frequency}
    gp_lookup: dict[str, dict[str, int]] = {}
    if not gp_matches_df.empty:
        if 'code_frequency' in gp_matches_df.columns:
            raw_frequencies = gp_matches_df['code_frequency'].tolist()
        else:
            raw_frequencies = [0] * len(gp_matches_df)

        # Column-wise iteration avoids building a Series per row.
        for pseudo, term, raw_freq in zip(
            gp_matches_df['PatientPseudonym'].tolist(),
            gp_matches_df['Search_Term'].tolist(),
            raw_frequencies,
        ):
            # int(NaN) raises, so a missing frequency is counted as 0.
            freq = 0 if pd.isna(raw_freq) else int(raw_freq)
            gp_lookup.setdefault(pseudo, {})[term] = freq

    logger.info(f"GP lookup built: {len(gp_lookup)} patients with GP matches")

    # Cache drug -> Search_Terms lookups to avoid recomputing per pair
    drug_search_terms_cache: dict[str, list[str]] = {}

    def get_drug_terms(drug_name: str) -> list[str]:
        if drug_name not in drug_search_terms_cache:
            drug_search_terms_cache[drug_name] = get_search_terms_for_drug(
                drug_name, search_term_to_fragments
            )
        return drug_search_terms_cache[drug_name]

    # Indications are decided once per (UPID, Drug Name) pair and then applied
    # to every row of that pair.
    # Key: (UPID, Drug Name) -> (matched_search_term_or_fallback, is_diagnosis)
    pair_indication_cache: dict[tuple[str, str], tuple[str, bool]] = {}

    # Unique pairs; PseudoNHSNoLinked/Directory come from the pair's first row
    required_cols = ['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']
    unique_pairs = modified_df[required_cols].drop_duplicates(
        subset=['UPID', 'Drug Name']
    )

    match_count = 0
    fallback_count = 0
    tiebreak_count = 0

    for upid, drug_name, pseudo, directory in unique_pairs.itertuples(index=False, name=None):
        # Search_Terms this drug maps to
        drug_terms = get_drug_terms(drug_name)

        # Patient's GP-matched Search_Terms (none when the pseudonym is missing)
        patient_gp_terms = gp_lookup.get(pseudo, {}) if pd.notna(pseudo) else {}

        # Intersect: Search_Terms that list this drug AND patient has GP dx for
        valid_indications = {
            term: patient_gp_terms[term]
            for term in drug_terms
            if term in patient_gp_terms
        }

        if valid_indications:
            # Highest code_frequency wins; ties break alphabetically.
            matched_term = min(
                valid_indications,
                key=lambda term: (-valid_indications[term], term),
            )
            pair_indication_cache[(upid, drug_name)] = (matched_term, True)
            match_count += 1
            if len(valid_indications) > 1:
                tiebreak_count += 1
        else:
            # No intersection: fallback to directory
            if pd.notna(directory):
                fallback_label = f"{directory} (no GP dx)"
            else:
                fallback_label = "UNKNOWN (no GP dx)"
            pair_indication_cache[(upid, drug_name)] = (fallback_label, False)
            fallback_count += 1

    total_pairs = len(unique_pairs)
    logger.info("Drug-indication matching complete:")
    logger.info(f"  Total UPID-Drug pairs: {total_pairs}")
    if total_pairs > 0:
        logger.info(f"  Matched (GP dx + drug mapping): {match_count} ({100*match_count/total_pairs:.1f}%)")
    else:
        logger.info("  Matched: 0")
    logger.info(f"  Tiebreaker used: {tiebreak_count}")
    if total_pairs > 0:
        logger.info(f"  Fallback (no match): {fallback_count} ({100*fallback_count/total_pairs:.1f}%)")
    else:
        logger.info("  Fallback: 0")

    # Apply modified UPIDs to all rows. Building the column positionally
    # (instead of DataFrame.apply(axis=1)) also works for an empty frame,
    # where apply would hand back a DataFrame rather than a Series.
    unknown_entry = ("UNKNOWN (no GP dx)", False)  # Shouldn't happen, but fallback
    modified_df['UPID'] = [
        f"{upid}|{pair_indication_cache.get((upid, drug_name), unknown_entry)[0]}"
        for upid, drug_name in zip(
            modified_df['UPID'].tolist(), modified_df['Drug Name'].tolist()
        )
    ]

    # Build indication_df: modified UPID -> Search_Term/fallback label.
    # Several drugs of one patient can share an indication, so keep the
    # first occurrence of each modified UPID.
    indication_by_upid: dict[str, str] = {}
    for (original_upid, _drug_name), (indication, _is_diagnosis) in pair_indication_cache.items():
        indication_by_upid.setdefault(f"{original_upid}|{indication}", indication)

    indication_df = pd.DataFrame(
        {'Directory': list(indication_by_upid.values())},
        index=pd.Index(list(indication_by_upid), name='UPID'),
    )

    logger.info(f"  Unique modified UPIDs: {len(indication_df)}")

    # Log top indications
    if not indication_df.empty:
        top_terms = indication_df['Directory'].value_counts().head(5)
        logger.info(f"  Top 5 indications: {dict(top_terms)}")

    return modified_df, indication_df


# === NEW APPROACH: Query Snowflake directly using cluster CTE ===

# The cluster query mapping (embedded from snomed_indication_mapping_query.sql)
# This maps Search_Term -> Cluster_ID for ~148 clinical conditions
CLUSTER_MAPPING_SQL = """
WITH SearchTermClusters AS (
    SELECT Search_Term, Cluster_ID FROM (VALUES
        ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),
        ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),
        ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),
        ('asthma', 'AST_COD'),
        ('allergic rhinitis', 'MILDINTAST_COD'),
        ('alzheimer''s disease', 'DEMALZ_COD'),
        ('amyloidosis', 'AMYLOID_COD'),
        ('anaemia', 'eFI2_AnaemiaTimeSensitive'),
        ('anaplastic large cell lymphoma', 'C19HAEMCAN_COD'),
        ('apixaban', 'DOACCON_COD'),
        ('aplastic anaemia', 'eFI2_AnaemiaEver'),
        ('arthritis', 'eFI2_InflammatoryArthritis'),
        ('asthma', 'eFI2_Asthma'),
        ('atopic dermatitis', 'ATOPDERM_COD'),
        ('atrial fibrillation', 'eFI2_AtrialFibrillation'),
        ('attention deficit hyperactivity disorder', 'ADHD_COD'),
        ('bipolar disorder', 'MH_COD'),
        ('bladder', 'eFI2_UrinaryIncontinence'),
        ('breast cancer', 'BRCANSCR_COD'),
        ('cardiomyopathy', 'eFI2_HarmfulDrinking'),
        ('cardiovascular disease', 'CVDRISKASS_COD'),
        ('cervical cancer', 'CSDEC_COD'),
        ('cholangiocarcinoma', 'eFI2_Cancer'),
        ('chronic kidney disease', 'CKD_COD'),
        ('chronic liver disease', 'eFI2_LiverProblems'),
        ('chronic lymphocytic leukaemia', 'EPPHAEMCAN_COD'),
        ('chronic myeloid leukaemia', 'EPPHAEMCAN_COD'),
        ('chronic obstructive pulmonary disease', 'eFI2_COPD'),
        ('colon cancer', 'eFI2_Cancer'),
        ('colorectal cancer', 'GICANREF_COD'),
        ('constipation', 'CHRONCONSTIP_COD'),
        ('covid-19', 'POSSPOSTCOVID_COD'),
        ('crohn''s disease', 'eFI2_InflammatoryBowelDisease'),
        ('cutaneous t-cell lymphoma', 'C19HAEMCAN_COD'),
        ('cystic fibrosis', 'CUST_ICB_CYSTIC_FIBROSIS'),
        ('deep vein thrombosis', 'VTE_COD'),
        ('depression', 'eFI2_Depression'),
        ('diabetes', 'eFI2_DiabetesEver'),
        ('diabetic retinopathy', 'DRSELIGIBILITY_COD'),
        ('diffuse large b-cell lymphoma', 'C19HAEMCAN_COD'),
        ('dravet syndrome', 'EPIL_COD'),
        ('drug misuse', 'ILLSUBINT_COD'),
        ('dyspepsia', 'eFI2_AbdominalPain'),
        ('epilepsy', 'eFI2_Seizures'),
        ('fallopian tube', 'STERIL_COD'),
        ('follicular lymphoma', 'C19HAEMCAN_COD'),
        ('gastric cancer', 'eFI2_Cancer'),
        ('giant cell arteritis', 'GCA_COD'),
        ('glioma', 'NHAEMCANMORPH_COD'),
        ('gout', 'eFI2_InflammatoryArthritis'),
        ('graft versus host disease', 'GVHD_COD'),
        ('granulomatosis with polyangiitis', 'WEGENERVASC_COD'),
        ('growth hormone deficiency', 'HYPOPITUITARY_COD'),
        ('hand eczema', 'ECZEMA_COD'),
        ('heart failure', 'eFI2_HeartFailure'),
        ('hepatitis b', 'HEPBCVAC_COD'),
        ('hepatocellular carcinoma', 'eFI2_Cancer'),
        ('hiv', 'PREFLANG_COD'),
        ('hodgkin lymphoma', 'HAEMCANMORPH_COD'),
        ('hormone receptor', 'eFI2_ThyroidProblems'),
        ('hypercholesterolaemia', 'CLASSFH_COD'),
        ('immune thrombocytopenia', 'ITP_COD'),
        ('influenza', 'FLUINVITE_COD'),
        ('insomnia', 'eFI2_SleepProblems'),
        ('irritable bowel syndrome', 'IBS_COD'),
        ('ischaemic stroke', 'OSTR_COD'),
        ('juvenile idiopathic arthritis', 'RARTHAD_COD'),
        ('kidney transplant', 'RENALTRANSP_COD'),
        ('leukaemia', 'eFI2_Cancer'),
        ('lung cancer', 'FTCANREF_COD'),
        ('lymphoma', 'C19HAEMCAN_COD'),
        ('macular degeneration', 'CUST_ICB_VISUAL_IMPAIRMENT'),
        ('macular oedema', 'CUST_ICB_VISUAL_IMPAIRMENT'),
        ('major depressive episodes', 'eFI2_Depression'),
        ('malignant melanoma', 'eFI2_Cancer'),
        ('malignant pleural mesothelioma', 'LUNGCAN_COD'),
        ('manic episode', 'MH_COD'),
        ('mantle cell lymphoma', 'HAEMCANMORPH_COD'),
        ('melanoma', 'eFI2_Cancer'),
        ('merkel cell carcinoma', 'C19CAN_COD'),
        ('migraine', 'eFI2_Headache'),
        ('motor neurone disease', 'MND_COD'),
        ('multiple myeloma', 'C19HAEMCAN_COD'),
        ('multiple sclerosis', 'MS_COD'),
        ('myelodysplastic', 'eFI2_AnaemiaEver'),
        ('myelofibrosis', 'MDS_COD'),
        ('myocardial infarction', 'eFI2_IschaemicHeartDisease'),
        ('myotonia', 'CNDATRISK2_COD'),
        ('narcolepsy', 'LD_COD'),
        ('neuroendocrine tumour', 'LUNGCAN_COD'),
        ('non-small cell lung cancer', 'LUNGCAN_COD'),
        ('non-small-cell lung cancer', 'FTCANREF_COD'),
        ('obesity', 'BMI30_COD'),
        ('osteoarthritis', 'CUST_ICB_OSTEOARTHRITIS'),
        ('osteoporosis', 'eFI2_Osteoporosis'),
        ('osteosarcoma', 'NHAEMCANMORPH_COD'),
        ('ovarian cancer', 'C19CAN_COD'),
        ('peripheral arterial disease', 'PADEXC_COD'),
        ('plaque psoriasis', 'PSORIASIS_COD'),
        ('polycystic kidney disease', 'EPPCONGMALF_COD'),
        ('polycythaemia vera', 'C19HAEMCAN_COD'),
        ('pregnancy', 'C19PREG_COD'),
        ('primary biliary cholangitis', 'eFI2_LiverProblems'),
        ('primary hypercholesterolaemia', 'FNFHYP_COD'),
        ('prostate cancer', 'EPPSOLIDCAN_COD'),
        ('psoriasis', 'PSORIASIS_COD'),
        ('psoriatic arthritis', 'RARTHAD_COD'),
        ('pulmonary embolism', 'eFI2_RespiratoryDiseaseTimeSensitive'),
        ('pulmonary fibrosis', 'ILD_COD'),
        ('relapsing multiple sclerosis', 'MS_COD'),
        ('renal cell carcinoma', 'C19CAN_COD'),
        ('renal transplantation', 'RENALTRANSP_COD'),
        ('retinal vein occlusion', 'CUST_ICB_VISUAL_IMPAIRMENT'),
        ('rheumatoid arthritis', 'eFI2_InflammatoryArthritis'),
        ('rivaroxaban', 'DOACCON_COD'),
        ('schizophrenia', 'MH_COD'),
        ('seizures', 'LSZFREQ_COD'),
        ('sepsis', 'C19ACTIVITY_COD'),
        ('asthma', 'SEVAST_COD'),
        ('sickle cell disease', 'SICKLE_COD'),
        ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),
        ('smoking cessation', 'SMOKINGINT_COD'),
        ('soft tissue sarcoma', 'NHAEMCANMORPH_COD'),
        ('spinal muscular atrophy', 'MND_COD'),
        ('squamous cell', 'C19CAN_COD'),
        ('squamous cell carcinoma', 'C19CAN_COD'),
        ('stem cell transplant', 'ALLOTRANSP_COD'),
        ('stroke', 'eFI2_Stroke'),
        ('systemic lupus erythematosus', 'SLUPUS_COD'),
        ('systemic mastocytosis', 'HAEMCANMORPH_COD'),
        ('thrombocytopenic purpura', 'TTP_COD'),
        ('thrombotic thrombocytopenic purpura', 'TTP_COD'),
        ('thyroid cancer', 'C19CAN_COD'),
        ('tophaceous gout', 'CUST_ICB_OSTEOARTHRITIS'),
        ('transitional cell carcinoma', 'C19CAN_COD'),
        ('type 1 diabetes', 'DMTYPE1_COD'),
        ('type 2 diabetes', 'DMTYPE2_COD'),
        ('ulcerative colitis', 'eFI2_InflammatoryBowelDisease'),
        ('urothelial carcinoma', 'NHAEMCANMORPH_COD'),
        ('urticaria', 'XSAL_COD'),
        ('uveitis', 'CUST_ICB_VISUAL_IMPAIRMENT'),
        ('vascular disease', 'CVDINVITE_COD'),
        ('vasculitis', 'CRYOGLOBVASC_COD')
    ) AS t(Search_Term, Cluster_ID)
),

ClusterCodes AS (
    SELECT
        stc.Search_Term,
        c."SNOMEDCode",
        c."SNOMEDDescription"
    FROM SearchTermClusters stc
    JOIN DATA_HUB.PHM."ClinicalCodingClusterSnomedCodes" c
        ON stc.Cluster_ID = c."Cluster_ID"
    WHERE c."SNOMEDCode" IS NOT NULL
),

ExplicitCodes AS (
    SELECT Search_Term, SNOMEDCode, SNOMEDDescription FROM (VALUES
        ('acute coronary syndrome', '837091000000100', 'Manual mapping'),
        ('ankylosing spondylitis', '162930007', 'Manual mapping'),
        ('ankylosing spondylitis', '239805001', 'Manual mapping'),
        ('ankylosing spondylitis', '239810002', 'Manual mapping'),
        ('ankylosing spondylitis', '239811003', 'Manual mapping'),
        ('ankylosing spondylitis', '394990003', 'Manual mapping'),
        ('ankylosing spondylitis', '429712009', 'Manual mapping'),
        ('ankylosing spondylitis', '441562009', 'Manual mapping'),
        ('ankylosing spondylitis', '441680005', 'Manual mapping'),
        ('ankylosing spondylitis', '441930001', 'Manual mapping'),
        ('axial spondyloarthritis', '723116002', 'Manual mapping'),
        ('choroidal neovascularisation', '380621000000102', 'Manual mapping'),
        ('choroidal neovascularisation', '733124000', 'Manual mapping')
    ) AS t(Search_Term, SNOMEDCode, SNOMEDDescription)
),

AllIndicationCodes AS (
    SELECT Search_Term, "SNOMEDCode" AS SNOMEDCode, "SNOMEDDescription" AS SNOMEDDescription
    FROM ClusterCodes
    UNION ALL
    SELECT Search_Term, SNOMEDCode, SNOMEDDescription
    FROM ExplicitCodes
)
"""


def get_patient_indication_groups(
    patient_pseudonyms: list[str],
    connector: Optional[SnowflakeConnector] = None,
    batch_size: int = 5000,
    earliest_hcd_date: Optional[str] = None,
) -> "pd.DataFrame":
    """
    Batch lookup GP diagnosis-based indication groups using Snowflake cluster query.

    Returns ALL matching Search_Terms per patient with code_frequency. This enables
    drug-aware indication matching where each drug is cross-referenced against the
    patient's GP diagnoses.

    The query:
    1. Uses the cluster mapping CTE to get all Search_Term -> SNOMED code mappings
    2. Joins with PrimaryCareClinicalCoding to find patients with matching codes
    3. Groups by patient + Search_Term and counts the joined rows (code_frequency)
    4. Optionally restricts to GP codes from earliest_hcd_date onwards

    Duplicate pseudonyms are removed (first occurrence wins) before batching, so a
    patient repeated in the input cannot be queried in two batches and produce
    duplicate (patient, Search_Term) rows.

    A batch whose query raises is logged and skipped; the remaining batches are
    still processed, so the result may be partial. A warning listing the failed
    batch numbers is logged at the end when that happens.

    Args:
        patient_pseudonyms: List of PseudoNHSNoLinked values (matches PatientPseudonym in GP records)
        connector: Optional SnowflakeConnector (defaults to get_connector())
        batch_size: Number of patients per Snowflake query batch (default 5000)
        earliest_hcd_date: Optional ISO date string (YYYY-MM-DD). If provided, only
            counts GP codes from this date onwards. Should be MIN(Intervention Date)
            from the HCD DataFrame to restrict to the HCD data window.

    Returns:
        DataFrame with columns (always present, even when there are no rows):
        - PatientPseudonym: The patient identifier (PseudoNHSNoLinked value)
        - Search_Term: The matched indication (e.g., "rheumatoid arthritis")
        - code_frequency: COUNT(*) of matching GP coding rows for this Search_Term

        Multiple rows per patient (one per matched Search_Term).
        Patients not found in results have no matching GP diagnosis.

    Raises:
        ValueError: If batch_size is less than 1 (only checked once Snowflake is
            confirmed available and the patient list is non-empty).
    """
    import pandas as pd

    # Single source of truth for the output schema so every return path agrees.
    result_columns = ['PatientPseudonym', 'Search_Term', 'code_frequency']

    logger.info(f"Starting Snowflake-direct indication lookup for {len(patient_pseudonyms)} patients...")
    if earliest_hcd_date:
        logger.info(f"  Restricting GP codes to >= {earliest_hcd_date}")

    # Handle edge case: empty patient list
    if not patient_pseudonyms:
        logger.warning("Empty patient list provided")
        return pd.DataFrame(columns=result_columns)

    # Check Snowflake availability
    if not SNOWFLAKE_AVAILABLE:
        logger.error("Snowflake connector not available - cannot lookup GP records")
        return pd.DataFrame(columns=result_columns)

    if not is_snowflake_configured():
        logger.error("Snowflake not configured - cannot lookup GP records")
        return pd.DataFrame(columns=result_columns)

    # A negative step would make range() empty and silently return no matches;
    # zero would raise a less helpful ValueError from range() itself.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if connector is None:
        connector = get_connector()

    # De-duplicate while preserving input order (dicts keep insertion order).
    unique_pseudonyms = list(dict.fromkeys(patient_pseudonyms))
    total_patients = len(unique_pseudonyms)
    if total_patients != len(patient_pseudonyms):
        logger.debug(
            f"Dropped {len(patient_pseudonyms) - total_patients} duplicate patient pseudonyms"
        )

    # Loop-invariant pieces of the query and progress reporting.
    total_batches = (total_patients + batch_size - 1) // batch_size
    date_filter = "\n    AND pc.\"EventDateTime\" >= %s" if earliest_hcd_date else ""

    all_results: list[dict] = []
    failed_batches: list[int] = []

    # Process patients in batches
    for batch_num, batch_start in enumerate(range(0, total_patients, batch_size), start=1):
        batch_end = min(batch_start + batch_size, total_patients)
        batch_pseudonyms = unique_pseudonyms[batch_start:batch_end]

        logger.info(f"Batch {batch_num}/{total_batches}: patients {batch_start + 1} to {batch_end}")

        # One bind placeholder per patient; values are passed separately as params
        # so no patient identifier is interpolated into the SQL text.
        patient_placeholders = ", ".join(["%s"] * len(batch_pseudonyms))

        # Build the full query with cluster CTE
        # Returns ALL matching Search_Terms per patient with code_frequency.
        # NOTE(review): AllIndicationCodes is built with UNION ALL and some
        # Search_Terms map to several clusters, so a SNOMED code present in more
        # than one of those clusters would presumably be counted once per cluster
        # - confirm whether that is intended.
        query = f"""
{CLUSTER_MAPPING_SQL}
SELECT
    pc."PatientPseudonym" AS "PatientPseudonym",
    aic.Search_Term AS "Search_Term",
    COUNT(*) AS "code_frequency"
FROM DATA_HUB.PHM."PrimaryCareClinicalCoding" pc
INNER JOIN AllIndicationCodes aic
    ON pc."SNOMEDCode" = aic.SNOMEDCode
WHERE pc."PatientPseudonym" IN ({patient_placeholders}){date_filter}
GROUP BY pc."PatientPseudonym", aic.Search_Term
"""

        # Params follow placeholder order: patient pseudonyms, then optional date
        params = list(batch_pseudonyms)
        if earliest_hcd_date:
            params.append(earliest_hcd_date)

        try:
            results = connector.execute_dict(query, tuple(params))
        except Exception as e:
            # Deliberate best-effort: continue with other batches - partial
            # results are better than none. The failure is surfaced in the summary.
            logger.error(f"Error querying GP records for batch {batch_num}: {e}")
            failed_batches.append(batch_num)
            continue

        all_results.extend(
            {
                'PatientPseudonym': row.get('PatientPseudonym'),
                'Search_Term': row.get('Search_Term'),
                'code_frequency': row.get('code_frequency', 0),
            }
            for row in results
        )
        logger.debug(f"Batch {batch_num}: found {len(results)} patient-indication matches")

    # Build result DataFrame; explicit columns keep the schema stable when empty
    result_df = pd.DataFrame(all_results, columns=result_columns)

    if failed_batches:
        logger.warning(
            f"Indication lookup results are partial: {len(failed_batches)} of "
            f"{total_batches} batches failed (batch numbers: {failed_batches})"
        )

    # Log summary statistics
    if not result_df.empty:
        unique_patients = result_df['PatientPseudonym'].nunique()
        total_rows = len(result_df)
        match_rate = 100 * unique_patients / total_patients
        unique_terms = result_df['Search_Term'].nunique()
        avg_indications = total_rows / unique_patients if unique_patients > 0 else 0
        logger.info("Indication lookup complete:")
        logger.info(f"  Total patients queried: {total_patients}")
        logger.info(f"  Patients with GP match: {unique_patients} ({match_rate:.1f}%)")
        logger.info(f"  Total patient-indication rows: {total_rows} (avg {avg_indications:.1f} per patient)")
        logger.info(f"  Unique Search_Terms found: {unique_terms}")

        # Log top Search_Terms
        top_terms = result_df['Search_Term'].value_counts().head(5)
        logger.info(f"  Top 5 indications: {dict(top_terms)}")
    else:
        logger.info(f"Indication lookup complete: 0 matches from {total_patients} patients")

    return result_df


# Export public API.
# Names listed here are what `from ... import *` exposes for this module; every
# entry must be defined at module level or the star-import raises AttributeError.
# NOTE(review): several of these names are defined outside this section of the
# file - verify each one still exists when renaming or removing definitions.
__all__ = [
    # Dataclasses
    "ClusterSnomedCodes",
    "IndicationValidationResult",
    "DrugIndicationMatchRate",
    # Cluster-based lookup functions
    "get_drug_clusters",
    "get_drug_cluster_ids",
    "get_cluster_snomed_codes",
    "patient_has_indication",
    "validate_indication",
    "get_indication_match_rate",
    "batch_validate_indications",
    "get_available_clusters",
    # Drug-indication mapping from DimSearchTerm.csv
    "SEARCH_TERM_MERGE_MAP",
    "load_drug_indication_mapping",
    "get_search_terms_for_drug",
    # Drug-aware indication assignment
    "assign_drug_indications",
    # Snowflake-direct indication lookup (batched GP record query and its SQL CTE prefix)
    "get_patient_indication_groups",
    "CLUSTER_MAPPING_SQL",
]