feat: add indication pathway processing functions (Task 2.3)

- Add generate_icicle_chart_indication() to pathway_analyzer.py - Variant that uses indication_df instead of directory_df - Groups by Trust → Search_Term → Drug → Pathway - Accepts indication_df mapping UPID → Indication_Group - Add process_indication_pathway_for_date_filter() to pathway_pipeline.py - Processes indication-based pathway for a single date filter - Uses generate_icicle_chart_indication() for hierarchy building - Add extract_indication_fields() to pathway_pipeline.py - Extracts trust_name, search_term, drug_sequence from ids column - Similar to extract_denormalized_fields() but for indication charts - Update convert_to_records() with chart_type parameter - Includes chart_type column in output records - Supports "directory" and "indication" values - Add ChartType type alias (Literal["directory", "indication"]) - Update __all__ exports with new functions
2026-02-05 14:32:28 +00:00
parent aabe4bf45d
commit 7cbc648c6d
3 changed files with 352 additions and 6 deletions
@@ -749,3 +749,191 @@ def generate_icicle_chart(
    ice_df = prepare_chart_data(ice_df, minimum_num_patients)

    return ice_df, final_title
+
+
+def generate_icicle_chart_indication(
+    df: pd.DataFrame,
+    indication_df: pd.DataFrame,
+    start_date: str,
+    end_date: str,
+    last_seen_date: str,
+    trust_filter: list[str],
+    drug_filter: list[str],
+    directory_filter: list[str],
+    minimum_num_patients: int,
+    title: str = "",
+    paths: Optional[PathConfig] = None,
+) -> tuple[pd.DataFrame, str]:
+    """
+    Generate icicle chart data with indication-based grouping.
+
+    This is a variant of generate_icicle_chart() that groups by Search_Term
+    (from GP diagnosis match) instead of Directory. For patients without
+    a GP diagnosis match, the fallback directorate is used with a "(no GP dx)"
+    suffix to distinguish them.
+
+    Hierarchy: Trust → Indication_Group → Drug → Pathway
+
+    Args:
+        df: DataFrame with processed patient intervention data
+        indication_df: DataFrame mapping UPID → Indication_Group
+                      Must have 'UPID' as index and 'Indication_Group' column
+                      Values are either Search_Term or "Directory (no GP dx)"
+        start_date: Start date for patient initiation filter
+        end_date: End date for patient initiation filter
+        last_seen_date: Filter for patients last seen after this date
+        trust_filter: List of trust names to include
+        drug_filter: List of drug names to include
+        directory_filter: List of directories to include
+        minimum_num_patients: Minimum number of patients to include a pathway
+        title: Chart title (auto-generated if empty)
+        paths: PathConfig for file paths (uses default if None)
+
+    Returns:
+        Tuple of (ice_df for chart, final_title) or (None, "") if no data
+    """
+    if paths is None:
+        paths = default_paths
+
+    # Prepare data - use standard prepare_data function
+    result = prepare_data(df, trust_filter, drug_filter, directory_filter, paths)
+    if result[0] is None:
+        return None, ""
+    filtered_df, org_codes, directory_df = result
+
+    # For indication charts, we replace directory_df with indication_df
+    # First, ensure indication_df has the correct format (UPID as index)
+    if indication_df is not None and not indication_df.empty:
+        if 'UPID' in indication_df.columns:
+            indication_df = indication_df.set_index('UPID')
+        # Rename column for compatibility with build_hierarchy()
+        if 'Indication_Group' in indication_df.columns:
+            indication_df = indication_df.rename(columns={'Indication_Group': 'Directory'})
+        elif 'indication_group' in indication_df.columns:
+            indication_df = indication_df.rename(columns={'indication_group': 'Directory'})
+    else:
+        # Fall back to directory if no indication data provided
+        logger.warning("No indication data provided, falling back to directory grouping")
+        indication_df = directory_df
+
+    cost_df = filtered_df[["UPID", "Price Actual"]]
+    total_costs = pd.DataFrame(cost_df.groupby("UPID").sum())
+    total_costs.rename(columns={"Price Actual": "Total cost"}, inplace=True)
+
+    result = calculate_statistics(filtered_df, start_date, end_date, last_seen_date, title)
+    if result[0] is None:
+        return None, ""
+    patient_info, date_df, final_title = result
+
+    df_drug_freq = (
+        filtered_df.groupby("UPID")
+        .agg({"Drug Name": lambda x: list(x)})
+        .reset_index()
+        .set_index("UPID")
+    )
+    df_drug_cost = (
+        filtered_df.groupby("UPID")
+        .agg({"Price Actual": lambda x: list(x)})
+        .reset_index()
+        .set_index("UPID")
+    )
+    df_drug_freq["Price Actual"] = df_drug_freq.index.map(df_drug_cost["Price Actual"])
+    df_drug_freq["Drug Name"] = df_drug_freq["Drug Name"].apply(_count_list_values)
+    df_drug_freq["Drug cost total"] = df_drug_freq.apply(lambda x: _sum_list_values(x), axis=1)
+
+    df1_unique = _drop_duplicate_treatments(filtered_df, True)
+    df_drugs = (
+        df1_unique.groupby("UPID")
+        .agg({"Drug Name": lambda x: list(x)})
+        .reset_index()
+        .set_index("UPID")
+    )
+    df_dates = (
+        df1_unique.groupby("UPID")
+        .agg({"Intervention Date": lambda x: list(x)})
+        .reset_index()
+        .set_index("UPID")
+    )
+
+    df_dates_unwrapped = pd.DataFrame(
+        df_dates["Intervention Date"].values.tolist(), index=df_dates.index
+    ).add_prefix("date_")
+    df_drugs_unwrapped = pd.DataFrame(
+        df_drugs["Drug Name"].values.tolist(), index=df_drugs.index
+    ).add_prefix("drug_")
+
+    start_dates = (
+        filtered_df[["UPIDTreatment", "Intervention Date"]]
+        .sort_values(by=["Intervention Date"], ascending=True)
+        .drop_duplicates(subset="UPIDTreatment")
+        .set_index("UPIDTreatment")
+    )
+    end_dates = (
+        filtered_df[["UPIDTreatment", "Intervention Date"]]
+        .sort_values(by=["Intervention Date"], ascending=False)
+        .drop_duplicates(subset="UPIDTreatment")
+        .set_index("UPIDTreatment")
+    )
+
+    df_drugs_unwrapped["start_dates"] = df_drugs_unwrapped.apply(
+        lambda x: _start_date_drug(start_dates, x), axis=1
+    )
+    df_start_dates_unwrapped = pd.DataFrame(
+        df_drugs_unwrapped["start_dates"].values.tolist(), index=df_drugs_unwrapped.index
+    ).add_prefix("start_date_")
+    df_drugs_unwrapped.drop(["start_dates"], inplace=True, axis=1)
+
+    df_drugs_unwrapped["end_dates"] = df_drugs_unwrapped.apply(
+        lambda x: _start_date_drug(end_dates, x), axis=1
+    )
+    df_end_dates_unwrapped_2 = pd.DataFrame(
+        df_drugs_unwrapped["end_dates"].values.tolist(), index=df_drugs_unwrapped.index
+    ).add_prefix("end_date_")
+    df_drugs_unwrapped.drop(["end_dates"], inplace=True, axis=1)
+
+    df_drugs_unwrapped = pd.merge(
+        df_drugs_unwrapped, df_start_dates_unwrapped, left_index=True, right_index=True
+    )
+    df_drugs_unwrapped = pd.merge(
+        df_drugs_unwrapped, df_end_dates_unwrapped_2, left_index=True, right_index=True
+    )
+
+    df_freq_for_merge = pd.DataFrame(
+        df_drug_freq["Drug Name"].values.tolist(), index=df_drugs_unwrapped.index
+    ).add_prefix("freq_")
+    df_drugs_unwrapped = pd.merge(
+        df_drugs_unwrapped, df_freq_for_merge, left_index=True, right_index=True
+    )
+    df_drugs_unwrapped["frequency"] = df_drugs_unwrapped.apply(
+        lambda x: _drug_frequency_average(x), axis=1
+    )
+
+    df_spacing_unwrapped = pd.DataFrame(
+        df_drugs_unwrapped["frequency"].values.tolist(), index=df_drugs_unwrapped.index
+    ).add_prefix("spacing_")
+    df_drugs_unwrapped = pd.merge(
+        df_drugs_unwrapped, df_spacing_unwrapped, left_index=True, right_index=True
+    )
+
+    df_cost_unwrapped = pd.DataFrame(
+        df_drug_freq["Drug cost total"].values.tolist(), index=df_drugs_unwrapped.index
+    ).add_prefix("total_cost_drug_")
+    df_drugs_unwrapped = pd.merge(
+        df_drugs_unwrapped, df_cost_unwrapped, left_index=True, right_index=True
+    )
+    df_drugs_unwrapped.drop(["frequency"], inplace=True, axis=1)
+
+    # Build hierarchy with indication_df instead of directory_df
+    ice_df = build_hierarchy(
+        patient_info,
+        date_df,
+        filtered_df,
+        org_codes,
+        indication_df,  # Use indication mapping instead of directory
+        total_costs,
+        df_drugs_unwrapped,
+    )
+
+    ice_df = prepare_chart_data(ice_df, minimum_num_patients)
+
+    return ice_df, final_title