refactor: reorganize repository to src/ layout

Move 6 packages (core, config, data_processing, analysis, visualization, cli) into src/ to reduce root clutter. Merge tools/data.py into data_processing/transforms.py. Move docs to docs/. Path resolution via .pth file (setup_dev.py), pytest pythonpath config, and sys.path bootstrap in rxconfig.py and CLI entry points. Clean up pyproject.toml deps (remove stale pins, add snowflake-connector-python). Fix tomllib import for Python 3.10 compatibility. All 113 tests pass.
2026-02-06 12:03:48 +00:00
parent 1581b1d3dd
commit 76838887e6
40 changed files with 589 additions and 214 deletions
@@ -1,50 +0,0 @@
-"""
-Analysis package for patient pathway processing.
-
-This package contains refactored functions from the original generate_graph() pipeline:
- pathway_analyzer: Main analysis pipeline with prepare_data, calculate_statistics, build_hierarchy
- statistics: Statistical calculation functions (costs, frequencies, durations)
-"""
-
-from analysis.pathway_analyzer import (
-    prepare_data,
-    calculate_statistics,
-    build_hierarchy,
-    prepare_chart_data,
-    generate_icicle_chart,
-)
-
-from analysis.statistics import (
-    count_consecutive_values,
-    calculate_drug_costs,
-    calculate_dosing_frequency,
-    calculate_drug_frequency_row,
-    calculate_cost_per_patient_per_annum,
-    calculate_treatment_duration,
-    calculate_pathway_proportion,
-    aggregate_patient_costs,
-    aggregate_drug_frequencies,
-    format_treatment_statistics,
-    remove_nan_values,
-)
-
-__all__ = [
-    # Pathway analysis pipeline
-    "prepare_data",
-    "calculate_statistics",
-    "build_hierarchy",
-    "prepare_chart_data",
-    "generate_icicle_chart",
-    # Statistical calculations
-    "count_consecutive_values",
-    "calculate_drug_costs",
-    "calculate_dosing_frequency",
-    "calculate_drug_frequency_row",
-    "calculate_cost_per_patient_per_annum",
-    "calculate_treatment_duration",
-    "calculate_pathway_proportion",
-    "aggregate_patient_costs",
-    "aggregate_drug_frequencies",
-    "format_treatment_statistics",
-    "remove_nan_values",
-]
@@ -1,943 +0,0 @@
-"""
-Patient pathway analysis pipeline.
-
-This module contains functions extracted from the original generate_graph() function
-to improve maintainability and testability. The functions follow this pipeline:
-
-1. prepare_data() - Apply filters, create composite keys, load reference data
-2. calculate_statistics() - Calculate patient costs, drug frequencies, treatment durations
-3. build_hierarchy() - Build the Trust → Directory → Drug → Pathway hierarchy
-4. prepare_chart_data() - Finalize data for Plotly icicle chart
-
-The generate_icicle_chart() function orchestrates the full pipeline.
-"""
-
-from typing import Optional
-
-import numpy as np
-import pandas as pd
-
-from core import PathConfig, default_paths
-from core.logging_config import get_logger
-from analysis.statistics import (
-    count_consecutive_values,
-    calculate_drug_costs,
-    calculate_dosing_frequency,
-    calculate_cost_per_patient_per_annum,
-    remove_nan_values,
-)
-
-logger = get_logger(__name__)
-
-
-def prepare_data(
-    df: pd.DataFrame,
-    trust_filter: list[str],
-    drug_filter: list[str],
-    directory_filter: list[str],
-    paths: Optional[PathConfig] = None,
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """
-    Prepare data for analysis by applying filters and loading reference data.
-
-    Args:
-        df: DataFrame with processed patient intervention data
-        trust_filter: List of trust names to include
-        drug_filter: List of drug names to include
-        directory_filter: List of directories to include
-        paths: PathConfig for file paths (uses default if None)
-
-    Returns:
-        Tuple of (filtered_df, org_codes_df, directory_df) or (None, None, None) if no data
-    """
-    if paths is None:
-        paths = default_paths
-
-    # Work on a copy to avoid mutating the caller's DataFrame
-    # (Provider Code mapping is destructive — second call would map names to NaN)
-    df = df.copy()
-
-    df["UPIDTreatment"] = df["UPID"] + df["Drug Name"]
-
-    org_codes = pd.read_csv(paths.org_codes_csv, index_col=1)
-    df["Provider Code"] = df["Provider Code"].map(org_codes["Name"])
-
-    df = df[
-        (df["Provider Code"].isin(trust_filter))
-        & (df["Drug Name"].isin(drug_filter))
-        & (df["Directory"].isin(directory_filter))
-    ]
-
-    if len(df) == 0:
-        logger.warning("No data found for selected filters.")
-        return None, None, None
-
-    directory_df = df[["UPID", "Directory"]].drop_duplicates("UPID").set_index("UPID")
-
-    logger.info("Filtering unrelated interventions")
-    return df, org_codes, directory_df
-
-
-def _count_list_values(x):
-    """Count consecutive occurrences of each value in a sorted list."""
-    return count_consecutive_values(x)
-
-
-def _sum_list_values(x):
-    """Calculate sum of price_actual for each drug's portion of the list."""
-    return calculate_drug_costs(x["Drug Name"], x["Price Actual"])
-
-
-def _start_date_drug(start_dates_df: pd.DataFrame, x: pd.Series) -> list:
-    """Get start dates for each drug in a patient's treatment."""
-    drug_count = x.notnull().sum()
-    date_string = []
-    for d in range(drug_count):
-        UPID_date_var = str(x.name) + str(x[d])
-        date = start_dates_df.loc[UPID_date_var, "Intervention Date"]
-        date_string.append(date)
-    return date_string
-
-
-def _end_date_drug(end_dates_df: pd.DataFrame, x: pd.Series) -> list:
-    """Get end dates for each drug in a patient's treatment."""
-    drug_count = x.notnull().sum()
-    date_string = []
-    for d in range(drug_count - 1):
-        UPID_date_var = str(x.name) + str(x[d])
-        date = end_dates_df.loc[UPID_date_var, "Intervention Date"]
-        date_string.append(date)
-    return date_string
-
-
-def _drug_frequency_average(x: pd.Series) -> list[float]:
-    """Calculate average dosing frequency for each drug."""
-    drug_count = x.index.str.contains("drug_").sum()
-    freq = []
-    for d in range(drug_count):
-        freq_val = x.get(f"freq_{d}", 0)
-        if pd.isna(freq_val):
-            freq_val = 0
-        else:
-            freq_val = int(freq_val)
-
-        if freq_val > 1:
-            start_date = x.get(f"start_date_{d}")
-            end_date = x.get(f"end_date_{d}")
-            if pd.notna(start_date) and pd.notna(end_date):
-                freq_calc = calculate_dosing_frequency(freq_val, start_date, end_date)
-            else:
-                freq_calc = 0.0
-        else:
-            freq_calc = 0.0
-        freq.append(freq_calc)
-    return freq
-
-
-def _drop_duplicate_treatments(df: pd.DataFrame, ascending: bool) -> pd.DataFrame:
-    """Drop duplicate treatments keeping first/last based on date sort order."""
-    df_sorted = df.sort_values(by=["Intervention Date"], ascending=ascending)
-    df_treatment_steps = df_sorted.drop_duplicates(subset="UPIDTreatment", keep="first")
-    if not ascending:
-        df_treatment_steps = df_treatment_steps.sort_values(by=["Intervention Date"], ascending=True)
-    return df_treatment_steps
-
-
-def calculate_statistics(
-    df: pd.DataFrame,
-    start_date: str,
-    end_date: str,
-    last_seen_date: str,
-    title: str,
-) -> tuple[pd.DataFrame, pd.DataFrame, str]:
-    """
-    Calculate patient statistics: costs, drug frequencies, treatment durations.
-
-    Args:
-        df: Filtered DataFrame from prepare_data()
-        start_date: Start date for patient initiation filter
-        end_date: End date for patient initiation filter
-        last_seen_date: Filter for patients last seen after this date
-        title: Chart title (auto-generated if empty)
-
-    Returns:
-        Tuple of (patient_info_df, date_df, final_title) or (None, None, "") if no valid data
-    """
-    cost_df = df[["UPID", "Price Actual"]]
-    total_costs = pd.DataFrame(cost_df.groupby("UPID").sum())
-    total_costs.rename(columns={"Price Actual": "Total cost"}, inplace=True)
-
-    df_end_dates = _drop_duplicate_treatments(df, False)
-    df1_unique = _drop_duplicate_treatments(df, True)
-    logger.info("Identifying unique patients and interventions used")
-
-    df_drug_freq = (
-        df.groupby("UPID")
-        .agg({"Drug Name": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_drug_cost = (
-        df.groupby("UPID")
-        .agg({"Price Actual": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_drug_freq["Price Actual"] = df_drug_freq.index.map(df_drug_cost["Price Actual"])
-    df_drug_freq["Drug Name"] = df_drug_freq["Drug Name"].apply(_count_list_values)
-    df_drug_freq["Drug cost total"] = df_drug_freq.apply(lambda x: _sum_list_values(x), axis=1)
-
-    df_drugs = (
-        df1_unique.groupby("UPID")
-        .agg({"Drug Name": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_dates = (
-        df1_unique.groupby("UPID")
-        .agg({"Intervention Date": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_end_dates_grouped = (
-        df_end_dates.groupby("UPID")
-        .agg({"Intervention Date": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-
-    logger.info(
-        "Calculating each unique patient's intervention average frequency, cost and duration of each intervention"
-    )
-
-    df_dates_unwrapped = pd.DataFrame(
-        df_dates["Intervention Date"].values.tolist(), index=df_dates.index
-    ).add_prefix("date_")
-    df_end_dates_unwrapped = pd.DataFrame(
-        df_end_dates_grouped["Intervention Date"].values.tolist(),
-        index=df_end_dates_grouped.index,
-    ).add_prefix("date_end_")
-    df_drugs_unwrapped = pd.DataFrame(
-        df_drugs["Drug Name"].values.tolist(), index=df_drugs.index
-    ).add_prefix("drug_")
-
-    df_freq_unwrapped = pd.DataFrame(
-        df_drug_freq["Drug Name"].values.tolist(), index=df_drug_freq.index
-    ).add_prefix("freq_")
-
-    start_dates = (
-        df[["UPIDTreatment", "Intervention Date"]]
-        .sort_values(by=["Intervention Date"], ascending=True)
-        .drop_duplicates(subset="UPIDTreatment")
-        .set_index("UPIDTreatment")
-    )
-    end_dates = (
-        df[["UPIDTreatment", "Intervention Date"]]
-        .sort_values(by=["Intervention Date"], ascending=False)
-        .drop_duplicates(subset="UPIDTreatment")
-        .set_index("UPIDTreatment")
-    )
-
-    df_drugs_unwrapped["start_dates"] = df_drugs_unwrapped.apply(
-        lambda x: _start_date_drug(start_dates, x), axis=1
-    )
-    df_start_dates_unwrapped = pd.DataFrame(
-        df_drugs_unwrapped["start_dates"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("start_date_")
-    df_drugs_unwrapped.drop(["start_dates"], inplace=True, axis=1)
-
-    df_drugs_unwrapped["end_dates"] = df_drugs_unwrapped.apply(
-        lambda x: _start_date_drug(end_dates, x), axis=1
-    )
-    df_end_dates_unwrapped_2 = pd.DataFrame(
-        df_drugs_unwrapped["end_dates"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("end_date_")
-    df_drugs_unwrapped.drop(["end_dates"], inplace=True, axis=1)
-
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_start_dates_unwrapped, left_index=True, right_index=True
-    )
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_end_dates_unwrapped_2, left_index=True, right_index=True
-    )
-
-    df_freq_for_merge = pd.DataFrame(
-        df_drug_freq["Drug Name"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("freq_")
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_freq_for_merge, left_index=True, right_index=True
-    )
-    df_drugs_unwrapped["frequency"] = df_drugs_unwrapped.apply(
-        lambda x: _drug_frequency_average(x), axis=1
-    )
-
-    df_spacing_unwrapped = pd.DataFrame(
-        df_drugs_unwrapped["frequency"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("spacing_")
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_spacing_unwrapped, left_index=True, right_index=True
-    )
-
-    df_cost_unwrapped = pd.DataFrame(
-        df_drug_freq["Drug cost total"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("total_cost_drug_")
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_cost_unwrapped, left_index=True, right_index=True
-    )
-    df_drugs_unwrapped.drop(["frequency"], inplace=True, axis=1)
-
-    df_drugs_unwrapped.insert(0, "First seen", df_dates_unwrapped.min(axis=1))
-    df_drugs_unwrapped.insert(1, "Last seen", df_end_dates_unwrapped.max(axis=1))
-
-    patient_info = df.drop_duplicates(subset="UPID", keep="first").set_index("UPID")
-    patient_info = pd.merge(patient_info, df_drugs_unwrapped, left_index=True, right_index=True)
-    patient_info = pd.merge(patient_info, df_freq_unwrapped, left_index=True, right_index=True)
-    patient_info = pd.merge(patient_info, total_costs, left_index=True, right_index=True)
-
-    patient_info = patient_info[
-        (patient_info["First seen"] >= str(start_date))
-        & (patient_info["First seen"] < str(end_date))
-    ]
-
-    if title == "":
-        title = f"Patients initiated from {start_date} to {end_date}"
-
-    patient_info = patient_info[patient_info["Last seen"] > str(last_seen_date)]
-
-    patient_info["drug_0"] = patient_info["drug_0"].replace("N/A", np.nan)
-    patient_info.dropna(subset=["drug_0"], inplace=True)
-
-    if len(patient_info) == 0:
-        logger.warning("No patients remaining after date filters.")
-        return None, None, ""
-
-    patient_info["Days treated"] = patient_info["Last seen"] - patient_info["First seen"]
-    date_df = patient_info[["First seen", "Last seen", "Days treated"]]
-
-    return patient_info, date_df, title
-
-
-def _row_function(row: pd.Series) -> str:
-    """Build composite parent-label-id string for hierarchy."""
-    ids = ""
-    parents = "N&WICS"
-    count = row.count()
-    for c in range(count):
-        v = row[c]
-        if type(v) != str:
-            v = row[c + 1]
-        if c == count - 1:
-            ids = parents + " - " + v
-            continue
-        parents += " - " + v
-    label = row[count - 1]
-    value = parents + "," + label + "," + ids
-    return value
-
-
-def _remove_nan_string(y) -> list:
-    """Remove 'nan' strings from list."""
-    return remove_nan_values(y)
-
-
-def _list_to_string(x: pd.Series) -> str:
-    """Format drug statistics into readable string."""
-    list_parts = x.ids.split(" - ")
-    drug_list = list_parts[len(list_parts) - len(x.average_cost) :]
-    ret_string = ""
-    for y in range(len(x.average_cost)):
-        if (
-            (round(x.average_spacing[y], 0) > 1)
-            and (round(x.average_administered[y], 0) > 2.5)
-            and (int(x.value) > 0)
-        ):
-            string = (
-                f"<br><b>{drug_list[y]}</b><br>On average given "
-                f"{round(x.average_administered[y], 1)} times with a "
-                f"{round(int(x.average_spacing[y]) / 7, 1)} weekly interval ("
-                f"{round((int(x.average_spacing[y]) / 7) * round(x.average_administered[y], 1), 0)} weeks total treatment length)"
-            )
-        else:
-            string = (
-                f"<br><b>{drug_list[y]}</b><br>On average given "
-                f"{round(x.average_administered[y], 1)} times with a "
-                f"{round(int(x.average_spacing[y]) / 7, 1)} weekly interval ("
-                f"{round((int(x.average_spacing[y]) / 7) * round(x.average_administered[y], 1), 0)} weeks total treatment length)"
-            )
-        ret_string += string
-    return ret_string
-
-
-def _min_max_treatment_dates(ice_df: pd.DataFrame, row: pd.Series) -> str:
-    """Get min/max dates for a pathway."""
-    ids = row["ids"]
-    min_max = ice_df[ice_df["ids"].str.contains(ids, regex=False)]
-    if len(min_max) == 0:
-        return "N/A,N/A"
-
-    # Handle NaT (Not a Time) values
-    first_seen_min = min_max["First seen"].min()
-    last_seen_max = min_max["Last seen"].max()
-
-    if pd.isna(first_seen_min):
-        min_date = "N/A"
-    else:
-        min_date = str(first_seen_min.strftime("%Y-%m-%d"))
-
-    if pd.isna(last_seen_max):
-        max_date = "N/A"
-    else:
-        max_date = str(last_seen_max.strftime("%Y-%m-%d"))
-
-    return f"{min_date},{max_date}"
-
-
-def _cost_pp_pa(x: pd.Series) -> str:
-    """Calculate cost per patient per annum."""
-    result = calculate_cost_per_patient_per_annum(x["costpp"], x["avg_days"])
-    if result is not None:
-        return str(round(result, 2))
-    else:
-        return "N/A"
-
-
-def build_hierarchy(
-    patient_info: pd.DataFrame,
-    date_df: pd.DataFrame,
-    df: pd.DataFrame,
-    org_codes: pd.DataFrame,
-    directory_df: pd.DataFrame,
-    total_costs: pd.DataFrame,
-    df_drugs_unwrapped: pd.DataFrame,
-) -> pd.DataFrame:
-    """
-    Build the hierarchical structure for the icicle chart.
-
-    Args:
-        patient_info: DataFrame with calculated patient statistics
-        date_df: DataFrame with first/last seen dates
-        df: Original filtered DataFrame
-        org_codes: Organization codes lookup
-        directory_df: Directory assignments by UPID
-        total_costs: Total costs by UPID
-        df_drugs_unwrapped: Drug data with dates and frequencies unwrapped
-
-    Returns:
-        DataFrame with parents, ids, labels, value, colour for icicle chart
-    """
-    number_of_drugs = np.count_nonzero(patient_info.columns.str.startswith("drug_"))
-    final_drug_index = patient_info.columns.to_list().index("drug_" + str(number_of_drugs - 1))
-
-    upid_drugs_df = patient_info.iloc[
-        :, (final_drug_index - number_of_drugs + 1) : final_drug_index + 1
-    ]
-    upid_drugs_df = upid_drugs_df.copy()
-
-    upid_drugs_df.insert(0, "Trust", upid_drugs_df.index.str[:3])
-    upid_drugs_df.insert(1, "Directory", upid_drugs_df.index)
-
-    upid_drugs_df["Trust"] = upid_drugs_df["Trust"].map(org_codes["Name"])
-    upid_drugs_df["Directory"] = upid_drugs_df["Directory"].map(directory_df["Directory"])
-
-    upid_drugs_df["value"] = upid_drugs_df.apply(lambda x: _row_function(x), axis=1)
-    upid_drugs_df = pd.merge(upid_drugs_df, date_df, left_index=True, right_index=True)
-
-    upid_drugs_df["ids"] = upid_drugs_df["value"].str.split(",").str[2]
-
-    avg_treatment_dfs = pd.DataFrame(
-        upid_drugs_df.groupby("ids", as_index=False)["Days treated"].mean()
-    ).set_index("ids")
-    value_dfs = pd.DataFrame(
-        upid_drugs_df.groupby("value", as_index=False).size()
-    ).reset_index()
-    first_seen_treatment_dfs = pd.DataFrame(
-        upid_drugs_df.groupby("ids", as_index=False)["First seen"].min()
-    ).set_index("ids")
-    last_seen_treatment_dfs = pd.DataFrame(
-        upid_drugs_df.groupby("ids", as_index=False)["Last seen"].max()
-    ).set_index("ids")
-
-    upid_drugs_df["Cost"] = upid_drugs_df.index.map(total_costs["Total cost"])
-    cost_dfs = pd.DataFrame(
-        upid_drugs_df.groupby("value", as_index=False)["Cost"].sum()
-    ).set_index("value", drop=True)
-
-    upid_drugs_df = pd.merge(upid_drugs_df, df_drugs_unwrapped, left_index=True, right_index=True)
-
-    spacing_average = pd.DataFrame(
-        upid_drugs_df.groupby("value", as_index=False)[
-            [col for col in upid_drugs_df.columns if "spacing_" in col]
-        ].mean()
-    ).set_index("value", drop=True)
-    spacing_average = spacing_average.round()
-    spacing_average["combined"] = spacing_average.values.tolist()
-    spacing_average["ids"] = spacing_average.index
-    spacing_average["ids"] = spacing_average["ids"].str.split(",").str[2]
-    spacing_average.set_index("ids", inplace=True)
-
-    cost_average = pd.DataFrame(
-        upid_drugs_df.groupby("value", as_index=False)[
-            [col for col in upid_drugs_df.columns if "total_cost_drug_" in col]
-        ].mean()
-    ).set_index("value", drop=True)
-    cost_average = cost_average.round(2)
-    cost_average["combined"] = cost_average.values.tolist()
-    cost_average["ids"] = cost_average.index
-    cost_average["ids"] = cost_average["ids"].str.split(",").str[2]
-    cost_average.set_index("ids", inplace=True)
-
-    freq_average = pd.DataFrame(
-        upid_drugs_df.groupby("ids", as_index=False)[
-            [col for col in upid_drugs_df.columns if "freq_" in col]
-        ].mean()
-    ).set_index("ids", drop=True)
-    freq_average["combined"] = freq_average.values.tolist()
-
-    num = cost_dfs._get_numeric_data()
-    num[num < 0] = 0
-
-    value_dfs["Cost"] = value_dfs["value"].map(cost_dfs["Cost"])
-
-    ice_df = pd.DataFrame()
-    ice_df[["parents", "labels", "ids"]] = value_dfs["value"].str.split(",", expand=True)
-
-    ice_df["average_administered"] = ice_df["ids"].map(freq_average["combined"])
-    ice_df["cost"] = value_dfs["Cost"]
-    ice_df["value"] = value_dfs["size"]
-
-    ice_df["average_cost"] = ice_df["ids"].map(cost_average["combined"])
-    ice_df["average_cost"] = ice_df["average_cost"].apply(_remove_nan_string)
-
-    ice_df["average_spacing"] = ice_df["ids"].map(spacing_average["combined"])
-    ice_df["average_spacing"] = ice_df["average_spacing"].apply(_remove_nan_string)
-    ice_df["average_spacing"] = ice_df.apply(lambda x: _list_to_string(x), axis=1)
-    ice_df["average_spacing"] = ice_df["average_spacing"].str.replace("nan", "N/A")
-
-    logger.info("Building graph dataframe structure.")
-
-    new_row = pd.DataFrame(
-        {"parents": "", "ids": "N&WICS", "labels": "N&WICS", "value": 0, "cost": 0}, index=[0]
-    )
-    ice_df = pd.concat(objs=[ice_df, new_row], ignore_index=True, axis=0)
-
-    l_df = pd.DataFrame()
-    ice_df2 = pd.DataFrame()
-    l3 = [x for x in ice_df.parents.unique() if x not in ice_df.ids]
-    while len(l3) > 1:
-        for l in l3:
-            z = l.rfind("-")
-            if z > 0:
-                l_dict = {
-                    "parents": l[: z - 1],
-                    "ids": l,
-                    "value": 0,
-                    "labels": l[z + 2 :],
-                    "cost": 0,
-                }
-                l_df = pd.concat([l_df, pd.DataFrame(l_dict, index=[0])], ignore_index=True)
-        ice_df2 = pd.concat([ice_df, l_df], ignore_index=True)
-        l3 = [x for x in ice_df2.parents.unique() if x not in ice_df2.ids.unique()]
-    if len(ice_df2) > 0:
-        ice_df = ice_df2.drop_duplicates("ids")
-
-    ice_df["level"] = ice_df["ids"].str.count("-")
-    ice_df = ice_df[~ice_df["labels"].isin(["COST", "CHARGE", "N/A"])]
-    ice_df.sort_values(by=["level"], ascending=False, inplace=True, ignore_index=True)
-
-    for index, row in ice_df.iterrows():
-        lookup_index = ice_df.index[ice_df["ids"] == row["parents"]]
-        ice_df.loc[lookup_index, "value"] = (
-            ice_df.loc[lookup_index, "value"] + ice_df.loc[index, "value"]
-        )
-        ice_df.loc[lookup_index, "cost"] = (
-            ice_df.loc[lookup_index, "cost"] + ice_df.loc[index, "cost"]
-        )
-
-    colour_df = pd.DataFrame(ice_df.groupby(["parents"])["value"].sum())
-    ice_df["colour"] = ice_df["parents"].map(colour_df["value"])
-    ice_df["colour"] = ice_df["value"] / ice_df["colour"]
-
-    ice_df["costpp"] = ice_df["cost"] / ice_df["value"]
-    ice_df["avg_days"] = ice_df["ids"].map(avg_treatment_dfs["Days treated"])
-    ice_df["First seen"] = ice_df["ids"].map(first_seen_treatment_dfs["First seen"])
-    ice_df["Last seen"] = ice_df["ids"].map(last_seen_treatment_dfs["Last seen"])
-
-    ice_df["dates"] = ice_df.apply(lambda x: _min_max_treatment_dates(ice_df, x), axis=1)
-    ice_df[["First seen (Parent)", "Last seen (Parent)"]] = ice_df["dates"].str.split(
-        ",", expand=True
-    )
-
-    ice_df["First seen"] = pd.to_datetime(ice_df["First seen"])
-    ice_df["Last seen"] = pd.to_datetime(ice_df["Last seen"])
-    ice_df["cost_pp_pa"] = ice_df.apply(lambda x: _cost_pp_pa(x), axis=1)
-
-    return ice_df
-
-
-def prepare_chart_data(
-    ice_df: pd.DataFrame,
-    minimum_num_patients: int,
-) -> pd.DataFrame:
-    """
-    Prepare final chart data by applying patient threshold filter.
-
-    Args:
-        ice_df: DataFrame from build_hierarchy()
-        minimum_num_patients: Minimum number of patients to include a pathway
-
-    Returns:
-        Filtered DataFrame ready for chart generation
-    """
-    ice_df = ice_df[ice_df["value"] >= minimum_num_patients]
-    logger.info("Generating graph.")
-    return ice_df
-
-
-def generate_icicle_chart(
-    df: pd.DataFrame,
-    start_date: str,
-    end_date: str,
-    last_seen_date: str,
-    trust_filter: list[str],
-    drug_filter: list[str],
-    directory_filter: list[str],
-    minimum_num_patients: int,
-    title: str = "",
-    paths: Optional[PathConfig] = None,
-) -> tuple[pd.DataFrame, str]:
-    """
-    Generate icicle chart data using the refactored pipeline.
-
-    This is the main entry point that orchestrates the full analysis pipeline.
-
-    Args:
-        df: DataFrame with processed patient intervention data
-        start_date: Start date for patient initiation filter
-        end_date: End date for patient initiation filter
-        last_seen_date: Filter for patients last seen after this date
-        trust_filter: List of trust names to include
-        drug_filter: List of drug names to include
-        directory_filter: List of directories to include
-        minimum_num_patients: Minimum number of patients to include a pathway
-        title: Chart title (auto-generated if empty)
-        paths: PathConfig for file paths (uses default if None)
-
-    Returns:
-        Tuple of (ice_df for chart, final_title) or (None, "") if no data
-    """
-    if paths is None:
-        paths = default_paths
-
-    result = prepare_data(df, trust_filter, drug_filter, directory_filter, paths)
-    if result[0] is None:
-        return None, ""
-    filtered_df, org_codes, directory_df = result
-
-    cost_df = filtered_df[["UPID", "Price Actual"]]
-    total_costs = pd.DataFrame(cost_df.groupby("UPID").sum())
-    total_costs.rename(columns={"Price Actual": "Total cost"}, inplace=True)
-
-    result = calculate_statistics(filtered_df, start_date, end_date, last_seen_date, title)
-    if result[0] is None:
-        return None, ""
-    patient_info, date_df, final_title = result
-
-    df_drug_freq = (
-        filtered_df.groupby("UPID")
-        .agg({"Drug Name": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_drug_cost = (
-        filtered_df.groupby("UPID")
-        .agg({"Price Actual": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_drug_freq["Price Actual"] = df_drug_freq.index.map(df_drug_cost["Price Actual"])
-    df_drug_freq["Drug Name"] = df_drug_freq["Drug Name"].apply(_count_list_values)
-    df_drug_freq["Drug cost total"] = df_drug_freq.apply(lambda x: _sum_list_values(x), axis=1)
-
-    df1_unique = _drop_duplicate_treatments(filtered_df, True)
-    df_drugs = (
-        df1_unique.groupby("UPID")
-        .agg({"Drug Name": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_dates = (
-        df1_unique.groupby("UPID")
-        .agg({"Intervention Date": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-
-    df_dates_unwrapped = pd.DataFrame(
-        df_dates["Intervention Date"].values.tolist(), index=df_dates.index
-    ).add_prefix("date_")
-    df_drugs_unwrapped = pd.DataFrame(
-        df_drugs["Drug Name"].values.tolist(), index=df_drugs.index
-    ).add_prefix("drug_")
-
-    start_dates = (
-        filtered_df[["UPIDTreatment", "Intervention Date"]]
-        .sort_values(by=["Intervention Date"], ascending=True)
-        .drop_duplicates(subset="UPIDTreatment")
-        .set_index("UPIDTreatment")
-    )
-    end_dates = (
-        filtered_df[["UPIDTreatment", "Intervention Date"]]
-        .sort_values(by=["Intervention Date"], ascending=False)
-        .drop_duplicates(subset="UPIDTreatment")
-        .set_index("UPIDTreatment")
-    )
-
-    df_drugs_unwrapped["start_dates"] = df_drugs_unwrapped.apply(
-        lambda x: _start_date_drug(start_dates, x), axis=1
-    )
-    df_start_dates_unwrapped = pd.DataFrame(
-        df_drugs_unwrapped["start_dates"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("start_date_")
-    df_drugs_unwrapped.drop(["start_dates"], inplace=True, axis=1)
-
-    df_drugs_unwrapped["end_dates"] = df_drugs_unwrapped.apply(
-        lambda x: _start_date_drug(end_dates, x), axis=1
-    )
-    df_end_dates_unwrapped_2 = pd.DataFrame(
-        df_drugs_unwrapped["end_dates"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("end_date_")
-    df_drugs_unwrapped.drop(["end_dates"], inplace=True, axis=1)
-
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_start_dates_unwrapped, left_index=True, right_index=True
-    )
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_end_dates_unwrapped_2, left_index=True, right_index=True
-    )
-
-    df_freq_for_merge = pd.DataFrame(
-        df_drug_freq["Drug Name"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("freq_")
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_freq_for_merge, left_index=True, right_index=True
-    )
-    df_drugs_unwrapped["frequency"] = df_drugs_unwrapped.apply(
-        lambda x: _drug_frequency_average(x), axis=1
-    )
-
-    df_spacing_unwrapped = pd.DataFrame(
-        df_drugs_unwrapped["frequency"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("spacing_")
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_spacing_unwrapped, left_index=True, right_index=True
-    )
-
-    df_cost_unwrapped = pd.DataFrame(
-        df_drug_freq["Drug cost total"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("total_cost_drug_")
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_cost_unwrapped, left_index=True, right_index=True
-    )
-    df_drugs_unwrapped.drop(["frequency"], inplace=True, axis=1)
-
-    ice_df = build_hierarchy(
-        patient_info,
-        date_df,
-        filtered_df,
-        org_codes,
-        directory_df,
-        total_costs,
-        df_drugs_unwrapped,
-    )
-
-    ice_df = prepare_chart_data(ice_df, minimum_num_patients)
-
-    return ice_df, final_title
-
-
-def generate_icicle_chart_indication(
-    df: pd.DataFrame,
-    indication_df: pd.DataFrame,
-    start_date: str,
-    end_date: str,
-    last_seen_date: str,
-    trust_filter: list[str],
-    drug_filter: list[str],
-    directory_filter: list[str],
-    minimum_num_patients: int,
-    title: str = "",
-    paths: Optional[PathConfig] = None,
-) -> tuple[pd.DataFrame, str]:
-    """
-    Generate icicle chart data with indication-based grouping.
-
-    This is a variant of generate_icicle_chart() that groups by Search_Term
-    (from GP diagnosis match) instead of Directory. For patients without
-    a GP diagnosis match, the fallback directorate is used with a "(no GP dx)"
-    suffix to distinguish them.
-
-    Hierarchy: Trust → Indication_Group → Drug → Pathway
-
-    Args:
-        df: DataFrame with processed patient intervention data
-        indication_df: DataFrame mapping UPID → Indication_Group
-                      Must have 'UPID' as index and 'Indication_Group' column
-                      Values are either Search_Term or "Directory (no GP dx)"
-        start_date: Start date for patient initiation filter
-        end_date: End date for patient initiation filter
-        last_seen_date: Filter for patients last seen after this date
-        trust_filter: List of trust names to include
-        drug_filter: List of drug names to include
-        directory_filter: List of directories to include
-        minimum_num_patients: Minimum number of patients to include a pathway
-        title: Chart title (auto-generated if empty)
-        paths: PathConfig for file paths (uses default if None)
-
-    Returns:
-        Tuple of (ice_df for chart, final_title) or (None, "") if no data
-    """
-    if paths is None:
-        paths = default_paths
-
-    # Prepare data - use standard prepare_data function
-    result = prepare_data(df, trust_filter, drug_filter, directory_filter, paths)
-    if result[0] is None:
-        return None, ""
-    filtered_df, org_codes, directory_df = result
-
-    # For indication charts, we replace directory_df with indication_df
-    # First, ensure indication_df has the correct format (UPID as index)
-    if indication_df is not None and not indication_df.empty:
-        if 'UPID' in indication_df.columns:
-            indication_df = indication_df.set_index('UPID')
-        # Rename column for compatibility with build_hierarchy()
-        if 'Indication_Group' in indication_df.columns:
-            indication_df = indication_df.rename(columns={'Indication_Group': 'Directory'})
-        elif 'indication_group' in indication_df.columns:
-            indication_df = indication_df.rename(columns={'indication_group': 'Directory'})
-    else:
-        # Fall back to directory if no indication data provided
-        logger.warning("No indication data provided, falling back to directory grouping")
-        indication_df = directory_df
-
-    cost_df = filtered_df[["UPID", "Price Actual"]]
-    total_costs = pd.DataFrame(cost_df.groupby("UPID").sum())
-    total_costs.rename(columns={"Price Actual": "Total cost"}, inplace=True)
-
-    result = calculate_statistics(filtered_df, start_date, end_date, last_seen_date, title)
-    if result[0] is None:
-        return None, ""
-    patient_info, date_df, final_title = result
-
-    df_drug_freq = (
-        filtered_df.groupby("UPID")
-        .agg({"Drug Name": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_drug_cost = (
-        filtered_df.groupby("UPID")
-        .agg({"Price Actual": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_drug_freq["Price Actual"] = df_drug_freq.index.map(df_drug_cost["Price Actual"])
-    df_drug_freq["Drug Name"] = df_drug_freq["Drug Name"].apply(_count_list_values)
-    df_drug_freq["Drug cost total"] = df_drug_freq.apply(lambda x: _sum_list_values(x), axis=1)
-
-    df1_unique = _drop_duplicate_treatments(filtered_df, True)
-    df_drugs = (
-        df1_unique.groupby("UPID")
-        .agg({"Drug Name": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-    df_dates = (
-        df1_unique.groupby("UPID")
-        .agg({"Intervention Date": lambda x: list(x)})
-        .reset_index()
-        .set_index("UPID")
-    )
-
-    df_dates_unwrapped = pd.DataFrame(
-        df_dates["Intervention Date"].values.tolist(), index=df_dates.index
-    ).add_prefix("date_")
-    df_drugs_unwrapped = pd.DataFrame(
-        df_drugs["Drug Name"].values.tolist(), index=df_drugs.index
-    ).add_prefix("drug_")
-
-    start_dates = (
-        filtered_df[["UPIDTreatment", "Intervention Date"]]
-        .sort_values(by=["Intervention Date"], ascending=True)
-        .drop_duplicates(subset="UPIDTreatment")
-        .set_index("UPIDTreatment")
-    )
-    end_dates = (
-        filtered_df[["UPIDTreatment", "Intervention Date"]]
-        .sort_values(by=["Intervention Date"], ascending=False)
-        .drop_duplicates(subset="UPIDTreatment")
-        .set_index("UPIDTreatment")
-    )
-
-    df_drugs_unwrapped["start_dates"] = df_drugs_unwrapped.apply(
-        lambda x: _start_date_drug(start_dates, x), axis=1
-    )
-    df_start_dates_unwrapped = pd.DataFrame(
-        df_drugs_unwrapped["start_dates"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("start_date_")
-    df_drugs_unwrapped.drop(["start_dates"], inplace=True, axis=1)
-
-    df_drugs_unwrapped["end_dates"] = df_drugs_unwrapped.apply(
-        lambda x: _start_date_drug(end_dates, x), axis=1
-    )
-    df_end_dates_unwrapped_2 = pd.DataFrame(
-        df_drugs_unwrapped["end_dates"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("end_date_")
-    df_drugs_unwrapped.drop(["end_dates"], inplace=True, axis=1)
-
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_start_dates_unwrapped, left_index=True, right_index=True
-    )
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_end_dates_unwrapped_2, left_index=True, right_index=True
-    )
-
-    df_freq_for_merge = pd.DataFrame(
-        df_drug_freq["Drug Name"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("freq_")
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_freq_for_merge, left_index=True, right_index=True
-    )
-    df_drugs_unwrapped["frequency"] = df_drugs_unwrapped.apply(
-        lambda x: _drug_frequency_average(x), axis=1
-    )
-
-    df_spacing_unwrapped = pd.DataFrame(
-        df_drugs_unwrapped["frequency"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("spacing_")
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_spacing_unwrapped, left_index=True, right_index=True
-    )
-
-    df_cost_unwrapped = pd.DataFrame(
-        df_drug_freq["Drug cost total"].values.tolist(), index=df_drugs_unwrapped.index
-    ).add_prefix("total_cost_drug_")
-    df_drugs_unwrapped = pd.merge(
-        df_drugs_unwrapped, df_cost_unwrapped, left_index=True, right_index=True
-    )
-    df_drugs_unwrapped.drop(["frequency"], inplace=True, axis=1)
-
-    # Build hierarchy with indication_df instead of directory_df
-    ice_df = build_hierarchy(
-        patient_info,
-        date_df,
-        filtered_df,
-        org_codes,
-        indication_df,  # Use indication mapping instead of directory
-        total_costs,
-        df_drugs_unwrapped,
-    )
-
-    ice_df = prepare_chart_data(ice_df, minimum_num_patients)
-
-    return ice_df, final_title
@@ -1,331 +0,0 @@
-"""
-Statistical calculation functions for patient pathway analysis.
-
-This module contains functions for calculating:
- Drug frequency counts and averages
- Cost aggregations (total, per patient, per annum)
- Treatment duration calculations
- Dosing interval calculations
-
-These functions are extracted from the analysis pipeline to enable:
- Independent testing
- Reuse across different analysis contexts
- Clearer separation of concerns
-"""
-
-from itertools import groupby
-from typing import Optional
-
-import numpy as np
-import pandas as pd
-
-
-def count_consecutive_values(values: list) -> list[int]:
-    """
-    Count consecutive occurrences of each value in a sorted list.
-
-    Used to count how many times each drug was administered.
-
-    Args:
-        values: List of values (typically drug names)
-
-    Returns:
-        List of counts for each unique value in sorted order
-
-    Example:
-        >>> count_consecutive_values(['A', 'A', 'B', 'A'])
-        [3, 1]  # 'A' appears 3 times, 'B' appears 1 time (sorted)
-    """
-    return [len(list(group)) for key, group in groupby(sorted(values))]
-
-
-def calculate_drug_costs(drug_counts: list[int], prices: list[float]) -> list[float]:
-    """
-    Calculate total cost for each drug based on counts and prices.
-
-    Splits the price list based on drug administration counts and sums
-    each drug's portion.
-
-    Args:
-        drug_counts: List of administration counts per drug (from count_consecutive_values)
-        prices: List of individual administration prices (Price Actual values)
-
-    Returns:
-        List of total costs per drug
-
-    Example:
-        >>> calculate_drug_costs([3, 2], [100, 100, 100, 200, 200])
-        [300.0, 400.0]  # Drug 1: 3x$100 = $300, Drug 2: 2x$200 = $400
-    """
-    sum_list = []
-    cumulative = 0
-    for count in drug_counts:
-        drug_cost = sum(prices[cumulative:cumulative + count])
-        sum_list.append(float(drug_cost))
-        cumulative += count
-    return sum_list
-
-
-def calculate_dosing_frequency(
-    freq: int,
-    start_date: pd.Timestamp,
-    end_date: pd.Timestamp,
-) -> float:
-    """
-    Calculate average dosing interval in days.
-
-    Computes the average number of days between administrations.
-
-    Args:
-        freq: Number of administrations
-        start_date: First administration date
-        end_date: Last administration date
-
-    Returns:
-        Average days between administrations, or 0 if only one dose
-
-    Example:
-        >>> start = pd.Timestamp('2024-01-01')
-        >>> end = pd.Timestamp('2024-01-22')
-        >>> calculate_dosing_frequency(4, start, end)
-        7.0  # 21 days / (4-1) = 7 days between doses
-    """
-    if freq <= 1:
-        return 0.0
-
-    duration_days = (end_date - start_date) / np.timedelta64(1, "D")
-    if duration_days <= 0:
-        return 0.0
-
-    return duration_days / (freq - 1)
-
-
-def calculate_drug_frequency_row(row: pd.Series) -> list[float]:
-    """
-    Calculate average dosing frequency for each drug in a patient's treatment.
-
-    Used with DataFrame.apply() on rows containing drug_*, freq_*, start_date_*, end_date_* columns.
-
-    Args:
-        row: Series with drug names, frequencies, start dates, and end dates
-
-    Returns:
-        List of average dosing intervals (days) for each drug
-    """
-    drug_count = row.index.str.contains("drug_").sum()
-    frequencies = []
-
-    for d in range(drug_count):
-        freq_col = f"freq_{d}"
-        start_col = f"start_date_{d}"
-        end_col = f"end_date_{d}"
-
-        freq = row.get(freq_col, 0)
-        if freq is None or pd.isna(freq):
-            freq = 0
-        else:
-            freq = int(freq)
-
-        if freq > 1:
-            start_date = row.get(start_col)
-            end_date = row.get(end_col)
-
-            if pd.notna(start_date) and pd.notna(end_date):
-                interval = calculate_dosing_frequency(freq, start_date, end_date)
-            else:
-                interval = 0.0
-        else:
-            interval = 0.0
-
-        frequencies.append(interval)
-
-    return frequencies
-
-
-def calculate_cost_per_patient_per_annum(
-    total_cost: float,
-    days_treated: Optional[pd.Timedelta],
-) -> Optional[float]:
-    """
-    Calculate annualized cost per patient.
-
-    Normalizes costs to a per-year basis to enable comparison across
-    patients with different treatment durations.
-
-    Args:
-        total_cost: Total cost for the patient (can be Decimal or float)
-        days_treated: Treatment duration as timedelta
-
-    Returns:
-        Annualized cost, or None if days_treated is 0 or None
-
-    Example:
-        >>> calculate_cost_per_patient_per_annum(5000, pd.Timedelta(days=182.5))
-        10000.0  # Half year treatment, so annual cost is 2x
-    """
-    if days_treated is None or pd.isna(days_treated):
-        return None
-
-    days = days_treated / np.timedelta64(1, "D") if hasattr(days_treated, '__truediv__') else float(days_treated)
-
-    if days <= 0:
-        return None
-
-    # Convert total_cost to float to handle Decimal from Snowflake
-    return float(total_cost) / (days / 365)
-
-
-def calculate_treatment_duration(
-    first_seen: pd.Timestamp,
-    last_seen: pd.Timestamp,
-) -> pd.Timedelta:
-    """
-    Calculate treatment duration from first to last seen dates.
-
-    Args:
-        first_seen: Date of first treatment
-        last_seen: Date of last treatment
-
-    Returns:
-        Duration as timedelta
-    """
-    return last_seen - first_seen
-
-
-def calculate_pathway_proportion(value: int, parent_value: int) -> float:
-    """
-    Calculate proportion of parent value for color scaling.
-
-    Used to determine color intensity in the icicle chart based on
-    what proportion of the parent category this pathway represents.
-
-    Args:
-        value: Patient count for this pathway
-        parent_value: Total patient count for the parent category
-
-    Returns:
-        Proportion (0.0 to 1.0)
-    """
-    if parent_value <= 0:
-        return 0.0
-    return value / parent_value
-
-
-def aggregate_patient_costs(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Calculate total cost per patient (UPID).
-
-    Args:
-        df: DataFrame with UPID and Price Actual columns
-
-    Returns:
-        DataFrame indexed by UPID with Total cost column
-    """
-    cost_df = df[["UPID", "Price Actual"]]
-    total_costs = cost_df.groupby("UPID").sum()
-    total_costs.rename(columns={"Price Actual": "Total cost"}, inplace=True)
-    return total_costs
-
-
-def aggregate_drug_frequencies(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Calculate drug administration frequency per patient.
-
-    Groups by UPID and returns counts of each drug's administrations.
-
-    Args:
-        df: DataFrame with UPID and Drug Name columns
-
-    Returns:
-        DataFrame indexed by UPID with Drug Name as list of counts
-    """
-    return (
-        df.groupby("UPID")
-        .agg({"Drug Name": lambda x: count_consecutive_values(list(x))})
-        .reset_index()
-        .set_index("UPID")
-    )
-
-
-def calculate_average_spacing_for_pathway(
-    upid_drugs_df: pd.DataFrame,
-    pathway_value: str,
-) -> list[float]:
-    """
-    Calculate average dosing spacing for a treatment pathway.
-
-    Groups patients by pathway and calculates mean spacing for each drug position.
-
-    Args:
-        upid_drugs_df: DataFrame with patient pathway data and spacing columns
-        pathway_value: Pathway identifier string
-
-    Returns:
-        List of average spacing values (days) for each drug in pathway
-    """
-    spacing_cols = [col for col in upid_drugs_df.columns if col.startswith("spacing_")]
-
-    pathway_data = upid_drugs_df[upid_drugs_df["value"] == pathway_value]
-
-    if len(pathway_data) == 0:
-        return []
-
-    averages = pathway_data[spacing_cols].mean()
-    return [round(v, 0) if pd.notna(v) else 0.0 for v in averages.tolist()]
-
-
-def format_treatment_statistics(
-    drug_names: list[str],
-    average_administered: list[float],
-    average_spacing: list[float],
-    average_cost: list[float],
-) -> str:
-    """
-    Format drug treatment statistics into a readable string for chart display.
-
-    Creates an HTML-formatted string with drug name, average administrations,
-    dosing interval, and total treatment length.
-
-    Args:
-        drug_names: List of drug names in treatment sequence
-        average_administered: Average number of administrations per drug
-        average_spacing: Average days between doses per drug
-        average_cost: Average cost per drug
-
-    Returns:
-        HTML-formatted string for chart hover text
-    """
-    ret_string = ""
-
-    for i, drug_name in enumerate(drug_names):
-        admin_count = average_administered[i] if i < len(average_administered) else 0
-        spacing_days = average_spacing[i] if i < len(average_spacing) else 0
-
-        # Convert to weeks
-        spacing_weeks = spacing_days / 7 if spacing_days > 0 else 0
-        total_weeks = spacing_weeks * admin_count if admin_count > 0 else 0
-
-        string = (
-            f"<br><b>{drug_name}</b><br>On average given "
-            f"{round(admin_count, 1)} times with a "
-            f"{round(spacing_weeks, 1)} weekly interval ("
-            f"{round(total_weeks, 0)} weeks total treatment length)"
-        )
-        ret_string += string
-
-    return ret_string
-
-
-def remove_nan_values(values: list) -> list:
-    """
-    Remove NaN string values from a list.
-
-    Used to clean up aggregated statistics that may contain 'nan' strings.
-
-    Args:
-        values: List potentially containing 'nan' strings
-
-    Returns:
-        Filtered list without 'nan' strings
-    """
-    return [x for x in values if str(x).lower() != "nan"]