chore: archive unused files and move legacy code to can_delete

archive/ — unused reference files (no active code references): - LookupSearchTermCleanedDrugName.csv, condition_directorate_mapping.csv - na_directory_rows.csv (diagnostic output), ta-recommendations.xlsx - snomed_indication_mapping_query.sql (source for embedded SQL) - IMPROVEMENT_RECOMMENDATIONS.md, power query.pq archive/can_delete/ — legacy code and logs safe to remove: - dashboard_gui.py (replaced by Reflex app) - pathways_app_old.py.bak (old backup) - Ralph loop iteration logs (iterations 2-8)
2026-02-06 01:01:02 +00:00
parent a31907aa1f
commit bb93c1673e
23 changed files with 121509 additions and 0 deletions
@@ -0,0 +1,647 @@
+import webbrowser
+from itertools import groupby
+import os
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+
+from core import AnalysisFilters, PathConfig, default_paths
+from core.logging_config import get_logger
+from tools import data
+
+# Import refactored analysis functions
+from analysis.pathway_analyzer import (
+    generate_icicle_chart as _generate_icicle_chart,
+    prepare_data as _prepare_data,
+    calculate_statistics as _calculate_statistics,
+    build_hierarchy as _build_hierarchy,
+    prepare_chart_data as _prepare_chart_data,
+)
+
+# Import visualization functions
+from visualization.plotly_generator import (
+    create_icicle_figure as _create_icicle_figure,
+    save_figure_html as _save_figure_html,
+    figure_legacy as _figure_legacy,
+)
+
+logger = get_logger(__name__)
+
+pd.options.mode.chained_assignment = None  # default='warn'
+def human_format(num):
+    num = float('{:.3g}'.format(num))
+    magnitude = 0
+    while abs(num) >= 1000:
+        magnitude += 1
+        num /= 1000.0
+    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'), ['', 'K', 'M', 'B', 'T'][magnitude])
+
+def main(dir, paths: Optional[PathConfig] = None):
+    """
+    Load and process patient intervention data from a file.
+
+    Uses the FileDataLoader abstraction to handle CSV/Parquet file loading
+    with all necessary transformations (patient_id, drug_names, department_identification).
+
+    Args:
+        dir: Path to CSV or Parquet file
+        paths: PathConfig for reference data locations (uses default_paths if None)
+
+    Returns:
+        DataFrame with processed patient intervention data
+    """
+    from data_processing.loader import FileDataLoader
+
+    if paths is None:
+        paths = default_paths
+
+    loader = FileDataLoader(file_path=dir, paths=paths)
+    result = loader.load()
+
+    logger.info("Initial data processing complete.")
+    return result.df
+
+
+def drop_duplicate_treatments(df, ascending):
+    df.sort_values(by=['Intervention Date'], ascending=ascending, inplace=True)
+    df_treatment_steps = df.drop_duplicates(subset="UPIDTreatment", keep="first")
+    if not ascending:
+        df_treatment_steps.sort_values(by=['Intervention Date'], ascending=True, inplace=True)
+    return df_treatment_steps
+
+
+def row_function(row):
+    ids = ""
+    parents = "N&WICS"
+    count = row.count()
+    for c in range(count):
+        v = row[c]
+        if type(v) != str:
+            v = row[c + 1]
+        if c == count - 1:
+            ids = parents + " - " + v
+            continue
+        parents += " - " + v
+    label = row[count - 1]
+    value = parents + "," + label + "," + ids
+    return value
+
+
+def count_list_values(x):
+    return [len(list(group)) for key, group in groupby(sorted(x))]
+
+
+def sum_list_values(x):
+    sum_list = []
+    for count in range(len(x["Drug Name"])):
+        if count == 0:
+            sum_list.append(sum(x["Price Actual"][ : x["Drug Name"][count]]))
+        else:
+            sum_list.append(sum(x["Price Actual"][x["Drug Name"][count-1] : (x["Drug Name"][count-1] + x["Drug Name"][count])]))
+    return sum_list
+
+
+def remove_nan_string(y):
+    return [x for x in y if str(x) != 'nan']
+
+
+def min_max_treatment_dates(ice_df, row):
+    ids = row[2]
+    min_max = ice_df[ice_df["ids"].str.contains(ids)]
+    min_date = str(min_max["First seen"].min().strftime('%Y-%m-%d'))
+    max_date = str(min_max["Last seen"].max().strftime('%Y-%m-%d'))
+    return min_date + ',' + max_date
+
+
+def start_date_drug(df, x):
+    drug_count = x.notnull().sum()
+    date_string = []
+    for d in range(drug_count):
+        UPID_date_var = str(x.name) + str(x[d])
+        date = df.loc[UPID_date_var, "Intervention Date"]
+        date_string.append(date)
+    return date_string
+
+
+def end_date_drug(df, x):
+    drug_count = x.notnull().sum()
+    date_string = []
+    # Need to -1 from drug count as start date gets counted from notnull above
+    for d in range(drug_count - 1):
+        UPID_date_var = str(x.name) + str(x[d])
+        date = df.loc[UPID_date_var, "Intervention Date"]
+        date_string.append(date)
+    return date_string
+
+
+def list_to_string(x):
+    list = x.ids.split(' - ')
+    drug_list = list[len(list) - len(x.average_cost):]
+    ret_string = ""
+    for y in range(len(x.average_cost)):
+        if (round(x.average_spacing[y], 0) > 1) and (round(x.average_administered[y], 0) > 2.5) and (int(x.value) > 0):
+            string = "<br><b>" + str(drug_list[y]) + "</b><br>On average given " + str(
+                round(x.average_administered[y], 1)) + \
+                     " times with a " + str(round(int(x.average_spacing[y]) / 7, 1)) + " weekly interval (" \
+                     + str(round((int(x.average_spacing[y]) / 7) * round(x.average_administered[y], 1),
+                                 0)) + " weeks total treatment length)" 
+                     #"<br>Average annual cost per annum:" + \
+                     #str(human_format(
+                     #    (x.cost / x.value) / (((int(x.average_spacing[y]) / 7) * round(x.average_administered[y], 1))/ 52)))
+        else:
+            string = "<br><b>" + str(drug_list[y]) + "</b><br>On average given " + str(
+                round(x.average_administered[y], 1)) + \
+                     " times with a " + str(round(int(x.average_spacing[y]) / 7, 1)) + " weekly interval (" \
+                     + str(round((int(x.average_spacing[y]) / 7) * round(x.average_administered[y], 1),
+                                 0)) + " weeks total treatment length)" 
+                     #"<br>Average annual cost per annum unavailable"
+
+        ret_string += string
+
+    return ret_string
+
+
+def drug_frequency_average(x):
+    drug_count = x.index.str.contains("drug_").sum()
+    freq = []
+    for d in range(drug_count):
+        if x["freq_" + str(d)] > 1:
+            duration = ((x["end_date_" + str(d)] - x["start_date_" + str(d)]) / np.timedelta64(1, 'D'))
+            if duration > 0:
+                freq_calc = duration / (x["freq_" + str(d)] - 1)
+            else:
+                freq_calc = 0
+        else:
+            freq_calc = 0
+        freq.append(freq_calc)
+    return freq
+
+
+def cost_pp_pa(x):
+    if x["avg_days"]/ np.timedelta64(1, 'D') > 0:
+        return str(round(x["costpp"] / ((x["avg_days"] / np.timedelta64(1, 'D')) / 365), 2))
+    else:
+        return "N/A"
+
+
+def generate_graph(
+    df1,
+    start_date=None,
+    end_date=None,
+    last_seen=None,
+    save_dir=None,
+    trustFilter=None,
+    drugFilter=None,
+    directorateFilter=None,
+    title=None,
+    minimum_num_patients=None,
+    *,
+    filters: Optional[AnalysisFilters] = None,
+    paths: Optional[PathConfig] = None,
+):
+    """
+    Generate patient pathway icicle chart.
+
+    This function can be called in two ways:
+    1. New style: Pass filters=AnalysisFilters(...) with all parameters encapsulated
+    2. Legacy style: Pass individual parameters (start_date, end_date, etc.)
+
+    If both are provided, the filters object takes precedence.
+
+    Args:
+        df1: DataFrame with processed patient data
+        filters: AnalysisFilters object with all filter parameters (preferred)
+        paths: PathConfig object for file paths (optional, uses default_paths if not provided)
+
+        Legacy parameters (used if filters is None):
+        start_date, end_date, last_seen, save_dir, trustFilter, drugFilter,
+        directorateFilter, title, minimum_num_patients
+    """
+    # Use PathConfig for file paths
+    if paths is None:
+        paths = default_paths
+
+    # Extract parameters from AnalysisFilters if provided
+    if filters is not None:
+        start_date = filters.start_date
+        end_date = filters.end_date
+        last_seen = filters.last_seen_date
+        save_dir = filters.output_dir
+        trustFilter = filters.trusts
+        drugFilter = filters.drugs
+        directorateFilter = filters.directories
+        title = filters.custom_title
+        minimum_num_patients = filters.minimum_patients
+
+    df1["UPIDTreatment"] = df1["UPID"] + df1["Drug Name"]
+
+    # Get average number of doses count
+    org_codes = pd.read_csv(paths.org_codes_csv, index_col=1)
+    df1["Provider Code"] = df1["Provider Code"].map(org_codes["Name"])
+    #df1.to_csv("./df1.csv", index=False)
+
+    df1 = df1[(df1["Provider Code"].isin(trustFilter)) & (df1["Drug Name"].isin(drugFilter)) & (df1["Directory"].isin(directorateFilter))]
+
+    if len(df1) == 0:
+        logger.warning("No data found for selected filters.")
+        return
+
+    # Find total cost for each patient - Total cost is ~£110Mil, about 30% is unattributable to a patient (no UPID)
+    cost_df = df1[["UPID", "Price Actual"]]
+    total_costs = pd.DataFrame(cost_df.groupby("UPID").sum())
+    total_costs.rename(columns={"Price Actual": "Total cost"}, inplace=True)
+
+    # Series to map directory
+    directory_df = df1[["UPID", "Directory"]]
+    directory_df.drop_duplicates("UPID", inplace=True)
+    directory_df.set_index("UPID", inplace=True)
+    logger.info("Filtering unrelated interventions")
+
+    df_end_dates = drop_duplicate_treatments(df1, False)
+    df1_unique = drop_duplicate_treatments(df1, True)
+    logger.info("Identifying unique patients and interventions used")
+    # Create list of total number of that drug for each patient
+    df_drug_freq = df1.groupby("UPID").agg({"Drug Name": lambda x: list(x)}).reset_index().set_index("UPID")
+    df_drug_cost = df1.groupby("UPID").agg({"Price Actual": lambda x: list(x)}).reset_index().set_index("UPID")
+    df_drug_freq["Price Actual"] = df_drug_freq.index.map(df_drug_cost["Price Actual"])
+    #df_drug_freq["Price Actual"] = df_drug_freq["Price Actual"].map(df_drug_cost)
+    df_drug_freq["Drug Name"] = df_drug_freq["Drug Name"].apply(count_list_values)
+    df_drug_freq["Drug cost total"] = df_drug_freq.apply(lambda x: sum_list_values(x), axis=1)
+
+
+    # Aggregate interventions & dates of interventions into transposed list by UPID
+    df_drugs = df1_unique.groupby("UPID").agg({"Drug Name": lambda x: list(x)}).reset_index().set_index("UPID")
+    df_dates = df1_unique.groupby("UPID").agg({"Intervention Date": lambda x: list(x)}).reset_index().set_index("UPID")
+    df_end_dates = df_end_dates.groupby("UPID").agg({"Intervention Date": lambda x: list(x)}).reset_index().set_index("UPID")
+
+    logger.info("Calculating each unique patient's intervention average frequency, cost and duration of each intervention")
+    # The following sh*t show is to unwrap the lists into columns for different drugs, start/end dates, and average
+    # frequency/average total injections of each one
+    df_dates_unwrapped = pd.DataFrame(df_dates["Intervention Date"].values.tolist(), index=df_dates.index).add_prefix(
+        'date_')
+    df_end_dates_unwrapped = pd.DataFrame(df_end_dates["Intervention Date"].values.tolist(), index=df_end_dates.index).add_prefix(
+        'date_end_')
+    df_drugs_unwrapped = pd.DataFrame(df_drugs["Drug Name"].values.tolist(), index=df_drugs.index).add_prefix('drug_')
+
+    df_freq_unwrapped = pd.DataFrame(df_drug_freq["Drug Name"].values.tolist(), index=df_drug_freq.index).add_prefix(
+        'freq_')
+    start_dates = df1[["UPIDTreatment", "Intervention Date"]].sort_values(by=["Intervention Date"], ascending=True,
+                                                                               inplace=False,
+                                                                               ignore_index=True).drop_duplicates(
+        subset="UPIDTreatment").set_index("UPIDTreatment")
+    end_dates = df1[["UPIDTreatment", "Intervention Date"]].sort_values(by=["Intervention Date"], ascending=False,
+                                                                             inplace=False,
+                                                                             ignore_index=True).drop_duplicates(
+        subset="UPIDTreatment").set_index("UPIDTreatment")
+
+
+
+    df_drugs_unwrapped["start_dates"] = df_drugs_unwrapped.apply(lambda x: start_date_drug(start_dates, x), axis=1)
+
+    df_ddrugs_unwrapped = pd.DataFrame(df_drugs_unwrapped["start_dates"].values.tolist(),
+                                       index=df_drugs_unwrapped.index).add_prefix(
+        'start_date_')
+    df_drugs_unwrapped.drop(["start_dates"], inplace=True, axis=1)
+    df_drugs_unwrapped["end_dates"] = df_drugs_unwrapped.apply(lambda x: start_date_drug(end_dates, x), axis=1)
+    df_dddrugs_unwrapped = pd.DataFrame(df_drugs_unwrapped["end_dates"].values.tolist(),
+                                       index=df_drugs_unwrapped.index).add_prefix(
+        'end_date_')
+
+    df_drugs_unwrapped.drop(["end_dates"], inplace=True, axis=1)
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_ddrugs_unwrapped, left_index=True, right_index=True)
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_dddrugs_unwrapped, left_index=True, right_index=True)
+    df_dddddrugs_unwrapped = pd.DataFrame(df_drug_freq["Drug Name"].values.tolist(),
+                                          index=df_drugs_unwrapped.index).add_prefix(
+        'freq_')
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_dddddrugs_unwrapped, left_index=True, right_index=True)
+    df_drugs_unwrapped["frequency"] = df_drugs_unwrapped.apply(lambda x: drug_frequency_average(x), axis=1)
+
+    df_ddddddrugs_unwrapped = pd.DataFrame(df_drugs_unwrapped["frequency"].values.tolist(),
+                                           index=df_drugs_unwrapped.index).add_prefix(
+        'spacing_')
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_ddddddrugs_unwrapped, left_index=True, right_index=True)
+    df_dddddddrugs_unwrapped = pd.DataFrame(df_drug_freq["Drug cost total"].values.tolist(),
+                                           index=df_drugs_unwrapped.index).add_prefix('total_cost_drug_')
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_dddddddrugs_unwrapped, left_index=True, right_index=True)
+    df_drugs_unwrapped.drop(["frequency"], inplace=True, axis=1)
+
+    # Insert first & last date seen into df (need to add last date seen)
+    df_drugs_unwrapped.insert(0, "First seen", df_dates_unwrapped.min(axis=1))
+    df_drugs_unwrapped.insert(1, "Last seen", df_end_dates_unwrapped.max(axis=1))
+
+    # Merge info from activity data with grouped info, and total cost info
+    patient_info = df1.drop_duplicates(subset="UPID", keep="first").set_index("UPID")
+    patient_info = pd.merge(patient_info, df_drugs_unwrapped, left_index=True, right_index=True)
+    patient_info = pd.merge(patient_info, df_freq_unwrapped, left_index=True, right_index=True)
+    patient_info = pd.merge(patient_info, total_costs, left_index=True, right_index=True)
+
+    #patient_info.to_csv("patient_info.csv", index=False)
+
+    # Filter initiation based on years provided
+    patient_info = patient_info[(patient_info['First seen'] >= str(start_date)) & (
+                patient_info['First seen'] < str(end_date))]
+    if title == "":
+        title = "Patients initiated from " + str(start_date) + " to " + str(end_date)
+
+    # Filter last seen based on date provided
+    patient_info = patient_info[patient_info['Last seen'] > str(last_seen)]
+
+    # Remove patients with 0 drug, by filling blanks with NaN & dropping rows
+    patient_info.drug_0.replace('N/A', np.nan, inplace=True)
+    patient_info.dropna(subset=['drug_0'], inplace=True)
+
+    # Calculate duation of treatment
+    patient_info['Days treated'] = patient_info["Last seen"] - patient_info["First seen"]
+    date_df = patient_info[["First seen", "Last seen", 'Days treated']]
+
+    # Create df for ice chart with hierarchy of plot
+    number_of_drugs = np.count_nonzero(patient_info.columns.str.startswith('drug_'))
+    final_drug_index = patient_info.columns.to_list().index("drug_" + str(number_of_drugs - 1))
+
+    upid_drugs_df = patient_info.iloc[:, (final_drug_index - number_of_drugs + 1):final_drug_index + 1]
+
+    upid_drugs_df.insert(0, "Trust", upid_drugs_df.index.str[:3])
+    upid_drugs_df.insert(1, "Directory", upid_drugs_df.index)
+
+    upid_drugs_df["Trust"] = upid_drugs_df["Trust"].map(org_codes["Name"])
+    upid_drugs_df["Directory"] = upid_drugs_df["Directory"].map(directory_df["Directory"])
+
+    l_df = pd.DataFrame()
+    ice_df2 = pd.DataFrame()
+    ice_df = pd.DataFrame()
+
+    upid_drugs_df["value"] = upid_drugs_df.apply(lambda x: row_function(x), axis=1)
+    # Merge in date info
+    upid_drugs_df = pd.merge(upid_drugs_df, date_df, left_index=True, right_index=True)
+
+    upid_drugs_df["ids"] = upid_drugs_df["value"].str.split(',').str[2]
+    avg_treatment_dfs = pd.DataFrame(upid_drugs_df.groupby("ids", as_index=False)["Days treated"].mean()).set_index("ids")
+    value_dfs = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False).size()).reset_index()
+    first_seen_treatment_dfs = pd.DataFrame(upid_drugs_df.groupby("ids", as_index=False)["First seen"].min()).set_index(
+        "ids")
+    last_seen_treatment_dfs = pd.DataFrame(upid_drugs_df.groupby("ids", as_index=False)["Last seen"].max()).set_index(
+        "ids")
+
+    # Calculate total cost for parents
+    upid_drugs_df["Cost"] = upid_drugs_df.index.map(total_costs["Total cost"])
+    cost_dfs = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False)['Cost'].sum()).set_index("value", drop=True)
+
+    # Calculate average dosing for each drug
+    upid_drugs_df = pd.merge(upid_drugs_df, df_drugs_unwrapped, left_index=True, right_index=True)
+    # frequency_dfs = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False)['Cost'].sum()).set_index("value", drop=True)
+
+    # Calculate average spacing between drugs
+    spacing_average = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False)[
+                                       [col for col in upid_drugs_df.columns if 'spacing_' in col]].mean()).set_index(
+        "value", drop=True)
+    spacing_average = spacing_average.round()
+    spacing_average['combined'] = spacing_average.values.tolist()
+    spacing_average["ids"] = spacing_average.index
+    spacing_average["ids"] = spacing_average["ids"].str.split(',').str[2]
+    spacing_average.set_index("ids", inplace=True)
+
+    # Calculate average cost for each drug
+    cost_average = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False)[
+                                       [col for col in upid_drugs_df.columns if 'total_cost_drug_' in col]].mean()).set_index(
+        "value", drop=True)
+    cost_average = cost_average.round(2)
+    cost_average['combined'] = cost_average.values.tolist()
+    cost_average["ids"] = cost_average.index
+    cost_average["ids"] = cost_average["ids"].str.split(',').str[2]
+    cost_average.set_index("ids", inplace=True)
+
+
+    # Calculate average number of doses
+    freq_average = pd.DataFrame(upid_drugs_df.groupby("ids", as_index=False)[
+                                    [col for col in upid_drugs_df.columns if 'freq_' in col]].mean()).set_index("ids",
+                                                                                                                drop=True)
+    # freq_average = freq_average.round()
+    freq_average['combined'] = freq_average.values.tolist()
+
+    # Remove negative totals from "Cost" column
+    num = cost_dfs._get_numeric_data()
+    num[num < 0] = 0
+
+    value_dfs["Cost"] = value_dfs["value"].map(cost_dfs["Cost"])
+
+    ice_df[['parents', 'labels', 'ids']] = value_dfs["value"].str.split(',', expand=True)
+    # ice_df["index"] = ice_df.ids
+    # ice_df.set_index("index", inplace=True)
+
+    ice_df["average_administered"] = ice_df["ids"].map(freq_average["combined"])
+    ice_df["cost"] = value_dfs["Cost"]
+    ice_df["value"] = value_dfs["size"]
+
+    ice_df["average_cost"] = ice_df["ids"].map(cost_average["combined"])
+    ice_df["average_cost"] = ice_df["average_cost"].apply(remove_nan_string)
+
+    ice_df["average_spacing"] = ice_df["ids"].map(spacing_average["combined"])
+    ice_df["average_spacing"] = ice_df["average_spacing"].apply(remove_nan_string)
+    ice_df["average_spacing"] = ice_df.apply(lambda x: list_to_string(x), axis=1)
+    ice_df["average_spacing"] = ice_df["average_spacing"].str.replace("nan", "N/A")
+
+
+    logger.info("Building graph dataframe structure.")
+    # Add very top level of Trust
+    new_row = pd.DataFrame({'parents': '', 'ids': "N&WICS", 'labels': 'N&WICS', 'value': 0, "cost": 0}, index=[0])
+    ice_df = pd.concat(objs=[ice_df, new_row], ignore_index=True, axis=0)
+
+    # need to add parents as blocks...
+    l3 = [x for x in ice_df.parents.unique() if x not in ice_df.ids]
+    while len(l3) > 1:
+        for l in l3:
+            z = l.rfind("-")
+            if z > 0:
+                l_dict = {"parents": l[:z - 1], "ids": l, "value": 0, "labels": l[z + 2:], "cost": 0}
+                l_df = pd.concat([l_df, pd.DataFrame(l_dict, index=[0])], ignore_index=True)
+        ice_df2 = pd.concat([ice_df, l_df], ignore_index=True)
+        l3 = [x for x in ice_df2.parents.unique() if x not in ice_df2.ids.unique()]
+    ice_df = ice_df2.drop_duplicates("ids")
+
+    ice_df["level"] = ice_df["ids"].str.count('-')
+    ice_df = ice_df[~ice_df['labels'].isin(["COST", "CHARGE", "N/A"])]
+    ice_df.sort_values(by=["level"], ascending=False, inplace=True, ignore_index=True)
+
+    for index, row in ice_df.iterrows():
+        lookup_index = ice_df.index[ice_df['ids'] == row['parents']]
+        ice_df.loc[lookup_index, 'value'] = ice_df.loc[lookup_index, "value"] + ice_df.loc[index, "value"]
+        ice_df.loc[lookup_index, 'cost'] = ice_df.loc[lookup_index, "cost"] + ice_df.loc[index, 'cost']
+
+    # Sum of parent values to create denominator for percentage - FOR PATIENT NUMBER COLOUR GRADING
+    colour_df = pd.DataFrame(ice_df.groupby(["parents"])["value"].sum())
+    ice_df['colour'] = ice_df["parents"].map(colour_df["value"])
+    ice_df['colour'] = ice_df['value']/ice_df['colour']
+
+    # Sum of parent values to create denominator for percentage - FOR COST COLOUR GRADING
+    #colour_df = pd.DataFrame(ice_df.groupby(["parents"])["cost"].sum())
+    #ice_df['colour'] = ice_df["parents"].map(colour_df["cost"])
+    #ice_df['colour'] = ice_df['cost'] / ice_df['colour']
+
+
+    ice_df['costpp'] = ice_df['cost'] / ice_df['value']
+    # Treatment length info
+    ice_df['avg_days'] = ice_df["ids"].map(avg_treatment_dfs["Days treated"])
+    ice_df['First seen'] = ice_df["ids"].map(first_seen_treatment_dfs["First seen"])
+    ice_df['Last seen'] = ice_df["ids"].map(last_seen_treatment_dfs["Last seen"])
+
+    ice_df["dates"] = ice_df.apply(lambda x: min_max_treatment_dates(ice_df, x), axis=1)
+    ice_df[['First seen (Parent)', 'Last seen (Parent)']] = ice_df["dates"].str.split(',', expand=True)
+
+    # Sort labels to be alphabetical
+    # ice_df.sort_values(by=["labels"], ascending=True, inplace=True, ignore_index=True)
+    ice_df['First seen'] = pd.to_datetime(ice_df['First seen'])
+    ice_df['Last seen'] = pd.to_datetime(ice_df['Last seen'])
+    ice_df["cost_pp_pa"] = ice_df.apply(lambda x: cost_pp_pa(x), axis=1)
+
+    # Filter out rows where value is less than minimum number of patients
+    ice_df = ice_df[ice_df['value'] >= minimum_num_patients]
+
+    logger.info("Generating graph.")
+
+    figure(ice_df, title, save_dir)
+    return
+
+
+def figure(ice_df4, dir_string, save_dir):
+    """
+    Create and display icicle figure (legacy interface).
+
+    This function delegates to visualization.plotly_generator.figure_legacy()
+    for backward compatibility.
+
+    Args:
+        ice_df4: DataFrame with chart data
+        dir_string: Title string (used for filename and chart title)
+        save_dir: Directory to save the HTML file
+    """
+    _figure_legacy(ice_df4, dir_string, save_dir)
+    return
+
+
+# fig = go.Figure(go.Icicle(
+#         labels=ice_df4.labels,
+#         ids=ice_df4.ids,
+#         # count="branches",
+#         parents=ice_df4.parents,
+#         customdata=np.stack((ice_df4.value, ice_df4.colour, ice_df4.cost, ice_df4.costpp, first_seen, last_seen,
+#                              first_seen_parent, last_seen_parent, average_spacing, ice_df4.cost_pp_pa), axis=1),
+#         values=ice_df4.value,
+#         branchvalues="total",
+#         marker=dict(
+#             colors=ice_df4.colour,
+#             colorscale='Viridis'),
+#         maxdepth=3,
+#         texttemplate='<b>%{label}</b> '
+#                       '<br><b>Total patients:</b> %{customdata[0]} - %{customdata[1]:.3p} of patients in level'
+#                       '<br><b>Total cost:</b> £%{customdata[2]:.3~s}'
+#                       '<br><b>Average cost per patient:</b> £%{customdata[3]:.3~s}'
+#                       '<br><b>Average cost per patient per annum:</b> £%{customdata[9]:.3~s}',
+#         hovertemplate='<b>%{label}</b>'
+#                       '<br><b>Total patients:</b> %{customdata[0]} - %{customdata[1]:.3p} of patients in level'
+#                       '<br><b>Total cost:</b> £%{customdata[2]:.3~s}'
+#                       '<br><b>Average cost per patient:</b> £%{customdata[3]:.3~s}'
+#                       '<br><b>Average cost per patient per annum:</b> £%{customdata[9]:.3~s}'
+#                       '<br><b>First seen:</b> %{customdata[4]}'
+#                       '<br><b>Last seen (including further treatments):</b> %{customdata[7]}'
+#                       '<br><b>Average treatment duration:</b>'
+#                       '%{customdata[8]}'
+#                       '<extra></extra>',
+#     ))
+#
+#import os 
+#def main():
+#    input = "ice_df.csv"
+#    save_dir = os.path.dirname(os.path.abspath(__file__))
+#    dir = "debugging"
+#    ice_df4 = pd.read_csv(input)
+#    
+#    ice_df4['First seen'] = pd.to_datetime(ice_df4['First seen'])
+#    ice_df4['avg_days'] = pd.to_timedelta(ice_df4['avg_days'])
+#    ice_df4['Last seen'] = pd.to_datetime(ice_df4['Last seen'])
+#    figure(ice_df4, dir, save_dir)
+#
+#if __name__ == "__main__":
+#    main()
+
+
+def generate_graph_v2(
+    df: pd.DataFrame,
+    start_date: str,
+    end_date: str,
+    last_seen_date: str,
+    save_dir: str,
+    trust_filter: list[str],
+    drug_filter: list[str],
+    directory_filter: list[str],
+    minimum_num_patients: int = 0,
+    title: str = "",
+    paths: Optional[PathConfig] = None,
+) -> Optional[go.Figure]:
+    """
+    Generate patient pathway icicle chart using refactored pipeline.
+
+    This is the modern API that uses the refactored analysis functions.
+    It provides cleaner parameter names and returns the figure instead of
+    automatically opening it in a browser.
+
+    Args:
+        df: DataFrame with processed patient intervention data
+        start_date: Start date for patient initiation filter (YYYY-MM-DD)
+        end_date: End date for patient initiation filter (YYYY-MM-DD)
+        last_seen_date: Filter for patients last seen after this date
+        save_dir: Directory to save the HTML file
+        trust_filter: List of trust names to include
+        drug_filter: List of drug names to include
+        directory_filter: List of directories to include
+        minimum_num_patients: Minimum number of patients to include a pathway
+        title: Chart title (auto-generated from dates if empty)
+        paths: PathConfig for file paths (uses default if None)
+
+    Returns:
+        Plotly Figure object, or None if no data
+    """
+    if paths is None:
+        paths = default_paths
+
+    ice_df, final_title = _generate_icicle_chart(
+        df=df,
+        start_date=start_date,
+        end_date=end_date,
+        last_seen_date=last_seen_date,
+        trust_filter=trust_filter,
+        drug_filter=drug_filter,
+        directory_filter=directory_filter,
+        minimum_num_patients=minimum_num_patients,
+        title=title,
+        paths=paths,
+    )
+
+    if ice_df is None or len(ice_df) == 0:
+        return None
+
+    fig = create_icicle_figure(ice_df, final_title)
+
+    if save_dir:
+        fig.write_html(f"{save_dir}/{final_title}.html")
+        logger.info(f"Success! File saved to {save_dir}/{final_title}.html")
+
+    return fig
+
+
+def create_icicle_figure(ice_df: pd.DataFrame, title: str) -> go.Figure:
+    """
+    Create Plotly icicle figure from prepared DataFrame.
+
+    This function delegates to visualization.plotly_generator.create_icicle_figure()
+    for the actual figure generation.
+
+    Args:
+        ice_df: DataFrame with parents, ids, labels, value, colour etc.
+        title: Chart title
+
+    Returns:
+        Plotly Figure object
+    """
+    return _create_icicle_figure(ice_df, title)