Initial commit before Ralph loop

2026-02-04 13:04:29 +00:00
commit fdd33a67af
89 changed files with 20660 additions and 0 deletions
@@ -0,0 +1,647 @@
+import webbrowser
+from itertools import groupby
+import os
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+
+from core import AnalysisFilters, PathConfig, default_paths
+from core.logging_config import get_logger
+from tools import data
+
+# Import refactored analysis functions
+from analysis.pathway_analyzer import (
+    generate_icicle_chart as _generate_icicle_chart,
+    prepare_data as _prepare_data,
+    calculate_statistics as _calculate_statistics,
+    build_hierarchy as _build_hierarchy,
+    prepare_chart_data as _prepare_chart_data,
+)
+
+# Import visualization functions
+from visualization.plotly_generator import (
+    create_icicle_figure as _create_icicle_figure,
+    save_figure_html as _save_figure_html,
+    figure_legacy as _figure_legacy,
+)
+
+# Module-level logger obtained from the project's logging helper, keyed by this module's name.
+logger = get_logger(__name__)
+
+# Turn off pandas' chained-assignment (SettingWithCopy) warning. This is a
+# process-wide pandas option, so it affects every module once this one is imported.
+pd.options.mode.chained_assignment = None  # default='warn'
+def human_format(num):
+    num = float('{:.3g}'.format(num))
+    magnitude = 0
+    while abs(num) >= 1000:
+        magnitude += 1
+        num /= 1000.0
+    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'), ['', 'K', 'M', 'B', 'T'][magnitude])
+
+def main(dir, paths: Optional[PathConfig] = None):
+    """
+    Load and process patient intervention data from a file.
+
+    Uses the FileDataLoader abstraction to handle CSV/Parquet file loading
+    with all necessary transformations (patient_id, drug_names, department_identification).
+
+    Args:
+        dir: Path to CSV or Parquet file
+        paths: PathConfig for reference data locations (uses default_paths if None)
+
+    Returns:
+        DataFrame with processed patient intervention data
+    """
+    from data_processing.loader import FileDataLoader
+
+    if paths is None:
+        paths = default_paths
+
+    loader = FileDataLoader(file_path=dir, paths=paths)
+    result = loader.load()
+
+    logger.info("Initial data processing complete.")
+    return result.df
+
+
+def drop_duplicate_treatments(df, ascending):
+    df.sort_values(by=['Intervention Date'], ascending=ascending, inplace=True)
+    df_treatment_steps = df.drop_duplicates(subset="UPIDTreatment", keep="first")
+    if not ascending:
+        df_treatment_steps.sort_values(by=['Intervention Date'], ascending=True, inplace=True)
+    return df_treatment_steps
+
+
+def row_function(row):
+    ids = ""
+    parents = "N&WICS"
+    count = row.count()
+    for c in range(count):
+        v = row[c]
+        if type(v) != str:
+            v = row[c + 1]
+        if c == count - 1:
+            ids = parents + " - " + v
+            continue
+        parents += " - " + v
+    label = row[count - 1]
+    value = parents + "," + label + "," + ids
+    return value
+
+
+def count_list_values(x):
+    return [len(list(group)) for key, group in groupby(sorted(x))]
+
+
+def sum_list_values(x):
+    sum_list = []
+    for count in range(len(x["Drug Name"])):
+        if count == 0:
+            sum_list.append(sum(x["Price Actual"][ : x["Drug Name"][count]]))
+        else:
+            sum_list.append(sum(x["Price Actual"][x["Drug Name"][count-1] : (x["Drug Name"][count-1] + x["Drug Name"][count])]))
+    return sum_list
+
+
+def remove_nan_string(y):
+    return [x for x in y if str(x) != 'nan']
+
+
+def min_max_treatment_dates(ice_df, row):
+    ids = row[2]
+    min_max = ice_df[ice_df["ids"].str.contains(ids)]
+    min_date = str(min_max["First seen"].min().strftime('%Y-%m-%d'))
+    max_date = str(min_max["Last seen"].max().strftime('%Y-%m-%d'))
+    return min_date + ',' + max_date
+
+
+def start_date_drug(df, x):
+    drug_count = x.notnull().sum()
+    date_string = []
+    for d in range(drug_count):
+        UPID_date_var = str(x.name) + str(x[d])
+        date = df.loc[UPID_date_var, "Intervention Date"]
+        date_string.append(date)
+    return date_string
+
+
+def end_date_drug(df, x):
+    drug_count = x.notnull().sum()
+    date_string = []
+    # Need to -1 from drug count as start date gets counted from notnull above
+    for d in range(drug_count - 1):
+        UPID_date_var = str(x.name) + str(x[d])
+        date = df.loc[UPID_date_var, "Intervention Date"]
+        date_string.append(date)
+    return date_string
+
+
+def list_to_string(x):
+    list = x.ids.split(' - ')
+    drug_list = list[len(list) - len(x.average_cost):]
+    ret_string = ""
+    for y in range(len(x.average_cost)):
+        if (round(x.average_spacing[y], 0) > 1) and (round(x.average_administered[y], 0) > 2.5) and (int(x.value) > 0):
+            string = "<br><b>" + str(drug_list[y]) + "</b><br>On average given " + str(
+                round(x.average_administered[y], 1)) + \
+                     " times with a " + str(round(int(x.average_spacing[y]) / 7, 1)) + " weekly interval (" \
+                     + str(round((int(x.average_spacing[y]) / 7) * round(x.average_administered[y], 1),
+                                 0)) + " weeks total treatment length)" 
+                     #"<br>Average annual cost per annum:" + \
+                     #str(human_format(
+                     #    (x.cost / x.value) / (((int(x.average_spacing[y]) / 7) * round(x.average_administered[y], 1))/ 52)))
+        else:
+            string = "<br><b>" + str(drug_list[y]) + "</b><br>On average given " + str(
+                round(x.average_administered[y], 1)) + \
+                     " times with a " + str(round(int(x.average_spacing[y]) / 7, 1)) + " weekly interval (" \
+                     + str(round((int(x.average_spacing[y]) / 7) * round(x.average_administered[y], 1),
+                                 0)) + " weeks total treatment length)" 
+                     #"<br>Average annual cost per annum unavailable"
+
+        ret_string += string
+
+    return ret_string
+
+
+def drug_frequency_average(x):
+    drug_count = x.index.str.contains("drug_").sum()
+    freq = []
+    for d in range(drug_count):
+        if x["freq_" + str(d)] > 1:
+            duration = ((x["end_date_" + str(d)] - x["start_date_" + str(d)]) / np.timedelta64(1, 'D'))
+            if duration > 0:
+                freq_calc = duration / (x["freq_" + str(d)] - 1)
+            else:
+                freq_calc = 0
+        else:
+            freq_calc = 0
+        freq.append(freq_calc)
+    return freq
+
+
+def cost_pp_pa(x):
+    if x["avg_days"]/ np.timedelta64(1, 'D') > 0:
+        return str(round(x["costpp"] / ((x["avg_days"] / np.timedelta64(1, 'D')) / 365), 2))
+    else:
+        return "N/A"
+
+
+def generate_graph(
+    df1,
+    start_date=None,
+    end_date=None,
+    last_seen=None,
+    save_dir=None,
+    trustFilter=None,
+    drugFilter=None,
+    directorateFilter=None,
+    title=None,
+    minimum_num_patients=None,
+    *,
+    filters: Optional[AnalysisFilters] = None,
+    paths: Optional[PathConfig] = None,
+):
+    """
+    Generate patient pathway icicle chart.
+
+    This function can be called in two ways:
+    1. New style: Pass filters=AnalysisFilters(...) with all parameters encapsulated
+    2. Legacy style: Pass individual parameters (start_date, end_date, etc.)
+
+    If both are provided, the filters object takes precedence.
+
+    The function filters the activity data, builds one row per patient with
+    the ordered drugs, their first/last dates, dose counts, spacing and cost,
+    aggregates patients into pathway nodes ("N&WICS - trust - directory -
+    drug - ..."), rolls counts and costs up to the parent nodes and finally
+    hands the node table to figure().
+
+    Args:
+        df1: DataFrame with processed patient data. The passed frame is
+            modified: a "UPIDTreatment" column is added and "Provider Code"
+            is replaced by the mapped organisation name.
+        filters: AnalysisFilters object with all filter parameters (preferred)
+        paths: PathConfig object for file paths (optional, uses default_paths if not provided)
+
+        Legacy parameters (used if filters is None):
+        start_date, end_date, last_seen, save_dir, trustFilter, drugFilter,
+        directorateFilter, title, minimum_num_patients
+
+    Returns:
+        None. When no rows match the filters a warning is logged and the
+        function returns early without calling figure().
+    """
+    # Use PathConfig for file paths
+    if paths is None:
+        paths = default_paths
+
+    # Extract parameters from AnalysisFilters if provided
+    if filters is not None:
+        start_date = filters.start_date
+        end_date = filters.end_date
+        last_seen = filters.last_seen_date
+        save_dir = filters.output_dir
+        trustFilter = filters.trusts
+        drugFilter = filters.drugs
+        directorateFilter = filters.directories
+        title = filters.custom_title
+        minimum_num_patients = filters.minimum_patients
+
+    # Key identifying one (patient, drug) pair; written onto the caller's frame.
+    df1["UPIDTreatment"] = df1["UPID"] + df1["Drug Name"]
+
+    # Replace provider codes with organisation names from the org-codes CSV
+    # (second CSV column is the lookup index).
+    org_codes = pd.read_csv(paths.org_codes_csv, index_col=1)
+    df1["Provider Code"] = df1["Provider Code"].map(org_codes["Name"])
+    #df1.to_csv("./df1.csv", index=False)
+
+    # NOTE(review): the three filters default to None in legacy-style calls and
+    # Series.isin() needs a list-like, so all of them appear to be required - confirm.
+    df1 = df1[(df1["Provider Code"].isin(trustFilter)) & (df1["Drug Name"].isin(drugFilter)) & (df1["Directory"].isin(directorateFilter))]
+
+    if len(df1) == 0:
+        logger.warning("No data found for selected filters.")
+        return
+
+    # Find total cost for each patient - Total cost is ~£110Mil, about 30% is unattributable to a patient (no UPID)
+    cost_df = df1[["UPID", "Price Actual"]]
+    total_costs = pd.DataFrame(cost_df.groupby("UPID").sum())
+    total_costs.rename(columns={"Price Actual": "Total cost"}, inplace=True)
+
+    # Series to map directory (first directory encountered per patient)
+    directory_df = df1[["UPID", "Directory"]]
+    directory_df.drop_duplicates("UPID", inplace=True)
+    directory_df.set_index("UPID", inplace=True)
+    logger.info("Filtering unrelated interventions")
+
+    # Both calls sort df1 in place; after the second call df1 is in ascending
+    # date order, which the per-patient lists and keep="first" steps below inherit.
+    df_end_dates = drop_duplicate_treatments(df1, False)
+    df1_unique = drop_duplicate_treatments(df1, True)
+    logger.info("Identifying unique patients and interventions used")
+    # Create list of total number of that drug for each patient
+    df_drug_freq = df1.groupby("UPID").agg({"Drug Name": lambda x: list(x)}).reset_index().set_index("UPID")
+    df_drug_cost = df1.groupby("UPID").agg({"Price Actual": lambda x: list(x)}).reset_index().set_index("UPID")
+    df_drug_freq["Price Actual"] = df_drug_freq.index.map(df_drug_cost["Price Actual"])
+    #df_drug_freq["Price Actual"] = df_drug_freq["Price Actual"].map(df_drug_cost)
+    # "Drug Name" becomes a list of per-drug counts (ordered by sorted drug name).
+    # NOTE(review): the price list stays in date order, while sum_list_values slices
+    # it by those counts - the orders only agree if prices are grouped by drug; confirm.
+    df_drug_freq["Drug Name"] = df_drug_freq["Drug Name"].apply(count_list_values)
+    df_drug_freq["Drug cost total"] = df_drug_freq.apply(lambda x: sum_list_values(x), axis=1)
+
+
+    # Aggregate interventions & dates of interventions into transposed list by UPID
+    df_drugs = df1_unique.groupby("UPID").agg({"Drug Name": lambda x: list(x)}).reset_index().set_index("UPID")
+    df_dates = df1_unique.groupby("UPID").agg({"Intervention Date": lambda x: list(x)}).reset_index().set_index("UPID")
+    df_end_dates = df_end_dates.groupby("UPID").agg({"Intervention Date": lambda x: list(x)}).reset_index().set_index("UPID")
+
+    logger.info("Calculating each unique patient's intervention average frequency, cost and duration of each intervention")
+    # The following unwraps the per-patient lists into one column per drug slot:
+    # drug names, start/end dates, dose counts, spacing and total cost of each one
+    df_dates_unwrapped = pd.DataFrame(df_dates["Intervention Date"].values.tolist(), index=df_dates.index).add_prefix(
+        'date_')
+    df_end_dates_unwrapped = pd.DataFrame(df_end_dates["Intervention Date"].values.tolist(), index=df_end_dates.index).add_prefix(
+        'date_end_')
+    df_drugs_unwrapped = pd.DataFrame(df_drugs["Drug Name"].values.tolist(), index=df_drugs.index).add_prefix('drug_')
+
+    df_freq_unwrapped = pd.DataFrame(df_drug_freq["Drug Name"].values.tolist(), index=df_drug_freq.index).add_prefix(
+        'freq_')
+    # Lookup tables keyed by UPIDTreatment: earliest and latest intervention date.
+    start_dates = df1[["UPIDTreatment", "Intervention Date"]].sort_values(by=["Intervention Date"], ascending=True,
+                                                                               inplace=False,
+                                                                               ignore_index=True).drop_duplicates(
+        subset="UPIDTreatment").set_index("UPIDTreatment")
+    end_dates = df1[["UPIDTreatment", "Intervention Date"]].sort_values(by=["Intervention Date"], ascending=False,
+                                                                             inplace=False,
+                                                                             ignore_index=True).drop_duplicates(
+        subset="UPIDTreatment").set_index("UPIDTreatment")
+
+
+
+    df_drugs_unwrapped["start_dates"] = df_drugs_unwrapped.apply(lambda x: start_date_drug(start_dates, x), axis=1)
+
+    df_ddrugs_unwrapped = pd.DataFrame(df_drugs_unwrapped["start_dates"].values.tolist(),
+                                       index=df_drugs_unwrapped.index).add_prefix(
+        'start_date_')
+    df_drugs_unwrapped.drop(["start_dates"], inplace=True, axis=1)
+    # End dates reuse start_date_drug with the end_dates lookup (end_date_drug is not called here).
+    df_drugs_unwrapped["end_dates"] = df_drugs_unwrapped.apply(lambda x: start_date_drug(end_dates, x), axis=1)
+    df_dddrugs_unwrapped = pd.DataFrame(df_drugs_unwrapped["end_dates"].values.tolist(),
+                                       index=df_drugs_unwrapped.index).add_prefix(
+        'end_date_')
+
+    df_drugs_unwrapped.drop(["end_dates"], inplace=True, axis=1)
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_ddrugs_unwrapped, left_index=True, right_index=True)
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_dddrugs_unwrapped, left_index=True, right_index=True)
+    # NOTE(review): df_drug_freq values are attached positionally under
+    # df_drugs_unwrapped's index; this assumes both frames list the same UPIDs in
+    # the same order - confirm.
+    df_dddddrugs_unwrapped = pd.DataFrame(df_drug_freq["Drug Name"].values.tolist(),
+                                          index=df_drugs_unwrapped.index).add_prefix(
+        'freq_')
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_dddddrugs_unwrapped, left_index=True, right_index=True)
+    df_drugs_unwrapped["frequency"] = df_drugs_unwrapped.apply(lambda x: drug_frequency_average(x), axis=1)
+
+    df_ddddddrugs_unwrapped = pd.DataFrame(df_drugs_unwrapped["frequency"].values.tolist(),
+                                           index=df_drugs_unwrapped.index).add_prefix(
+        'spacing_')
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_ddddddrugs_unwrapped, left_index=True, right_index=True)
+    df_dddddddrugs_unwrapped = pd.DataFrame(df_drug_freq["Drug cost total"].values.tolist(),
+                                           index=df_drugs_unwrapped.index).add_prefix('total_cost_drug_')
+    df_drugs_unwrapped = pd.merge(df_drugs_unwrapped, df_dddddddrugs_unwrapped, left_index=True, right_index=True)
+    df_drugs_unwrapped.drop(["frequency"], inplace=True, axis=1)
+
+    # Insert first & last date seen into df (need to add last date seen)
+    df_drugs_unwrapped.insert(0, "First seen", df_dates_unwrapped.min(axis=1))
+    df_drugs_unwrapped.insert(1, "Last seen", df_end_dates_unwrapped.max(axis=1))
+
+    # Merge info from activity data with grouped info, and total cost info
+    patient_info = df1.drop_duplicates(subset="UPID", keep="first").set_index("UPID")
+    patient_info = pd.merge(patient_info, df_drugs_unwrapped, left_index=True, right_index=True)
+    patient_info = pd.merge(patient_info, df_freq_unwrapped, left_index=True, right_index=True)
+    patient_info = pd.merge(patient_info, total_costs, left_index=True, right_index=True)
+
+    #patient_info.to_csv("patient_info.csv", index=False)
+
+    # Filter initiation based on years provided (start inclusive, end exclusive)
+    patient_info = patient_info[(patient_info['First seen'] >= str(start_date)) & (
+                patient_info['First seen'] < str(end_date))]
+    # NOTE(review): only an empty string triggers the generated title; the legacy
+    # default title=None is passed through to figure() unchanged - confirm intent.
+    if title == "":
+        title = "Patients initiated from " + str(start_date) + " to " + str(end_date)
+
+    # Filter last seen based on date provided
+    patient_info = patient_info[patient_info['Last seen'] > str(last_seen)]
+
+    # Remove patients with 0 drug, by filling blanks with NaN & dropping rows
+    patient_info.drug_0.replace('N/A', np.nan, inplace=True)
+    patient_info.dropna(subset=['drug_0'], inplace=True)
+
+    # Calculate duration of treatment
+    patient_info['Days treated'] = patient_info["Last seen"] - patient_info["First seen"]
+    date_df = patient_info[["First seen", "Last seen", 'Days treated']]
+
+    # Create df for ice chart with hierarchy of plot: slice out the contiguous
+    # drug_0..drug_N columns by position.
+    number_of_drugs = np.count_nonzero(patient_info.columns.str.startswith('drug_'))
+    final_drug_index = patient_info.columns.to_list().index("drug_" + str(number_of_drugs - 1))
+
+    upid_drugs_df = patient_info.iloc[:, (final_drug_index - number_of_drugs + 1):final_drug_index + 1]
+
+    # Trust is looked up from the first three characters of the UPID index.
+    upid_drugs_df.insert(0, "Trust", upid_drugs_df.index.str[:3])
+    upid_drugs_df.insert(1, "Directory", upid_drugs_df.index)
+
+    upid_drugs_df["Trust"] = upid_drugs_df["Trust"].map(org_codes["Name"])
+    upid_drugs_df["Directory"] = upid_drugs_df["Directory"].map(directory_df["Directory"])
+
+    l_df = pd.DataFrame()
+    ice_df2 = pd.DataFrame()
+    ice_df = pd.DataFrame()
+
+    # "value" holds row_function's "parents,label,ids" string for the patient's pathway.
+    upid_drugs_df["value"] = upid_drugs_df.apply(lambda x: row_function(x), axis=1)
+    # Merge in date info
+    upid_drugs_df = pd.merge(upid_drugs_df, date_df, left_index=True, right_index=True)
+
+    upid_drugs_df["ids"] = upid_drugs_df["value"].str.split(',').str[2]
+    avg_treatment_dfs = pd.DataFrame(upid_drugs_df.groupby("ids", as_index=False)["Days treated"].mean()).set_index("ids")
+    # Number of patients per pathway (column "size").
+    value_dfs = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False).size()).reset_index()
+    first_seen_treatment_dfs = pd.DataFrame(upid_drugs_df.groupby("ids", as_index=False)["First seen"].min()).set_index(
+        "ids")
+    last_seen_treatment_dfs = pd.DataFrame(upid_drugs_df.groupby("ids", as_index=False)["Last seen"].max()).set_index(
+        "ids")
+
+    # Calculate total cost for parents
+    upid_drugs_df["Cost"] = upid_drugs_df.index.map(total_costs["Total cost"])
+    cost_dfs = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False)['Cost'].sum()).set_index("value", drop=True)
+
+    # Calculate average dosing for each drug
+    upid_drugs_df = pd.merge(upid_drugs_df, df_drugs_unwrapped, left_index=True, right_index=True)
+    # frequency_dfs = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False)['Cost'].sum()).set_index("value", drop=True)
+
+    # Calculate average spacing between drugs
+    spacing_average = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False)[
+                                       [col for col in upid_drugs_df.columns if 'spacing_' in col]].mean()).set_index(
+        "value", drop=True)
+    spacing_average = spacing_average.round()
+    spacing_average['combined'] = spacing_average.values.tolist()
+    spacing_average["ids"] = spacing_average.index
+    spacing_average["ids"] = spacing_average["ids"].str.split(',').str[2]
+    spacing_average.set_index("ids", inplace=True)
+
+    # Calculate average cost for each drug
+    cost_average = pd.DataFrame(upid_drugs_df.groupby("value", as_index=False)[
+                                       [col for col in upid_drugs_df.columns if 'total_cost_drug_' in col]].mean()).set_index(
+        "value", drop=True)
+    cost_average = cost_average.round(2)
+    cost_average['combined'] = cost_average.values.tolist()
+    cost_average["ids"] = cost_average.index
+    cost_average["ids"] = cost_average["ids"].str.split(',').str[2]
+    cost_average.set_index("ids", inplace=True)
+
+
+    # Calculate average number of doses
+    freq_average = pd.DataFrame(upid_drugs_df.groupby("ids", as_index=False)[
+                                    [col for col in upid_drugs_df.columns if 'freq_' in col]].mean()).set_index("ids",
+                                                                                                                drop=True)
+    # freq_average = freq_average.round()
+    freq_average['combined'] = freq_average.values.tolist()
+
+    # Remove negative totals from "Cost" column
+    num = cost_dfs._get_numeric_data()
+    num[num < 0] = 0
+
+    value_dfs["Cost"] = value_dfs["value"].map(cost_dfs["Cost"])
+
+    # Split the pathway string back into the three columns the chart needs.
+    ice_df[['parents', 'labels', 'ids']] = value_dfs["value"].str.split(',', expand=True)
+    # ice_df["index"] = ice_df.ids
+    # ice_df.set_index("index", inplace=True)
+
+    ice_df["average_administered"] = ice_df["ids"].map(freq_average["combined"])
+    ice_df["cost"] = value_dfs["Cost"]
+    ice_df["value"] = value_dfs["size"]
+
+    ice_df["average_cost"] = ice_df["ids"].map(cost_average["combined"])
+    ice_df["average_cost"] = ice_df["average_cost"].apply(remove_nan_string)
+
+    # "average_spacing" starts as a list of day gaps and is then overwritten
+    # with the HTML summary built by list_to_string.
+    ice_df["average_spacing"] = ice_df["ids"].map(spacing_average["combined"])
+    ice_df["average_spacing"] = ice_df["average_spacing"].apply(remove_nan_string)
+    ice_df["average_spacing"] = ice_df.apply(lambda x: list_to_string(x), axis=1)
+    ice_df["average_spacing"] = ice_df["average_spacing"].str.replace("nan", "N/A")
+
+
+    logger.info("Building graph dataframe structure.")
+    # Add very top level of Trust
+    new_row = pd.DataFrame({'parents': '', 'ids': "N&WICS", 'labels': 'N&WICS', 'value': 0, "cost": 0}, index=[0])
+    ice_df = pd.concat(objs=[ice_df, new_row], ignore_index=True, axis=0)
+
+    # Add the intermediate parent nodes as their own rows, level by level.
+    # NOTE(review): `x not in ice_df.ids` tests the Series index labels, not its
+    # values (the loop body uses .unique() instead) - confirm which was intended.
+    l3 = [x for x in ice_df.parents.unique() if x not in ice_df.ids]
+    # NOTE(review): if this loop never runs, ice_df2 is still the empty frame
+    # created above and the drop_duplicates("ids") below would fail - confirm.
+    while len(l3) > 1:
+        for l in l3:
+            z = l.rfind("-")
+            if z > 0:
+                l_dict = {"parents": l[:z - 1], "ids": l, "value": 0, "labels": l[z + 2:], "cost": 0}
+                l_df = pd.concat([l_df, pd.DataFrame(l_dict, index=[0])], ignore_index=True)
+        ice_df2 = pd.concat([ice_df, l_df], ignore_index=True)
+        l3 = [x for x in ice_df2.parents.unique() if x not in ice_df2.ids.unique()]
+    ice_df = ice_df2.drop_duplicates("ids")
+
+    # Depth of a node = number of "-" in its id; deepest nodes come first so the
+    # roll-up below adds children into parents before the parents are read.
+    ice_df["level"] = ice_df["ids"].str.count('-')
+    ice_df = ice_df[~ice_df['labels'].isin(["COST", "CHARGE", "N/A"])]
+    ice_df.sort_values(by=["level"], ascending=False, inplace=True, ignore_index=True)
+
+    # Roll patient counts and costs up from each node into its parent.
+    for index, row in ice_df.iterrows():
+        lookup_index = ice_df.index[ice_df['ids'] == row['parents']]
+        ice_df.loc[lookup_index, 'value'] = ice_df.loc[lookup_index, "value"] + ice_df.loc[index, "value"]
+        ice_df.loc[lookup_index, 'cost'] = ice_df.loc[lookup_index, "cost"] + ice_df.loc[index, 'cost']
+
+    # Sum of parent values to create denominator for percentage - FOR PATIENT NUMBER COLOUR GRADING
+    colour_df = pd.DataFrame(ice_df.groupby(["parents"])["value"].sum())
+    ice_df['colour'] = ice_df["parents"].map(colour_df["value"])
+    ice_df['colour'] = ice_df['value']/ice_df['colour']
+
+    # Sum of parent values to create denominator for percentage - FOR COST COLOUR GRADING
+    #colour_df = pd.DataFrame(ice_df.groupby(["parents"])["cost"].sum())
+    #ice_df['colour'] = ice_df["parents"].map(colour_df["cost"])
+    #ice_df['colour'] = ice_df['cost'] / ice_df['colour']
+
+
+    # Average cost per patient for each node.
+    ice_df['costpp'] = ice_df['cost'] / ice_df['value']
+    # Treatment length info
+    ice_df['avg_days'] = ice_df["ids"].map(avg_treatment_dfs["Days treated"])
+    ice_df['First seen'] = ice_df["ids"].map(first_seen_treatment_dfs["First seen"])
+    ice_df['Last seen'] = ice_df["ids"].map(last_seen_treatment_dfs["Last seen"])
+
+    # "dates" is "min first seen,max last seen" over the node and every node whose id contains it.
+    ice_df["dates"] = ice_df.apply(lambda x: min_max_treatment_dates(ice_df, x), axis=1)
+    ice_df[['First seen (Parent)', 'Last seen (Parent)']] = ice_df["dates"].str.split(',', expand=True)
+
+    # Sort labels to be alphabetical
+    # ice_df.sort_values(by=["labels"], ascending=True, inplace=True, ignore_index=True)
+    ice_df['First seen'] = pd.to_datetime(ice_df['First seen'])
+    ice_df['Last seen'] = pd.to_datetime(ice_df['Last seen'])
+    ice_df["cost_pp_pa"] = ice_df.apply(lambda x: cost_pp_pa(x), axis=1)
+
+    # Filter out rows where value is less than minimum number of patients
+    # NOTE(review): minimum_num_patients defaults to None in legacy-style calls,
+    # which this comparison presumably cannot handle - confirm callers always set it.
+    ice_df = ice_df[ice_df['value'] >= minimum_num_patients]
+
+    logger.info("Generating graph.")
+
+    figure(ice_df, title, save_dir)
+    return
+
+
+def figure(ice_df4, dir_string, save_dir):
+    """
+    Create and display icicle figure (legacy interface).
+
+    This function delegates to visualization.plotly_generator.figure_legacy()
+    for backward compatibility.
+
+    Args:
+        ice_df4: DataFrame with chart data
+        dir_string: Title string (used for filename and chart title)
+        save_dir: Directory to save the HTML file
+    """
+    _figure_legacy(ice_df4, dir_string, save_dir)
+    return
+
+
+# fig = go.Figure(go.Icicle(
+#         labels=ice_df4.labels,
+#         ids=ice_df4.ids,
+#         # count="branches",
+#         parents=ice_df4.parents,
+#         customdata=np.stack((ice_df4.value, ice_df4.colour, ice_df4.cost, ice_df4.costpp, first_seen, last_seen,
+#                              first_seen_parent, last_seen_parent, average_spacing, ice_df4.cost_pp_pa), axis=1),
+#         values=ice_df4.value,
+#         branchvalues="total",
+#         marker=dict(
+#             colors=ice_df4.colour,
+#             colorscale='Viridis'),
+#         maxdepth=3,
+#         texttemplate='<b>%{label}</b> '
+#                       '<br><b>Total patients:</b> %{customdata[0]} - %{customdata[1]:.3p} of patients in level'
+#                       '<br><b>Total cost:</b> £%{customdata[2]:.3~s}'
+#                       '<br><b>Average cost per patient:</b> £%{customdata[3]:.3~s}'
+#                       '<br><b>Average cost per patient per annum:</b> £%{customdata[9]:.3~s}',
+#         hovertemplate='<b>%{label}</b>'
+#                       '<br><b>Total patients:</b> %{customdata[0]} - %{customdata[1]:.3p} of patients in level'
+#                       '<br><b>Total cost:</b> £%{customdata[2]:.3~s}'
+#                       '<br><b>Average cost per patient:</b> £%{customdata[3]:.3~s}'
+#                       '<br><b>Average cost per patient per annum:</b> £%{customdata[9]:.3~s}'
+#                       '<br><b>First seen:</b> %{customdata[4]}'
+#                       '<br><b>Last seen (including further treatments):</b> %{customdata[7]}'
+#                       '<br><b>Average treatment duration:</b>'
+#                       '%{customdata[8]}'
+#                       '<extra></extra>',
+#     ))
+#
+#import os 
+#def main():
+#    input = "ice_df.csv"
+#    save_dir = os.path.dirname(os.path.abspath(__file__))
+#    dir = "debugging"
+#    ice_df4 = pd.read_csv(input)
+#    
+#    ice_df4['First seen'] = pd.to_datetime(ice_df4['First seen'])
+#    ice_df4['avg_days'] = pd.to_timedelta(ice_df4['avg_days'])
+#    ice_df4['Last seen'] = pd.to_datetime(ice_df4['Last seen'])
+#    figure(ice_df4, dir, save_dir)
+#
+#if __name__ == "__main__":
+#    main()
+
+
+def generate_graph_v2(
+    df: pd.DataFrame,
+    start_date: str,
+    end_date: str,
+    last_seen_date: str,
+    save_dir: str,
+    trust_filter: list[str],
+    drug_filter: list[str],
+    directory_filter: list[str],
+    minimum_num_patients: int = 0,
+    title: str = "",
+    paths: Optional[PathConfig] = None,
+) -> Optional[go.Figure]:
+    """
+    Generate patient pathway icicle chart using refactored pipeline.
+
+    This is the modern API that uses the refactored analysis functions.
+    It provides cleaner parameter names and returns the figure instead of
+    automatically opening it in a browser.
+
+    Args:
+        df: DataFrame with processed patient intervention data
+        start_date: Start date for patient initiation filter (YYYY-MM-DD)
+        end_date: End date for patient initiation filter (YYYY-MM-DD)
+        last_seen_date: Filter for patients last seen after this date
+        save_dir: Directory to save the HTML file
+        trust_filter: List of trust names to include
+        drug_filter: List of drug names to include
+        directory_filter: List of directories to include
+        minimum_num_patients: Minimum number of patients to include a pathway
+        title: Chart title (auto-generated from dates if empty)
+        paths: PathConfig for file paths (uses default if None)
+
+    Returns:
+        Plotly Figure object, or None if no data
+    """
+    if paths is None:
+        paths = default_paths
+
+    ice_df, final_title = _generate_icicle_chart(
+        df=df,
+        start_date=start_date,
+        end_date=end_date,
+        last_seen_date=last_seen_date,
+        trust_filter=trust_filter,
+        drug_filter=drug_filter,
+        directory_filter=directory_filter,
+        minimum_num_patients=minimum_num_patients,
+        title=title,
+        paths=paths,
+    )
+
+    if ice_df is None or len(ice_df) == 0:
+        return None
+
+    fig = create_icicle_figure(ice_df, final_title)
+
+    if save_dir:
+        fig.write_html(f"{save_dir}/{final_title}.html")
+        logger.info(f"Success! File saved to {save_dir}/{final_title}.html")
+
+    return fig
+
+
+def create_icicle_figure(ice_df: pd.DataFrame, title: str) -> go.Figure:
+    """
+    Create Plotly icicle figure from prepared DataFrame.
+
+    This function delegates to visualization.plotly_generator.create_icicle_figure()
+    for the actual figure generation.
+
+    Args:
+        ice_df: DataFrame with parents, ids, labels, value, colour etc.
+        title: Chart title
+
+    Returns:
+        Plotly Figure object
+    """
+    return _create_icicle_figure(ice_df, title)
@@ -0,0 +1,331 @@
+import numpy as np
+import pandas as pd
+import csv
+import urllib.request
+import io # Added for StringIO
+import re # Added for regex escape and word boundaries
+from typing import Optional
+
+from core import PathConfig, default_paths
+from core.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+def drug_names(df, paths: Optional[PathConfig] = None):
+    """
+    Standardise the "Drug Name" column to generic drug names.
+
+    The lookup table is read from ``paths.drugnames_csv``: the first cell of
+    each row is the name as it appears in activity data, the second cell is
+    the generic name. Both sides are upper-cased, so matching is
+    case-insensitive. Names with no entry in the table become NaN. After
+    mapping, a "(LEFT EYE)" / "(RIGHT EYE)" marker is removed and surrounding
+    whitespace is stripped.
+
+    Args:
+        df: DataFrame with a string "Drug Name" column. Modified in place.
+        paths: PathConfig for reference data locations (uses default_paths if None)
+
+    Returns:
+        The same DataFrame with "Drug Name" standardised.
+    """
+    if paths is None:
+        paths = default_paths
+
+    # Build the activity-name -> generic-name lookup.
+    name_to_generic = {}
+    with open(paths.drugnames_csv, 'r', newline='') as f:
+        for row in csv.reader(f, delimiter=','):
+            # csv.reader yields [] for blank lines and short lists for
+            # incomplete rows; tuple-unpacking those raises ValueError, so
+            # skip them. Cells beyond the second are ignored.
+            if len(row) < 2:
+                continue
+            name_to_generic[row[0].upper()] = row[1].upper()
+
+    standardised = df["Drug Name"].str.upper().map(name_to_generic)
+
+    # Drop the laterality marker in one pass; strip() then removes the
+    # whitespace left at either end of the name.
+    standardised = standardised.str.replace(r'\((?:LEFT|RIGHT) EYE\)', '', regex=True)
+    df["Drug Name"] = standardised.str.strip()
+    return df
+
+
+def patient_id(df):
+    """
+    Add a "UPID" column identifying each patient.
+
+    The identifier is the first three characters of "Provider Code"
+    followed by "PersonKey" rendered as text. The frame is modified in
+    place and also returned.
+    """
+    provider_prefix = df["Provider Code"].str[:3]
+    person_key_text = df["PersonKey"].astype(str)
+    df["UPID"] = provider_prefix + person_key_text
+    return df
+
+
+def compress_csv(filepath):
+    """
+    Write a bz2-compressed copy of a CSV file next to the original.
+
+    The copy is named by inserting "_bz2" before the file extension
+    ("data.csv" -> "data_bz2.csv"); a path without an extension gets
+    "_bz2.csv" appended. Only the final extension is touched, so ".csv"
+    occurring elsewhere in the path (for example in a directory name) is
+    left alone and the output path can never equal the input path.
+
+    Args:
+        filepath: Path of the CSV to compress (str or os.PathLike).
+
+    Returns:
+        Path of the compressed file, as a string.
+    """
+    import os  # local import: the module's import block does not provide os
+
+    source_path = os.fspath(filepath)
+    stem, extension = os.path.splitext(source_path)
+    compressed_path = f"{stem}_bz2{extension or '.csv'}"
+
+    df = pd.read_csv(source_path)
+    df.to_csv(compressed_path, compression="bz2", index=False)
+    return compressed_path
+
+
+def department_identification(df, paths: Optional[PathConfig] = None):
+    """
+    Assign a clinical directory to every row and record how it was chosen.
+
+    Two columns are added:
+
+    * ``Directory`` - the resolved directory name, or "Undefined".
+    * ``Directory_Source`` - the step that produced it: SINGLE_VALID_DIR,
+      CALCULATED_MOST_FREQ, UPID_INFERENCE or UNDEFINED.
+
+    Resolution order:
+
+    1. A drug listed with exactly one valid directory in
+       ``paths.drug_directory_list_csv`` gets that directory.
+    2. Otherwise a candidate directory is extracted per row from
+       "Additional Detail 1" and, failing that, from the service name mapped
+       from "Treatment Function Code". For each (UPID, Drug Name) the most
+       frequent candidate that is valid for the drug is used.
+    3. Rows still unresolved take the most frequent directory already
+       assigned to the same UPID.
+    4. Anything left is labelled "Undefined".
+
+    Side effects: the input frame is modified in place (detail columns are
+    upper-cased and then replaced by the directory name found in them,
+    "Treatment Function Code" is consumed, cells equal to
+    "MEDICAL OPHTHALMOLOGY" become "OPHTHALMOLOGY"). Rows without a UPID are
+    dropped. Rows left unresolved after steps 1-2 are written to
+    ``paths.na_directory_rows_csv`` for diagnostics. Always use the returned
+    frame: after the merge in step 2 it is a new object.
+
+    Args:
+        df: Activity data with at least "UPID", "Drug Name" and
+            "Treatment Function Code" columns.
+        paths: PathConfig for reference data locations (uses default_paths if None)
+
+    Returns:
+        The processed DataFrame. If the directory list cannot be loaded, an
+        error is logged and the input is returned without further processing.
+    """
+    if paths is None:
+        paths = default_paths
+
+    # --- Reference data ---
+    # 1. Directory names -> upper-cased, regex-escaped alternation.
+    try:
+        directory_df = pd.read_csv(paths.directory_list_csv)
+        directory_list = directory_df["directory"].dropna().astype(str).tolist()
+        if not directory_list:
+            raise ValueError("directory_list.csv is empty or contains only NA values.")
+        alternation = '|'.join(re.escape(d.upper()) for d in directory_list)
+    except FileNotFoundError:
+        logger.error(f"File not found: {paths.directory_list_csv}. Cannot extract directories.")
+        return df
+    except ValueError as e:
+        logger.error(f"Error loading directory list: {e}")
+        return df
+
+    # Leading word boundary only: a name must start a word but may be
+    # followed directly by further characters.
+    dir_pattern_upper = r'\b({})'.format(alternation)
+    # Boundary-free variant used for the primary source.
+    dir_pattern_primary_simple = r'({})'.format(alternation)
+
+    # 2. Treatment function code -> upper-cased service name.
+    treatment_codes = pd.read_csv(paths.treatment_function_codes_csv)
+    mapping_treatment_codes_upper = {
+        code: str(service).upper()
+        for code, service in treatment_codes[['Code', 'Service']].values
+    }
+
+    # 3. Drug -> set of valid directories. First column is the drug name,
+    #    second column holds pipe-separated directory names.
+    drug_dir_df = pd.read_csv(paths.drug_directory_list_csv, skipinitialspace=True)
+    drug_col, dir_col = drug_dir_df.columns[0], drug_dir_df.columns[1]
+    drug_to_valid_dirs: dict[str, set[str]] = {}
+    for raw_drug, raw_dirs in zip(drug_dir_df[drug_col], drug_dir_df[dir_col]):
+        drug_name = str(raw_drug).strip().upper()
+        dirs_str = "" if pd.isna(raw_dirs) else str(raw_dirs)
+        dirs = {d.strip().upper() for d in dirs_str.split('|') if d.strip()}
+        # A missing drug cell stringifies to "nan"; ignore it.
+        if drug_name and dirs and drug_name.lower() != 'nan':
+            drug_to_valid_dirs[drug_name] = dirs
+
+    # 4. Drugs with exactly one valid directory need no further inference.
+    drug_to_single_dir = {
+        drug: next(iter(dirs))
+        for drug, dirs in drug_to_valid_dirs.items()
+        if len(dirs) == 1
+    }
+
+    # --- Data Preprocessing ---
+    additional_detail_columns = ["Additional Detail 1", "Additional Description 1", "Additional Detail 2", "Additional Description 2",
+     "Additional Detail 3", "Additional Description 3", "Additional Detail 4", "Additional Description 4",
+     "Additional Detail 5", "Additional Description 5", "NCDR Treatment Function Name", "Treatment Function Desc"]
+
+    # 6. Upper-case the detail columns so the upper-cased pattern can match.
+    for ad in additional_detail_columns:
+        if ad in df.columns and pd.api.types.is_object_dtype(df[ad]):
+            df[ad] = df[ad].str.upper()
+
+    # Replace each detail column by the directory name found in it. Columns
+    # that are missing or not string-typed end up as all-NaN columns.
+    for ad in additional_detail_columns:
+        try:
+            if pd.api.types.is_string_dtype(df[ad]):
+                extracted = df[ad].str.extract(dir_pattern_upper, expand=False)
+                df.loc[extracted.index, ad] = extracted
+            else:
+                df[ad] = np.nan
+        except AttributeError:
+            df[ad] = np.nan
+        except Exception as e:
+            logger.error(f"Error processing column {ad}: {e}")
+            df[ad] = np.nan
+
+    # 7. Treatment Function Code -> service name ("Fallback_Source").
+    #    Work on a local Series and assign back: in-place methods on a column
+    #    selection do not reliably update the parent frame.
+    codes = df["Treatment Function Code"].replace(np.nan, 0)
+    try:
+        codes = codes.astype(int)
+    except ValueError:
+        # Non-numeric codes: coerce them to 0 rather than fail.
+        codes = pd.to_numeric(codes, errors='coerce').fillna(0).astype(int)
+    df["Treatment Function Code"] = codes.map(mapping_treatment_codes_upper)
+    df.rename(columns={'Treatment Function Code': 'Fallback_Source'}, inplace=True)
+
+    # Apply replacements before combining
+    df.replace('MEDICAL OPHTHALMOLOGY', 'OPHTHALMOLOGY', inplace=True)
+
+    # --- Single Directory Assignment ---
+    # 8. Drugs with one valid directory are settled immediately.
+    df['Directory'] = df['Drug Name'].map(drug_to_single_dir)
+    df['Directory_Source'] = pd.NA
+    df.loc[df['Directory'].notna(), 'Directory_Source'] = 'SINGLE_VALID_DIR'
+
+    # --- Prepare Fallback Logic ---
+    # 9. Primary source is "Additional Detail 1".
+    if 'Additional Detail 1' in df.columns:
+        df['Primary_Source'] = df['Additional Detail 1'].astype(pd.StringDtype()).str.upper()
+    else:
+        df['Primary_Source'] = pd.NA
+
+    try:
+        df['Extracted_Primary_Dir'] = df['Primary_Source'].str.extract(dir_pattern_primary_simple, expand=False, flags=re.IGNORECASE)
+        df['Extracted_Fallback_Dir'] = df['Fallback_Source'].str.extract(dir_pattern_upper, expand=False, flags=re.IGNORECASE)
+    except Exception as e:
+        logger.error(f"Error during directory extraction: {e}")
+        df['Extracted_Primary_Dir'] = pd.NA
+        df['Extracted_Fallback_Dir'] = pd.NA
+
+    for extracted_col in ('Extracted_Primary_Dir', 'Extracted_Fallback_Dir'):
+        df[extracted_col] = df[extracted_col].str.strip()
+
+    # 10. Per-row candidate: primary extraction first, fallback otherwise.
+    df['Primary_Directory'] = df['Extracted_Primary_Dir'].fillna(df['Extracted_Fallback_Dir'])
+
+    # 11. Clean up intermediate columns
+    df.drop(columns=['Primary_Source', 'Fallback_Source', 'Extracted_Primary_Dir', 'Extracted_Fallback_Dir'], inplace=True, errors='ignore')
+
+    # --- Calculate Most Frequent Valid Directory ---
+    # 12-13. Only unresolved rows that have a candidate take part.
+    df_to_process = df[df['Directory'].isnull()].dropna(subset=['Primary_Directory'])
+
+    calculated = []
+    if not df_to_process.empty:
+        # 14-15. Count candidates per patient/drug, most frequent first.
+        df_counts = df_to_process.groupby(['UPID', 'Drug Name', 'Primary_Directory'], observed=True)['Primary_Directory'].count().reset_index(name='count')
+        df_counts = df_counts.sort_values(['UPID', 'Drug Name', 'count'], ascending=[True, True, False])
+
+        # 16-18. Take the first candidate that is valid for the drug. The
+        #        groups are iterated explicitly so the helper does not depend
+        #        on grouping columns being present inside groupby.apply.
+        for (upid, drug), grp in df_counts.groupby(['UPID', 'Drug Name'], observed=True):
+            valid_dirs = drug_to_valid_dirs.get(drug, set())
+            best = next(
+                (c for c in grp['Primary_Directory'] if isinstance(c, str) and c in valid_dirs),
+                np.nan,
+            )
+            calculated.append((upid, drug, best))
+
+    if calculated:
+        # 19-21. Merge the per-patient/drug result back onto every row.
+        final_mapping = pd.DataFrame(calculated, columns=['UPID', 'Drug Name', 'Calculated_Directory'])
+        df = pd.merge(df, final_mapping, on=['UPID', 'Drug Name'], how='left')
+
+        # 22. Fill unresolved rows and record the source.
+        rows_to_fill = df['Directory'].isna() & df['Calculated_Directory'].notna()
+        df.loc[rows_to_fill, 'Directory_Source'] = 'CALCULATED_MOST_FREQ'
+        df['Directory'] = df['Directory'].fillna(df['Calculated_Directory'])
+
+    # 23. Drop temporary columns
+    df.drop(columns=['Calculated_Directory', 'Primary_Directory'], inplace=True, errors='ignore')
+
+    # 24. Drop rows with missing UPID (empty strings count as missing).
+    df['UPID'] = df['UPID'].replace('', np.nan)
+    df.dropna(subset=['UPID'], inplace=True)
+
+    # 25. Export rows that are still unresolved, for diagnostics.
+    unresolved = df['Directory'].isna()
+    if unresolved.any():
+        df[unresolved].to_csv(paths.na_directory_rows_csv, index=False)
+
+        # 26. FALLBACK MECHANISM 1: most frequent directory already assigned
+        #     to the same UPID.
+        upid_to_directory = df[~unresolved].groupby('UPID')['Directory'].agg(
+            lambda x: x.value_counts().index[0] if len(x.value_counts()) > 0 else None
+        ).to_dict()
+        inferred = df['UPID'].map(upid_to_directory)
+        can_infer = unresolved & inferred.notna()
+        # Positional values avoid any index alignment on assignment.
+        df.loc[can_infer, 'Directory'] = inferred[can_infer].to_numpy()
+        df.loc[can_infer, 'Directory_Source'] = 'UPID_INFERENCE'
+
+    # 27. FALLBACK MECHANISM 2: label whatever remains as "Undefined".
+    still_missing = df['Directory'].isna()
+    df.loc[still_missing, 'Directory_Source'] = 'UNDEFINED'
+    df['Directory'] = df['Directory'].fillna("Undefined")
+
+    # 28. Return the processed DataFrame
+    return df
+
+
+
+def ta_list_get(paths: Optional[PathConfig] = None):
+    """
+    Download the NICE TA recommendations workbook and list recommended drug TAs.
+
+    The spreadsheet is fetched from nice.org.uk, saved to
+    ``paths.ta_recommendations_xlsx`` and read back with pandas. Rows are
+    kept only when their categorisation is "Recommended" or "Optimised" and
+    their technology type is "Pharmaceutical". "TA ID" is reduced to its
+    digits and re-labelled as "NICE TA<number>".
+
+    Args:
+        paths: PathConfig for file locations (uses default_paths if None)
+
+    Returns:
+        De-duplicated DataFrame with the columns "TA ID" and "Indication".
+    """
+    if paths is None:
+        paths = default_paths
+
+    link = "https://www.nice.org.uk/Media/Default/About/what-we-do/NICE-guidance/NICE-technology-appraisals/TA%20recommendations.xlsx"
+    workbook_path = paths.ta_recommendations_xlsx
+    urllib.request.urlretrieve(link, workbook_path)
+    ta_db = pd.read_excel(workbook_path, index_col=0)
+
+    # Keep recommended/optimised appraisals of pharmaceuticals only.
+    categorisation = ta_db["Categorisation (for specific recommendation)"]
+    ta_db = ta_db[categorisation.isin(["Recommended", "Optimised"])]
+    ta_db = ta_db[ta_db["Technology type"] == "Pharmaceutical"]
+
+    # "TA001" -> 1 -> "NICE TA1": the int round-trip drops leading zeros.
+    ta_numbers = ta_db["TA ID"].str.replace(r'\D+', '', regex=True).astype(int)
+    ta_db["TA ID"] = "NICE TA" + ta_numbers.astype(str)
+    return ta_db[["TA ID", "Indication"]].drop_duplicates()
+
+
+
+