feat: add indication pathway processing functions (Task 2.3)

- Add generate_icicle_chart_indication() to pathway_analyzer.py
  - Variant that uses indication_df instead of directory_df
  - Groups by Trust → Indication_Group (Search_Term, or fallback directory) → Drug → Pathway
  - Accepts indication_df mapping UPID → Indication_Group

- Add process_indication_pathway_for_date_filter() to pathway_pipeline.py
  - Processes indication-based pathway for a single date filter
  - Uses generate_icicle_chart_indication() for hierarchy building

- Add extract_indication_fields() to pathway_pipeline.py
  - Extracts trust_name, search_term, drug_sequence from ids column
  - Similar to extract_denormalized_fields() but for indication charts

- Update convert_to_records() with chart_type parameter
  - Includes chart_type column in output records
  - Supports "directory" and "indication" values

- Add ChartType type alias (Literal["directory", "indication"])

- Update __all__ exports with new functions
This commit is contained in:
Andrew Charlwood
2026-02-05 14:32:28 +00:00
parent aabe4bf45d
commit 7cbc648c6d
3 changed files with 352 additions and 6 deletions
+188
View File
@@ -749,3 +749,191 @@ def generate_icicle_chart(
ice_df = prepare_chart_data(ice_df, minimum_num_patients)
return ice_df, final_title
def _expand_list_column(values: pd.Series, prefix: str) -> pd.DataFrame:
    """Spread a Series of lists into one column per list position.

    The resulting frame keeps the index of ``values`` and its positional
    columns (0, 1, 2, ...) are renamed to ``f"{prefix}0"``, ``f"{prefix}1"``, ...
    """
    return pd.DataFrame(values.tolist(), index=values.index).add_prefix(prefix)


def generate_icicle_chart_indication(
    df: pd.DataFrame,
    indication_df: Optional[pd.DataFrame],
    start_date: str,
    end_date: str,
    last_seen_date: str,
    trust_filter: list[str],
    drug_filter: list[str],
    directory_filter: list[str],
    minimum_num_patients: int,
    title: str = "",
    paths: Optional[PathConfig] = None,
) -> tuple[Optional[pd.DataFrame], str]:
    """
    Generate icicle chart data with indication-based grouping.

    This is a variant of generate_icicle_chart() that groups by Search_Term
    (from GP diagnosis match) instead of Directory. For patients without
    a GP diagnosis match, the fallback directorate is used with a "(no GP dx)"
    suffix to distinguish them.

    Hierarchy: Trust → Indication_Group → Drug → Pathway

    Args:
        df: DataFrame with processed patient intervention data
        indication_df: DataFrame mapping UPID → Indication_Group.
            'UPID' may be the index or a column (a column is moved to the
            index). The 'Indication_Group' (or 'indication_group') column is
            renamed to 'Directory' before being handed to build_hierarchy().
            Values are either Search_Term or "Directory (no GP dx)".
            If None or empty, a warning is logged and the directory mapping
            returned by prepare_data() is used instead.
        start_date: Start date for patient initiation filter
        end_date: End date for patient initiation filter
        last_seen_date: Filter for patients last seen after this date
        trust_filter: List of trust names to include
        drug_filter: List of drug names to include
        directory_filter: List of directories to include
        minimum_num_patients: Minimum number of patients to include a pathway
        title: Chart title (auto-generated if empty)
        paths: PathConfig for file paths (uses default if None)

    Returns:
        Tuple of (ice_df for chart, final_title) or (None, "") if no data
    """
    if paths is None:
        paths = default_paths

    # Prepare data - use standard prepare_data function
    prepared = prepare_data(df, trust_filter, drug_filter, directory_filter, paths)
    if prepared[0] is None:
        return None, ""
    filtered_df, org_codes, directory_df = prepared

    # For indication charts, indication_df takes the place of directory_df.
    if indication_df is not None and not indication_df.empty:
        if "UPID" in indication_df.columns:
            indication_df = indication_df.set_index("UPID")
        # Rename the grouping column for compatibility with build_hierarchy();
        # only the first matching spelling is renamed.
        for group_column in ("Indication_Group", "indication_group"):
            if group_column in indication_df.columns:
                indication_df = indication_df.rename(columns={group_column: "Directory"})
                break
    else:
        # Fall back to directory if no indication data provided
        logger.warning("No indication data provided, falling back to directory grouping")
        indication_df = directory_df

    # Total spend per patient. Computed before calculate_statistics() to keep
    # the original evaluation order.
    total_costs = (
        filtered_df[["UPID", "Price Actual"]]
        .groupby("UPID")
        .sum()
        .rename(columns={"Price Actual": "Total cost"})
    )

    stats = calculate_statistics(filtered_df, start_date, end_date, last_seen_date, title)
    if stats[0] is None:
        return None, ""
    patient_info, date_df, final_title = stats

    # Per-patient lists of every drug row and its price (groupby already
    # leaves UPID as the index).
    df_drug_freq = filtered_df.groupby("UPID").agg({"Drug Name": list, "Price Actual": list})
    df_drug_freq["Drug Name"] = df_drug_freq["Drug Name"].apply(_count_list_values)
    df_drug_freq["Drug cost total"] = df_drug_freq.apply(_sum_list_values, axis=1)

    # Per-patient drug sequence after duplicate treatments are dropped.
    df1_unique = _drop_duplicate_treatments(filtered_df, True)
    df_drugs = df1_unique.groupby("UPID").agg({"Drug Name": list})
    df_drugs_unwrapped = _expand_list_column(df_drugs["Drug Name"], "drug_")

    # Earliest and latest intervention date for each UPIDTreatment key.
    treatment_dates = filtered_df[["UPIDTreatment", "Intervention Date"]]
    start_dates = (
        treatment_dates.sort_values(by=["Intervention Date"], ascending=True)
        .drop_duplicates(subset="UPIDTreatment")
        .set_index("UPIDTreatment")
    )
    end_dates = (
        treatment_dates.sort_values(by=["Intervention Date"], ascending=False)
        .drop_duplicates(subset="UPIDTreatment")
        .set_index("UPIDTreatment")
    )

    # The temporary column is popped straight away so each row passed to the
    # helper only ever contains the drug_* columns.
    df_drugs_unwrapped["start_dates"] = df_drugs_unwrapped.apply(
        lambda row: _start_date_drug(start_dates, row), axis=1
    )
    df_start_dates_unwrapped = _expand_list_column(
        df_drugs_unwrapped.pop("start_dates"), "start_date_"
    )

    # _start_date_drug is reused here with the end-date lookup table.
    df_drugs_unwrapped["end_dates"] = df_drugs_unwrapped.apply(
        lambda row: _start_date_drug(end_dates, row), axis=1
    )
    df_end_dates_unwrapped = _expand_list_column(
        df_drugs_unwrapped.pop("end_dates"), "end_date_"
    )

    # Label frequency data with its own UPID index so the index merge aligns
    # rows by patient instead of relying on two groupbys sharing row order.
    df_freq_for_merge = _expand_list_column(df_drug_freq["Drug Name"], "freq_")

    for extra_columns in (df_start_dates_unwrapped, df_end_dates_unwrapped, df_freq_for_merge):
        df_drugs_unwrapped = pd.merge(
            df_drugs_unwrapped, extra_columns, left_index=True, right_index=True
        )

    # "frequency" is only an intermediate used to derive the spacing_* columns.
    df_drugs_unwrapped["frequency"] = df_drugs_unwrapped.apply(_drug_frequency_average, axis=1)
    df_spacing_unwrapped = _expand_list_column(
        df_drugs_unwrapped.pop("frequency"), "spacing_"
    )
    df_cost_unwrapped = _expand_list_column(
        df_drug_freq["Drug cost total"], "total_cost_drug_"
    )

    for extra_columns in (df_spacing_unwrapped, df_cost_unwrapped):
        df_drugs_unwrapped = pd.merge(
            df_drugs_unwrapped, extra_columns, left_index=True, right_index=True
        )

    # Build hierarchy with indication_df instead of directory_df
    ice_df = build_hierarchy(
        patient_info,
        date_df,
        filtered_df,
        org_codes,
        indication_df,  # Use indication mapping instead of directory
        total_costs,
        df_drugs_unwrapped,
    )
    ice_df = prepare_chart_data(ice_df, minimum_num_patients)
    return ice_df, final_title