adc1dbfc58
Tested full refresh pipeline end-to-end with real Snowflake data: - Fixed trust filter to read Name column from defaultTrusts.csv - Fixed Decimal type handling in calculate_cost_per_patient_per_annum - Fixed array handling in convert_to_records for average_administered - Added required reference CSV files to data/ directory - Configured Snowflake connection (account, warehouse, user) Results: - Snowflake fetch: 656,695 records in ~7s - Transformations: 519,848 records after UPID/drug/directory - Pathway nodes: 293 for all_6mo (8 trusts, 14 directories) - Total processing time: ~6.2 minutes
332 lines
9.4 KiB
Python
332 lines
9.4 KiB
Python
"""
|
|
Statistical calculation functions for patient pathway analysis.
|
|
|
|
This module contains functions for calculating:
|
|
- Drug frequency counts and averages
|
|
- Cost aggregations (total, per patient, per annum)
|
|
- Treatment duration calculations
|
|
- Dosing interval calculations
|
|
|
|
These functions are extracted from the analysis pipeline to enable:
|
|
- Independent testing
|
|
- Reuse across different analysis contexts
|
|
- Clearer separation of concerns
|
|
"""
|
|
|
|
from itertools import groupby
|
|
from typing import Optional
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
|
|
def count_consecutive_values(values: list) -> list[int]:
    """
    Tally how often each distinct value occurs, reported in sorted order.

    The input does not have to be pre-sorted: it is sorted here so that equal
    values form adjacent runs, and the length of every run is returned.
    Used to count how many times each drug was administered.

    Args:
        values: List of mutually comparable values (typically drug names)

    Returns:
        One count per distinct value, ordered by the sorted values

    Example:
        >>> count_consecutive_values(['A', 'A', 'B', 'A'])
        [3, 1]
    """
    run_lengths = []
    # groupby only merges neighbours, hence the sort beforehand.
    for _, run in groupby(sorted(values)):
        run_lengths.append(sum(1 for _ in run))
    return run_lengths
|
|
|
|
|
|
def calculate_drug_costs(drug_counts: list[int], prices: list[float]) -> list[float]:
    """
    Sum the administration prices belonging to each drug.

    The price list is consumed in consecutive slices: the first
    ``drug_counts[0]`` prices belong to the first drug, the next
    ``drug_counts[1]`` prices to the second drug, and so on. Each slice is
    summed and converted to ``float``.

    Args:
        drug_counts: Administration counts per drug (from count_consecutive_values)
        prices: Individual administration prices (Price Actual values), laid
            out in the same drug order as ``drug_counts``

    Returns:
        List of total costs per drug, one entry per count

    Example:
        >>> calculate_drug_costs([3, 2], [100, 100, 100, 200, 200])
        [300.0, 400.0]
    """
    totals = []
    start = 0
    for n_doses in drug_counts:
        stop = start + n_doses
        # A slice running past the end of ``prices`` is simply shorter (or
        # empty), so a short price list yields smaller totals, not an error.
        totals.append(float(sum(prices[start:stop])))
        start = stop
    return totals
|
|
|
|
|
|
def calculate_dosing_frequency(
    freq: int,
    start_date: pd.Timestamp,
    end_date: pd.Timestamp,
) -> float:
    """
    Calculate the average dosing interval in days.

    The span between the first and last administration is divided by the
    number of gaps between doses (``freq - 1``).

    Args:
        freq: Number of administrations
        start_date: First administration date
        end_date: Last administration date

    Returns:
        Average days between administrations; 0.0 when there is at most one
        dose or when the span between the dates is zero or negative

    Example:
        >>> start = pd.Timestamp('2024-01-01')
        >>> end = pd.Timestamp('2024-01-22')
        >>> calculate_dosing_frequency(4, start, end)
        7.0
    """
    # A single dose (or none) has no gap to measure.
    if freq <= 1:
        return 0.0

    one_day = np.timedelta64(1, "D")
    span_in_days = (end_date - start_date) / one_day

    # N doses are separated by N - 1 gaps.
    return 0.0 if span_in_days <= 0 else span_in_days / (freq - 1)
|
|
|
|
|
|
def calculate_drug_frequency_row(row: pd.Series) -> list[float]:
    """
    Calculate the average dosing interval for each drug in one patient row.

    Intended for DataFrame.apply() on rows carrying drug_*, freq_*,
    start_date_* and end_date_* entries. For every drug position ``d`` the
    values ``freq_d``, ``start_date_d`` and ``end_date_d`` are read and passed
    to calculate_dosing_frequency.

    Args:
        row: Series with drug names, frequencies, start dates, and end dates

    Returns:
        List of average dosing intervals (days), one per drug position. A
        position gets 0.0 when its frequency is missing or at most 1, or when
        either of its dates is missing.
    """
    # The number of positions is the number of index labels containing the
    # substring "drug_".
    # NOTE(review): any label that merely contains "drug_" is counted too.
    n_positions = row.index.str.contains("drug_").sum()

    intervals = []
    for position in range(n_positions):
        raw_freq = row.get(f"freq_{position}", 0)
        doses = 0 if (raw_freq is None or pd.isna(raw_freq)) else int(raw_freq)

        # Default covers: too few doses, or incomplete date information.
        interval = 0.0
        if doses > 1:
            first_dose = row.get(f"start_date_{position}")
            last_dose = row.get(f"end_date_{position}")
            if pd.notna(first_dose) and pd.notna(last_dose):
                interval = calculate_dosing_frequency(doses, first_dose, last_dose)

        intervals.append(interval)

    return intervals
|
|
|
|
|
|
def calculate_cost_per_patient_per_annum(
    total_cost: float,
    days_treated: Optional[pd.Timedelta],
) -> Optional[float]:
    """
    Calculate annualized cost per patient.

    Normalizes costs to a per-year basis (365 days) to enable comparison
    across patients with different treatment durations.

    Args:
        total_cost: Total cost for the patient (anything ``float()`` accepts,
            e.g. Decimal or float)
        days_treated: Treatment duration, either as a timedelta
            (pd.Timedelta, datetime.timedelta, np.timedelta64) or as a plain
            number of days

    Returns:
        Annualized cost, or None if days_treated is None, missing (NaN/NaT),
        zero or negative

    Example:
        >>> calculate_cost_per_patient_per_annum(5000, pd.Timedelta(days=182.5))
        10000.0
    """
    # Function-scope import so this block does not depend on a module-level
    # import of datetime.
    from datetime import timedelta

    if days_treated is None or pd.isna(days_treated):
        return None

    # Decide by type, not by hasattr(..., "__truediv__"): plain numbers also
    # define __truediv__, and dividing a number by np.timedelta64 raises
    # TypeError, which made the numeric fallback unreachable.
    if isinstance(days_treated, (timedelta, np.timedelta64)):
        days = days_treated / np.timedelta64(1, "D")
    else:
        days = float(days_treated)

    if days <= 0:
        return None

    # Convert total_cost to float to handle Decimal from Snowflake
    return float(total_cost) / (days / 365)
|
|
|
|
|
|
def calculate_treatment_duration(
    first_seen: pd.Timestamp,
    last_seen: pd.Timestamp,
) -> pd.Timedelta:
    """
    Return the time elapsed between the first and last treatment dates.

    Args:
        first_seen: Date of first treatment
        last_seen: Date of last treatment

    Returns:
        ``last_seen - first_seen`` as a timedelta (negative if the dates are
        passed in reverse order)
    """
    elapsed = last_seen - first_seen
    return elapsed
|
|
|
|
|
|
def calculate_pathway_proportion(value: int, parent_value: int) -> float:
    """
    Express a pathway's patient count as a fraction of its parent's count.

    Used to determine color intensity in the icicle chart based on what
    proportion of the parent category this pathway represents.

    Args:
        value: Patient count for this pathway
        parent_value: Total patient count for the parent category

    Returns:
        ``value / parent_value``, or 0.0 when the parent count is zero or
        negative (avoids division by zero)
    """
    return 0.0 if parent_value <= 0 else value / parent_value
|
|
|
|
|
|
def aggregate_patient_costs(df: pd.DataFrame) -> pd.DataFrame:
    """
    Sum the cost of all records belonging to each patient (UPID).

    Only the UPID and Price Actual columns are used; the input frame is not
    modified.

    Args:
        df: DataFrame with UPID and Price Actual columns

    Returns:
        DataFrame indexed by UPID with a single Total cost column
    """
    return (
        df[["UPID", "Price Actual"]]
        .groupby("UPID")
        .sum()
        .rename(columns={"Price Actual": "Total cost"})
    )
|
|
|
|
|
|
def aggregate_drug_frequencies(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate drug administration frequency per patient.

    Groups the records by UPID and replaces each patient's Drug Name values
    with the list produced by count_consecutive_values.

    Args:
        df: DataFrame with UPID and Drug Name columns

    Returns:
        DataFrame indexed by UPID whose Drug Name column holds a list of
        administration counts per patient
    """

    def _administration_counts(drug_names: pd.Series) -> list[int]:
        # Hand the group's values over as a plain list.
        return count_consecutive_values(list(drug_names))

    # The groupby result is already indexed by UPID, so no
    # reset_index()/set_index("UPID") round trip is needed.
    return df.groupby("UPID").agg({"Drug Name": _administration_counts})
|
|
|
|
|
|
def calculate_average_spacing_for_pathway(
    upid_drugs_df: pd.DataFrame,
    pathway_value: str,
) -> list[float]:
    """
    Calculate average dosing spacing for a treatment pathway.

    Selects the rows whose ``value`` column equals ``pathway_value`` and takes
    the mean of every column whose name starts with ``spacing_``.

    Args:
        upid_drugs_df: DataFrame with a ``value`` column identifying the
            pathway and ``spacing_*`` columns
        pathway_value: Pathway identifier string

    Returns:
        Mean spacing per ``spacing_*`` column (in column order), rounded to
        whole numbers, with 0.0 where no mean is available. Empty list when
        no row matches the pathway.
    """
    spacing_columns = []
    for name in upid_drugs_df.columns:
        if name.startswith("spacing_"):
            spacing_columns.append(name)

    is_pathway = upid_drugs_df["value"] == pathway_value
    matching_rows = upid_drugs_df[is_pathway]
    if matching_rows.shape[0] == 0:
        return []

    rounded_means = []
    for mean_spacing in matching_rows[spacing_columns].mean().tolist():
        # A column with no usable values has a NaN mean; report it as 0.0.
        if pd.notna(mean_spacing):
            rounded_means.append(round(mean_spacing, 0))
        else:
            rounded_means.append(0.0)
    return rounded_means
|
|
|
|
|
|
def format_treatment_statistics(
    drug_names: list[str],
    average_administered: list[float],
    average_spacing: list[float],
    average_cost: list[float],
) -> str:
    """
    Format drug treatment statistics into a readable string for chart display.

    Produces one HTML fragment per drug stating the average number of
    administrations, the dosing interval in weeks, and the total treatment
    length in weeks (interval in weeks multiplied by administrations). The
    fragments are concatenated in drug order.

    Args:
        drug_names: List of drug names in treatment sequence
        average_administered: Average number of administrations per drug
        average_spacing: Average days between doses per drug
        average_cost: Average cost per drug (accepted but not used in the
            generated text)

    Returns:
        HTML-formatted string for chart hover text; empty string when
        ``drug_names`` is empty. Drugs without a matching entry in the
        statistics lists are reported with 0.
    """
    fragments = []

    for position, drug_name in enumerate(drug_names):
        # The statistics lists may be shorter than the drug list.
        if position < len(average_administered):
            doses = average_administered[position]
        else:
            doses = 0
        if position < len(average_spacing):
            interval_days = average_spacing[position]
        else:
            interval_days = 0

        # Convert to weeks
        interval_weeks = interval_days / 7 if interval_days > 0 else 0
        course_weeks = interval_weeks * doses if doses > 0 else 0

        fragments.append(
            f"<br><b>{drug_name}</b><br>On average given "
            f"{round(doses, 1)} times with a "
            f"{round(interval_weeks, 1)} weekly interval ("
            f"{round(course_weeks, 0)} weeks total treatment length)"
        )

    return "".join(fragments)
|
|
|
|
|
|
def remove_nan_values(values: list) -> list:
    """
    Drop every entry whose string form is 'nan' (case-insensitive).

    Used to clean up aggregated statistics that may contain 'nan' strings.
    Because the check is done on ``str(x)``, float NaN entries are dropped as
    well.

    Args:
        values: List potentially containing 'nan' strings

    Returns:
        New list with the remaining entries in their original order
    """
    kept = []
    for item in values:
        if str(item).lower() == "nan":
            continue
        kept.append(item)
    return kept
|