"""Parsing utilities for pathway node data. Shared functions for extracting structured data from pathway_nodes columns. Used by analytics chart callbacks in dash_app/callbacks/. """ import re def parse_average_spacing(spacing_html): """Extract dosing information from average_spacing HTML string. Args: spacing_html: HTML like '
DRUG
On average given 35.6 times with a 9.0 weekly interval (320.0 weeks total treatment length)' May contain multiple drug entries separated by
. Returns: List of dicts with keys: drug_name, dose_count, weekly_interval, total_weeks. Returns empty list for None/empty input or unparseable strings. """ if not spacing_html: return [] results = [] pattern = ( r"([^<]+)
" r"On average given ([\d.]+) times " r"with a ([\d.]+) weekly interval " r"\(([\d.]+) weeks total treatment length\)" ) for match in re.finditer(pattern, spacing_html): results.append({ "drug_name": match.group(1).strip(), "dose_count": float(match.group(2)), "weekly_interval": float(match.group(3)), "total_weeks": float(match.group(4)), }) return results def parse_pathway_drugs(ids, level): """Extract ordered drug list from the ids column at level 4+. Args: ids: String like 'ROOT - TRUST - DIR - DRUG_A - DRUG_B - DRUG_C'. Segments are separated by ' - '. Drug names start at index 3 (0=root, 1=trust, 2=directory, 3+=drugs). level: Node level. Only meaningful for level >= 3. Returns: List of drug names in treatment order. Empty list for level < 3 or invalid input. """ if not ids or level < 3: return [] segments = ids.split(" - ") # Segments: [root, trust, directory, drug_0, drug_1, ...] if len(segments) <= 3: return [] return segments[3:] def _get_patients(node): """Get patient count from a node dict (supports both 'value' and 'patients' keys).""" return node.get("value") or node.get("patients") or 0 def calculate_retention_rate(nodes): """Calculate pathway retention rates from node data. For each N-drug pathway, calculate what % of patients do NOT escalate to an N+1 drug pathway. This identifies effective treatment sequences. Args: nodes: List of dicts with 'ids', 'level', and 'value' or 'patients' keys. Should contain level 4+ nodes (pathway level). Returns: Dict mapping pathway ids to retention info: {ids: {"retained_patients": int, "total_patients": int, "retention_rate": float, "drug_sequence": list}} """ if not nodes: return {} # Index nodes by ids for parent lookup node_map = {n["ids"]: n for n in nodes if n.get("ids")} results = {} for node in nodes: level = node.get("level", 0) if level < 4: continue node_ids = node.get("ids", "") total_patients = _get_patients(node) if not total_patients: continue # Find child pathways (nodes whose ids start with this node's ids + " - ") child_prefix = node_ids + " - " child_patients = sum( _get_patients(n) for n in nodes if n.get("ids", "").startswith(child_prefix) and n.get("level", 0) == level + 1 ) retained = total_patients - child_patients retention_rate = (retained / total_patients * 100) if total_patients > 0 else 0.0 results[node_ids] = { "retained_patients": retained, "total_patients": total_patients, "retention_rate": round(retention_rate, 1), "drug_sequence": parse_pathway_drugs(node_ids, level), } return results