"""Generate a complete synthetic pathways.db for the containerised demo. Uses the existing schema and migration infrastructure to build a fully functional database with fabricated patient pathway data. Reference data (drug names, directories, SNOMED clusters) comes from the real CSVs — these are standard NHS terminology, not patient data. Trust names are fictional to make the synthetic nature obvious. Usage: python scripts/generate_demo_db.py """ from __future__ import annotations import json import random import sqlite3 import sys import uuid from datetime import datetime, timedelta from pathlib import Path # Ensure project root is on sys.path _project_root = str(Path(__file__).resolve().parent.parent) if _project_root not in sys.path: sys.path.insert(0, _project_root) # Pre-register data_processing as a bare namespace package to avoid its # __init__.py which pulls in snowflake_connector (not needed/available here). import types import importlib if "data_processing" not in sys.modules: _pkg = types.ModuleType("data_processing") _pkg.__path__ = [str(Path(_project_root) / "data_processing")] _pkg.__package__ = "data_processing" sys.modules["data_processing"] = _pkg from core.config import PathConfig # noqa: E402 from data_processing.database import DatabaseConfig, DatabaseManager # noqa: E402 from data_processing.schema import create_all_tables # noqa: E402 from data_processing.reference_data import ( # noqa: E402 migrate_drug_names, migrate_organizations, migrate_directories, migrate_drug_directory_map, migrate_drug_indication_clusters, ) # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- random.seed(42) FICTIONAL_TRUSTS = [ "GREENFIELD UNIVERSITY HOSPITAL NHS FT", "RIVERSIDE DISTRICT GENERAL NHS TRUST", "THORNBURY ROYAL INFIRMARY NHS FT", "ASHWORTH COMMUNITY HOSPITAL NHS TRUST", "KINGSBURY TEACHING HOSPITALS NHS FT", ] ROOT_LABEL = "DEMO ICS" DATE_FILTER_IDS = [ "all_6mo", "all_12mo", "1yr_6mo", "1yr_12mo", "2yr_6mo", "2yr_12mo", ] CHART_TYPES = ["directory", "indication"] # Drug → directories mapping (subset of real drugs found in ref_drug_directory_map) DRUG_DIRECTORIES: dict[str, list[str]] = { "ADALIMUMAB": ["RHEUMATOLOGY", "GASTROENTEROLOGY", "DERMATOLOGY", "OPHTHALMOLOGY"], "INFLIXIMAB": ["GASTROENTEROLOGY", "RHEUMATOLOGY"], "ETANERCEPT": ["RHEUMATOLOGY", "DERMATOLOGY"], "RITUXIMAB": ["RHEUMATOLOGY", "HAEMATOLOGY"], "TOCILIZUMAB": ["RHEUMATOLOGY"], "SECUKINUMAB": ["RHEUMATOLOGY", "DERMATOLOGY"], "VEDOLIZUMAB": ["GASTROENTEROLOGY"], "USTEKINUMAB": ["GASTROENTEROLOGY", "DERMATOLOGY"], "TOFACITINIB": ["GASTROENTEROLOGY", "RHEUMATOLOGY"], "BARICITINIB": ["RHEUMATOLOGY", "DERMATOLOGY"], "OCRELIZUMAB": ["NEUROLOGY"], "NATALIZUMAB": ["NEUROLOGY"], "AFLIBERCEPT": ["OPHTHALMOLOGY"], "RANIBIZUMAB": ["OPHTHALMOLOGY"], "IBRUTINIB": ["HAEMATOLOGY"], "LENALIDOMIDE": ["HAEMATOLOGY"], "PEMBROLIZUMAB": ["ONCOLOGY"], "NIVOLUMAB": ["ONCOLOGY"], "TRASTUZUMAB": ["ONCOLOGY"], "BEVACIZUMAB": ["ONCOLOGY"], } # Drug → indications (for indication chart type) DRUG_INDICATIONS: dict[str, list[str]] = { "ADALIMUMAB": ["Rheumatoid arthritis", "Crohn's disease", "Psoriasis", "Uveitis"], "INFLIXIMAB": ["Ulcerative colitis", "Crohn's disease", "Rheumatoid arthritis"], "ETANERCEPT": ["Rheumatoid arthritis", "Psoriatic arthritis", "Ankylosing spondylitis"], "RITUXIMAB": ["Rheumatoid arthritis", "Non-Hodgkin lymphoma", "CLL"], "TOCILIZUMAB": ["Rheumatoid arthritis", "Giant cell arteritis"], "SECUKINUMAB": ["Psoriasis", "Psoriatic arthritis", "Ankylosing spondylitis"], "VEDOLIZUMAB": ["Ulcerative colitis", "Crohn's disease"], "USTEKINUMAB": ["Psoriasis", "Crohn's disease"], "TOFACITINIB": ["Ulcerative colitis", "Rheumatoid arthritis"], "BARICITINIB": ["Rheumatoid arthritis", "Atopic dermatitis"], "OCRELIZUMAB": ["Multiple sclerosis"], "NATALIZUMAB": ["Multiple sclerosis"], "AFLIBERCEPT": ["Wet AMD", "Diabetic macular oedema"], "RANIBIZUMAB": ["Wet AMD"], "IBRUTINIB": ["CLL", "Mantle cell lymphoma"], "LENALIDOMIDE": ["Multiple myeloma"], "PEMBROLIZUMAB": ["Non-small cell lung cancer", "Melanoma"], "NIVOLUMAB": ["Non-small cell lung cancer", "Renal cell carcinoma"], "TRASTUZUMAB": ["HER2+ breast cancer"], "BEVACIZUMAB": ["Colorectal cancer", "Non-small cell lung cancer"], } # Directories that appear in the demo DIRECTORIES_USED = sorted({d for dirs in DRUG_DIRECTORIES.values() for d in dirs}) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def rand_date(start_year: int, end_year: int) -> str: start = datetime(start_year, 1, 1) end = datetime(end_year, 12, 28) delta = (end - start).days dt = start + timedelta(days=random.randint(0, max(delta, 1))) return dt.strftime("%Y-%m-%d") def make_average_spacing_html(drugs: list[str]) -> str: """Build the HTML-formatted average_spacing string the Dash app expects.""" parts = [] for drug in drugs: times = random.randint(4, 30) interval = round(random.uniform(2.0, 12.0), 1) total_weeks = round(times * interval, 1) parts.append( f"
{drug}" f"
On average given {times} times with a {interval} weekly interval " f"({total_weeks} weeks total treatment length)" ) return "".join(parts) def make_average_administered_json(drugs: list[str]) -> str: """Build the JSON array of avg doses the Dash app expects.""" entries = [] for drug in drugs: entries.append({ "drug": drug, "avg_dose_mg": round(random.uniform(50, 500), 1), "avg_administrations": random.randint(4, 30), }) return json.dumps(entries) # --------------------------------------------------------------------------- # Node generation # --------------------------------------------------------------------------- def generate_nodes_for_combination( date_filter_id: str, chart_type: str, refresh_id: str, ) -> list[dict]: """Generate a complete hierarchy of pathway_nodes for one filter/chart combo. Returns a list of dicts ready for INSERT. """ nodes: list[dict] = [] # Scale factors per date filter so narrower filters have fewer patients scale = { "all_6mo": 0.6, "all_12mo": 1.0, "1yr_6mo": 0.3, "1yr_12mo": 0.5, "2yr_6mo": 0.4, "2yr_12mo": 0.7, }[date_filter_id] root_patients = 0 root_cost = 0.0 for trust in FICTIONAL_TRUSTS: trust_patients = 0 trust_cost = 0.0 # Determine level-2 groups based on chart_type if chart_type == "directory": level2_groups = DIRECTORIES_USED else: # For indication chart, use unique indications level2_groups = sorted({ ind for inds in DRUG_INDICATIONS.values() for ind in inds }) for group_name in level2_groups: group_patients = 0 group_cost = 0.0 # Determine which drugs appear under this group if chart_type == "directory": drugs_in_group = [ d for d, dirs in DRUG_DIRECTORIES.items() if group_name in dirs ] else: drugs_in_group = [ d for d, inds in DRUG_INDICATIONS.items() if group_name in inds ] if not drugs_in_group: continue # Only include a random subset per trust to create variation if len(drugs_in_group) > 3: n_drugs = random.randint(2, min(len(drugs_in_group), 5)) drugs_in_group = random.sample(drugs_in_group, n_drugs) for drug in drugs_in_group: drug_id = f"{ROOT_LABEL} - {trust} - {group_name} - {drug}" base_patients = int(random.randint(5, 80) * scale) if base_patients < 1: base_patients = 1 cost_pp = round(random.uniform(3000, 25000), 2) drug_cost = round(base_patients * cost_pp, 2) avg_days = round(random.uniform(180, 2500), 1) # Occasionally generate sub-pathway nodes (level 4+) sub_pathway_patients = 0 sub_pathway_cost = 0.0 if random.random() < 0.4: # Pick 1-2 follow-on drugs other_drugs = [d for d in DRUG_DIRECTORIES if d != drug] n_sub = random.randint(1, 2) follow_on_drugs = random.sample(other_drugs, min(n_sub, len(other_drugs))) for follow_drug in follow_on_drugs: sub_id = f"{drug_id} - {follow_drug}" sub_pts = int(random.randint(2, max(base_patients // 3, 3)) * scale) if sub_pts < 1: sub_pts = 1 sub_cpp = round(random.uniform(3000, 20000), 2) sub_cost = round(sub_pts * sub_cpp, 2) sub_avg_days = round(random.uniform(300, 3000), 1) drug_seq = f"{drug}|{follow_drug}" nodes.append({ "date_filter_id": date_filter_id, "chart_type": chart_type, "parents": drug_id, "ids": sub_id, "labels": follow_drug, "level": 4, "value": sub_pts, "cost": sub_cost, "costpp": sub_cpp, "cost_pp_pa": f"£{sub_cpp * random.uniform(0.8, 1.2):,.0f}", "colour": round(sub_pts / max(base_patients, 1), 4), "first_seen": rand_date(2019, 2022), "last_seen": rand_date(2024, 2025), "first_seen_parent": rand_date(2018, 2021), "last_seen_parent": rand_date(2024, 2025), "average_spacing": make_average_spacing_html([drug, follow_drug]), "average_administered": make_average_administered_json([drug, follow_drug]), "avg_days": sub_avg_days, "trust_name": trust, "directory": group_name if chart_type == "directory" else None, "drug_sequence": drug_seq, "data_refresh_id": refresh_id, }) sub_pathway_patients += sub_pts sub_pathway_cost += sub_cost # Drug node (level 3) — value must include sub-pathways total_drug_patients = base_patients + sub_pathway_patients total_drug_cost = drug_cost + sub_pathway_cost nodes.append({ "date_filter_id": date_filter_id, "chart_type": chart_type, "parents": f"{ROOT_LABEL} - {trust} - {group_name}", "ids": drug_id, "labels": drug, "level": 3, "value": total_drug_patients, "cost": total_drug_cost, "costpp": round(total_drug_cost / max(total_drug_patients, 1), 2), "cost_pp_pa": f"£{total_drug_cost / max(total_drug_patients, 1) * random.uniform(0.8, 1.2):,.0f}", "colour": 0.0, # placeholder, set after group total known "first_seen": rand_date(2018, 2021), "last_seen": rand_date(2024, 2025), "first_seen_parent": rand_date(2017, 2020), "last_seen_parent": rand_date(2024, 2025), "average_spacing": make_average_spacing_html([drug]), "average_administered": make_average_administered_json([drug]), "avg_days": avg_days, "trust_name": trust, "directory": group_name if chart_type == "directory" else None, "drug_sequence": drug, "data_refresh_id": refresh_id, }) group_patients += total_drug_patients group_cost += total_drug_cost if group_patients == 0: continue # Level 2 group node (directory or indication) group_id = f"{ROOT_LABEL} - {trust} - {group_name}" nodes.append({ "date_filter_id": date_filter_id, "chart_type": chart_type, "parents": f"{ROOT_LABEL} - {trust}", "ids": group_id, "labels": group_name, "level": 2, "value": group_patients, "cost": round(group_cost, 2), "costpp": round(group_cost / max(group_patients, 1), 2), "cost_pp_pa": f"£{group_cost / max(group_patients, 1) * random.uniform(0.8, 1.2):,.0f}", "colour": 0.0, # set after trust total known "first_seen": rand_date(2017, 2020), "last_seen": rand_date(2024, 2025), "first_seen_parent": rand_date(2016, 2019), "last_seen_parent": rand_date(2024, 2025), "average_spacing": None, "average_administered": None, "avg_days": None, "trust_name": trust, "directory": group_name if chart_type == "directory" else None, "drug_sequence": None, "data_refresh_id": refresh_id, }) trust_patients += group_patients trust_cost += group_cost if trust_patients == 0: continue # Level 1 trust node trust_id = f"{ROOT_LABEL} - {trust}" nodes.append({ "date_filter_id": date_filter_id, "chart_type": chart_type, "parents": ROOT_LABEL, "ids": trust_id, "labels": trust, "level": 1, "value": trust_patients, "cost": round(trust_cost, 2), "costpp": round(trust_cost / max(trust_patients, 1), 2), "cost_pp_pa": f"£{trust_cost / max(trust_patients, 1):,.0f}", "colour": 0.0, # set after root total known "first_seen": rand_date(2016, 2019), "last_seen": rand_date(2024, 2025), "first_seen_parent": None, "last_seen_parent": None, "average_spacing": None, "average_administered": None, "avg_days": None, "trust_name": trust, "directory": None, "drug_sequence": None, "data_refresh_id": refresh_id, }) root_patients += trust_patients root_cost += trust_cost # Level 0 root node nodes.append({ "date_filter_id": date_filter_id, "chart_type": chart_type, "parents": "", "ids": ROOT_LABEL, "labels": ROOT_LABEL, "level": 0, "value": root_patients, "cost": round(root_cost, 2), "costpp": round(root_cost / max(root_patients, 1), 2), "cost_pp_pa": f"£{root_cost / max(root_patients, 1):,.0f}", "colour": 0.5, "first_seen": None, "last_seen": None, "first_seen_parent": None, "last_seen_parent": None, "average_spacing": None, "average_administered": None, "avg_days": None, "trust_name": None, "directory": None, "drug_sequence": None, "data_refresh_id": refresh_id, }) # Fix colour values (proportion of parent) parent_values: dict[str, int] = {n["ids"]: n["value"] for n in nodes} for node in nodes: if node["level"] > 0 and node["parents"] in parent_values: parent_val = parent_values[node["parents"]] node["colour"] = round(node["value"] / max(parent_val, 1), 4) return nodes # --------------------------------------------------------------------------- # Database construction # --------------------------------------------------------------------------- def insert_nodes(conn: sqlite3.Connection, nodes: list[dict]) -> None: """Bulk insert pathway_nodes.""" columns = [ "date_filter_id", "chart_type", "parents", "ids", "labels", "level", "value", "cost", "costpp", "cost_pp_pa", "colour", "first_seen", "last_seen", "first_seen_parent", "last_seen_parent", "average_spacing", "average_administered", "avg_days", "trust_name", "directory", "drug_sequence", "data_refresh_id", ] placeholders = ", ".join(["?"] * len(columns)) col_names = ", ".join(columns) conn.executemany( f"INSERT INTO pathway_nodes ({col_names}) VALUES ({placeholders})", [tuple(node[c] for c in columns) for node in nodes], ) def build_database(db_path: Path) -> None: """Build the complete synthetic database.""" # Remove existing DB if db_path.exists(): db_path.unlink() db_path.parent.mkdir(parents=True, exist_ok=True) config = DatabaseConfig(db_path=db_path) db_manager = DatabaseManager(config) paths = PathConfig(base_dir=Path(_project_root)) # 1. Create all tables (reference + pathway + date filters) print("Creating schema...") with db_manager.get_connection() as conn: create_all_tables(conn) # 2. Migrate reference data from CSVs print("Migrating reference data...") migrations = [ ("Drug names", lambda: migrate_drug_names(db_manager, paths)), ("Organizations", lambda: migrate_organizations(db_manager, paths)), ("Directories", lambda: migrate_directories(db_manager, paths)), ("Drug-directory map", lambda: migrate_drug_directory_map(db_manager, paths)), ("Drug indication clusters", lambda: migrate_drug_indication_clusters( db_manager, paths.data_dir / "drug_indication_clusters.csv" )), ] for name, migrate_fn in migrations: result = migrate_fn() if not result.success: print(f" FAILED: {name} — {result.error_message}") sys.exit(1) print(f" {name}: {result.rows_inserted} rows inserted") # 3. Generate synthetic pathway_nodes for all 12 combinations refresh_id = str(uuid.uuid4()) started_at = datetime.now().isoformat() total_nodes = 0 date_filter_counts: dict[str, int] = {} print("Generating synthetic pathway nodes...") with db_manager.get_transaction() as conn: for date_filter_id in DATE_FILTER_IDS: filter_count = 0 for chart_type in CHART_TYPES: # Reset random seed per combo for reproducibility but variation random.seed(hash((date_filter_id, chart_type)) % (2**31)) nodes = generate_nodes_for_combination( date_filter_id, chart_type, refresh_id ) insert_nodes(conn, nodes) filter_count += len(nodes) print(f" {date_filter_id}/{chart_type}: {len(nodes)} nodes") date_filter_counts[date_filter_id] = filter_count total_nodes += filter_count # 4. Insert refresh log entry print("Writing refresh log...") completed_at = datetime.now().isoformat() with db_manager.get_transaction() as conn: conn.execute( """ INSERT INTO pathway_refresh_log (refresh_id, started_at, completed_at, status, record_count, date_filter_counts, source_row_count, processing_duration_seconds) VALUES (?, ?, ?, 'completed', ?, ?, ?, ?) """, ( refresh_id, started_at, completed_at, total_nodes, json.dumps(date_filter_counts), total_nodes, 0.0, ), ) print(f"\nDone! {total_nodes} total nodes written to {db_path}") print(f"Date filter breakdown: {json.dumps(date_filter_counts, indent=2)}") # --------------------------------------------------------------------------- # Entry point # --------------------------------------------------------------------------- if __name__ == "__main__": db_path = Path(_project_root) / "data" / "pathways.db" build_database(db_path)