FROM python:3.11-slim

WORKDIR /app

# Runtime deps only — excludes snowflake, pywebview, pyinstaller, pyarrow, fastparquet.
#
# Every requirement is quoted. RUN in shell form is executed by /bin/sh, and an
# unquoted `dash>=2.14.0` is parsed as the word `dash` followed by a stdout
# redirection to a file named `=2.14.0`; pip would then install whatever the
# latest release is and the version floors would be silently ignored.
#
# Installed before COPY so that editing application source does not invalidate
# the (slow) dependency layer in the build cache.
RUN pip install --no-cache-dir \
    "dash>=2.14.0" \
    "dash-mantine-components>=0.14.0" \
    "plotly>=5.15.0" \
    "pandas>=2.0.3" \
    "numpy>=1.25.0" \
    "gunicorn>=21.0.0"

COPY . .

# Generate synthetic database at build time
RUN python scripts/generate_demo_db.py

EXPOSE 8050

CMD ["gunicorn", "--bind", "0.0.0.0:8050", "--workers", "2", "--timeout", "120", "dash_app.app:server"]
services:
  # Single service: the demo app image built from the Dockerfile in this directory.
  hcd-demo:
    build: .
    ports:
      # host:container — matches the port the Dockerfile EXPOSEs and gunicorn binds.
      - "8050:8050"
    restart: unless-stopped
import types
import importlib  # NOTE(review): not referenced in the visible part of this script — confirm before removing

# Register an empty stand-in module under the package name so the
# `data_processing.*` submodule imports below resolve via __path__ without
# executing the package's own __init__.py.
if "data_processing" not in sys.modules:
    _pkg = types.ModuleType("data_processing")
    _pkg.__path__ = [str(Path(_project_root) / "data_processing")]
    _pkg.__package__ = "data_processing"
    sys.modules["data_processing"] = _pkg

# These imports must stay below the sys.path / sys.modules setup above,
# hence the E402 (module-level import not at top of file) suppressions.
from core.config import PathConfig  # noqa: E402
from data_processing.database import DatabaseConfig, DatabaseManager  # noqa: E402
from data_processing.schema import create_all_tables  # noqa: E402
from data_processing.reference_data import (  # noqa: E402
    migrate_drug_names,
    migrate_organizations,
    migrate_directories,
    migrate_drug_directory_map,
    migrate_drug_indication_clusters,
)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Import-time default seed; build_database() reseeds before each
# date-filter / chart-type combination.
random.seed(42)

# Made-up trust names; each becomes a level-1 node under the root.
FICTIONAL_TRUSTS = [
    "GREENFIELD UNIVERSITY HOSPITAL NHS FT",
    "RIVERSIDE DISTRICT GENERAL NHS TRUST",
    "THORNBURY ROYAL INFIRMARY NHS FT",
    "ASHWORTH COMMUNITY HOSPITAL NHS TRUST",
    "KINGSBURY TEACHING HOSPITALS NHS FT",
]

# Label and id of the single level-0 root node.
ROOT_LABEL = "DEMO ICS"

# One node hierarchy is generated per (date filter, chart type) pair:
# 6 filters x 2 chart types = 12 combinations.
DATE_FILTER_IDS = [
    "all_6mo", "all_12mo",
    "1yr_6mo", "1yr_12mo",
    "2yr_6mo", "2yr_12mo",
]

CHART_TYPES = ["directory", "indication"]

# Drug → directories mapping (subset of real drugs found in ref_drug_directory_map)
DRUG_DIRECTORIES: dict[str, list[str]] = {
    "ADALIMUMAB": ["RHEUMATOLOGY", "GASTROENTEROLOGY", "DERMATOLOGY", "OPHTHALMOLOGY"],
    "INFLIXIMAB": ["GASTROENTEROLOGY", "RHEUMATOLOGY"],
    "ETANERCEPT": ["RHEUMATOLOGY", "DERMATOLOGY"],
    "RITUXIMAB": ["RHEUMATOLOGY", "HAEMATOLOGY"],
    "TOCILIZUMAB": ["RHEUMATOLOGY"],
    "SECUKINUMAB": ["RHEUMATOLOGY", "DERMATOLOGY"],
    "VEDOLIZUMAB": ["GASTROENTEROLOGY"],
    "USTEKINUMAB": ["GASTROENTEROLOGY", "DERMATOLOGY"],
    "TOFACITINIB": ["GASTROENTEROLOGY", "RHEUMATOLOGY"],
    "BARICITINIB": ["RHEUMATOLOGY", "DERMATOLOGY"],
    "OCRELIZUMAB": ["NEUROLOGY"],
    "NATALIZUMAB": ["NEUROLOGY"],
    "AFLIBERCEPT": ["OPHTHALMOLOGY"],
    "RANIBIZUMAB": ["OPHTHALMOLOGY"],
    "IBRUTINIB": ["HAEMATOLOGY"],
    "LENALIDOMIDE": ["HAEMATOLOGY"],
    "PEMBROLIZUMAB": ["ONCOLOGY"],
    "NIVOLUMAB": ["ONCOLOGY"],
    "TRASTUZUMAB": ["ONCOLOGY"],
    "BEVACIZUMAB": ["ONCOLOGY"],
}

# Drug → indications (for indication chart type)
# Keyed by the same drug names as DRUG_DIRECTORIES.
DRUG_INDICATIONS: dict[str, list[str]] = {
    "ADALIMUMAB": ["Rheumatoid arthritis", "Crohn's disease", "Psoriasis", "Uveitis"],
    "INFLIXIMAB": ["Ulcerative colitis", "Crohn's disease", "Rheumatoid arthritis"],
    "ETANERCEPT": ["Rheumatoid arthritis", "Psoriatic arthritis", "Ankylosing spondylitis"],
    "RITUXIMAB": ["Rheumatoid arthritis", "Non-Hodgkin lymphoma", "CLL"],
    "TOCILIZUMAB": ["Rheumatoid arthritis", "Giant cell arteritis"],
    "SECUKINUMAB": ["Psoriasis", "Psoriatic arthritis", "Ankylosing spondylitis"],
    "VEDOLIZUMAB": ["Ulcerative colitis", "Crohn's disease"],
    "USTEKINUMAB": ["Psoriasis", "Crohn's disease"],
    "TOFACITINIB": ["Ulcerative colitis", "Rheumatoid arthritis"],
    "BARICITINIB": ["Rheumatoid arthritis", "Atopic dermatitis"],
    "OCRELIZUMAB": ["Multiple sclerosis"],
    "NATALIZUMAB": ["Multiple sclerosis"],
    "AFLIBERCEPT": ["Wet AMD", "Diabetic macular oedema"],
    "RANIBIZUMAB": ["Wet AMD"],
    "IBRUTINIB": ["CLL", "Mantle cell lymphoma"],
    "LENALIDOMIDE": ["Multiple myeloma"],
    "PEMBROLIZUMAB": ["Non-small cell lung cancer", "Melanoma"],
    "NIVOLUMAB": ["Non-small cell lung cancer", "Renal cell carcinoma"],
    "TRASTUZUMAB": ["HER2+ breast cancer"],
    "BEVACIZUMAB": ["Colorectal cancer", "Non-small cell lung cancer"],
}

# Directories that appear in the demo
# (sorted, de-duplicated union of every list in DRUG_DIRECTORIES).
DIRECTORIES_USED = sorted({d for dirs in DRUG_DIRECTORIES.values() for d in dirs})
def rand_date(start_year: int, end_year: int) -> str:
    """Return a pseudo-random date as a ``YYYY-MM-DD`` string.

    The date is drawn uniformly (one ``random.randint`` call on the
    module-level generator) from the window 1 January ``start_year`` to
    28 December ``end_year``.

    Args:
        start_year: Calendar year in which the window opens.
        end_year: Calendar year in which the window closes.

    Returns:
        The chosen date formatted as ``%Y-%m-%d``.
    """
    window_open = datetime(start_year, 1, 1)
    span_days = (datetime(end_year, 12, 28) - window_open).days
    # The upper bound is never allowed below 1, so an inverted window
    # (end_year < start_year) still yields window_open or the day after.
    upper_offset = span_days if span_days > 1 else 1
    offset = random.randint(0, upper_offset)
    return (window_open + timedelta(days=offset)).strftime("%Y-%m-%d")
def make_average_spacing_html(drugs: list[str]) -> str:
    """Build the average_spacing string the Dash app expects.

    For each drug, in order, two fragments are emitted: the drug name, then a
    sentence giving a random administration count, a random interval in weeks
    and their product as the total treatment length. Each fragment is
    preceded by a line break.

    Args:
        drugs: Drug names to describe, in output order.

    Returns:
        The concatenated fragments; an empty string when ``drugs`` is empty.
    """
    # NOTE(review): the separators are written as explicit "\n" escapes because
    # a raw line break inside a single-quoted f-string does not compile. The
    # function is documented as producing HTML, so the original separators may
    # have been markup (e.g. <br>) — confirm against what the Dash app parses.
    parts = []
    for drug in drugs:
        times = random.randint(4, 30)
        interval = round(random.uniform(2.0, 12.0), 1)
        total_weeks = round(times * interval, 1)
        parts.append(
            f"\n{drug}"
            f"\nOn average given {times} times with a {interval} weekly interval "
            f"({total_weeks} weeks total treatment length)"
        )
    return "".join(parts)
def make_average_administered_json(drugs: list[str]) -> str:
    """Serialise one random dosing summary per drug as a JSON array.

    Each array element is an object with the keys ``drug``,
    ``avg_dose_mg`` (random float in [50, 500], one decimal place) and
    ``avg_administrations`` (random integer in [4, 30]).

    Args:
        drugs: Drug names, in output order.

    Returns:
        The JSON text; ``"[]"`` when ``drugs`` is empty.
    """
    # Dict values are evaluated top to bottom, so the dose is drawn before the
    # administration count for every drug.
    return json.dumps([
        {
            "drug": name,
            "avg_dose_mg": round(random.uniform(50, 500), 1),
            "avg_administrations": random.randint(4, 30),
        }
        for name in drugs
    ])
def _pathway_node(shared: dict, **fields) -> dict:
    """Return one pathway_nodes row as a dict.

    Optional columns start as None; ``shared`` (the per-combination columns)
    and then the node-specific ``fields`` are layered on top.
    """
    node = dict.fromkeys((
        "first_seen", "last_seen", "first_seen_parent", "last_seen_parent",
        "average_spacing", "average_administered", "avg_days",
        "trust_name", "directory", "drug_sequence",
    ))
    node.update(shared)
    node.update(fields)
    return node


def generate_nodes_for_combination(
    date_filter_id: str,
    chart_type: str,
    refresh_id: str,
) -> list[dict]:
    """Generate a complete hierarchy of pathway_nodes for one filter/chart combo.

    The hierarchy is root (level 0) → trust (1) → directory or indication (2)
    → drug (3) → optional follow-on drug (4). Every parent's ``value`` and
    ``cost`` are the sums of its children, and ``colour`` is each node's share
    of its parent's ``value`` (0.5 for the root).

    Args:
        date_filter_id: One of the keys of the scale table below.
        chart_type: ``"directory"`` groups drugs by DRUG_DIRECTORIES; any other
            value groups them by DRUG_INDICATIONS.
        refresh_id: Stored verbatim in every node's ``data_refresh_id``.

    Returns:
        A list of dicts ready for INSERT, children before their parents.

    Raises:
        KeyError: If ``date_filter_id`` has no scale factor.
    """
    # Scale factors per date filter so narrower filters have fewer patients
    scale = {
        "all_6mo": 0.6, "all_12mo": 1.0,
        "1yr_6mo": 0.3, "1yr_12mo": 0.5,
        "2yr_6mo": 0.4, "2yr_12mo": 0.7,
    }[date_filter_id]

    by_directory = chart_type == "directory"
    drug_groups = DRUG_DIRECTORIES if by_directory else DRUG_INDICATIONS
    # The level-2 group list does not depend on the trust, so build it once.
    if by_directory:
        level2_groups = DIRECTORIES_USED
    else:
        level2_groups = sorted({
            ind for inds in DRUG_INDICATIONS.values() for ind in inds
        })

    shared = {
        "date_filter_id": date_filter_id,
        "chart_type": chart_type,
        "data_refresh_id": refresh_id,
    }

    nodes: list[dict] = []
    root_patients = 0
    root_cost = 0.0

    for trust in FICTIONAL_TRUSTS:
        trust_id = f"{ROOT_LABEL} - {trust}"
        trust_patients = 0
        trust_cost = 0.0

        for group_name in level2_groups:
            drugs_in_group = [
                d for d, groups in drug_groups.items() if group_name in groups
            ]
            if not drugs_in_group:
                continue

            # Only include a random subset per trust to create variation
            if len(drugs_in_group) > 3:
                n_drugs = random.randint(2, min(len(drugs_in_group), 5))
                drugs_in_group = random.sample(drugs_in_group, n_drugs)

            group_id = f"{trust_id} - {group_name}"
            directory = group_name if by_directory else None
            group_patients = 0
            group_cost = 0.0

            for drug in drugs_in_group:
                drug_id = f"{group_id} - {drug}"
                base_patients = max(int(random.randint(5, 80) * scale), 1)
                cost_pp = round(random.uniform(3000, 25000), 2)
                drug_cost = round(base_patients * cost_pp, 2)
                avg_days = round(random.uniform(180, 2500), 1)

                # Occasionally generate sub-pathway nodes (level 4)
                sub_pathway_patients = 0
                sub_pathway_cost = 0.0
                if random.random() < 0.4:
                    # Pick 1-2 follow-on drugs
                    other_drugs = [d for d in DRUG_DIRECTORIES if d != drug]
                    n_sub = random.randint(1, 2)
                    follow_on_drugs = random.sample(
                        other_drugs, min(n_sub, len(other_drugs))
                    )

                    for follow_drug in follow_on_drugs:
                        sub_pts = max(
                            int(random.randint(2, max(base_patients // 3, 3)) * scale),
                            1,
                        )
                        sub_cpp = round(random.uniform(3000, 20000), 2)
                        sub_cost = round(sub_pts * sub_cpp, 2)
                        sub_avg_days = round(random.uniform(300, 3000), 1)

                        # Keyword arguments are evaluated left to right; the
                        # order below fixes the order of the random draws.
                        nodes.append(_pathway_node(
                            shared,
                            parents=drug_id,
                            ids=f"{drug_id} - {follow_drug}",
                            labels=follow_drug,
                            level=4,
                            value=sub_pts,
                            cost=sub_cost,
                            costpp=sub_cpp,
                            cost_pp_pa=f"£{sub_cpp * random.uniform(0.8, 1.2):,.0f}",
                            colour=0.0,  # set in the final pass
                            first_seen=rand_date(2019, 2022),
                            last_seen=rand_date(2024, 2025),
                            first_seen_parent=rand_date(2018, 2021),
                            last_seen_parent=rand_date(2024, 2025),
                            average_spacing=make_average_spacing_html([drug, follow_drug]),
                            average_administered=make_average_administered_json(
                                [drug, follow_drug]
                            ),
                            avg_days=sub_avg_days,
                            trust_name=trust,
                            directory=directory,
                            drug_sequence=f"{drug}|{follow_drug}",
                        ))
                        sub_pathway_patients += sub_pts
                        sub_pathway_cost += sub_cost

                # Drug node (level 3) — value must include sub-pathways
                total_drug_patients = base_patients + sub_pathway_patients
                total_drug_cost = drug_cost + sub_pathway_cost
                drug_cost_pp = total_drug_cost / max(total_drug_patients, 1)

                nodes.append(_pathway_node(
                    shared,
                    parents=group_id,
                    ids=drug_id,
                    labels=drug,
                    level=3,
                    value=total_drug_patients,
                    # Rounded like every other level; the raw float sum can
                    # carry binary noise beyond two decimals.
                    cost=round(total_drug_cost, 2),
                    costpp=round(drug_cost_pp, 2),
                    cost_pp_pa=f"£{drug_cost_pp * random.uniform(0.8, 1.2):,.0f}",
                    colour=0.0,  # set in the final pass
                    first_seen=rand_date(2018, 2021),
                    last_seen=rand_date(2024, 2025),
                    first_seen_parent=rand_date(2017, 2020),
                    last_seen_parent=rand_date(2024, 2025),
                    average_spacing=make_average_spacing_html([drug]),
                    average_administered=make_average_administered_json([drug]),
                    avg_days=avg_days,
                    trust_name=trust,
                    directory=directory,
                    drug_sequence=drug,
                ))

                group_patients += total_drug_patients
                group_cost += total_drug_cost

            if group_patients == 0:
                continue

            # Level 2 group node (directory or indication)
            group_cost_pp = group_cost / max(group_patients, 1)
            nodes.append(_pathway_node(
                shared,
                parents=trust_id,
                ids=group_id,
                labels=group_name,
                level=2,
                value=group_patients,
                cost=round(group_cost, 2),
                costpp=round(group_cost_pp, 2),
                cost_pp_pa=f"£{group_cost_pp * random.uniform(0.8, 1.2):,.0f}",
                colour=0.0,  # set in the final pass
                first_seen=rand_date(2017, 2020),
                last_seen=rand_date(2024, 2025),
                first_seen_parent=rand_date(2016, 2019),
                last_seen_parent=rand_date(2024, 2025),
                trust_name=trust,
                directory=directory,
            ))

            trust_patients += group_patients
            trust_cost += group_cost

        if trust_patients == 0:
            continue

        # Level 1 trust node
        trust_cost_pp = trust_cost / max(trust_patients, 1)
        nodes.append(_pathway_node(
            shared,
            parents=ROOT_LABEL,
            ids=trust_id,
            labels=trust,
            level=1,
            value=trust_patients,
            cost=round(trust_cost, 2),
            costpp=round(trust_cost_pp, 2),
            cost_pp_pa=f"£{trust_cost_pp:,.0f}",
            colour=0.0,  # set in the final pass
            first_seen=rand_date(2016, 2019),
            last_seen=rand_date(2024, 2025),
            trust_name=trust,
        ))

        root_patients += trust_patients
        root_cost += trust_cost

    # Level 0 root node
    root_cost_pp = root_cost / max(root_patients, 1)
    nodes.append(_pathway_node(
        shared,
        parents="",
        ids=ROOT_LABEL,
        labels=ROOT_LABEL,
        level=0,
        value=root_patients,
        cost=round(root_cost, 2),
        costpp=round(root_cost_pp, 2),
        cost_pp_pa=f"£{root_cost_pp:,.0f}",
        colour=0.5,
    ))

    # Fix colour values (proportion of parent). Done last because children
    # are appended before the parent whose total they contribute to.
    parent_values: dict[str, int] = {n["ids"]: n["value"] for n in nodes}
    for node in nodes:
        if node["level"] > 0 and node["parents"] in parent_values:
            parent_val = parent_values[node["parents"]]
            node["colour"] = round(node["value"] / max(parent_val, 1), 4)

    return nodes
def insert_nodes(conn: sqlite3.Connection, nodes: list[dict]) -> None:
    """Bulk insert pathway_nodes.

    Each dict in ``nodes`` must contain every column named below; extra keys
    are ignored. All rows go through a single ``executemany`` call on
    ``conn``; this function does not commit.

    Args:
        conn: Open connection to a database that has a pathway_nodes table.
        nodes: Row dicts keyed by column name.

    Raises:
        KeyError: If a node lacks one of the columns.
    """
    field_order = (
        "date_filter_id", "chart_type", "parents", "ids", "labels", "level",
        "value", "cost", "costpp", "cost_pp_pa", "colour",
        "first_seen", "last_seen", "first_seen_parent", "last_seen_parent",
        "average_spacing", "average_administered", "avg_days",
        "trust_name", "directory", "drug_sequence",
        "data_refresh_id",
    )
    statement = "INSERT INTO pathway_nodes ({}) VALUES ({})".format(
        ", ".join(field_order),
        ", ".join("?" for _ in field_order),
    )

    # Values are bound positionally, in the same order as the column list.
    rows = []
    for record in nodes:
        rows.append(tuple(record[name] for name in field_order))

    conn.executemany(statement, rows)
def build_database(db_path: Path) -> None:
    """Build the complete synthetic database.

    Steps, in order: delete any existing file at ``db_path``, create the
    schema, load the reference tables via the migrate_* helpers, generate and
    insert pathway nodes for every date-filter / chart-type combination, and
    write one ``pathway_refresh_log`` row describing the run.

    Args:
        db_path: Location of the SQLite file to (re)create. Missing parent
            directories are created.

    Raises:
        SystemExit: With status 1 if any reference-data migration reports
            failure.
    """
    # Remove existing DB
    if db_path.exists():
        db_path.unlink()

    db_path.parent.mkdir(parents=True, exist_ok=True)

    config = DatabaseConfig(db_path=db_path)
    db_manager = DatabaseManager(config)
    paths = PathConfig(base_dir=Path(_project_root))

    # 1. Create all tables (reference + pathway + date filters)
    print("Creating schema...")
    with db_manager.get_connection() as conn:
        create_all_tables(conn)

    # 2. Migrate reference data from CSVs
    print("Migrating reference data...")
    migrations = [
        ("Drug names", lambda: migrate_drug_names(db_manager, paths)),
        ("Organizations", lambda: migrate_organizations(db_manager, paths)),
        ("Directories", lambda: migrate_directories(db_manager, paths)),
        ("Drug-directory map", lambda: migrate_drug_directory_map(db_manager, paths)),
        ("Drug indication clusters", lambda: migrate_drug_indication_clusters(
            db_manager, paths.data_dir / "drug_indication_clusters.csv"
        )),
    ]

    for name, migrate_fn in migrations:
        result = migrate_fn()
        if not result.success:
            print(f" FAILED: {name} — {result.error_message}")
            sys.exit(1)
        print(f" {name}: {result.rows_inserted} rows inserted")

    # 3. Generate synthetic pathway_nodes for all 12 combinations
    refresh_id = str(uuid.uuid4())
    started_at = datetime.now().isoformat()
    total_nodes = 0
    date_filter_counts: dict[str, int] = {}

    print("Generating synthetic pathway nodes...")
    with db_manager.get_transaction() as conn:
        for date_filter_id in DATE_FILTER_IDS:
            filter_count = 0
            for chart_type in CHART_TYPES:
                # Reseed per combo so each one is reproducible yet distinct.
                # The seed is a plain string, not hash(...): str hashes are
                # salted per interpreter process (PYTHONHASHSEED), so a
                # hash-derived seed gives different data on every build,
                # whereas random.seed() maps a str to the same state each run.
                random.seed(f"{date_filter_id}/{chart_type}")

                nodes = generate_nodes_for_combination(
                    date_filter_id, chart_type, refresh_id
                )
                insert_nodes(conn, nodes)
                filter_count += len(nodes)
                print(f" {date_filter_id}/{chart_type}: {len(nodes)} nodes")

            date_filter_counts[date_filter_id] = filter_count
            total_nodes += filter_count

    # 4. Insert refresh log entry
    print("Writing refresh log...")
    completed_at = datetime.now().isoformat()
    with db_manager.get_transaction() as conn:
        conn.execute(
            """
            INSERT INTO pathway_refresh_log
            (refresh_id, started_at, completed_at, status, record_count,
            date_filter_counts, source_row_count, processing_duration_seconds)
            VALUES (?, ?, ?, 'completed', ?, ?, ?, ?)
            """,
            (
                refresh_id,
                started_at,
                completed_at,
                total_nodes,
                json.dumps(date_filter_counts),
                # No upstream source rows exist for synthetic data; the node
                # count is recorded as source_row_count as well.
                total_nodes,
                0.0,
            ),
        )

    print(f"\nDone! {total_nodes} total nodes written to {db_path}")
    print(f"Date filter breakdown: {json.dumps(date_filter_counts, indent=2)}")


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    db_path = Path(_project_root) / "data" / "pathways.db"
    build_database(db_path)