diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..823547c
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,16 @@
+.git
+__pycache__
+*.pyc
+*.pyo
+*.db
+.pytest_cache
+.coverage
+htmlcov
+tests
+.venv
+*.egg-info
+dist
+build
+.mypy_cache
+.ruff_cache
+*.spec
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1c1f1cf
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Runtime deps only — excludes snowflake, pywebview, pyinstaller, pyarrow, fastparquet.
+# Every requirement is quoted: shell-form RUN goes through /bin/sh, where an
+# unquoted `>` is an output redirection, so `dash>=2.14.0` would install an
+# unconstrained `dash` and send pip's output to a file named `=2.14.0`.
+# Installed before COPY so that source edits do not invalidate this layer.
+RUN pip install --no-cache-dir \
+    "dash>=2.14.0" \
+    "dash-mantine-components>=0.14.0" \
+    "plotly>=5.15.0" \
+    "pandas>=2.0.3" \
+    "numpy>=1.25.0" \
+    "gunicorn>=21.0.0"
+
+COPY . .
+
+# Generate synthetic database at build time
+RUN python scripts/generate_demo_db.py
+
+EXPOSE 8050
+
+CMD ["gunicorn", "--bind", "0.0.0.0:8050", "--workers", "2", "--timeout", "120", "dash_app.app:server"]
diff --git a/dash_app/components/header.py b/dash_app/components/header.py
index 7290964..6e7c158 100644
--- a/dash_app/components/header.py
+++ b/dash_app/components/header.py
@@ -18,6 +18,19 @@ def make_header():
],
),
+ # Demo banner
+ html.Div(
+ "SYNTHETIC DATA FOR DEMONSTRATION",
+ style={
+ "color": "#e53e3e",
+ "fontWeight": "bold",
+ "fontSize": "0.85rem",
+ "letterSpacing": "0.05em",
+ "textAlign": "center",
+ "whiteSpace": "nowrap",
+ },
+ ),
+
# Center: 3 fraction KPIs (filtered / total)
html.Div(
className="top-header__kpis",
diff --git a/data/pathways.db b/data/pathways.db
index a8114f3..837707a 100644
Binary files a/data/pathways.db and b/data/pathways.db differ
diff --git a/data_processing/snowflake_connector.py b/data_processing/snowflake_connector.py
index f24ca04..11c4628 100644
--- a/data_processing/snowflake_connector.py
+++ b/data_processing/snowflake_connector.py
@@ -24,6 +24,8 @@ Usage:
connector.close()
"""
+from __future__ import annotations
+
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import date, datetime
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..bded8b7
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,6 @@
+# Single-service stack for the demo container.
+services:
+ hcd-demo:
+ # Build the image from the build context in this directory.
+ build: .
+ ports:
+ # host:container mapping, both 8050.
+ - "8050:8050"
+ restart: unless-stopped
diff --git a/scripts/generate_demo_db.py b/scripts/generate_demo_db.py
new file mode 100644
index 0000000..bf202d6
--- /dev/null
+++ b/scripts/generate_demo_db.py
@@ -0,0 +1,531 @@
+"""Generate a complete synthetic pathways.db for the containerised demo.
+
+Uses the existing schema and migration infrastructure to build a fully
+functional database with fabricated patient pathway data. Reference data
+(drug names, directories, SNOMED clusters) comes from the real CSVs —
+these are standard NHS terminology, not patient data. Trust names are
+fictional to make the synthetic nature obvious.
+
+Usage:
+ python scripts/generate_demo_db.py
+"""
+
+from __future__ import annotations
+
+import json
+import random
+import sqlite3
+import sys
+import uuid
+from datetime import datetime, timedelta
+from pathlib import Path
+
+# Ensure project root is on sys.path so the `core` and `data_processing`
+# imports below resolve when this file is run as `python scripts/generate_demo_db.py`.
+_project_root = str(Path(__file__).resolve().parent.parent)
+if _project_root not in sys.path:
+ sys.path.insert(0, _project_root)
+
+# Pre-register data_processing as a bare namespace package to avoid its
+# __init__.py which pulls in snowflake_connector (not needed/available here).
+# Submodules are still found through the __path__ set below.
+# NOTE(review): importlib is not referenced in the visible part of this file.
+import types
+import importlib
+
+# Skipped when the real package has already been imported by someone else.
+if "data_processing" not in sys.modules:
+ _pkg = types.ModuleType("data_processing")
+ _pkg.__path__ = [str(Path(_project_root) / "data_processing")]
+ _pkg.__package__ = "data_processing"
+ sys.modules["data_processing"] = _pkg
+
+from core.config import PathConfig # noqa: E402
+from data_processing.database import DatabaseConfig, DatabaseManager # noqa: E402
+from data_processing.schema import create_all_tables # noqa: E402
+from data_processing.reference_data import ( # noqa: E402
+ migrate_drug_names,
+ migrate_organizations,
+ migrate_directories,
+ migrate_drug_directory_map,
+ migrate_drug_indication_clusters,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+# Seed the shared module-level RNG at import time. build_database() re-seeds
+# it again before generating each filter/chart combination.
+random.seed(42)
+
+# Trust names used for the level-1 nodes of every generated hierarchy.
+FICTIONAL_TRUSTS = [
+ "GREENFIELD UNIVERSITY HOSPITAL NHS FT",
+ "RIVERSIDE DISTRICT GENERAL NHS TRUST",
+ "THORNBURY ROYAL INFIRMARY NHS FT",
+ "ASHWORTH COMMUNITY HOSPITAL NHS TRUST",
+ "KINGSBURY TEACHING HOSPITALS NHS FT",
+]
+
+# Id and label of the single level-0 node; also the prefix of every other node id.
+ROOT_LABEL = "DEMO ICS"
+
+# Each id here must have an entry in the `scale` table inside
+# generate_nodes_for_combination().
+DATE_FILTER_IDS = [
+ "all_6mo", "all_12mo",
+ "1yr_6mo", "1yr_12mo",
+ "2yr_6mo", "2yr_12mo",
+]
+
+# 6 date filters x 2 chart types = the 12 combinations build_database() generates.
+CHART_TYPES = ["directory", "indication"]
+
+# Drug → directories mapping (subset of real drugs found in ref_drug_directory_map)
+DRUG_DIRECTORIES: dict[str, list[str]] = {
+ "ADALIMUMAB": ["RHEUMATOLOGY", "GASTROENTEROLOGY", "DERMATOLOGY", "OPHTHALMOLOGY"],
+ "INFLIXIMAB": ["GASTROENTEROLOGY", "RHEUMATOLOGY"],
+ "ETANERCEPT": ["RHEUMATOLOGY", "DERMATOLOGY"],
+ "RITUXIMAB": ["RHEUMATOLOGY", "HAEMATOLOGY"],
+ "TOCILIZUMAB": ["RHEUMATOLOGY"],
+ "SECUKINUMAB": ["RHEUMATOLOGY", "DERMATOLOGY"],
+ "VEDOLIZUMAB": ["GASTROENTEROLOGY"],
+ "USTEKINUMAB": ["GASTROENTEROLOGY", "DERMATOLOGY"],
+ "TOFACITINIB": ["GASTROENTEROLOGY", "RHEUMATOLOGY"],
+ "BARICITINIB": ["RHEUMATOLOGY", "DERMATOLOGY"],
+ "OCRELIZUMAB": ["NEUROLOGY"],
+ "NATALIZUMAB": ["NEUROLOGY"],
+ "AFLIBERCEPT": ["OPHTHALMOLOGY"],
+ "RANIBIZUMAB": ["OPHTHALMOLOGY"],
+ "IBRUTINIB": ["HAEMATOLOGY"],
+ "LENALIDOMIDE": ["HAEMATOLOGY"],
+ "PEMBROLIZUMAB": ["ONCOLOGY"],
+ "NIVOLUMAB": ["ONCOLOGY"],
+ "TRASTUZUMAB": ["ONCOLOGY"],
+ "BEVACIZUMAB": ["ONCOLOGY"],
+}
+
+# Drug → indications (for indication chart type)
+# Keyed by the same drug names as DRUG_DIRECTORIES.
+DRUG_INDICATIONS: dict[str, list[str]] = {
+ "ADALIMUMAB": ["Rheumatoid arthritis", "Crohn's disease", "Psoriasis", "Uveitis"],
+ "INFLIXIMAB": ["Ulcerative colitis", "Crohn's disease", "Rheumatoid arthritis"],
+ "ETANERCEPT": ["Rheumatoid arthritis", "Psoriatic arthritis", "Ankylosing spondylitis"],
+ "RITUXIMAB": ["Rheumatoid arthritis", "Non-Hodgkin lymphoma", "CLL"],
+ "TOCILIZUMAB": ["Rheumatoid arthritis", "Giant cell arteritis"],
+ "SECUKINUMAB": ["Psoriasis", "Psoriatic arthritis", "Ankylosing spondylitis"],
+ "VEDOLIZUMAB": ["Ulcerative colitis", "Crohn's disease"],
+ "USTEKINUMAB": ["Psoriasis", "Crohn's disease"],
+ "TOFACITINIB": ["Ulcerative colitis", "Rheumatoid arthritis"],
+ "BARICITINIB": ["Rheumatoid arthritis", "Atopic dermatitis"],
+ "OCRELIZUMAB": ["Multiple sclerosis"],
+ "NATALIZUMAB": ["Multiple sclerosis"],
+ "AFLIBERCEPT": ["Wet AMD", "Diabetic macular oedema"],
+ "RANIBIZUMAB": ["Wet AMD"],
+ "IBRUTINIB": ["CLL", "Mantle cell lymphoma"],
+ "LENALIDOMIDE": ["Multiple myeloma"],
+ "PEMBROLIZUMAB": ["Non-small cell lung cancer", "Melanoma"],
+ "NIVOLUMAB": ["Non-small cell lung cancer", "Renal cell carcinoma"],
+ "TRASTUZUMAB": ["HER2+ breast cancer"],
+ "BEVACIZUMAB": ["Colorectal cancer", "Non-small cell lung cancer"],
+}
+
+# Directories that appear in the demo
+# (sorted, de-duplicated union of every DRUG_DIRECTORIES value).
+DIRECTORIES_USED = sorted({d for dirs in DRUG_DIRECTORIES.values() for d in dirs})
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def rand_date(start_year: int, end_year: int) -> str:
+    """Return a random date formatted as ``YYYY-MM-DD``.
+
+    The date is 1 January of *start_year* plus a random whole number of days,
+    drawn from 0 up to the day count between that date and 28 December of
+    *end_year* (the upper bound is never taken as less than 1).
+    """
+    first_day = datetime(start_year, 1, 1)
+    span_days = (datetime(end_year, 12, 28) - first_day).days
+    # Clamp so randint always receives a non-empty range.
+    offset = random.randint(0, max(span_days, 1))
+    return (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
+
+
+def make_average_spacing_html(drugs: list[str]) -> str:
+    """Build the HTML-formatted average_spacing string the Dash app expects.
+
+    For each drug, in order, emits the drug name followed by one sentence
+    giving a random administration count (4-30), a random weekly interval
+    (2.0-12.0, one decimal) and their product as the total treatment length.
+
+    Args:
+        drugs: Drug names to describe; an empty list yields ``""``.
+
+    Returns:
+        The per-drug fragments concatenated with no separator.
+    """
+    parts = []
+    for drug in drugs:
+        times = random.randint(4, 30)
+        interval = round(random.uniform(2.0, 12.0), 1)
+        total_weeks = round(times * interval, 1)
+        # NOTE(review): these two literals were split across physical lines
+        # (an unterminated f-string) with their markup lost. The line breaks
+        # are restored as <br>; confirm the exact tags against what the Dash
+        # app parses.
+        parts.append(
+            f"<br>{drug}"
+            f"<br>On average given {times} times with a {interval} weekly interval "
+            f"({total_weeks} weeks total treatment length)"
+        )
+    return "".join(parts)
+
+
+def make_average_administered_json(drugs: list[str]) -> str:
+    """Serialise one random dosing record per drug as a JSON array string.
+
+    Each record carries the drug name, an average dose in mg (50-500, one
+    decimal) and an average administration count (4-30), in the order the
+    drugs were given.
+    """
+    # Dict values are evaluated in key order, so the dose is drawn before the
+    # administration count for every drug.
+    records = [
+        {
+            "drug": name,
+            "avg_dose_mg": round(random.uniform(50, 500), 1),
+            "avg_administrations": random.randint(4, 30),
+        }
+        for name in drugs
+    ]
+    return json.dumps(records)
+
+
+# ---------------------------------------------------------------------------
+# Node generation
+# ---------------------------------------------------------------------------
+
+def generate_nodes_for_combination(
+ date_filter_id: str,
+ chart_type: str,
+ refresh_id: str,
+) -> list[dict]:
+ """Generate a complete hierarchy of pathway_nodes for one filter/chart combo.
+
+ Nodes are appended bottom-up: level-4 follow-on nodes, then their level-3
+ drug node, then the level-2 group (directory or indication), then the
+ level-1 trust, and finally the single level-0 root. Patient counts and
+ costs are rolled up so every parent's value and cost include its children.
+
+ All numbers come from the module-level ``random`` generator, so the output
+ depends on that generator's state when this function is called.
+
+ Args:
+ date_filter_id: Must be a key of the ``scale`` table below; any other
+ value raises ``KeyError``.
+ chart_type: ``"directory"`` groups drugs via DRUG_DIRECTORIES; any other
+ value groups them via DRUG_INDICATIONS.
+ refresh_id: Stored unchanged in every node's ``data_refresh_id``.
+
+ Returns a list of dicts ready for INSERT (keys match the column list in
+ insert_nodes).
+ """
+ nodes: list[dict] = []
+
+ # Scale factors per date filter so narrower filters have fewer patients
+ scale = {
+ "all_6mo": 0.6, "all_12mo": 1.0,
+ "1yr_6mo": 0.3, "1yr_12mo": 0.5,
+ "2yr_6mo": 0.4, "2yr_12mo": 0.7,
+ }[date_filter_id]
+
+ # Running totals for the level-0 root node.
+ root_patients = 0
+ root_cost = 0.0
+
+ for trust in FICTIONAL_TRUSTS:
+ trust_patients = 0
+ trust_cost = 0.0
+
+ # Determine level-2 groups based on chart_type
+ if chart_type == "directory":
+ level2_groups = DIRECTORIES_USED
+ else:
+ # For indication chart, use unique indications
+ level2_groups = sorted({
+ ind for inds in DRUG_INDICATIONS.values() for ind in inds
+ })
+
+ for group_name in level2_groups:
+ group_patients = 0
+ group_cost = 0.0
+
+ # Determine which drugs appear under this group
+ if chart_type == "directory":
+ drugs_in_group = [
+ d for d, dirs in DRUG_DIRECTORIES.items()
+ if group_name in dirs
+ ]
+ else:
+ drugs_in_group = [
+ d for d, inds in DRUG_INDICATIONS.items()
+ if group_name in inds
+ ]
+
+ if not drugs_in_group:
+ continue
+
+ # Only include a random subset per trust to create variation
+ # (groups with three or fewer drugs are always kept whole).
+ if len(drugs_in_group) > 3:
+ n_drugs = random.randint(2, min(len(drugs_in_group), 5))
+ drugs_in_group = random.sample(drugs_in_group, n_drugs)
+
+ for drug in drugs_in_group:
+ drug_id = f"{ROOT_LABEL} - {trust} - {group_name} - {drug}"
+ base_patients = int(random.randint(5, 80) * scale)
+ if base_patients < 1:
+ base_patients = 1
+ cost_pp = round(random.uniform(3000, 25000), 2)
+ drug_cost = round(base_patients * cost_pp, 2)
+ avg_days = round(random.uniform(180, 2500), 1)
+
+ # Occasionally generate sub-pathway nodes (level 4+)
+ sub_pathway_patients = 0
+ sub_pathway_cost = 0.0
+ if random.random() < 0.4:
+ # Pick 1-2 follow-on drugs
+ # (drawn from every drug in DRUG_DIRECTORIES, not only this group's).
+ other_drugs = [d for d in DRUG_DIRECTORIES if d != drug]
+ n_sub = random.randint(1, 2)
+ follow_on_drugs = random.sample(other_drugs, min(n_sub, len(other_drugs)))
+
+ for follow_drug in follow_on_drugs:
+ sub_id = f"{drug_id} - {follow_drug}"
+ # NOTE(review): base_patients is already scaled, and scale is
+ # applied a second time here — confirm this is intended.
+ sub_pts = int(random.randint(2, max(base_patients // 3, 3)) * scale)
+ if sub_pts < 1:
+ sub_pts = 1
+ sub_cpp = round(random.uniform(3000, 20000), 2)
+ sub_cost = round(sub_pts * sub_cpp, 2)
+ sub_avg_days = round(random.uniform(300, 3000), 1)
+ drug_seq = f"{drug}|{follow_drug}"
+
+ nodes.append({
+ "date_filter_id": date_filter_id,
+ "chart_type": chart_type,
+ "parents": drug_id,
+ "ids": sub_id,
+ "labels": follow_drug,
+ "level": 4,
+ "value": sub_pts,
+ "cost": sub_cost,
+ "costpp": sub_cpp,
+ "cost_pp_pa": f"£{sub_cpp * random.uniform(0.8, 1.2):,.0f}",
+ "colour": round(sub_pts / max(base_patients, 1), 4),
+ "first_seen": rand_date(2019, 2022),
+ "last_seen": rand_date(2024, 2025),
+ "first_seen_parent": rand_date(2018, 2021),
+ "last_seen_parent": rand_date(2024, 2025),
+ "average_spacing": make_average_spacing_html([drug, follow_drug]),
+ "average_administered": make_average_administered_json([drug, follow_drug]),
+ "avg_days": sub_avg_days,
+ "trust_name": trust,
+ "directory": group_name if chart_type == "directory" else None,
+ "drug_sequence": drug_seq,
+ "data_refresh_id": refresh_id,
+ })
+ sub_pathway_patients += sub_pts
+ sub_pathway_cost += sub_cost
+
+ # Drug node (level 3) — value must include sub-pathways
+ total_drug_patients = base_patients + sub_pathway_patients
+ total_drug_cost = drug_cost + sub_pathway_cost
+
+ # NOTE(review): unlike the level 0-2 nodes, "cost" below is stored
+ # without round(..., 2).
+ nodes.append({
+ "date_filter_id": date_filter_id,
+ "chart_type": chart_type,
+ "parents": f"{ROOT_LABEL} - {trust} - {group_name}",
+ "ids": drug_id,
+ "labels": drug,
+ "level": 3,
+ "value": total_drug_patients,
+ "cost": total_drug_cost,
+ "costpp": round(total_drug_cost / max(total_drug_patients, 1), 2),
+ "cost_pp_pa": f"£{total_drug_cost / max(total_drug_patients, 1) * random.uniform(0.8, 1.2):,.0f}",
+ "colour": 0.0, # placeholder, set after group total known
+ "first_seen": rand_date(2018, 2021),
+ "last_seen": rand_date(2024, 2025),
+ "first_seen_parent": rand_date(2017, 2020),
+ "last_seen_parent": rand_date(2024, 2025),
+ "average_spacing": make_average_spacing_html([drug]),
+ "average_administered": make_average_administered_json([drug]),
+ "avg_days": avg_days,
+ "trust_name": trust,
+ "directory": group_name if chart_type == "directory" else None,
+ "drug_sequence": drug,
+ "data_refresh_id": refresh_id,
+ })
+
+ group_patients += total_drug_patients
+ group_cost += total_drug_cost
+
+ if group_patients == 0:
+ continue
+
+ # Level 2 group node (directory or indication)
+ group_id = f"{ROOT_LABEL} - {trust} - {group_name}"
+ nodes.append({
+ "date_filter_id": date_filter_id,
+ "chart_type": chart_type,
+ "parents": f"{ROOT_LABEL} - {trust}",
+ "ids": group_id,
+ "labels": group_name,
+ "level": 2,
+ "value": group_patients,
+ "cost": round(group_cost, 2),
+ "costpp": round(group_cost / max(group_patients, 1), 2),
+ "cost_pp_pa": f"£{group_cost / max(group_patients, 1) * random.uniform(0.8, 1.2):,.0f}",
+ "colour": 0.0, # set after trust total known
+ "first_seen": rand_date(2017, 2020),
+ "last_seen": rand_date(2024, 2025),
+ "first_seen_parent": rand_date(2016, 2019),
+ "last_seen_parent": rand_date(2024, 2025),
+ "average_spacing": None,
+ "average_administered": None,
+ "avg_days": None,
+ "trust_name": trust,
+ "directory": group_name if chart_type == "directory" else None,
+ "drug_sequence": None,
+ "data_refresh_id": refresh_id,
+ })
+
+ trust_patients += group_patients
+ trust_cost += group_cost
+
+ if trust_patients == 0:
+ continue
+
+ # Level 1 trust node
+ trust_id = f"{ROOT_LABEL} - {trust}"
+ nodes.append({
+ "date_filter_id": date_filter_id,
+ "chart_type": chart_type,
+ "parents": ROOT_LABEL,
+ "ids": trust_id,
+ "labels": trust,
+ "level": 1,
+ "value": trust_patients,
+ "cost": round(trust_cost, 2),
+ "costpp": round(trust_cost / max(trust_patients, 1), 2),
+ "cost_pp_pa": f"£{trust_cost / max(trust_patients, 1):,.0f}",
+ "colour": 0.0, # set after root total known
+ "first_seen": rand_date(2016, 2019),
+ "last_seen": rand_date(2024, 2025),
+ "first_seen_parent": None,
+ "last_seen_parent": None,
+ "average_spacing": None,
+ "average_administered": None,
+ "avg_days": None,
+ "trust_name": trust,
+ "directory": None,
+ "drug_sequence": None,
+ "data_refresh_id": refresh_id,
+ })
+
+ root_patients += trust_patients
+ root_cost += trust_cost
+
+ # Level 0 root node
+ # (empty "parents" string; its colour stays at the fixed 0.5 set here).
+ nodes.append({
+ "date_filter_id": date_filter_id,
+ "chart_type": chart_type,
+ "parents": "",
+ "ids": ROOT_LABEL,
+ "labels": ROOT_LABEL,
+ "level": 0,
+ "value": root_patients,
+ "cost": round(root_cost, 2),
+ "costpp": round(root_cost / max(root_patients, 1), 2),
+ "cost_pp_pa": f"£{root_cost / max(root_patients, 1):,.0f}",
+ "colour": 0.5,
+ "first_seen": None,
+ "last_seen": None,
+ "first_seen_parent": None,
+ "last_seen_parent": None,
+ "average_spacing": None,
+ "average_administered": None,
+ "avg_days": None,
+ "trust_name": None,
+ "directory": None,
+ "drug_sequence": None,
+ "data_refresh_id": refresh_id,
+ })
+
+ # Fix colour values (proportion of parent)
+ # This also overwrites the colour given to level-4 nodes above, replacing it
+ # with their share of the level-3 node's final (sub-pathway inclusive) value.
+ parent_values: dict[str, int] = {n["ids"]: n["value"] for n in nodes}
+ for node in nodes:
+ if node["level"] > 0 and node["parents"] in parent_values:
+ parent_val = parent_values[node["parents"]]
+ node["colour"] = round(node["value"] / max(parent_val, 1), 4)
+
+ return nodes
+
+
+# ---------------------------------------------------------------------------
+# Database construction
+# ---------------------------------------------------------------------------
+
+def insert_nodes(conn: sqlite3.Connection, nodes: list[dict]) -> None:
+    """Insert every node dict into ``pathway_nodes`` with one executemany call.
+
+    Each dict must contain all of the column names listed below; a missing
+    key raises ``KeyError``. No commit is issued here.
+    """
+    columns = (
+        "date_filter_id", "chart_type", "parents", "ids", "labels", "level",
+        "value", "cost", "costpp", "cost_pp_pa", "colour",
+        "first_seen", "last_seen", "first_seen_parent", "last_seen_parent",
+        "average_spacing", "average_administered", "avg_days",
+        "trust_name", "directory", "drug_sequence",
+        "data_refresh_id",
+    )
+    # Only the fixed column names above are interpolated; values are bound
+    # through "?" placeholders.
+    sql = (
+        f"INSERT INTO pathway_nodes ({', '.join(columns)}) "
+        f"VALUES ({', '.join('?' for _ in columns)})"
+    )
+    rows = [tuple(node[name] for name in columns) for node in nodes]
+    conn.executemany(sql, rows)
+
+
+def build_database(db_path: Path) -> None:
+    """Build the complete synthetic database at *db_path*.
+
+    Any existing file at *db_path* is deleted first. The schema is created,
+    reference data is migrated from the project's CSVs, synthetic
+    pathway_nodes are generated for every DATE_FILTER_IDS x CHART_TYPES
+    combination, and one 'completed' row is written to pathway_refresh_log.
+
+    Args:
+        db_path: Location of the SQLite file to (re)create; missing parent
+            directories are created.
+
+    Raises:
+        SystemExit: With status 1 if any reference-data migration reports
+            failure.
+    """
+    # Remove existing DB so reruns never mix old and new rows.
+    db_path.unlink(missing_ok=True)
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+
+    config = DatabaseConfig(db_path=db_path)
+    db_manager = DatabaseManager(config)
+    paths = PathConfig(base_dir=Path(_project_root))
+
+    # 1. Create all tables (reference + pathway + date filters)
+    print("Creating schema...")
+    with db_manager.get_connection() as conn:
+        create_all_tables(conn)
+
+    # 2. Migrate reference data from CSVs
+    print("Migrating reference data...")
+    migrations = [
+        ("Drug names", lambda: migrate_drug_names(db_manager, paths)),
+        ("Organizations", lambda: migrate_organizations(db_manager, paths)),
+        ("Directories", lambda: migrate_directories(db_manager, paths)),
+        ("Drug-directory map", lambda: migrate_drug_directory_map(db_manager, paths)),
+        ("Drug indication clusters", lambda: migrate_drug_indication_clusters(
+            db_manager, paths.data_dir / "drug_indication_clusters.csv"
+        )),
+    ]
+
+    for name, migrate_fn in migrations:
+        result = migrate_fn()
+        if not result.success:
+            # Abort on the first failed migration.
+            print(f" FAILED: {name} — {result.error_message}")
+            sys.exit(1)
+        print(f" {name}: {result.rows_inserted} rows inserted")
+
+    # 3. Generate synthetic pathway_nodes for all 12 combinations
+    refresh_id = str(uuid.uuid4())
+    started_at = datetime.now().isoformat()
+    total_nodes = 0
+    date_filter_counts: dict[str, int] = {}
+
+    print("Generating synthetic pathway nodes...")
+    with db_manager.get_transaction() as conn:
+        for date_filter_id in DATE_FILTER_IDS:
+            filter_count = 0
+            for chart_type in CHART_TYPES:
+                # Re-seed per combination so each one is reproducible yet
+                # distinct. A string seed is used because the built-in hash()
+                # of str is salted per process (PYTHONHASHSEED), which made
+                # every build produce different data.
+                random.seed(f"{date_filter_id}/{chart_type}")
+
+                nodes = generate_nodes_for_combination(
+                    date_filter_id, chart_type, refresh_id
+                )
+                insert_nodes(conn, nodes)
+                filter_count += len(nodes)
+                print(f" {date_filter_id}/{chart_type}: {len(nodes)} nodes")
+
+            date_filter_counts[date_filter_id] = filter_count
+            total_nodes += filter_count
+
+    # 4. Insert refresh log entry
+    print("Writing refresh log...")
+    completed_at = datetime.now().isoformat()
+    with db_manager.get_transaction() as conn:
+        # total_nodes is recorded as both record_count and source_row_count;
+        # the processing duration is written as a fixed 0.0.
+        conn.execute(
+            """
+            INSERT INTO pathway_refresh_log
+            (refresh_id, started_at, completed_at, status, record_count,
+            date_filter_counts, source_row_count, processing_duration_seconds)
+            VALUES (?, ?, ?, 'completed', ?, ?, ?, ?)
+            """,
+            (
+                refresh_id,
+                started_at,
+                completed_at,
+                total_nodes,
+                json.dumps(date_filter_counts),
+                total_nodes,
+                0.0,
+            ),
+        )
+
+    print(f"\nDone! {total_nodes} total nodes written to {db_path}")
+    print(f"Date filter breakdown: {json.dumps(date_filter_counts, indent=2)}")
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    # Write the demo database to <project root>/data/pathways.db.
+    build_database(Path(_project_root, "data", "pathways.db"))