Compare commits
1 Commits
2f75efa964
..
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 09be4c2472 |
@@ -0,0 +1,16 @@
|
|||||||
|
.git
|
||||||
|
__pycache__
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.db
|
||||||
|
.pytest_cache
|
||||||
|
.coverage
|
||||||
|
htmlcov
|
||||||
|
tests
|
||||||
|
.venv
|
||||||
|
*.egg-info
|
||||||
|
dist
|
||||||
|
build
|
||||||
|
.mypy_cache
|
||||||
|
.ruff_cache
|
||||||
|
*.spec
|
||||||
+20
@@ -0,0 +1,20 @@
|
|||||||
|
FROM python:3.11-slim

WORKDIR /app

# Runtime deps only — excludes snowflake, pywebview, pyinstaller, pyarrow, fastparquet.
#
# Every requirement is quoted on purpose: RUN uses /bin/sh, and an unquoted
# `dash>=2.14.0` is parsed as the word `dash` plus a redirect of stdout to a
# file named `=2.14.0`, so pip would never see the version constraint.
#
# Dependencies are installed before the source is copied so that editing
# application code does not invalidate this (slow) layer.
RUN pip install --no-cache-dir \
    "dash>=2.14.0" \
    "dash-mantine-components>=0.14.0" \
    "plotly>=5.15.0" \
    "pandas>=2.0.3" \
    "numpy>=1.25.0" \
    "gunicorn>=21.0.0"

COPY . .

# Generate synthetic database at build time
RUN python scripts/generate_demo_db.py

EXPOSE 8050

CMD ["gunicorn", "--bind", "0.0.0.0:8050", "--workers", "2", "--timeout", "120", "dash_app.app:server"]
|
||||||
@@ -18,6 +18,19 @@ def make_header():
|
|||||||
],
|
],
|
||||||
),
|
),
|
||||||
|
|
||||||
|
# Demo banner
|
||||||
|
html.Div(
|
||||||
|
"SYNTHETIC DATA FOR DEMONSTRATION",
|
||||||
|
style={
|
||||||
|
"color": "#e53e3e",
|
||||||
|
"fontWeight": "bold",
|
||||||
|
"fontSize": "0.85rem",
|
||||||
|
"letterSpacing": "0.05em",
|
||||||
|
"textAlign": "center",
|
||||||
|
"whiteSpace": "nowrap",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
|
||||||
# Center: 3 fraction KPIs (filtered / total)
|
# Center: 3 fraction KPIs (filtered / total)
|
||||||
html.Div(
|
html.Div(
|
||||||
className="top-header__kpis",
|
className="top-header__kpis",
|
||||||
|
|||||||
Binary file not shown.
@@ -24,6 +24,8 @@ Usage:
|
|||||||
connector.close()
|
connector.close()
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import date, datetime
|
from datetime import date, datetime
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
services:
|
||||||
|
hcd-demo:
|
||||||
|
build: .
|
||||||
|
ports:
|
||||||
|
- "8050:8050"
|
||||||
|
restart: unless-stopped
|
||||||
@@ -0,0 +1,531 @@
|
|||||||
|
"""Generate a complete synthetic pathways.db for the containerised demo.
|
||||||
|
|
||||||
|
Uses the existing schema and migration infrastructure to build a fully
|
||||||
|
functional database with fabricated patient pathway data. Reference data
|
||||||
|
(drug names, directories, SNOMED clusters) comes from the real CSVs —
|
||||||
|
these are standard NHS terminology, not patient data. Trust names are
|
||||||
|
fictional to make the synthetic nature obvious.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/generate_demo_db.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Make the repository root (the directory above scripts/) importable so the
# project packages resolve when this file is executed directly.
_project_root = str(Path(__file__).resolve().parents[1])
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)

# Register an empty stand-in for the ``data_processing`` package before any of
# its submodules are imported. The stand-in only carries ``__path__`` so that
# submodule imports work; the real ``__init__.py`` is bypassed because it
# pulls in snowflake_connector, which is not needed/available here.
import types
import importlib

if "data_processing" not in sys.modules:
    _pkg = types.ModuleType("data_processing")
    _pkg.__package__ = "data_processing"
    _pkg.__path__ = [str(Path(_project_root, "data_processing"))]
    sys.modules["data_processing"] = _pkg
|
||||||
|
|
||||||
|
from core.config import PathConfig # noqa: E402
|
||||||
|
from data_processing.database import DatabaseConfig, DatabaseManager # noqa: E402
|
||||||
|
from data_processing.schema import create_all_tables # noqa: E402
|
||||||
|
from data_processing.reference_data import ( # noqa: E402
|
||||||
|
migrate_drug_names,
|
||||||
|
migrate_organizations,
|
||||||
|
migrate_directories,
|
||||||
|
migrate_drug_directory_map,
|
||||||
|
migrate_drug_indication_clusters,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Module-level default seed for the shared ``random`` generator.
random.seed(42)

# Made-up trust names; these become the level-1 nodes of every hierarchy.
FICTIONAL_TRUSTS: list[str] = [
    "GREENFIELD UNIVERSITY HOSPITAL NHS FT",
    "RIVERSIDE DISTRICT GENERAL NHS TRUST",
    "THORNBURY ROYAL INFIRMARY NHS FT",
    "ASHWORTH COMMUNITY HOSPITAL NHS TRUST",
    "KINGSBURY TEACHING HOSPITALS NHS FT",
]

# Label and id of the single level-0 root node.
ROOT_LABEL: str = "DEMO ICS"

# One node hierarchy is generated per (date filter, chart type) pair.
DATE_FILTER_IDS: list[str] = [
    "all_6mo",
    "all_12mo",
    "1yr_6mo",
    "1yr_12mo",
    "2yr_6mo",
    "2yr_12mo",
]

CHART_TYPES: list[str] = [
    "directory",
    "indication",
]

# Drug → directories mapping (subset of real drugs found in ref_drug_directory_map)
DRUG_DIRECTORIES: dict[str, list[str]] = {
    "ADALIMUMAB": [
        "RHEUMATOLOGY",
        "GASTROENTEROLOGY",
        "DERMATOLOGY",
        "OPHTHALMOLOGY",
    ],
    "INFLIXIMAB": ["GASTROENTEROLOGY", "RHEUMATOLOGY"],
    "ETANERCEPT": ["RHEUMATOLOGY", "DERMATOLOGY"],
    "RITUXIMAB": ["RHEUMATOLOGY", "HAEMATOLOGY"],
    "TOCILIZUMAB": ["RHEUMATOLOGY"],
    "SECUKINUMAB": ["RHEUMATOLOGY", "DERMATOLOGY"],
    "VEDOLIZUMAB": ["GASTROENTEROLOGY"],
    "USTEKINUMAB": ["GASTROENTEROLOGY", "DERMATOLOGY"],
    "TOFACITINIB": ["GASTROENTEROLOGY", "RHEUMATOLOGY"],
    "BARICITINIB": ["RHEUMATOLOGY", "DERMATOLOGY"],
    "OCRELIZUMAB": ["NEUROLOGY"],
    "NATALIZUMAB": ["NEUROLOGY"],
    "AFLIBERCEPT": ["OPHTHALMOLOGY"],
    "RANIBIZUMAB": ["OPHTHALMOLOGY"],
    "IBRUTINIB": ["HAEMATOLOGY"],
    "LENALIDOMIDE": ["HAEMATOLOGY"],
    "PEMBROLIZUMAB": ["ONCOLOGY"],
    "NIVOLUMAB": ["ONCOLOGY"],
    "TRASTUZUMAB": ["ONCOLOGY"],
    "BEVACIZUMAB": ["ONCOLOGY"],
}

# Drug → indications (for indication chart type)
DRUG_INDICATIONS: dict[str, list[str]] = {
    "ADALIMUMAB": [
        "Rheumatoid arthritis",
        "Crohn's disease",
        "Psoriasis",
        "Uveitis",
    ],
    "INFLIXIMAB": [
        "Ulcerative colitis",
        "Crohn's disease",
        "Rheumatoid arthritis",
    ],
    "ETANERCEPT": [
        "Rheumatoid arthritis",
        "Psoriatic arthritis",
        "Ankylosing spondylitis",
    ],
    "RITUXIMAB": [
        "Rheumatoid arthritis",
        "Non-Hodgkin lymphoma",
        "CLL",
    ],
    "TOCILIZUMAB": ["Rheumatoid arthritis", "Giant cell arteritis"],
    "SECUKINUMAB": [
        "Psoriasis",
        "Psoriatic arthritis",
        "Ankylosing spondylitis",
    ],
    "VEDOLIZUMAB": ["Ulcerative colitis", "Crohn's disease"],
    "USTEKINUMAB": ["Psoriasis", "Crohn's disease"],
    "TOFACITINIB": ["Ulcerative colitis", "Rheumatoid arthritis"],
    "BARICITINIB": ["Rheumatoid arthritis", "Atopic dermatitis"],
    "OCRELIZUMAB": ["Multiple sclerosis"],
    "NATALIZUMAB": ["Multiple sclerosis"],
    "AFLIBERCEPT": ["Wet AMD", "Diabetic macular oedema"],
    "RANIBIZUMAB": ["Wet AMD"],
    "IBRUTINIB": ["CLL", "Mantle cell lymphoma"],
    "LENALIDOMIDE": ["Multiple myeloma"],
    "PEMBROLIZUMAB": ["Non-small cell lung cancer", "Melanoma"],
    "NIVOLUMAB": ["Non-small cell lung cancer", "Renal cell carcinoma"],
    "TRASTUZUMAB": ["HER2+ breast cancer"],
    "BEVACIZUMAB": ["Colorectal cancer", "Non-small cell lung cancer"],
}

# Directories that appear in the demo: every distinct directory named in
# DRUG_DIRECTORIES, in alphabetical order.
DIRECTORIES_USED: list[str] = sorted(set().union(*DRUG_DIRECTORIES.values()))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def rand_date(start_year: int, end_year: int) -> str:
    """Return a random date formatted as ``YYYY-MM-DD``.

    The date is drawn uniformly (via the global ``random`` state) from
    1 January of ``start_year`` up to 28 December of ``end_year``. When that
    span is not positive, the offset is still drawn from 0..1 days.
    """
    first_day = datetime(start_year, 1, 1)
    span_days = (datetime(end_year, 12, 28) - first_day).days
    offset = random.randint(0, max(span_days, 1))
    return (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
|
||||||
|
def make_average_spacing_html(drugs: list[str]) -> str:
    """Build the HTML-formatted average_spacing string the Dash app expects.

    One fragment is produced per drug, in the order given, and the fragments
    are concatenated without a separator. An empty list yields ``""``.
    The figures are random: a whole number of administrations, an interval
    in weeks, and their product as the total treatment length.
    """

    def describe(name: str) -> str:
        # Draw order (count first, then interval) is kept fixed so a seeded
        # run always produces the same text.
        count = random.randint(4, 30)
        weeks_between = round(random.uniform(2.0, 12.0), 1)
        duration = round(count * weeks_between, 1)
        return (
            f"<br><b>{name}</b>"
            f"<br>On average given {count} times with a {weeks_between} weekly interval "
            f"({duration} weeks total treatment length)"
        )

    return "".join([describe(name) for name in drugs])
|
||||||
|
|
||||||
|
|
||||||
|
def make_average_administered_json(drugs: list[str]) -> str:
    """Build the JSON array of avg doses the Dash app expects.

    Returns a JSON-encoded list with one object per drug, in input order.
    Each object has the keys ``drug``, ``avg_dose_mg`` (random float, one
    decimal place) and ``avg_administrations`` (random integer).
    """
    # Dict values are evaluated top to bottom, so the dose is drawn before
    # the administration count for every drug.
    payload = [
        {
            "drug": name,
            "avg_dose_mg": round(random.uniform(50, 500), 1),
            "avg_administrations": random.randint(4, 30),
        }
        for name in drugs
    ]
    return json.dumps(payload)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Node generation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def generate_nodes_for_combination(
    date_filter_id: str,
    chart_type: str,
    refresh_id: str,
) -> list[dict]:
    """Generate a complete hierarchy of pathway_nodes for one filter/chart combo.

    The hierarchy is root (level 0) -> trust (1) -> directory or indication
    (2) -> drug (3) -> optional follow-on drug (4). Children are emitted
    before their parent so each parent's patient and cost totals can be
    summed from them; the root node is therefore the last element.

    All randomness comes from the global ``random`` state, so seed it before
    calling if reproducible output is needed.

    Args:
        date_filter_id: Key into the scale table below (the ids listed in
            ``DATE_FILTER_IDS``).
        chart_type: ``"directory"`` groups drugs by ``DRUG_DIRECTORIES``; any
            other value groups them by ``DRUG_INDICATIONS``.
        refresh_id: Stored unchanged on every node as ``data_refresh_id``.

    Returns:
        A list of dicts ready for INSERT, each holding every column that
        ``insert_nodes`` writes.

    Raises:
        KeyError: If ``date_filter_id`` has no entry in the scale table.
    """
    nodes: list[dict] = []

    # Scale factors per date filter so narrower filters have fewer patients
    scale = {
        "all_6mo": 0.6, "all_12mo": 1.0,
        "1yr_6mo": 0.3, "1yr_12mo": 0.5,
        "2yr_6mo": 0.4, "2yr_12mo": 0.7,
    }[date_filter_id]

    # The level-2 groups and the drugs belonging to each depend only on the
    # chart type, so derive them once instead of once per trust.
    by_directory = chart_type == "directory"
    if by_directory:
        drug_groups = DRUG_DIRECTORIES
        level2_groups = DIRECTORIES_USED
    else:
        drug_groups = DRUG_INDICATIONS
        level2_groups = sorted({
            ind for inds in DRUG_INDICATIONS.values() for ind in inds
        })
    drugs_by_group = {
        group: [d for d, groups in drug_groups.items() if group in groups]
        for group in level2_groups
    }

    # Columns that only some levels populate; absent ones are stored as NULL.
    optional_fields = dict.fromkeys((
        "first_seen", "last_seen", "first_seen_parent", "last_seen_parent",
        "average_spacing", "average_administered", "avg_days",
        "trust_name", "directory", "drug_sequence",
    ))

    def add_node(**fields: object) -> None:
        """Append one node, adding the shared columns and NULL defaults."""
        nodes.append({
            "date_filter_id": date_filter_id,
            "chart_type": chart_type,
            **optional_fields,
            **fields,
            "data_refresh_id": refresh_id,
        })

    # NOTE: keyword arguments are evaluated left to right, so the order of
    # the random-drawing arguments in each add_node() call below fixes the
    # order in which the random stream is consumed. Do not reorder them.

    root_patients = 0
    root_cost = 0.0

    for trust in FICTIONAL_TRUSTS:
        trust_id = f"{ROOT_LABEL} - {trust}"
        trust_patients = 0
        trust_cost = 0.0

        for group_name in level2_groups:
            drugs_in_group = drugs_by_group[group_name]
            if not drugs_in_group:
                continue

            # Only include a random subset per trust to create variation.
            # random.sample returns a new list, so the shared lookup above
            # is never mutated.
            if len(drugs_in_group) > 3:
                n_drugs = random.randint(2, min(len(drugs_in_group), 5))
                drugs_in_group = random.sample(drugs_in_group, n_drugs)

            group_id = f"{trust_id} - {group_name}"
            directory = group_name if by_directory else None
            group_patients = 0
            group_cost = 0.0

            for drug in drugs_in_group:
                drug_id = f"{group_id} - {drug}"
                base_patients = max(int(random.randint(5, 80) * scale), 1)
                cost_pp = round(random.uniform(3000, 25000), 2)
                drug_cost = round(base_patients * cost_pp, 2)
                avg_days = round(random.uniform(180, 2500), 1)

                # Occasionally generate sub-pathway nodes (level 4+)
                sub_pathway_patients = 0
                sub_pathway_cost = 0.0
                if random.random() < 0.4:
                    # Pick 1-2 follow-on drugs
                    other_drugs = [d for d in DRUG_DIRECTORIES if d != drug]
                    n_sub = random.randint(1, 2)
                    follow_on_drugs = random.sample(
                        other_drugs, min(n_sub, len(other_drugs))
                    )

                    for follow_drug in follow_on_drugs:
                        sub_pts = max(
                            int(random.randint(2, max(base_patients // 3, 3)) * scale),
                            1,
                        )
                        sub_cpp = round(random.uniform(3000, 20000), 2)
                        sub_cost = round(sub_pts * sub_cpp, 2)
                        sub_avg_days = round(random.uniform(300, 3000), 1)

                        add_node(
                            parents=drug_id,
                            ids=f"{drug_id} - {follow_drug}",
                            labels=follow_drug,
                            level=4,
                            value=sub_pts,
                            cost=sub_cost,
                            costpp=sub_cpp,
                            cost_pp_pa=f"£{sub_cpp * random.uniform(0.8, 1.2):,.0f}",
                            colour=0.0,  # placeholder, set in the final pass
                            first_seen=rand_date(2019, 2022),
                            last_seen=rand_date(2024, 2025),
                            first_seen_parent=rand_date(2018, 2021),
                            last_seen_parent=rand_date(2024, 2025),
                            average_spacing=make_average_spacing_html([drug, follow_drug]),
                            average_administered=make_average_administered_json([drug, follow_drug]),
                            avg_days=sub_avg_days,
                            trust_name=trust,
                            directory=directory,
                            drug_sequence=f"{drug}|{follow_drug}",
                        )
                        sub_pathway_patients += sub_pts
                        sub_pathway_cost += sub_cost

                # Drug node (level 3) — value must include sub-pathways
                total_drug_patients = base_patients + sub_pathway_patients
                total_drug_cost = drug_cost + sub_pathway_cost

                add_node(
                    parents=group_id,
                    ids=drug_id,
                    labels=drug,
                    level=3,
                    value=total_drug_patients,
                    # Rounded like every other level; the raw float sum can
                    # carry artefacts such as 1234.5600000000002.
                    cost=round(total_drug_cost, 2),
                    costpp=round(total_drug_cost / max(total_drug_patients, 1), 2),
                    cost_pp_pa=f"£{total_drug_cost / max(total_drug_patients, 1) * random.uniform(0.8, 1.2):,.0f}",
                    colour=0.0,  # placeholder, set in the final pass
                    first_seen=rand_date(2018, 2021),
                    last_seen=rand_date(2024, 2025),
                    first_seen_parent=rand_date(2017, 2020),
                    last_seen_parent=rand_date(2024, 2025),
                    average_spacing=make_average_spacing_html([drug]),
                    average_administered=make_average_administered_json([drug]),
                    avg_days=avg_days,
                    trust_name=trust,
                    directory=directory,
                    drug_sequence=drug,
                )

                group_patients += total_drug_patients
                group_cost += total_drug_cost

            if group_patients == 0:
                continue

            # Level 2 group node (directory or indication)
            add_node(
                parents=trust_id,
                ids=group_id,
                labels=group_name,
                level=2,
                value=group_patients,
                cost=round(group_cost, 2),
                costpp=round(group_cost / max(group_patients, 1), 2),
                cost_pp_pa=f"£{group_cost / max(group_patients, 1) * random.uniform(0.8, 1.2):,.0f}",
                colour=0.0,  # placeholder, set in the final pass
                first_seen=rand_date(2017, 2020),
                last_seen=rand_date(2024, 2025),
                first_seen_parent=rand_date(2016, 2019),
                last_seen_parent=rand_date(2024, 2025),
                trust_name=trust,
                directory=directory,
            )

            trust_patients += group_patients
            trust_cost += group_cost

        if trust_patients == 0:
            continue

        # Level 1 trust node
        add_node(
            parents=ROOT_LABEL,
            ids=trust_id,
            labels=trust,
            level=1,
            value=trust_patients,
            cost=round(trust_cost, 2),
            costpp=round(trust_cost / max(trust_patients, 1), 2),
            cost_pp_pa=f"£{trust_cost / max(trust_patients, 1):,.0f}",
            colour=0.0,  # placeholder, set in the final pass
            first_seen=rand_date(2016, 2019),
            last_seen=rand_date(2024, 2025),
            trust_name=trust,
        )

        root_patients += trust_patients
        root_cost += trust_cost

    # Level 0 root node
    add_node(
        parents="",
        ids=ROOT_LABEL,
        labels=ROOT_LABEL,
        level=0,
        value=root_patients,
        cost=round(root_cost, 2),
        costpp=round(root_cost / max(root_patients, 1), 2),
        cost_pp_pa=f"£{root_cost / max(root_patients, 1):,.0f}",
        colour=0.5,
    )

    # Final pass: colour is each node's share of its parent's patient count.
    # The root has no parent and keeps its fixed 0.5.
    parent_values: dict[str, int] = {n["ids"]: n["value"] for n in nodes}
    for node in nodes:
        if node["level"] > 0 and node["parents"] in parent_values:
            parent_val = parent_values[node["parents"]]
            node["colour"] = round(node["value"] / max(parent_val, 1), 4)

    return nodes
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Database construction
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def insert_nodes(conn: sqlite3.Connection, nodes: list[dict]) -> None:
    """Bulk insert pathway_nodes.

    Args:
        conn: Open SQLite connection; the caller owns the transaction
            (nothing is committed here).
        nodes: Node dicts. Each must contain every column named below;
            extra keys are ignored.

    Raises:
        KeyError: If a node lacks one of the columns. All rows are built
            before the statement runs, so nothing is inserted in that case.
    """
    columns = (
        "date_filter_id",
        "chart_type",
        "parents",
        "ids",
        "labels",
        "level",
        "value",
        "cost",
        "costpp",
        "cost_pp_pa",
        "colour",
        "first_seen",
        "last_seen",
        "first_seen_parent",
        "last_seen_parent",
        "average_spacing",
        "average_administered",
        "avg_days",
        "trust_name",
        "directory",
        "drug_sequence",
        "data_refresh_id",
    )
    col_names = ", ".join(columns)
    placeholders = ", ".join("?" for _ in columns)
    statement = f"INSERT INTO pathway_nodes ({col_names}) VALUES ({placeholders})"

    rows = [tuple(record[name] for name in columns) for record in nodes]
    conn.executemany(statement, rows)
|
||||||
|
|
||||||
|
|
||||||
|
def build_database(db_path: Path) -> None:
    """Build the complete synthetic database.

    Steps, in order:
      1. Delete any existing file at ``db_path`` and create its parent dirs.
      2. Create all tables.
      3. Migrate the reference data from the project's CSV files.
      4. Generate and insert pathway nodes for every combination of
         ``DATE_FILTER_IDS`` and ``CHART_TYPES``.
      5. Write one ``pathway_refresh_log`` row describing the run.

    Args:
        db_path: Location of the SQLite file to (re)create.

    Note:
        Calls ``sys.exit(1)`` if any reference-data migration reports
        failure, so this is intended to be run as a script step.
    """
    # Imported locally: only needed for the stable per-combination seed.
    import zlib

    # Remove existing DB
    if db_path.exists():
        db_path.unlink()

    db_path.parent.mkdir(parents=True, exist_ok=True)

    config = DatabaseConfig(db_path=db_path)
    db_manager = DatabaseManager(config)
    paths = PathConfig(base_dir=Path(_project_root))

    # 1. Create all tables (reference + pathway + date filters)
    print("Creating schema...")
    with db_manager.get_connection() as conn:
        create_all_tables(conn)

    # 2. Migrate reference data from CSVs
    print("Migrating reference data...")
    migrations = [
        ("Drug names", lambda: migrate_drug_names(db_manager, paths)),
        ("Organizations", lambda: migrate_organizations(db_manager, paths)),
        ("Directories", lambda: migrate_directories(db_manager, paths)),
        ("Drug-directory map", lambda: migrate_drug_directory_map(db_manager, paths)),
        ("Drug indication clusters", lambda: migrate_drug_indication_clusters(
            db_manager, paths.data_dir / "drug_indication_clusters.csv"
        )),
    ]

    for name, migrate_fn in migrations:
        result = migrate_fn()
        if not result.success:
            print(f"  FAILED: {name} — {result.error_message}")
            sys.exit(1)
        print(f"  {name}: {result.rows_inserted} rows inserted")

    # 3. Generate synthetic pathway_nodes for all 12 combinations
    refresh_id = str(uuid.uuid4())
    started_at = datetime.now().isoformat()
    total_nodes = 0
    date_filter_counts: dict[str, int] = {}

    print("Generating synthetic pathway nodes...")
    with db_manager.get_transaction() as conn:
        for date_filter_id in DATE_FILTER_IDS:
            filter_count = 0
            for chart_type in CHART_TYPES:
                # Reset random seed per combo for reproducibility but variation.
                # The built-in hash() of a str is salted per process
                # (PYTHONHASHSEED), so it would give a different seed on every
                # run; CRC32 of the combination key is stable across runs.
                combo_key = f"{date_filter_id}/{chart_type}".encode("utf-8")
                random.seed(zlib.crc32(combo_key))

                nodes = generate_nodes_for_combination(
                    date_filter_id, chart_type, refresh_id
                )
                insert_nodes(conn, nodes)
                filter_count += len(nodes)
                print(f"  {date_filter_id}/{chart_type}: {len(nodes)} nodes")

            date_filter_counts[date_filter_id] = filter_count
            total_nodes += filter_count

    # 4. Insert refresh log entry
    print("Writing refresh log...")
    completed_at = datetime.now().isoformat()
    with db_manager.get_transaction() as conn:
        conn.execute(
            """
            INSERT INTO pathway_refresh_log
                (refresh_id, started_at, completed_at, status, record_count,
                 date_filter_counts, source_row_count, processing_duration_seconds)
            VALUES (?, ?, ?, 'completed', ?, ?, ?, ?)
            """,
            (
                refresh_id,
                started_at,
                completed_at,
                total_nodes,
                json.dumps(date_filter_counts),
                # There are no real source rows behind synthetic data, so the
                # node count is reused and the duration is a fixed placeholder.
                total_nodes,
                0.0,
            ),
        )

    print(f"\nDone! {total_nodes} total nodes written to {db_path}")
    print(f"Date filter breakdown: {json.dumps(date_filter_counts, indent=2)}")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Default output location: <project root>/data/pathways.db
    build_database(Path(_project_root, "data", "pathways.db"))
|
||||||
Reference in New Issue
Block a user