From 76838887e656ef4256608c821774098c58f3d4f0 Mon Sep 17 00:00:00 2001 From: Andrew Charlwood Date: Fri, 6 Feb 2026 12:03:48 +0000 Subject: [PATCH] refactor: reorganize repository to src/ layout Move 6 packages (core, config, data_processing, analysis, visualization, cli) into src/ to reduce root clutter. Merge tools/data.py into data_processing/transforms.py. Move docs to docs/. Path resolution via .pth file (setup_dev.py), pytest pythonpath config, and sys.path bootstrap in rxconfig.py and CLI entry points. Clean up pyproject.toml deps (remove stale pins, add snowflake-connector-python). Fix tomllib import for Python 3.10 compatibility. All 113 tests pass. --- CLAUDE.md | 123 +++-- DESIGN_SYSTEM.md => docs/DESIGN_SYSTEM.md | 0 .../SNOWFLAKE_REFERENCE.md | 0 pyproject.toml | 28 +- rxconfig.py | 5 + setup_dev.py | 13 + src/analysis/CLAUDE.md | 24 + {analysis => src/analysis}/__init__.py | 0 .../analysis}/pathway_analyzer.py | 0 {analysis => src/analysis}/statistics.py | 0 src/cli/CLAUDE.md | 27 ++ {cli => src/cli}/__init__.py | 0 {cli => src/cli}/refresh_pathways.py | 5 + src/config/CLAUDE.md | 23 + {config => src/config}/__init__.py | 5 +- {config => src/config}/snowflake.toml | 0 src/core/CLAUDE.md | 25 + {core => src/core}/__init__.py | 0 {core => src/core}/config.py | 0 {core => src/core}/logging_config.py | 0 {core => src/core}/models.py | 0 src/data_processing/CLAUDE.md | 42 ++ .../data_processing}/__init__.py | 0 .../data_processing}/cache.py | 0 .../data_processing}/data_source.py | 2 +- .../data_processing}/database.py | 0 .../data_processing}/diagnosis_lookup.py | 0 .../data_processing}/loader.py | 2 +- .../data_processing}/migrate.py | 5 + .../data_processing}/pathway_pipeline.py | 2 +- .../data_processing}/reference_data.py | 0 .../data_processing}/schema.py | 0 .../data_processing}/snowflake_connector.py | 0 .../data_processing/transforms.py | 0 src/visualization/CLAUDE.md | 27 ++ .../visualization}/__init__.py | 0 
.../visualization}/plotly_generator.py | 0 tests/test_data_transformations.py | 2 +- tests/test_real_data_undefined_rate.py | 7 +- uv.lock | 436 +++++++++++++----- 40 files changed, 589 insertions(+), 214 deletions(-) rename DESIGN_SYSTEM.md => docs/DESIGN_SYSTEM.md (100%) rename SNOWFLAKE_REFERENCE.md => docs/SNOWFLAKE_REFERENCE.md (100%) create mode 100644 setup_dev.py create mode 100644 src/analysis/CLAUDE.md rename {analysis => src/analysis}/__init__.py (100%) rename {analysis => src/analysis}/pathway_analyzer.py (100%) rename {analysis => src/analysis}/statistics.py (100%) create mode 100644 src/cli/CLAUDE.md rename {cli => src/cli}/__init__.py (100%) rename {cli => src/cli}/refresh_pathways.py (99%) create mode 100644 src/config/CLAUDE.md rename {config => src/config}/__init__.py (99%) rename {config => src/config}/snowflake.toml (100%) create mode 100644 src/core/CLAUDE.md rename {core => src/core}/__init__.py (100%) rename {core => src/core}/config.py (100%) rename {core => src/core}/logging_config.py (100%) rename {core => src/core}/models.py (100%) create mode 100644 src/data_processing/CLAUDE.md rename {data_processing => src/data_processing}/__init__.py (100%) rename {data_processing => src/data_processing}/cache.py (100%) rename {data_processing => src/data_processing}/data_source.py (99%) rename {data_processing => src/data_processing}/database.py (100%) rename {data_processing => src/data_processing}/diagnosis_lookup.py (100%) rename {data_processing => src/data_processing}/loader.py (99%) rename {data_processing => src/data_processing}/migrate.py (98%) rename {data_processing => src/data_processing}/pathway_pipeline.py (99%) rename {data_processing => src/data_processing}/reference_data.py (100%) rename {data_processing => src/data_processing}/schema.py (100%) rename {data_processing => src/data_processing}/snowflake_connector.py (100%) rename tools/data.py => src/data_processing/transforms.py (100%) create mode 100644 src/visualization/CLAUDE.md 
rename {visualization => src/visualization}/__init__.py (100%) rename {visualization => src/visualization}/plotly_generator.py (100%) diff --git a/CLAUDE.md b/CLAUDE.md index d376ec6..95ca207 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -18,10 +18,11 @@ NHS High-Cost Drug Patient Pathway Analysis Tool - a web-based application that ```bash # Install dependencies -pip install -r requirements.txt -# OR with uv uv sync +# One-time dev setup: adds src/ to Python path via .pth file +uv run python setup_dev.py + # Initialize/migrate the database (creates pathway tables) python -m data_processing.migrate @@ -75,53 +76,53 @@ The refresh command: ``` . -├── core/ # Core configuration and models -│ ├── config.py # PathConfig dataclass for file paths -│ ├── models.py # AnalysisFilters dataclass -│ └── logging_config.py # Structured logging setup +├── src/ # All application library code +│ ├── core/ # Foundation: paths, models, logging +│ │ ├── config.py # PathConfig dataclass for file paths +│ │ ├── models.py # AnalysisFilters dataclass +│ │ └── logging_config.py # Structured logging setup +│ │ +│ ├── config/ # Service configuration +│ │ ├── __init__.py # SnowflakeConfig + loader +│ │ └── snowflake.toml # Connection settings (co-located with loader) +│ │ +│ ├── data_processing/ # Data layer +│ │ ├── database.py # SQLite connection management +│ │ ├── schema.py # Database schema (reference + pathway tables) +│ │ ├── pathway_pipeline.py # Pipeline: Snowflake → SQLite +│ │ ├── transforms.py # Data transformations (UPID, drug names, directory) +│ │ ├── loader.py # FileDataLoader for CSV/Parquet files +│ │ ├── reference_data.py # Reference data migration +│ │ ├── snowflake_connector.py # Snowflake integration +│ │ ├── cache.py # Query result caching +│ │ ├── data_source.py # Data source fallback chain +│ │ └── diagnosis_lookup.py # GP diagnosis lookup (SNOMED clusters) +│ │ +│ ├── analysis/ # Analysis pipeline +│ │ ├── pathway_analyzer.py # prepare_data, calculate_statistics, 
build_hierarchy +│ │ └── statistics.py # Statistical calculation functions +│ │ +│ ├── visualization/ # Chart generation +│ │ └── plotly_generator.py # create_icicle_figure, save_figure_html +│ │ +│ └── cli/ # CLI tools +│ └── refresh_pathways.py # Data refresh command │ -├── cli/ # Command-line interface tools -│ ├── __init__.py -│ └── refresh_pathways.py # CLI to refresh pre-computed pathway data +├── pathways_app/ # Reflex web app (stays at root — framework requirement) +│ ├── pathways_app.py # AppState + page components +│ └── components/ # Layout and navigation components │ -├── data_processing/ # Data layer -│ ├── database.py # SQLite connection management -│ ├── schema.py # Database schema (reference + pathway tables) -│ ├── pathway_pipeline.py # Pathway processing pipeline (Snowflake → SQLite) -│ ├── loader.py # FileDataLoader for CSV/Parquet files -│ ├── reference_data.py # Reference data migration -│ ├── snowflake_connector.py # Snowflake integration -│ ├── cache.py # Query result caching -│ ├── data_source.py # Data source fallback chain (Snowflake/file) -│ └── diagnosis_lookup.py # GP diagnosis lookup and drug-indication mapping -│ -├── analysis/ # Analysis pipeline -│ ├── pathway_analyzer.py # prepare_data, calculate_statistics, build_hierarchy -│ └── statistics.py # Statistical calculation functions -│ -├── visualization/ # Chart generation -│ └── plotly_generator.py # create_icicle_figure, save_figure_html -│ -├── pathways_app/ # Reflex web application -│ ├── pathways_app.py # State class and page components -│ └── components/ # Layout and navigation components -│ -├── tools/ # Legacy modules -│ ├── dashboard_gui.py # Original analysis engine (being refactored) -│ └── data.py # Data transformations (UPID, drug names, directory) -│ -├── config/ # Configuration files -│ └── snowflake.toml # Snowflake connection settings -│ -├── data/ # Reference data and database -│ ├── pathways.db # SQLite database (includes pathway_nodes) -│ └── *.csv # Reference 
data files -│ -└── tests/ # Test suite - ├── conftest.py # Pytest fixtures - └── test_*.py # Test modules +├── tests/ # Test suite (113 tests) +├── data/ # Reference data + SQLite DB +├── docs/ # Documentation +├── assets/ # Static assets (logo, favicon) +├── archive/ # Historical/deprecated +└── logs/ # Runtime logs ``` +**Path resolution**: `src/` is added to `sys.path` via a `.pth` file (created by `setup_dev.py`). +All imports use package names directly: `from core import ...`, `from data_processing import ...`, etc. + ### Pathway Data Architecture The application uses a pre-computed pathway architecture for performance: @@ -252,16 +253,12 @@ The `AppState` class manages all application state: - Switching reloads pathway data from SQLite filtered by `chart_type` - Note: Directory filter only applies to directory charts (indication charts store Search_Terms in the directory column) -### Legacy Modules (`tools/`) +### Data Transformations (`data_processing/transforms.py`) -Still used during transition: - -- **tools/data.py** - Data transformation functions: - - `patient_id()` - Creates UPID = Provider Code (first 3 chars) + PersonKey - - `drug_names()` - Standardizes via drugnames.csv lookup - - `department_identification()` - 5-level fallback chain for directory assignment - -- **tools/dashboard_gui.py** - Original analysis engine (being replaced by `analysis/` module) +Core data transformation functions used by the pipeline: +- `patient_id()` - Creates UPID = Provider Code (first 3 chars) + PersonKey +- `drug_names()` - Standardizes via drugnames.csv lookup +- `department_identification()` - 5-level fallback chain for directory assignment ### Data Flow @@ -274,7 +271,7 @@ Still used during transition: │ ▼ (fetch_and_transform_data) ┌──────────────────────────────────────────┐ - │ Data Transformations (tools/data.py) │ + │ Data Transformations (data_processing/transforms.py) │ │ → patient_id() creates UPID │ │ → drug_names() standardizes names │ │ → 
department_identification() → Dir │ @@ -461,7 +458,7 @@ Test coverage includes: ## Configuration -### Snowflake Connection (`config/snowflake.toml`) +### Snowflake Connection (`src/config/snowflake.toml`) ```toml [snowflake] @@ -475,7 +472,7 @@ authenticator = "externalbrowser" # Required for NHS SSO ### Logging Logs are written to `logs/` directory with structured format. -Configure via `core/logging_config.py`. +Configure via `src/core/logging_config.py`. ## Breaking Changes from Original App @@ -519,13 +516,13 @@ The pre-computed pathway architecture introduces these changes: ### Adding New Analysis Features -1. Add statistical functions to `analysis/statistics.py` -2. Integrate into pipeline in `analysis/pathway_analyzer.py` -3. Update visualization in `visualization/plotly_generator.py` +1. Add statistical functions to `src/analysis/statistics.py` +2. Integrate into pipeline in `src/analysis/pathway_analyzer.py` +3. Update visualization in `src/visualization/plotly_generator.py` ### Adding New Reference Data 1. Add CSV file to `data/` directory -2. Define schema in `data_processing/schema.py` -3. Create migration function in `data_processing/reference_data.py` -4. Add path to `PathConfig` in `core/config.py` +2. Define schema in `src/data_processing/schema.py` +3. Create migration function in `src/data_processing/reference_data.py` +4. 
Add path to `PathConfig` in `src/core/config.py` diff --git a/DESIGN_SYSTEM.md b/docs/DESIGN_SYSTEM.md similarity index 100% rename from DESIGN_SYSTEM.md rename to docs/DESIGN_SYSTEM.md diff --git a/SNOWFLAKE_REFERENCE.md b/docs/SNOWFLAKE_REFERENCE.md similarity index 100% rename from SNOWFLAKE_REFERENCE.md rename to docs/SNOWFLAKE_REFERENCE.md diff --git a/pyproject.toml b/pyproject.toml index dacbafd..a6486a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,3 @@ -[tool.setuptools] -py-modules = [] -packages = [] [project] name = "patient-pathway-analysis" version = "0.1.0" @@ -8,25 +5,15 @@ description = "Add your description here" readme = "README.md" requires-python = ">=3.10" dependencies = [ - "darkdetect==0.8.0", - "decorator==5.1.1", - "et-xmlfile==1.1.0", - "executing==1.2.0", "fastparquet>=2024.11.0", - "idna==3.4", - "itsdangerous==2.1.2", - "jedi==0.18.2", - "jinja2==3.1.2", - "jupyter-core==5.3.1", - "numpy==1.25.0", - "packaging==23.1", - "pandas==2.0.3", - "pillow==10.0.0", - "plotly==5.15.0", + "numpy>=1.25.0", + "pandas>=2.0.3", + "pillow>=10.0.0", + "plotly>=5.15.0", "pyarrow>=20.0.0", - "python-dateutil==2.8.2", "reflex>=0.6.0", - "tenacity==8.2.2", + "snowflake-connector-python>=3.0.0", + "tomli>=2.0.0", ] [project.optional-dependencies] @@ -36,6 +23,7 @@ test = [ ] [tool.pytest.ini_options] +pythonpath = ["src"] testpaths = ["tests"] python_files = ["test_*.py"] python_classes = ["Test*"] @@ -52,7 +40,7 @@ markers = [ ] [tool.coverage.run] -source = ["core", "data_processing", "analysis", "visualization", "tools"] +source = ["src/core", "src/data_processing", "src/analysis", "src/visualization"] branch = true omit = [ "*/tests/*", diff --git a/rxconfig.py b/rxconfig.py index 69e8786..2ad6f6b 100644 --- a/rxconfig.py +++ b/rxconfig.py @@ -1,3 +1,8 @@ +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent / "src")) + import reflex as rx config = rx.Config( diff --git a/setup_dev.py b/setup_dev.py new file 
mode 100644 index 0000000..158f090 --- /dev/null +++ b/setup_dev.py @@ -0,0 +1,13 @@ +"""One-time dev setup: adds src/ to the venv's Python path via a .pth file.""" + +import site +import sys +from pathlib import Path + +src_dir = Path(__file__).resolve().parent / "src" +site_packages = Path(site.getsitepackages()[-1]) +pth_file = site_packages / "patient_pathways.pth" + +pth_file.write_text(str(src_dir) + "\n") +print(f"Created {pth_file}") +print(f" -> {src_dir}") diff --git a/src/analysis/CLAUDE.md b/src/analysis/CLAUDE.md new file mode 100644 index 0000000..9c29d44 --- /dev/null +++ b/src/analysis/CLAUDE.md @@ -0,0 +1,24 @@ +# Analysis Package + +Four-step pathway analysis pipeline refactored from original 267-line `generate_graph()` function. + +## Module: pathway_analyzer.py + +**Main entry points:** +- `generate_icicle_chart(df, filters)` — Directory charts (Trust → Directory → Drug → Pathway) +- `generate_icicle_chart_indication(df, indication_df, filters)` — Indication charts using Search_Term hierarchy + +**Pipeline steps:** +1. `prepare_data()` — Filter by date/trusts/drugs/directories. **MUST use `df.copy()`** to prevent mutation. +2. `calculate_statistics()` — Compute frequency, cost, duration stats +3. `build_hierarchy()` — Create Trust → Directory/Indication → Drug → Pathway structure +4. `prepare_chart_data()` — Format data for Plotly icicle chart + +**Note on modified UPIDs:** +For drug-aware indication matching, UPIDs are formatted as `{original}|{search_term}`. The hierarchy-building functions treat UPID as opaque — pipe delimiters work transparently without code changes. + +## Module: statistics.py + +Statistical calculation helper functions (frequency, cost, duration, per-patient metrics). + +Called by `calculate_statistics()` during pipeline execution. 
diff --git a/analysis/__init__.py b/src/analysis/__init__.py similarity index 100% rename from analysis/__init__.py rename to src/analysis/__init__.py diff --git a/analysis/pathway_analyzer.py b/src/analysis/pathway_analyzer.py similarity index 100% rename from analysis/pathway_analyzer.py rename to src/analysis/pathway_analyzer.py diff --git a/analysis/statistics.py b/src/analysis/statistics.py similarity index 100% rename from analysis/statistics.py rename to src/analysis/statistics.py diff --git a/src/cli/CLAUDE.md b/src/cli/CLAUDE.md new file mode 100644 index 0000000..0ed3bec --- /dev/null +++ b/src/cli/CLAUDE.md @@ -0,0 +1,27 @@ +# CLI Package + +Command-line interface for pathway data refresh operations. + +## refresh_pathways.py + +Main CLI module for refreshing pre-computed pathway data from Snowflake to SQLite. + +**Key Functions:** +- `refresh_pathways()` — Orchestrates full pipeline: fetch from Snowflake, transform via data_processing/transforms.py, generate pathway charts, insert to SQLite +- `insert_pathway_records()` — Bulk inserts using parameterized queries with `INSERT OR REPLACE` (handles overwrites via UNIQUE constraint) +- `log_refresh_start()`, `log_refresh_complete()`, `log_refresh_failed()` — Tracks refresh status in pathway_refresh_log table +- `get_default_filters()` — Loads available trusts, drugs, directories from CSV files + +**CLI Arguments:** +- `--chart-type [all|directory|indication]` — Which pathway types to refresh (default: all) +- `--dry-run` — Test without database changes +- `--minimum-patients N` — Pathway nodes with