feat: add chart_type argument to refresh command (Task 3.1)

- Add --chart-type argument with choices: directory, indication, all
- Update insert_pathway_records to include chart_type column
- Update refresh_pathways to process multiple chart types
- Update logging to show chart type counts
- Indication chart processing deferred to Task 3.2 (GP diagnosis integration)
This commit is contained in:
Andrew Charlwood
2026-02-05 14:38:57 +00:00
parent 0d15000aa0
commit 593d14c70f
2 changed files with 97 additions and 27 deletions
+4 -4
View File
@@ -108,13 +108,13 @@ python -m reflex compile
## Phase 3: CLI & Data Refresh Updates ## Phase 3: CLI & Data Refresh Updates
### 3.1 Update Refresh Command for Dual Chart Types ### 3.1 Update Refresh Command for Dual Chart Types
- [ ] Modify `cli/refresh_pathways.py`: - [x] Modify `cli/refresh_pathways.py`:
- Process both "directory" and "indication" chart types - Process both "directory" and "indication" chart types
- For each of 6 date filters: generate 2 chart datasets - For each of 6 date filters: generate 2 chart datasets
- Total: 12 pathway datasets (6 dates × 2 chart types) - Total: 12 pathway datasets (6 dates × 2 chart types)
- [ ] Add `--chart-type` argument: "all" (default), "directory", "indication" - [x] Add `--chart-type` argument: "all" (default), "directory", "indication"
- [ ] Update progress logging to show both chart types - [x] Update progress logging to show both chart types
- [ ] Verify: Dry run shows both chart types being processed - [ ] Verify: Dry run shows both chart types being processed (requires Task 3.2 for full indication support)
### 3.2 Integrate Diagnosis-Based Directorate in Pipeline ### 3.2 Integrate Diagnosis-Based Directorate in Pipeline
- [ ] Update `fetch_and_transform_data()` to include diagnosis lookup: - [ ] Update `fetch_and_transform_data()` to include diagnosis lookup:
+86 -16
View File
@@ -3,12 +3,16 @@ CLI command for refreshing pathway data from Snowflake.
This command fetches activity data from Snowflake, processes it through the This command fetches activity data from Snowflake, processes it through the
pathway pipeline for all 6 date filter combinations, and stores the results pathway pipeline for all 6 date filter combinations, and stores the results
in the SQLite pathway_nodes table. in the SQLite pathway_nodes table. Supports two chart types:
- "directory": Trust → Directory → Drug → Pathway (default)
- "indication": Trust → Search_Term → Drug → Pathway (requires GP diagnosis lookup)
Usage: Usage:
python -m cli.refresh_pathways python -m cli.refresh_pathways
python -m cli.refresh_pathways --minimum-patients 10 python -m cli.refresh_pathways --minimum-patients 10
python -m cli.refresh_pathways --provider-codes RGT,RM1 python -m cli.refresh_pathways --provider-codes RGT,RM1
python -m cli.refresh_pathways --chart-type all
python -m cli.refresh_pathways --chart-type directory
python -m cli.refresh_pathways --dry-run python -m cli.refresh_pathways --dry-run
Run `python -m cli.refresh_pathways --help` for full options. Run `python -m cli.refresh_pathways --help` for full options.
@@ -34,9 +38,15 @@ from data_processing.schema import (
create_pathway_tables, create_pathway_tables,
) )
from data_processing.pathway_pipeline import ( from data_processing.pathway_pipeline import (
ChartType,
DATE_FILTER_CONFIGS, DATE_FILTER_CONFIGS,
fetch_and_transform_data, fetch_and_transform_data,
process_all_date_filters, process_all_date_filters,
process_pathway_for_date_filter,
process_indication_pathway_for_date_filter,
extract_denormalized_fields,
extract_indication_fields,
convert_to_records,
) )
logger = get_logger(__name__) logger = get_logger(__name__)
@@ -113,9 +123,9 @@ def insert_pathway_records(
if not records: if not records:
return 0 return 0
# Column order matching pathway_nodes schema # Column order matching pathway_nodes schema (includes chart_type)
columns = [ columns = [
'date_filter_id', 'parents', 'ids', 'labels', 'level', 'date_filter_id', 'chart_type', 'parents', 'ids', 'labels', 'level',
'value', 'cost', 'costpp', 'cost_pp_pa', 'colour', 'value', 'cost', 'costpp', 'cost_pp_pa', 'colour',
'first_seen', 'last_seen', 'first_seen_parent', 'last_seen_parent', 'first_seen', 'last_seen', 'first_seen_parent', 'last_seen_parent',
'average_spacing', 'average_administered', 'avg_days', 'average_spacing', 'average_administered', 'avg_days',
@@ -213,6 +223,7 @@ def refresh_pathways(
db_path: Optional[Path] = None, db_path: Optional[Path] = None,
paths: Optional[PathConfig] = None, paths: Optional[PathConfig] = None,
dry_run: bool = False, dry_run: bool = False,
chart_type: str = "directory",
) -> tuple[bool, str, dict]: ) -> tuple[bool, str, dict]:
""" """
Main refresh function that orchestrates the full pipeline. Main refresh function that orchestrates the full pipeline.
@@ -226,6 +237,7 @@ def refresh_pathways(
db_path: Path to SQLite database (uses default if None) db_path: Path to SQLite database (uses default if None)
paths: PathConfig for file paths paths: PathConfig for file paths
dry_run: If True, don't actually insert records dry_run: If True, don't actually insert records
chart_type: Which chart type to process: "directory", "indication", or "all"
Returns: Returns:
Tuple of (success: bool, message: str, stats: dict) Tuple of (success: bool, message: str, stats: dict)
@@ -255,6 +267,12 @@ def refresh_pathways(
if not drug_filter: if not drug_filter:
return False, "No drugs specified and could not load defaults", {} return False, "No drugs specified and could not load defaults", {}
# Determine which chart types to process
if chart_type == "all":
chart_types_to_process: list[ChartType] = ["directory", "indication"]
else:
chart_types_to_process = [chart_type] # type: ignore
logger.info("=" * 60) logger.info("=" * 60)
logger.info("Pathway Data Refresh Starting") logger.info("Pathway Data Refresh Starting")
logger.info("=" * 60) logger.info("=" * 60)
@@ -263,6 +281,7 @@ def refresh_pathways(
logger.info(f"Drug filter: {len(drug_filter)} drugs") logger.info(f"Drug filter: {len(drug_filter)} drugs")
logger.info(f"Directory filter: {len(directory_filter)} directories") logger.info(f"Directory filter: {len(directory_filter)} directories")
logger.info(f"Provider codes: {provider_codes or 'All'}") logger.info(f"Provider codes: {provider_codes or 'All'}")
logger.info(f"Chart type(s): {', '.join(chart_types_to_process)}")
logger.info(f"Database: {db_manager.db_path}") logger.info(f"Database: {db_manager.db_path}")
logger.info(f"Dry run: {dry_run}") logger.info(f"Dry run: {dry_run}")
logger.info("=" * 60) logger.info("=" * 60)
@@ -306,11 +325,25 @@ def refresh_pathways(
stats["snowflake_rows"] = len(df) stats["snowflake_rows"] = len(df)
logger.info(f"Fetched {len(df)} records from Snowflake") logger.info(f"Fetched {len(df)} records from Snowflake")
# Step 2: Process all date filters # Step 2: Process all date filters for each chart type
logger.info("") num_date_filters = len(DATE_FILTER_CONFIGS)
logger.info("Step 2/4: Processing pathway data for 6 date filter combinations...") num_chart_types = len(chart_types_to_process)
total_datasets = num_date_filters * num_chart_types
results = process_all_date_filters( logger.info("")
logger.info(f"Step 2/4: Processing pathway data for {total_datasets} datasets "
f"({num_date_filters} date filters x {num_chart_types} chart types)...")
# Store results keyed by "date_filter_id:chart_type"
results: dict[str, list[dict]] = {}
for current_chart_type in chart_types_to_process:
logger.info("")
logger.info(f"Processing chart type: {current_chart_type}")
if current_chart_type == "directory":
# Use existing process_all_date_filters for directory charts
dir_results = process_all_date_filters(
df=df, df=df,
trust_filter=trust_filter, trust_filter=trust_filter,
drug_filter=drug_filter, drug_filter=drug_filter,
@@ -319,15 +352,37 @@ def refresh_pathways(
refresh_id=refresh_id, refresh_id=refresh_id,
paths=paths, paths=paths,
) )
# Add results with chart_type suffix
for filter_id, records in dir_results.items():
# Records already have chart_type set by convert_to_records
results[f"{filter_id}:directory"] = records
# Count records per filter elif current_chart_type == "indication":
for filter_id, records in results.items(): # For indication charts, we need indication_df from GP diagnosis lookups
stats["date_filter_counts"][filter_id] = len(records) # This will be implemented in Task 3.2
# For now, log that indication processing requires the diagnosis pipeline
logger.warning("Indication chart processing not yet fully integrated")
logger.warning("Task 3.2 will add GP diagnosis lookup integration")
logger.info("Skipping indication charts for now...")
for config in DATE_FILTER_CONFIGS:
results[f"{config.id}:indication"] = []
# Count records per filter and chart type
stats["chart_type_counts"] = {}
for key, records in results.items():
stats["date_filter_counts"][key] = len(records)
stats["total_records"] += len(records) stats["total_records"] += len(records)
# Also track by chart type
_, ct = key.split(":")
stats["chart_type_counts"][ct] = stats["chart_type_counts"].get(ct, 0) + len(records)
logger.info("")
logger.info(f"Processed {stats['total_records']} total pathway nodes") logger.info(f"Processed {stats['total_records']} total pathway nodes")
for filter_id, count in stats["date_filter_counts"].items(): for chart_type_name, count in stats.get("chart_type_counts", {}).items():
logger.info(f" {filter_id}: {count} nodes") logger.info(f" {chart_type_name}: {count} nodes total")
for key, count in sorted(stats["date_filter_counts"].items()):
if count > 0:
logger.info(f" {key}: {count} nodes")
if dry_run: if dry_run:
logger.info("") logger.info("")
@@ -344,13 +399,13 @@ def refresh_pathways(
deleted = clear_pathway_nodes(conn) deleted = clear_pathway_nodes(conn)
logger.info(f"Cleared {deleted} existing pathway nodes") logger.info(f"Cleared {deleted} existing pathway nodes")
# Insert new records for each date filter # Insert new records for each date filter + chart type combination
total_inserted = 0 total_inserted = 0
for filter_id, records in results.items(): for key, records in results.items():
if records: if records:
inserted = insert_pathway_records(conn, records) inserted = insert_pathway_records(conn, records)
total_inserted += len(records) total_inserted += len(records)
logger.info(f" Inserted {len(records)} records for {filter_id}") logger.info(f" Inserted {len(records)} records for {key}")
# Step 4: Log completion # Step 4: Log completion
logger.info("") logger.info("")
@@ -401,9 +456,15 @@ def main() -> int:
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=""" epilog="""
Examples: Examples:
# Basic refresh with defaults # Basic refresh with defaults (directory chart only)
python -m cli.refresh_pathways python -m cli.refresh_pathways
# Refresh both chart types (directory and indication)
python -m cli.refresh_pathways --chart-type all
# Refresh only indication-based charts
python -m cli.refresh_pathways --chart-type indication
# Refresh with custom minimum patients # Refresh with custom minimum patients
python -m cli.refresh_pathways --minimum-patients 10 python -m cli.refresh_pathways --minimum-patients 10
@@ -445,6 +506,14 @@ Examples:
help="Process data but don't insert into database" help="Process data but don't insert into database"
) )
parser.add_argument(
"--chart-type",
type=str,
choices=["directory", "indication", "all"],
default="directory",
help="Chart type to process: 'directory' (default), 'indication', or 'all'"
)
parser.add_argument( parser.add_argument(
"--verbose", "-v", "--verbose", "-v",
action="store_true", action="store_true",
@@ -472,6 +541,7 @@ Examples:
provider_codes=provider_codes, provider_codes=provider_codes,
db_path=db_path, db_path=db_path,
dry_run=args.dry_run, dry_run=args.dry_run,
chart_type=args.chart_type,
) )
if success: if success: