feat: complete Task 2.2 - test refresh pipeline with Snowflake data

Tested full refresh pipeline end-to-end with real Snowflake data:
- Fixed trust filter to read Name column from defaultTrusts.csv
- Fixed Decimal type handling in calculate_cost_per_patient_per_annum
- Fixed array handling in convert_to_records for average_administered
- Added required reference CSV files to data/ directory
- Configured Snowflake connection (account, warehouse, user)

Results:
- Snowflake fetch: 656,695 records in ~7s
- Transformations: 519,848 records after UPID/drug/directory
- Pathway nodes: 293 for all_6mo (8 trusts, 14 directories)
- Total processing time: ~6.2 minutes
2026-02-05 00:20:12 +00:00
parent 8b65dfd9a8
commit adc1dbfc58
12 changed files with 1708 additions and 21 deletions
@@ -10,11 +10,11 @@
 [connection]
 # Snowflake account identifier (e.g., "xy12345.uk-south.azure")
 # Ask your Snowflake administrator for the correct account name
-account = ""
+account = "ZK91403.uk-south.azure"

 # Default warehouse to use for queries
 # Common options: ANALYST_WH, COMPUTE_WH
-warehouse = "ANALYST_WH"
+warehouse = "WH__XSMALL"

 # Default database for queries
 # DATA_HUB is the primary analyst-curated data warehouse
@@ -30,7 +30,7 @@ authenticator = "externalbrowser"

 # User principal (email address for externalbrowser auth)
 # Leave empty to use current Windows user or prompt
-user = ""
+user = "ANDREW.CHARLWOOD@NHS.NET"

 # Role to use (optional, uses default role if empty)
 role = ""