refactor: reorganize repository to src/ layout
Move 6 packages (core, config, data_processing, analysis, visualization, cli) into src/ to reduce root clutter. Merge tools/data.py into data_processing/transforms.py. Move docs to docs/. Path resolution via .pth file (setup_dev.py), pytest pythonpath config, and sys.path bootstrap in rxconfig.py and CLI entry points. Clean up pyproject.toml deps (remove stale pins, add snowflake-connector-python). Fix tomllib import for Python 3.10 compatibility. All 113 tests pass.
This commit is contained in:
@@ -1,268 +0,0 @@
|
||||
"""
|
||||
Configuration module for Patient Pathway Analysis.
|
||||
|
||||
This module provides access to configuration settings loaded from TOML files.
|
||||
Primary configuration file: config/snowflake.toml
|
||||
|
||||
Usage:
|
||||
from config import load_snowflake_config, SnowflakeConfig
|
||||
|
||||
config = load_snowflake_config()
|
||||
print(config.connection.account)
|
||||
print(config.cache.ttl_seconds)
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
import tomllib # Python 3.11+ built-in TOML parser
|
||||
|
||||
|
||||
@dataclass
class ConnectionConfig:
    """Snowflake connection settings.

    Every field carries a default, so ``ConnectionConfig()`` can be built
    with no arguments. ``account`` defaults to an empty string.
    """

    account: str = ""
    warehouse: str = "ANALYST_WH"
    database: str = "DATA_HUB"
    schema: str = "DWH"
    authenticator: str = "externalbrowser"
    user: str = ""
    role: str = ""
|
||||
|
||||
|
||||
@dataclass
class TimeoutConfig:
    """Timeout settings for Snowflake operations.

    The field names end in ``_timeout``; the shipped TOML file documents
    the values as seconds.
    """

    # NOTE(review): load_snowflake_config() falls back to 600 for this key
    # when a config file exists but omits it, while the default here is 30.
    # Confirm which value is intended before aligning them.
    connection_timeout: int = 30
    query_timeout: int = 300
    login_timeout: int = 120
|
||||
|
||||
|
||||
@dataclass
class CacheConfig:
    """Cache settings for Snowflake query results."""

    enabled: bool = True
    directory: str = "data/cache"
    ttl_seconds: int = 86400  # 24 hours
    ttl_current_data_seconds: int = 3600  # 1 hour
    max_size_mb: int = 500
|
||||
|
||||
|
||||
@dataclass
class TableReference:
    """Reference to a Snowflake table or view.

    Either ``table`` or ``view`` names the object; when both are set,
    ``table`` takes precedence in ``fully_qualified_name``.
    """

    database: str = ""
    schema: str = ""
    view: str = ""
    table: str = ""
    key_columns: list = field(default_factory=list)

    @property
    def fully_qualified_name(self) -> str:
        """Return the double-quoted, dot-separated table/view name.

        Returns:
            ``"db"."schema"."obj"`` when database and schema are both set,
            ``"schema"."obj"`` when only the schema is set, ``"obj"``
            otherwise, and an empty string when neither ``table`` nor
            ``view`` is set.
        """
        obj_name = self.table or self.view
        if not obj_name:
            return ""

        if self.database and self.schema:
            parts = (self.database, self.schema, obj_name)
        elif self.schema:
            parts = (self.schema, obj_name)
        else:
            # A database without a schema is ignored, as before.
            parts = (obj_name,)

        # Double any embedded quote so each part stays one valid quoted
        # identifier instead of terminating the quoting early.
        return ".".join('"' + part.replace('"', '""') + '"' for part in parts)
|
||||
|
||||
|
||||
@dataclass
class TablesConfig:
    """Configuration for commonly used tables.

    Each field defaults to an empty ``TableReference``.
    """

    activity: TableReference = field(default_factory=TableReference)
    patient: TableReference = field(default_factory=TableReference)
    medication: TableReference = field(default_factory=TableReference)
    organization: TableReference = field(default_factory=TableReference)
|
||||
|
||||
|
||||
@dataclass
class QueryConfig:
    """Query execution settings."""

    quote_identifiers: bool = True
    test_limit: int = 20
    max_rows: int = 100000
    chunk_size: int = 10000
|
||||
|
||||
|
||||
@dataclass
class SnowflakeConfig:
    """Complete Snowflake configuration.

    Groups the connection, timeout, cache, table and query settings; each
    section defaults to that section's own dataclass defaults.
    """

    connection: ConnectionConfig = field(default_factory=ConnectionConfig)
    timeouts: TimeoutConfig = field(default_factory=TimeoutConfig)
    cache: CacheConfig = field(default_factory=CacheConfig)
    tables: TablesConfig = field(default_factory=TablesConfig)
    query: QueryConfig = field(default_factory=QueryConfig)

    def validate(self) -> list[str]:
        """
        Validate the configuration.

        Checks that account and warehouse are non-empty, that the
        authenticator is one of the recognised values, that the cache TTL
        is not negative and that ``max_rows`` is at least 1.

        Returns:
            List of error messages (empty if valid).
        """
        errors: list[str] = []

        if not self.connection.account:
            errors.append("Snowflake account is not configured (connection.account)")

        if not self.connection.warehouse:
            errors.append("Snowflake warehouse is not configured (connection.warehouse)")

        if self.connection.authenticator not in ("externalbrowser", "snowflake", "oauth", "okta"):
            errors.append(f"Invalid authenticator: {self.connection.authenticator}")

        if self.cache.ttl_seconds < 0:
            errors.append("Cache TTL must be non-negative")

        if self.query.max_rows < 1:
            errors.append("max_rows must be at least 1")

        return errors

    @property
    def is_configured(self) -> bool:
        """Return True if minimum required settings are present.

        Only ``connection.account`` is checked.
        """
        return bool(self.connection.account)
|
||||
|
||||
|
||||
def _parse_table_reference(data: dict) -> TableReference:
    """Parse a table reference from TOML data.

    Args:
        data: Mapping for one ``[tables.<name>]`` section. Missing keys
            fall back to an empty string, or an empty list for
            ``key_columns``.

    Returns:
        TableReference populated from ``data``.
    """
    return TableReference(
        database=data.get("database", ""),
        schema=data.get("schema", ""),
        view=data.get("view", ""),
        table=data.get("table", ""),
        key_columns=data.get("key_columns", []),
    )
|
||||
|
||||
|
||||
def load_snowflake_config(config_path: Optional[Path] = None) -> SnowflakeConfig:
    """
    Load Snowflake configuration from a TOML file.

    Args:
        config_path: Path to the TOML config file. Defaults to
            ``snowflake.toml`` in the directory containing this module.
            A plain string or other path-like value is also accepted.

    Returns:
        SnowflakeConfig dataclass with all settings. If the file does not
        exist, a SnowflakeConfig built from the dataclass defaults is
        returned instead of raising.

    Raises:
        tomllib.TOMLDecodeError: If the file exists but is not valid TOML.
    """
    if config_path is None:
        # Default to snowflake.toml next to this module.
        config_path = Path(__file__).parent / "snowflake.toml"
    else:
        # Callers may pass a str; normalise so .exists() below works.
        config_path = Path(config_path)

    if not config_path.exists():
        # A missing file is not an error: fall back to pure defaults.
        return SnowflakeConfig()

    # tomllib requires the file to be opened in binary mode.
    with open(config_path, "rb") as f:
        data = tomllib.load(f)

    # Parse connection settings
    conn_data = data.get("connection", {})
    connection = ConnectionConfig(
        account=conn_data.get("account", ""),
        warehouse=conn_data.get("warehouse", "ANALYST_WH"),
        database=conn_data.get("database", "DATA_HUB"),
        schema=conn_data.get("schema", "DWH"),
        authenticator=conn_data.get("authenticator", "externalbrowser"),
        user=conn_data.get("user", ""),
        role=conn_data.get("role", ""),
    )

    # Parse timeout settings
    timeout_data = data.get("timeouts", {})
    timeouts = TimeoutConfig(
        # NOTE(review): this fallback (600) differs from TimeoutConfig's own
        # default of 30; kept as-is. Confirm which value is intended.
        connection_timeout=timeout_data.get("connection_timeout", 600),
        query_timeout=timeout_data.get("query_timeout", 300),
        login_timeout=timeout_data.get("login_timeout", 120),
    )

    # Parse cache settings
    cache_data = data.get("cache", {})
    cache = CacheConfig(
        enabled=cache_data.get("enabled", True),
        directory=cache_data.get("directory", "data/cache"),
        ttl_seconds=cache_data.get("ttl_seconds", 86400),
        ttl_current_data_seconds=cache_data.get("ttl_current_data_seconds", 3600),
        max_size_mb=cache_data.get("max_size_mb", 500),
    )

    # Parse table references
    tables_data = data.get("tables", {})
    tables = TablesConfig(
        activity=_parse_table_reference(tables_data.get("activity", {})),
        patient=_parse_table_reference(tables_data.get("patient", {})),
        medication=_parse_table_reference(tables_data.get("medication", {})),
        organization=_parse_table_reference(tables_data.get("organization", {})),
    )

    # Parse query settings
    query_data = data.get("query", {})
    query = QueryConfig(
        quote_identifiers=query_data.get("quote_identifiers", True),
        test_limit=query_data.get("test_limit", 20),
        max_rows=query_data.get("max_rows", 100000),
        chunk_size=query_data.get("chunk_size", 10000),
    )

    return SnowflakeConfig(
        connection=connection,
        timeouts=timeouts,
        cache=cache,
        tables=tables,
        query=query,
    )
|
||||
|
||||
|
||||
# Module-level cached config (loaded on first access)
_cached_config: Optional[SnowflakeConfig] = None


def get_snowflake_config() -> SnowflakeConfig:
    """
    Get the Snowflake configuration (cached after first load).

    The first call loads the configuration via ``load_snowflake_config()``
    and stores it in the module-level ``_cached_config``; later calls
    return that same object.

    Returns:
        SnowflakeConfig dataclass with all settings.
    """
    global _cached_config
    if _cached_config is None:
        _cached_config = load_snowflake_config()
    return _cached_config
|
||||
|
||||
|
||||
def reload_snowflake_config() -> SnowflakeConfig:
    """
    Reload the Snowflake configuration from disk.

    Replaces the module-level cached configuration unconditionally with a
    fresh result from ``load_snowflake_config()``.

    Returns:
        SnowflakeConfig dataclass with all settings.
    """
    global _cached_config
    _cached_config = load_snowflake_config()
    return _cached_config
|
||||
|
||||
|
||||
# Export public API
|
||||
__all__ = [
    "SnowflakeConfig",
    "ConnectionConfig",
    "TimeoutConfig",
    "CacheConfig",
    "TableReference",
    "TablesConfig",
    "QueryConfig",
    "load_snowflake_config",
    "get_snowflake_config",
    "reload_snowflake_config",
]
|
||||
@@ -1,129 +0,0 @@
|
||||
# Snowflake Configuration for NHS Patient Pathway Analysis
|
||||
#
|
||||
# This file contains connection settings for the Snowflake data warehouse.
|
||||
# IMPORTANT: This file should NOT be committed to version control if it contains
|
||||
# sensitive information. With externalbrowser auth no passwords are stored, but the account identifier and user email below are still environment-specific.
|
||||
#
|
||||
# For NHS SSO authentication, the 'externalbrowser' authenticator opens a browser
|
||||
# window for authentication via NHS identity management.
|
||||
|
||||
[connection]
|
||||
# Snowflake account identifier (e.g., "xy12345.uk-south.azure")
|
||||
# Ask your Snowflake administrator for the correct account name
|
||||
account = "ZK91403.uk-south.azure"
|
||||
|
||||
# Default warehouse to use for queries
|
||||
# Common options: ANALYST_WH, COMPUTE_WH
|
||||
warehouse = "WH__XSMALL"
|
||||
|
||||
# Default database for queries
|
||||
# DATA_HUB is the primary analyst-curated data warehouse
|
||||
database = "DATA_HUB"
|
||||
|
||||
# Default schema (optional, can be overridden per query)
|
||||
schema = "DWH"
|
||||
|
||||
# Authentication method
|
||||
# "externalbrowser" opens browser for NHS SSO (required for NHS environments)
|
||||
# Other options: "snowflake" (username/password), "oauth", "okta"
|
||||
authenticator = "externalbrowser"
|
||||
|
||||
# User principal (email address for externalbrowser auth)
|
||||
# Leave empty to use current Windows user or prompt
|
||||
user = "ANDREW.CHARLWOOD@NHS.NET"
|
||||
|
||||
# Role to use (optional, uses default role if empty)
|
||||
role = ""
|
||||
|
||||
[timeouts]
|
||||
# Network timeout in seconds (how long client waits for Snowflake response)
|
||||
# Must be high enough for GP record lookups which can take 30-60s per batch
|
||||
connection_timeout = 600
|
||||
|
||||
# Query execution timeout in seconds (for long-running queries)
|
||||
# Set to 0 for no timeout
|
||||
query_timeout = 300
|
||||
|
||||
# Login timeout in seconds (for SSO browser auth)
|
||||
login_timeout = 120
|
||||
|
||||
[cache]
|
||||
# Enable result caching
|
||||
enabled = true
|
||||
|
||||
# Cache directory (relative to project root or absolute path)
|
||||
# Defaults to data/cache/ if not specified
|
||||
directory = "data/cache"
|
||||
|
||||
# Time-to-live for cached results in seconds
|
||||
# 24 hours for historical data (86400 seconds)
|
||||
ttl_seconds = 86400
|
||||
|
||||
# TTL for data that includes today's date (shorter)
|
||||
ttl_current_data_seconds = 3600
|
||||
|
||||
# Maximum cache size in MB (oldest entries removed when exceeded)
|
||||
max_size_mb = 500
|
||||
|
||||
[databases]
|
||||
# Quick reference for database purposes (read-only documentation)
|
||||
# DATA_HUB = "Analyst-curated data warehouse - primary source for most queries"
|
||||
# PRIMARY_CARE = "Raw extracts from EMIS and TPP clinical systems"
|
||||
# NATIONAL = "NHS England national datasets (SUS, ECDS, MHSDS, etc.)"
|
||||
# FACTS_AND_DIMENSIONS_ALL_DATA = "External reference data (BNF, SNOMED, QOF clusters)"
|
||||
# REPORTING_DATASETS_ICB = "Reporting outputs and analyst workspaces"
|
||||
|
||||
# Tables commonly used for high-cost drug analysis
|
||||
[tables.activity]
|
||||
# Main activity data source (high-cost drug interventions)
|
||||
# Acute__Conmon__PatientLevelDrugs contains patient-level high-cost drug data
|
||||
database = "DATA_HUB"
|
||||
schema = "CDM"
|
||||
table = "Acute__Conmon__PatientLevelDrugs"
|
||||
key_columns = [
|
||||
"PseudoNHSNoLinked", # Pseudonymised NHS number for patient linking
|
||||
"ProviderCode", # NHS provider code (e.g., RM1, RGP)
|
||||
"LocalPatientID", # Local patient identifier within provider
|
||||
"InterventionDate", # Date of drug intervention
|
||||
"DrugName", # Drug name (raw, needs standardization)
|
||||
"DrugSNOMEDCode", # SNOMED code for drug
|
||||
"PriceActual", # Actual cost of intervention
|
||||
"TreatmentFunctionCode", # NHS treatment function code
|
||||
"TreatmentFunctionDesc", # Treatment function description
|
||||
"AdditionalDetail1", # Additional details (used for directory identification)
|
||||
]
|
||||
|
||||
[tables.patient]
|
||||
# Patient demographics
|
||||
database = "DATA_HUB"
|
||||
schema = "DWH"
|
||||
view = "DimPerson"
|
||||
key_columns = ["PatientPseudonym", "PersonKey", "CurrentGeneralPractice"]
|
||||
|
||||
[tables.medication]
|
||||
# Medication reference data
|
||||
database = "DATA_HUB"
|
||||
schema = "DWH"
|
||||
view = "DimMedicineAndDevice"
|
||||
key_columns = ["ProductSnomedCode", "TherapeuticMoietySnomedCode", "ProductDescription"]
|
||||
|
||||
[tables.organization]
|
||||
# NHS organizations and GP practices
|
||||
database = "DATA_HUB"
|
||||
schema = "DWH"
|
||||
view = "DimOrganisationAndSite"
|
||||
key_columns = ["SiteCode", "OrganisationName"]
|
||||
|
||||
[query]
|
||||
# Default query behaviors
|
||||
# Always double-quote identifiers for case-sensitivity
|
||||
quote_identifiers = true
|
||||
|
||||
# Default row limit for test queries
|
||||
test_limit = 20
|
||||
|
||||
# Maximum rows to fetch in a single query (prevents runaway queries)
|
||||
max_rows = 100000
|
||||
|
||||
# Chunk size for large result sets
|
||||
chunk_size = 10000
|
||||
Reference in New Issue
Block a user