refactor: reorganize repository to src/ layout

Move 6 packages (core, config, data_processing, analysis, visualization, cli) into src/ to reduce root clutter. Merge tools/data.py into data_processing/transforms.py. Move docs to docs/. Handle path resolution via a .pth file (setup_dev.py), the pytest pythonpath config, and a sys.path bootstrap in rxconfig.py and the CLI entry points. Clean up pyproject.toml deps (remove stale pins, add snowflake-connector-python). Fix tomllib import for Python 3.10 compatibility. All 113 tests pass.
2026-02-06 12:03:48 +00:00
parent 1581b1d3dd
commit 76838887e6
40 changed files with 589 additions and 214 deletions
@@ -1,268 +0,0 @@
-"""
-Configuration module for Patient Pathway Analysis.
-
-This module provides access to configuration settings loaded from TOML files.
-Primary configuration file: config/snowflake.toml
-
-Usage:
-    from config import load_snowflake_config, SnowflakeConfig
-
-    config = load_snowflake_config()
-    print(config.connection.account)
-    print(config.cache.ttl_seconds)
-"""
-
-from pathlib import Path
-from dataclasses import dataclass, field
-from typing import Optional
-import tomllib  # Python 3.11+ built-in TOML parser
-
-
-@dataclass
-class ConnectionConfig:
-    """Snowflake connection settings."""
-    account: str = ""
-    warehouse: str = "ANALYST_WH"
-    database: str = "DATA_HUB"
-    schema: str = "DWH"
-    authenticator: str = "externalbrowser"
-    user: str = ""
-    role: str = ""
-
-
-@dataclass
-class TimeoutConfig:
-    """Timeout settings for Snowflake operations."""
-    connection_timeout: int = 30
-    query_timeout: int = 300
-    login_timeout: int = 120
-
-
-@dataclass
-class CacheConfig:
-    """Cache settings for Snowflake query results."""
-    enabled: bool = True
-    directory: str = "data/cache"
-    ttl_seconds: int = 86400  # 24 hours
-    ttl_current_data_seconds: int = 3600  # 1 hour
-    max_size_mb: int = 500
-
-
-@dataclass
-class TableReference:
-    """Reference to a Snowflake table or view."""
-    database: str = ""
-    schema: str = ""
-    view: str = ""
-    table: str = ""
-    key_columns: list = field(default_factory=list)
-
-    @property
-    def fully_qualified_name(self) -> str:
-        """Return the fully qualified table/view name."""
-        obj_name = self.table or self.view
-        if not obj_name:
-            return ""
-        if self.database and self.schema:
-            return f'"{self.database}"."{self.schema}"."{obj_name}"'
-        elif self.schema:
-            return f'"{self.schema}"."{obj_name}"'
-        else:
-            return f'"{obj_name}"'
-
-
-@dataclass
-class TablesConfig:
-    """Configuration for commonly used tables."""
-    activity: TableReference = field(default_factory=TableReference)
-    patient: TableReference = field(default_factory=TableReference)
-    medication: TableReference = field(default_factory=TableReference)
-    organization: TableReference = field(default_factory=TableReference)
-
-
-@dataclass
-class QueryConfig:
-    """Query execution settings."""
-    quote_identifiers: bool = True
-    test_limit: int = 20
-    max_rows: int = 100000
-    chunk_size: int = 10000
-
-
-@dataclass
-class SnowflakeConfig:
-    """Complete Snowflake configuration."""
-    connection: ConnectionConfig = field(default_factory=ConnectionConfig)
-    timeouts: TimeoutConfig = field(default_factory=TimeoutConfig)
-    cache: CacheConfig = field(default_factory=CacheConfig)
-    tables: TablesConfig = field(default_factory=TablesConfig)
-    query: QueryConfig = field(default_factory=QueryConfig)
-
-    def validate(self) -> list[str]:
-        """
-        Validate the configuration.
-
-        Returns:
-            List of error messages (empty if valid).
-        """
-        errors = []
-
-        if not self.connection.account:
-            errors.append("Snowflake account is not configured (connection.account)")
-
-        if not self.connection.warehouse:
-            errors.append("Snowflake warehouse is not configured (connection.warehouse)")
-
-        if self.connection.authenticator not in ("externalbrowser", "snowflake", "oauth", "okta"):
-            errors.append(f"Invalid authenticator: {self.connection.authenticator}")
-
-        if self.cache.ttl_seconds < 0:
-            errors.append("Cache TTL must be non-negative")
-
-        if self.query.max_rows < 1:
-            errors.append("max_rows must be at least 1")
-
-        return errors
-
-    @property
-    def is_configured(self) -> bool:
-        """Return True if minimum required settings are present."""
-        return bool(self.connection.account)
-
-
-def _parse_table_reference(data: dict) -> TableReference:
-    """Parse a table reference from TOML data."""
-    return TableReference(
-        database=data.get("database", ""),
-        schema=data.get("schema", ""),
-        view=data.get("view", ""),
-        table=data.get("table", ""),
-        key_columns=data.get("key_columns", []),
-    )
-
-
-def load_snowflake_config(config_path: Optional[Path] = None) -> SnowflakeConfig:
-    """
-    Load Snowflake configuration from TOML file.
-
-    Args:
-        config_path: Path to the TOML config file. Defaults to config/snowflake.toml
-                     relative to the project root.
-
-    Returns:
-        SnowflakeConfig dataclass with all settings. If the config file
-        doesn't exist, a SnowflakeConfig with default values is returned.
-
-    Raises:
-        tomllib.TOMLDecodeError: If the TOML is invalid.
-    """
-    if config_path is None:
-        # Default to snowflake.toml located in this file's own directory
-        config_path = Path(__file__).parent / "snowflake.toml"
-
-    if not config_path.exists():
-        # Return default config if file doesn't exist
-        return SnowflakeConfig()
-
-    with open(config_path, "rb") as f:
-        data = tomllib.load(f)
-
-    # Parse connection settings
-    conn_data = data.get("connection", {})
-    connection = ConnectionConfig(
-        account=conn_data.get("account", ""),
-        warehouse=conn_data.get("warehouse", "ANALYST_WH"),
-        database=conn_data.get("database", "DATA_HUB"),
-        schema=conn_data.get("schema", "DWH"),
-        authenticator=conn_data.get("authenticator", "externalbrowser"),
-        user=conn_data.get("user", ""),
-        role=conn_data.get("role", ""),
-    )
-
-    # Parse timeout settings
-    timeout_data = data.get("timeouts", {})
-    timeouts = TimeoutConfig(
-        connection_timeout=timeout_data.get("connection_timeout", 600),
-        query_timeout=timeout_data.get("query_timeout", 300),
-        login_timeout=timeout_data.get("login_timeout", 120),
-    )
-
-    # Parse cache settings
-    cache_data = data.get("cache", {})
-    cache = CacheConfig(
-        enabled=cache_data.get("enabled", True),
-        directory=cache_data.get("directory", "data/cache"),
-        ttl_seconds=cache_data.get("ttl_seconds", 86400),
-        ttl_current_data_seconds=cache_data.get("ttl_current_data_seconds", 3600),
-        max_size_mb=cache_data.get("max_size_mb", 500),
-    )
-
-    # Parse table references
-    tables_data = data.get("tables", {})
-    tables = TablesConfig(
-        activity=_parse_table_reference(tables_data.get("activity", {})),
-        patient=_parse_table_reference(tables_data.get("patient", {})),
-        medication=_parse_table_reference(tables_data.get("medication", {})),
-        organization=_parse_table_reference(tables_data.get("organization", {})),
-    )
-
-    # Parse query settings
-    query_data = data.get("query", {})
-    query = QueryConfig(
-        quote_identifiers=query_data.get("quote_identifiers", True),
-        test_limit=query_data.get("test_limit", 20),
-        max_rows=query_data.get("max_rows", 100000),
-        chunk_size=query_data.get("chunk_size", 10000),
-    )
-
-    return SnowflakeConfig(
-        connection=connection,
-        timeouts=timeouts,
-        cache=cache,
-        tables=tables,
-        query=query,
-    )
-
-
-# Module-level cached config (loaded on first access)
-_cached_config: Optional[SnowflakeConfig] = None
-
-
-def get_snowflake_config() -> SnowflakeConfig:
-    """
-    Get the Snowflake configuration (cached after first load).
-
-    Returns:
-        SnowflakeConfig dataclass with all settings.
-    """
-    global _cached_config
-    if _cached_config is None:
-        _cached_config = load_snowflake_config()
-    return _cached_config
-
-
-def reload_snowflake_config() -> SnowflakeConfig:
-    """
-    Reload the Snowflake configuration from disk.
-
-    Returns:
-        SnowflakeConfig dataclass with all settings.
-    """
-    global _cached_config
-    _cached_config = load_snowflake_config()
-    return _cached_config
-
-
-# Export public API
-__all__ = [
-    "SnowflakeConfig",
-    "ConnectionConfig",
-    "TimeoutConfig",
-    "CacheConfig",
-    "TableReference",
-    "TablesConfig",
-    "QueryConfig",
-    "load_snowflake_config",
-    "get_snowflake_config",
-    "reload_snowflake_config",
-]
@@ -1,129 +0,0 @@
-# Snowflake Configuration for NHS Patient Pathway Analysis
-#
-# This file contains connection settings for the Snowflake data warehouse.
-# IMPORTANT: This file should NOT be committed to version control if it contains
-# sensitive information. However, with externalbrowser auth, no passwords are stored.
-#
-# For NHS SSO authentication, the 'externalbrowser' authenticator opens a browser
-# window for authentication via NHS identity management.
-
-[connection]
-# Snowflake account identifier (e.g., "xy12345.uk-south.azure")
-# Ask your Snowflake administrator for the correct account name
-account = "ZK91403.uk-south.azure"
-
-# Default warehouse to use for queries
-# Common options: ANALYST_WH, COMPUTE_WH
-warehouse = "WH__XSMALL"
-
-# Default database for queries
-# DATA_HUB is the primary analyst-curated data warehouse
-database = "DATA_HUB"
-
-# Default schema (optional, can be overridden per query)
-schema = "DWH"
-
-# Authentication method
-# "externalbrowser" opens browser for NHS SSO (required for NHS environments)
-# Other options: "snowflake" (username/password), "oauth", "okta"
-authenticator = "externalbrowser"
-
-# User principal (email address for externalbrowser auth)
-# Leave empty to use current Windows user or prompt
-user = "ANDREW.CHARLWOOD@NHS.NET"
-
-# Role to use (optional, uses default role if empty)
-role = ""
-
-[timeouts]
-# Network timeout in seconds (how long client waits for Snowflake response)
-# Must be high enough for GP record lookups which can take 30-60s per batch
-connection_timeout = 600
-
-# Query execution timeout in seconds (for long-running queries)
-# Set to 0 for no timeout
-query_timeout = 300
-
-# Login timeout in seconds (for SSO browser auth)
-login_timeout = 120
-
-[cache]
-# Enable result caching
-enabled = true
-
-# Cache directory (relative to project root or absolute path)
-# Defaults to data/cache/ if not specified
-directory = "data/cache"
-
-# Time-to-live for cached results in seconds
-# 24 hours for historical data (86400 seconds)
-ttl_seconds = 86400
-
-# TTL for data that includes today's date (shorter)
-ttl_current_data_seconds = 3600
-
-# Maximum cache size in MB (oldest entries removed when exceeded)
-max_size_mb = 500
-
-[databases]
-# Quick reference for database purposes (read-only documentation)
-# DATA_HUB = "Analyst-curated data warehouse - primary source for most queries"
-# PRIMARY_CARE = "Raw extracts from EMIS and TPP clinical systems"
-# NATIONAL = "NHS England national datasets (SUS, ECDS, MHSDS, etc.)"
-# FACTS_AND_DIMENSIONS_ALL_DATA = "External reference data (BNF, SNOMED, QOF clusters)"
-# REPORTING_DATASETS_ICB = "Reporting outputs and analyst workspaces"
-
-# Tables commonly used for high-cost drug analysis
-[tables.activity]
-# Main activity data source (high-cost drug interventions)
-# Acute__Conmon__PatientLevelDrugs contains patient-level high-cost drug data
-database = "DATA_HUB"
-schema = "CDM"
-table = "Acute__Conmon__PatientLevelDrugs"
-key_columns = [
-    "PseudoNHSNoLinked",    # Pseudonymised NHS number for patient linking
-    "ProviderCode",          # NHS provider code (e.g., RM1, RGP)
-    "LocalPatientID",        # Local patient identifier within provider
-    "InterventionDate",      # Date of drug intervention
-    "DrugName",              # Drug name (raw, needs standardization)
-    "DrugSNOMEDCode",        # SNOMED code for drug
-    "PriceActual",           # Actual cost of intervention
-    "TreatmentFunctionCode", # NHS treatment function code
-    "TreatmentFunctionDesc", # Treatment function description
-    "AdditionalDetail1",     # Additional details (used for directory identification)
-]
-
-[tables.patient]
-# Patient demographics
-database = "DATA_HUB"
-schema = "DWH"
-view = "DimPerson"
-key_columns = ["PatientPseudonym", "PersonKey", "CurrentGeneralPractice"]
-
-[tables.medication]
-# Medication reference data
-database = "DATA_HUB"
-schema = "DWH"
-view = "DimMedicineAndDevice"
-key_columns = ["ProductSnomedCode", "TherapeuticMoietySnomedCode", "ProductDescription"]
-
-[tables.organization]
-# NHS organizations and GP practices
-database = "DATA_HUB"
-schema = "DWH"
-view = "DimOrganisationAndSite"
-key_columns = ["SiteCode", "OrganisationName"]
-
-[query]
-# Default query behaviors
-# Always double-quote identifiers for case-sensitivity
-quote_identifiers = true
-
-# Default row limit for test queries
-test_limit = 20
-
-# Maximum rows to fetch in a single query (prevents runaway queries)
-max_rows = 100000
-
-# Chunk size for large result sets
-chunk_size = 10000