refactor: reorganize repository to src/ layout

Move 6 packages (core, config, data_processing, analysis, visualization, cli) into src/ to reduce root clutter. Merge tools/data.py into data_processing/transforms.py. Move docs to docs/. Resolve import paths via a .pth file (setup_dev.py), the pytest pythonpath config, and a sys.path bootstrap in rxconfig.py and the CLI entry points. Clean up pyproject.toml deps (remove stale pins, add snowflake-connector-python). Fix tomllib import for Python 3.10 compatibility. All 113 tests pass.
2026-02-06 12:03:48 +00:00
parent 1581b1d3dd
commit 76838887e6
40 changed files with 589 additions and 214 deletions
@@ -1,246 +0,0 @@
-"""
-Data loader abstractions for NHS High-Cost Drug Patient Pathway Analysis Tool.
-
-Provides a unified interface for loading patient intervention data from:
- CSV/Parquet files (current behavior)
- SQLite database (new, faster approach)
- Snowflake (future, direct from warehouse)
-
-The DataLoader ABC defines the contract for all loader implementations.
-"""
-
-from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Optional
-
-import pandas as pd
-
-from core import PathConfig, default_paths
-from core.logging_config import get_logger
-
-logger = get_logger(__name__)
-
-
-@dataclass
-class LoadResult:
-    """Result of a data load operation.
-
-    Attributes:
-        df: The loaded DataFrame with processed patient intervention data
-        source: Description of the data source (e.g., "file:/path/to/file.csv")
-        row_count: Number of rows loaded
-        columns: List of column names in the DataFrame
-        load_time_seconds: Time taken to load the data
-    """
-    df: pd.DataFrame
-    source: str
-    row_count: int
-    columns: list[str] = field(default_factory=list)
-    load_time_seconds: float = 0.0
-
-    def __post_init__(self):
-        if not self.columns:
-            self.columns = list(self.df.columns)
-
-
-# Expected columns in a processed DataFrame
-# These are the columns that generate_graph() expects to receive
-REQUIRED_COLUMNS = [
-    "UPID",           # Unique Patient ID (Provider Code prefix + PersonKey)
-    "Drug Name",      # Standardized drug name
-    "Intervention Date",  # Date of intervention
-    "Price Actual",   # Cost of intervention
-    "OrganisationName",  # NHS Trust name
-    "Directory",      # Medical specialty/directory
-    "Provider Code",  # NHS provider code
-    "PersonKey",      # Patient identifier within provider
-]
-
-# Additional columns that are useful but not strictly required
-OPTIONAL_COLUMNS = [
-    "UPIDTreatment",  # UPID + Drug Name combo (created by generate_graph)
-    "Treatment Function Code",  # NHS treatment function code
-    "Additional Detail 1",
-    "Additional Detail 2",
-    "Additional Detail 3",
-    "Additional Detail 4",
-    "Additional Detail 5",
-]
-
-
-class DataLoader(ABC):
-    """Abstract base class for data loaders.
-
-    All data loaders must implement the load() method which returns
-    a DataFrame ready for use by generate_graph().
-
-    The returned DataFrame must contain REQUIRED_COLUMNS at minimum.
-    """
-
-    @abstractmethod
-    def load(self) -> LoadResult:
-        """Load and process patient intervention data.
-
-        Returns:
-            LoadResult containing the processed DataFrame and metadata.
-            The DataFrame must contain all REQUIRED_COLUMNS.
-
-        Raises:
-            FileNotFoundError: If the data source doesn't exist
-            ValueError: If the data is malformed or missing required columns
-        """
-        pass
-
-    @abstractmethod
-    def validate_source(self) -> tuple[bool, str]:
-        """Check if the data source is valid and accessible.
-
-        Returns:
-            Tuple of (is_valid, message).
-            If is_valid is False, message explains the issue.
-        """
-        pass
-
-    @property
-    @abstractmethod
-    def source_description(self) -> str:
-        """Human-readable description of the data source."""
-        pass
-
-    def validate_dataframe(self, df: pd.DataFrame) -> tuple[bool, list[str]]:
-        """Validate that a DataFrame has all required columns.
-
-        Args:
-            df: DataFrame to validate
-
-        Returns:
-            Tuple of (is_valid, missing_columns).
-            If is_valid is False, missing_columns lists what's missing.
-        """
-        missing = [col for col in REQUIRED_COLUMNS if col not in df.columns]
-        return len(missing) == 0, missing
-
-
-class FileDataLoader(DataLoader):
-    """Loads data from CSV or Parquet files.
-
-    This replicates the current behavior of dashboard_gui.main():
-    1. Read CSV or Parquet file
-    2. Apply patient_id() transformation
-    3. Convert dates
-    4. Apply drug_names() standardization
-    5. Clean organization names
-    6. Apply department_identification()
-
-    Args:
-        file_path: Path to the CSV or Parquet file
-        paths: PathConfig for reference data file locations (uses default_paths if None)
-    """
-
-    def __init__(
-        self,
-        file_path: Path | str,
-        paths: Optional[PathConfig] = None,
-    ):
-        self.file_path = Path(file_path)
-        self.paths = paths or default_paths
-
-    def validate_source(self) -> tuple[bool, str]:
-        """Check if the file exists and has a supported extension."""
-        if not self.file_path.exists():
-            return False, f"File not found: {self.file_path}"
-
-        ext = self.file_path.suffix.lower()
-        if ext not in ('.csv', '.parquet'):
-            return False, f"Unsupported file type: {ext}. Must be .csv or .parquet"
-
-        return True, "OK"
-
-    @property
-    def source_description(self) -> str:
-        return f"file:{self.file_path}"
-
-    def load(self) -> LoadResult:
-        """Load and process data from CSV or Parquet file.
-
-        Applies the same transformation pipeline as the original
-        dashboard_gui.main() function.
-        """
-        import time
-        from tools import data
-
-        start_time = time.time()
-
-        # Validate source before loading
-        is_valid, msg = self.validate_source()
-        if not is_valid:
-            raise FileNotFoundError(msg)
-
-        # Read file based on extension
-        ext = self.file_path.suffix.lower()
-        logger.info(f"Reading {ext} file: {self.file_path}")
-
-        if ext == '.csv':
-            df_raw = pd.read_csv(self.file_path, low_memory=False)
-        else:  # .parquet
-            df_raw = pd.read_parquet(self.file_path)
-
-        logger.info(f"File read successfully. {len(df_raw)} rows.")
-
-        # Apply transformations (same as dashboard_gui.main())
-        df = data.patient_id(df_raw)
-        logger.info("Patient ID processing complete.")
-
-        df['Intervention Date'] = pd.to_datetime(df['Intervention Date'], format="%Y-%m-%d")
-        logger.info("Date conversion complete.")
-
-        # Preserve original drug name before standardization (for SQLite storage)
-        df['Drug Name Raw'] = df['Drug Name'].copy()
-
-        df = data.drug_names(df, self.paths)
-        logger.info("Drug name processing complete.")
-
-        df['OrganisationName'] = df['OrganisationName'].str.replace(',', '')
-        logger.info("Organisation name cleaning complete.")
-
-        df = data.department_identification(df, self.paths)
-        logger.info("Department identification complete.")
-
-        # Validate result
-        is_valid, missing = self.validate_dataframe(df)
-        if not is_valid:
-            raise ValueError(f"Processed DataFrame missing required columns: {missing}")
-
-        load_time = time.time() - start_time
-        logger.info(f"Data loading complete. {len(df)} rows in {load_time:.2f}s")
-
-        return LoadResult(
-            df=df,
-            source=self.source_description,
-            row_count=len(df),
-            load_time_seconds=load_time,
-        )
-
-
-def get_loader(
-    source: str | Path,
-    paths: Optional[PathConfig] = None,
-    **kwargs
-) -> DataLoader:
-    """Factory function to create the appropriate DataLoader.
-
-    Args:
-        source: File path (CSV/Parquet)
-        paths: PathConfig for reference data (used by FileDataLoader)
-        **kwargs: Additional arguments passed to the loader constructor
-
-    Returns:
-        Appropriate DataLoader instance
-
-    Examples:
-        >>> loader = get_loader("data/activity.csv")
-        >>> loader = get_loader("data/activity.parquet")
-    """
-    path = Path(source)
-    return FileDataLoader(file_path=path, paths=paths)