Initial commit before Ralph loop

2026-02-04 13:04:29 +00:00
commit fdd33a67af
89 changed files with 20660 additions and 0 deletions
@@ -0,0 +1,446 @@
+"""
+Large dataset performance tests for the Patient Pathway Analysis tool.
+
+This module tests the system's ability to handle realistic workloads:
+1. Full dataset analysis (all drugs, trusts, directories)
+2. Memory usage under load
+3. Scalability characteristics
+
+Run with: python -m pytest tests/test_large_dataset_performance.py -v
+"""
+
+import gc
+import time
+import tracemalloc
+from datetime import date
+from pathlib import Path
+
+import pytest
+
+# Mark all tests in this module as large dataset tests
+pytestmark = pytest.mark.largedata
+
+
+class TestLargeDatasetPerformance:
+    """Performance tests with full dataset."""
+
+    @pytest.fixture(autouse=True)
+    def setup_paths(self):
+        """Set up paths and verify data exists."""
+        from core import default_paths
+        from data_processing import get_loader
+
+        # Check if database exists
+        db_path = default_paths.data_dir / "pathways.db"
+        if not db_path.exists():
+            pytest.skip("SQLite database not found")
+
+        self.paths = default_paths
+        self.loader = get_loader('sqlite')
+
+        # Load data once
+        result = self.loader.load()
+        if result is None or result.df is None or len(result.df) == 0:
+            pytest.skip("No data available in database")
+
+        self.df = result.df
+        self.row_count = result.row_count
+
+    def test_data_load_time_acceptable(self):
+        """Data loading should complete in under 5 seconds."""
+        from data_processing import get_loader
+
+        gc.collect()
+        start = time.perf_counter()
+        loader = get_loader('sqlite')
+        result = loader.load()
+        elapsed = time.perf_counter() - start
+
+        assert result is not None, "Data loading failed"
+        assert result.row_count > 0, "No data loaded"
+        # Allow 5 seconds for data loading
+        assert elapsed < 5.0, f"Data loading took {elapsed:.2f}s (target: <5s)"
+
+    def test_analysis_pipeline_completes(self):
+        """Full analysis pipeline should complete without error."""
+        from analysis.pathway_analyzer import generate_icicle_chart
+        import pandas as pd
+
+        # Get available filters from actual data
+        trusts = self.df['Provider Code'].unique().tolist()[:20]
+        drugs = self.df['Drug Name'].dropna().unique().tolist()[:10]
+        directories = self.df['Directory'].dropna().unique().tolist()
+
+        # Load org codes for trust name mapping
+        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
+        trust_names = []
+        for t in trusts:
+            if t in org_codes.index:
+                trust_names.append(org_codes.loc[t, 'Name'])
+        if not trust_names:
+            trust_names = org_codes['Name'].tolist()[:20]
+
+        # Run analysis with reasonable filter
+        ice_df, title = generate_icicle_chart(
+            df=self.df,
+            start_date="2020-01-01",
+            end_date="2025-01-01",
+            last_seen_date="2020-01-01",
+            trust_filter=trust_names,
+            drug_filter=drugs,
+            directory_filter=directories,
+            minimum_num_patients=1,
+            title="Large Dataset Test",
+            paths=self.paths,
+        )
+
+        # Should produce some results
+        assert ice_df is not None, "Analysis produced no results"
+        assert len(ice_df) > 0, "Analysis produced empty results"
+
+    def test_analysis_pipeline_time_acceptable(self):
+        """Analysis pipeline should complete in under 60 seconds."""
+        from analysis.pathway_analyzer import generate_icicle_chart
+        import pandas as pd
+
+        # Get available filters from actual data
+        trusts = self.df['Provider Code'].unique().tolist()[:20]
+        drugs = self.df['Drug Name'].dropna().unique().tolist()[:10]
+        directories = self.df['Directory'].dropna().unique().tolist()
+
+        # Load org codes for trust name mapping
+        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
+        trust_names = []
+        for t in trusts:
+            if t in org_codes.index:
+                trust_names.append(org_codes.loc[t, 'Name'])
+        if not trust_names:
+            trust_names = org_codes['Name'].tolist()[:20]
+
+        gc.collect()
+        start = time.perf_counter()
+
+        ice_df, title = generate_icicle_chart(
+            df=self.df,
+            start_date="2020-01-01",
+            end_date="2025-01-01",
+            last_seen_date="2020-01-01",
+            trust_filter=trust_names,
+            drug_filter=drugs,
+            directory_filter=directories,
+            minimum_num_patients=1,
+            title="Performance Test",
+            paths=self.paths,
+        )
+
+        elapsed = time.perf_counter() - start
+
+        # Allow 60 seconds for full analysis (observed ~19s with 440K rows)
+        assert elapsed < 60.0, f"Analysis took {elapsed:.2f}s (target: <60s)"
+        print(f"\n  Analysis completed in {elapsed:.2f}s with {len(ice_df) if ice_df is not None else 0} result rows")
+
+    def test_memory_usage_acceptable(self):
+        """Memory usage should not exceed 500MB during analysis."""
+        from analysis.pathway_analyzer import generate_icicle_chart
+        import pandas as pd
+
+        # Get available filters from actual data
+        trusts = self.df['Provider Code'].unique().tolist()[:15]
+        drugs = self.df['Drug Name'].dropna().unique().tolist()[:5]
+        directories = self.df['Directory'].dropna().unique().tolist()
+
+        # Load org codes for trust name mapping
+        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
+        trust_names = []
+        for t in trusts:
+            if t in org_codes.index:
+                trust_names.append(org_codes.loc[t, 'Name'])
+        if not trust_names:
+            trust_names = org_codes['Name'].tolist()[:15]
+
+        gc.collect()
+        tracemalloc.start()
+
+        ice_df, title = generate_icicle_chart(
+            df=self.df,
+            start_date="2020-01-01",
+            end_date="2025-01-01",
+            last_seen_date="2020-01-01",
+            trust_filter=trust_names,
+            drug_filter=drugs,
+            directory_filter=directories,
+            minimum_num_patients=1,
+            title="Memory Test",
+            paths=self.paths,
+        )
+
+        current, peak = tracemalloc.get_traced_memory()
+        tracemalloc.stop()
+
+        peak_mb = peak / 1024 / 1024
+
+        # Allow 500MB peak memory
+        assert peak_mb < 500, f"Peak memory {peak_mb:.1f}MB exceeds 500MB limit"
+        print(f"\n  Peak memory usage: {peak_mb:.1f}MB")
+
+    def test_figure_creation_scales(self):
+        """Figure creation time should scale linearly with result size."""
+        from visualization.plotly_generator import create_icicle_figure
+        import pandas as pd
+        import numpy as np
+
+        # Test with different sizes
+        sizes = [100, 500, 1000, 2000]
+        times = []
+
+        for n_rows in sizes:
+            sample_data = {
+                'parents': ['N&WICS'] * n_rows,
+                'ids': [f'N&WICS - Test{i}' for i in range(n_rows)],
+                'labels': [f'Test{i}' for i in range(n_rows)],
+                'value': np.random.randint(1, 100, n_rows),
+                'colour': np.random.random(n_rows),
+                'cost': np.random.randint(1000, 100000, n_rows),
+                'costpp': np.random.randint(100, 10000, n_rows),
+                'cost_pp_pa': [str(np.random.randint(100, 10000)) for _ in range(n_rows)],
+                'First seen': pd.to_datetime(['2024-01-01'] * n_rows),
+                'Last seen': pd.to_datetime(['2024-12-31'] * n_rows),
+                'First seen (Parent)': ['2024-01-01'] * n_rows,
+                'Last seen (Parent)': ['2024-12-31'] * n_rows,
+                'average_spacing': ['Test spacing'] * n_rows,
+                'avg_days': pd.to_timedelta([100] * n_rows, unit='D'),
+            }
+            sample_df = pd.DataFrame(sample_data)
+
+            gc.collect()
+            start = time.perf_counter()
+            fig = create_icicle_figure(sample_df, f"Scale Test {n_rows}")
+            elapsed = time.perf_counter() - start
+
+            times.append(elapsed)
+
+        # Check that time scaling is roughly linear (not exponential)
+        # If time doubles when size doubles, it's linear
+        # We allow some variance, so check that 10x data doesn't take more than 20x time
+        time_ratio = times[-1] / times[0]
+        size_ratio = sizes[-1] / sizes[0]
+
+        # Allow 3x the expected linear scaling
+        max_allowed_ratio = size_ratio * 3
+
+        assert time_ratio < max_allowed_ratio, (
+            f"Figure creation doesn't scale well: "
+            f"{sizes[-1]} rows took {times[-1]:.3f}s vs {sizes[0]} rows at {times[0]:.3f}s "
+            f"(ratio {time_ratio:.1f}x, expected <{max_allowed_ratio:.1f}x)"
+        )
+
+        print(f"\n  Figure scaling: {sizes[0]} rows: {times[0]*1000:.1f}ms, "
+              f"{sizes[-1]} rows: {times[-1]*1000:.1f}ms (ratio: {time_ratio:.1f}x)")
+
+
+class TestDataVolumeStress:
+    """Stress tests to verify system handles various data volumes."""
+
+    @pytest.fixture(autouse=True)
+    def setup_paths(self):
+        """Set up paths and verify data exists."""
+        from core import default_paths
+        from data_processing import get_loader
+
+        # Check if database exists
+        db_path = default_paths.data_dir / "pathways.db"
+        if not db_path.exists():
+            pytest.skip("SQLite database not found")
+
+        self.paths = default_paths
+        self.loader = get_loader('sqlite')
+
+        # Load data once
+        result = self.loader.load()
+        if result is None or result.df is None or len(result.df) == 0:
+            pytest.skip("No data available in database")
+
+        self.df = result.df
+
+    def test_handles_all_drugs(self):
+        """Analysis can handle filtering by all drugs."""
+        from analysis.pathway_analyzer import prepare_data
+        import pandas as pd
+
+        all_drugs = self.df['Drug Name'].dropna().unique().tolist()
+
+        # Load org codes
+        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
+        trust_names = org_codes['Name'].tolist()[:5]
+
+        result = prepare_data(
+            df=self.df,
+            trust_filter=trust_names,
+            drug_filter=all_drugs,
+            directory_filter=self.df['Directory'].dropna().unique().tolist(),
+            paths=self.paths,
+        )
+
+        # Should complete without error (returns tuple)
+        assert result is not None
+        assert len(result) == 3  # (df, org_codes, directory_df)
+
+    def test_handles_all_trusts(self):
+        """Analysis can handle filtering by all trusts."""
+        from analysis.pathway_analyzer import prepare_data
+        import pandas as pd
+
+        # Load org codes
+        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
+        all_trust_names = org_codes['Name'].tolist()
+
+        result = prepare_data(
+            df=self.df,
+            trust_filter=all_trust_names,
+            drug_filter=['ADALIMUMAB', 'ETANERCEPT'],
+            directory_filter=self.df['Directory'].dropna().unique().tolist(),
+            paths=self.paths,
+        )
+
+        # Should complete without error (returns tuple)
+        assert result is not None
+        assert len(result) == 3  # (df, org_codes, directory_df)
+
+    def test_handles_wide_date_range(self):
+        """Analysis can handle a wide date range via generate_icicle_chart."""
+        from analysis.pathway_analyzer import generate_icicle_chart
+        import pandas as pd
+
+        # Load org codes
+        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
+        trust_names = org_codes['Name'].tolist()[:10]
+
+        # Use very wide date range via full pipeline
+        ice_df, title = generate_icicle_chart(
+            df=self.df,
+            start_date="2010-01-01",
+            end_date="2030-01-01",
+            last_seen_date="2010-01-01",
+            trust_filter=trust_names,
+            drug_filter=self.df['Drug Name'].dropna().unique().tolist()[:5],
+            directory_filter=self.df['Directory'].dropna().unique().tolist(),
+            minimum_num_patients=1,
+            title="Wide Date Range Test",
+            paths=self.paths,
+        )
+
+        # Should complete without error
+        assert ice_df is not None or ice_df is None  # Just verifying no exception
+
+    def test_handles_minimum_patient_threshold(self):
+        """Analysis correctly applies minimum patient threshold."""
+        from analysis.pathway_analyzer import generate_icicle_chart
+        import pandas as pd
+
+        # Load org codes
+        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
+        trust_names = org_codes['Name'].tolist()[:10]
+
+        # Run with minimum 50 patients
+        ice_df_50, _ = generate_icicle_chart(
+            df=self.df,
+            start_date="2020-01-01",
+            end_date="2025-01-01",
+            last_seen_date="2020-01-01",
+            trust_filter=trust_names,
+            drug_filter=self.df['Drug Name'].dropna().unique().tolist()[:5],
+            directory_filter=self.df['Directory'].dropna().unique().tolist(),
+            minimum_num_patients=50,
+            title="Threshold Test 50",
+            paths=self.paths,
+        )
+
+        # Run with minimum 1 patient
+        ice_df_1, _ = generate_icicle_chart(
+            df=self.df,
+            start_date="2020-01-01",
+            end_date="2025-01-01",
+            last_seen_date="2020-01-01",
+            trust_filter=trust_names,
+            drug_filter=self.df['Drug Name'].dropna().unique().tolist()[:5],
+            directory_filter=self.df['Directory'].dropna().unique().tolist(),
+            minimum_num_patients=1,
+            title="Threshold Test 1",
+            paths=self.paths,
+        )
+
+        # Higher threshold should produce fewer or equal results
+        len_50 = len(ice_df_50) if ice_df_50 is not None else 0
+        len_1 = len(ice_df_1) if ice_df_1 is not None else 0
+
+        assert len_50 <= len_1, (
+            f"Higher minimum threshold should produce fewer results: "
+            f"min=50 gave {len_50} rows, min=1 gave {len_1} rows"
+        )
+
+
+class TestConcurrentOperations:
+    """Tests for handling multiple operations."""
+
+    @pytest.fixture(autouse=True)
+    def setup_paths(self):
+        """Set up paths and verify data exists."""
+        from core import default_paths
+        from data_processing import get_loader
+
+        # Check if database exists
+        db_path = default_paths.data_dir / "pathways.db"
+        if not db_path.exists():
+            pytest.skip("SQLite database not found")
+
+        self.paths = default_paths
+
+    def test_multiple_data_loads(self):
+        """Multiple data loads should not cause issues."""
+        from data_processing import get_loader
+
+        results = []
+        for i in range(3):
+            loader = get_loader('sqlite')
+            result = loader.load()
+            if result is not None:
+                results.append(result.row_count)
+
+        # All loads should return same row count
+        assert len(set(results)) == 1, f"Inconsistent row counts: {results}"
+
+    def test_sequential_analyses(self):
+        """Multiple sequential analyses should complete."""
+        from analysis.pathway_analyzer import generate_icicle_chart
+        from data_processing import get_loader
+        import pandas as pd
+
+        # Load data
+        loader = get_loader('sqlite')
+        result = loader.load()
+        if result is None or result.df is None:
+            pytest.skip("No data available")
+
+        df = result.df
+
+        # Load org codes
+        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
+        trust_names = org_codes['Name'].tolist()[:5]
+
+        # Run multiple analyses
+        for i in range(3):
+            ice_df, title = generate_icicle_chart(
+                df=df,
+                start_date="2020-01-01",
+                end_date="2025-01-01",
+                last_seen_date="2020-01-01",
+                trust_filter=trust_names,
+                drug_filter=['ADALIMUMAB'],
+                directory_filter=df['Directory'].dropna().unique().tolist(),
+                minimum_num_patients=1,
+                title=f"Sequential Test {i+1}",
+                paths=self.paths,
+            )
+
+            # Each should complete
+            assert ice_df is not None or ice_df is None  # Just check no error