refactor: slim pathways.db from 351 MB to 3.5 MB by removing unused tables
Drop fact_interventions (440K rows), mv_patient_treatment_summary (35K rows), ref_drug_snomed_mapping (144K rows), and processed_files — all unused since the app moved to pre-computed pathway_nodes.

Key changes:
- Rewrite load_data() to source from pathway_nodes + pathway_refresh_log
- Remove 7 dead methods and 8 dead state vars from pathways_app.py
- Delete patient_data.py, load_snomed_mapping.py, test_large_dataset_performance.py
- Remove SQLiteDataLoader (depended on fact_interventions)
- Remove file tracking schema (processed_files tracked fact_interventions loads)
- Remove legacy diagnosis functions from diagnosis_lookup.py
- Add source_row_count migration for pathway_refresh_log
- Clean all cross-references in __init__.py, data_source.py, migrate.py
This commit is contained in:
@@ -1,446 +0,0 @@
|
||||
"""
|
||||
Large dataset performance tests for the Patient Pathway Analysis tool.
|
||||
|
||||
This module tests the system's ability to handle realistic workloads:
|
||||
1. Full dataset analysis (all drugs, trusts, directories)
|
||||
2. Memory usage under load
|
||||
3. Scalability characteristics
|
||||
|
||||
Run with: python -m pytest tests/test_large_dataset_performance.py -v
|
||||
"""
|
||||
|
||||
import gc
|
||||
import time
|
||||
import tracemalloc
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Mark all tests in this module as large dataset tests
|
||||
pytestmark = pytest.mark.largedata
|
||||
|
||||
|
||||
class TestLargeDatasetPerformance:
    """Performance tests with full dataset.

    The autouse ``setup_paths`` fixture runs before every test and skips it
    when ``pathways.db`` is missing or the loader returns no rows.
    """

    @pytest.fixture(autouse=True)
    def setup_paths(self):
        """Set up paths, load the dataset, and skip when no data is available."""
        from core import default_paths
        from data_processing import get_loader

        # Check if database exists
        db_path = default_paths.data_dir / "pathways.db"
        if not db_path.exists():
            pytest.skip("SQLite database not found")

        self.paths = default_paths
        self.loader = get_loader('sqlite')

        # NOTE: this fixture has pytest's default (function) scope, so the
        # load below is repeated for every test in the class.
        result = self.loader.load()
        if result is None or result.df is None or len(result.df) == 0:
            pytest.skip("No data available in database")

        self.df = result.df
        self.row_count = result.row_count

    def _analysis_kwargs(self, title, max_trusts, max_drugs):
        """Build the keyword arguments for ``generate_icicle_chart``.

        Filters are derived from the loaded data so the tests exercise values
        that really occur in the database.

        Args:
            title: Chart title passed through to the analysis.
            max_trusts: Maximum number of provider codes / trust names to use.
            max_drugs: Maximum number of drug names to use.

        Returns:
            A dict suitable for ``generate_icicle_chart(**kwargs)``.
        """
        import pandas as pd

        provider_codes = self.df['Provider Code'].unique().tolist()[:max_trusts]
        drugs = self.df['Drug Name'].dropna().unique().tolist()[:max_drugs]
        directories = self.df['Directory'].dropna().unique().tolist()

        # Map provider codes to trust names via the org-codes CSV, which is
        # indexed by its second column.
        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
        trust_names = [
            org_codes.loc[code, 'Name']
            for code in provider_codes
            if code in org_codes.index
        ]
        if not trust_names:
            # No provider code matched the CSV index: fall back to the first
            # names listed in the CSV so the filter is never empty.
            trust_names = org_codes['Name'].tolist()[:max_trusts]

        return {
            'df': self.df,
            'start_date': "2020-01-01",
            'end_date': "2025-01-01",
            'last_seen_date': "2020-01-01",
            'trust_filter': trust_names,
            'drug_filter': drugs,
            'directory_filter': directories,
            'minimum_num_patients': 1,
            'title': title,
            'paths': self.paths,
        }

    def test_data_load_time_acceptable(self):
        """Data loading should complete in under 5 seconds."""
        from data_processing import get_loader

        gc.collect()
        start = time.perf_counter()
        loader = get_loader('sqlite')
        result = loader.load()
        elapsed = time.perf_counter() - start

        assert result is not None, "Data loading failed"
        assert result.row_count > 0, "No data loaded"
        # Allow 5 seconds for data loading
        assert elapsed < 5.0, f"Data loading took {elapsed:.2f}s (target: <5s)"

    def test_analysis_pipeline_completes(self):
        """Full analysis pipeline should complete without error."""
        from analysis.pathway_analyzer import generate_icicle_chart

        kwargs = self._analysis_kwargs("Large Dataset Test", max_trusts=20, max_drugs=10)
        ice_df, title = generate_icicle_chart(**kwargs)

        # Should produce some results
        assert ice_df is not None, "Analysis produced no results"
        assert len(ice_df) > 0, "Analysis produced empty results"

    def test_analysis_pipeline_time_acceptable(self):
        """Analysis pipeline should complete in under 60 seconds."""
        from analysis.pathway_analyzer import generate_icicle_chart

        # Build the filters before starting the clock so only the analysis
        # itself is timed.
        kwargs = self._analysis_kwargs("Performance Test", max_trusts=20, max_drugs=10)

        gc.collect()
        start = time.perf_counter()
        ice_df, title = generate_icicle_chart(**kwargs)
        elapsed = time.perf_counter() - start

        # Allow 60 seconds for full analysis (observed ~19s with 440K rows)
        assert elapsed < 60.0, f"Analysis took {elapsed:.2f}s (target: <60s)"
        print(f"\n Analysis completed in {elapsed:.2f}s with {len(ice_df) if ice_df is not None else 0} result rows")

    def test_memory_usage_acceptable(self):
        """Memory usage should not exceed 500MB during analysis."""
        from analysis.pathway_analyzer import generate_icicle_chart

        kwargs = self._analysis_kwargs("Memory Test", max_trusts=15, max_drugs=5)

        gc.collect()
        tracemalloc.start()
        try:
            ice_df, title = generate_icicle_chart(**kwargs)
            current, peak = tracemalloc.get_traced_memory()
        finally:
            # Always stop tracing, even when the analysis raises; otherwise
            # tracemalloc stays enabled for the rest of the test session.
            tracemalloc.stop()

        peak_mb = peak / 1024 / 1024

        # Allow 500MB peak memory
        assert peak_mb < 500, f"Peak memory {peak_mb:.1f}MB exceeds 500MB limit"
        print(f"\n Peak memory usage: {peak_mb:.1f}MB")

    def test_figure_creation_scales(self):
        """Figure creation time should scale linearly with result size."""
        from visualization.plotly_generator import create_icicle_figure
        import pandas as pd
        import numpy as np

        # Test with different sizes
        sizes = [100, 500, 1000, 2000]
        times = []

        for n_rows in sizes:
            # Synthetic flat hierarchy: every row is a direct child of 'N&WICS'.
            sample_data = {
                'parents': ['N&WICS'] * n_rows,
                'ids': [f'N&WICS - Test{i}' for i in range(n_rows)],
                'labels': [f'Test{i}' for i in range(n_rows)],
                'value': np.random.randint(1, 100, n_rows),
                'colour': np.random.random(n_rows),
                'cost': np.random.randint(1000, 100000, n_rows),
                'costpp': np.random.randint(100, 10000, n_rows),
                'cost_pp_pa': [str(np.random.randint(100, 10000)) for _ in range(n_rows)],
                'First seen': pd.to_datetime(['2024-01-01'] * n_rows),
                'Last seen': pd.to_datetime(['2024-12-31'] * n_rows),
                'First seen (Parent)': ['2024-01-01'] * n_rows,
                'Last seen (Parent)': ['2024-12-31'] * n_rows,
                'average_spacing': ['Test spacing'] * n_rows,
                'avg_days': pd.to_timedelta([100] * n_rows, unit='D'),
            }
            sample_df = pd.DataFrame(sample_data)

            gc.collect()
            start = time.perf_counter()
            fig = create_icicle_figure(sample_df, f"Scale Test {n_rows}")
            elapsed = time.perf_counter() - start

            times.append(elapsed)

        # Check that time scaling is roughly linear (not exponential).
        # The largest input is 20x the smallest (2000 vs 100 rows); allowing
        # 3x the linear expectation means it may take at most 60x as long.
        time_ratio = times[-1] / times[0]
        size_ratio = sizes[-1] / sizes[0]

        # Allow 3x the expected linear scaling
        max_allowed_ratio = size_ratio * 3

        assert time_ratio < max_allowed_ratio, (
            f"Figure creation doesn't scale well: "
            f"{sizes[-1]} rows took {times[-1]:.3f}s vs {sizes[0]} rows at {times[0]:.3f}s "
            f"(ratio {time_ratio:.1f}x, expected <{max_allowed_ratio:.1f}x)"
        )

        print(f"\n Figure scaling: {sizes[0]} rows: {times[0]*1000:.1f}ms, "
              f"{sizes[-1]} rows: {times[-1]*1000:.1f}ms (ratio: {time_ratio:.1f}x)")
|
||||
|
||||
|
||||
class TestDataVolumeStress:
    """Stress tests to verify system handles various data volumes.

    The autouse ``setup_paths`` fixture runs before every test and skips it
    when ``pathways.db`` is missing or the loader returns no rows.
    """

    @pytest.fixture(autouse=True)
    def setup_paths(self):
        """Set up paths, load the dataset, and skip when no data is available."""
        from core import default_paths
        from data_processing import get_loader

        # Check if database exists
        db_path = default_paths.data_dir / "pathways.db"
        if not db_path.exists():
            pytest.skip("SQLite database not found")

        self.paths = default_paths
        self.loader = get_loader('sqlite')

        # NOTE: this fixture has pytest's default (function) scope, so the
        # load below is repeated for every test in the class.
        result = self.loader.load()
        if result is None or result.df is None or len(result.df) == 0:
            pytest.skip("No data available in database")

        self.df = result.df

    def _trust_names(self, limit=None):
        """Return trust names from the org-codes CSV.

        Args:
            limit: Maximum number of names to return; ``None`` returns all.
        """
        import pandas as pd

        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
        return org_codes['Name'].tolist()[:limit]

    def _directories(self):
        """Return every non-null directory present in the loaded data."""
        return self.df['Directory'].dropna().unique().tolist()

    def _run_icicle(self, title, minimum_num_patients=1,
                    start_date="2020-01-01", end_date="2025-01-01",
                    last_seen_date="2020-01-01"):
        """Run ``generate_icicle_chart`` with the first 10 trusts and 5 drugs.

        Returns:
            Whatever ``generate_icicle_chart`` returns; callers unpack it as
            an ``(ice_df, title)`` pair.
        """
        from analysis.pathway_analyzer import generate_icicle_chart

        return generate_icicle_chart(
            df=self.df,
            start_date=start_date,
            end_date=end_date,
            last_seen_date=last_seen_date,
            trust_filter=self._trust_names(10),
            drug_filter=self.df['Drug Name'].dropna().unique().tolist()[:5],
            directory_filter=self._directories(),
            minimum_num_patients=minimum_num_patients,
            title=title,
            paths=self.paths,
        )

    def test_handles_all_drugs(self):
        """Analysis can handle filtering by all drugs."""
        from analysis.pathway_analyzer import prepare_data

        all_drugs = self.df['Drug Name'].dropna().unique().tolist()

        result = prepare_data(
            df=self.df,
            trust_filter=self._trust_names(5),
            drug_filter=all_drugs,
            directory_filter=self._directories(),
            paths=self.paths,
        )

        # Should complete without error (returns tuple)
        assert result is not None
        assert len(result) == 3  # (df, org_codes, directory_df)

    def test_handles_all_trusts(self):
        """Analysis can handle filtering by all trusts."""
        from analysis.pathway_analyzer import prepare_data

        result = prepare_data(
            df=self.df,
            trust_filter=self._trust_names(),
            drug_filter=['ADALIMUMAB', 'ETANERCEPT'],
            directory_filter=self._directories(),
            paths=self.paths,
        )

        # Should complete without error (returns tuple)
        assert result is not None
        assert len(result) == 3  # (df, org_codes, directory_df)

    def test_handles_wide_date_range(self):
        """Analysis can handle a wide date range via generate_icicle_chart."""
        # Use very wide date range via full pipeline
        ice_df, title = self._run_icicle(
            title="Wide Date Range Test",
            start_date="2010-01-01",
            end_date="2030-01-01",
            last_seen_date="2010-01-01",
        )

        # Reaching this line means no exception was raised. The previous
        # check (`x is not None or x is None`) could never fail; instead
        # verify the result is either absent or a sized collection, which is
        # how the other tests in this module consume it.
        assert ice_df is None or len(ice_df) >= 0

    def test_handles_minimum_patient_threshold(self):
        """Analysis correctly applies minimum patient threshold."""
        row_counts = {}
        for threshold in (50, 1):
            ice_df, _ = self._run_icicle(
                title=f"Threshold Test {threshold}",
                minimum_num_patients=threshold,
            )
            # A missing result counts as zero rows.
            row_counts[threshold] = len(ice_df) if ice_df is not None else 0

        len_50 = row_counts[50]
        len_1 = row_counts[1]

        # Higher threshold should produce fewer or equal results
        assert len_50 <= len_1, (
            f"Higher minimum threshold should produce fewer results: "
            f"min=50 gave {len_50} rows, min=1 gave {len_1} rows"
        )
|
||||
|
||||
|
||||
class TestConcurrentOperations:
    """Tests for handling multiple operations.

    The operations are run one after another in the same process; the tests
    check that repeating them gives consistent results.
    """

    @pytest.fixture(autouse=True)
    def setup_paths(self):
        """Set up paths and skip the test when the database is missing."""
        from core import default_paths

        # Check if database exists
        db_path = default_paths.data_dir / "pathways.db"
        if not db_path.exists():
            pytest.skip("SQLite database not found")

        self.paths = default_paths

    def test_multiple_data_loads(self):
        """Multiple data loads should not cause issues."""
        from data_processing import get_loader

        row_counts = []
        for attempt in range(1, 4):
            result = get_loader('sqlite').load()
            # Fail loudly on a missing result: silently skipping it would let
            # a partially failing loader pass the consistency check below.
            assert result is not None, f"Data load {attempt} returned no result"
            row_counts.append(result.row_count)

        # All loads should return same row count
        assert len(set(row_counts)) == 1, f"Inconsistent row counts: {row_counts}"

    def test_sequential_analyses(self):
        """Multiple sequential analyses should complete."""
        from analysis.pathway_analyzer import generate_icicle_chart
        from data_processing import get_loader
        import pandas as pd

        # Load data
        result = get_loader('sqlite').load()
        if result is None or result.df is None:
            pytest.skip("No data available")

        df = result.df

        # Load org codes
        org_codes = pd.read_csv(self.paths.org_codes_csv, index_col=1)
        trust_names = org_codes['Name'].tolist()[:5]

        # Run multiple analyses
        for i in range(3):
            ice_df, title = generate_icicle_chart(
                df=df,
                start_date="2020-01-01",
                end_date="2025-01-01",
                last_seen_date="2020-01-01",
                trust_filter=trust_names,
                drug_filter=['ADALIMUMAB'],
                directory_filter=df['Directory'].dropna().unique().tolist(),
                minimum_num_patients=1,
                title=f"Sequential Test {i+1}",
                paths=self.paths,
            )

            # Each run should complete without raising. The previous check
            # (`x is not None or x is None`) could never fail; instead verify
            # the result is either absent or a sized collection.
            assert ice_df is None or len(ice_df) >= 0
|
||||
Reference in New Issue
Block a user