refactor: slim pathways.db from 351 MB to 3.5 MB by removing unused tables
Drop fact_interventions (440K rows), mv_patient_treatment_summary (35K rows), ref_drug_snomed_mapping (144K rows), and processed_files — all unused since the app moved to pre-computed pathway_nodes. Key changes: - Rewrite load_data() to source from pathway_nodes + pathway_refresh_log - Remove 7 dead methods and 8 dead state vars from pathways_app.py - Delete patient_data.py, load_snomed_mapping.py, test_large_dataset_performance.py - Remove SQLiteDataLoader (depended on fact_interventions) - Remove file tracking schema (processed_files tracked fact_interventions loads) - Remove legacy diagnosis functions from diagnosis_lookup.py - Add source_row_count migration for pathway_refresh_log - Clean all cross-references in __init__.py, data_source.py, migrate.py
This commit is contained in:
+11
-155
@@ -35,6 +35,7 @@ from data_processing.schema import (
|
||||
verify_all_tables_exist,
|
||||
get_all_table_counts,
|
||||
migrate_pathway_nodes_chart_type,
|
||||
migrate_refresh_log_source_row_count,
|
||||
)
|
||||
from data_processing.reference_data import (
|
||||
MigrationResult,
|
||||
@@ -49,12 +50,6 @@ from data_processing.reference_data import (
|
||||
verify_drug_directory_map_migration,
|
||||
verify_drug_indication_clusters_migration,
|
||||
)
|
||||
from data_processing.patient_data import (
|
||||
load_patient_data,
|
||||
refresh_patient_treatment_summary,
|
||||
get_patient_data_stats,
|
||||
verify_mv_consistency,
|
||||
)
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -67,9 +62,8 @@ def initialize_database(
|
||||
"""
|
||||
Initialize the database with all required tables.
|
||||
|
||||
Creates all tables defined in the schema (reference tables, fact tables,
|
||||
materialized views, and file tracking tables). Uses IF NOT EXISTS so
|
||||
safe to run multiple times.
|
||||
Creates all tables defined in the schema (reference tables and pathway
|
||||
tables). Uses IF NOT EXISTS so safe to run multiple times.
|
||||
|
||||
Args:
|
||||
db_manager: DatabaseManager instance. Uses default if not provided.
|
||||
@@ -122,6 +116,14 @@ def initialize_database(
|
||||
else:
|
||||
logger.error(f"pathway_nodes migration failed: {msg}")
|
||||
return False
|
||||
|
||||
# Add source_row_count column to pathway_refresh_log if it doesn't exist
|
||||
success, msg = migrate_refresh_log_source_row_count(conn)
|
||||
if success:
|
||||
logger.info(f"pathway_refresh_log migration: {msg}")
|
||||
else:
|
||||
logger.error(f"pathway_refresh_log migration failed: {msg}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"Migration failed: {e}")
|
||||
return False
|
||||
@@ -274,107 +276,6 @@ def create_progress_reporter(description: str = "Loading", width: int = 40):
|
||||
return report_progress
|
||||
|
||||
|
||||
def load_patient_data_cli(
|
||||
file_path: Path,
|
||||
db_manager: Optional[DatabaseManager] = None,
|
||||
paths: Optional[PathConfig] = None,
|
||||
force: bool = False,
|
||||
refresh_mv: bool = True
|
||||
) -> bool:
|
||||
"""
|
||||
Load patient data from file with CLI progress reporting.
|
||||
|
||||
Args:
|
||||
file_path: Path to CSV or Parquet file.
|
||||
db_manager: DatabaseManager instance. Uses default if not provided.
|
||||
paths: PathConfig for reference data. Uses default if not provided.
|
||||
force: If True, re-process even if file hash matches.
|
||||
refresh_mv: If True, refresh the materialized view after loading.
|
||||
|
||||
Returns:
|
||||
True if loading succeeded, False otherwise.
|
||||
"""
|
||||
if db_manager is None:
|
||||
db_manager = DatabaseManager()
|
||||
if paths is None:
|
||||
paths = default_paths
|
||||
|
||||
print(f"\n=== Loading Patient Data ===\n")
|
||||
print(f"File: {file_path}")
|
||||
|
||||
# Check file exists
|
||||
if not file_path.exists():
|
||||
print(f"ERROR: File not found: {file_path}")
|
||||
return False
|
||||
|
||||
# Calculate and display file info
|
||||
file_size_mb = file_path.stat().st_size / (1024 * 1024)
|
||||
print(f"Size: {file_size_mb:.1f} MB")
|
||||
print()
|
||||
|
||||
# Create progress callback
|
||||
progress_callback = create_progress_reporter("Loading rows", width=40)
|
||||
|
||||
# Load the data
|
||||
result = load_patient_data(
|
||||
file_path=file_path,
|
||||
db_manager=db_manager,
|
||||
paths=paths,
|
||||
batch_size=5000,
|
||||
force=force,
|
||||
progress_callback=progress_callback
|
||||
)
|
||||
|
||||
# Print result
|
||||
print()
|
||||
if result.was_already_processed:
|
||||
print("File already processed (same hash). Skipping.")
|
||||
print(f"Use --force to re-process.")
|
||||
elif result.success:
|
||||
print(f"Loaded {result.rows_inserted:,} rows in {result.load_time_seconds:.1f}s")
|
||||
if result.rows_skipped > 0:
|
||||
print(f"Skipped {result.rows_skipped:,} rows (missing UPID or date)")
|
||||
else:
|
||||
print(f"FAILED: {result.error_message}")
|
||||
return False
|
||||
|
||||
# Refresh materialized view if requested
|
||||
if refresh_mv and result.success and not result.was_already_processed:
|
||||
print()
|
||||
print("Refreshing materialized view...")
|
||||
mv_progress = create_progress_reporter("Processing patients", width=40)
|
||||
mv_result = refresh_patient_treatment_summary(
|
||||
db_manager=db_manager,
|
||||
progress_callback=mv_progress
|
||||
)
|
||||
|
||||
if mv_result.success:
|
||||
print(f"MV refreshed: {mv_result.patients_processed:,} patients in {mv_result.refresh_time_seconds:.1f}s")
|
||||
|
||||
# Verify consistency
|
||||
consistent, msg = verify_mv_consistency(db_manager)
|
||||
if consistent:
|
||||
print(f"MV verification: OK")
|
||||
else:
|
||||
print(f"MV verification: FAILED - {msg}")
|
||||
else:
|
||||
print(f"MV refresh FAILED: {mv_result.error_message}")
|
||||
|
||||
# Print summary statistics
|
||||
print()
|
||||
print("=== Patient Data Summary ===")
|
||||
stats = get_patient_data_stats(db_manager)
|
||||
print(f" Total rows: {stats['total_rows']:,}")
|
||||
print(f" Unique patients: {stats['unique_patients']:,}")
|
||||
print(f" Unique drugs: {stats['unique_drugs']:,}")
|
||||
print(f" Unique organizations: {stats['unique_organizations']:,}")
|
||||
if stats['date_range'][0] and stats['date_range'][1]:
|
||||
print(f" Date range: {stats['date_range'][0]} to {stats['date_range'][1]}")
|
||||
print()
|
||||
|
||||
return result.success
|
||||
|
||||
|
||||
def get_database_status(db_manager: Optional[DatabaseManager] = None) -> dict:
|
||||
"""
|
||||
Get the current status of the database.
|
||||
@@ -452,8 +353,6 @@ Examples:
|
||||
python -m data_processing.migrate --drop-existing # Reset database
|
||||
python -m data_processing.migrate --reference-data # Migrate reference data
|
||||
python -m data_processing.migrate --reference-data --verify # With verification
|
||||
python -m data_processing.migrate --load-patient-data data.parquet # Load patient data
|
||||
python -m data_processing.migrate --load-patient-data data.csv --force # Force reload
|
||||
python -m data_processing.migrate --db-path ./data/test.db # Custom path
|
||||
"""
|
||||
)
|
||||
@@ -493,23 +392,6 @@ Examples:
|
||||
action="store_true",
|
||||
help="Enable verbose logging"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--load-patient-data",
|
||||
type=Path,
|
||||
metavar="FILE",
|
||||
help="Load patient data from CSV or Parquet file with progress reporting"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force",
|
||||
action="store_true",
|
||||
help="Force re-processing even if file hash matches (use with --load-patient-data)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-refresh-mv",
|
||||
action="store_true",
|
||||
help="Skip materialized view refresh after loading (use with --load-patient-data)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set up logging
|
||||
@@ -562,32 +444,6 @@ Examples:
|
||||
print("Reference data migration completed with errors. Check logs for details.")
|
||||
return 1
|
||||
|
||||
# Handle --load-patient-data (load patient data from CSV/Parquet)
|
||||
if args.load_patient_data:
|
||||
# Ensure database exists with tables first
|
||||
if not db_manager.exists:
|
||||
print("Database does not exist. Initializing schema first...")
|
||||
success = initialize_database(db_manager=db_manager)
|
||||
if not success:
|
||||
print("\nDatabase initialization failed. Check logs for details.")
|
||||
return 1
|
||||
|
||||
# Load patient data with progress reporting
|
||||
success = load_patient_data_cli(
|
||||
file_path=args.load_patient_data,
|
||||
db_manager=db_manager,
|
||||
paths=default_paths,
|
||||
force=args.force,
|
||||
refresh_mv=not args.no_refresh_mv
|
||||
)
|
||||
|
||||
if success:
|
||||
print("Patient data load completed successfully.")
|
||||
return 0
|
||||
else:
|
||||
print("Patient data load failed. Check logs for details.")
|
||||
return 1
|
||||
|
||||
# Run schema migration (default behavior)
|
||||
success = initialize_database(
|
||||
db_manager=db_manager,
|
||||
|
||||
Reference in New Issue
Block a user