# Listing metadata from the original export: 177 lines, 6.1 KiB, Python.
"""
Test Phase 3.4.4: Measure directory assignment "Undefined" rate with real Snowflake data.

This test loads HCD activity data that was previously fetched from Snowflake and
saved to a result file, runs it through the directory assignment pipeline, and
measures what percentage of records end up with an "Undefined" directory versus
a successfully assigned directory.
"""
|
|
|
|
import json
|
|
import pandas as pd
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add project root to path.
# resolve() makes the entry absolute, so the project imports below keep working
# even when the script is launched via a relative path or the working directory
# changes later.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))
|
|
|
|
from tools.data import patient_id, drug_names, department_identification
|
|
from core import default_paths
|
|
|
|
|
|
def load_snowflake_result(json_file: Path) -> pd.DataFrame:
    """Load a Snowflake query result from a JSON file and convert it to a DataFrame.

    Two layouts are accepted:

    * a tool-result envelope, ``[{"type": "text", "text": "<json>"}]``, where the
      embedded JSON is either ``{"columns": [...], "rows": [...]}`` (only
      ``rows`` is used) or the records themselves;
    * any other JSON value, which is handed to ``pd.DataFrame`` unchanged.

    Args:
        json_file: Path of the UTF-8 encoded JSON file to read.

    Returns:
        A DataFrame built from the extracted records.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Only unwrap when the first element really is an envelope dict carrying a
    # string payload. A bare ``'text' in data[0]`` would also match a plain
    # string containing "text" and then fail on the subscript.
    first = data[0] if isinstance(data, list) and data else None
    is_envelope = isinstance(first, dict) and isinstance(first.get('text'), str)

    if not is_envelope:
        return pd.DataFrame(data)

    result_obj = json.loads(first['text'])

    # Extract rows from the result object when it uses the columns/rows layout.
    if isinstance(result_obj, dict) and 'rows' in result_obj:
        records = result_obj['rows']
    else:
        records = result_obj

    return pd.DataFrame(records)
|
|
|
|
|
|
def analyze_directory_sources(df: pd.DataFrame) -> dict:
    """Analyze the distribution of Directory_Source values.

    Args:
        df: Frame expected to contain a ``Directory_Source`` column.

    Returns:
        ``{"error": ...}`` when the column is missing. Otherwise a dict with:

        * ``total_records``: number of rows in ``df``;
        * ``source_distribution``: per-source ``{"count", "percentage"}``;
        * ``undefined_rate``: percentage of rows whose source is ``'UNDEFINED'``;
        * ``assigned_rate``: ``100 - undefined_rate``.

        Both rates are ``0.0`` for an empty frame, so an empty input is never
        reported as fully assigned.
    """
    if 'Directory_Source' not in df.columns:
        return {"error": "Directory_Source column not found"}

    total = len(df)
    # value_counts() omits missing values by default, so rows without a source
    # do not appear in the distribution but still count toward ``total``.
    source_counts = df['Directory_Source'].value_counts()

    # The comprehension body only runs when there are rows, so ``total`` is
    # non-zero whenever the division is evaluated.
    distribution = {
        source: {
            "count": int(count),
            "percentage": round((int(count) / total) * 100, 2),
        }
        for source, count in source_counts.items()
    }

    # Calculate undefined vs assigned rates.
    if total > 0:
        undefined_count = int(source_counts.get('UNDEFINED', 0))
        undefined_rate = round((undefined_count / total) * 100, 2)
        assigned_rate = round(100 - undefined_rate, 2)
    else:
        undefined_rate = 0.0
        assigned_rate = 0.0

    return {
        "total_records": total,
        "source_distribution": distribution,
        "undefined_rate": undefined_rate,
        "assigned_rate": assigned_rate,
    }
|
|
|
|
|
|
def analyze_by_drug(df: pd.DataFrame) -> dict:
    """Analyze undefined rate by drug.

    Args:
        df: Frame expected to contain ``Drug Name`` and ``Directory_Source``.

    Returns:
        ``{"error": ...}`` when either column is missing. Otherwise a dict keyed
        by drug name, in order of first appearance, whose values hold ``total``
        (rows for the drug), ``undefined`` (rows whose source is
        ``'UNDEFINED'``) and ``undefined_rate`` (percentage, 2 decimals).
        Rows with a missing drug name are ignored.
    """
    if 'Drug Name' not in df.columns or 'Directory_Source' not in df.columns:
        return {"error": "Required columns not found"}

    # Count everything in one grouped pass instead of re-filtering the whole
    # frame for every drug. Plain arrays give the helper frame a fresh index,
    # so the input's index cannot affect the grouping.
    flags = pd.DataFrame({
        'drug': df['Drug Name'].to_numpy(),
        'undefined': (df['Directory_Source'] == 'UNDEFINED').to_numpy(),
    })
    # sort=False keeps first-appearance order; missing drug names are dropped
    # by groupby's default handling of NA keys.
    grouped = flags.groupby('drug', sort=False)['undefined']
    totals = grouped.size()
    undefined_counts = grouped.sum()

    results = {}
    for (drug, total), undefined in zip(totals.items(), undefined_counts):
        total = int(total)
        undefined = int(undefined)
        results[drug] = {
            "total": total,
            "undefined": undefined,
            "undefined_rate": round((undefined / total) * 100, 2) if total > 0 else 0,
        }

    return results
|
|
|
|
|
|
def main(result_file=None):
    """Run the real data test and print the directory assignment report.

    Loads the saved Snowflake result, renames its columns to the spaced names
    used below, runs ``patient_id``, ``drug_names`` and
    ``department_identification`` in that order, then prints the overall and
    per-drug "UNDEFINED" rates followed by a 20-row sample.

    Args:
        result_file: Optional path (``str`` or ``Path``) of the saved Snowflake
            result. When omitted, the hard-coded capture below is used.

    Returns:
        ``(overall_stats, drug_stats)`` as returned by
        ``analyze_directory_sources`` and ``analyze_by_drug``, or ``None`` when
        the result file does not exist.
    """
    if result_file is None:
        # Path to the Snowflake result file (updated 2026-02-04)
        result_file = Path(r"C:\Users\charlwoodand\.claude\projects\C--Users-charlwoodand-Ralph-local-Tasks-Patient-pathway-analysis\2b846818-a586-47de-bfb9-a740bd07fc70\tool-results\mcp-snowflake-mcp-read_data-1770199331688.txt")
    result_file = Path(result_file)

    if not result_file.exists():
        print(f"ERROR: Result file not found: {result_file}")
        return None

    print("Loading Snowflake data...")
    df = load_snowflake_result(result_file)
    print(f"Loaded {len(df)} records")
    print(f"Columns: {list(df.columns)}")

    # Rename columns to match expected format for tools/data.py functions
    column_mapping = {
        'ProviderCode': 'Provider Code',
        'PersonKey': 'PersonKey',
        'DrugName': 'Drug Name',
        'InterventionDate': 'Intervention Date',
        'TreatmentFunctionCode': 'Treatment Function Code',
        'AdditionalDetail1': 'Additional Detail 1',
        'AdditionalDescription1': 'Additional Description 1',
        'AdditionalDetail2': 'Additional Detail 2',
        'AdditionalDescription2': 'Additional Description 2',
        'PriceActual': 'Price Actual',
        'OrganisationName': 'OrganisationName',
    }

    df = df.rename(columns=column_mapping)
    print(f"Renamed columns: {list(df.columns)}")

    # Step 1: Generate UPID
    print("\nStep 1: Generating UPID...")
    df = patient_id(df)
    print(f"Sample UPIDs: {df['UPID'].head(5).tolist()}")

    # Step 2: Standardize drug names
    print("\nStep 2: Standardizing drug names...")
    df = drug_names(df, default_paths)
    print(f"Unique drugs after standardization: {df['Drug Name'].dropna().unique().tolist()}")

    # Step 3: Run directory assignment
    print("\nStep 3: Running directory assignment...")
    df = department_identification(df, default_paths)

    # Step 4: Analyze results
    print("\n" + "=" * 60)
    print("DIRECTORY ASSIGNMENT RESULTS")
    print("=" * 60)

    overall_stats = analyze_directory_sources(df)

    # The analysis helpers signal a missing column with {"error": ...}; report
    # that instead of failing on the absent statistics keys.
    if "error" in overall_stats:
        print(f"\nERROR: {overall_stats['error']}")
    else:
        print(f"\nTotal records processed: {overall_stats['total_records']}")
        print("\nDirectory Source Distribution:")
        for source, stats in sorted(overall_stats['source_distribution'].items(),
                                    key=lambda x: -x[1]['count']):
            print(f"  {source}: {stats['count']:,} ({stats['percentage']:.1f}%)")

        print(f"\n*** UNDEFINED RATE: {overall_stats['undefined_rate']:.1f}% ***")
        print(f"*** ASSIGNED RATE: {overall_stats['assigned_rate']:.1f}% ***")

    # Analyze by drug
    print("\n" + "-" * 60)
    print("UNDEFINED RATE BY DRUG")
    print("-" * 60)

    drug_stats = analyze_by_drug(df)
    # A real per-drug entry is a dict; only the error marker maps to a string,
    # so a drug that happens to be called "error" is not mistaken for it.
    if isinstance(drug_stats.get("error"), str):
        print(f"  ERROR: {drug_stats['error']}")
    else:
        for drug, stats in sorted(drug_stats.items(), key=lambda x: -x[1]['undefined_rate']):
            print(f"  {drug}: {stats['undefined_rate']:.1f}% undefined ({stats['undefined']:,}/{stats['total']:,})")

    # Show sample of directory assignments
    print("\n" + "-" * 60)
    print("SAMPLE DIRECTORY ASSIGNMENTS")
    print("-" * 60)

    sample_cols = ['UPID', 'Drug Name', 'Directory', 'Directory_Source']
    available_cols = [c for c in sample_cols if c in df.columns]
    print(df[available_cols].head(20).to_string())

    return overall_stats, drug_stats
|
|
|
|
|
|
# Run the measurement only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|