Restructured src into a more logical hierarchy

2026-02-09 16:22:05 +00:00
parent 7e63e6ea45
commit fcbde7c689
35 changed files with 0 additions and 0 deletions
@@ -0,0 +1,331 @@
+import numpy as np
+import pandas as pd
+import csv
+import urllib.request
+import io # Added for StringIO
+import re # Added for regex escape and word boundaries
+from typing import Optional
+
+from core import PathConfig, default_paths
+from core.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+def drug_names(df, paths: Optional[PathConfig] = None):
+    """Standardise ``df["Drug Name"]`` to generic names using the drug-names CSV.
+
+    The CSV at ``paths.drugnames_csv`` is read as ``raw name, generic name``
+    rows. Both fields are upper-cased to build a lookup, the "Drug Name"
+    column is upper-cased and mapped through it (names without an entry
+    become NaN), and any "(LEFT EYE)" / "(RIGHT EYE)" marker is then removed
+    from the mapped value.
+
+    Args:
+        df: DataFrame with a string "Drug Name" column. Modified in place.
+        paths: Path configuration; ``default_paths`` is used when omitted.
+
+    Returns:
+        The same DataFrame, with "Drug Name" holding the standardised name.
+    """
+    if paths is None:
+        paths = default_paths
+
+    # Build the raw-name -> generic-name lookup (upper-cased on both sides)
+    generic_by_name = {}
+    with open(paths.drugnames_csv, 'r', newline='') as f:
+        for line_number, row in enumerate(csv.reader(f, delimiter=','), start=1):
+            if not row:
+                # Blank line (commonly a trailing newline at end of file)
+                continue
+            if len(row) < 2:
+                logger.warning(
+                    f"Skipping malformed row {line_number} in {paths.drugnames_csv}: {row!r}"
+                )
+                continue
+            # Only the first two fields are used; any extra fields are ignored
+            drug_name, generic = row[0], row[1]
+            generic_by_name[drug_name.upper()] = generic.upper()
+
+    # Map drug names with the lookup built above
+    df["Drug Name"] = df["Drug Name"].str.upper().map(generic_by_name)
+
+    # Remove (LEFT EYE) / (RIGHT EYE) together with the whitespace before it,
+    # then trim whatever whitespace is left at either end
+    df["Drug Name"] = (
+        df["Drug Name"]
+        .str.replace(r'\s*\((?:LEFT|RIGHT) EYE\)', '', regex=True)
+        .str.strip()
+    )
+    return df
+
+
+def patient_id(df):
+    """Add a "UPID" column built from "Provider Code" and "PersonKey".
+
+    UPID is the first three characters of "Provider Code" followed by
+    "PersonKey" converted to a string.
+
+    Args:
+        df: DataFrame with "Provider Code" and "PersonKey" columns.
+            Modified in place.
+
+    Returns:
+        The same DataFrame, with the "UPID" column added.
+    """
+    provider_prefix = df["Provider Code"].str.slice(stop=3)
+    person_key = df["PersonKey"].astype(str)
+    df["UPID"] = provider_prefix + person_key
+    return df
+
+
+def compress_csv(filepath):
+    """Write a bz2-compressed copy of a CSV file alongside the original.
+
+    The output name is the input name with its trailing ".csv" replaced by
+    "_bz2.csv" (for example "data.csv" -> "data_bz2.csv"). If the input does
+    not end in ".csv", "_bz2.csv" is appended instead, so the output path
+    never equals the input path.
+
+    Args:
+        filepath: Path of the CSV file to read (string or path-like).
+
+    Returns:
+        The path of the compressed file, as a string.
+    """
+    source_path = str(filepath)
+    suffix = ".csv"
+
+    # Only swap a trailing extension: a ".csv" inside a directory name must
+    # be left alone, and a missing extension must not make the output path
+    # equal to the input path (which would overwrite the source file).
+    if source_path.lower().endswith(suffix):
+        stem = source_path[:-len(suffix)]
+    else:
+        stem = source_path
+    compressed_path = stem + "_bz2.csv"
+
+    df = pd.read_csv(filepath)
+    df.to_csv(compressed_path, compression="bz2", index=False)
+    return compressed_path
+
+
+def _load_drug_directory_map(csv_path):
+    """Read the drug/directory CSV into ``{DRUG NAME: {DIRECTORY, ...}}`` (upper-case).
+
+    The first column is taken as the drug name and the second as a
+    pipe-separated list of directories. Rows missing either value are skipped.
+    """
+    drug_dir_df = pd.read_csv(csv_path, skipinitialspace=True)
+    drug_col, dir_col = drug_dir_df.columns[0], drug_dir_df.columns[1]
+
+    drug_to_valid_dirs: dict[str, set[str]] = {}
+    for raw_drug, raw_dirs in zip(drug_dir_df[drug_col], drug_dir_df[dir_col]):
+        if pd.isna(raw_drug) or pd.isna(raw_dirs):
+            continue
+        drug_name = str(raw_drug).strip().upper()
+        dirs = {d.strip().upper() for d in str(raw_dirs).split('|') if d.strip()}
+        if drug_name and dirs and drug_name != 'NAN':
+            drug_to_valid_dirs[drug_name] = dirs
+    return drug_to_valid_dirs
+
+
+def _extract_directory(series, pattern):
+    """Return the first case-insensitive match of ``pattern`` per element, stripped.
+
+    On any failure (for example a column with no string values, where the
+    ``.str`` accessor is unavailable) the error is logged and an all-NA
+    Series with the same index is returned, so one bad source column cannot
+    discard the result of another.
+    """
+    try:
+        extracted = series.str.extract(pattern, expand=False, flags=re.IGNORECASE)
+        return extracted.str.strip()
+    except Exception as e:
+        logger.error(f"Error during directory extraction: {e}")
+        return pd.Series(pd.NA, index=series.index, dtype=object)
+
+
+def department_identification(df, paths: Optional[PathConfig] = None):
+    """Assign a "Directory" and a "Directory_Source" to every row of ``df``.
+
+    Each row is resolved by the first rule that applies:
+
+    1. ``SINGLE_VALID_DIR`` - the drug has exactly one valid directory in the
+       drug/directory CSV.
+    2. ``CALCULATED_MOST_FREQ`` - for the row's (UPID, Drug Name) pair, the
+       most frequently extracted candidate directory that is valid for the
+       drug. Candidates come from "Additional Detail 1", falling back to the
+       service name mapped from "Treatment Function Code".
+    3. ``UPID_INFERENCE`` - the most frequent directory already assigned to
+       rows with the same UPID.
+    4. ``UNDEFINED`` - nothing matched; "Directory" is set to "Undefined".
+
+    Other effects on the data:
+
+    * The twelve additional-detail / treatment-function text columns are
+      replaced by the directory name found in them (NaN if none, and the
+      column is created as NaN if it was missing).
+    * "Treatment Function Code" is consumed and removed.
+    * Every cell equal to 'MEDICAL OPHTHALMOLOGY' becomes 'OPHTHALMOLOGY'.
+    * Rows with an empty or missing UPID are dropped.
+    * Rows still lacking a directory before rule 3 are written to
+      ``paths.na_directory_rows_csv`` for diagnostics.
+
+    Args:
+        df: DataFrame containing at least "Drug Name", "UPID" and
+            "Treatment Function Code". It is modified in place; the returned
+            object may nevertheless be a new DataFrame (a merge is involved).
+        paths: Path configuration; ``default_paths`` is used when omitted.
+
+    Returns:
+        The processed DataFrame. If the directory list cannot be loaded, the
+        error is logged and ``df`` is returned unchanged.
+    """
+    # --- Setup ---
+    if paths is None:
+        paths = default_paths
+
+    # 1. Load directory_list.csv and build the upper-case extraction patterns
+    try:
+        directory_df = pd.read_csv(paths.directory_list_csv)
+        directory_list_upper = [
+            d.upper() for d in directory_df["directory"].dropna().astype(str)
+        ]
+        if not directory_list_upper:
+            raise ValueError("directory_list.csv is empty or contains only NA values.")
+    except FileNotFoundError:
+        logger.error(f"File not found: {paths.directory_list_csv}. Cannot extract directories.")
+        return df
+    except ValueError as e:
+        logger.error(f"Error loading directory list: {e}")
+        return df
+
+    alternatives = '|'.join(map(re.escape, directory_list_upper))
+    # Leading word boundary only: a directory name must start a word, but
+    # nothing is required after it. Special regex characters are escaped.
+    # NOTE(review): alternatives are tried in file order, so a name that is a
+    # prefix of a later, longer name wins - confirm that is intended.
+    dir_pattern_upper = r'\b({})'.format(alternatives)
+    # Simpler pattern for the primary source (no word boundary)
+    dir_pattern_primary_simple = r'({})'.format(alternatives)
+
+    # 2. Load treatment_function_codes.csv and prepare the code -> SERVICE map
+    treatment_codes = pd.read_csv(paths.treatment_function_codes_csv)
+    mapping_treatment_codes = dict(treatment_codes[['Code', 'Service']].values)
+    mapping_treatment_codes_upper = {k: str(v).upper() for k, v in mapping_treatment_codes.items()}
+
+    # 3. Load drug_directory_list.csv into {drug: set of valid directories}
+    drug_to_valid_dirs = _load_drug_directory_map(paths.drug_directory_list_csv)
+
+    # 4. Drugs with exactly one valid directory can be assigned directly
+    drug_to_single_dir = {
+        drug: next(iter(dirs))
+        for drug, dirs in drug_to_valid_dirs.items()
+        if len(dirs) == 1
+    }
+
+    # --- Data Preprocessing ---
+    additional_detail_columns = ["Additional Detail 1", "Additional Description 1", "Additional Detail 2", "Additional Description 2",
+     "Additional Detail 3", "Additional Description 3", "Additional Detail 4", "Additional Description 4",
+     "Additional Detail 5", "Additional Description 5", "NCDR Treatment Function Name", "Treatment Function Desc"]
+
+    # 5. Replace each detail column by the directory name found in it.
+    # Upper-casing and extraction are gated by the same dtype test so that a
+    # column is never matched against the upper-case pattern without having
+    # been upper-cased first.
+    for ad in additional_detail_columns:
+        if ad not in df.columns:
+            df[ad] = np.nan  # Ensure the column exists
+            continue
+        column = df[ad]
+        if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
+            df[ad] = np.nan  # Set non-text columns to NaN
+            continue
+        try:
+            df[ad] = column.str.upper().str.extract(dir_pattern_upper, expand=False)
+        except AttributeError:
+            # Object column without string values: .str is unavailable
+            df[ad] = np.nan
+        except Exception as e:
+            logger.error(f"Error processing column {ad}: {e}")
+            df[ad] = np.nan
+
+    # 6. Treatment Function Code -> upper-case service name (Fallback_Source).
+    # Missing or non-numeric codes become 0 before the lookup. Assigning the
+    # result back (rather than a chained inplace call on df[col]) keeps this
+    # working under pandas Copy-on-Write.
+    df["Treatment Function Code"] = (
+        pd.to_numeric(df["Treatment Function Code"], errors='coerce').fillna(0).astype(int)
+    )
+    df["Treatment Function Code"] = df["Treatment Function Code"].map(mapping_treatment_codes_upper)
+    df.rename(columns={'Treatment Function Code': 'Fallback_Source'}, inplace=True)
+
+    # Apply replacements before combining
+    df.replace('MEDICAL OPHTHALMOLOGY', 'OPHTHALMOLOGY', inplace=True)
+
+    # --- Single Directory Assignment ---
+    # 7. Drugs with a single valid directory get it straight away
+    df['Directory'] = df['Drug Name'].map(drug_to_single_dir)
+
+    # Directory_Source records which rule produced each Directory value
+    df['Directory_Source'] = pd.NA
+    df.loc[df['Directory'].notna(), 'Directory_Source'] = 'SINGLE_VALID_DIR'
+
+    # --- Prepare Fallback Logic ---
+    # 8. Candidate directory per row: Additional Detail 1 first, then the
+    # treatment-function service name. The two sources are extracted
+    # independently so a failure in one does not wipe out the other.
+    primary_source = df['Additional Detail 1'].astype(pd.StringDtype()).str.upper()
+    extracted_primary = _extract_directory(primary_source, dir_pattern_primary_simple)
+    extracted_fallback = _extract_directory(df['Fallback_Source'], dir_pattern_upper)
+    df['Primary_Directory'] = extracted_primary.fillna(extracted_fallback)
+
+    # 9. Clean up intermediate columns
+    df.drop(columns=['Primary_Source', 'Fallback_Source', 'Extracted_Primary_Dir', 'Extracted_Fallback_Dir'], inplace=True, errors='ignore')
+
+    # --- Calculate Most Frequent Valid Directory ---
+    # 10. Only rows without a Directory but with a candidate take part
+    candidates = df[df['Directory'].isna()].dropna(subset=['Primary_Directory'])
+
+    if not candidates.empty:
+        # 11. Count candidates per (UPID, Drug Name), most frequent first
+        df_counts = (
+            candidates
+            .groupby(['UPID', 'Drug Name', 'Primary_Directory'], observed=True)
+            .size()
+            .reset_index(name='count')
+            .sort_values(['UPID', 'Drug Name', 'count'], ascending=[True, True, False])
+        )
+
+        # 12. Keep candidates that are valid for their drug; the first one
+        # left in each (UPID, Drug Name) group is the most frequent valid one.
+        is_valid = np.array(
+            [
+                isinstance(directory, str) and directory in drug_to_valid_dirs.get(drug, ())
+                for drug, directory in zip(df_counts['Drug Name'], df_counts['Primary_Directory'])
+            ],
+            dtype=bool,
+        )
+        final_mapping = (
+            df_counts[is_valid]
+            .drop_duplicates(subset=['UPID', 'Drug Name'], keep='first')
+            .rename(columns={'Primary_Directory': 'Calculated_Directory'})
+            [['UPID', 'Drug Name', 'Calculated_Directory']]
+        )
+
+        # --- Merge Results and Finalize ---
+        # 13. Bring the calculated directory back onto every row of the pair
+        # (pairs without a valid candidate simply get NaN from the left join)
+        df = pd.merge(df, final_mapping, on=['UPID', 'Drug Name'], how='left')
+
+        # 14. Fill missing directories with the calculated ones and track source
+        rows_to_fill = df['Directory'].isna() & df['Calculated_Directory'].notna()
+        df.loc[rows_to_fill, 'Directory_Source'] = 'CALCULATED_MOST_FREQ'
+        df['Directory'] = df['Directory'].fillna(df['Calculated_Directory'])
+
+        df.drop(columns=['Calculated_Directory'], inplace=True, errors='ignore')
+
+    # 15. Drop the temporary candidate column in either case
+    df.drop(columns=['Primary_Directory'], inplace=True, errors='ignore')
+
+    # 16. Drop rows with missing UPID (empty strings count as missing)
+    df['UPID'] = df['UPID'].replace('', np.nan)
+    df.dropna(subset=['UPID'], inplace=True)
+
+    # 17. Export rows with NA Directory to CSV for analysis (diagnostics)
+    na_directory_rows = df[df['Directory'].isna()]
+    if not na_directory_rows.empty:
+        na_directory_rows.to_csv(paths.na_directory_rows_csv, index=False)
+
+    # 18. FALLBACK MECHANISM 1: infer the directory from the same UPID.
+    # The per-UPID most frequent directory is computed once, from rows that
+    # already have a directory, and applied to the rows that do not.
+    missing = df['Directory'].isna()
+    if missing.any() and not missing.all():
+        valid_upid_dirs = (
+            df[~missing]
+            .groupby('UPID')['Directory']
+            .agg(lambda assigned: assigned.value_counts().index[0])
+            .to_dict()
+        )
+        inferred = df['UPID'].map(valid_upid_dirs)
+        rows_inferred = missing & inferred.notna()
+        # to_numpy() keeps the assignment positional, so duplicate index
+        # labels cannot break the alignment
+        df.loc[rows_inferred, 'Directory'] = inferred[rows_inferred].to_numpy()
+        df.loc[rows_inferred, 'Directory_Source'] = 'UPID_INFERENCE'
+
+    # 19. FALLBACK MECHANISM 2: label whatever is still missing as "Undefined"
+    rows_undefined = df['Directory'].isna()
+    df.loc[rows_undefined, 'Directory_Source'] = 'UNDEFINED'
+    df['Directory'] = df['Directory'].fillna("Undefined")
+
+    # 20. Return the processed DataFrame
+    return df
+
+
+
+def ta_list_get(paths: Optional[PathConfig] = None):
+    """Download the NICE TA recommendations workbook and list the usable TAs.
+
+    The workbook is saved to ``paths.ta_recommendations_xlsx`` (overwriting
+    any existing file) and read back. Only rows categorised as "Recommended"
+    or "Optimised" whose technology type is "Pharmaceutical" are kept, and
+    "TA ID" is rewritten as "NICE TA<number>" without leading zeros.
+
+    Args:
+        paths: Path configuration; ``default_paths`` is used when omitted.
+
+    Returns:
+        DataFrame with the de-duplicated "TA ID" and "Indication" columns.
+    """
+    if paths is None:
+        paths = default_paths
+
+    link = "https://www.nice.org.uk/Media/Default/About/what-we-do/NICE-guidance/NICE-technology-appraisals/TA%20recommendations.xlsx"
+    urllib.request.urlretrieve(link, paths.ta_recommendations_xlsx)
+    ta_db = pd.read_excel(paths.ta_recommendations_xlsx, index_col=0)
+
+    # Keep only TAs that are Recommended/Optimised and Pharmaceutical
+    is_recommended = ta_db["Categorisation (for specific recommendation)"].isin(["Recommended", "Optimised"])
+    is_pharmaceutical = ta_db["Technology type"] == "Pharmaceutical"
+    # .copy() makes the filtered frame independent, so the column assignment
+    # below is unambiguous and does not trigger SettingWithCopyWarning
+    ta_db = ta_db[is_recommended & is_pharmaceutical].copy()
+
+    # Amend "TA001"-style strings to the bare integer, then add the prefix
+    ta_number = ta_db["TA ID"].str.replace(r'\D+', '', regex=True).astype(int)
+    ta_db["TA ID"] = "NICE TA" + ta_number.astype(str)
+    ta_series = ta_db[["TA ID", "Indication"]].drop_duplicates()
+    return ta_series
+
+
+
+