feat: add drug-to-indication mapping from DimSearchTerm.csv (Task 1.2)

Add load_drug_indication_mapping() and get_search_terms_for_drug() to
diagnosis_lookup.py. The loader reads DimSearchTerm.csv and builds a
bidirectional lookup between drug-name fragments and Search_Terms. Matching
is by substring, so it handles both exact names (e.g., ADALIMUMAB) and
partial fragments (e.g., PEGYLATED). Duplicate Search_Terms (e.g., diabetes
appearing under two directorates) are handled by combining their fragments.
This commit is contained in:
Andrew Charlwood
2026-02-05 22:48:09 +00:00
parent 1c4d2c07ee
commit 0779df78d1
2 changed files with 276 additions and 202 deletions
+104
View File
@@ -1087,6 +1087,107 @@ def batch_lookup_indication_groups(
return result_df
# === Drug-to-indication mapping from DimSearchTerm.csv ===
def load_drug_indication_mapping(
    csv_path: Optional[str] = None,
) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
    """
    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.

    Builds two lookup dicts:
    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it
    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)

    Only the Search_Term and CleanedDrugName columns are read.
    CleanedDrugName is pipe-separated (e.g., "ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB");
    each piece is stripped and upper-cased, and empty pieces are dropped.

    A Search_Term may appear on several rows; fragments from all rows for the
    same Search_Term are combined, without duplicates, in first-seen order.
    Rows where either column is missing or blank are skipped.

    Errors are logged, not raised: if the file is missing the dicts come back
    empty, and if reading fails part-way the dicts hold whatever was loaded
    before the failure.

    Args:
        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv
            resolved relative to the parent of this module's directory.

    Returns:
        Tuple of (fragment_to_search_terms, search_term_to_fragments)
    """
    if csv_path is None:
        csv_path = str(Path(__file__).parent.parent / "data" / "DimSearchTerm.csv")

    fragment_to_search_terms: dict[str, list[str]] = {}
    search_term_to_fragments: dict[str, list[str]] = {}

    try:
        # utf-8-sig transparently drops a leading BOM (otherwise the first
        # header becomes "\ufeffSearch_Term" and every row is skipped);
        # newline="" is what the csv module requires for correct parsing of
        # quoted fields that contain line breaks.
        with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
            for row in csv.DictReader(f):
                # DictReader fills columns missing from a short row with None,
                # so coalesce to "" before stripping instead of relying on the
                # .get() default (which only applies when the key is absent).
                search_term = (row.get("Search_Term") or "").strip()
                drug_names_raw = (row.get("CleanedDrugName") or "").strip()
                if not search_term or not drug_names_raw:
                    continue

                fragments = [
                    frag.strip().upper()
                    for frag in drug_names_raw.split("|")
                    if frag.strip()
                ]

                # Accumulate across rows so duplicate Search_Terms are merged.
                term_fragments = search_term_to_fragments.setdefault(search_term, [])
                for frag in fragments:
                    if frag not in term_fragments:
                        term_fragments.append(frag)
                    frag_terms = fragment_to_search_terms.setdefault(frag, [])
                    if search_term not in frag_terms:
                        frag_terms.append(search_term)

        logger.info(
            f"Loaded drug-indication mapping: {len(search_term_to_fragments)} Search_Terms, "
            f"{len(fragment_to_search_terms)} drug fragments"
        )
    except FileNotFoundError:
        logger.error(f"DimSearchTerm.csv not found at {csv_path}")
    except Exception as e:
        # Deliberate best-effort: log and return what was loaded so far.
        logger.error(f"Error loading DimSearchTerm.csv: {e}")

    return fragment_to_search_terms, search_term_to_fragments
def get_search_terms_for_drug(
    drug_name: str,
    search_term_to_fragments: dict[str, list[str]],
) -> list[str]:
    """
    Return every Search_Term with at least one fragment found inside drug_name.

    The drug name is upper-cased and each fragment is tested as a substring of
    it, so a full name (ADALIMUMAB) and a partial fragment (PEGYLATED, INHALED)
    are matched the same way. Fragments are compared as given; they are
    expected to already be upper-case.

    Args:
        drug_name: HCD drug name (e.g., "ADALIMUMAB 40MG", "PEGYLATED LIPOSOMAL DOXORUBICIN")
        search_term_to_fragments: Mapping of search_term -> list of drug fragments

    Returns:
        Matching Search_Terms, each at most once, in the mapping's iteration order
    """
    needle_space = drug_name.upper()
    # any() stops at the first matching fragment, so a Search_Term is emitted once.
    return [
        term
        for term, candidates in search_term_to_fragments.items()
        if any(candidate in needle_space for candidate in candidates)
    ]
# === NEW APPROACH: Query Snowflake directly using cluster CTE ===
# The cluster query mapping (embedded from snomed_indication_mapping_query.sql)
@@ -1428,6 +1529,9 @@ __all__ = [
"get_directorate_from_diagnosis",
# Batch lookup for indication groups
"batch_lookup_indication_groups",
# Drug-indication mapping from DimSearchTerm.csv
"load_drug_indication_mapping",
"get_search_terms_for_drug",
# Snowflake-direct indication lookup (new approach)
"get_patient_indication_groups",
"CLUSTER_MAPPING_SQL",