HighCostDrugsDemo/archive/can_delete/iteration_3.raw.jsonl

{"type":"system","subtype":"init","cwd":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign","session_id":"73d004c4-7376-4936-9771-dc37f5b75850","tools":["Task","TaskOutput","Bash","Glob","Grep","ExitPlanMode","Read","Edit","Write","NotebookEdit","WebFetch","TodoWrite","WebSearch","TaskStop","AskUserQuestion","Skill","EnterPlanMode","ToolSearch","mcp__snowflake-mcp__test_connection","mcp__snowflake-mcp__list_databases","mcp__snowflake-mcp__list_schemas","mcp__snowflake-mcp__list_tables","mcp__snowflake-mcp__list_views","mcp__snowflake-mcp__describe_table","mcp__snowflake-mcp__read_data","mcp__snowflake-mcp__get_system_health","mcp__snowflake-mcp__describe_query","mcp__snowflake-mcp__execute_async","mcp__snowflake-mcp__get_query_status","mcp__snowflake-mcp__get_async_results","mcp__snowflake-mcp__list_async_queries","mcp__snowflake-mcp__read_data_paginated","mcp__snowflake-mcp__read_data_pandas"],"mcp_servers":[{"name":"Context7","status":"failed"},{"name":"snowflake-mcp","status":"connected"}],"model":"claude-opus-4-6","permissionMode":"bypassPermissions","slash_commands":["keybindings-help","debug","frontend-design","sc:analyze","sc:build","sc:cleanup","sc:design","sc:document","sc:estimate","sc:explain","sc:git","sc:implement","sc:improve","sc:index","sc:load","sc:spawn","sc:task","sc:test","sc:troubleshoot","sc:workflow","compact","context","cost","init","pr-comments","release-notes","review","security-review","insights"],"apiKeySource":"none","claude_code_version":"2.1.32","output_style":"default","agents":["Bash","general-purpose","statusline-setup","Explore","Plan","code-reviewer","documentation-specialist","performance-optimizer","medicines-intelligence-analyst","project-analyst","team-configurator","tech-lead-orchestrator","data-scientist","postgres-expert","sql-pro","django-api-developer","django-backend-expert","django-orm-expert","laravel-backend-expert","laravel-eloquent-expert","actix-expert","android-expert","angular-expert","angularjs-expert","ansible-expert","aspnet-core-expert","astro-expert","auth0-expert","ava-expert","braintree-expert","bullmq-expert","bun-expert","c-expert","cassandra-expert","celery-expert","circleci-expert","clojure-expert","cockroachdb-expert","cpp-expert","csharp-expert","css-expert","cypress-expert","dart-expert","deno-expert","django-expert","docker-expert","dynamodb-expert","elasticsearch-expert","electron-expert","elixir-expert","elk-expert","erlang-expert","expo-expert","express-expert","fastapi-expert","fastify-expert","fiber-expert","flask-expert","flutter-expert","flyway-expert","gin-expert","github-actions-expert","gitlab-ci-expert","go-expert","grafana-expert","graphql-expert","grpc-expert","haskell-expert","html-expert","ios-expert","jasmine-expert","java-expert","javascript-expert","jenkins-expert","jest-expert","jquery-expert","jwt-expert","kafka-expert","keycloak-expert","knex-expert","kotlin-expert","kubernetes-expert","langchain-expert","laravel-expert","liquibase-expert","loki-expert","lua-expert","mariadb-expert","mocha-expert","mongodb-expert","mongoose-expert","mqtt-expert","mssql-expert","mysql-expert","nats-expert","neo4j-expert","nestjs-expert","nextjs-expert","nodejs-expert","numpy-expert","oauth-oidc-expert","ocaml-expert","openai-api-expert","openapi-expert","opensearch-expert","opentelemetry-expert","owasp-top10-expert","pandas-expert","perl-expert","phoenix-expert","php-expert","playwright-expert","prisma-expert","prometheus-expert","pulumi-expert","puppeteer-expert","python-expert","pytorch-expert","rabbitmq-expert","rails-expert","react-expert","react-native-expert","redis-expert","remix-expert","rest-expert","rollup-expert","ruby-expert","rust-expert","scala-expert","scikit-learn-expert","selenium-expert","sequelize-expert","sidekiq-expert","sns-expert","solidjs-expert","spring-boot-expert","sql-expert","sqlite-expert","sqs-expert","stripe-expert","svelte-expert","swift-expert","swiftui-expert","tailwind-expert","tauri-expert","tensorflow-expert","terraform-expert","testcafe-expert","trpc-expert","typeorm-expert","typescript-expert","vector-db-expert","vitest-expert","vue-expert","webpack-expert","websocket-expert","rails-activerecord-expert","rails-api-developer","react-component-architect","react-nextjs-expert","vue-component-architect","vue-nuxt-expert","api-architect","backend-developer","frontend-developer","tailwind-frontend-expert"],"skills":["keybindings-help","debug","frontend-design"],"plugins":[],"uuid":"e7536f14-9835-4f59-bc65-d5d355a503f1"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01TYdJwKiat1TwJ62jNo3j7K","type":"message","role":"assistant","content":[{"type":"text","text":"\n\nI'll start by reading the required files in order to understand the current state of the project."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":2,"cache_creation_input_tokens":42305,"cache_read_input_tokens":27385,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":42305},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"934ff03c-a7c4-4429-9b84-d23ed94e197e"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01TYdJwKiat1TwJ62jNo3j7K","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01HtLodpH8qcXmsM9oE3UCMH","name":"Read","input":{"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":2,"cache_creation_input_tokens":42305,"cache_read_input_tokens":27385,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":42305},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"caac8e1e-57f0-4651-96bc-2a38f62feeca"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01HtLodpH8qcXmsM9oE3UCMH","type":"tool_result","content":"     1ÔåÆ# Progress Log - Drug-Aware Indication Matching\n     2ÔåÆ\n     3ÔåÆ## Project Context\n     4ÔåÆ\n     5ÔåÆThis project extends the indication-based pathway charts (Phase 1-5 complete) with drug-aware matching.\n     6ÔåÆ\n     7ÔåÆ**Previous state**: Patients get ONE indication based on their most recent GP diagnosis match (SNOMED cluster codes). This ignores which drugs the patient is taking.\n     8ÔåÆ\n     9ÔåÆ**New goal**: Match each drug to an indication by cross-referencing the patient's GP diagnoses AND the drug's Search_Term mapping from DimSearchTerm.csv.\n    10ÔåÆ\n    11ÔåÆ## Key Data/Patterns\n    12ÔåÆ\n    13ÔåÆ### DimSearchTerm.csv\n    14ÔåÆ- Located at `data/DimSearchTerm.csv`\n    15ÔåÆ- Columns: Search_Term, CleanedDrugName (pipe-separated), PrimaryDirectorate\n    16ÔåÆ- ~165 rows mapping clinical conditions to drug name fragments\n    17ÔåÆ- Drug fragments are substrings that match standardized drug names from HCD data\n    18ÔåÆ- Some entries have generic fragments: INHALED, CONTINUOUS, STANDARD-DOSE, PEGYLATED\n    19ÔåÆ\n    20ÔåÆ### Current get_patient_indication_groups() in diagnosis_lookup.py\n    21ÔåÆ- Uses CLUSTER_MAPPING_SQL as CTE in Snowflake query\n    22ÔåÆ- Returns ONLY the most recent match per patient (QUALIFY ROW_NUMBER() = 1)\n    23ÔåÆ- Needs to return ALL matching Search_Terms per patient (remove QUALIFY)\n    24ÔåÆ- Batches 500 patients per query\n    25ÔåÆ\n    26ÔåÆ### Modified UPID approach\n    27ÔåÆ- Current: UPID = Provider Code[:3] + PersonKey (e.g., \"RMV12345\")\n    28ÔåÆ- New: UPID = original + \"|\" + search_term (e.g., \"RMV12345|rheumatoid arthritis\")\n    29ÔåÆ- The pipe delimiter \"|\" is safe because existing UPIDs are alphanumeric\n    30ÔåÆ- generate_icicle_chart_indication() treats UPID as an opaque identifier ÔÇö modified UPIDs work transparently\n    31ÔåÆ- The \" - \" delimiter in pathway ids is used for hierarchy levels, not within UPIDs\n    32ÔåÆ\n    33ÔåÆ### PseudoNHSNoLinked mapping\n    34ÔåÆ- HCD data has PseudoNHSNoLinked column that matches PatientPseudonym in GP records\n    35ÔåÆ- PersonKey is provider-specific local ID ÔÇö do NOT use for GP matching\n    36ÔåÆ- One PseudoNHSNoLinked can map to multiple UPIDs (multi-provider patients)\n    37ÔåÆ- GP match lookup: PseudoNHSNoLinked ÔåÆ list of matched Search_Terms\n    38ÔåÆ\n    39ÔåÆ### Drug matching logic\n    40ÔåÆ- For each HCD row (UPID + Drug Name):\n    41ÔåÆ  1. Get patient's GP-matched Search_Terms with code_frequency (via PseudoNHSNoLinked)\n    42ÔåÆ  2. Get which Search_Terms list this drug (from DimSearchTerm.csv)\n    43ÔåÆ  3. Intersection = valid indications\n    44ÔåÆ  4. If 1: use it. If multiple: pick highest code_frequency (most GP coding = most likely indication). If 0: fallback to directory.\n    45ÔåÆ- Modified UPID groups drugs under same indication together naturally\n    46ÔåÆ- code_frequency = COUNT(*) of matching SNOMED codes per Search_Term per patient in GP records\n    47ÔåÆ- GP code time range: only count codes from MIN(Intervention Date) onwards (the HCD data window)\n    48ÔåÆ  - Reduces noise from old/irrelevant diagnoses, makes frequency more meaningful\n    49ÔåÆ  - Pass earliest_hcd_date as parameter to get_patient_indication_groups()\n    50ÔåÆ- Tiebreaker rationale: 47 RA codes vs 2 crohn's codes ÔåÆ RA is clearly the active condition\n    51ÔåÆ\n    52ÔåÆ### Known edge cases\n    53ÔåÆ- Some DimSearchTerm drug fragments are generic (INHALED, ORAL, CONTINUOUS)\n    54ÔåÆ  - These could match broadly but are constrained by GP diagnosis requirement\n    55ÔåÆ- A patient visiting multiple providers has multiple UPIDs\n    56ÔåÆ  - Each UPID gets its own drug-indication matching independently\n    57ÔåÆ- Same Search_Term appears twice in DimSearchTerm.csv with different directorates\n    58ÔåÆ  - e.g., \"diabetes\" ÔåÆ DIABETIC MEDICINE and OPHTHALMOLOGY\n    59ÔåÆ  - For indication charts, we use Search_Term not directorate, so this is fine\n    60ÔåÆ\n    61ÔåÆ## Iteration Log\n    62ÔåÆ\n    63ÔåÆ## Iteration 1 ÔÇö 2026-02-05\n    64ÔåÆ### Task: 1.3 ÔÇö Build drug-to-Search_Term lookup from DimSearchTerm.csv\n    65ÔåÆ### Why this task:\n    66ÔåÆ- First iteration, chose Phase 1 foundations. Task 1.2 (CSV loading) is self-contained and testable locally without Snowflake.\n    67ÔåÆ- Task 1.1 (Snowflake query update) can't be verified without a live connection ÔÇö better to do 1.2 first.\n    68ÔåÆ- Both 1.1 and 1.2 are independent, so order doesn't matter for dependencies.\n    69ÔåÆ### Status: COMPLETE\n    70ÔåÆ### What was done:\n    71ÔåÆ- Added `load_drug_indication_mapping()` to `diagnosis_lookup.py`:\n    72ÔåÆ  - Loads `data/DimSearchTerm.csv`, builds two dicts:\n    73ÔåÆ    - `fragment_to_search_terms`: drug fragment (UPPER) ÔåÆ list of Search_Terms\n    74ÔåÆ    - `search_term_to_fragments`: search_term ÔåÆ list of drug fragments (UPPER)\n    75ÔåÆ  - Handles duplicate Search_Terms (e.g., \"diabetes\" rows combined)\n    76ÔåÆ  - Result: 164 Search_Terms, 346 drug fragments\n    77ÔåÆ- Added `get_search_terms_for_drug()` to `diagnosis_lookup.py`:\n    78ÔåÆ  - Returns all Search_Terms whose drug fragments are substrings of the drug name (case-insensitive)\n    79ÔåÆ  - Named differently from plan's `drug_matches_search_term()` ÔÇö returns all matches at once rather than single boolean, more practical for Phase 2\n    80ÔåÆ- Updated `__all__` exports\n    81ÔåÆ### Validation results:\n    82ÔåÆ- Tier 1 (Code): py_compile passed, import check passed\n    83ÔåÆ- Tier 2 (Data): ADALIMUMAB ÔåÆ 7 indications (including axial spondyloarthritis, rheumatoid arthritis), OMALIZUMAB ÔåÆ 4 indications (asthma, allergic asthma, etc.), PEGYLATED LIPOSOMAL DOXORUBICIN ÔåÆ 4 matches via substring, \"ADALIMUMAB 40MG\" matches correctly with dosage info, diabetes fragments combined from 2 CSV rows\n    84ÔåÆ- Tier 3 (Functional): N/A (no UI changes)\n    85ÔåÆ### Files changed:\n    86ÔåÆ- data_processing/diagnosis_lookup.py (added load_drug_indication_mapping, get_search_terms_for_drug)\n    87ÔåÆ- IMPLEMENTATION_PLAN.md (marked 1.2 subtasks [x])\n    88ÔåÆ### Committed: 0779df7 \"feat: add drug-to-indication mapping from DimSearchTerm.csv (Task 1.3)\"\n    89ÔåÆ### Patterns discovered:\n    90ÔåÆ- DimSearchTerm.csv has 164 unique Search_Terms (not 165 as noted) because diabetes appears twice with different directorates but same Search_Term\n    91ÔåÆ- Some drug fragments are very generic: INHALED, CONTINUOUS, ORAL, STANDARD-DOSE, INTRAVENOUS, PEGYLATED, ROUTINE, INDUCTION ÔÇö these will match broadly but are constrained by the GP diagnosis requirement in Phase 2\n    92ÔåÆ- Function signatures for Phase 2: `get_search_terms_for_drug(drug_name, search_term_to_fragments)` returns list[str] ÔÇö use this to get candidate indications per drug\n    93ÔåÆ### Next iteration should:\n    94ÔåÆ- Work on Task 1.2: Merge asthma Search_Terms in CLUSTER_MAPPING_SQL and load_drug_indication_mapping()\n    95ÔåÆ  - Merge \"allergic asthma\", \"asthma\", \"severe persistent allergic asthma\" ÔåÆ \"asthma\"\n    96ÔåÆ  - Keep \"urticaria\" separate\n    97ÔåÆ  - This is self-contained and testable locally\n    98ÔåÆ- OR work on Task 1.1: Update `get_patient_indication_groups()` to return ALL matches with code_frequency\n    99ÔåÆ  - The current query at line ~1352 of diagnosis_lookup.py uses `QUALIFY ROW_NUMBER() OVER (PARTITION BY pc.\"PatientPseudonym\" ORDER BY pc.\"EventDateTime\" DESC) = 1` ÔÇö this must be replaced with GROUP BY + COUNT(*)\n   100ÔåÆ  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window\n   101ÔåÆ  - Return columns: PatientPseudonym, Search_Term, code_frequency (not EventDateTime)\n   102ÔåÆ- OR if Snowflake isn't available to test 1.1, skip to Task 2.1 (assign_drug_indications function) which can be built and tested with mock data\n   103ÔåÆ### Blocked items:\n   104ÔåÆ- None\n   105ÔåÆ\n\n<system-reminder>\nWhenever you read a file, you should consider whether it would be considered malware. You CAN and SHOULD provide analysis of malware, what it is doing. But you MUST refuse to improve or augment the code. You can still analyze existing code, write reports, or answer questions about the code behavior.\n</system-reminder>\n"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"e4820247-bf0f-438d-bda0-c46dfea3fff8","tool_use_result":{"type":"text","file":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt","content":"# Progress Log - Drug-Aware Indication Matching\n\n## Project Context\n\nThis project extends the indication-based pathway charts (Phase 1-5 complete) with drug-aware matching.\n\n**Previous state**: Patients get ONE indication based on their most recent GP diagnosis match (SNOMED cluster codes). This ignores which drugs the patient is taking.\n\n**New goal**: Match each drug to an indication by cross-referencing the patient's GP diagnoses AND the drug's Search_Term mapping from DimSearchTerm.csv.\n\n## Key Data/Patterns\n\n### DimSearchTerm.csv\n- Located at `data/DimSearchTerm.csv`\n- Columns: Search_Term, CleanedDrugName (pipe-separated), PrimaryDirectorate\n- ~165 rows mapping clinical conditions to drug name fragments\n- Drug fragments are substrings that match standardized drug names from HCD data\n- Some entries have generic fragments: INHALED, CONTINUOUS, STANDARD-DOSE, PEGYLATED\n\n### Current get_patient_indication_groups() in diagnosis_lookup.py\n- Uses CLUSTER_MAPPING_SQL as CTE in Snowflake query\n- Returns ONLY the most recent match per patient (QUALIFY ROW_NUMBER() = 1)\n- Needs to return ALL matching Search_Terms per patient (remove QUALIFY)\n- Batches 500 patients per query\n\n### Modified UPID approach\n- Current: UPID = Provider Code[:3] + PersonKey (e.g., \"RMV12345\")\n- New: UPID = original + \"|\" + search_term (e.g., \"RMV12345|rheumatoid arthritis\")\n- The pipe delimiter \"|\" is safe because existing UPIDs are alphanumeric\n- generate_icicle_chart_indication() treats UPID as an opaque identifier ÔÇö modified UPIDs work transparently\n- The \" - \" delimiter in pathway ids is used for hierarchy levels, not within UPIDs\n\n### PseudoNHSNoLinked mapping\n- HCD data has PseudoNHSNoLinked column that matches PatientPseudonym in GP records\n- PersonKey is provider-specific local ID ÔÇö do NOT use for GP matching\n- One PseudoNHSNoLinked can map to multiple UPIDs (multi-provider patients)\n- GP match lookup: PseudoNHSNoLinked ÔåÆ list of matched Search_Terms\n\n### Drug matching logic\n- For each HCD row (UPID + Drug Name):\n  1. Get patient's GP-matched Search_Terms with code_frequency (via PseudoNHSNoLinked)\n  2. Get which Search_Terms list this drug (from DimSearchTerm.csv)\n  3. Intersection = valid indications\n  4. If 1: use it. If multiple: pick highest code_frequency (most GP coding = most likely indication). If 0: fallback to directory.\n- Modified UPID groups drugs under same indication together naturally\n- code_frequency = COUNT(*) of matching SNOMED codes per Search_Term per patient in GP records\n- GP code time range: only count codes from MIN(Intervention Date) onwards (the HCD data window)\n  - Reduces noise from old/irrelevant diagnoses, makes frequency more meaningful\n  - Pass earliest_hcd_date as parameter to get_patient_indication_groups()\n- Tiebreaker rationale: 47 RA codes vs 2 crohn's codes ÔåÆ RA is clearly the active condition\n\n### Known edge cases\n- Some DimSearchTerm drug fragments are generic (INHALED, ORAL, CONTINUOUS)\n  - These could match broadly but are constrained by GP diagnosis requirement\n- A patient visiting multiple providers has multiple UPIDs\n  - Each UPID gets its own drug-indication matching independently\n- Same Search_Term appears twice in DimSearchTerm.csv with different directorates\n  - e.g., \"diabetes\" ÔåÆ DIABETIC MEDICINE and OPHTHALMOLOGY\n  - For indication charts, we use Search_Term not directorate, so this is fine\n\n## Iteration Log\n\n## Iteration 1 ÔÇö 2026-02-05\n### Task: 1.3 ÔÇö Build drug-to-Search_Term lookup from DimSearchTerm.csv\n### Why this task:\n- First iteration, chose Phase 1 foundations. Task 1.2 (CSV loading) is self-contained and testable locally without Snowflake.\n- Task 1.1 (Snowflake query update) can't be verified without a live connection ÔÇö better to do 1.2 first.\n- Both 1.1 and 1.2 are independent, so order doesn't matter for dependencies.\n### Status: COMPLETE\n### What was done:\n- Added `load_drug_indication_mapping()` to `diagnosis_lookup.py`:\n  - Loads `data/DimSearchTerm.csv`, builds two dicts:\n    - `fragment_to_search_terms`: drug fragment (UPPER) ÔåÆ list of Search_Terms\n    - `search_term_to_fragments`: search_term ÔåÆ list of drug fragments (UPPER)\n  - Handles duplicate Search_Terms (e.g., \"diabetes\" rows combined)\n  - Result: 164 Search_Terms, 346 drug fragments\n- Added `get_search_terms_for_drug()` to `diagnosis_lookup.py`:\n  - Returns all Search_Terms whose drug fragments are substrings of the drug name (case-insensitive)\n  - Named differently from plan's `drug_matches_search_term()` ÔÇö returns all matches at once rather than single boolean, more practical for Phase 2\n- Updated `__all__` exports\n### Validation results:\n- Tier 1 (Code): py_compile passed, import check passed\n- Tier 2 (Data): ADALIMUMAB ÔåÆ 7 indications (including axial spondyloarthritis, rheumatoid arthritis), OMALIZUMAB ÔåÆ 4 indications (asthma, allergic asthma, etc.), PEGYLATED LIPOSOMAL DOXORUBICIN ÔåÆ 4 matches via substring, \"ADALIMUMAB 40MG\" matches correctly with dosage info, diabetes fragments combined from 2 CSV rows\n- Tier 3 (Functional): N/A (no UI changes)\n### Files changed:\n- data_processing/diagnosis_lookup.py (added load_drug_indication_mapping, get_search_terms_for_drug)\n- IMPLEMENTATION_PLAN.md (marked 1.2 subtasks [x])\n### Committed: 0779df7 \"feat: add drug-to-indication mapping from DimSearchTerm.csv (Task 1.3)\"\n### Patterns discovered:\n- DimSearchTerm.csv has 164 unique Search_Terms (not 165 as noted) because diabetes appears twice with different directorates but same Search_Term\n- Some drug fragments are very generic: INHALED, CONTINUOUS, ORAL, STANDARD-DOSE, INTRAVENOUS, PEGYLATED, ROUTINE, INDUCTION ÔÇö these will match broadly but are constrained by the GP diagnosis requirement in Phase 2\n- Function signatures for Phase 2: `get_search_terms_for_drug(drug_name, search_term_to_fragments)` returns list[str] ÔÇö use this to get candidate indications per drug\n### Next iteration should:\n- Work on Task 1.2: Merge asthma Search_Terms in CLUSTER_MAPPING_SQL and load_drug_indication_mapping()\n  - Merge \"allergic asthma\", \"asthma\", \"severe persistent allergic asthma\" ÔåÆ \"asthma\"\n  - Keep \"urticaria\" separate\n  - This is self-contained and testable locally\n- OR work on Task 1.1: Update `get_patient_indication_groups()` to return ALL matches with code_frequency\n  - The current query at line ~1352 of diagnosis_lookup.py uses `QUALIFY ROW_NUMBER() OVER (PARTITION BY pc.\"PatientPseudonym\" ORDER BY pc.\"EventDateTime\" DESC) = 1` ÔÇö this must be replaced with GROUP BY + COUNT(*)\n  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window\n  - Return columns: PatientPseudonym, Search_Term, code_frequency (not EventDateTime)\n- OR if Snowflake isn't available to test 1.1, skip to Task 2.1 (assign_drug_indications function) which can be built and tested with mock data\n### Blocked items:\n- None\n","numLines":105,"startLine":1,"totalLines":105}}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01TYdJwKiat1TwJ62jNo3j7K","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_015LHyoAaiPcudsqoTDnsBsx","name":"Read","input":{"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\IMPLEMENTATION_PLAN.md"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":2,"cache_creation_input_tokens":42305,"cache_read_input_tokens":27385,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":42305},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"60fd85be-cfb8-4435-bec4-51e23fdae3ac"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_015LHyoAaiPcudsqoTDnsBsx","type":"tool_result","content":"     1ÔåÆ# Implementation Plan - Drug-Aware Indication Matching\n     2ÔåÆ\n     3ÔåÆ## Project Overview\n     4ÔåÆ\n     5ÔåÆUpdate the indication-based pathway charts so that patient indications are matched **per drug**, not just per patient. Currently, each patient gets ONE indication (most recent GP diagnosis match). This ignores which drugs the patient is actually taking.\n     6ÔåÆ\n     7ÔåÆ### The Problem\n     8ÔåÆ\n     9ÔåÆA patient on ADALIMUMAB + OMALIZUMAB currently gets assigned a single indication (e.g., \"rheumatoid arthritis\" ÔÇö the most recent GP match). But:\n    10ÔåÆ- ADALIMUMAB is used for rheumatoid arthritis, axial spondyloarthritis, crohn's disease, etc.\n    11ÔåÆ- OMALIZUMAB is used for asthma, allergic asthma, urticaria\n    12ÔåÆ\n    13ÔåÆThese are different clinical pathways and should be treated as separate treatment journeys.\n    14ÔåÆ\n    15ÔåÆ### The Solution\n    16ÔåÆ\n    17ÔåÆMatch each drug to an indication by cross-referencing:\n    18ÔåÆ1. **GP diagnosis** ÔÇö which Search_Terms the patient has matching SNOMED codes for\n    19ÔåÆ2. **Drug mapping** ÔÇö which Search_Terms list each drug (from `DimSearchTerm.csv`)\n    20ÔåÆ\n    21ÔåÆOnly assign a drug to an indication if BOTH conditions are met. If a patient's drugs map to different indications, they become separate pathways (via modified UPID).\n    22ÔåÆ\n    23ÔåÆ### Key Design Decisions\n    24ÔåÆ\n    25ÔåÆ| Aspect | Decision |\n    26ÔåÆ|--------|----------|\n    27ÔåÆ| Drug-indication source | `data/DimSearchTerm.csv` ÔÇö Search_Term ÔåÆ CleanedDrugName mapping |\n    28ÔåÆ| UPID modification | `{original_UPID}\\|{search_term}` for drugs with matched indication |\n    29ÔåÆ| GP diagnosis matching | Return ALL matches per patient (not just most recent) |\n    30ÔåÆ| Drug matching | Substring match: HCD drug name contains DimSearchTerm fragment |\n    31ÔåÆ| Multiple indication matches per drug | Use highest GP code frequency as tiebreaker (COUNT of matching SNOMED codes per Search_Term) |\n    32ÔåÆ| GP code time range | Only codes from MIN(Intervention Date) onwards ÔÇö restricts to HCD data window |\n    33ÔåÆ| No indication match | Fallback to directory (same as current behavior) |\n    34ÔåÆ| Same patient, different indications | Separate pathways via different modified UPIDs |\n    35ÔåÆ\n    36ÔåÆ### Examples\n    37ÔåÆ\n    38ÔåÆ**Patient on ADALIMUMAB + GOLIMUMAB, GP dx: axial spondyloarthritis + asthma**\n    39ÔåÆ- axial spondyloarthritis drug list includes both ADALIMUMAB and GOLIMUMAB\n    40ÔåÆ- ÔåÆ Both drugs grouped under \"axial spondyloarthritis\", single pathway\n    41ÔåÆ- Modified UPID: `RMV12345|axial spondyloarthritis`\n    42ÔåÆ\n    43ÔåÆ**Patient on ADALIMUMAB + OMALIZUMAB, GP dx: axial spondyloarthritis + asthma**\n    44ÔåÆ- axial spondyloarthritis lists ADALIMUMAB but not OMALIZUMAB\n    45ÔåÆ- asthma lists OMALIZUMAB but not ADALIMUMAB\n    46ÔåÆ- ÔåÆ Two separate pathways:\n    47ÔåÆ  - `RMV12345|axial spondyloarthritis` with ADALIMUMAB\n    48ÔåÆ  - `RMV12345|asthma` with OMALIZUMAB\n    49ÔåÆ\n    50ÔåÆ**Patient on ADALIMUMAB, GP dx: rheumatoid arthritis (47 codes) + crohn's disease (2 codes)**\n    51ÔåÆ- Both Search_Terms list ADALIMUMAB AND patient has GP dx for both\n    52ÔåÆ- ÔåÆ Tiebreaker: highest code frequency ÔÇö rheumatoid arthritis has 47 matching SNOMED codes vs 2 for crohn's\n    53ÔåÆ- ÔåÆ Single pathway under rheumatoid arthritis (more clinical activity = more likely the treatment indication)\n    54ÔåÆ\n    55ÔåÆ---\n    56ÔåÆ\n    57ÔåÆ## Phase 1: Update Snowflake Query & Drug Mapping\n    58ÔåÆ\n    59ÔåÆ### 1.1 Update `get_patient_indication_groups()` to return ALL matches with frequency\n    60ÔåÆ- [ ] Modify the Snowflake query in `get_patient_indication_groups()` (diagnosis_lookup.py):\n    61ÔåÆ  - Remove `QUALIFY ROW_NUMBER() OVER (PARTITION BY ... ORDER BY EventDateTime DESC) = 1`\n    62ÔåÆ  - Return ALL matching Search_Terms per patient with code frequency:\n    63ÔåÆ    ```sql\n    64ÔåÆ    SELECT pc.\"PatientPseudonym\" AS \"PatientPseudonym\",\n    65ÔåÆ           aic.Search_Term AS \"Search_Term\",\n    66ÔåÆ           COUNT(*) AS \"code_frequency\"\n    67ÔåÆ    FROM PrimaryCareClinicalCoding pc\n    68ÔåÆ    JOIN AllIndicationCodes aic ON pc.\"SNOMEDCode\" = aic.SNOMEDCode\n    69ÔåÆ    WHERE pc.\"PatientPseudonym\" IN (...)\n    70ÔåÆ      AND pc.\"EventDateTime\" >= :earliest_hcd_date\n    71ÔåÆ    GROUP BY pc.\"PatientPseudonym\", aic.Search_Term\n    72ÔåÆ    ```\n    73ÔåÆ  - `code_frequency` = number of matching SNOMED codes per Search_Term per patient\n    74ÔåÆ  - Higher frequency = more clinical activity = stronger signal for tiebreaker\n    75ÔåÆ  - `earliest_hcd_date` = `MIN(Intervention Date)` from the HCD DataFrame ÔÇö restricts GP codes to the HCD data window, reducing noise from old/irrelevant diagnoses\n    76ÔåÆ- [ ] Accept `earliest_hcd_date` parameter in `get_patient_indication_groups()` and pass to query\n    77ÔåÆ- [ ] Keep batch processing (500 patients per query)\n    78ÔåÆ- [ ] Update return type: DataFrame now has multiple rows per patient (PatientPseudonym, Search_Term, code_frequency)\n    79ÔåÆ- [ ] Verify: Query returns more rows than before (patients with multiple matching diagnoses)\n    80ÔåÆ\n    81ÔåÆ### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n    82ÔåÆ- [ ] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:\n    83ÔåÆ  - `allergic asthma` (Cluster: OMALIZUMAB only)\n    84ÔåÆ  - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)\n    85ÔåÆ  - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)\n    86ÔåÆ- [ ] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES\n    87ÔåÆ- [ ] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma\n    88ÔåÆ- [ ] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:\n    89ÔåÆ  - Combine drug lists from all 3 entries under a single `\"asthma\"` key\n    90ÔåÆ  - Deduplicate drug fragments (OMALIZUMAB appears in all 3)\n    91ÔåÆ- [ ] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)\n    92ÔåÆ- [ ] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB\n    93ÔåÆ\n    94ÔåÆ### 1.3 Build drug-to-Search_Term lookup from DimSearchTerm.csv\n    95ÔåÆ- [x] Add function `load_drug_indication_mapping()` to `diagnosis_lookup.py`:\n    96ÔåÆ  - Loads `data/DimSearchTerm.csv`\n    97ÔåÆ  - Builds dict: `drug_fragment (uppercase) ÔåÆ list[Search_Term]`\n    98ÔåÆ  - Also builds reverse: `search_term ÔåÆ list[drug_fragments]`\n    99ÔåÆ  - CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\")\n   100ÔåÆ- [x] Add function `get_search_terms_for_drug(drug_name, search_term_to_fragments) -> list[str]`:\n   101ÔåÆ  - Returns all Search_Terms whose drug fragments are substrings of the drug name (case-insensitive)\n   102ÔåÆ  - More practical than per-term boolean check ÔÇö returns all matches at once for Phase 2 use\n   103ÔåÆ- [x] Verify: ADALIMUMAB matches \"axial spondyloarthritis\", OMALIZUMAB matches \"asthma\"\n   104ÔåÆ\n   105ÔåÆ---\n   106ÔåÆ\n   107ÔåÆ## Phase 2: Drug-Aware Indication Matching Logic\n   108ÔåÆ\n   109ÔåÆ### 2.1 Create `assign_drug_indications()` function\n   110ÔåÆ- [ ] Add to `diagnosis_lookup.py` or `pathway_pipeline.py`:\n   111ÔåÆ  ```\n   112ÔåÆ  def assign_drug_indications(\n   113ÔåÆ      df: pd.DataFrame,              # HCD data with UPID, Drug Name columns\n   114ÔåÆ      gp_matches_df: pd.DataFrame,   # PatientPseudonym ÔåÆ list of matched Search_Terms\n   115ÔåÆ      drug_mapping: dict,             # From load_drug_indication_mapping()\n   116ÔåÆ  ) -> tuple[pd.DataFrame, pd.DataFrame]:\n   117ÔåÆ      Returns: (modified_df, indication_df)\n   118ÔåÆ      - modified_df: HCD data with UPID replaced by {UPID}|{indication}\n   119ÔåÆ      - indication_df: mapping modified_UPID ÔåÆ Search_Term\n   120ÔåÆ  ```\n   121ÔåÆ- [ ] Logic per UPID + Drug Name pair:\n   122ÔåÆ  1. Get patient's GP-matched Search_Terms with code_frequency (from gp_matches_df via PseudoNHSNoLinked)\n   123ÔåÆ  2. Get which Search_Terms include this drug (from drug_mapping)\n   124ÔåÆ  3. Intersection = valid indications for this drug-patient pair\n   125ÔåÆ  4. If 1 match: use it\n   126ÔåÆ  5. If multiple matches: use highest code_frequency as tiebreaker (most GP coding activity = most likely treatment indication)\n   127ÔåÆ  6. If 0 matches: use fallback directory\n   128ÔåÆ- [ ] Modify UPID in df rows: `{original_UPID}|{matched_search_term}`\n   129ÔåÆ- [ ] Build indication_df: `{modified_UPID}` ÔåÆ `Search_Term` (or fallback label)\n   130ÔåÆ- [ ] Verify: Function compiles, handles edge cases (no GP match, no drug match)\n   131ÔåÆ\n   132ÔåÆ### 2.2 Handle tiebreaker for multiple indication matches\n   133ÔåÆ- [ ] When a drug matches multiple Search_Terms AND patient has GP dx for multiple:\n   134ÔåÆ  - Use `code_frequency` from the GP query (COUNT of matching SNOMED codes per Search_Term)\n   135ÔåÆ  - Higher code_frequency = more clinical activity for that condition = more likely treatment indication\n   136ÔåÆ  - E.g., patient with 47 RA codes and 2 crohn's codes ÔåÆ ADALIMUMAB assigned to RA\n   137ÔåÆ  - code_frequency is already returned by the updated query in Task 1.1\n   138ÔåÆ- [ ] Verify: Tiebreaker logic correctly picks highest-frequency diagnosis\n   139ÔåÆ- [ ] Verify: Tie on frequency (rare but possible) falls back to alphabetical Search_Term for determinism\n   140ÔåÆ\n   141ÔåÆ---\n   142ÔåÆ\n   143ÔåÆ## Phase 3: Pipeline Integration\n   144ÔåÆ\n   145ÔåÆ### 3.1 Update `refresh_pathways.py` indication processing\n   146ÔåÆ- [ ] In the `elif current_chart_type == \"indication\":` block:\n   147ÔåÆ  1. Call `get_patient_indication_groups()` as before (but now returns ALL matches)\n   148ÔåÆ  2. Load drug mapping: `drug_mapping = load_drug_indication_mapping()`\n   149ÔåÆ  3. Call `assign_drug_indications(df, gp_matches_df, drug_mapping)`\n   150ÔåÆ  4. Use modified_df (with indication-aware UPIDs) for pathway processing\n   151ÔåÆ  5. Use indication_df for the indication mapping\n   152ÔåÆ- [ ] Pass modified_df (not original df) to `process_indication_pathway_for_date_filter()`\n   153ÔåÆ- [ ] Verify: Pipeline compiles, `python -m py_compile cli/refresh_pathways.py`\n   154ÔåÆ\n   155ÔåÆ### 3.2 Test with dry run\n   156ÔåÆ- [ ] Run `python -m cli.refresh_pathways --chart-type indication --dry-run -v`\n   157ÔåÆ- [ ] Verify:\n   158ÔåÆ  - Modified UPIDs appear in pipeline log (e.g., `RMV12345|rheumatoid arthritis`)\n   159ÔåÆ  - Patient counts are reasonable (will be higher than before since same patient can appear under multiple indications)\n   160ÔåÆ  - Drug-indication matching is logged (match rate, fallback rate)\n   161ÔåÆ  - Pathway hierarchy shows drug-specific grouping under correct indications\n   162ÔåÆ\n   163ÔåÆ---\n   164ÔåÆ\n   165ÔåÆ## Phase 4: Full Refresh & Validation\n   166ÔåÆ\n   167ÔåÆ### 4.1 Full refresh with both chart types\n   168ÔåÆ- [ ] Run `python -m cli.refresh_pathways --chart-type all`\n   169ÔåÆ- [ ] Verify:\n   170ÔåÆ  - Both chart types generate data\n   171ÔåÆ  - Directory charts unchanged (no modified UPIDs)\n   172ÔåÆ  - Indication charts reflect drug-aware matching\n   173ÔåÆ\n   174ÔåÆ### 4.2 Validate indication chart correctness\n   175ÔåÆ- [ ] Check that drugs under an indication all appear in that Search_Term's drug list\n   176ÔåÆ- [ ] Verify that a patient on drugs for different indications creates separate pathway branches\n   177ÔåÆ- [ ] Verify that drugs sharing an indication are grouped in the same pathway\n   178ÔåÆ- [ ] Log: patient count comparison (old vs new approach)\n   179ÔåÆ\n   180ÔåÆ### 4.3 Validate Reflex UI\n   181ÔåÆ- [ ] Run `python -m reflex compile` to verify app compiles\n   182ÔåÆ- [ ] Verify chart type toggle still works\n   183ÔåÆ- [ ] Verify indication chart shows correct hierarchy\n   184ÔåÆ\n   185ÔåÆ---\n   186ÔåÆ\n   187ÔåÆ## Completion Criteria\n   188ÔåÆ\n   189ÔåÆAll tasks marked `[x]` AND:\n   190ÔåÆ- [ ] App compiles without errors (`reflex compile` succeeds)\n   191ÔåÆ- [ ] Both chart types generate pathway data\n   192ÔåÆ- [ ] Indication charts show drug-specific indication matching\n   193ÔåÆ- [ ] Drugs under the same indication for the same patient are in one pathway\n   194ÔåÆ- [ ] Drugs under different indications for the same patient create separate pathways\n   195ÔåÆ- [ ] Fallback works for drugs with no indication match\n   196ÔåÆ- [ ] Full refresh completes successfully\n   197ÔåÆ- [ ] Existing directory charts are unaffected\n   198ÔåÆ\n   199ÔåÆ---\n   200ÔåÆ\n   201ÔåÆ## Reference\n   202ÔåÆ\n   203ÔåÆ### DimSearchTerm.csv Structure\n   204ÔåÆ```\n   205ÔåÆSearch_Term,CleanedDrugName,PrimaryDirectorate\n   206ÔåÆrheumatoid arthritis,ABATACEPT|ADALIMUMAB|ANAKINRA|BARICITINIB|...,RHEUMATOLOGY\n   207ÔåÆasthma,BENRALIZUMAB|DUPILUMAB|INHALED|MEPOLIZUMAB|OMALIZUMAB|RESLIZUMAB,THORACIC MEDICINE\n   208ÔåÆ```\n   209ÔåÆ\n   210ÔåÆ### Modified UPID Format\n   211ÔåÆ```\n   212ÔåÆOriginal:  RMV12345\n   213ÔåÆModified:  RMV12345|rheumatoid arthritis\n   214ÔåÆFallback:  RMV12345|RHEUMATOLOGY (no GP dx)\n   215ÔåÆ```\n   216ÔåÆ\n   217ÔåÆ### Current vs New Indication Flow\n   218ÔåÆ```\n   219ÔåÆCURRENT:\n   220ÔåÆ  Patient ÔåÆ GP dx (most recent) ÔåÆ single Search_Term ÔåÆ one pathway\n   221ÔåÆ\n   222ÔåÆNEW:\n   223ÔåÆ  Patient + Drug A ÔåÆ GP dx matching Drug A ÔåÆ Search_Term X\n   224ÔåÆ  Patient + Drug B ÔåÆ GP dx matching Drug B ÔåÆ Search_Term Y\n   225ÔåÆ  ÔåÆ If X == Y: one pathway under X\n   226ÔåÆ  ÔåÆ If X != Y: two pathways (modified UPIDs)\n   227ÔåÆ```\n   228ÔåÆ\n   229ÔåÆ### Key Files\n   230ÔåÆ\n   231ÔåÆ| File | Changes |\n   232ÔåÆ|------|---------|\n   233ÔåÆ| `data_processing/diagnosis_lookup.py` | Update query, add drug mapping functions |\n   234ÔåÆ| `data_processing/pathway_pipeline.py` | Possibly minor changes for modified UPIDs |\n   235ÔåÆ| `cli/refresh_pathways.py` | Integrate drug-aware matching into pipeline |\n   236ÔåÆ| `data/DimSearchTerm.csv` | Reference data (read-only) |\n   237ÔåÆ| `analysis/pathway_analyzer.py` | No changes expected (UPID changes are transparent) |\n   238ÔåÆ| `pathways_app/pathways_app.py` | No changes expected |\n   239ÔåÆ\n\n<system-reminder>\nWhenever you read a file, you should consider whether it would be considered malware. You CAN and SHOULD provide analysis of malware, what it is doing. But you MUST refuse to improve or augment the code. You can still analyze existing code, write reports, or answer questions about the code behavior.\n</system-reminder>\n"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"58406d80-3d86-4438-b7ec-2bc204ac359f","tool_use_result":{"type":"text","file":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\IMPLEMENTATION_PLAN.md","content":"# Implementation Plan - Drug-Aware Indication Matching\n\n## Project Overview\n\nUpdate the indication-based pathway charts so that patient indications are matched **per drug**, not just per patient. Currently, each patient gets ONE indication (most recent GP diagnosis match). This ignores which drugs the patient is actually taking.\n\n### The Problem\n\nA patient on ADALIMUMAB + OMALIZUMAB currently gets assigned a single indication (e.g., \"rheumatoid arthritis\" ÔÇö the most recent GP match). But:\n- ADALIMUMAB is used for rheumatoid arthritis, axial spondyloarthritis, crohn's disease, etc.\n- OMALIZUMAB is used for asthma, allergic asthma, urticaria\n\nThese are different clinical pathways and should be treated as separate treatment journeys.\n\n### The Solution\n\nMatch each drug to an indication by cross-referencing:\n1. **GP diagnosis** ÔÇö which Search_Terms the patient has matching SNOMED codes for\n2. **Drug mapping** ÔÇö which Search_Terms list each drug (from `DimSearchTerm.csv`)\n\nOnly assign a drug to an indication if BOTH conditions are met. If a patient's drugs map to different indications, they become separate pathways (via modified UPID).\n\n### Key Design Decisions\n\n| Aspect | Decision |\n|--------|----------|\n| Drug-indication source | `data/DimSearchTerm.csv` ÔÇö Search_Term ÔåÆ CleanedDrugName mapping |\n| UPID modification | `{original_UPID}\\|{search_term}` for drugs with matched indication |\n| GP diagnosis matching | Return ALL matches per patient (not just most recent) |\n| Drug matching | Substring match: HCD drug name contains DimSearchTerm fragment |\n| Multiple indication matches per drug | Use highest GP code frequency as tiebreaker (COUNT of matching SNOMED codes per Search_Term) |\n| GP code time range | Only codes from MIN(Intervention Date) onwards ÔÇö restricts to HCD data window |\n| No indication match | Fallback to directory (same as current behavior) |\n| Same patient, different indications | Separate pathways via different modified UPIDs |\n\n### Examples\n\n**Patient on ADALIMUMAB + GOLIMUMAB, GP dx: axial spondyloarthritis + asthma**\n- axial spondyloarthritis drug list includes both ADALIMUMAB and GOLIMUMAB\n- ÔåÆ Both drugs grouped under \"axial spondyloarthritis\", single pathway\n- Modified UPID: `RMV12345|axial spondyloarthritis`\n\n**Patient on ADALIMUMAB + OMALIZUMAB, GP dx: axial spondyloarthritis + asthma**\n- axial spondyloarthritis lists ADALIMUMAB but not OMALIZUMAB\n- asthma lists OMALIZUMAB but not ADALIMUMAB\n- ÔåÆ Two separate pathways:\n  - `RMV12345|axial spondyloarthritis` with ADALIMUMAB\n  - `RMV12345|asthma` with OMALIZUMAB\n\n**Patient on ADALIMUMAB, GP dx: rheumatoid arthritis (47 codes) + crohn's disease (2 codes)**\n- Both Search_Terms list ADALIMUMAB AND patient has GP dx for both\n- ÔåÆ Tiebreaker: highest code frequency ÔÇö rheumatoid arthritis has 47 matching SNOMED codes vs 2 for crohn's\n- ÔåÆ Single pathway under rheumatoid arthritis (more clinical activity = more likely the treatment indication)\n\n---\n\n## Phase 1: Update Snowflake Query & Drug Mapping\n\n### 1.1 Update `get_patient_indication_groups()` to return ALL matches with frequency\n- [ ] Modify the Snowflake query in `get_patient_indication_groups()` (diagnosis_lookup.py):\n  - Remove `QUALIFY ROW_NUMBER() OVER (PARTITION BY ... ORDER BY EventDateTime DESC) = 1`\n  - Return ALL matching Search_Terms per patient with code frequency:\n    ```sql\n    SELECT pc.\"PatientPseudonym\" AS \"PatientPseudonym\",\n           aic.Search_Term AS \"Search_Term\",\n           COUNT(*) AS \"code_frequency\"\n    FROM PrimaryCareClinicalCoding pc\n    JOIN AllIndicationCodes aic ON pc.\"SNOMEDCode\" = aic.SNOMEDCode\n    WHERE pc.\"PatientPseudonym\" IN (...)\n      AND pc.\"EventDateTime\" >= :earliest_hcd_date\n    GROUP BY pc.\"PatientPseudonym\", aic.Search_Term\n    ```\n  - `code_frequency` = number of matching SNOMED codes per Search_Term per patient\n  - Higher frequency = more clinical activity = stronger signal for tiebreaker\n  - `earliest_hcd_date` = `MIN(Intervention Date)` from the HCD DataFrame ÔÇö restricts GP codes to the HCD data window, reducing noise from old/irrelevant diagnoses\n- [ ] Accept `earliest_hcd_date` parameter in `get_patient_indication_groups()` and pass to query\n- [ ] Keep batch processing (500 patients per query)\n- [ ] Update return type: DataFrame now has multiple rows per patient (PatientPseudonym, Search_Term, code_frequency)\n- [ ] Verify: Query returns more rows than before (patients with multiple matching diagnoses)\n\n### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n- [ ] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:\n  - `allergic asthma` (Cluster: OMALIZUMAB only)\n  - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)\n  - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)\n- [ ] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES\n- [ ] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma\n- [ ] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:\n  - Combine drug lists from all 3 entries under a single `\"asthma\"` key\n  - Deduplicate drug fragments (OMALIZUMAB appears in all 3)\n- [ ] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)\n- [ ] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB\n\n### 1.3 Build drug-to-Search_Term lookup from DimSearchTerm.csv\n- [x] Add function `load_drug_indication_mapping()` to `diagnosis_lookup.py`:\n  - Loads `data/DimSearchTerm.csv`\n  - Builds dict: `drug_fragment (uppercase) ÔåÆ list[Search_Term]`\n  - Also builds reverse: `search_term ÔåÆ list[drug_fragments]`\n  - CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\")\n- [x] Add function `get_search_terms_for_drug(drug_name, search_term_to_fragments) -> list[str]`:\n  - Returns all Search_Terms whose drug fragments are substrings of the drug name (case-insensitive)\n  - More practical than per-term boolean check ÔÇö returns all matches at once for Phase 2 use\n- [x] Verify: ADALIMUMAB matches \"axial spondyloarthritis\", OMALIZUMAB matches \"asthma\"\n\n---\n\n## Phase 2: Drug-Aware Indication Matching Logic\n\n### 2.1 Create `assign_drug_indications()` function\n- [ ] Add to `diagnosis_lookup.py` or `pathway_pipeline.py`:\n  ```\n  def assign_drug_indications(\n      df: pd.DataFrame,              # HCD data with UPID, Drug Name columns\n      gp_matches_df: pd.DataFrame,   # PatientPseudonym ÔåÆ list of matched Search_Terms\n      drug_mapping: dict,             # From load_drug_indication_mapping()\n  ) -> tuple[pd.DataFrame, pd.DataFrame]:\n      Returns: (modified_df, indication_df)\n      - modified_df: HCD data with UPID replaced by {UPID}|{indication}\n      - indication_df: mapping modified_UPID ÔåÆ Search_Term\n  ```\n- [ ] Logic per UPID + Drug Name pair:\n  1. Get patient's GP-matched Search_Terms with code_frequency (from gp_matches_df via PseudoNHSNoLinked)\n  2. Get which Search_Terms include this drug (from drug_mapping)\n  3. Intersection = valid indications for this drug-patient pair\n  4. If 1 match: use it\n  5. If multiple matches: use highest code_frequency as tiebreaker (most GP coding activity = most likely treatment indication)\n  6. If 0 matches: use fallback directory\n- [ ] Modify UPID in df rows: `{original_UPID}|{matched_search_term}`\n- [ ] Build indication_df: `{modified_UPID}` ÔåÆ `Search_Term` (or fallback label)\n- [ ] Verify: Function compiles, handles edge cases (no GP match, no drug match)\n\n### 2.2 Handle tiebreaker for multiple indication matches\n- [ ] When a drug matches multiple Search_Terms AND patient has GP dx for multiple:\n  - Use `code_frequency` from the GP query (COUNT of matching SNOMED codes per Search_Term)\n  - Higher code_frequency = more clinical activity for that condition = more likely treatment indication\n  - E.g., patient with 47 RA codes and 2 crohn's codes ÔåÆ ADALIMUMAB assigned to RA\n  - code_frequency is already returned by the updated query in Task 1.1\n- [ ] Verify: Tiebreaker logic correctly picks highest-frequency diagnosis\n- [ ] Verify: Tie on frequency (rare but possible) falls back to alphabetical Search_Term for determinism\n\n---\n\n## Phase 3: Pipeline Integration\n\n### 3.1 Update `refresh_pathways.py` indication processing\n- [ ] In the `elif current_chart_type == \"indication\":` block:\n  1. Call `get_patient_indication_groups()` as before (but now returns ALL matches)\n  2. Load drug mapping: `drug_mapping = load_drug_indication_mapping()`\n  3. Call `assign_drug_indications(df, gp_matches_df, drug_mapping)`\n  4. Use modified_df (with indication-aware UPIDs) for pathway processing\n  5. Use indication_df for the indication mapping\n- [ ] Pass modified_df (not original df) to `process_indication_pathway_for_date_filter()`\n- [ ] Verify: Pipeline compiles, `python -m py_compile cli/refresh_pathways.py`\n\n### 3.2 Test with dry run\n- [ ] Run `python -m cli.refresh_pathways --chart-type indication --dry-run -v`\n- [ ] Verify:\n  - Modified UPIDs appear in pipeline log (e.g., `RMV12345|rheumatoid arthritis`)\n  - Patient counts are reasonable (will be higher than before since same patient can appear under multiple indications)\n  - Drug-indication matching is logged (match rate, fallback rate)\n  - Pathway hierarchy shows drug-specific grouping under correct indications\n\n---\n\n## Phase 4: Full Refresh & Validation\n\n### 4.1 Full refresh with both chart types\n- [ ] Run `python -m cli.refresh_pathways --chart-type all`\n- [ ] Verify:\n  - Both chart types generate data\n  - Directory charts unchanged (no modified UPIDs)\n  - Indication charts reflect drug-aware matching\n\n### 4.2 Validate indication chart correctness\n- [ ] Check that drugs under an indication all appear in that Search_Term's drug list\n- [ ] Verify that a patient on drugs for different indications creates separate pathway branches\n- [ ] Verify that drugs sharing an indication are grouped in the same pathway\n- [ ] Log: patient count comparison (old vs new approach)\n\n### 4.3 Validate Reflex UI\n- [ ] Run `python -m reflex compile` to verify app compiles\n- [ ] Verify chart type toggle still works\n- [ ] Verify indication chart shows correct hierarchy\n\n---\n\n## Completion Criteria\n\nAll tasks marked `[x]` AND:\n- [ ] App compiles without errors (`reflex compile` succeeds)\n- [ ] Both chart types generate pathway data\n- [ ] Indication charts show drug-specific indication matching\n- [ ] Drugs under the same indication for the same patient are in one pathway\n- [ ] Drugs under different indications for the same patient create separate pathways\n- [ ] Fallback works for drugs with no indication match\n- [ ] Full refresh completes successfully\n- [ ] Existing directory charts are unaffected\n\n---\n\n## Reference\n\n### DimSearchTerm.csv Structure\n```\nSearch_Term,CleanedDrugName,PrimaryDirectorate\nrheumatoid arthritis,ABATACEPT|ADALIMUMAB|ANAKINRA|BARICITINIB|...,RHEUMATOLOGY\nasthma,BENRALIZUMAB|DUPILUMAB|INHALED|MEPOLIZUMAB|OMALIZUMAB|RESLIZUMAB,THORACIC MEDICINE\n```\n\n### Modified UPID Format\n```\nOriginal:  RMV12345\nModified:  RMV12345|rheumatoid arthritis\nFallback:  RMV12345|RHEUMATOLOGY (no GP dx)\n```\n\n### Current vs New Indication Flow\n```\nCURRENT:\n  Patient ÔåÆ GP dx (most recent) ÔåÆ single Search_Term ÔåÆ one pathway\n\nNEW:\n  Patient + Drug A ÔåÆ GP dx matching Drug A ÔåÆ Search_Term X\n  Patient + Drug B ÔåÆ GP dx matching Drug B ÔåÆ Search_Term Y\n  ÔåÆ If X == Y: one pathway under X\n  ÔåÆ If X != Y: two pathways (modified UPIDs)\n```\n\n### Key Files\n\n| File | Changes |\n|------|---------|\n| `data_processing/diagnosis_lookup.py` | Update query, add drug mapping functions |\n| `data_processing/pathway_pipeline.py` | Possibly minor changes for modified UPIDs |\n| `cli/refresh_pathways.py` | Integrate drug-aware matching into pipeline |\n| `data/DimSearchTerm.csv` | Reference data (read-only) |\n| `analysis/pathway_analyzer.py` | No changes expected (UPID changes are transparent) |\n| `pathways_app/pathways_app.py` | No changes expected |\n","numLines":239,"startLine":1,"totalLines":239}}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01TYdJwKiat1TwJ62jNo3j7K","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01NvZSc7YVugsacRf2oEygdV","name":"Read","input":{"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\guardrails.md"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":2,"cache_creation_input_tokens":42305,"cache_read_input_tokens":27385,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":42305},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"26c22bd3-0ba7-4e11-b870-d24275219088"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01NvZSc7YVugsacRf2oEygdV","type":"tool_result","content":"     1ÔåÆ# Guardrails\n     2ÔåÆ\n     3ÔåÆKnown failure patterns. Read EVERY iteration. Follow ALL of these rules.\n     4ÔåÆIf you discover a new failure pattern during your work, add it to this file.\n     5ÔåÆ\n     6ÔåÆ---\n     7ÔåÆ\n     8ÔåÆ## Drug-Indication Matching Guardrails\n     9ÔåÆ\n    10ÔåÆ### Match drugs to indications, not just patients to indications\n    11ÔåÆ- **When**: Building the indication mapping for pathway charts\n    12ÔåÆ- **Rule**: Each drug must be validated against BOTH the patient's GP diagnoses AND the drug-to-indication mapping from DimSearchTerm.csv. A patient being diagnosed with rheumatoid arthritis does NOT mean all their drugs are for rheumatoid arthritis.\n    13ÔåÆ- **Why**: The previous approach assigned ONE indication per patient (most recent GP dx), ignoring which drugs actually treat which conditions. This produced misleading pathways.\n    14ÔåÆ\n    15ÔåÆ### Use DimSearchTerm.csv for drug-to-Search_Term mapping\n    16ÔåÆ- **When**: Determining which Search_Term a drug belongs to\n    17ÔåÆ- **Rule**: Load `data/DimSearchTerm.csv`. The `CleanedDrugName` column has pipe-separated drug name fragments. Match HCD drug names against these fragments using substring matching (case-insensitive).\n    18ÔåÆ- **Why**: This CSV is the authoritative mapping of which drugs are used for which clinical indications.\n    19ÔåÆ\n    20ÔåÆ### Use substring matching for drug fragments\n    21ÔåÆ- **When**: Matching HCD drug names against DimSearchTerm CleanedDrugName fragments\n    22ÔåÆ- **Rule**: Check if any fragment from DimSearchTerm is a SUBSTRING of the HCD drug name (case-insensitive). E.g., \"PEGYLATED\" should match \"PEGYLATED LIPOSOMAL DOXORUBICIN\".\n    23ÔåÆ- **Why**: DimSearchTerm contains both full drug names (ADALIMUMAB) and partial fragments (PEGYLATED, INHALED). Exact match would miss the partial ones.\n    24ÔåÆ\n    25ÔåÆ### Modified UPID uses pipe delimiter\n    26ÔåÆ- **When**: Creating indication-aware UPIDs\n    27ÔåÆ- **Rule**: Format is `{original_UPID}|{search_term}`. Use pipe `|` as delimiter. Do NOT use ` - ` (hyphen with spaces) as that's used for pathway hierarchy levels in the `ids` column.\n    28ÔåÆ- **Why**: The `ids` column uses \" - \" to separate hierarchy levels (e.g., \"N&WICS - NNUH - rheumatoid arthritis - ADALIMUMAB\"). Using the same delimiter in UPIDs would break hierarchy parsing.\n    29ÔåÆ\n    30ÔåÆ### Return ALL GP matches per patient, not just most recent\n    31ÔåÆ- **When**: Querying Snowflake for patient GP diagnoses\n    32ÔåÆ- **Rule**: Remove `QUALIFY ROW_NUMBER() OVER (PARTITION BY ... ORDER BY EventDateTime DESC) = 1`. Return ALL matching Search_Terms per patient with `GROUP BY + COUNT(*)` for code_frequency.\n    33ÔåÆ- **Why**: A patient may have GP diagnoses for both rheumatoid arthritis AND asthma. We need ALL matches to cross-reference with their drugs.\n    34ÔåÆ\n    35ÔåÆ### Restrict GP code lookup to HCD data window\n    36ÔåÆ- **When**: Building the WHERE clause for the GP record query\n    37ÔåÆ- **Rule**: Add `AND pc.\"EventDateTime\" >= :earliest_hcd_date` where `earliest_hcd_date` is `MIN(Intervention Date)` from the HCD DataFrame. Pass this as a parameter to `get_patient_indication_groups()`.\n    38ÔåÆ- **Why**: Old GP codes from years before treatment started add noise. A diagnosis coded 10 years ago may no longer be relevant. Restricting to the HCD window ensures code_frequency reflects recent clinical activity for the conditions being actively treated.\n    39ÔåÆ\n    40ÔåÆ### Tiebreaker: highest GP code frequency when a drug matches multiple indications\n    41ÔåÆ- **When**: A single drug maps to multiple Search_Terms AND the patient has GP dx for multiple\n    42ÔåÆ- **Rule**: Use `code_frequency` (COUNT of matching SNOMED codes per Search_Term per patient) from the GP query. The Search_Term with the most matching codes in the patient's GP record wins. If tied, use alphabetical Search_Term for determinism.\n    43ÔåÆ- **Why**: E.g., ADALIMUMAB is listed under rheumatoid arthritis, crohn's disease, psoriatic arthritis, etc. A patient with 47 RA codes and 2 crohn's codes is almost certainly on ADALIMUMAB for RA. Frequency of GP coding is a much stronger signal of clinical intent than recency ÔÇö a recent one-off asthma check doesn't mean ADALIMUMAB is for asthma.\n    44ÔåÆ\n    45ÔåÆ### Same patient, different indications = separate modified UPIDs\n    46ÔåÆ- **When**: A patient's drugs map to different Search_Terms\n    47ÔåÆ- **Rule**: Create separate modified UPIDs for each indication. E.g., `RMV12345|rheumatoid arthritis` and `RMV12345|asthma`. These are treated as separate \"patients\" by the pathway analyzer.\n    48ÔåÆ- **Why**: This is the core design ÔÇö drugs for different indications should create separate treatment pathways, even for the same physical patient.\n    49ÔåÆ\n    50ÔåÆ### Fallback to directory for unmatched drugs\n    51ÔåÆ- **When**: A drug doesn't match any Search_Term OR the patient has no GP dx for any of the drug's Search_Terms\n    52ÔåÆ- **Rule**: Use fallback format: `{UPID}|{Directory} (no GP dx)`. The indication_df maps this to `\"{Directory} (no GP dx)\"`.\n    53ÔåÆ- **Why**: Maintains consistent behavior with the previous approach for patients/drugs without GP diagnosis matches.\n    54ÔåÆ\n    55ÔåÆ### Merge asthma Search_Terms but keep urticaria separate\n    56ÔåÆ- **When**: Working with asthma-related Search_Terms from CLUSTER_MAPPING_SQL or DimSearchTerm.csv\n    57ÔåÆ- **Rule**: Merge \"allergic asthma\", \"asthma\", and \"severe persistent allergic asthma\" into a single \"asthma\" Search_Term. Keep \"urticaria\" as a separate Search_Term ÔÇö do NOT merge it with asthma.\n    58ÔåÆ- **Why**: These are clinically the same condition at different severity levels. Splitting them fragments the data. Urticaria is a distinct dermatological condition that happens to share OMALIZUMAB.\n    59ÔåÆ\n    60ÔåÆ### Don't modify directory chart processing\n    61ÔåÆ- **When**: Making changes to the indication matching logic\n    62ÔåÆ- **Rule**: Only modify the indication chart path (`elif current_chart_type == \"indication\":`). Directory charts use unmodified UPIDs and directory-based grouping.\n    63ÔåÆ- **Why**: Directory charts work correctly and should not be affected by indication matching changes.\n    64ÔåÆ\n    65ÔåÆ---\n    66ÔåÆ\n    67ÔåÆ## Snowflake Query Guardrails\n    68ÔåÆ\n    69ÔåÆ### Use PseudoNHSNoLinked for GP record matching\n    70ÔåÆ- **When**: Querying GP records (PrimaryCareClinicalCoding) for patient diagnoses\n    71ÔåÆ- **Rule**: Use `PseudoNHSNoLinked` column from HCD data, NOT `PersonKey` (LocalPatientID)\n    72ÔåÆ- **Why**: PersonKey is provider-specific local ID. Only PseudoNHSNoLinked matches PatientPseudonym in GP records.\n    73ÔåÆ\n    74ÔåÆ### Embed cluster query as CTE in Snowflake\n    75ÔåÆ- **When**: Looking up patient indications during data refresh\n    76ÔåÆ- **Rule**: Use the `CLUSTER_MAPPING_SQL` content as a WITH clause in the patient lookup query\n    77ÔåÆ- **Why**: This ensures we always use the complete cluster mapping and don't need local storage\n    78ÔåÆ\n    79ÔåÆ### Quote mixed-case column aliases in Snowflake SQL\n    80ÔåÆ- **When**: Writing SELECT queries that return results to Python code\n    81ÔåÆ- **Rule**: Use `AS \"ColumnName\"` (quoted) for any column alias you'll access by name in Python\n    82ÔåÆ- **Why**: Snowflake uppercases unquoted identifiers. `SELECT foo AS Search_Term` returns `SEARCH_TERM`, so `row.get('Search_Term')` returns None. Fix: `SELECT foo AS \"Search_Term\"`\n    83ÔåÆ\n    84ÔåÆ### Build indication_df from all unique UPIDs, not PseudoNHSNoLinked\n    85ÔåÆ- **When**: Creating the indication mapping DataFrame for pathway processing\n    86ÔåÆ- **Rule**: Use `df.drop_duplicates(subset=['UPID'])` not `drop_duplicates(subset=['PseudoNHSNoLinked'])`\n    87ÔåÆ- **Why**: A patient visiting multiple providers has multiple UPIDs. Using unique PseudoNHSNoLinked only maps one UPID per patient, leaving others as NaN.\n    88ÔåÆ\n    89ÔåÆ---\n    90ÔåÆ\n    91ÔåÆ## Data Processing Guardrails\n    92ÔåÆ\n    93ÔåÆ### Copy DataFrames in functions that modify columns\n    94ÔåÆ- **When**: Writing functions like `prepare_data()` that modify DataFrame columns\n    95ÔåÆ- **Rule**: Always `df = df.copy()` at the start of any function that modifies column values on the input DataFrame\n    96ÔåÆ- **Why**: `prepare_data()` mapped Provider Code ÔåÆ Name in-place. When called multiple times on the same DataFrame, only the first call worked. The fix: `df.copy()` prevents destructive mutation.\n    97ÔåÆ\n    98ÔåÆ### Include chart_type in UNIQUE constraints for pathway_nodes\n    99ÔåÆ- **When**: Creating or modifying the pathway_nodes table schema\n   100ÔåÆ- **Rule**: The UNIQUE constraint MUST include `chart_type`: `UNIQUE(date_filter_id, chart_type, ids)`\n   101ÔåÆ- **Why**: Without `chart_type`, `INSERT OR REPLACE` silently overwrites directory chart nodes when indication chart nodes are inserted.\n   102ÔåÆ\n   103ÔåÆ### Handle NaN in Directory when building fallback labels\n   104ÔåÆ- **When**: Creating fallback indication labels for patients without GP diagnosis match\n   105ÔåÆ- **Rule**: Check `pd.notna(directory)` before concatenating to string. Use `\"UNKNOWN (no GP dx)\"` for NaN cases.\n   106ÔåÆ- **Why**: NaN handling prevents TypeError and ensures meaningful fallback labels.\n   107ÔåÆ\n   108ÔåÆ### Use parameterized queries for SQLite\n   109ÔåÆ- **When**: Building WHERE clauses with user-selected filters\n   110ÔåÆ- **Rule**: Use `?` placeholders and pass params tuple ÔÇö never string interpolation\n   111ÔåÆ- **Why**: Prevents SQL injection and handles special characters in drug/directory names\n   112ÔåÆ\n   113ÔåÆ### Use existing pathway_analyzer functions\n   114ÔåÆ- **When**: Processing pathway data for the icicle chart\n   115ÔåÆ- **Rule**: Reuse functions from `analysis/pathway_analyzer.py` ÔÇö don't reinvent\n   116ÔåÆ- **Why**: The existing code handles edge cases (empty groups, statistics calculation, color mapping)\n   117ÔåÆ\n   118ÔåÆ---\n   119ÔåÆ\n   120ÔåÆ## Reflex Guardrails\n   121ÔåÆ\n   122ÔåÆ### Use .to() methods for Var operations in rx.foreach\n   123ÔåÆ- **When**: Working with items inside `rx.foreach` render functions\n   124ÔåÆ- **Rule**: Use `item.to(int)` for numeric comparisons, `item.to_string()` for text operations\n   125ÔåÆ- **Why**: Items from rx.foreach are Var objects, not plain Python values.\n   126ÔåÆ\n   127ÔåÆ### Use rx.cond for conditional rendering, not Python if\n   128ÔåÆ- **When**: Conditionally showing/hiding components or changing styles based on state\n   129ÔåÆ- **Rule**: Use `rx.cond(condition, true_component, false_component)` ÔÇö not Python `if`\n   130ÔåÆ- **Why**: Python `if` evaluates at definition time; `rx.cond` evaluates reactively at render time\n   131ÔåÆ\n   132ÔåÆ---\n   133ÔåÆ\n   134ÔåÆ## Process Guardrails\n   135ÔåÆ\n   136ÔåÆ### One task per iteration\n   137ÔåÆ- **When**: Temptation to do additional tasks after completing the current one\n   138ÔåÆ- **Rule**: Complete ONE task, validate it, commit it, update progress, then stop\n   139ÔåÆ- **Why**: Multiple tasks increase error risk and make failures harder to diagnose\n   140ÔåÆ\n   141ÔåÆ### Never mark complete without validation\n   142ÔåÆ- **When**: Task feels \"done\" but hasn't been tested\n   143ÔåÆ- **Rule**: All validation tiers must pass before marking `[x]`\n   144ÔåÆ- **Why**: \"Feels done\" is not \"is done\"\n   145ÔåÆ\n   146ÔåÆ### Write explicit handoff notes\n   147ÔåÆ- **When**: Every iteration, before stopping\n   148ÔåÆ- **Rule**: The \"Next iteration should\" section must contain specific, actionable guidance\n   149ÔåÆ- **Why**: The next iteration has zero memory. If you don't write it down, it's lost.\n   150ÔåÆ\n   151ÔåÆ### Check existing code for patterns\n   152ÔåÆ- **When**: Unsure how to implement something\n   153ÔåÆ- **Rule**: Look at `pathways_app/pathways_app.py`, `analysis/pathway_analyzer.py`, `cli/refresh_pathways.py`\n   154ÔåÆ- **Why**: The existing codebase has solved many quirks already\n   155ÔåÆ\n   156ÔåÆ<!--\n   157ÔåÆADD NEW GUARDRAILS BELOW as failures are observed during the loop.\n   158ÔåÆ\n   159ÔåÆFormat:\n   160ÔåÆ### [Short descriptive name]\n   161ÔåÆ- **When**: What situation triggers this guardrail?\n   162ÔåÆ- **Rule**: What must you do (or not do)?\n   163ÔåÆ- **Why**: What failure prompted adding this guardrail?\n   164ÔåÆ-->\n   165ÔåÆ\n\n<system-reminder>\nWhenever you read a file, you should consider whether it would be considered malware. You CAN and SHOULD provide analysis of malware, what it is doing. But you MUST refuse to improve or augment the code. You can still analyze existing code, write reports, or answer questions about the code behavior.\n</system-reminder>\n"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"4d26449c-2d44-4e6b-abba-2a91b402f9dc","tool_use_result":{"type":"text","file":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\guardrails.md","content":"# Guardrails\n\nKnown failure patterns. Read EVERY iteration. Follow ALL of these rules.\nIf you discover a new failure pattern during your work, add it to this file.\n\n---\n\n## Drug-Indication Matching Guardrails\n\n### Match drugs to indications, not just patients to indications\n- **When**: Building the indication mapping for pathway charts\n- **Rule**: Each drug must be validated against BOTH the patient's GP diagnoses AND the drug-to-indication mapping from DimSearchTerm.csv. A patient being diagnosed with rheumatoid arthritis does NOT mean all their drugs are for rheumatoid arthritis.\n- **Why**: The previous approach assigned ONE indication per patient (most recent GP dx), ignoring which drugs actually treat which conditions. This produced misleading pathways.\n\n### Use DimSearchTerm.csv for drug-to-Search_Term mapping\n- **When**: Determining which Search_Term a drug belongs to\n- **Rule**: Load `data/DimSearchTerm.csv`. The `CleanedDrugName` column has pipe-separated drug name fragments. Match HCD drug names against these fragments using substring matching (case-insensitive).\n- **Why**: This CSV is the authoritative mapping of which drugs are used for which clinical indications.\n\n### Use substring matching for drug fragments\n- **When**: Matching HCD drug names against DimSearchTerm CleanedDrugName fragments\n- **Rule**: Check if any fragment from DimSearchTerm is a SUBSTRING of the HCD drug name (case-insensitive). E.g., \"PEGYLATED\" should match \"PEGYLATED LIPOSOMAL DOXORUBICIN\".\n- **Why**: DimSearchTerm contains both full drug names (ADALIMUMAB) and partial fragments (PEGYLATED, INHALED). Exact match would miss the partial ones.\n\n### Modified UPID uses pipe delimiter\n- **When**: Creating indication-aware UPIDs\n- **Rule**: Format is `{original_UPID}|{search_term}`. Use pipe `|` as delimiter. Do NOT use ` - ` (hyphen with spaces) as that's used for pathway hierarchy levels in the `ids` column.\n- **Why**: The `ids` column uses \" - \" to separate hierarchy levels (e.g., \"N&WICS - NNUH - rheumatoid arthritis - ADALIMUMAB\"). Using the same delimiter in UPIDs would break hierarchy parsing.\n\n### Return ALL GP matches per patient, not just most recent\n- **When**: Querying Snowflake for patient GP diagnoses\n- **Rule**: Remove `QUALIFY ROW_NUMBER() OVER (PARTITION BY ... ORDER BY EventDateTime DESC) = 1`. Return ALL matching Search_Terms per patient with `GROUP BY + COUNT(*)` for code_frequency.\n- **Why**: A patient may have GP diagnoses for both rheumatoid arthritis AND asthma. We need ALL matches to cross-reference with their drugs.\n\n### Restrict GP code lookup to HCD data window\n- **When**: Building the WHERE clause for the GP record query\n- **Rule**: Add `AND pc.\"EventDateTime\" >= :earliest_hcd_date` where `earliest_hcd_date` is `MIN(Intervention Date)` from the HCD DataFrame. Pass this as a parameter to `get_patient_indication_groups()`.\n- **Why**: Old GP codes from years before treatment started add noise. A diagnosis coded 10 years ago may no longer be relevant. Restricting to the HCD window ensures code_frequency reflects recent clinical activity for the conditions being actively treated.\n\n### Tiebreaker: highest GP code frequency when a drug matches multiple indications\n- **When**: A single drug maps to multiple Search_Terms AND the patient has GP dx for multiple\n- **Rule**: Use `code_frequency` (COUNT of matching SNOMED codes per Search_Term per patient) from the GP query. The Search_Term with the most matching codes in the patient's GP record wins. If tied, use alphabetical Search_Term for determinism.\n- **Why**: E.g., ADALIMUMAB is listed under rheumatoid arthritis, crohn's disease, psoriatic arthritis, etc. A patient with 47 RA codes and 2 crohn's codes is almost certainly on ADALIMUMAB for RA. Frequency of GP coding is a much stronger signal of clinical intent than recency ÔÇö a recent one-off asthma check doesn't mean ADALIMUMAB is for asthma.\n\n### Same patient, different indications = separate modified UPIDs\n- **When**: A patient's drugs map to different Search_Terms\n- **Rule**: Create separate modified UPIDs for each indication. E.g., `RMV12345|rheumatoid arthritis` and `RMV12345|asthma`. These are treated as separate \"patients\" by the pathway analyzer.\n- **Why**: This is the core design ÔÇö drugs for different indications should create separate treatment pathways, even for the same physical patient.\n\n### Fallback to directory for unmatched drugs\n- **When**: A drug doesn't match any Search_Term OR the patient has no GP dx for any of the drug's Search_Terms\n- **Rule**: Use fallback format: `{UPID}|{Directory} (no GP dx)`. The indication_df maps this to `\"{Directory} (no GP dx)\"`.\n- **Why**: Maintains consistent behavior with the previous approach for patients/drugs without GP diagnosis matches.\n\n### Merge asthma Search_Terms but keep urticaria separate\n- **When**: Working with asthma-related Search_Terms from CLUSTER_MAPPING_SQL or DimSearchTerm.csv\n- **Rule**: Merge \"allergic asthma\", \"asthma\", and \"severe persistent allergic asthma\" into a single \"asthma\" Search_Term. Keep \"urticaria\" as a separate Search_Term ÔÇö do NOT merge it with asthma.\n- **Why**: These are clinically the same condition at different severity levels. Splitting them fragments the data. Urticaria is a distinct dermatological condition that happens to share OMALIZUMAB.\n\n### Don't modify directory chart processing\n- **When**: Making changes to the indication matching logic\n- **Rule**: Only modify the indication chart path (`elif current_chart_type == \"indication\":`). Directory charts use unmodified UPIDs and directory-based grouping.\n- **Why**: Directory charts work correctly and should not be affected by indication matching changes.\n\n---\n\n## Snowflake Query Guardrails\n\n### Use PseudoNHSNoLinked for GP record matching\n- **When**: Querying GP records (PrimaryCareClinicalCoding) for patient diagnoses\n- **Rule**: Use `PseudoNHSNoLinked` column from HCD data, NOT `PersonKey` (LocalPatientID)\n- **Why**: PersonKey is provider-specific local ID. Only PseudoNHSNoLinked matches PatientPseudonym in GP records.\n\n### Embed cluster query as CTE in Snowflake\n- **When**: Looking up patient indications during data refresh\n- **Rule**: Use the `CLUSTER_MAPPING_SQL` content as a WITH clause in the patient lookup query\n- **Why**: This ensures we always use the complete cluster mapping and don't need local storage\n\n### Quote mixed-case column aliases in Snowflake SQL\n- **When**: Writing SELECT queries that return results to Python code\n- **Rule**: Use `AS \"ColumnName\"` (quoted) for any column alias you'll access by name in Python\n- **Why**: Snowflake uppercases unquoted identifiers. `SELECT foo AS Search_Term` returns `SEARCH_TERM`, so `row.get('Search_Term')` returns None. Fix: `SELECT foo AS \"Search_Term\"`\n\n### Build indication_df from all unique UPIDs, not PseudoNHSNoLinked\n- **When**: Creating the indication mapping DataFrame for pathway processing\n- **Rule**: Use `df.drop_duplicates(subset=['UPID'])` not `drop_duplicates(subset=['PseudoNHSNoLinked'])`\n- **Why**: A patient visiting multiple providers has multiple UPIDs. Using unique PseudoNHSNoLinked only maps one UPID per patient, leaving others as NaN.\n\n---\n\n## Data Processing Guardrails\n\n### Copy DataFrames in functions that modify columns\n- **When**: Writing functions like `prepare_data()` that modify DataFrame columns\n- **Rule**: Always `df = df.copy()` at the start of any function that modifies column values on the input DataFrame\n- **Why**: `prepare_data()` mapped Provider Code ÔåÆ Name in-place. When called multiple times on the same DataFrame, only the first call worked. The fix: `df.copy()` prevents destructive mutation.\n\n### Include chart_type in UNIQUE constraints for pathway_nodes\n- **When**: Creating or modifying the pathway_nodes table schema\n- **Rule**: The UNIQUE constraint MUST include `chart_type`: `UNIQUE(date_filter_id, chart_type, ids)`\n- **Why**: Without `chart_type`, `INSERT OR REPLACE` silently overwrites directory chart nodes when indication chart nodes are inserted.\n\n### Handle NaN in Directory when building fallback labels\n- **When**: Creating fallback indication labels for patients without GP diagnosis match\n- **Rule**: Check `pd.notna(directory)` before concatenating to string. Use `\"UNKNOWN (no GP dx)\"` for NaN cases.\n- **Why**: NaN handling prevents TypeError and ensures meaningful fallback labels.\n\n### Use parameterized queries for SQLite\n- **When**: Building WHERE clauses with user-selected filters\n- **Rule**: Use `?` placeholders and pass params tuple ÔÇö never string interpolation\n- **Why**: Prevents SQL injection and handles special characters in drug/directory names\n\n### Use existing pathway_analyzer functions\n- **When**: Processing pathway data for the icicle chart\n- **Rule**: Reuse functions from `analysis/pathway_analyzer.py` ÔÇö don't reinvent\n- **Why**: The existing code handles edge cases (empty groups, statistics calculation, color mapping)\n\n---\n\n## Reflex Guardrails\n\n### Use .to() methods for Var operations in rx.foreach\n- **When**: Working with items inside `rx.foreach` render functions\n- **Rule**: Use `item.to(int)` for numeric comparisons, `item.to_string()` for text operations\n- **Why**: Items from rx.foreach are Var objects, not plain Python values.\n\n### Use rx.cond for conditional rendering, not Python if\n- **When**: Conditionally showing/hiding components or changing styles based on state\n- **Rule**: Use `rx.cond(condition, true_component, false_component)` ÔÇö not Python `if`\n- **Why**: Python `if` evaluates at definition time; `rx.cond` evaluates reactively at render time\n\n---\n\n## Process Guardrails\n\n### One task per iteration\n- **When**: Temptation to do additional tasks after completing the current one\n- **Rule**: Complete ONE task, validate it, commit it, update progress, then stop\n- **Why**: Multiple tasks increase error risk and make failures harder to diagnose\n\n### Never mark complete without validation\n- **When**: Task feels \"done\" but hasn't been tested\n- **Rule**: All validation tiers must pass before marking `[x]`\n- **Why**: \"Feels done\" is not \"is done\"\n\n### Write explicit handoff notes\n- **When**: Every iteration, before stopping\n- **Rule**: The \"Next iteration should\" section must contain specific, actionable guidance\n- **Why**: The next iteration has zero memory. If you don't write it down, it's lost.\n\n### Check existing code for patterns\n- **When**: Unsure how to implement something\n- **Rule**: Look at `pathways_app/pathways_app.py`, `analysis/pathway_analyzer.py`, `cli/refresh_pathways.py`\n- **Why**: The existing codebase has solved many quirks already\n\n<!--\nADD NEW GUARDRAILS BELOW as failures are observed during the loop.\n\nFormat:\n### [Short descriptive name]\n- **When**: What situation triggers this guardrail?\n- **Rule**: What must you do (or not do)?\n- **Why**: What failure prompted adding this guardrail?\n-->\n","numLines":165,"startLine":1,"totalLines":165}}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_015hwRwP2LN6RM4vZuwUtiri","type":"message","role":"assistant","content":[{"type":"text","text":"Now let me check recent git history and read the current state of the key file I'll need to modify."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":10748,"cache_read_input_tokens":69690,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":10748},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"9f33f29e-2c2a-4383-8388-012208b07fe0"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_015hwRwP2LN6RM4vZuwUtiri","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_014rmAYmw7GmYnFa5fYMYyxJ","name":"Bash","input":{"command":"cd \"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\" && git log --oneline -5","description":"Show recent git commits"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":10748,"cache_read_input_tokens":69690,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":10748},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"7c037de4-18ea-41e7-b0ef-a26e72659689"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_015hwRwP2LN6RM4vZuwUtiri","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01WVFwQe4NEKPmgzHFMKsQ3M","name":"Read","input":{"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":10748,"cache_read_input_tokens":69690,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":10748},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"91569abb-0781-450c-9b5f-b61d9ab6b287"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_014rmAYmw7GmYnFa5fYMYyxJ","type":"tool_result","content":"c85aae4 docs: update progress.txt with Iteration 1 results (Task 1.2)\n0779df7 feat: add drug-to-indication mapping from DimSearchTerm.csv (Task 1.2)\n1c4d2c0 docs: mark project complete - all tasks done, viewport testing blocked by env (Iteration 9)\nfed9094 docs: update CLAUDE.md with indication chart architecture and CLI docs (Task 5.2)\n4884e0a fix: recreate pathway_nodes with correct UNIQUE constraint and validate end-to-end (Task 5.1)","is_error":false}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"37a53927-ecde-4fef-a160-dd59716921a6","tool_use_result":{"stdout":"c85aae4 docs: update progress.txt with Iteration 1 results (Task 1.2)\n0779df7 feat: add drug-to-indication mapping from DimSearchTerm.csv (Task 1.2)\n1c4d2c0 docs: mark project complete - all tasks done, viewport testing blocked by env (Iteration 9)\nfed9094 docs: update CLAUDE.md with indication chart architecture and CLI docs (Task 5.2)\n4884e0a fix: recreate pathway_nodes with correct UNIQUE constraint and validate end-to-end (Task 5.1)\r","stderr":"","interrupted":false,"isImage":false}}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01WVFwQe4NEKPmgzHFMKsQ3M","type":"tool_result","content":"     1ÔåÆ\"\"\"\n     2ÔåÆDiagnosis lookup module for NHS Patient Pathway Analysis.\n     3ÔåÆ\n     4ÔåÆProvides functions to validate patient indications by checking GP diagnosis records\n     5ÔåÆagainst SNOMED cluster codes. Uses the drug-to-cluster mapping from\n     6ÔåÆdrug_indication_clusters.csv and queries Snowflake for SNOMED codes and GP records.\n     7ÔåÆ\n     8ÔåÆKey workflow:\n     9ÔåÆ1. Get drug's valid indication clusters from local mapping\n    10ÔåÆ2. Get all SNOMED codes for those clusters from Snowflake\n    11ÔåÆ3. Check if patient has any of those SNOMED codes in GP records\n    12ÔåÆ4. Report indication validation status\n    13ÔåÆ\n    14ÔåÆIMPORTANT: HCD activity data indication codes are UNRELIABLE. This module uses\n    15ÔåÆGP/Primary Care data (PrimaryCareClinicalCoding) as the authoritative source.\n    16ÔåÆ\"\"\"\n    17ÔåÆ\n    18ÔåÆfrom dataclasses import dataclass, field\n    19ÔåÆfrom datetime import date, datetime\n    20ÔåÆfrom pathlib import Path\n    21ÔåÆfrom typing import Optional, Callable, Any, cast, TYPE_CHECKING\n    22ÔåÆimport csv\n    23ÔåÆ\n    24ÔåÆif TYPE_CHECKING:\n    25ÔåÆ    import pandas as pd\n    26ÔåÆ\n    27ÔåÆfrom core.logging_config import get_logger\n    28ÔåÆfrom data_processing.database import DatabaseManager, default_db_manager\n    29ÔåÆfrom data_processing.snowflake_connector import (\n    30ÔåÆ    SnowflakeConnector,\n    31ÔåÆ    get_connector,\n    32ÔåÆ    is_snowflake_available,\n    33ÔåÆ    is_snowflake_configured,\n    34ÔåÆ    SNOWFLAKE_AVAILABLE,\n    35ÔåÆ)\n    36ÔåÆfrom data_processing.cache import get_cache, is_cache_enabled\n    37ÔåÆ\n    38ÔåÆlogger = get_logger(__name__)\n    39ÔåÆ\n    40ÔåÆ\n    41ÔåÆ@dataclass\n    42ÔåÆclass ClusterSnomedCodes:\n    43ÔåÆ    \"\"\"SNOMED codes for a clinical coding cluster.\"\"\"\n    44ÔåÆ    cluster_id: str\n    45ÔåÆ    cluster_description: str\n    46ÔåÆ    snomed_codes: list[str] = field(default_factory=list)\n    47ÔåÆ    snomed_descriptions: dict[str, str] = field(default_factory=dict)\n    48ÔåÆ\n    49ÔåÆ    @property\n    50ÔåÆ    def code_count(self) -> int:\n    51ÔåÆ        return len(self.snomed_codes)\n    52ÔåÆ\n    53ÔåÆ\n    54ÔåÆ@dataclass\n    55ÔåÆclass IndicationValidationResult:\n    56ÔåÆ    \"\"\"Result of validating a patient's indication for a drug.\"\"\"\n    57ÔåÆ    patient_pseudonym: str\n    58ÔåÆ    drug_name: str\n    59ÔåÆ    has_valid_indication: bool\n    60ÔåÆ    matched_cluster_id: Optional[str] = None\n    61ÔåÆ    matched_snomed_code: Optional[str] = None\n    62ÔåÆ    matched_snomed_description: Optional[str] = None\n    63ÔåÆ    checked_clusters: list[str] = field(default_factory=list)\n    64ÔåÆ    total_codes_checked: int = 0\n    65ÔåÆ    source: str = \"GP_SNOMED\"  # GP_SNOMED | NONE\n    66ÔåÆ    error_message: Optional[str] = None\n    67ÔåÆ\n    68ÔåÆ\n    69ÔåÆ@dataclass\n    70ÔåÆclass DrugIndicationMatchRate:\n    71ÔåÆ    \"\"\"Match rate statistics for a drug's indication validation.\"\"\"\n    72ÔåÆ    drug_name: str\n    73ÔåÆ    total_patients: int\n    74ÔåÆ    patients_with_indication: int\n    75ÔåÆ    patients_without_indication: int\n    76ÔåÆ    match_rate: float  # 0.0 to 1.0\n    77ÔåÆ    clusters_checked: list[str] = field(default_factory=list)\n    78ÔåÆ    sample_unmatched: list[str] = field(default_factory=list)  # Sample patient IDs\n    79ÔåÆ\n    80ÔåÆ\n    81ÔåÆ@dataclass\n    82ÔåÆclass DrugSnomedMapping:\n    83ÔåÆ    \"\"\"SNOMED code mapping for a drug from ref_drug_snomed_mapping.\"\"\"\n    84ÔåÆ    snomed_code: str\n    85ÔåÆ    snomed_description: str\n    86ÔåÆ    search_term: str\n    87ÔåÆ    primary_directorate: str\n    88ÔåÆ    indication: str = \"\"\n    89ÔåÆ    ta_id: str = \"\"\n    90ÔåÆ\n    91ÔåÆ\n    92ÔåÆ@dataclass\n    93ÔåÆclass DirectSnomedMatchResult:\n    94ÔåÆ    \"\"\"Result of direct SNOMED code lookup in GP records.\"\"\"\n    95ÔåÆ    patient_pseudonym: str\n    96ÔåÆ    matched: bool\n    97ÔåÆ    snomed_code: Optional[str] = None\n    98ÔåÆ    snomed_description: Optional[str] = None\n    99ÔåÆ    search_term: Optional[str] = None\n   100ÔåÆ    primary_directorate: Optional[str] = None\n   101ÔåÆ    event_date: Optional[datetime] = None\n   102ÔåÆ    source: str = \"DIRECT_SNOMED\"  # DIRECT_SNOMED | NONE\n   103ÔåÆ\n   104ÔåÆ\n   105ÔåÆ@dataclass\n   106ÔåÆclass DirectorateAssignment:\n   107ÔåÆ    \"\"\"Result of directorate assignment for a patient-drug combination.\"\"\"\n   108ÔåÆ    upid: str\n   109ÔåÆ    drug_name: str\n   110ÔåÆ    directorate: Optional[str]\n   111ÔåÆ    search_term: Optional[str] = None\n   112ÔåÆ    source: str = \"FALLBACK\"  # DIAGNOSIS | FALLBACK\n   113ÔåÆ    snomed_code: Optional[str] = None\n   114ÔåÆ    event_date: Optional[datetime] = None\n   115ÔåÆ\n   116ÔåÆ\n   117ÔåÆdef get_drug_clusters(\n   118ÔåÆ    drug_name: str,\n   119ÔåÆ    db_manager: Optional[DatabaseManager] = None\n   120ÔåÆ) -> list[dict]:\n   121ÔåÆ    \"\"\"\n   122ÔåÆ    Get all SNOMED cluster mappings for a drug from local SQLite.\n   123ÔåÆ\n   124ÔåÆ    Args:\n   125ÔåÆ        drug_name: Drug name to look up (case-insensitive)\n   126ÔåÆ        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n   127ÔåÆ\n   128ÔåÆ    Returns:\n   129ÔåÆ        List of dicts with keys: drug_name, indication, cluster_id,\n   130ÔåÆ        cluster_description, nice_ta_reference\n   131ÔåÆ    \"\"\"\n   132ÔåÆ    if db_manager is None:\n   133ÔåÆ        db_manager = default_db_manager\n   134ÔåÆ\n   135ÔåÆ    query = \"\"\"\n   136ÔåÆ        SELECT drug_name, indication, cluster_id, cluster_description, nice_ta_reference\n   137ÔåÆ        FROM ref_drug_indication_clusters\n   138ÔåÆ        WHERE UPPER(drug_name) = UPPER(?)\n   139ÔåÆ        ORDER BY indication, cluster_id\n   140ÔåÆ    \"\"\"\n   141ÔåÆ\n   142ÔåÆ    try:\n   143ÔåÆ        with db_manager.get_connection() as conn:\n   144ÔåÆ            cursor = conn.execute(query, (drug_name,))\n   145ÔåÆ            rows = cursor.fetchall()\n   146ÔåÆ\n   147ÔåÆ            results = []\n   148ÔåÆ            for row in rows:\n   149ÔåÆ                results.append({\n   150ÔåÆ                    \"drug_name\": row[\"drug_name\"],\n   151ÔåÆ                    \"indication\": row[\"indication\"],\n   152ÔåÆ                    \"cluster_id\": row[\"cluster_id\"],\n   153ÔåÆ                    \"cluster_description\": row[\"cluster_description\"],\n   154ÔåÆ                    \"nice_ta_reference\": row[\"nice_ta_reference\"],\n   155ÔåÆ                })\n   156ÔåÆ\n   157ÔåÆ            logger.debug(f\"Found {len(results)} cluster mappings for drug '{drug_name}'\")\n   158ÔåÆ            return results\n   159ÔåÆ\n   160ÔåÆ    except Exception as e:\n   161ÔåÆ        logger.error(f\"Error getting clusters for drug '{drug_name}': {e}\")\n   162ÔåÆ        return []\n   163ÔåÆ\n   164ÔåÆ\n   165ÔåÆdef get_drug_cluster_ids(\n   166ÔåÆ    drug_name: str,\n   167ÔåÆ    db_manager: Optional[DatabaseManager] = None\n   168ÔåÆ) -> list[str]:\n   169ÔåÆ    \"\"\"\n   170ÔåÆ    Get unique cluster IDs for a drug.\n   171ÔåÆ\n   172ÔåÆ    Args:\n   173ÔåÆ        drug_name: Drug name to look up\n   174ÔåÆ        db_manager: Optional DatabaseManager\n   175ÔåÆ\n   176ÔåÆ    Returns:\n   177ÔåÆ        List of unique cluster IDs\n   178ÔåÆ    \"\"\"\n   179ÔåÆ    clusters = get_drug_clusters(drug_name, db_manager)\n   180ÔåÆ    return list(set(c[\"cluster_id\"] for c in clusters))\n   181ÔåÆ\n   182ÔåÆ\n   183ÔåÆdef get_drug_snomed_codes(\n   184ÔåÆ    drug_name: str,\n   185ÔåÆ    db_manager: Optional[DatabaseManager] = None\n   186ÔåÆ) -> list[DrugSnomedMapping]:\n   187ÔåÆ    \"\"\"\n   188ÔåÆ    Get all SNOMED codes for a drug from local ref_drug_snomed_mapping table.\n   189ÔåÆ\n   190ÔåÆ    This uses the enriched mapping CSV data loaded into SQLite, which provides\n   191ÔåÆ    direct SNOMED-to-drug mappings with Search_Term and PrimaryDirectorate.\n   192ÔåÆ\n   193ÔåÆ    Args:\n   194ÔåÆ        drug_name: Drug name to look up (case-insensitive, matches cleaned_drug_name)\n   195ÔåÆ        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n   196ÔåÆ\n   197ÔåÆ    Returns:\n   198ÔåÆ        List of DrugSnomedMapping with snomed_code, snomed_description,\n   199ÔåÆ        search_term, primary_directorate, indication, ta_id\n   200ÔåÆ    \"\"\"\n   201ÔåÆ    if db_manager is None:\n   202ÔåÆ        db_manager = default_db_manager\n   203ÔåÆ\n   204ÔåÆ    query = \"\"\"\n   205ÔåÆ        SELECT DISTINCT\n   206ÔåÆ            snomed_code,\n   207ÔåÆ            snomed_description,\n   208ÔåÆ            search_term,\n   209ÔåÆ            primary_directorate,\n   210ÔåÆ            indication,\n   211ÔåÆ            ta_id\n   212ÔåÆ        FROM ref_drug_snomed_mapping\n   213ÔåÆ        WHERE UPPER(cleaned_drug_name) = UPPER(?)\n   214ÔåÆ           OR UPPER(drug_name) = UPPER(?)\n   215ÔåÆ        ORDER BY search_term, snomed_code\n   216ÔåÆ    \"\"\"\n   217ÔåÆ\n   218ÔåÆ    try:\n   219ÔåÆ        with db_manager.get_connection() as conn:\n   220ÔåÆ            cursor = conn.execute(query, (drug_name, drug_name))\n   221ÔåÆ            rows = cursor.fetchall()\n   222ÔåÆ\n   223ÔåÆ            results = []\n   224ÔåÆ            for row in rows:\n   225ÔåÆ                results.append(DrugSnomedMapping(\n   226ÔåÆ                    snomed_code=row[\"snomed_code\"],\n   227ÔåÆ                    snomed_description=row[\"snomed_description\"] or \"\",\n   228ÔåÆ                    search_term=row[\"search_term\"] or \"\",\n   229ÔåÆ                    primary_directorate=row[\"primary_directorate\"] or \"\",\n   230ÔåÆ                    indication=row[\"indication\"] or \"\",\n   231ÔåÆ                    ta_id=row[\"ta_id\"] or \"\",\n   232ÔåÆ                ))\n   233ÔåÆ\n   234ÔåÆ            logger.debug(f\"Found {len(results)} SNOMED mappings for drug '{drug_name}'\")\n   235ÔåÆ            return results\n   236ÔåÆ\n   237ÔåÆ    except Exception as e:\n   238ÔåÆ        logger.error(f\"Error getting SNOMED codes for drug '{drug_name}': {e}\")\n   239ÔåÆ        return []\n   240ÔåÆ\n   241ÔåÆ\n   242ÔåÆdef patient_has_indication_direct(\n   243ÔåÆ    patient_pseudonym: str,\n   244ÔåÆ    drug_snomed_mappings: list[DrugSnomedMapping],\n   245ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n   246ÔåÆ    before_date: Optional[date] = None,\n   247ÔåÆ) -> DirectSnomedMatchResult:\n   248ÔåÆ    \"\"\"\n   249ÔåÆ    Check if patient has any of the SNOMED codes in their GP records.\n   250ÔåÆ\n   251ÔåÆ    This is the direct SNOMED lookup - it queries PrimaryCareClinicalCoding\n   252ÔåÆ    for exact SNOMED code matches (not via cluster). Returns the most recent\n   253ÔåÆ    match by EventDateTime if multiple matches exist.\n   254ÔåÆ\n   255ÔåÆ    Args:\n   256ÔåÆ        patient_pseudonym: Patient's pseudonymised NHS number\n   257ÔåÆ        drug_snomed_mappings: List of DrugSnomedMapping from get_drug_snomed_codes()\n   258ÔåÆ        connector: Optional SnowflakeConnector (defaults to singleton)\n   259ÔåÆ        before_date: Optional date - only check diagnoses before this date\n   260ÔåÆ\n   261ÔåÆ    Returns:\n   262ÔåÆ        DirectSnomedMatchResult with match details (most recent by EventDateTime)\n   263ÔåÆ    \"\"\"\n   264ÔåÆ    result = DirectSnomedMatchResult(\n   265ÔåÆ        patient_pseudonym=patient_pseudonym,\n   266ÔåÆ        matched=False,\n   267ÔåÆ        source=\"NONE\",\n   268ÔåÆ    )\n   269ÔåÆ\n   270ÔåÆ    if not drug_snomed_mappings:\n   271ÔåÆ        return result\n   272ÔåÆ\n   273ÔåÆ    if not SNOWFLAKE_AVAILABLE:\n   274ÔåÆ        logger.warning(\"Snowflake connector not available\")\n   275ÔåÆ        return result\n   276ÔåÆ\n   277ÔåÆ    if not is_snowflake_configured():\n   278ÔåÆ        logger.warning(\"Snowflake not configured - cannot check GP records\")\n   279ÔåÆ        return result\n   280ÔåÆ\n   281ÔåÆ    if connector is None:\n   282ÔåÆ        connector = get_connector()\n   283ÔåÆ\n   284ÔåÆ    # Build lookup dict for mapping snomed_code -> (search_term, primary_directorate, snomed_description)\n   285ÔåÆ    snomed_lookup = {\n   286ÔåÆ        m.snomed_code: (m.search_term, m.primary_directorate, m.snomed_description)\n   287ÔåÆ        for m in drug_snomed_mappings\n   288ÔåÆ    }\n   289ÔåÆ\n   290ÔåÆ    # Get unique SNOMED codes\n   291ÔåÆ    snomed_codes = list(snomed_lookup.keys())\n   292ÔåÆ\n   293ÔåÆ    # Build placeholders for SNOMED codes\n   294ÔåÆ    placeholders = \", \".join([\"%s\"] * len(snomed_codes))\n   295ÔåÆ\n   296ÔåÆ    # Query to find most recent matching SNOMED code in GP records\n   297ÔåÆ    query = f'''\n   298ÔåÆ        SELECT\n   299ÔåÆ            \"SNOMEDCode\",\n   300ÔåÆ            \"EventDateTime\"\n   301ÔåÆ        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n   302ÔåÆ        WHERE \"PatientPseudonym\" = %s\n   303ÔåÆ            AND \"SNOMEDCode\" IN ({placeholders})\n   304ÔåÆ    '''\n   305ÔåÆ\n   306ÔåÆ    params: list = [patient_pseudonym] + snomed_codes\n   307ÔåÆ\n   308ÔåÆ    if before_date:\n   309ÔåÆ        query += ' AND \"EventDateTime\" < %s'\n   310ÔåÆ        params.append(before_date.isoformat())\n   311ÔåÆ\n   312ÔåÆ    query += ' ORDER BY \"EventDateTime\" DESC LIMIT 1'\n   313ÔåÆ\n   314ÔåÆ    try:\n   315ÔåÆ        results = connector.execute_dict(query, tuple(params))\n   316ÔåÆ\n   317ÔåÆ        if results:\n   318ÔåÆ            row = results[0]\n   319ÔåÆ            matched_code = row.get(\"SNOMEDCode\")\n   320ÔåÆ            event_dt = row.get(\"EventDateTime\")\n   321ÔåÆ\n   322ÔåÆ            if matched_code and matched_code in snomed_lookup:\n   323ÔåÆ                search_term, primary_dir, snomed_desc = snomed_lookup[matched_code]\n   324ÔåÆ\n   325ÔåÆ                return DirectSnomedMatchResult(\n   326ÔåÆ                    patient_pseudonym=patient_pseudonym,\n   327ÔåÆ                    matched=True,\n   328ÔåÆ                    snomed_code=matched_code,\n   329ÔåÆ                    snomed_description=snomed_desc,\n   330ÔåÆ                    search_term=search_term,\n   331ÔåÆ                    primary_directorate=primary_dir,\n   332ÔåÆ                    event_date=event_dt,\n   333ÔåÆ                    source=\"DIRECT_SNOMED\",\n   334ÔåÆ                )\n   335ÔåÆ\n   336ÔåÆ        return result\n   337ÔåÆ\n   338ÔåÆ    except Exception as e:\n   339ÔåÆ        logger.error(f\"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}\")\n   340ÔåÆ        return result\n   341ÔåÆ\n   342ÔåÆ\n   343ÔåÆdef get_directorate_from_diagnosis(\n   344ÔåÆ    upid: str,\n   345ÔåÆ    drug_name: str,\n   346ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n   347ÔåÆ    db_manager: Optional[DatabaseManager] = None,\n   348ÔåÆ    before_date: Optional[date] = None,\n   349ÔåÆ) -> DirectorateAssignment:\n   350ÔåÆ    \"\"\"\n   351ÔåÆ    Get directorate assignment for a patient-drug combination using diagnosis-based lookup.\n   352ÔåÆ\n   353ÔåÆ    This function attempts to assign a directorate based on the patient's GP records\n   354ÔåÆ    (direct SNOMED code matching). If no match is found, it returns a FALLBACK result\n   355ÔåÆ    indicating that the caller should use alternative assignment methods (e.g.,\n   356ÔåÆ    department_identification() from tools/data.py).\n   357ÔåÆ\n   358ÔåÆ    Workflow:\n   359ÔåÆ    1. Get all SNOMED codes for the drug from ref_drug_snomed_mapping\n   360ÔåÆ    2. Query patient's GP records for matching SNOMED codes\n   361ÔåÆ    3. If match found ÔåÆ return diagnosis-based directorate and search_term\n   362ÔåÆ    4. If no match ÔåÆ return FALLBACK result (caller handles fallback logic)\n   363ÔåÆ\n   364ÔåÆ    Args:\n   365ÔåÆ        upid: Patient's unique patient ID (Provider Code[:3] + PersonKey)\n   366ÔåÆ        drug_name: Drug name to look up\n   367ÔåÆ        connector: Optional SnowflakeConnector (defaults to singleton)\n   368ÔåÆ        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n   369ÔåÆ        before_date: Optional date - only check diagnoses before this date\n   370ÔåÆ\n   371ÔåÆ    Returns:\n   372ÔåÆ        DirectorateAssignment with directorate, search_term, and source\n   373ÔåÆ    \"\"\"\n   374ÔåÆ    result = DirectorateAssignment(\n   375ÔåÆ        upid=upid,\n   376ÔåÆ        drug_name=drug_name,\n   377ÔåÆ        directorate=None,\n   378ÔåÆ        source=\"FALLBACK\",\n   379ÔåÆ    )\n   380ÔåÆ\n   381ÔåÆ    # Step 1: Get SNOMED codes for the drug\n   382ÔåÆ    drug_snomed_mappings = get_drug_snomed_codes(drug_name, db_manager)\n   383ÔåÆ\n   384ÔåÆ    if not drug_snomed_mappings:\n   385ÔåÆ        logger.debug(f\"No SNOMED mappings found for drug '{drug_name}' - using fallback\")\n   386ÔåÆ        return result\n   387ÔåÆ\n   388ÔåÆ    # Step 2: Check Snowflake availability\n   389ÔåÆ    if not SNOWFLAKE_AVAILABLE:\n   390ÔåÆ        logger.debug(\"Snowflake not available - using fallback\")\n   391ÔåÆ        return result\n   392ÔåÆ\n   393ÔåÆ    if not is_snowflake_configured():\n   394ÔåÆ        logger.debug(\"Snowflake not configured - using fallback\")\n   395ÔåÆ        return result\n   396ÔåÆ\n   397ÔåÆ    # Step 3: Get patient pseudonym from UPID\n   398ÔåÆ    # UPID format is Provider Code (3 chars) + PersonKey\n   399ÔåÆ    # We need to query Snowflake to get the PatientPseudonym for this PersonKey\n   400ÔåÆ    # However, patient_has_indication_direct expects PatientPseudonym, not UPID\n   401ÔåÆ    # For now, we'll use UPID as the identifier - the actual integration\n   402ÔåÆ    # will need to happen at the DataFrame level where we have PersonKey\n   403ÔåÆ    #\n   404ÔåÆ    # NOTE: This function will be called from the pipeline where we have\n   405ÔåÆ    # access to PatientPseudonym. The UPID is passed for logging/tracking.\n   406ÔåÆ\n   407ÔåÆ    # Actually, looking at the pipeline, we need PatientPseudonym, not UPID.\n   408ÔåÆ    # The caller should pass the PatientPseudonym or we need to look it up.\n   409ÔåÆ    # For now, let's assume the caller will use this in a batch context\n   410ÔåÆ    # where they can map UPID -> PatientPseudonym.\n   411ÔåÆ\n   412ÔåÆ    # Let me reconsider: the function signature takes UPID but we need\n   413ÔåÆ    # PatientPseudonym for Snowflake. In the pipeline context (fetch_and_transform_data),\n   414ÔåÆ    # we'll have the PersonKey column which IS the PatientPseudonym.\n   415ÔåÆ    # So UPID = ProviderCode[:3] + PersonKey, and PersonKey = PatientPseudonym.\n   416ÔåÆ    #\n   417ÔåÆ    # We can extract PatientPseudonym from UPID by removing the first 3 chars.\n   418ÔåÆ    patient_pseudonym = upid[3:] if len(upid) > 3 else upid\n   419ÔåÆ\n   420ÔåÆ    # Step 4: Check patient's GP records for matching SNOMED codes\n   421ÔåÆ    match_result = patient_has_indication_direct(\n   422ÔåÆ        patient_pseudonym=patient_pseudonym,\n   423ÔåÆ        drug_snomed_mappings=drug_snomed_mappings,\n   424ÔåÆ        connector=connector,\n   425ÔåÆ        before_date=before_date,\n   426ÔåÆ    )\n   427ÔåÆ\n   428ÔåÆ    if match_result.matched and match_result.primary_directorate:\n   429ÔåÆ        return DirectorateAssignment(\n   430ÔåÆ            upid=upid,\n   431ÔåÆ            drug_name=drug_name,\n   432ÔåÆ            directorate=match_result.primary_directorate,\n   433ÔåÆ            search_term=match_result.search_term,\n   434ÔåÆ            source=\"DIAGNOSIS\",\n   435ÔåÆ            snomed_code=match_result.snomed_code,\n   436ÔåÆ            event_date=match_result.event_date,\n   437ÔåÆ        )\n   438ÔåÆ\n   439ÔåÆ    # No match found - return fallback result\n   440ÔåÆ    return result\n   441ÔåÆ\n   442ÔåÆ\n   443ÔåÆdef get_cluster_snomed_codes(\n   444ÔåÆ    cluster_id: str,\n   445ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n   446ÔåÆ    use_cache: bool = True,\n   447ÔåÆ) -> ClusterSnomedCodes:\n   448ÔåÆ    \"\"\"\n   449ÔåÆ    Get all SNOMED codes for a cluster from Snowflake.\n   450ÔåÆ\n   451ÔåÆ    Queries the ClinicalCodingClusterSnomedCodes table to get all SNOMED codes\n   452ÔåÆ    that belong to the specified cluster.\n   453ÔåÆ\n   454ÔåÆ    Args:\n   455ÔåÆ        cluster_id: Cluster ID to look up (e.g., 'RARTH_COD', 'PSORIASIS_COD')\n   456ÔåÆ        connector: Optional SnowflakeConnector (defaults to singleton)\n   457ÔåÆ        use_cache: Whether to use cached results (default True)\n   458ÔåÆ\n   459ÔåÆ    Returns:\n   460ÔåÆ        ClusterSnomedCodes with list of SNOMED codes and descriptions\n   461ÔåÆ    \"\"\"\n   462ÔåÆ    if not SNOWFLAKE_AVAILABLE:\n   463ÔåÆ        logger.warning(\"Snowflake connector not available\")\n   464ÔåÆ        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n   465ÔåÆ\n   466ÔåÆ    if not is_snowflake_configured():\n   467ÔåÆ        logger.warning(\"Snowflake not configured - cannot get cluster codes\")\n   468ÔåÆ        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n   469ÔåÆ\n   470ÔåÆ    # Check cache first\n   471ÔåÆ    cache_key = f\"cluster_snomed_{cluster_id}\"\n   472ÔåÆ    if use_cache and is_cache_enabled():\n   473ÔåÆ        cache = get_cache()\n   474ÔåÆ        cached = cache.get(cache_key)\n   475ÔåÆ        if cached is not None and len(cached) > 0:\n   476ÔåÆ            logger.debug(f\"Using cached SNOMED codes for cluster '{cluster_id}'\")\n   477ÔåÆ            cached_dict = cached[0]  # First element is our data dict\n   478ÔåÆ            return ClusterSnomedCodes(\n   479ÔåÆ                cluster_id=cluster_id,\n   480ÔåÆ                cluster_description=str(cached_dict.get(\"description\", \"\")),\n   481ÔåÆ                snomed_codes=list(cached_dict.get(\"codes\", [])),\n   482ÔåÆ                snomed_descriptions=dict(cached_dict.get(\"descriptions\", {})),\n   483ÔåÆ            )\n   484ÔåÆ\n   485ÔåÆ    if connector is None:\n   486ÔåÆ        connector = get_connector()\n   487ÔåÆ\n   488ÔåÆ    query = '''\n   489ÔåÆ        SELECT DISTINCT\n   490ÔåÆ            \"Cluster_ID\",\n   491ÔåÆ            \"Cluster_Description\",\n   492ÔåÆ            \"SNOMEDCode\",\n   493ÔåÆ            \"SNOMEDDescription\"\n   494ÔåÆ        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n   495ÔåÆ        WHERE \"Cluster_ID\" = %s\n   496ÔåÆ        ORDER BY \"SNOMEDCode\"\n   497ÔåÆ    '''\n   498ÔåÆ\n   499ÔåÆ    try:\n   500ÔåÆ        results = connector.execute_dict(query, (cluster_id,))\n   501ÔåÆ\n   502ÔåÆ        if not results:\n   503ÔåÆ            logger.warning(f\"No SNOMED codes found for cluster '{cluster_id}'\")\n   504ÔåÆ            return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n   505ÔåÆ\n   506ÔåÆ        codes = []\n   507ÔåÆ        descriptions = {}\n   508ÔåÆ        description = results[0].get(\"Cluster_Description\", \"\") if results else \"\"\n   509ÔåÆ\n   510ÔåÆ        for row in results:\n   511ÔåÆ            code = row.get(\"SNOMEDCode\")\n   512ÔåÆ            if code:\n   513ÔåÆ                codes.append(code)\n   514ÔåÆ                descriptions[code] = row.get(\"SNOMEDDescription\", \"\")\n   515ÔåÆ\n   516ÔåÆ        logger.info(f\"Found {len(codes)} SNOMED codes for cluster '{cluster_id}'\")\n   517ÔåÆ\n   518ÔåÆ        # Cache the results (using query-based cache with fake params)\n   519ÔåÆ        if use_cache and is_cache_enabled():\n   520ÔåÆ            cache = get_cache()\n   521ÔåÆ            cache_data = [{\n   522ÔåÆ                \"description\": description,\n   523ÔåÆ                \"codes\": codes,\n   524ÔåÆ                \"descriptions\": descriptions,\n   525ÔåÆ            }]\n   526ÔåÆ            cache.set(cache_key, None, cache_data)  # type: ignore[arg-type]\n   527ÔåÆ\n   528ÔåÆ        return ClusterSnomedCodes(\n   529ÔåÆ            cluster_id=cluster_id,\n   530ÔåÆ            cluster_description=description,\n   531ÔåÆ            snomed_codes=codes,\n   532ÔåÆ            snomed_descriptions=descriptions,\n   533ÔåÆ        )\n   534ÔåÆ\n   535ÔåÆ    except Exception as e:\n   536ÔåÆ        logger.error(f\"Error getting SNOMED codes for cluster '{cluster_id}': {e}\")\n   537ÔåÆ        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n   538ÔåÆ\n   539ÔåÆ\n   540ÔåÆdef patient_has_indication(\n   541ÔåÆ    patient_pseudonym: str,\n   542ÔåÆ    cluster_ids: list[str],\n   543ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n   544ÔåÆ    before_date: Optional[date] = None,\n   545ÔåÆ) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:\n   546ÔåÆ    \"\"\"\n   547ÔåÆ    Check if a patient has any SNOMED codes from the specified clusters in GP records.\n   548ÔåÆ\n   549ÔåÆ    Args:\n   550ÔåÆ        patient_pseudonym: Patient's pseudonymised NHS number\n   551ÔåÆ        cluster_ids: List of cluster IDs to check against\n   552ÔåÆ        connector: Optional SnowflakeConnector\n   553ÔåÆ        before_date: Optional date - only check diagnoses before this date\n   554ÔåÆ\n   555ÔåÆ    Returns:\n   556ÔåÆ        Tuple of (has_indication, matched_cluster_id, matched_snomed_code, matched_description)\n   557ÔåÆ    \"\"\"\n   558ÔåÆ    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n   559ÔåÆ        return False, None, None, None\n   560ÔåÆ\n   561ÔåÆ    if not cluster_ids:\n   562ÔåÆ        return False, None, None, None\n   563ÔåÆ\n   564ÔåÆ    if connector is None:\n   565ÔåÆ        connector = get_connector()\n   566ÔåÆ\n   567ÔåÆ    # Build placeholders for cluster IDs\n   568ÔåÆ    placeholders = \", \".join([\"%s\"] * len(cluster_ids))\n   569ÔåÆ\n   570ÔåÆ    # Query to check if patient has any matching SNOMED code\n   571ÔåÆ    query = f'''\n   572ÔåÆ        SELECT\n   573ÔåÆ            pc.\"SNOMEDCode\",\n   574ÔåÆ            cc.\"Cluster_ID\",\n   575ÔåÆ            cc.\"SNOMEDDescription\"\n   576ÔåÆ        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\n   577ÔåÆ        INNER JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" cc\n   578ÔåÆ            ON pc.\"SNOMEDCode\" = cc.\"SNOMEDCode\"\n   579ÔåÆ        WHERE pc.\"PatientPseudonym\" = %s\n   580ÔåÆ            AND cc.\"Cluster_ID\" IN ({placeholders})\n   581ÔåÆ    '''\n   582ÔåÆ\n   583ÔåÆ    params = [patient_pseudonym] + cluster_ids\n   584ÔåÆ\n   585ÔåÆ    if before_date:\n   586ÔåÆ        query += ' AND pc.\"EventDateTime\" < %s'\n   587ÔåÆ        params.append(before_date.isoformat())\n   588ÔåÆ\n   589ÔåÆ    query += ' LIMIT 1'\n   590ÔåÆ\n   591ÔåÆ    try:\n   592ÔåÆ        results = connector.execute_dict(query, tuple(params))\n   593ÔåÆ\n   594ÔåÆ        if results:\n   595ÔåÆ            row = results[0]\n   596ÔåÆ            return (\n   597ÔåÆ                True,\n   598ÔåÆ                row.get(\"Cluster_ID\"),\n   599ÔåÆ                row.get(\"SNOMEDCode\"),\n   600ÔåÆ                row.get(\"SNOMEDDescription\"),\n   601ÔåÆ            )\n   602ÔåÆ\n   603ÔåÆ        return False, None, None, None\n   604ÔåÆ\n   605ÔåÆ    except Exception as e:\n   606ÔåÆ        logger.error(f\"Error checking indication for patient '{patient_pseudonym}': {e}\")\n   607ÔåÆ        return False, None, None, None\n   608ÔåÆ\n   609ÔåÆ\n   610ÔåÆdef validate_indication(\n   611ÔåÆ    patient_pseudonym: str,\n   612ÔåÆ    drug_name: str,\n   613ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n   614ÔåÆ    db_manager: Optional[DatabaseManager] = None,\n   615ÔåÆ    before_date: Optional[date] = None,\n   616ÔåÆ) -> IndicationValidationResult:\n   617ÔåÆ    \"\"\"\n   618ÔåÆ    Validate that a patient has an appropriate indication for a drug.\n   619ÔåÆ\n   620ÔåÆ    Full validation workflow:\n   621ÔåÆ    1. Get drug's valid indication clusters from local mapping\n   622ÔåÆ    2. Check if patient has any matching SNOMED codes in GP records\n   623ÔåÆ    3. Return detailed validation result\n   624ÔåÆ\n   625ÔåÆ    Args:\n   626ÔåÆ        patient_pseudonym: Patient's pseudonymised NHS number\n   627ÔåÆ        drug_name: Drug name to validate indication for\n   628ÔåÆ        connector: Optional SnowflakeConnector\n   629ÔåÆ        db_manager: Optional DatabaseManager\n   630ÔåÆ        before_date: Optional date - only check diagnoses before this date\n   631ÔåÆ\n   632ÔåÆ    Returns:\n   633ÔåÆ        IndicationValidationResult with validation details\n   634ÔåÆ    \"\"\"\n   635ÔåÆ    result = IndicationValidationResult(\n   636ÔåÆ        patient_pseudonym=patient_pseudonym,\n   637ÔåÆ        drug_name=drug_name,\n   638ÔåÆ        has_valid_indication=False,\n   639ÔåÆ    )\n   640ÔåÆ\n   641ÔåÆ    # Step 1: Get drug's cluster mappings\n   642ÔåÆ    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n   643ÔåÆ\n   644ÔåÆ    if not cluster_ids:\n   645ÔåÆ        result.error_message = f\"No cluster mappings found for drug '{drug_name}'\"\n   646ÔåÆ        result.source = \"NONE\"\n   647ÔåÆ        return result\n   648ÔåÆ\n   649ÔåÆ    result.checked_clusters = cluster_ids\n   650ÔåÆ\n   651ÔåÆ    # Step 2: Check Snowflake availability\n   652ÔåÆ    if not SNOWFLAKE_AVAILABLE:\n   653ÔåÆ        result.error_message = \"Snowflake connector not installed\"\n   654ÔåÆ        result.source = \"NONE\"\n   655ÔåÆ        return result\n   656ÔåÆ\n   657ÔåÆ    if not is_snowflake_configured():\n   658ÔåÆ        result.error_message = \"Snowflake not configured\"\n   659ÔåÆ        result.source = \"NONE\"\n   660ÔåÆ        return result\n   661ÔåÆ\n   662ÔåÆ    # Step 3: Check patient GP records\n   663ÔåÆ    has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n   664ÔåÆ        patient_pseudonym=patient_pseudonym,\n   665ÔåÆ        cluster_ids=cluster_ids,\n   666ÔåÆ        connector=connector,\n   667ÔåÆ        before_date=before_date,\n   668ÔåÆ    )\n   669ÔåÆ\n   670ÔåÆ    result.has_valid_indication = has_indication\n   671ÔåÆ    result.matched_cluster_id = matched_cluster\n   672ÔåÆ    result.matched_snomed_code = matched_code\n   673ÔåÆ    result.matched_snomed_description = matched_desc\n   674ÔåÆ    result.source = \"GP_SNOMED\" if has_indication else \"NONE\"\n   675ÔåÆ\n   676ÔåÆ    return result\n   677ÔåÆ\n   678ÔåÆ\n   679ÔåÆdef get_indication_match_rate(\n   680ÔåÆ    drug_name: str,\n   681ÔåÆ    patient_pseudonyms: list[str],\n   682ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n   683ÔåÆ    db_manager: Optional[DatabaseManager] = None,\n   684ÔåÆ    sample_unmatched_count: int = 10,\n   685ÔåÆ) -> DrugIndicationMatchRate:\n   686ÔåÆ    \"\"\"\n   687ÔåÆ    Calculate indication match rate for a drug across a list of patients.\n   688ÔåÆ\n   689ÔåÆ    Args:\n   690ÔåÆ        drug_name: Drug name to check\n   691ÔåÆ        patient_pseudonyms: List of patient pseudonymised NHS numbers\n   692ÔåÆ        connector: Optional SnowflakeConnector\n   693ÔåÆ        db_manager: Optional DatabaseManager\n   694ÔåÆ        sample_unmatched_count: Number of unmatched patient IDs to include in sample\n   695ÔåÆ\n   696ÔåÆ    Returns:\n   697ÔåÆ        DrugIndicationMatchRate with match statistics\n   698ÔåÆ    \"\"\"\n   699ÔåÆ    if connector is None and SNOWFLAKE_AVAILABLE and is_snowflake_configured():\n   700ÔåÆ        connector = get_connector()\n   701ÔåÆ\n   702ÔåÆ    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n   703ÔåÆ\n   704ÔåÆ    total = len(patient_pseudonyms)\n   705ÔåÆ    matched = 0\n   706ÔåÆ    unmatched = 0\n   707ÔåÆ    sample_unmatched: list[str] = []\n   708ÔåÆ\n   709ÔåÆ    if not cluster_ids:\n   710ÔåÆ        logger.warning(f\"No cluster mappings for drug '{drug_name}' - all patients will be unmatched\")\n   711ÔåÆ        return DrugIndicationMatchRate(\n   712ÔåÆ            drug_name=drug_name,\n   713ÔåÆ            total_patients=total,\n   714ÔåÆ            patients_with_indication=0,\n   715ÔåÆ            patients_without_indication=total,\n   716ÔåÆ            match_rate=0.0,\n   717ÔåÆ            clusters_checked=[],\n   718ÔåÆ            sample_unmatched=patient_pseudonyms[:sample_unmatched_count],\n   719ÔåÆ        )\n   720ÔåÆ\n   721ÔåÆ    for i, pseudonym in enumerate(patient_pseudonyms):\n   722ÔåÆ        if i > 0 and i % 100 == 0:\n   723ÔåÆ            logger.info(f\"Validating indications: {i}/{total} ({100*i/total:.1f}%)\")\n   724ÔåÆ\n   725ÔåÆ        has_indication, _, _, _ = patient_has_indication(\n   726ÔåÆ            patient_pseudonym=pseudonym,\n   727ÔåÆ            cluster_ids=cluster_ids,\n   728ÔåÆ            connector=connector,\n   729ÔåÆ        )\n   730ÔåÆ\n   731ÔåÆ        if has_indication:\n   732ÔåÆ            matched += 1\n   733ÔåÆ        else:\n   734ÔåÆ            unmatched += 1\n   735ÔåÆ            if len(sample_unmatched) < sample_unmatched_count:\n   736ÔåÆ                sample_unmatched.append(pseudonym)\n   737ÔåÆ\n   738ÔåÆ    match_rate = matched / total if total > 0 else 0.0\n   739ÔåÆ\n   740ÔåÆ    logger.info(f\"Indication match rate for '{drug_name}': {100*match_rate:.1f}% ({matched}/{total})\")\n   741ÔåÆ\n   742ÔåÆ    return DrugIndicationMatchRate(\n   743ÔåÆ        drug_name=drug_name,\n   744ÔåÆ        total_patients=total,\n   745ÔåÆ        patients_with_indication=matched,\n   746ÔåÆ        patients_without_indication=unmatched,\n   747ÔåÆ        match_rate=match_rate,\n   748ÔåÆ        clusters_checked=cluster_ids,\n   749ÔåÆ        sample_unmatched=sample_unmatched,\n   750ÔåÆ    )\n   751ÔåÆ\n   752ÔåÆ\n   753ÔåÆdef batch_validate_indications(\n   754ÔåÆ    patient_drug_pairs: list[tuple[str, str]],\n   755ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n   756ÔåÆ    db_manager: Optional[DatabaseManager] = None,\n   757ÔåÆ    progress_callback: Optional[Callable[[int, int], None]] = None,\n   758ÔåÆ) -> list[IndicationValidationResult]:\n   759ÔåÆ    \"\"\"\n   760ÔåÆ    Validate indications for multiple patient-drug pairs efficiently.\n   761ÔåÆ\n   762ÔåÆ    Args:\n   763ÔåÆ        patient_drug_pairs: List of (patient_pseudonym, drug_name) tuples\n   764ÔåÆ        connector: Optional SnowflakeConnector\n   765ÔåÆ        db_manager: Optional DatabaseManager\n   766ÔåÆ        progress_callback: Optional callback(current, total) for progress updates\n   767ÔåÆ\n   768ÔåÆ    Returns:\n   769ÔåÆ        List of IndicationValidationResult for each pair\n   770ÔåÆ    \"\"\"\n   771ÔåÆ    results = []\n   772ÔåÆ    total = len(patient_drug_pairs)\n   773ÔåÆ\n   774ÔåÆ    # Cache cluster lookups by drug\n   775ÔåÆ    drug_clusters_cache = {}\n   776ÔåÆ\n   777ÔåÆ    for i, (pseudonym, drug_name) in enumerate(patient_drug_pairs):\n   778ÔåÆ        if progress_callback:\n   779ÔåÆ            progress_callback(i + 1, total)\n   780ÔåÆ\n   781ÔåÆ        # Get clusters from cache or lookup\n   782ÔåÆ        drug_upper = drug_name.upper()\n   783ÔåÆ        if drug_upper not in drug_clusters_cache:\n   784ÔåÆ            drug_clusters_cache[drug_upper] = get_drug_cluster_ids(drug_name, db_manager)\n   785ÔåÆ\n   786ÔåÆ        cluster_ids = drug_clusters_cache[drug_upper]\n   787ÔåÆ\n   788ÔåÆ        if not cluster_ids:\n   789ÔåÆ            results.append(IndicationValidationResult(\n   790ÔåÆ                patient_pseudonym=pseudonym,\n   791ÔåÆ                drug_name=drug_name,\n   792ÔåÆ                has_valid_indication=False,\n   793ÔåÆ                source=\"NONE\",\n   794ÔåÆ                error_message=f\"No cluster mappings for drug '{drug_name}'\",\n   795ÔåÆ            ))\n   796ÔåÆ            continue\n   797ÔåÆ\n   798ÔåÆ        # Check patient indication\n   799ÔåÆ        has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n   800ÔåÆ            patient_pseudonym=pseudonym,\n   801ÔåÆ            cluster_ids=cluster_ids,\n   802ÔåÆ            connector=connector,\n   803ÔåÆ        )\n   804ÔåÆ\n   805ÔåÆ        results.append(IndicationValidationResult(\n   806ÔåÆ            patient_pseudonym=pseudonym,\n   807ÔåÆ            drug_name=drug_name,\n   808ÔåÆ            has_valid_indication=has_indication,\n   809ÔåÆ            matched_cluster_id=matched_cluster,\n   810ÔåÆ            matched_snomed_code=matched_code,\n   811ÔåÆ            matched_snomed_description=matched_desc,\n   812ÔåÆ            checked_clusters=cluster_ids,\n   813ÔåÆ            source=\"GP_SNOMED\" if has_indication else \"NONE\",\n   814ÔåÆ        ))\n   815ÔåÆ\n   816ÔåÆ    matched_count = sum(1 for r in results if r.has_valid_indication)\n   817ÔåÆ    logger.info(f\"Batch validation complete: {matched_count}/{total} ({100*matched_count/total:.1f}%) with valid indications\")\n   818ÔåÆ\n   819ÔåÆ    return results\n   820ÔåÆ\n   821ÔåÆ\n   822ÔåÆdef get_available_clusters(\n   823ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n   824ÔåÆ) -> list[dict]:\n   825ÔåÆ    \"\"\"\n   826ÔåÆ    Get list of all available SNOMED clusters from Snowflake.\n   827ÔåÆ\n   828ÔåÆ    Returns:\n   829ÔåÆ        List of dicts with cluster_id, cluster_description, code_count\n   830ÔåÆ    \"\"\"\n   831ÔåÆ    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n   832ÔåÆ        logger.warning(\"Snowflake not available - cannot list clusters\")\n   833ÔåÆ        return []\n   834ÔåÆ\n   835ÔåÆ    if connector is None:\n   836ÔåÆ        connector = get_connector()\n   837ÔåÆ\n   838ÔåÆ    query = '''\n   839ÔåÆ        SELECT\n   840ÔåÆ            \"Cluster_ID\",\n   841ÔåÆ            \"Cluster_Description\",\n   842ÔåÆ            COUNT(DISTINCT \"SNOMEDCode\") as code_count\n   843ÔåÆ        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n   844ÔåÆ        GROUP BY \"Cluster_ID\", \"Cluster_Description\"\n   845ÔåÆ        ORDER BY \"Cluster_ID\"\n   846ÔåÆ    '''\n   847ÔåÆ\n   848ÔåÆ    try:\n   849ÔåÆ        results = connector.execute_dict(query)\n   850ÔåÆ\n   851ÔåÆ        clusters = []\n   852ÔåÆ        for row in results:\n   853ÔåÆ            clusters.append({\n   854ÔåÆ                \"cluster_id\": row.get(\"Cluster_ID\"),\n   855ÔåÆ                \"cluster_description\": row.get(\"Cluster_Description\"),\n   856ÔåÆ                \"code_count\": row.get(\"code_count\", 0),\n   857ÔåÆ            })\n   858ÔåÆ\n   859ÔåÆ        logger.info(f\"Found {len(clusters)} available SNOMED clusters\")\n   860ÔåÆ        return clusters\n   861ÔåÆ\n   862ÔåÆ    except Exception as e:\n   863ÔåÆ        logger.error(f\"Error getting available clusters: {e}\")\n   864ÔåÆ        return []\n   865ÔåÆ\n   866ÔåÆ\n   867ÔåÆdef batch_lookup_indication_groups(\n   868ÔåÆ    df: \"pd.DataFrame\",\n   869ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n   870ÔåÆ    db_manager: Optional[DatabaseManager] = None,\n   871ÔåÆ    batch_size: int = 500,\n   872ÔåÆ) -> \"pd.DataFrame\":\n   873ÔåÆ    \"\"\"\n   874ÔåÆ    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.\n   875ÔåÆ\n   876ÔåÆ    This is the efficient batch version of get_directorate_from_diagnosis().\n   877ÔåÆ    Instead of querying Snowflake per patient, it batches the lookups for performance.\n   878ÔåÆ\n   879ÔåÆ    Strategy:\n   880ÔåÆ    1. Get all unique (PersonKey, Drug Name) pairs from DataFrame\n   881ÔåÆ    2. For each unique drug, get all SNOMED codes from local SQLite\n   882ÔåÆ    3. Build batched Snowflake queries to check GP records\n   883ÔåÆ    4. Return indication_df mapping UPID ÔåÆ Indication_Group\n   884ÔåÆ\n   885ÔåÆ    For unmatched patients, Indication_Group will be their Directory (with suffix).\n   886ÔåÆ\n   887ÔåÆ    Args:\n   888ÔåÆ        df: DataFrame with columns: UPID, Drug Name, Directory, PersonKey\n   889ÔåÆ        connector: Optional SnowflakeConnector (defaults to singleton)\n   890ÔåÆ        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n   891ÔåÆ        batch_size: Number of patients per Snowflake query batch\n   892ÔåÆ\n   893ÔåÆ    Returns:\n   894ÔåÆ        DataFrame with columns: UPID, Indication_Group, Source\n   895ÔåÆ        - Indication_Group: Search_Term (if matched) or \"Directory (no GP dx)\" (if not)\n   896ÔåÆ        - Source: \"DIAGNOSIS\" or \"FALLBACK\"\n   897ÔåÆ    \"\"\"\n   898ÔåÆ    import pandas as pd\n   899ÔåÆ\n   900ÔåÆ    if db_manager is None:\n   901ÔåÆ        db_manager = default_db_manager\n   902ÔåÆ\n   903ÔåÆ    logger.info(f\"Starting batch indication lookup for {len(df)} records...\")\n   904ÔåÆ\n   905ÔåÆ    # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations\n   906ÔåÆ    # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records\n   907ÔåÆ    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records\n   908ÔåÆ    if 'PseudoNHSNoLinked' not in df.columns:\n   909ÔåÆ        logger.error(\"DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records\")\n   910ÔåÆ        # Return fallback for all patients\n   911ÔåÆ        result_df = df[['UPID', 'Directory']].drop_duplicates().copy()\n   912ÔåÆ        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n   913ÔåÆ        result_df['Source'] = \"FALLBACK\"\n   914ÔåÆ        return result_df[['UPID', 'Indication_Group', 'Source']]\n   915ÔåÆ\n   916ÔåÆ    # Get unique patient-drug combinations (we need one lookup per patient-drug pair)\n   917ÔåÆ    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()\n   918ÔåÆ    logger.info(f\"Found {len(unique_pairs)} unique patient-drug combinations\")\n   919ÔåÆ\n   920ÔåÆ    # Step 2: Get all unique drugs and their SNOMED codes\n   921ÔåÆ    unique_drugs = unique_pairs['Drug Name'].unique()\n   922ÔåÆ    logger.info(f\"Building SNOMED lookup for {len(unique_drugs)} unique drugs...\")\n   923ÔåÆ\n   924ÔåÆ    # Build drug -> list of DrugSnomedMapping dict\n   925ÔåÆ    drug_snomed_map: dict[str, list[DrugSnomedMapping]] = {}\n   926ÔåÆ    all_snomed_codes: set[str] = set()\n   927ÔåÆ    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}  # snomed -> [(drug, search_term, primary_dir), ...]\n   928ÔåÆ\n   929ÔåÆ    for drug_name in unique_drugs:\n   930ÔåÆ        mappings = get_drug_snomed_codes(drug_name, db_manager)\n   931ÔåÆ        drug_snomed_map[drug_name] = mappings\n   932ÔåÆ\n   933ÔåÆ        for m in mappings:\n   934ÔåÆ            all_snomed_codes.add(m.snomed_code)\n   935ÔåÆ            if m.snomed_code not in snomed_to_drug_searchterm:\n   936ÔåÆ                snomed_to_drug_searchterm[m.snomed_code] = []\n   937ÔåÆ            snomed_to_drug_searchterm[m.snomed_code].append(\n   938ÔåÆ                (drug_name, m.search_term, m.primary_directorate)\n   939ÔåÆ            )\n   940ÔåÆ\n   941ÔåÆ    logger.info(f\"Total SNOMED codes to check: {len(all_snomed_codes)}\")\n   942ÔåÆ\n   943ÔåÆ    # Step 3: Check Snowflake availability\n   944ÔåÆ    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n   945ÔåÆ        logger.warning(\"Snowflake not available - returning fallback for all patients\")\n   946ÔåÆ        result_df = unique_pairs[['UPID', 'Directory']].copy()\n   947ÔåÆ        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n   948ÔåÆ        result_df['Source'] = \"FALLBACK\"\n   949ÔåÆ        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n   950ÔåÆ\n   951ÔåÆ    if connector is None:\n   952ÔåÆ        connector = get_connector()\n   953ÔåÆ\n   954ÔåÆ    # Step 4: Query GP records for all patients in batches\n   955ÔåÆ    # The query finds the most recent matching SNOMED code for each patient\n   956ÔåÆ\n   957ÔåÆ    # Get unique PseudoNHSNoLinked values (each = one patient in GP records)\n   958ÔåÆ    unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])\n   959ÔåÆ    patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()\n   960ÔåÆ\n   961ÔåÆ    logger.info(f\"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...\")\n   962ÔåÆ\n   963ÔåÆ    # Results dict: PersonKey -> (snomed_code, event_date)\n   964ÔåÆ    gp_matches: dict[str, tuple[str, Any]] = {}\n   965ÔåÆ\n   966ÔåÆ    # Convert SNOMED codes to list for query\n   967ÔåÆ    snomed_list = list(all_snomed_codes)\n   968ÔåÆ\n   969ÔåÆ    if not snomed_list:\n   970ÔåÆ        logger.warning(\"No SNOMED codes to check - returning fallback for all patients\")\n   971ÔåÆ        result_df = unique_pairs[['UPID', 'Directory']].copy()\n   972ÔåÆ        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n   973ÔåÆ        result_df['Source'] = \"FALLBACK\"\n   974ÔåÆ        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n   975ÔåÆ\n   976ÔåÆ    # Build SNOMED IN clause (reused across batches)\n   977ÔåÆ    snomed_placeholders = \", \".join([\"%s\"] * len(snomed_list))\n   978ÔåÆ\n   979ÔåÆ    # Process patients in batches\n   980ÔåÆ    for batch_start in range(0, len(patient_pseudonyms), batch_size):\n   981ÔåÆ        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))\n   982ÔåÆ        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n   983ÔåÆ\n   984ÔåÆ        logger.info(f\"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}\")\n   985ÔåÆ\n   986ÔåÆ        # Build patient IN clause\n   987ÔåÆ        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n   988ÔåÆ\n   989ÔåÆ        # Query to find all matching SNOMED codes for these patients\n   990ÔåÆ        # We'll get all matches and pick the most recent per patient in Python\n   991ÔåÆ        query = f'''\n   992ÔåÆ            SELECT\n   993ÔåÆ                \"PatientPseudonym\",\n   994ÔåÆ                \"SNOMEDCode\",\n   995ÔåÆ                \"EventDateTime\"\n   996ÔåÆ            FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n   997ÔåÆ            WHERE \"PatientPseudonym\" IN ({patient_placeholders})\n   998ÔåÆ              AND \"SNOMEDCode\" IN ({snomed_placeholders})\n   999ÔåÆ            ORDER BY \"PatientPseudonym\", \"EventDateTime\" DESC\n  1000ÔåÆ        '''\n  1001ÔåÆ\n  1002ÔåÆ        params = tuple(batch_pseudonyms) + tuple(snomed_list)\n  1003ÔåÆ\n  1004ÔåÆ        try:\n  1005ÔåÆ            results = connector.execute_dict(query, params)\n  1006ÔåÆ\n  1007ÔåÆ            # Process results - pick most recent per patient\n  1008ÔåÆ            for row in results:\n  1009ÔåÆ                person_key = row.get(\"PatientPseudonym\")\n  1010ÔåÆ                snomed_code = row.get(\"SNOMEDCode\")\n  1011ÔåÆ                event_date = row.get(\"EventDateTime\")\n  1012ÔåÆ\n  1013ÔåÆ                if person_key and snomed_code:\n  1014ÔåÆ                    # Keep only if we haven't seen this patient yet (first = most recent due to ORDER BY)\n  1015ÔåÆ                    if person_key not in gp_matches:\n  1016ÔåÆ                        gp_matches[person_key] = (snomed_code, event_date)\n  1017ÔåÆ\n  1018ÔåÆ        except Exception as e:\n  1019ÔåÆ            logger.error(f\"Error querying GP records for batch: {e}\")\n  1020ÔåÆ            # Continue with other batches\n  1021ÔåÆ\n  1022ÔåÆ    logger.info(f\"Found GP matches for {len(gp_matches)} patients\")\n  1023ÔåÆ\n  1024ÔåÆ    # Step 5: Build result DataFrame\n  1025ÔåÆ    # For each unique_pair, determine Indication_Group based on match status\n  1026ÔåÆ    results_list = []\n  1027ÔåÆ\n  1028ÔåÆ    # We need to dedupe by UPID - a patient might be on multiple drugs\n  1029ÔåÆ    # Strategy: For each UPID, use the most recent match (if any)\n  1030ÔåÆ    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)\n  1031ÔåÆ\n  1032ÔåÆ    for _, row in unique_pairs.iterrows():\n  1033ÔåÆ        upid = row['UPID']\n  1034ÔåÆ        drug_name = row['Drug Name']\n  1035ÔåÆ        patient_pseudonym = row['PseudoNHSNoLinked']\n  1036ÔåÆ        directory = row['Directory']\n  1037ÔåÆ\n  1038ÔåÆ        # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)\n  1039ÔåÆ        if patient_pseudonym in gp_matches:\n  1040ÔåÆ            matched_snomed, event_date = gp_matches[patient_pseudonym]\n  1041ÔåÆ\n  1042ÔåÆ            # Find the search_term for this SNOMED code and drug\n  1043ÔåÆ            # (A SNOMED code might map to multiple drugs with different search_terms)\n  1044ÔåÆ            if matched_snomed in snomed_to_drug_searchterm:\n  1045ÔåÆ                # Look for match with current drug first\n  1046ÔåÆ                search_term = None\n  1047ÔåÆ                for drug, st, pd in snomed_to_drug_searchterm[matched_snomed]:\n  1048ÔåÆ                    if drug.upper() == drug_name.upper():\n  1049ÔåÆ                        search_term = st\n  1050ÔåÆ                        break\n  1051ÔåÆ                # If no drug-specific match, use any match\n  1052ÔåÆ                if search_term is None:\n  1053ÔåÆ                    search_term = snomed_to_drug_searchterm[matched_snomed][0][1]\n  1054ÔåÆ\n  1055ÔåÆ                # Only update if we don't have a match for this UPID yet\n  1056ÔåÆ                if upid not in upid_to_match:\n  1057ÔåÆ                    upid_to_match[upid] = (search_term, \"DIAGNOSIS\")\n  1058ÔåÆ            else:\n  1059ÔåÆ                # Shouldn't happen but fallback just in case\n  1060ÔåÆ                if upid not in upid_to_match:\n  1061ÔåÆ                    upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n  1062ÔåÆ        else:\n  1063ÔåÆ            # No GP match - use fallback\n  1064ÔåÆ            if upid not in upid_to_match:\n  1065ÔåÆ                upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n  1066ÔåÆ\n  1067ÔåÆ    # Build result DataFrame\n  1068ÔåÆ    for upid, (indication_group, source) in upid_to_match.items():\n  1069ÔåÆ        results_list.append({\n  1070ÔåÆ            'UPID': upid,\n  1071ÔåÆ            'Indication_Group': indication_group,\n  1072ÔåÆ            'Source': source,\n  1073ÔåÆ        })\n  1074ÔåÆ\n  1075ÔåÆ    result_df = pd.DataFrame(results_list)\n  1076ÔåÆ\n  1077ÔåÆ    # Log statistics\n  1078ÔåÆ    diagnosis_count = len([s for s in result_df['Source'] if s == \"DIAGNOSIS\"])\n  1079ÔåÆ    fallback_count = len([s for s in result_df['Source'] if s == \"FALLBACK\"])\n  1080ÔåÆ    total = len(result_df)\n  1081ÔåÆ\n  1082ÔåÆ    logger.info(f\"Indication lookup complete:\")\n  1083ÔåÆ    logger.info(f\"  Total unique patients: {total}\")\n  1084ÔåÆ    logger.info(f\"  DIAGNOSIS matches: {diagnosis_count} ({100*diagnosis_count/total:.1f}%)\")\n  1085ÔåÆ    logger.info(f\"  FALLBACK (no GP match): {fallback_count} ({100*fallback_count/total:.1f}%)\")\n  1086ÔåÆ\n  1087ÔåÆ    return result_df\n  1088ÔåÆ\n  1089ÔåÆ\n  1090ÔåÆ# === Drug-to-indication mapping from DimSearchTerm.csv ===\n  1091ÔåÆ\n  1092ÔåÆ\n  1093ÔåÆdef load_drug_indication_mapping(\n  1094ÔåÆ    csv_path: Optional[str] = None,\n  1095ÔåÆ) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n  1096ÔåÆ    \"\"\"\n  1097ÔåÆ    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n  1098ÔåÆ\n  1099ÔåÆ    Builds two lookup dicts:\n  1100ÔåÆ    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n  1101ÔåÆ    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n  1102ÔåÆ\n  1103ÔåÆ    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n  1104ÔåÆ    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n  1105ÔåÆ\n  1106ÔåÆ    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n  1107ÔåÆ    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n  1108ÔåÆ    Drug fragments from all rows for the same Search_Term are combined.\n  1109ÔåÆ\n  1110ÔåÆ    Args:\n  1111ÔåÆ        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n  1112ÔåÆ\n  1113ÔåÆ    Returns:\n  1114ÔåÆ        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n  1115ÔåÆ    \"\"\"\n  1116ÔåÆ    if csv_path is None:\n  1117ÔåÆ        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n  1118ÔåÆ\n  1119ÔåÆ    fragment_to_search_terms: dict[str, list[str]] = {}\n  1120ÔåÆ    search_term_to_fragments: dict[str, list[str]] = {}\n  1121ÔåÆ\n  1122ÔåÆ    try:\n  1123ÔåÆ        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n  1124ÔåÆ            reader = csv.DictReader(f)\n  1125ÔåÆ            for row in reader:\n  1126ÔåÆ                search_term = row.get(\"Search_Term\", \"\").strip()\n  1127ÔåÆ                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n  1128ÔåÆ\n  1129ÔåÆ                if not search_term or not drug_names_raw:\n  1130ÔåÆ                    continue\n  1131ÔåÆ\n  1132ÔåÆ                fragments = [frag.strip().upper() for frag in drug_names_raw.split(\"|\") if frag.strip()]\n  1133ÔåÆ\n  1134ÔåÆ                # Build search_term -> fragments (accumulate for duplicate Search_Terms)\n  1135ÔåÆ                if search_term not in search_term_to_fragments:\n  1136ÔåÆ                    search_term_to_fragments[search_term] = []\n  1137ÔåÆ                for frag in fragments:\n  1138ÔåÆ                    if frag not in search_term_to_fragments[search_term]:\n  1139ÔåÆ                        search_term_to_fragments[search_term].append(frag)\n  1140ÔåÆ\n  1141ÔåÆ                # Build fragment -> search_terms\n  1142ÔåÆ                for frag in fragments:\n  1143ÔåÆ                    if frag not in fragment_to_search_terms:\n  1144ÔåÆ                        fragment_to_search_terms[frag] = []\n  1145ÔåÆ                    if search_term not in fragment_to_search_terms[frag]:\n  1146ÔåÆ                        fragment_to_search_terms[frag].append(search_term)\n  1147ÔåÆ\n  1148ÔåÆ        logger.info(\n  1149ÔåÆ            f\"Loaded drug-indication mapping: {len(search_term_to_fragments)} Search_Terms, \"\n  1150ÔåÆ            f\"{len(fragment_to_search_terms)} drug fragments\"\n  1151ÔåÆ        )\n  1152ÔåÆ\n  1153ÔåÆ    except FileNotFoundError:\n  1154ÔåÆ        logger.error(f\"DimSearchTerm.csv not found at {csv_path}\")\n  1155ÔåÆ    except Exception as e:\n  1156ÔåÆ        logger.error(f\"Error loading DimSearchTerm.csv: {e}\")\n  1157ÔåÆ\n  1158ÔåÆ    return fragment_to_search_terms, search_term_to_fragments\n  1159ÔåÆ\n  1160ÔåÆ\n  1161ÔåÆdef get_search_terms_for_drug(\n  1162ÔåÆ    drug_name: str,\n  1163ÔåÆ    search_term_to_fragments: dict[str, list[str]],\n  1164ÔåÆ) -> list[str]:\n  1165ÔåÆ    \"\"\"\n  1166ÔåÆ    Get all Search_Terms that list a given drug using substring matching.\n  1167ÔåÆ\n  1168ÔåÆ    Checks if any drug fragment from DimSearchTerm is a SUBSTRING of the given\n  1169ÔåÆ    drug name (case-insensitive). This handles both exact matches (ADALIMUMAB)\n  1170ÔåÆ    and partial fragments (PEGYLATED, INHALED).\n  1171ÔåÆ\n  1172ÔåÆ    Args:\n  1173ÔåÆ        drug_name: HCD drug name (e.g., \"ADALIMUMAB 40MG\", \"PEGYLATED LIPOSOMAL DOXORUBICIN\")\n  1174ÔåÆ        search_term_to_fragments: Mapping of search_term -> list of drug fragments\n  1175ÔåÆ\n  1176ÔåÆ    Returns:\n  1177ÔåÆ        List of Search_Terms whose drug fragments match the drug name\n  1178ÔåÆ    \"\"\"\n  1179ÔåÆ    drug_name_upper = drug_name.upper()\n  1180ÔåÆ    matched_terms: list[str] = []\n  1181ÔåÆ\n  1182ÔåÆ    for search_term, fragments in search_term_to_fragments.items():\n  1183ÔåÆ        for frag in fragments:\n  1184ÔåÆ            if frag in drug_name_upper:\n  1185ÔåÆ                matched_terms.append(search_term)\n  1186ÔåÆ                break  # One matching fragment is enough for this Search_Term\n  1187ÔåÆ\n  1188ÔåÆ    return matched_terms\n  1189ÔåÆ\n  1190ÔåÆ\n  1191ÔåÆ# === NEW APPROACH: Query Snowflake directly using cluster CTE ===\n  1192ÔåÆ\n  1193ÔåÆ# The cluster query mapping (embedded from snomed_indication_mapping_query.sql)\n  1194ÔåÆ# This maps Search_Term -> Cluster_ID for ~148 clinical conditions\n  1195ÔåÆCLUSTER_MAPPING_SQL = \"\"\"\n  1196ÔåÆWITH SearchTermClusters AS (\n  1197ÔåÆ    SELECT Search_Term, Cluster_ID FROM (VALUES\n  1198ÔåÆ        ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),\n  1199ÔåÆ        ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),\n  1200ÔåÆ        ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),\n  1201ÔåÆ        ('allergic asthma', 'AST_COD'),\n  1202ÔåÆ        ('allergic rhinitis', 'MILDINTAST_COD'),\n  1203ÔåÆ        ('alzheimer''s disease', 'DEMALZ_COD'),\n  1204ÔåÆ        ('amyloidosis', 'AMYLOID_COD'),\n  1205ÔåÆ        ('anaemia', 'eFI2_AnaemiaTimeSensitive'),\n  1206ÔåÆ        ('anaplastic large cell lymphoma', 'C19HAEMCAN_COD'),\n  1207ÔåÆ        ('apixaban', 'DOACCON_COD'),\n  1208ÔåÆ        ('aplastic anaemia', 'eFI2_AnaemiaEver'),\n  1209ÔåÆ        ('arthritis', 'eFI2_InflammatoryArthritis'),\n  1210ÔåÆ        ('asthma', 'eFI2_Asthma'),\n  1211ÔåÆ        ('atopic dermatitis', 'ATOPDERM_COD'),\n  1212ÔåÆ        ('atrial fibrillation', 'eFI2_AtrialFibrillation'),\n  1213ÔåÆ        ('attention deficit hyperactivity disorder', 'ADHD_COD'),\n  1214ÔåÆ        ('bipolar disorder', 'MH_COD'),\n  1215ÔåÆ        ('bladder', 'eFI2_UrinaryIncontinence'),\n  1216ÔåÆ        ('breast cancer', 'BRCANSCR_COD'),\n  1217ÔåÆ        ('cardiomyopathy', 'eFI2_HarmfulDrinking'),\n  1218ÔåÆ        ('cardiovascular disease', 'CVDRISKASS_COD'),\n  1219ÔåÆ        ('cervical cancer', 'CSDEC_COD'),\n  1220ÔåÆ        ('cholangiocarcinoma', 'eFI2_Cancer'),\n  1221ÔåÆ        ('chronic kidney disease', 'CKD_COD'),\n  1222ÔåÆ        ('chronic liver disease', 'eFI2_LiverProblems'),\n  1223ÔåÆ        ('chronic lymphocytic leukaemia', 'EPPHAEMCAN_COD'),\n  1224ÔåÆ        ('chronic myeloid leukaemia', 'EPPHAEMCAN_COD'),\n  1225ÔåÆ        ('chronic obstructive pulmonary disease', 'eFI2_COPD'),\n  1226ÔåÆ        ('colon cancer', 'eFI2_Cancer'),\n  1227ÔåÆ        ('colorectal cancer', 'GICANREF_COD'),\n  1228ÔåÆ        ('constipation', 'CHRONCONSTIP_COD'),\n  1229ÔåÆ        ('covid-19', 'POSSPOSTCOVID_COD'),\n  1230ÔåÆ        ('crohn''s disease', 'eFI2_InflammatoryBowelDisease'),\n  1231ÔåÆ        ('cutaneous t-cell lymphoma', 'C19HAEMCAN_COD'),\n  1232ÔåÆ        ('cystic fibrosis', 'CUST_ICB_CYSTIC_FIBROSIS'),\n  1233ÔåÆ        ('deep vein thrombosis', 'VTE_COD'),\n  1234ÔåÆ        ('depression', 'eFI2_Depression'),\n  1235ÔåÆ        ('diabetes', 'eFI2_DiabetesEver'),\n  1236ÔåÆ        ('diabetic retinopathy', 'DRSELIGIBILITY_COD'),\n  1237ÔåÆ        ('diffuse large b-cell lymphoma', 'C19HAEMCAN_COD'),\n  1238ÔåÆ        ('dravet syndrome', 'EPIL_COD'),\n  1239ÔåÆ        ('drug misuse', 'ILLSUBINT_COD'),\n  1240ÔåÆ        ('dyspepsia', 'eFI2_AbdominalPain'),\n  1241ÔåÆ        ('epilepsy', 'eFI2_Seizures'),\n  1242ÔåÆ        ('fallopian tube', 'STERIL_COD'),\n  1243ÔåÆ        ('follicular lymphoma', 'C19HAEMCAN_COD'),\n  1244ÔåÆ        ('gastric cancer', 'eFI2_Cancer'),\n  1245ÔåÆ        ('giant cell arteritis', 'GCA_COD'),\n  1246ÔåÆ        ('glioma', 'NHAEMCANMORPH_COD'),\n  1247ÔåÆ        ('gout', 'eFI2_InflammatoryArthritis'),\n  1248ÔåÆ        ('graft versus host disease', 'GVHD_COD'),\n  1249ÔåÆ        ('granulomatosis with polyangiitis', 'WEGENERVASC_COD'),\n  1250ÔåÆ        ('growth hormone deficiency', 'HYPOPITUITARY_COD'),\n  1251ÔåÆ        ('hand eczema', 'ECZEMA_COD'),\n  1252ÔåÆ        ('heart failure', 'eFI2_HeartFailure'),\n  1253ÔåÆ        ('hepatitis b', 'HEPBCVAC_COD'),\n  1254ÔåÆ        ('hepatocellular carcinoma', 'eFI2_Cancer'),\n  1255ÔåÆ        ('hiv', 'PREFLANG_COD'),\n  1256ÔåÆ        ('hodgkin lymphoma', 'HAEMCANMORPH_COD'),\n  1257ÔåÆ        ('hormone receptor', 'eFI2_ThyroidProblems'),\n  1258ÔåÆ        ('hypercholesterolaemia', 'CLASSFH_COD'),\n  1259ÔåÆ        ('immune thrombocytopenia', 'ITP_COD'),\n  1260ÔåÆ        ('influenza', 'FLUINVITE_COD'),\n  1261ÔåÆ        ('insomnia', 'eFI2_SleepProblems'),\n  1262ÔåÆ        ('irritable bowel syndrome', 'IBS_COD'),\n  1263ÔåÆ        ('ischaemic stroke', 'OSTR_COD'),\n  1264ÔåÆ        ('juvenile idiopathic arthritis', 'RARTHAD_COD'),\n  1265ÔåÆ        ('kidney transplant', 'RENALTRANSP_COD'),\n  1266ÔåÆ        ('leukaemia', 'eFI2_Cancer'),\n  1267ÔåÆ        ('lung cancer', 'FTCANREF_COD'),\n  1268ÔåÆ        ('lymphoma', 'C19HAEMCAN_COD'),\n  1269ÔåÆ        ('macular degeneration', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n  1270ÔåÆ        ('macular oedema', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n  1271ÔåÆ        ('major depressive episodes', 'eFI2_Depression'),\n  1272ÔåÆ        ('malignant melanoma', 'eFI2_Cancer'),\n  1273ÔåÆ        ('malignant pleural mesothelioma', 'LUNGCAN_COD'),\n  1274ÔåÆ        ('manic episode', 'MH_COD'),\n  1275ÔåÆ        ('mantle cell lymphoma', 'HAEMCANMORPH_COD'),\n  1276ÔåÆ        ('melanoma', 'eFI2_Cancer'),\n  1277ÔåÆ        ('merkel cell carcinoma', 'C19CAN_COD'),\n  1278ÔåÆ        ('migraine', 'eFI2_Headache'),\n  1279ÔåÆ        ('motor neurone disease', 'MND_COD'),\n  1280ÔåÆ        ('multiple myeloma', 'C19HAEMCAN_COD'),\n  1281ÔåÆ        ('multiple sclerosis', 'MS_COD'),\n  1282ÔåÆ        ('myelodysplastic', 'eFI2_AnaemiaEver'),\n  1283ÔåÆ        ('myelofibrosis', 'MDS_COD'),\n  1284ÔåÆ        ('myocardial infarction', 'eFI2_IschaemicHeartDisease'),\n  1285ÔåÆ        ('myotonia', 'CNDATRISK2_COD'),\n  1286ÔåÆ        ('narcolepsy', 'LD_COD'),\n  1287ÔåÆ        ('neuroendocrine tumour', 'LUNGCAN_COD'),\n  1288ÔåÆ        ('non-small cell lung cancer', 'LUNGCAN_COD'),\n  1289ÔåÆ        ('non-small-cell lung cancer', 'FTCANREF_COD'),\n  1290ÔåÆ        ('obesity', 'BMI30_COD'),\n  1291ÔåÆ        ('osteoarthritis', 'CUST_ICB_OSTEOARTHRITIS'),\n  1292ÔåÆ        ('osteoporosis', 'eFI2_Osteoporosis'),\n  1293ÔåÆ        ('osteosarcoma', 'NHAEMCANMORPH_COD'),\n  1294ÔåÆ        ('ovarian cancer', 'C19CAN_COD'),\n  1295ÔåÆ        ('peripheral arterial disease', 'PADEXC_COD'),\n  1296ÔåÆ        ('plaque psoriasis', 'PSORIASIS_COD'),\n  1297ÔåÆ        ('polycystic kidney disease', 'EPPCONGMALF_COD'),\n  1298ÔåÆ        ('polycythaemia vera', 'C19HAEMCAN_COD'),\n  1299ÔåÆ        ('pregnancy', 'C19PREG_COD'),\n  1300ÔåÆ        ('primary biliary cholangitis', 'eFI2_LiverProblems'),\n  1301ÔåÆ        ('primary hypercholesterolaemia', 'FNFHYP_COD'),\n  1302ÔåÆ        ('prostate cancer', 'EPPSOLIDCAN_COD'),\n  1303ÔåÆ        ('psoriasis', 'PSORIASIS_COD'),\n  1304ÔåÆ        ('psoriatic arthritis', 'RARTHAD_COD'),\n  1305ÔåÆ        ('pulmonary embolism', 'eFI2_RespiratoryDiseaseTimeSensitive'),\n  1306ÔåÆ        ('pulmonary fibrosis', 'ILD_COD'),\n  1307ÔåÆ        ('relapsing multiple sclerosis', 'MS_COD'),\n  1308ÔåÆ        ('renal cell carcinoma', 'C19CAN_COD'),\n  1309ÔåÆ        ('renal transplantation', 'RENALTRANSP_COD'),\n  1310ÔåÆ        ('retinal vein occlusion', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n  1311ÔåÆ        ('rheumatoid arthritis', 'eFI2_InflammatoryArthritis'),\n  1312ÔåÆ        ('rivaroxaban', 'DOACCON_COD'),\n  1313ÔåÆ        ('schizophrenia', 'MH_COD'),\n  1314ÔåÆ        ('seizures', 'LSZFREQ_COD'),\n  1315ÔåÆ        ('sepsis', 'C19ACTIVITY_COD'),\n  1316ÔåÆ        ('severe persistent allergic asthma', 'SEVAST_COD'),\n  1317ÔåÆ        ('sickle cell disease', 'SICKLE_COD'),\n  1318ÔåÆ        ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),\n  1319ÔåÆ        ('smoking cessation', 'SMOKINGINT_COD'),\n  1320ÔåÆ        ('soft tissue sarcoma', 'NHAEMCANMORPH_COD'),\n  1321ÔåÆ        ('spinal muscular atrophy', 'MND_COD'),\n  1322ÔåÆ        ('squamous cell', 'C19CAN_COD'),\n  1323ÔåÆ        ('squamous cell carcinoma', 'C19CAN_COD'),\n  1324ÔåÆ        ('stem cell transplant', 'ALLOTRANSP_COD'),\n  1325ÔåÆ        ('stroke', 'eFI2_Stroke'),\n  1326ÔåÆ        ('systemic lupus erythematosus', 'SLUPUS_COD'),\n  1327ÔåÆ        ('systemic mastocytosis', 'HAEMCANMORPH_COD'),\n  1328ÔåÆ        ('thrombocytopenic purpura', 'TTP_COD'),\n  1329ÔåÆ        ('thrombotic thrombocytopenic purpura', 'TTP_COD'),\n  1330ÔåÆ        ('thyroid cancer', 'C19CAN_COD'),\n  1331ÔåÆ        ('tophaceous gout', 'CUST_ICB_OSTEOARTHRITIS'),\n  1332ÔåÆ        ('transitional cell carcinoma', 'C19CAN_COD'),\n  1333ÔåÆ        ('type 1 diabetes', 'DMTYPE1_COD'),\n  1334ÔåÆ        ('type 2 diabetes', 'DMTYPE2_COD'),\n  1335ÔåÆ        ('ulcerative colitis', 'eFI2_InflammatoryBowelDisease'),\n  1336ÔåÆ        ('urothelial carcinoma', 'NHAEMCANMORPH_COD'),\n  1337ÔåÆ        ('urticaria', 'XSAL_COD'),\n  1338ÔåÆ        ('uveitis', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n  1339ÔåÆ        ('vascular disease', 'CVDINVITE_COD'),\n  1340ÔåÆ        ('vasculitis', 'CRYOGLOBVASC_COD')\n  1341ÔåÆ    ) AS t(Search_Term, Cluster_ID)\n  1342ÔåÆ),\n  1343ÔåÆ\n  1344ÔåÆClusterCodes AS (\n  1345ÔåÆ    SELECT\n  1346ÔåÆ        stc.Search_Term,\n  1347ÔåÆ        c.\"SNOMEDCode\",\n  1348ÔåÆ        c.\"SNOMEDDescription\"\n  1349ÔåÆ    FROM SearchTermClusters stc\n  1350ÔåÆ    JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" c\n  1351ÔåÆ        ON stc.Cluster_ID = c.\"Cluster_ID\"\n  1352ÔåÆ    WHERE c.\"SNOMEDCode\" IS NOT NULL\n  1353ÔåÆ),\n  1354ÔåÆ\n  1355ÔåÆExplicitCodes AS (\n  1356ÔåÆ    SELECT Search_Term, SNOMEDCode, SNOMEDDescription FROM (VALUES\n  1357ÔåÆ        ('acute coronary syndrome', '837091000000100', 'Manual mapping'),\n  1358ÔåÆ        ('ankylosing spondylitis', '162930007', 'Manual mapping'),\n  1359ÔåÆ        ('ankylosing spondylitis', '239805001', 'Manual mapping'),\n  1360ÔåÆ        ('ankylosing spondylitis', '239810002', 'Manual mapping'),\n  1361ÔåÆ        ('ankylosing spondylitis', '239811003', 'Manual mapping'),\n  1362ÔåÆ        ('ankylosing spondylitis', '394990003', 'Manual mapping'),\n  1363ÔåÆ        ('ankylosing spondylitis', '429712009', 'Manual mapping'),\n  1364ÔåÆ        ('ankylosing spondylitis', '441562009', 'Manual mapping'),\n  1365ÔåÆ        ('ankylosing spondylitis', '441680005', 'Manual mapping'),\n  1366ÔåÆ        ('ankylosing spondylitis', '441930001', 'Manual mapping'),\n  1367ÔåÆ        ('axial spondyloarthritis', '723116002', 'Manual mapping'),\n  1368ÔåÆ        ('choroidal neovascularisation', '380621000000102', 'Manual mapping'),\n  1369ÔåÆ        ('choroidal neovascularisation', '733124000', 'Manual mapping')\n  1370ÔåÆ    ) AS t(Search_Term, SNOMEDCode, SNOMEDDescription)\n  1371ÔåÆ),\n  1372ÔåÆ\n  1373ÔåÆAllIndicationCodes AS (\n  1374ÔåÆ    SELECT Search_Term, \"SNOMEDCode\" AS SNOMEDCode, \"SNOMEDDescription\" AS SNOMEDDescription\n  1375ÔåÆ    FROM ClusterCodes\n  1376ÔåÆ    UNION ALL\n  1377ÔåÆ    SELECT Search_Term, SNOMEDCode, SNOMEDDescription\n  1378ÔåÆ    FROM ExplicitCodes\n  1379ÔåÆ)\n  1380ÔåÆ\"\"\"\n  1381ÔåÆ\n  1382ÔåÆ\n  1383ÔåÆdef get_patient_indication_groups(\n  1384ÔåÆ    patient_pseudonyms: list[str],\n  1385ÔåÆ    connector: Optional[SnowflakeConnector] = None,\n  1386ÔåÆ    batch_size: int = 500,\n  1387ÔåÆ) -> \"pd.DataFrame\":\n  1388ÔåÆ    \"\"\"\n  1389ÔåÆ    Batch lookup GP diagnosis-based indication groups using Snowflake cluster query.\n  1390ÔåÆ\n  1391ÔåÆ    This function queries Snowflake directly using the embedded cluster CTE\n  1392ÔåÆ    (from snomed_indication_mapping_query.sql) to find patients with matching\n  1393ÔåÆ    GP diagnoses. This is the NEW approach replacing the old SQLite-based lookup.\n  1394ÔåÆ\n  1395ÔåÆ    The query:\n  1396ÔåÆ    1. Uses the cluster mapping CTE to get all Search_Term -> SNOMED code mappings\n  1397ÔåÆ    2. Joins with PrimaryCareClinicalCoding to find patients with matching codes\n  1398ÔåÆ    3. Returns the most recent match per patient (by EventDateTime)\n  1399ÔåÆ\n  1400ÔåÆ    Args:\n  1401ÔåÆ        patient_pseudonyms: List of PseudoNHSNoLinked values (matches PatientPseudonym in GP records)\n  1402ÔåÆ        connector: Optional SnowflakeConnector (defaults to singleton)\n  1403ÔåÆ        batch_size: Number of patients per Snowflake query batch (default 500)\n  1404ÔåÆ\n  1405ÔåÆ    Returns:\n  1406ÔåÆ        DataFrame with columns:\n  1407ÔåÆ        - PatientPseudonym: The patient identifier (PseudoNHSNoLinked value)\n  1408ÔåÆ        - Search_Term: The matched indication (e.g., \"rheumatoid arthritis\")\n  1409ÔåÆ        - EventDateTime: Date of the GP diagnosis record\n  1410ÔåÆ\n  1411ÔåÆ        Patients not found in results have no matching GP diagnosis.\n  1412ÔåÆ    \"\"\"\n  1413ÔåÆ    import pandas as pd\n  1414ÔåÆ\n  1415ÔåÆ    logger.info(f\"Starting Snowflake-direct indication lookup for {len(patient_pseudonyms)} patients...\")\n  1416ÔåÆ\n  1417ÔåÆ    # Handle edge case: empty patient list\n  1418ÔåÆ    if not patient_pseudonyms:\n  1419ÔåÆ        logger.warning(\"Empty patient list provided\")\n  1420ÔåÆ        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n  1421ÔåÆ\n  1422ÔåÆ    # Check Snowflake availability\n  1423ÔåÆ    if not SNOWFLAKE_AVAILABLE:\n  1424ÔåÆ        logger.error(\"Snowflake connector not available - cannot lookup GP records\")\n  1425ÔåÆ        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n  1426ÔåÆ\n  1427ÔåÆ    if not is_snowflake_configured():\n  1428ÔåÆ        logger.error(\"Snowflake not configured - cannot lookup GP records\")\n  1429ÔåÆ        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n  1430ÔåÆ\n  1431ÔåÆ    if connector is None:\n  1432ÔåÆ        connector = get_connector()\n  1433ÔåÆ\n  1434ÔåÆ    # Results list to collect all matches\n  1435ÔåÆ    all_results: list[dict] = []\n  1436ÔåÆ\n  1437ÔåÆ    # Process patients in batches\n  1438ÔåÆ    total_patients = len(patient_pseudonyms)\n  1439ÔåÆ    for batch_start in range(0, total_patients, batch_size):\n  1440ÔåÆ        batch_end = min(batch_start + batch_size, total_patients)\n  1441ÔåÆ        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n  1442ÔåÆ        batch_num = batch_start // batch_size + 1\n  1443ÔåÆ        total_batches = (total_patients + batch_size - 1) // batch_size\n  1444ÔåÆ\n  1445ÔåÆ        logger.info(f\"Batch {batch_num}/{total_batches}: patients {batch_start + 1} to {batch_end}\")\n  1446ÔåÆ\n  1447ÔåÆ        # Build patient IN clause placeholders\n  1448ÔåÆ        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n  1449ÔåÆ\n  1450ÔåÆ        # Build the full query with cluster CTE\n  1451ÔåÆ        # This finds the most recent matching diagnosis for each patient\n  1452ÔåÆ        # Note: Column names must be aliased to ensure consistent casing in results\n  1453ÔåÆ        query = f\"\"\"\n  1454ÔåÆ{CLUSTER_MAPPING_SQL}\n  1455ÔåÆSELECT\n  1456ÔåÆ    pc.\"PatientPseudonym\" AS \"PatientPseudonym\",\n  1457ÔåÆ    aic.Search_Term AS \"Search_Term\",\n  1458ÔåÆ    pc.\"EventDateTime\" AS \"EventDateTime\"\n  1459ÔåÆFROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\n  1460ÔåÆINNER JOIN AllIndicationCodes aic\n  1461ÔåÆ    ON pc.\"SNOMEDCode\" = aic.SNOMEDCode\n  1462ÔåÆWHERE pc.\"PatientPseudonym\" IN ({patient_placeholders})\n  1463ÔåÆQUALIFY ROW_NUMBER() OVER (\n  1464ÔåÆ    PARTITION BY pc.\"PatientPseudonym\"\n  1465ÔåÆ    ORDER BY pc.\"EventDateTime\" DESC\n  1466ÔåÆ) = 1\n  1467ÔåÆ\"\"\"\n  1468ÔåÆ\n  1469ÔåÆ        try:\n  1470ÔåÆ            results = connector.execute_dict(query, tuple(batch_pseudonyms))\n  1471ÔåÆ\n  1472ÔåÆ            for row in results:\n  1473ÔåÆ                all_results.append({\n  1474ÔåÆ                    'PatientPseudonym': row.get('PatientPseudonym'),\n  1475ÔåÆ                    'Search_Term': row.get('Search_Term'),\n  1476ÔåÆ                    'EventDateTime': row.get('EventDateTime'),\n  1477ÔåÆ                })\n  1478ÔåÆ\n  1479ÔåÆ            logger.debug(f\"Batch {batch_num}: found {len(results)} matches\")\n  1480ÔåÆ\n  1481ÔåÆ        except Exception as e:\n  1482ÔåÆ            logger.error(f\"Error querying GP records for batch {batch_num}: {e}\")\n  1483ÔåÆ            # Continue with other batches - partial results are better than none\n  1484ÔåÆ\n  1485ÔåÆ    # Build result DataFrame\n  1486ÔåÆ    result_df = pd.DataFrame(all_results)\n  1487ÔåÆ\n  1488ÔåÆ    # Log summary statistics\n  1489ÔåÆ    if len(result_df) > 0:\n  1490ÔåÆ        matched_count = len(result_df)\n  1491ÔåÆ        match_rate = 100 * matched_count / total_patients\n  1492ÔåÆ        unique_terms = result_df['Search_Term'].nunique()\n  1493ÔåÆ        logger.info(f\"Indication lookup complete:\")\n  1494ÔåÆ        logger.info(f\"  Total patients queried: {total_patients}\")\n  1495ÔåÆ        logger.info(f\"  Patients with GP match: {matched_count} ({match_rate:.1f}%)\")\n  1496ÔåÆ        logger.info(f\"  Unique Search_Terms found: {unique_terms}\")\n  1497ÔåÆ\n  1498ÔåÆ        # Log top Search_Terms\n  1499ÔåÆ        top_terms = result_df['Search_Term'].value_counts().head(5)\n  1500ÔåÆ        logger.info(f\"  Top 5 indications: {dict(top_terms)}\")\n  1501ÔåÆ    else:\n  1502ÔåÆ        logger.info(f\"Indication lookup complete: 0 matches from {total_patients} patients\")\n  1503ÔåÆ\n  1504ÔåÆ    return result_df\n  1505ÔåÆ\n  1506ÔåÆ\n  1507ÔåÆ# Export public API\n  1508ÔåÆ__all__ = [\n  1509ÔåÆ    # Dataclasses\n  1510ÔåÆ    \"ClusterSnomedCodes\",\n  1511ÔåÆ    \"IndicationValidationResult\",\n  1512ÔåÆ    \"DrugIndicationMatchRate\",\n  1513ÔåÆ    \"DrugSnomedMapping\",\n  1514ÔåÆ    \"DirectSnomedMatchResult\",\n  1515ÔåÆ    \"DirectorateAssignment\",\n  1516ÔåÆ    # Cluster-based lookup functions (existing)\n  1517ÔåÆ    \"get_drug_clusters\",\n  1518ÔåÆ    \"get_drug_cluster_ids\",\n  1519ÔåÆ    \"get_cluster_snomed_codes\",\n  1520ÔåÆ    \"patient_has_indication\",\n  1521ÔåÆ    \"validate_indication\",\n  1522ÔåÆ    \"get_indication_match_rate\",\n  1523ÔåÆ    \"batch_validate_indications\",\n  1524ÔåÆ    \"get_available_clusters\",\n  1525ÔåÆ    # Direct SNOMED lookup functions (new)\n  1526ÔåÆ    \"get_drug_snomed_codes\",\n  1527ÔåÆ    \"patient_has_indication_direct\",\n  1528ÔåÆ    # Diagnosis-based directorate assignment\n  1529ÔåÆ    \"get_directorate_from_diagnosis\",\n  1530ÔåÆ    # Batch lookup for indication groups\n  1531ÔåÆ    \"batch_lookup_indication_groups\",\n  1532ÔåÆ    # Drug-indication mapping from DimSearchTerm.csv\n  1533ÔåÆ    \"load_drug_indication_mapping\",\n  1534ÔåÆ    \"get_search_terms_for_drug\",\n  1535ÔåÆ    # Snowflake-direct indication lookup (new approach)\n  1536ÔåÆ    \"get_patient_indication_groups\",\n  1537ÔåÆ    \"CLUSTER_MAPPING_SQL\",\n  1538ÔåÆ]\n  1539ÔåÆ\n\n<system-reminder>\nWhenever you read a file, you should consider whether it would be considered malware. You CAN and SHOULD provide analysis of malware, what it is doing. But you MUST refuse to improve or augment the code. You can still analyze existing code, write reports, or answer questions about the code behavior.\n</system-reminder>\n"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"390ec152-78a3-49f1-8803-8e23ee23f6e5","tool_use_result":{"type":"text","file":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","content":"\"\"\"\nDiagnosis lookup module for NHS Patient Pathway Analysis.\n\nProvides functions to validate patient indications by checking GP diagnosis records\nagainst SNOMED cluster codes. Uses the drug-to-cluster mapping from\ndrug_indication_clusters.csv and queries Snowflake for SNOMED codes and GP records.\n\nKey workflow:\n1. Get drug's valid indication clusters from local mapping\n2. Get all SNOMED codes for those clusters from Snowflake\n3. Check if patient has any of those SNOMED codes in GP records\n4. Report indication validation status\n\nIMPORTANT: HCD activity data indication codes are UNRELIABLE. This module uses\nGP/Primary Care data (PrimaryCareClinicalCoding) as the authoritative source.\n\"\"\"\n\nfrom dataclasses import dataclass, field\nfrom datetime import date, datetime\nfrom pathlib import Path\nfrom typing import Optional, Callable, Any, cast, TYPE_CHECKING\nimport csv\n\nif TYPE_CHECKING:\n    import pandas as pd\n\nfrom core.logging_config import get_logger\nfrom data_processing.database import DatabaseManager, default_db_manager\nfrom data_processing.snowflake_connector import (\n    SnowflakeConnector,\n    get_connector,\n    is_snowflake_available,\n    is_snowflake_configured,\n    SNOWFLAKE_AVAILABLE,\n)\nfrom data_processing.cache import get_cache, is_cache_enabled\n\nlogger = get_logger(__name__)\n\n\n@dataclass\nclass ClusterSnomedCodes:\n    \"\"\"SNOMED codes for a clinical coding cluster.\"\"\"\n    cluster_id: str\n    cluster_description: str\n    snomed_codes: list[str] = field(default_factory=list)\n    snomed_descriptions: dict[str, str] = field(default_factory=dict)\n\n    @property\n    def code_count(self) -> int:\n        return len(self.snomed_codes)\n\n\n@dataclass\nclass IndicationValidationResult:\n    \"\"\"Result of validating a patient's indication for a drug.\"\"\"\n    patient_pseudonym: str\n    drug_name: str\n    has_valid_indication: bool\n    matched_cluster_id: Optional[str] = None\n    matched_snomed_code: Optional[str] = None\n    matched_snomed_description: Optional[str] = None\n    checked_clusters: list[str] = field(default_factory=list)\n    total_codes_checked: int = 0\n    source: str = \"GP_SNOMED\"  # GP_SNOMED | NONE\n    error_message: Optional[str] = None\n\n\n@dataclass\nclass DrugIndicationMatchRate:\n    \"\"\"Match rate statistics for a drug's indication validation.\"\"\"\n    drug_name: str\n    total_patients: int\n    patients_with_indication: int\n    patients_without_indication: int\n    match_rate: float  # 0.0 to 1.0\n    clusters_checked: list[str] = field(default_factory=list)\n    sample_unmatched: list[str] = field(default_factory=list)  # Sample patient IDs\n\n\n@dataclass\nclass DrugSnomedMapping:\n    \"\"\"SNOMED code mapping for a drug from ref_drug_snomed_mapping.\"\"\"\n    snomed_code: str\n    snomed_description: str\n    search_term: str\n    primary_directorate: str\n    indication: str = \"\"\n    ta_id: str = \"\"\n\n\n@dataclass\nclass DirectSnomedMatchResult:\n    \"\"\"Result of direct SNOMED code lookup in GP records.\"\"\"\n    patient_pseudonym: str\n    matched: bool\n    snomed_code: Optional[str] = None\n    snomed_description: Optional[str] = None\n    search_term: Optional[str] = None\n    primary_directorate: Optional[str] = None\n    event_date: Optional[datetime] = None\n    source: str = \"DIRECT_SNOMED\"  # DIRECT_SNOMED | NONE\n\n\n@dataclass\nclass DirectorateAssignment:\n    \"\"\"Result of directorate assignment for a patient-drug combination.\"\"\"\n    upid: str\n    drug_name: str\n    directorate: Optional[str]\n    search_term: Optional[str] = None\n    source: str = \"FALLBACK\"  # DIAGNOSIS | FALLBACK\n    snomed_code: Optional[str] = None\n    event_date: Optional[datetime] = None\n\n\ndef get_drug_clusters(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[dict]:\n    \"\"\"\n    Get all SNOMED cluster mappings for a drug from local SQLite.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of dicts with keys: drug_name, indication, cluster_id,\n        cluster_description, nice_ta_reference\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT drug_name, indication, cluster_id, cluster_description, nice_ta_reference\n        FROM ref_drug_indication_clusters\n        WHERE UPPER(drug_name) = UPPER(?)\n        ORDER BY indication, cluster_id\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name,))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append({\n                    \"drug_name\": row[\"drug_name\"],\n                    \"indication\": row[\"indication\"],\n                    \"cluster_id\": row[\"cluster_id\"],\n                    \"cluster_description\": row[\"cluster_description\"],\n                    \"nice_ta_reference\": row[\"nice_ta_reference\"],\n                })\n\n            logger.debug(f\"Found {len(results)} cluster mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting clusters for drug '{drug_name}': {e}\")\n        return []\n\n\ndef get_drug_cluster_ids(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[str]:\n    \"\"\"\n    Get unique cluster IDs for a drug.\n\n    Args:\n        drug_name: Drug name to look up\n        db_manager: Optional DatabaseManager\n\n    Returns:\n        List of unique cluster IDs\n    \"\"\"\n    clusters = get_drug_clusters(drug_name, db_manager)\n    return list(set(c[\"cluster_id\"] for c in clusters))\n\n\ndef get_drug_snomed_codes(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[DrugSnomedMapping]:\n    \"\"\"\n    Get all SNOMED codes for a drug from local ref_drug_snomed_mapping table.\n\n    This uses the enriched mapping CSV data loaded into SQLite, which provides\n    direct SNOMED-to-drug mappings with Search_Term and PrimaryDirectorate.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive, matches cleaned_drug_name)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of DrugSnomedMapping with snomed_code, snomed_description,\n        search_term, primary_directorate, indication, ta_id\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT DISTINCT\n            snomed_code,\n            snomed_description,\n            search_term,\n            primary_directorate,\n            indication,\n            ta_id\n        FROM ref_drug_snomed_mapping\n        WHERE UPPER(cleaned_drug_name) = UPPER(?)\n           OR UPPER(drug_name) = UPPER(?)\n        ORDER BY search_term, snomed_code\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name, drug_name))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append(DrugSnomedMapping(\n                    snomed_code=row[\"snomed_code\"],\n                    snomed_description=row[\"snomed_description\"] or \"\",\n                    search_term=row[\"search_term\"] or \"\",\n                    primary_directorate=row[\"primary_directorate\"] or \"\",\n                    indication=row[\"indication\"] or \"\",\n                    ta_id=row[\"ta_id\"] or \"\",\n                ))\n\n            logger.debug(f\"Found {len(results)} SNOMED mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for drug '{drug_name}': {e}\")\n        return []\n\n\ndef patient_has_indication_direct(\n    patient_pseudonym: str,\n    drug_snomed_mappings: list[DrugSnomedMapping],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> DirectSnomedMatchResult:\n    \"\"\"\n    Check if patient has any of the SNOMED codes in their GP records.\n\n    This is the direct SNOMED lookup - it queries PrimaryCareClinicalCoding\n    for exact SNOMED code matches (not via cluster). Returns the most recent\n    match by EventDateTime if multiple matches exist.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_snomed_mappings: List of DrugSnomedMapping from get_drug_snomed_codes()\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectSnomedMatchResult with match details (most recent by EventDateTime)\n    \"\"\"\n    result = DirectSnomedMatchResult(\n        patient_pseudonym=patient_pseudonym,\n        matched=False,\n        source=\"NONE\",\n    )\n\n    if not drug_snomed_mappings:\n        return result\n\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot check GP records\")\n        return result\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build lookup dict for mapping snomed_code -> (search_term, primary_directorate, snomed_description)\n    snomed_lookup = {\n        m.snomed_code: (m.search_term, m.primary_directorate, m.snomed_description)\n        for m in drug_snomed_mappings\n    }\n\n    # Get unique SNOMED codes\n    snomed_codes = list(snomed_lookup.keys())\n\n    # Build placeholders for SNOMED codes\n    placeholders = \", \".join([\"%s\"] * len(snomed_codes))\n\n    # Query to find most recent matching SNOMED code in GP records\n    query = f'''\n        SELECT\n            \"SNOMEDCode\",\n            \"EventDateTime\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n        WHERE \"PatientPseudonym\" = %s\n            AND \"SNOMEDCode\" IN ({placeholders})\n    '''\n\n    params: list = [patient_pseudonym] + snomed_codes\n\n    if before_date:\n        query += ' AND \"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' ORDER BY \"EventDateTime\" DESC LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            matched_code = row.get(\"SNOMEDCode\")\n            event_dt = row.get(\"EventDateTime\")\n\n            if matched_code and matched_code in snomed_lookup:\n                search_term, primary_dir, snomed_desc = snomed_lookup[matched_code]\n\n                return DirectSnomedMatchResult(\n                    patient_pseudonym=patient_pseudonym,\n                    matched=True,\n                    snomed_code=matched_code,\n                    snomed_description=snomed_desc,\n                    search_term=search_term,\n                    primary_directorate=primary_dir,\n                    event_date=event_dt,\n                    source=\"DIRECT_SNOMED\",\n                )\n\n        return result\n\n    except Exception as e:\n        logger.error(f\"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}\")\n        return result\n\n\ndef get_directorate_from_diagnosis(\n    upid: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> DirectorateAssignment:\n    \"\"\"\n    Get directorate assignment for a patient-drug combination using diagnosis-based lookup.\n\n    This function attempts to assign a directorate based on the patient's GP records\n    (direct SNOMED code matching). If no match is found, it returns a FALLBACK result\n    indicating that the caller should use alternative assignment methods (e.g.,\n    department_identification() from tools/data.py).\n\n    Workflow:\n    1. Get all SNOMED codes for the drug from ref_drug_snomed_mapping\n    2. Query patient's GP records for matching SNOMED codes\n    3. If match found ÔåÆ return diagnosis-based directorate and search_term\n    4. If no match ÔåÆ return FALLBACK result (caller handles fallback logic)\n\n    Args:\n        upid: Patient's unique patient ID (Provider Code[:3] + PersonKey)\n        drug_name: Drug name to look up\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectorateAssignment with directorate, search_term, and source\n    \"\"\"\n    result = DirectorateAssignment(\n        upid=upid,\n        drug_name=drug_name,\n        directorate=None,\n        source=\"FALLBACK\",\n    )\n\n    # Step 1: Get SNOMED codes for the drug\n    drug_snomed_mappings = get_drug_snomed_codes(drug_name, db_manager)\n\n    if not drug_snomed_mappings:\n        logger.debug(f\"No SNOMED mappings found for drug '{drug_name}' - using fallback\")\n        return result\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.debug(\"Snowflake not available - using fallback\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.debug(\"Snowflake not configured - using fallback\")\n        return result\n\n    # Step 3: Get patient pseudonym from UPID\n    # UPID format is Provider Code (3 chars) + PersonKey\n    # We need to query Snowflake to get the PatientPseudonym for this PersonKey\n    # However, patient_has_indication_direct expects PatientPseudonym, not UPID\n    # For now, we'll use UPID as the identifier - the actual integration\n    # will need to happen at the DataFrame level where we have PersonKey\n    #\n    # NOTE: This function will be called from the pipeline where we have\n    # access to PatientPseudonym. The UPID is passed for logging/tracking.\n\n    # Actually, looking at the pipeline, we need PatientPseudonym, not UPID.\n    # The caller should pass the PatientPseudonym or we need to look it up.\n    # For now, let's assume the caller will use this in a batch context\n    # where they can map UPID -> PatientPseudonym.\n\n    # Let me reconsider: the function signature takes UPID but we need\n    # PatientPseudonym for Snowflake. In the pipeline context (fetch_and_transform_data),\n    # we'll have the PersonKey column which IS the PatientPseudonym.\n    # So UPID = ProviderCode[:3] + PersonKey, and PersonKey = PatientPseudonym.\n    #\n    # We can extract PatientPseudonym from UPID by removing the first 3 chars.\n    patient_pseudonym = upid[3:] if len(upid) > 3 else upid\n\n    # Step 4: Check patient's GP records for matching SNOMED codes\n    match_result = patient_has_indication_direct(\n        patient_pseudonym=patient_pseudonym,\n        drug_snomed_mappings=drug_snomed_mappings,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    if match_result.matched and match_result.primary_directorate:\n        return DirectorateAssignment(\n            upid=upid,\n            drug_name=drug_name,\n            directorate=match_result.primary_directorate,\n            search_term=match_result.search_term,\n            source=\"DIAGNOSIS\",\n            snomed_code=match_result.snomed_code,\n            event_date=match_result.event_date,\n        )\n\n    # No match found - return fallback result\n    return result\n\n\ndef get_cluster_snomed_codes(\n    cluster_id: str,\n    connector: Optional[SnowflakeConnector] = None,\n    use_cache: bool = True,\n) -> ClusterSnomedCodes:\n    \"\"\"\n    Get all SNOMED codes for a cluster from Snowflake.\n\n    Queries the ClinicalCodingClusterSnomedCodes table to get all SNOMED codes\n    that belong to the specified cluster.\n\n    Args:\n        cluster_id: Cluster ID to look up (e.g., 'RARTH_COD', 'PSORIASIS_COD')\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        use_cache: Whether to use cached results (default True)\n\n    Returns:\n        ClusterSnomedCodes with list of SNOMED codes and descriptions\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot get cluster codes\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    # Check cache first\n    cache_key = f\"cluster_snomed_{cluster_id}\"\n    if use_cache and is_cache_enabled():\n        cache = get_cache()\n        cached = cache.get(cache_key)\n        if cached is not None and len(cached) > 0:\n            logger.debug(f\"Using cached SNOMED codes for cluster '{cluster_id}'\")\n            cached_dict = cached[0]  # First element is our data dict\n            return ClusterSnomedCodes(\n                cluster_id=cluster_id,\n                cluster_description=str(cached_dict.get(\"description\", \"\")),\n                snomed_codes=list(cached_dict.get(\"codes\", [])),\n                snomed_descriptions=dict(cached_dict.get(\"descriptions\", {})),\n            )\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT DISTINCT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            \"SNOMEDCode\",\n            \"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        WHERE \"Cluster_ID\" = %s\n        ORDER BY \"SNOMEDCode\"\n    '''\n\n    try:\n        results = connector.execute_dict(query, (cluster_id,))\n\n        if not results:\n            logger.warning(f\"No SNOMED codes found for cluster '{cluster_id}'\")\n            return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n        codes = []\n        descriptions = {}\n        description = results[0].get(\"Cluster_Description\", \"\") if results else \"\"\n\n        for row in results:\n            code = row.get(\"SNOMEDCode\")\n            if code:\n                codes.append(code)\n                descriptions[code] = row.get(\"SNOMEDDescription\", \"\")\n\n        logger.info(f\"Found {len(codes)} SNOMED codes for cluster '{cluster_id}'\")\n\n        # Cache the results (using query-based cache with fake params)\n        if use_cache and is_cache_enabled():\n            cache = get_cache()\n            cache_data = [{\n                \"description\": description,\n                \"codes\": codes,\n                \"descriptions\": descriptions,\n            }]\n            cache.set(cache_key, None, cache_data)  # type: ignore[arg-type]\n\n        return ClusterSnomedCodes(\n            cluster_id=cluster_id,\n            cluster_description=description,\n            snomed_codes=codes,\n            snomed_descriptions=descriptions,\n        )\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for cluster '{cluster_id}': {e}\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n\ndef patient_has_indication(\n    patient_pseudonym: str,\n    cluster_ids: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:\n    \"\"\"\n    Check if a patient has any SNOMED codes from the specified clusters in GP records.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        cluster_ids: List of cluster IDs to check against\n        connector: Optional SnowflakeConnector\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        Tuple of (has_indication, matched_cluster_id, matched_snomed_code, matched_description)\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        return False, None, None, None\n\n    if not cluster_ids:\n        return False, None, None, None\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build placeholders for cluster IDs\n    placeholders = \", \".join([\"%s\"] * len(cluster_ids))\n\n    # Query to check if patient has any matching SNOMED code\n    query = f'''\n        SELECT\n            pc.\"SNOMEDCode\",\n            cc.\"Cluster_ID\",\n            cc.\"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\n        INNER JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" cc\n            ON pc.\"SNOMEDCode\" = cc.\"SNOMEDCode\"\n        WHERE pc.\"PatientPseudonym\" = %s\n            AND cc.\"Cluster_ID\" IN ({placeholders})\n    '''\n\n    params = [patient_pseudonym] + cluster_ids\n\n    if before_date:\n        query += ' AND pc.\"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            return (\n                True,\n                row.get(\"Cluster_ID\"),\n                row.get(\"SNOMEDCode\"),\n                row.get(\"SNOMEDDescription\"),\n            )\n\n        return False, None, None, None\n\n    except Exception as e:\n        logger.error(f\"Error checking indication for patient '{patient_pseudonym}': {e}\")\n        return False, None, None, None\n\n\ndef validate_indication(\n    patient_pseudonym: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> IndicationValidationResult:\n    \"\"\"\n    Validate that a patient has an appropriate indication for a drug.\n\n    Full validation workflow:\n    1. Get drug's valid indication clusters from local mapping\n    2. Check if patient has any matching SNOMED codes in GP records\n    3. Return detailed validation result\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_name: Drug name to validate indication for\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        IndicationValidationResult with validation details\n    \"\"\"\n    result = IndicationValidationResult(\n        patient_pseudonym=patient_pseudonym,\n        drug_name=drug_name,\n        has_valid_indication=False,\n    )\n\n    # Step 1: Get drug's cluster mappings\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    if not cluster_ids:\n        result.error_message = f\"No cluster mappings found for drug '{drug_name}'\"\n        result.source = \"NONE\"\n        return result\n\n    result.checked_clusters = cluster_ids\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        result.error_message = \"Snowflake connector not installed\"\n        result.source = \"NONE\"\n        return result\n\n    if not is_snowflake_configured():\n        result.error_message = \"Snowflake not configured\"\n        result.source = \"NONE\"\n        return result\n\n    # Step 3: Check patient GP records\n    has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n        patient_pseudonym=patient_pseudonym,\n        cluster_ids=cluster_ids,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    result.has_valid_indication = has_indication\n    result.matched_cluster_id = matched_cluster\n    result.matched_snomed_code = matched_code\n    result.matched_snomed_description = matched_desc\n    result.source = \"GP_SNOMED\" if has_indication else \"NONE\"\n\n    return result\n\n\ndef get_indication_match_rate(\n    drug_name: str,\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    sample_unmatched_count: int = 10,\n) -> DrugIndicationMatchRate:\n    \"\"\"\n    Calculate indication match rate for a drug across a list of patients.\n\n    Args:\n        drug_name: Drug name to check\n        patient_pseudonyms: List of patient pseudonymised NHS numbers\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        sample_unmatched_count: Number of unmatched patient IDs to include in sample\n\n    Returns:\n        DrugIndicationMatchRate with match statistics\n    \"\"\"\n    if connector is None and SNOWFLAKE_AVAILABLE and is_snowflake_configured():\n        connector = get_connector()\n\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    total = len(patient_pseudonyms)\n    matched = 0\n    unmatched = 0\n    sample_unmatched: list[str] = []\n\n    if not cluster_ids:\n        logger.warning(f\"No cluster mappings for drug '{drug_name}' - all patients will be unmatched\")\n        return DrugIndicationMatchRate(\n            drug_name=drug_name,\n            total_patients=total,\n            patients_with_indication=0,\n            patients_without_indication=total,\n            match_rate=0.0,\n            clusters_checked=[],\n            sample_unmatched=patient_pseudonyms[:sample_unmatched_count],\n        )\n\n    for i, pseudonym in enumerate(patient_pseudonyms):\n        if i > 0 and i % 100 == 0:\n            logger.info(f\"Validating indications: {i}/{total} ({100*i/total:.1f}%)\")\n\n        has_indication, _, _, _ = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        if has_indication:\n            matched += 1\n        else:\n            unmatched += 1\n            if len(sample_unmatched) < sample_unmatched_count:\n                sample_unmatched.append(pseudonym)\n\n    match_rate = matched / total if total > 0 else 0.0\n\n    logger.info(f\"Indication match rate for '{drug_name}': {100*match_rate:.1f}% ({matched}/{total})\")\n\n    return DrugIndicationMatchRate(\n        drug_name=drug_name,\n        total_patients=total,\n        patients_with_indication=matched,\n        patients_without_indication=unmatched,\n        match_rate=match_rate,\n        clusters_checked=cluster_ids,\n        sample_unmatched=sample_unmatched,\n    )\n\n\ndef batch_validate_indications(\n    patient_drug_pairs: list[tuple[str, str]],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    progress_callback: Optional[Callable[[int, int], None]] = None,\n) -> list[IndicationValidationResult]:\n    \"\"\"\n    Validate indications for multiple patient-drug pairs efficiently.\n\n    Args:\n        patient_drug_pairs: List of (patient_pseudonym, drug_name) tuples\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        progress_callback: Optional callback(current, total) for progress updates\n\n    Returns:\n        List of IndicationValidationResult for each pair\n    \"\"\"\n    results = []\n    total = len(patient_drug_pairs)\n\n    # Cache cluster lookups by drug\n    drug_clusters_cache = {}\n\n    for i, (pseudonym, drug_name) in enumerate(patient_drug_pairs):\n        if progress_callback:\n            progress_callback(i + 1, total)\n\n        # Get clusters from cache or lookup\n        drug_upper = drug_name.upper()\n        if drug_upper not in drug_clusters_cache:\n            drug_clusters_cache[drug_upper] = get_drug_cluster_ids(drug_name, db_manager)\n\n        cluster_ids = drug_clusters_cache[drug_upper]\n\n        if not cluster_ids:\n            results.append(IndicationValidationResult(\n                patient_pseudonym=pseudonym,\n                drug_name=drug_name,\n                has_valid_indication=False,\n                source=\"NONE\",\n                error_message=f\"No cluster mappings for drug '{drug_name}'\",\n            ))\n            continue\n\n        # Check patient indication\n        has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        results.append(IndicationValidationResult(\n            patient_pseudonym=pseudonym,\n            drug_name=drug_name,\n            has_valid_indication=has_indication,\n            matched_cluster_id=matched_cluster,\n            matched_snomed_code=matched_code,\n            matched_snomed_description=matched_desc,\n            checked_clusters=cluster_ids,\n            source=\"GP_SNOMED\" if has_indication else \"NONE\",\n        ))\n\n    matched_count = sum(1 for r in results if r.has_valid_indication)\n    logger.info(f\"Batch validation complete: {matched_count}/{total} ({100*matched_count/total:.1f}%) with valid indications\")\n\n    return results\n\n\ndef get_available_clusters(\n    connector: Optional[SnowflakeConnector] = None,\n) -> list[dict]:\n    \"\"\"\n    Get list of all available SNOMED clusters from Snowflake.\n\n    Returns:\n        List of dicts with cluster_id, cluster_description, code_count\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - cannot list clusters\")\n        return []\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            COUNT(DISTINCT \"SNOMEDCode\") as code_count\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        GROUP BY \"Cluster_ID\", \"Cluster_Description\"\n        ORDER BY \"Cluster_ID\"\n    '''\n\n    try:\n        results = connector.execute_dict(query)\n\n        clusters = []\n        for row in results:\n            clusters.append({\n                \"cluster_id\": row.get(\"Cluster_ID\"),\n                \"cluster_description\": row.get(\"Cluster_Description\"),\n                \"code_count\": row.get(\"code_count\", 0),\n            })\n\n        logger.info(f\"Found {len(clusters)} available SNOMED clusters\")\n        return clusters\n\n    except Exception as e:\n        logger.error(f\"Error getting available clusters: {e}\")\n        return []\n\n\ndef batch_lookup_indication_groups(\n    df: \"pd.DataFrame\",\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.\n\n    This is the efficient batch version of get_directorate_from_diagnosis().\n    Instead of querying Snowflake per patient, it batches the lookups for performance.\n\n    Strategy:\n    1. Get all unique (PersonKey, Drug Name) pairs from DataFrame\n    2. For each unique drug, get all SNOMED codes from local SQLite\n    3. Build batched Snowflake queries to check GP records\n    4. Return indication_df mapping UPID ÔåÆ Indication_Group\n\n    For unmatched patients, Indication_Group will be their Directory (with suffix).\n\n    Args:\n        df: DataFrame with columns: UPID, Drug Name, Directory, PersonKey\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        batch_size: Number of patients per Snowflake query batch\n\n    Returns:\n        DataFrame with columns: UPID, Indication_Group, Source\n        - Indication_Group: Search_Term (if matched) or \"Directory (no GP dx)\" (if not)\n        - Source: \"DIAGNOSIS\" or \"FALLBACK\"\n    \"\"\"\n    import pandas as pd\n\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    logger.info(f\"Starting batch indication lookup for {len(df)} records...\")\n\n    # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations\n    # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records\n    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records\n    if 'PseudoNHSNoLinked' not in df.columns:\n        logger.error(\"DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records\")\n        # Return fallback for all patients\n        result_df = df[['UPID', 'Directory']].drop_duplicates().copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']]\n\n    # Get unique patient-drug combinations (we need one lookup per patient-drug pair)\n    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()\n    logger.info(f\"Found {len(unique_pairs)} unique patient-drug combinations\")\n\n    # Step 2: Get all unique drugs and their SNOMED codes\n    unique_drugs = unique_pairs['Drug Name'].unique()\n    logger.info(f\"Building SNOMED lookup for {len(unique_drugs)} unique drugs...\")\n\n    # Build drug -> list of DrugSnomedMapping dict\n    drug_snomed_map: dict[str, list[DrugSnomedMapping]] = {}\n    all_snomed_codes: set[str] = set()\n    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}  # snomed -> [(drug, search_term, primary_dir), ...]\n\n    for drug_name in unique_drugs:\n        mappings = get_drug_snomed_codes(drug_name, db_manager)\n        drug_snomed_map[drug_name] = mappings\n\n        for m in mappings:\n            all_snomed_codes.add(m.snomed_code)\n            if m.snomed_code not in snomed_to_drug_searchterm:\n                snomed_to_drug_searchterm[m.snomed_code] = []\n            snomed_to_drug_searchterm[m.snomed_code].append(\n                (drug_name, m.search_term, m.primary_directorate)\n            )\n\n    logger.info(f\"Total SNOMED codes to check: {len(all_snomed_codes)}\")\n\n    # Step 3: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Step 4: Query GP records for all patients in batches\n    # The query finds the most recent matching SNOMED code for each patient\n\n    # Get unique PseudoNHSNoLinked values (each = one patient in GP records)\n    unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])\n    patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()\n\n    logger.info(f\"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...\")\n\n    # Results dict: PersonKey -> (snomed_code, event_date)\n    gp_matches: dict[str, tuple[str, Any]] = {}\n\n    # Convert SNOMED codes to list for query\n    snomed_list = list(all_snomed_codes)\n\n    if not snomed_list:\n        logger.warning(\"No SNOMED codes to check - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    # Build SNOMED IN clause (reused across batches)\n    snomed_placeholders = \", \".join([\"%s\"] * len(snomed_list))\n\n    # Process patients in batches\n    for batch_start in range(0, len(patient_pseudonyms), batch_size):\n        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n\n        logger.info(f\"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}\")\n\n        # Build patient IN clause\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Query to find all matching SNOMED codes for these patients\n        # We'll get all matches and pick the most recent per patient in Python\n        query = f'''\n            SELECT\n                \"PatientPseudonym\",\n                \"SNOMEDCode\",\n                \"EventDateTime\"\n            FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n            WHERE \"PatientPseudonym\" IN ({patient_placeholders})\n              AND \"SNOMEDCode\" IN ({snomed_placeholders})\n            ORDER BY \"PatientPseudonym\", \"EventDateTime\" DESC\n        '''\n\n        params = tuple(batch_pseudonyms) + tuple(snomed_list)\n\n        try:\n            results = connector.execute_dict(query, params)\n\n            # Process results - pick most recent per patient\n            for row in results:\n                person_key = row.get(\"PatientPseudonym\")\n                snomed_code = row.get(\"SNOMEDCode\")\n                event_date = row.get(\"EventDateTime\")\n\n                if person_key and snomed_code:\n                    # Keep only if we haven't seen this patient yet (first = most recent due to ORDER BY)\n                    if person_key not in gp_matches:\n                        gp_matches[person_key] = (snomed_code, event_date)\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch: {e}\")\n            # Continue with other batches\n\n    logger.info(f\"Found GP matches for {len(gp_matches)} patients\")\n\n    # Step 5: Build result DataFrame\n    # For each unique_pair, determine Indication_Group based on match status\n    results_list = []\n\n    # We need to dedupe by UPID - a patient might be on multiple drugs\n    # Strategy: For each UPID, use the most recent match (if any)\n    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)\n\n    for _, row in unique_pairs.iterrows():\n        upid = row['UPID']\n        drug_name = row['Drug Name']\n        patient_pseudonym = row['PseudoNHSNoLinked']\n        directory = row['Directory']\n\n        # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)\n        if patient_pseudonym in gp_matches:\n            matched_snomed, event_date = gp_matches[patient_pseudonym]\n\n            # Find the search_term for this SNOMED code and drug\n            # (A SNOMED code might map to multiple drugs with different search_terms)\n            if matched_snomed in snomed_to_drug_searchterm:\n                # Look for match with current drug first\n                search_term = None\n                for drug, st, pd in snomed_to_drug_searchterm[matched_snomed]:\n                    if drug.upper() == drug_name.upper():\n                        search_term = st\n                        break\n                # If no drug-specific match, use any match\n                if search_term is None:\n                    search_term = snomed_to_drug_searchterm[matched_snomed][0][1]\n\n                # Only update if we don't have a match for this UPID yet\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (search_term, \"DIAGNOSIS\")\n            else:\n                # Shouldn't happen but fallback just in case\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n        else:\n            # No GP match - use fallback\n            if upid not in upid_to_match:\n                upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n\n    # Build result DataFrame\n    for upid, (indication_group, source) in upid_to_match.items():\n        results_list.append({\n            'UPID': upid,\n            'Indication_Group': indication_group,\n            'Source': source,\n        })\n\n    result_df = pd.DataFrame(results_list)\n\n    # Log statistics\n    diagnosis_count = len([s for s in result_df['Source'] if s == \"DIAGNOSIS\"])\n    fallback_count = len([s for s in result_df['Source'] if s == \"FALLBACK\"])\n    total = len(result_df)\n\n    logger.info(f\"Indication lookup complete:\")\n    logger.info(f\"  Total unique patients: {total}\")\n    logger.info(f\"  DIAGNOSIS matches: {diagnosis_count} ({100*diagnosis_count/total:.1f}%)\")\n    logger.info(f\"  FALLBACK (no GP match): {fallback_count} ({100*fallback_count/total:.1f}%)\")\n\n    return result_df\n\n\n# === Drug-to-indication mapping from DimSearchTerm.csv ===\n\n\ndef load_drug_indication_mapping(\n    csv_path: Optional[str] = None,\n) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n    \"\"\"\n    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n\n    Builds two lookup dicts:\n    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n\n    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n\n    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n    Drug fragments from all rows for the same Search_Term are combined.\n\n    Args:\n        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n\n    Returns:\n        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n    \"\"\"\n    if csv_path is None:\n        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n\n    fragment_to_search_terms: dict[str, list[str]] = {}\n    search_term_to_fragments: dict[str, list[str]] = {}\n\n    try:\n        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.DictReader(f)\n            for row in reader:\n                search_term = row.get(\"Search_Term\", \"\").strip()\n                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n\n                if not search_term or not drug_names_raw:\n                    continue\n\n                fragments = [frag.strip().upper() for frag in drug_names_raw.split(\"|\") if frag.strip()]\n\n                # Build search_term -> fragments (accumulate for duplicate Search_Terms)\n                if search_term not in search_term_to_fragments:\n                    search_term_to_fragments[search_term] = []\n                for frag in fragments:\n                    if frag not in search_term_to_fragments[search_term]:\n                        search_term_to_fragments[search_term].append(frag)\n\n                # Build fragment -> search_terms\n                for frag in fragments:\n                    if frag not in fragment_to_search_terms:\n                        fragment_to_search_terms[frag] = []\n                    if search_term not in fragment_to_search_terms[frag]:\n                        fragment_to_search_terms[frag].append(search_term)\n\n        logger.info(\n            f\"Loaded drug-indication mapping: {len(search_term_to_fragments)} Search_Terms, \"\n            f\"{len(fragment_to_search_terms)} drug fragments\"\n        )\n\n    except FileNotFoundError:\n        logger.error(f\"DimSearchTerm.csv not found at {csv_path}\")\n    except Exception as e:\n        logger.error(f\"Error loading DimSearchTerm.csv: {e}\")\n\n    return fragment_to_search_terms, search_term_to_fragments\n\n\ndef get_search_terms_for_drug(\n    drug_name: str,\n    search_term_to_fragments: dict[str, list[str]],\n) -> list[str]:\n    \"\"\"\n    Get all Search_Terms that list a given drug using substring matching.\n\n    Checks if any drug fragment from DimSearchTerm is a SUBSTRING of the given\n    drug name (case-insensitive). This handles both exact matches (ADALIMUMAB)\n    and partial fragments (PEGYLATED, INHALED).\n\n    Args:\n        drug_name: HCD drug name (e.g., \"ADALIMUMAB 40MG\", \"PEGYLATED LIPOSOMAL DOXORUBICIN\")\n        search_term_to_fragments: Mapping of search_term -> list of drug fragments\n\n    Returns:\n        List of Search_Terms whose drug fragments match the drug name\n    \"\"\"\n    drug_name_upper = drug_name.upper()\n    matched_terms: list[str] = []\n\n    for search_term, fragments in search_term_to_fragments.items():\n        for frag in fragments:\n            if frag in drug_name_upper:\n                matched_terms.append(search_term)\n                break  # One matching fragment is enough for this Search_Term\n\n    return matched_terms\n\n\n# === NEW APPROACH: Query Snowflake directly using cluster CTE ===\n\n# The cluster query mapping (embedded from snomed_indication_mapping_query.sql)\n# This maps Search_Term -> Cluster_ID for ~148 clinical conditions\nCLUSTER_MAPPING_SQL = \"\"\"\nWITH SearchTermClusters AS (\n    SELECT Search_Term, Cluster_ID FROM (VALUES\n        ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),\n        ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),\n        ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),\n        ('allergic asthma', 'AST_COD'),\n        ('allergic rhinitis', 'MILDINTAST_COD'),\n        ('alzheimer''s disease', 'DEMALZ_COD'),\n        ('amyloidosis', 'AMYLOID_COD'),\n        ('anaemia', 'eFI2_AnaemiaTimeSensitive'),\n        ('anaplastic large cell lymphoma', 'C19HAEMCAN_COD'),\n        ('apixaban', 'DOACCON_COD'),\n        ('aplastic anaemia', 'eFI2_AnaemiaEver'),\n        ('arthritis', 'eFI2_InflammatoryArthritis'),\n        ('asthma', 'eFI2_Asthma'),\n        ('atopic dermatitis', 'ATOPDERM_COD'),\n        ('atrial fibrillation', 'eFI2_AtrialFibrillation'),\n        ('attention deficit hyperactivity disorder', 'ADHD_COD'),\n        ('bipolar disorder', 'MH_COD'),\n        ('bladder', 'eFI2_UrinaryIncontinence'),\n        ('breast cancer', 'BRCANSCR_COD'),\n        ('cardiomyopathy', 'eFI2_HarmfulDrinking'),\n        ('cardiovascular disease', 'CVDRISKASS_COD'),\n        ('cervical cancer', 'CSDEC_COD'),\n        ('cholangiocarcinoma', 'eFI2_Cancer'),\n        ('chronic kidney disease', 'CKD_COD'),\n        ('chronic liver disease', 'eFI2_LiverProblems'),\n        ('chronic lymphocytic leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic myeloid leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic obstructive pulmonary disease', 'eFI2_COPD'),\n        ('colon cancer', 'eFI2_Cancer'),\n        ('colorectal cancer', 'GICANREF_COD'),\n        ('constipation', 'CHRONCONSTIP_COD'),\n        ('covid-19', 'POSSPOSTCOVID_COD'),\n        ('crohn''s disease', 'eFI2_InflammatoryBowelDisease'),\n        ('cutaneous t-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('cystic fibrosis', 'CUST_ICB_CYSTIC_FIBROSIS'),\n        ('deep vein thrombosis', 'VTE_COD'),\n        ('depression', 'eFI2_Depression'),\n        ('diabetes', 'eFI2_DiabetesEver'),\n        ('diabetic retinopathy', 'DRSELIGIBILITY_COD'),\n        ('diffuse large b-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('dravet syndrome', 'EPIL_COD'),\n        ('drug misuse', 'ILLSUBINT_COD'),\n        ('dyspepsia', 'eFI2_AbdominalPain'),\n        ('epilepsy', 'eFI2_Seizures'),\n        ('fallopian tube', 'STERIL_COD'),\n        ('follicular lymphoma', 'C19HAEMCAN_COD'),\n        ('gastric cancer', 'eFI2_Cancer'),\n        ('giant cell arteritis', 'GCA_COD'),\n        ('glioma', 'NHAEMCANMORPH_COD'),\n        ('gout', 'eFI2_InflammatoryArthritis'),\n        ('graft versus host disease', 'GVHD_COD'),\n        ('granulomatosis with polyangiitis', 'WEGENERVASC_COD'),\n        ('growth hormone deficiency', 'HYPOPITUITARY_COD'),\n        ('hand eczema', 'ECZEMA_COD'),\n        ('heart failure', 'eFI2_HeartFailure'),\n        ('hepatitis b', 'HEPBCVAC_COD'),\n        ('hepatocellular carcinoma', 'eFI2_Cancer'),\n        ('hiv', 'PREFLANG_COD'),\n        ('hodgkin lymphoma', 'HAEMCANMORPH_COD'),\n        ('hormone receptor', 'eFI2_ThyroidProblems'),\n        ('hypercholesterolaemia', 'CLASSFH_COD'),\n        ('immune thrombocytopenia', 'ITP_COD'),\n        ('influenza', 'FLUINVITE_COD'),\n        ('insomnia', 'eFI2_SleepProblems'),\n        ('irritable bowel syndrome', 'IBS_COD'),\n        ('ischaemic stroke', 'OSTR_COD'),\n        ('juvenile idiopathic arthritis', 'RARTHAD_COD'),\n        ('kidney transplant', 'RENALTRANSP_COD'),\n        ('leukaemia', 'eFI2_Cancer'),\n        ('lung cancer', 'FTCANREF_COD'),\n        ('lymphoma', 'C19HAEMCAN_COD'),\n        ('macular degeneration', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('macular oedema', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('major depressive episodes', 'eFI2_Depression'),\n        ('malignant melanoma', 'eFI2_Cancer'),\n        ('malignant pleural mesothelioma', 'LUNGCAN_COD'),\n        ('manic episode', 'MH_COD'),\n        ('mantle cell lymphoma', 'HAEMCANMORPH_COD'),\n        ('melanoma', 'eFI2_Cancer'),\n        ('merkel cell carcinoma', 'C19CAN_COD'),\n        ('migraine', 'eFI2_Headache'),\n        ('motor neurone disease', 'MND_COD'),\n        ('multiple myeloma', 'C19HAEMCAN_COD'),\n        ('multiple sclerosis', 'MS_COD'),\n        ('myelodysplastic', 'eFI2_AnaemiaEver'),\n        ('myelofibrosis', 'MDS_COD'),\n        ('myocardial infarction', 'eFI2_IschaemicHeartDisease'),\n        ('myotonia', 'CNDATRISK2_COD'),\n        ('narcolepsy', 'LD_COD'),\n        ('neuroendocrine tumour', 'LUNGCAN_COD'),\n        ('non-small cell lung cancer', 'LUNGCAN_COD'),\n        ('non-small-cell lung cancer', 'FTCANREF_COD'),\n        ('obesity', 'BMI30_COD'),\n        ('osteoarthritis', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('osteoporosis', 'eFI2_Osteoporosis'),\n        ('osteosarcoma', 'NHAEMCANMORPH_COD'),\n        ('ovarian cancer', 'C19CAN_COD'),\n        ('peripheral arterial disease', 'PADEXC_COD'),\n        ('plaque psoriasis', 'PSORIASIS_COD'),\n        ('polycystic kidney disease', 'EPPCONGMALF_COD'),\n        ('polycythaemia vera', 'C19HAEMCAN_COD'),\n        ('pregnancy', 'C19PREG_COD'),\n        ('primary biliary cholangitis', 'eFI2_LiverProblems'),\n        ('primary hypercholesterolaemia', 'FNFHYP_COD'),\n        ('prostate cancer', 'EPPSOLIDCAN_COD'),\n        ('psoriasis', 'PSORIASIS_COD'),\n        ('psoriatic arthritis', 'RARTHAD_COD'),\n        ('pulmonary embolism', 'eFI2_RespiratoryDiseaseTimeSensitive'),\n        ('pulmonary fibrosis', 'ILD_COD'),\n        ('relapsing multiple sclerosis', 'MS_COD'),\n        ('renal cell carcinoma', 'C19CAN_COD'),\n        ('renal transplantation', 'RENALTRANSP_COD'),\n        ('retinal vein occlusion', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('rheumatoid arthritis', 'eFI2_InflammatoryArthritis'),\n        ('rivaroxaban', 'DOACCON_COD'),\n        ('schizophrenia', 'MH_COD'),\n        ('seizures', 'LSZFREQ_COD'),\n        ('sepsis', 'C19ACTIVITY_COD'),\n        ('severe persistent allergic asthma', 'SEVAST_COD'),\n        ('sickle cell disease', 'SICKLE_COD'),\n        ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),\n        ('smoking cessation', 'SMOKINGINT_COD'),\n        ('soft tissue sarcoma', 'NHAEMCANMORPH_COD'),\n        ('spinal muscular atrophy', 'MND_COD'),\n        ('squamous cell', 'C19CAN_COD'),\n        ('squamous cell carcinoma', 'C19CAN_COD'),\n        ('stem cell transplant', 'ALLOTRANSP_COD'),\n        ('stroke', 'eFI2_Stroke'),\n        ('systemic lupus erythematosus', 'SLUPUS_COD'),\n        ('systemic mastocytosis', 'HAEMCANMORPH_COD'),\n        ('thrombocytopenic purpura', 'TTP_COD'),\n        ('thrombotic thrombocytopenic purpura', 'TTP_COD'),\n        ('thyroid cancer', 'C19CAN_COD'),\n        ('tophaceous gout', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('transitional cell carcinoma', 'C19CAN_COD'),\n        ('type 1 diabetes', 'DMTYPE1_COD'),\n        ('type 2 diabetes', 'DMTYPE2_COD'),\n        ('ulcerative colitis', 'eFI2_InflammatoryBowelDisease'),\n        ('urothelial carcinoma', 'NHAEMCANMORPH_COD'),\n        ('urticaria', 'XSAL_COD'),\n        ('uveitis', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('vascular disease', 'CVDINVITE_COD'),\n        ('vasculitis', 'CRYOGLOBVASC_COD')\n    ) AS t(Search_Term, Cluster_ID)\n),\n\nClusterCodes AS (\n    SELECT\n        stc.Search_Term,\n        c.\"SNOMEDCode\",\n        c.\"SNOMEDDescription\"\n    FROM SearchTermClusters stc\n    JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" c\n        ON stc.Cluster_ID = c.\"Cluster_ID\"\n    WHERE c.\"SNOMEDCode\" IS NOT NULL\n),\n\nExplicitCodes AS (\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription FROM (VALUES\n        ('acute coronary syndrome', '837091000000100', 'Manual mapping'),\n        ('ankylosing spondylitis', '162930007', 'Manual mapping'),\n        ('ankylosing spondylitis', '239805001', 'Manual mapping'),\n        ('ankylosing spondylitis', '239810002', 'Manual mapping'),\n        ('ankylosing spondylitis', '239811003', 'Manual mapping'),\n        ('ankylosing spondylitis', '394990003', 'Manual mapping'),\n        ('ankylosing spondylitis', '429712009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441562009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441680005', 'Manual mapping'),\n        ('ankylosing spondylitis', '441930001', 'Manual mapping'),\n        ('axial spondyloarthritis', '723116002', 'Manual mapping'),\n        ('choroidal neovascularisation', '380621000000102', 'Manual mapping'),\n        ('choroidal neovascularisation', '733124000', 'Manual mapping')\n    ) AS t(Search_Term, SNOMEDCode, SNOMEDDescription)\n),\n\nAllIndicationCodes AS (\n    SELECT Search_Term, \"SNOMEDCode\" AS SNOMEDCode, \"SNOMEDDescription\" AS SNOMEDDescription\n    FROM ClusterCodes\n    UNION ALL\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription\n    FROM ExplicitCodes\n)\n\"\"\"\n\n\ndef get_patient_indication_groups(\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups using Snowflake cluster query.\n\n    This function queries Snowflake directly using the embedded cluster CTE\n    (from snomed_indication_mapping_query.sql) to find patients with matching\n    GP diagnoses. This is the NEW approach replacing the old SQLite-based lookup.\n\n    The query:\n    1. Uses the cluster mapping CTE to get all Search_Term -> SNOMED code mappings\n    2. Joins with PrimaryCareClinicalCoding to find patients with matching codes\n    3. Returns the most recent match per patient (by EventDateTime)\n\n    Args:\n        patient_pseudonyms: List of PseudoNHSNoLinked values (matches PatientPseudonym in GP records)\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        batch_size: Number of patients per Snowflake query batch (default 500)\n\n    Returns:\n        DataFrame with columns:\n        - PatientPseudonym: The patient identifier (PseudoNHSNoLinked value)\n        - Search_Term: The matched indication (e.g., \"rheumatoid arthritis\")\n        - EventDateTime: Date of the GP diagnosis record\n\n        Patients not found in results have no matching GP diagnosis.\n    \"\"\"\n    import pandas as pd\n\n    logger.info(f\"Starting Snowflake-direct indication lookup for {len(patient_pseudonyms)} patients...\")\n\n    # Handle edge case: empty patient list\n    if not patient_pseudonyms:\n        logger.warning(\"Empty patient list provided\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    # Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.error(\"Snowflake connector not available - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if not is_snowflake_configured():\n        logger.error(\"Snowflake not configured - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Results list to collect all matches\n    all_results: list[dict] = []\n\n    # Process patients in batches\n    total_patients = len(patient_pseudonyms)\n    for batch_start in range(0, total_patients, batch_size):\n        batch_end = min(batch_start + batch_size, total_patients)\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n        batch_num = batch_start // batch_size + 1\n        total_batches = (total_patients + batch_size - 1) // batch_size\n\n        logger.info(f\"Batch {batch_num}/{total_batches}: patients {batch_start + 1} to {batch_end}\")\n\n        # Build patient IN clause placeholders\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Build the full query with cluster CTE\n        # This finds the most recent matching diagnosis for each patient\n        # Note: Column names must be aliased to ensure consistent casing in results\n        query = f\"\"\"\n{CLUSTER_MAPPING_SQL}\nSELECT\n    pc.\"PatientPseudonym\" AS \"PatientPseudonym\",\n    aic.Search_Term AS \"Search_Term\",\n    pc.\"EventDateTime\" AS \"EventDateTime\"\nFROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\nINNER JOIN AllIndicationCodes aic\n    ON pc.\"SNOMEDCode\" = aic.SNOMEDCode\nWHERE pc.\"PatientPseudonym\" IN ({patient_placeholders})\nQUALIFY ROW_NUMBER() OVER (\n    PARTITION BY pc.\"PatientPseudonym\"\n    ORDER BY pc.\"EventDateTime\" DESC\n) = 1\n\"\"\"\n\n        try:\n            results = connector.execute_dict(query, tuple(batch_pseudonyms))\n\n            for row in results:\n                all_results.append({\n                    'PatientPseudonym': row.get('PatientPseudonym'),\n                    'Search_Term': row.get('Search_Term'),\n                    'EventDateTime': row.get('EventDateTime'),\n                })\n\n            logger.debug(f\"Batch {batch_num}: found {len(results)} matches\")\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch {batch_num}: {e}\")\n            # Continue with other batches - partial results are better than none\n\n    # Build result DataFrame\n    result_df = pd.DataFrame(all_results)\n\n    # Log summary statistics\n    if len(result_df) > 0:\n        matched_count = len(result_df)\n        match_rate = 100 * matched_count / total_patients\n        unique_terms = result_df['Search_Term'].nunique()\n        logger.info(f\"Indication lookup complete:\")\n        logger.info(f\"  Total patients queried: {total_patients}\")\n        logger.info(f\"  Patients with GP match: {matched_count} ({match_rate:.1f}%)\")\n        logger.info(f\"  Unique Search_Terms found: {unique_terms}\")\n\n        # Log top Search_Terms\n        top_terms = result_df['Search_Term'].value_counts().head(5)\n        logger.info(f\"  Top 5 indications: {dict(top_terms)}\")\n    else:\n        logger.info(f\"Indication lookup complete: 0 matches from {total_patients} patients\")\n\n    return result_df\n\n\n# Export public API\n__all__ = [\n    # Dataclasses\n    \"ClusterSnomedCodes\",\n    \"IndicationValidationResult\",\n    \"DrugIndicationMatchRate\",\n    \"DrugSnomedMapping\",\n    \"DirectSnomedMatchResult\",\n    \"DirectorateAssignment\",\n    # Cluster-based lookup functions (existing)\n    \"get_drug_clusters\",\n    \"get_drug_cluster_ids\",\n    \"get_cluster_snomed_codes\",\n    \"patient_has_indication\",\n    \"validate_indication\",\n    \"get_indication_match_rate\",\n    \"batch_validate_indications\",\n    \"get_available_clusters\",\n    # Direct SNOMED lookup functions (new)\n    \"get_drug_snomed_codes\",\n    \"patient_has_indication_direct\",\n    # Diagnosis-based directorate assignment\n    \"get_directorate_from_diagnosis\",\n    # Batch lookup for indication groups\n    \"batch_lookup_indication_groups\",\n    # Drug-indication mapping from DimSearchTerm.csv\n    \"load_drug_indication_mapping\",\n    \"get_search_terms_for_drug\",\n    # Snowflake-direct indication lookup (new approach)\n    \"get_patient_indication_groups\",\n    \"CLUSTER_MAPPING_SQL\",\n]\n","numLines":1539,"startLine":1,"totalLines":1539}}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01RLvJ9RVypnUGsTK8b9AaDG","type":"message","role":"assistant","content":[{"type":"text","text":"Good. I've read all the required files. Let me assess the current state:\n\n**Completed tasks:**\n- Task 1.3: `load_drug_indication_mapping()` and `get_search_terms_for_drug()` ÔÇö done in Iteration 1\n\n**Remaining tasks (Phase 1):**\n- Task 1.1: Update `get_patient_indication_groups()` to return ALL matches with code_frequency\n- Task 1.2: Merge asthma Search_Terms in CLUSTER_MAPPING_SQL and `load_drug_indication_mapping()`\n\n**Task selection reasoning:**\nProgress.txt recommends either Task 1.2 (asthma merge ÔÇö self-contained, testable locally) or Task 1.1 (Snowflake query update). I'll choose **Task 1.2** because:\n1. It's fully self-contained and testable without Snowflake connectivity\n2. It's a prerequisite for getting clean data from both the Snowflake query AND the drug mapping\n3. It's simpler and lower risk ÔÇö a good foundation before modifying the Snowflake query\n\nLet me start by understanding what needs to change. I need to:\n1. Update `CLUSTER_MAPPING_SQL` to merge 3 asthma Search_Terms into one `\"asthma\"` entry\n2. Update `load_drug_indication_mapping()` to merge asthma variants when loading DimSearchTerm.csv\n3. Verify the changes work correctly\n\nFirst, let me check DimSearchTerm.csv to see the asthma-related rows."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":23572,"cache_read_input_tokens":80438,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":23572},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"40ddc926-1d72-40d6-9ffa-5fecdeb49cbe"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01RLvJ9RVypnUGsTK8b9AaDG","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01UAmrDLuJwugMGbcXh4NQko","name":"TodoWrite","input":{"todos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"in_progress","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"pending","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"pending","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"pending","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}]}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":23572,"cache_read_input_tokens":80438,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":23572},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"afa3ade5-2941-44c9-bb91-43ac6fc54d23"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01UAmrDLuJwugMGbcXh4NQko","type":"tool_result","content":"Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"f7d761b1-b778-410a-afd2-8f8860dd3ad3","tool_use_result":{"oldTodos":[],"newTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"in_progress","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"pending","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"pending","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"pending","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}]}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01WybbbYPJaCg88Svcs2RmKk","type":"message","role":"assistant","content":[{"type":"text","text":"Let me check the asthma-related entries in DimSearchTerm.csv."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":707,"cache_read_input_tokens":104010,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":707},"output_tokens":5,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"8619992d-d6b9-4371-9719-671bf77fbfc2"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01WybbbYPJaCg88Svcs2RmKk","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_011E5hXkAV6ameXTJoHMiZiC","name":"Grep","input":{"pattern":"(?i)asthma|urticaria","path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data\\DimSearchTerm.csv","output_mode":"content"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":707,"cache_read_input_tokens":104010,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":707},"output_tokens":5,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"1e7b3520-aa4f-4b43-96ef-019be3d116b9"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_011E5hXkAV6ameXTJoHMiZiC","type":"tool_result","content":"6:allergic asthma,OMALIZUMAB,THORACIC MEDICINE\r\n16:asthma,BENRALIZUMAB|DUPILUMAB|INHALED|MEPOLIZUMAB|OMALIZUMAB|RESLIZUMAB,THORACIC MEDICINE\r\n138:severe persistent allergic asthma,OMALIZUMAB,THORACIC MEDICINE\r\n161:urticaria,OMALIZUMAB,DERMATOLOGY"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"7364d97c-1230-4f52-b84c-57379e245018","tool_use_result":{"mode":"content","numFiles":0,"filenames":[],"content":"6:allergic asthma,OMALIZUMAB,THORACIC MEDICINE\r\n16:asthma,BENRALIZUMAB|DUPILUMAB|INHALED|MEPOLIZUMAB|OMALIZUMAB|RESLIZUMAB,THORACIC MEDICINE\r\n138:severe persistent allergic asthma,OMALIZUMAB,THORACIC MEDICINE\r\n161:urticaria,OMALIZUMAB,DERMATOLOGY","numLines":4}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01QharzBZF6XjMnMSaexBMnP","type":"message","role":"assistant","content":[{"type":"text","text":"Now let me check the current CLUSTER_MAPPING_SQL entries for asthma. I can see from the file I already read:\n- `('allergic asthma', 'AST_COD')` ÔÇö line 1201\n- `('asthma', 'eFI2_Asthma')` ÔÇö line 1210\n- `('severe persistent allergic asthma', 'SEVAST_COD')` ÔÇö line 1316\n- `('urticaria', 'XSAL_COD')` ÔÇö line 1337\n\nThe plan for CLUSTER_MAPPING_SQL is to map all 3 asthma Cluster_IDs to `Search_Term = 'asthma'`:\n- `('asthma', 'AST_COD')` ÔÇö was allergic asthma\n- `('asthma', 'eFI2_Asthma')` ÔÇö stays the same\n- `('asthma', 'SEVAST_COD')` ÔÇö was severe persistent allergic asthma\n\n`urticaria` stays as-is."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":266,"cache_read_input_tokens":104717,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":266},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"e9e1ae65-84cf-472c-8cb2-4fb518732142"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01QharzBZF6XjMnMSaexBMnP","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_0131Rc28Z3GakLie1rHPfcnL","name":"TodoWrite","input":{"todos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"in_progress","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"pending","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"pending","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}]}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":266,"cache_read_input_tokens":104717,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":266},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"85be062c-5155-4af3-81e0-d50b60d43b17"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_0131Rc28Z3GakLie1rHPfcnL","type":"tool_result","content":"Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"b91b037a-ce02-4931-8343-c56a190ba5b8","tool_use_result":{"oldTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"in_progress","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"pending","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"pending","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"pending","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}],"newTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"in_progress","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"pending","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"pending","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}]}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01BLG7ZvkavNNNHTZ4vhmx1C","type":"message","role":"assistant","content":[{"type":"text","text":"Now updating CLUSTER_MAPPING_SQL. I need to change `'allergic asthma'` ÔåÆ `'asthma'` and `'severe persistent allergic asthma'` ÔåÆ `'asthma'`. This means the CTE will have multiple rows mapping to `Search_Term = 'asthma'` with different Cluster_IDs (AST_COD, eFI2_Asthma, SEVAST_COD). That's fine ÔÇö the UNION in AllIndicationCodes will just give us more SNOMED codes mapping to `'asthma'`, which is what we want."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":593,"cache_read_input_tokens":104983,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":593},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"c45867f0-3c5e-43e8-b467-7b95b74d6467"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01BLG7ZvkavNNNHTZ4vhmx1C","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_016hfD6Qshc6W1715oYUt8Ei","name":"Edit","input":{"replace_all":false,"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","old_string":"        ('allergic asthma', 'AST_COD'),","new_string":"        ('asthma', 'AST_COD'),"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":593,"cache_read_input_tokens":104983,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":593},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"a18daf35-026c-479b-9811-007c1a24cb66"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_016hfD6Qshc6W1715oYUt8Ei","type":"tool_result","content":"The file C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py has been updated successfully."}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"a674024c-4fa7-4fbf-858c-0cb96f94bc4d","tool_use_result":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","oldString":"        ('allergic asthma', 'AST_COD'),","newString":"        ('asthma', 'AST_COD'),","originalFile":"\"\"\"\nDiagnosis lookup module for NHS Patient Pathway Analysis.\n\nProvides functions to validate patient indications by checking GP diagnosis records\nagainst SNOMED cluster codes. Uses the drug-to-cluster mapping from\ndrug_indication_clusters.csv and queries Snowflake for SNOMED codes and GP records.\n\nKey workflow:\n1. Get drug's valid indication clusters from local mapping\n2. Get all SNOMED codes for those clusters from Snowflake\n3. Check if patient has any of those SNOMED codes in GP records\n4. Report indication validation status\n\nIMPORTANT: HCD activity data indication codes are UNRELIABLE. This module uses\nGP/Primary Care data (PrimaryCareClinicalCoding) as the authoritative source.\n\"\"\"\n\nfrom dataclasses import dataclass, field\nfrom datetime import date, datetime\nfrom pathlib import Path\nfrom typing import Optional, Callable, Any, cast, TYPE_CHECKING\nimport csv\n\nif TYPE_CHECKING:\n    import pandas as pd\n\nfrom core.logging_config import get_logger\nfrom data_processing.database import DatabaseManager, default_db_manager\nfrom data_processing.snowflake_connector import (\n    SnowflakeConnector,\n    get_connector,\n    is_snowflake_available,\n    is_snowflake_configured,\n    SNOWFLAKE_AVAILABLE,\n)\nfrom data_processing.cache import get_cache, is_cache_enabled\n\nlogger = get_logger(__name__)\n\n\n@dataclass\nclass ClusterSnomedCodes:\n    \"\"\"SNOMED codes for a clinical coding cluster.\"\"\"\n    cluster_id: str\n    cluster_description: str\n    snomed_codes: list[str] = field(default_factory=list)\n    snomed_descriptions: dict[str, str] = field(default_factory=dict)\n\n    @property\n    def code_count(self) -> int:\n        return len(self.snomed_codes)\n\n\n@dataclass\nclass IndicationValidationResult:\n    \"\"\"Result of validating a patient's indication for a drug.\"\"\"\n    patient_pseudonym: str\n    drug_name: str\n    has_valid_indication: bool\n    matched_cluster_id: Optional[str] = None\n    matched_snomed_code: Optional[str] = None\n    matched_snomed_description: Optional[str] = None\n    checked_clusters: list[str] = field(default_factory=list)\n    total_codes_checked: int = 0\n    source: str = \"GP_SNOMED\"  # GP_SNOMED | NONE\n    error_message: Optional[str] = None\n\n\n@dataclass\nclass DrugIndicationMatchRate:\n    \"\"\"Match rate statistics for a drug's indication validation.\"\"\"\n    drug_name: str\n    total_patients: int\n    patients_with_indication: int\n    patients_without_indication: int\n    match_rate: float  # 0.0 to 1.0\n    clusters_checked: list[str] = field(default_factory=list)\n    sample_unmatched: list[str] = field(default_factory=list)  # Sample patient IDs\n\n\n@dataclass\nclass DrugSnomedMapping:\n    \"\"\"SNOMED code mapping for a drug from ref_drug_snomed_mapping.\"\"\"\n    snomed_code: str\n    snomed_description: str\n    search_term: str\n    primary_directorate: str\n    indication: str = \"\"\n    ta_id: str = \"\"\n\n\n@dataclass\nclass DirectSnomedMatchResult:\n    \"\"\"Result of direct SNOMED code lookup in GP records.\"\"\"\n    patient_pseudonym: str\n    matched: bool\n    snomed_code: Optional[str] = None\n    snomed_description: Optional[str] = None\n    search_term: Optional[str] = None\n    primary_directorate: Optional[str] = None\n    event_date: Optional[datetime] = None\n    source: str = \"DIRECT_SNOMED\"  # DIRECT_SNOMED | NONE\n\n\n@dataclass\nclass DirectorateAssignment:\n    \"\"\"Result of directorate assignment for a patient-drug combination.\"\"\"\n    upid: str\n    drug_name: str\n    directorate: Optional[str]\n    search_term: Optional[str] = None\n    source: str = \"FALLBACK\"  # DIAGNOSIS | FALLBACK\n    snomed_code: Optional[str] = None\n    event_date: Optional[datetime] = None\n\n\ndef get_drug_clusters(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[dict]:\n    \"\"\"\n    Get all SNOMED cluster mappings for a drug from local SQLite.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of dicts with keys: drug_name, indication, cluster_id,\n        cluster_description, nice_ta_reference\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT drug_name, indication, cluster_id, cluster_description, nice_ta_reference\n        FROM ref_drug_indication_clusters\n        WHERE UPPER(drug_name) = UPPER(?)\n        ORDER BY indication, cluster_id\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name,))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append({\n                    \"drug_name\": row[\"drug_name\"],\n                    \"indication\": row[\"indication\"],\n                    \"cluster_id\": row[\"cluster_id\"],\n                    \"cluster_description\": row[\"cluster_description\"],\n                    \"nice_ta_reference\": row[\"nice_ta_reference\"],\n                })\n\n            logger.debug(f\"Found {len(results)} cluster mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting clusters for drug '{drug_name}': {e}\")\n        return []\n\n\ndef get_drug_cluster_ids(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[str]:\n    \"\"\"\n    Get unique cluster IDs for a drug.\n\n    Args:\n        drug_name: Drug name to look up\n        db_manager: Optional DatabaseManager\n\n    Returns:\n        List of unique cluster IDs\n    \"\"\"\n    clusters = get_drug_clusters(drug_name, db_manager)\n    return list(set(c[\"cluster_id\"] for c in clusters))\n\n\ndef get_drug_snomed_codes(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[DrugSnomedMapping]:\n    \"\"\"\n    Get all SNOMED codes for a drug from local ref_drug_snomed_mapping table.\n\n    This uses the enriched mapping CSV data loaded into SQLite, which provides\n    direct SNOMED-to-drug mappings with Search_Term and PrimaryDirectorate.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive, matches cleaned_drug_name)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of DrugSnomedMapping with snomed_code, snomed_description,\n        search_term, primary_directorate, indication, ta_id\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT DISTINCT\n            snomed_code,\n            snomed_description,\n            search_term,\n            primary_directorate,\n            indication,\n            ta_id\n        FROM ref_drug_snomed_mapping\n        WHERE UPPER(cleaned_drug_name) = UPPER(?)\n           OR UPPER(drug_name) = UPPER(?)\n        ORDER BY search_term, snomed_code\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name, drug_name))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append(DrugSnomedMapping(\n                    snomed_code=row[\"snomed_code\"],\n                    snomed_description=row[\"snomed_description\"] or \"\",\n                    search_term=row[\"search_term\"] or \"\",\n                    primary_directorate=row[\"primary_directorate\"] or \"\",\n                    indication=row[\"indication\"] or \"\",\n                    ta_id=row[\"ta_id\"] or \"\",\n                ))\n\n            logger.debug(f\"Found {len(results)} SNOMED mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for drug '{drug_name}': {e}\")\n        return []\n\n\ndef patient_has_indication_direct(\n    patient_pseudonym: str,\n    drug_snomed_mappings: list[DrugSnomedMapping],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> DirectSnomedMatchResult:\n    \"\"\"\n    Check if patient has any of the SNOMED codes in their GP records.\n\n    This is the direct SNOMED lookup - it queries PrimaryCareClinicalCoding\n    for exact SNOMED code matches (not via cluster). Returns the most recent\n    match by EventDateTime if multiple matches exist.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_snomed_mappings: List of DrugSnomedMapping from get_drug_snomed_codes()\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectSnomedMatchResult with match details (most recent by EventDateTime)\n    \"\"\"\n    result = DirectSnomedMatchResult(\n        patient_pseudonym=patient_pseudonym,\n        matched=False,\n        source=\"NONE\",\n    )\n\n    if not drug_snomed_mappings:\n        return result\n\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot check GP records\")\n        return result\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build lookup dict for mapping snomed_code -> (search_term, primary_directorate, snomed_description)\n    snomed_lookup = {\n        m.snomed_code: (m.search_term, m.primary_directorate, m.snomed_description)\n        for m in drug_snomed_mappings\n    }\n\n    # Get unique SNOMED codes\n    snomed_codes = list(snomed_lookup.keys())\n\n    # Build placeholders for SNOMED codes\n    placeholders = \", \".join([\"%s\"] * len(snomed_codes))\n\n    # Query to find most recent matching SNOMED code in GP records\n    query = f'''\n        SELECT\n            \"SNOMEDCode\",\n            \"EventDateTime\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n        WHERE \"PatientPseudonym\" = %s\n            AND \"SNOMEDCode\" IN ({placeholders})\n    '''\n\n    params: list = [patient_pseudonym] + snomed_codes\n\n    if before_date:\n        query += ' AND \"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' ORDER BY \"EventDateTime\" DESC LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            matched_code = row.get(\"SNOMEDCode\")\n            event_dt = row.get(\"EventDateTime\")\n\n            if matched_code and matched_code in snomed_lookup:\n                search_term, primary_dir, snomed_desc = snomed_lookup[matched_code]\n\n                return DirectSnomedMatchResult(\n                    patient_pseudonym=patient_pseudonym,\n                    matched=True,\n                    snomed_code=matched_code,\n                    snomed_description=snomed_desc,\n                    search_term=search_term,\n                    primary_directorate=primary_dir,\n                    event_date=event_dt,\n                    source=\"DIRECT_SNOMED\",\n                )\n\n        return result\n\n    except Exception as e:\n        logger.error(f\"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}\")\n        return result\n\n\ndef get_directorate_from_diagnosis(\n    upid: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> DirectorateAssignment:\n    \"\"\"\n    Get directorate assignment for a patient-drug combination using diagnosis-based lookup.\n\n    This function attempts to assign a directorate based on the patient's GP records\n    (direct SNOMED code matching). If no match is found, it returns a FALLBACK result\n    indicating that the caller should use alternative assignment methods (e.g.,\n    department_identification() from tools/data.py).\n\n    Workflow:\n    1. Get all SNOMED codes for the drug from ref_drug_snomed_mapping\n    2. Query patient's GP records for matching SNOMED codes\n    3. If match found ÔåÆ return diagnosis-based directorate and search_term\n    4. If no match ÔåÆ return FALLBACK result (caller handles fallback logic)\n\n    Args:\n        upid: Patient's unique patient ID (Provider Code[:3] + PersonKey)\n        drug_name: Drug name to look up\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectorateAssignment with directorate, search_term, and source\n    \"\"\"\n    result = DirectorateAssignment(\n        upid=upid,\n        drug_name=drug_name,\n        directorate=None,\n        source=\"FALLBACK\",\n    )\n\n    # Step 1: Get SNOMED codes for the drug\n    drug_snomed_mappings = get_drug_snomed_codes(drug_name, db_manager)\n\n    if not drug_snomed_mappings:\n        logger.debug(f\"No SNOMED mappings found for drug '{drug_name}' - using fallback\")\n        return result\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.debug(\"Snowflake not available - using fallback\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.debug(\"Snowflake not configured - using fallback\")\n        return result\n\n    # Step 3: Get patient pseudonym from UPID\n    # UPID format is Provider Code (3 chars) + PersonKey\n    # We need to query Snowflake to get the PatientPseudonym for this PersonKey\n    # However, patient_has_indication_direct expects PatientPseudonym, not UPID\n    # For now, we'll use UPID as the identifier - the actual integration\n    # will need to happen at the DataFrame level where we have PersonKey\n    #\n    # NOTE: This function will be called from the pipeline where we have\n    # access to PatientPseudonym. The UPID is passed for logging/tracking.\n\n    # Actually, looking at the pipeline, we need PatientPseudonym, not UPID.\n    # The caller should pass the PatientPseudonym or we need to look it up.\n    # For now, let's assume the caller will use this in a batch context\n    # where they can map UPID -> PatientPseudonym.\n\n    # Let me reconsider: the function signature takes UPID but we need\n    # PatientPseudonym for Snowflake. In the pipeline context (fetch_and_transform_data),\n    # we'll have the PersonKey column which IS the PatientPseudonym.\n    # So UPID = ProviderCode[:3] + PersonKey, and PersonKey = PatientPseudonym.\n    #\n    # We can extract PatientPseudonym from UPID by removing the first 3 chars.\n    patient_pseudonym = upid[3:] if len(upid) > 3 else upid\n\n    # Step 4: Check patient's GP records for matching SNOMED codes\n    match_result = patient_has_indication_direct(\n        patient_pseudonym=patient_pseudonym,\n        drug_snomed_mappings=drug_snomed_mappings,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    if match_result.matched and match_result.primary_directorate:\n        return DirectorateAssignment(\n            upid=upid,\n            drug_name=drug_name,\n            directorate=match_result.primary_directorate,\n            search_term=match_result.search_term,\n            source=\"DIAGNOSIS\",\n            snomed_code=match_result.snomed_code,\n            event_date=match_result.event_date,\n        )\n\n    # No match found - return fallback result\n    return result\n\n\ndef get_cluster_snomed_codes(\n    cluster_id: str,\n    connector: Optional[SnowflakeConnector] = None,\n    use_cache: bool = True,\n) -> ClusterSnomedCodes:\n    \"\"\"\n    Get all SNOMED codes for a cluster from Snowflake.\n\n    Queries the ClinicalCodingClusterSnomedCodes table to get all SNOMED codes\n    that belong to the specified cluster.\n\n    Args:\n        cluster_id: Cluster ID to look up (e.g., 'RARTH_COD', 'PSORIASIS_COD')\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        use_cache: Whether to use cached results (default True)\n\n    Returns:\n        ClusterSnomedCodes with list of SNOMED codes and descriptions\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot get cluster codes\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    # Check cache first\n    cache_key = f\"cluster_snomed_{cluster_id}\"\n    if use_cache and is_cache_enabled():\n        cache = get_cache()\n        cached = cache.get(cache_key)\n        if cached is not None and len(cached) > 0:\n            logger.debug(f\"Using cached SNOMED codes for cluster '{cluster_id}'\")\n            cached_dict = cached[0]  # First element is our data dict\n            return ClusterSnomedCodes(\n                cluster_id=cluster_id,\n                cluster_description=str(cached_dict.get(\"description\", \"\")),\n                snomed_codes=list(cached_dict.get(\"codes\", [])),\n                snomed_descriptions=dict(cached_dict.get(\"descriptions\", {})),\n            )\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT DISTINCT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            \"SNOMEDCode\",\n            \"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        WHERE \"Cluster_ID\" = %s\n        ORDER BY \"SNOMEDCode\"\n    '''\n\n    try:\n        results = connector.execute_dict(query, (cluster_id,))\n\n        if not results:\n            logger.warning(f\"No SNOMED codes found for cluster '{cluster_id}'\")\n            return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n        codes = []\n        descriptions = {}\n        description = results[0].get(\"Cluster_Description\", \"\") if results else \"\"\n\n        for row in results:\n            code = row.get(\"SNOMEDCode\")\n            if code:\n                codes.append(code)\n                descriptions[code] = row.get(\"SNOMEDDescription\", \"\")\n\n        logger.info(f\"Found {len(codes)} SNOMED codes for cluster '{cluster_id}'\")\n\n        # Cache the results (using query-based cache with fake params)\n        if use_cache and is_cache_enabled():\n            cache = get_cache()\n            cache_data = [{\n                \"description\": description,\n                \"codes\": codes,\n                \"descriptions\": descriptions,\n            }]\n            cache.set(cache_key, None, cache_data)  # type: ignore[arg-type]\n\n        return ClusterSnomedCodes(\n            cluster_id=cluster_id,\n            cluster_description=description,\n            snomed_codes=codes,\n            snomed_descriptions=descriptions,\n        )\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for cluster '{cluster_id}': {e}\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n\ndef patient_has_indication(\n    patient_pseudonym: str,\n    cluster_ids: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:\n    \"\"\"\n    Check if a patient has any SNOMED codes from the specified clusters in GP records.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        cluster_ids: List of cluster IDs to check against\n        connector: Optional SnowflakeConnector\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        Tuple of (has_indication, matched_cluster_id, matched_snomed_code, matched_description)\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        return False, None, None, None\n\n    if not cluster_ids:\n        return False, None, None, None\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build placeholders for cluster IDs\n    placeholders = \", \".join([\"%s\"] * len(cluster_ids))\n\n    # Query to check if patient has any matching SNOMED code\n    query = f'''\n        SELECT\n            pc.\"SNOMEDCode\",\n            cc.\"Cluster_ID\",\n            cc.\"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\n        INNER JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" cc\n            ON pc.\"SNOMEDCode\" = cc.\"SNOMEDCode\"\n        WHERE pc.\"PatientPseudonym\" = %s\n            AND cc.\"Cluster_ID\" IN ({placeholders})\n    '''\n\n    params = [patient_pseudonym] + cluster_ids\n\n    if before_date:\n        query += ' AND pc.\"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            return (\n                True,\n                row.get(\"Cluster_ID\"),\n                row.get(\"SNOMEDCode\"),\n                row.get(\"SNOMEDDescription\"),\n            )\n\n        return False, None, None, None\n\n    except Exception as e:\n        logger.error(f\"Error checking indication for patient '{patient_pseudonym}': {e}\")\n        return False, None, None, None\n\n\ndef validate_indication(\n    patient_pseudonym: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> IndicationValidationResult:\n    \"\"\"\n    Validate that a patient has an appropriate indication for a drug.\n\n    Full validation workflow:\n    1. Get drug's valid indication clusters from local mapping\n    2. Check if patient has any matching SNOMED codes in GP records\n    3. Return detailed validation result\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_name: Drug name to validate indication for\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        IndicationValidationResult with validation details\n    \"\"\"\n    result = IndicationValidationResult(\n        patient_pseudonym=patient_pseudonym,\n        drug_name=drug_name,\n        has_valid_indication=False,\n    )\n\n    # Step 1: Get drug's cluster mappings\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    if not cluster_ids:\n        result.error_message = f\"No cluster mappings found for drug '{drug_name}'\"\n        result.source = \"NONE\"\n        return result\n\n    result.checked_clusters = cluster_ids\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        result.error_message = \"Snowflake connector not installed\"\n        result.source = \"NONE\"\n        return result\n\n    if not is_snowflake_configured():\n        result.error_message = \"Snowflake not configured\"\n        result.source = \"NONE\"\n        return result\n\n    # Step 3: Check patient GP records\n    has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n        patient_pseudonym=patient_pseudonym,\n        cluster_ids=cluster_ids,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    result.has_valid_indication = has_indication\n    result.matched_cluster_id = matched_cluster\n    result.matched_snomed_code = matched_code\n    result.matched_snomed_description = matched_desc\n    result.source = \"GP_SNOMED\" if has_indication else \"NONE\"\n\n    return result\n\n\ndef get_indication_match_rate(\n    drug_name: str,\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    sample_unmatched_count: int = 10,\n) -> DrugIndicationMatchRate:\n    \"\"\"\n    Calculate indication match rate for a drug across a list of patients.\n\n    Args:\n        drug_name: Drug name to check\n        patient_pseudonyms: List of patient pseudonymised NHS numbers\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        sample_unmatched_count: Number of unmatched patient IDs to include in sample\n\n    Returns:\n        DrugIndicationMatchRate with match statistics\n    \"\"\"\n    if connector is None and SNOWFLAKE_AVAILABLE and is_snowflake_configured():\n        connector = get_connector()\n\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    total = len(patient_pseudonyms)\n    matched = 0\n    unmatched = 0\n    sample_unmatched: list[str] = []\n\n    if not cluster_ids:\n        logger.warning(f\"No cluster mappings for drug '{drug_name}' - all patients will be unmatched\")\n        return DrugIndicationMatchRate(\n            drug_name=drug_name,\n            total_patients=total,\n            patients_with_indication=0,\n            patients_without_indication=total,\n            match_rate=0.0,\n            clusters_checked=[],\n            sample_unmatched=patient_pseudonyms[:sample_unmatched_count],\n        )\n\n    for i, pseudonym in enumerate(patient_pseudonyms):\n        if i > 0 and i % 100 == 0:\n            logger.info(f\"Validating indications: {i}/{total} ({100*i/total:.1f}%)\")\n\n        has_indication, _, _, _ = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        if has_indication:\n            matched += 1\n        else:\n            unmatched += 1\n            if len(sample_unmatched) < sample_unmatched_count:\n                sample_unmatched.append(pseudonym)\n\n    match_rate = matched / total if total > 0 else 0.0\n\n    logger.info(f\"Indication match rate for '{drug_name}': {100*match_rate:.1f}% ({matched}/{total})\")\n\n    return DrugIndicationMatchRate(\n        drug_name=drug_name,\n        total_patients=total,\n        patients_with_indication=matched,\n        patients_without_indication=unmatched,\n        match_rate=match_rate,\n        clusters_checked=cluster_ids,\n        sample_unmatched=sample_unmatched,\n    )\n\n\ndef batch_validate_indications(\n    patient_drug_pairs: list[tuple[str, str]],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    progress_callback: Optional[Callable[[int, int], None]] = None,\n) -> list[IndicationValidationResult]:\n    \"\"\"\n    Validate indications for multiple patient-drug pairs efficiently.\n\n    Args:\n        patient_drug_pairs: List of (patient_pseudonym, drug_name) tuples\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        progress_callback: Optional callback(current, total) for progress updates\n\n    Returns:\n        List of IndicationValidationResult for each pair\n    \"\"\"\n    results = []\n    total = len(patient_drug_pairs)\n\n    # Cache cluster lookups by drug\n    drug_clusters_cache = {}\n\n    for i, (pseudonym, drug_name) in enumerate(patient_drug_pairs):\n        if progress_callback:\n            progress_callback(i + 1, total)\n\n        # Get clusters from cache or lookup\n        drug_upper = drug_name.upper()\n        if drug_upper not in drug_clusters_cache:\n            drug_clusters_cache[drug_upper] = get_drug_cluster_ids(drug_name, db_manager)\n\n        cluster_ids = drug_clusters_cache[drug_upper]\n\n        if not cluster_ids:\n            results.append(IndicationValidationResult(\n                patient_pseudonym=pseudonym,\n                drug_name=drug_name,\n                has_valid_indication=False,\n                source=\"NONE\",\n                error_message=f\"No cluster mappings for drug '{drug_name}'\",\n            ))\n            continue\n\n        # Check patient indication\n        has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        results.append(IndicationValidationResult(\n            patient_pseudonym=pseudonym,\n            drug_name=drug_name,\n            has_valid_indication=has_indication,\n            matched_cluster_id=matched_cluster,\n            matched_snomed_code=matched_code,\n            matched_snomed_description=matched_desc,\n            checked_clusters=cluster_ids,\n            source=\"GP_SNOMED\" if has_indication else \"NONE\",\n        ))\n\n    matched_count = sum(1 for r in results if r.has_valid_indication)\n    logger.info(f\"Batch validation complete: {matched_count}/{total} ({100*matched_count/total:.1f}%) with valid indications\")\n\n    return results\n\n\ndef get_available_clusters(\n    connector: Optional[SnowflakeConnector] = None,\n) -> list[dict]:\n    \"\"\"\n    Get list of all available SNOMED clusters from Snowflake.\n\n    Returns:\n        List of dicts with cluster_id, cluster_description, code_count\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - cannot list clusters\")\n        return []\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            COUNT(DISTINCT \"SNOMEDCode\") as code_count\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        GROUP BY \"Cluster_ID\", \"Cluster_Description\"\n        ORDER BY \"Cluster_ID\"\n    '''\n\n    try:\n        results = connector.execute_dict(query)\n\n        clusters = []\n        for row in results:\n            clusters.append({\n                \"cluster_id\": row.get(\"Cluster_ID\"),\n                \"cluster_description\": row.get(\"Cluster_Description\"),\n                \"code_count\": row.get(\"code_count\", 0),\n            })\n\n        logger.info(f\"Found {len(clusters)} available SNOMED clusters\")\n        return clusters\n\n    except Exception as e:\n        logger.error(f\"Error getting available clusters: {e}\")\n        return []\n\n\ndef batch_lookup_indication_groups(\n    df: \"pd.DataFrame\",\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.\n\n    This is the efficient batch version of get_directorate_from_diagnosis().\n    Instead of querying Snowflake per patient, it batches the lookups for performance.\n\n    Strategy:\n    1. Get all unique (PersonKey, Drug Name) pairs from DataFrame\n    2. For each unique drug, get all SNOMED codes from local SQLite\n    3. Build batched Snowflake queries to check GP records\n    4. Return indication_df mapping UPID ÔåÆ Indication_Group\n\n    For unmatched patients, Indication_Group will be their Directory (with suffix).\n\n    Args:\n        df: DataFrame with columns: UPID, Drug Name, Directory, PersonKey\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        batch_size: Number of patients per Snowflake query batch\n\n    Returns:\n        DataFrame with columns: UPID, Indication_Group, Source\n        - Indication_Group: Search_Term (if matched) or \"Directory (no GP dx)\" (if not)\n        - Source: \"DIAGNOSIS\" or \"FALLBACK\"\n    \"\"\"\n    import pandas as pd\n\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    logger.info(f\"Starting batch indication lookup for {len(df)} records...\")\n\n    # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations\n    # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records\n    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records\n    if 'PseudoNHSNoLinked' not in df.columns:\n        logger.error(\"DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records\")\n        # Return fallback for all patients\n        result_df = df[['UPID', 'Directory']].drop_duplicates().copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']]\n\n    # Get unique patient-drug combinations (we need one lookup per patient-drug pair)\n    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()\n    logger.info(f\"Found {len(unique_pairs)} unique patient-drug combinations\")\n\n    # Step 2: Get all unique drugs and their SNOMED codes\n    unique_drugs = unique_pairs['Drug Name'].unique()\n    logger.info(f\"Building SNOMED lookup for {len(unique_drugs)} unique drugs...\")\n\n    # Build drug -> list of DrugSnomedMapping dict\n    drug_snomed_map: dict[str, list[DrugSnomedMapping]] = {}\n    all_snomed_codes: set[str] = set()\n    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}  # snomed -> [(drug, search_term, primary_dir), ...]\n\n    for drug_name in unique_drugs:\n        mappings = get_drug_snomed_codes(drug_name, db_manager)\n        drug_snomed_map[drug_name] = mappings\n\n        for m in mappings:\n            all_snomed_codes.add(m.snomed_code)\n            if m.snomed_code not in snomed_to_drug_searchterm:\n                snomed_to_drug_searchterm[m.snomed_code] = []\n            snomed_to_drug_searchterm[m.snomed_code].append(\n                (drug_name, m.search_term, m.primary_directorate)\n            )\n\n    logger.info(f\"Total SNOMED codes to check: {len(all_snomed_codes)}\")\n\n    # Step 3: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Step 4: Query GP records for all patients in batches\n    # The query finds the most recent matching SNOMED code for each patient\n\n    # Get unique PseudoNHSNoLinked values (each = one patient in GP records)\n    unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])\n    patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()\n\n    logger.info(f\"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...\")\n\n    # Results dict: PersonKey -> (snomed_code, event_date)\n    gp_matches: dict[str, tuple[str, Any]] = {}\n\n    # Convert SNOMED codes to list for query\n    snomed_list = list(all_snomed_codes)\n\n    if not snomed_list:\n        logger.warning(\"No SNOMED codes to check - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    # Build SNOMED IN clause (reused across batches)\n    snomed_placeholders = \", \".join([\"%s\"] * len(snomed_list))\n\n    # Process patients in batches\n    for batch_start in range(0, len(patient_pseudonyms), batch_size):\n        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n\n        logger.info(f\"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}\")\n\n        # Build patient IN clause\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Query to find all matching SNOMED codes for these patients\n        # We'll get all matches and pick the most recent per patient in Python\n        query = f'''\n            SELECT\n                \"PatientPseudonym\",\n                \"SNOMEDCode\",\n                \"EventDateTime\"\n            FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n            WHERE \"PatientPseudonym\" IN ({patient_placeholders})\n              AND \"SNOMEDCode\" IN ({snomed_placeholders})\n            ORDER BY \"PatientPseudonym\", \"EventDateTime\" DESC\n        '''\n\n        params = tuple(batch_pseudonyms) + tuple(snomed_list)\n\n        try:\n            results = connector.execute_dict(query, params)\n\n            # Process results - pick most recent per patient\n            for row in results:\n                person_key = row.get(\"PatientPseudonym\")\n                snomed_code = row.get(\"SNOMEDCode\")\n                event_date = row.get(\"EventDateTime\")\n\n                if person_key and snomed_code:\n                    # Keep only if we haven't seen this patient yet (first = most recent due to ORDER BY)\n                    if person_key not in gp_matches:\n                        gp_matches[person_key] = (snomed_code, event_date)\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch: {e}\")\n            # Continue with other batches\n\n    logger.info(f\"Found GP matches for {len(gp_matches)} patients\")\n\n    # Step 5: Build result DataFrame\n    # For each unique_pair, determine Indication_Group based on match status\n    results_list = []\n\n    # We need to dedupe by UPID - a patient might be on multiple drugs\n    # Strategy: For each UPID, use the most recent match (if any)\n    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)\n\n    for _, row in unique_pairs.iterrows():\n        upid = row['UPID']\n        drug_name = row['Drug Name']\n        patient_pseudonym = row['PseudoNHSNoLinked']\n        directory = row['Directory']\n\n        # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)\n        if patient_pseudonym in gp_matches:\n            matched_snomed, event_date = gp_matches[patient_pseudonym]\n\n            # Find the search_term for this SNOMED code and drug\n            # (A SNOMED code might map to multiple drugs with different search_terms)\n            if matched_snomed in snomed_to_drug_searchterm:\n                # Look for match with current drug first\n                search_term = None\n                for drug, st, pd in snomed_to_drug_searchterm[matched_snomed]:\n                    if drug.upper() == drug_name.upper():\n                        search_term = st\n                        break\n                # If no drug-specific match, use any match\n                if search_term is None:\n                    search_term = snomed_to_drug_searchterm[matched_snomed][0][1]\n\n                # Only update if we don't have a match for this UPID yet\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (search_term, \"DIAGNOSIS\")\n            else:\n                # Shouldn't happen but fallback just in case\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n        else:\n            # No GP match - use fallback\n            if upid not in upid_to_match:\n                upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n\n    # Build result DataFrame\n    for upid, (indication_group, source) in upid_to_match.items():\n        results_list.append({\n            'UPID': upid,\n            'Indication_Group': indication_group,\n            'Source': source,\n        })\n\n    result_df = pd.DataFrame(results_list)\n\n    # Log statistics\n    diagnosis_count = len([s for s in result_df['Source'] if s == \"DIAGNOSIS\"])\n    fallback_count = len([s for s in result_df['Source'] if s == \"FALLBACK\"])\n    total = len(result_df)\n\n    logger.info(f\"Indication lookup complete:\")\n    logger.info(f\"  Total unique patients: {total}\")\n    logger.info(f\"  DIAGNOSIS matches: {diagnosis_count} ({100*diagnosis_count/total:.1f}%)\")\n    logger.info(f\"  FALLBACK (no GP match): {fallback_count} ({100*fallback_count/total:.1f}%)\")\n\n    return result_df\n\n\n# === Drug-to-indication mapping from DimSearchTerm.csv ===\n\n\ndef load_drug_indication_mapping(\n    csv_path: Optional[str] = None,\n) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n    \"\"\"\n    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n\n    Builds two lookup dicts:\n    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n\n    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n\n    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n    Drug fragments from all rows for the same Search_Term are combined.\n\n    Args:\n        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n\n    Returns:\n        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n    \"\"\"\n    if csv_path is None:\n        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n\n    fragment_to_search_terms: dict[str, list[str]] = {}\n    search_term_to_fragments: dict[str, list[str]] = {}\n\n    try:\n        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.DictReader(f)\n            for row in reader:\n                search_term = row.get(\"Search_Term\", \"\").strip()\n                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n\n                if not search_term or not drug_names_raw:\n                    continue\n\n                fragments = [frag.strip().upper() for frag in drug_names_raw.split(\"|\") if frag.strip()]\n\n                # Build search_term -> fragments (accumulate for duplicate Search_Terms)\n                if search_term not in search_term_to_fragments:\n                    search_term_to_fragments[search_term] = []\n                for frag in fragments:\n                    if frag not in search_term_to_fragments[search_term]:\n                        search_term_to_fragments[search_term].append(frag)\n\n                # Build fragment -> search_terms\n                for frag in fragments:\n                    if frag not in fragment_to_search_terms:\n                        fragment_to_search_terms[frag] = []\n                    if search_term not in fragment_to_search_terms[frag]:\n                        fragment_to_search_terms[frag].append(search_term)\n\n        logger.info(\n            f\"Loaded drug-indication mapping: {len(search_term_to_fragments)} Search_Terms, \"\n            f\"{len(fragment_to_search_terms)} drug fragments\"\n        )\n\n    except FileNotFoundError:\n        logger.error(f\"DimSearchTerm.csv not found at {csv_path}\")\n    except Exception as e:\n        logger.error(f\"Error loading DimSearchTerm.csv: {e}\")\n\n    return fragment_to_search_terms, search_term_to_fragments\n\n\ndef get_search_terms_for_drug(\n    drug_name: str,\n    search_term_to_fragments: dict[str, list[str]],\n) -> list[str]:\n    \"\"\"\n    Get all Search_Terms that list a given drug using substring matching.\n\n    Checks if any drug fragment from DimSearchTerm is a SUBSTRING of the given\n    drug name (case-insensitive). This handles both exact matches (ADALIMUMAB)\n    and partial fragments (PEGYLATED, INHALED).\n\n    Args:\n        drug_name: HCD drug name (e.g., \"ADALIMUMAB 40MG\", \"PEGYLATED LIPOSOMAL DOXORUBICIN\")\n        search_term_to_fragments: Mapping of search_term -> list of drug fragments\n\n    Returns:\n        List of Search_Terms whose drug fragments match the drug name\n    \"\"\"\n    drug_name_upper = drug_name.upper()\n    matched_terms: list[str] = []\n\n    for search_term, fragments in search_term_to_fragments.items():\n        for frag in fragments:\n            if frag in drug_name_upper:\n                matched_terms.append(search_term)\n                break  # One matching fragment is enough for this Search_Term\n\n    return matched_terms\n\n\n# === NEW APPROACH: Query Snowflake directly using cluster CTE ===\n\n# The cluster query mapping (embedded from snomed_indication_mapping_query.sql)\n# This maps Search_Term -> Cluster_ID for ~148 clinical conditions\nCLUSTER_MAPPING_SQL = \"\"\"\nWITH SearchTermClusters AS (\n    SELECT Search_Term, Cluster_ID FROM (VALUES\n        ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),\n        ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),\n        ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),\n        ('allergic asthma', 'AST_COD'),\n        ('allergic rhinitis', 'MILDINTAST_COD'),\n        ('alzheimer''s disease', 'DEMALZ_COD'),\n        ('amyloidosis', 'AMYLOID_COD'),\n        ('anaemia', 'eFI2_AnaemiaTimeSensitive'),\n        ('anaplastic large cell lymphoma', 'C19HAEMCAN_COD'),\n        ('apixaban', 'DOACCON_COD'),\n        ('aplastic anaemia', 'eFI2_AnaemiaEver'),\n        ('arthritis', 'eFI2_InflammatoryArthritis'),\n        ('asthma', 'eFI2_Asthma'),\n        ('atopic dermatitis', 'ATOPDERM_COD'),\n        ('atrial fibrillation', 'eFI2_AtrialFibrillation'),\n        ('attention deficit hyperactivity disorder', 'ADHD_COD'),\n        ('bipolar disorder', 'MH_COD'),\n        ('bladder', 'eFI2_UrinaryIncontinence'),\n        ('breast cancer', 'BRCANSCR_COD'),\n        ('cardiomyopathy', 'eFI2_HarmfulDrinking'),\n        ('cardiovascular disease', 'CVDRISKASS_COD'),\n        ('cervical cancer', 'CSDEC_COD'),\n        ('cholangiocarcinoma', 'eFI2_Cancer'),\n        ('chronic kidney disease', 'CKD_COD'),\n        ('chronic liver disease', 'eFI2_LiverProblems'),\n        ('chronic lymphocytic leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic myeloid leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic obstructive pulmonary disease', 'eFI2_COPD'),\n        ('colon cancer', 'eFI2_Cancer'),\n        ('colorectal cancer', 'GICANREF_COD'),\n        ('constipation', 'CHRONCONSTIP_COD'),\n        ('covid-19', 'POSSPOSTCOVID_COD'),\n        ('crohn''s disease', 'eFI2_InflammatoryBowelDisease'),\n        ('cutaneous t-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('cystic fibrosis', 'CUST_ICB_CYSTIC_FIBROSIS'),\n        ('deep vein thrombosis', 'VTE_COD'),\n        ('depression', 'eFI2_Depression'),\n        ('diabetes', 'eFI2_DiabetesEver'),\n        ('diabetic retinopathy', 'DRSELIGIBILITY_COD'),\n        ('diffuse large b-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('dravet syndrome', 'EPIL_COD'),\n        ('drug misuse', 'ILLSUBINT_COD'),\n        ('dyspepsia', 'eFI2_AbdominalPain'),\n        ('epilepsy', 'eFI2_Seizures'),\n        ('fallopian tube', 'STERIL_COD'),\n        ('follicular lymphoma', 'C19HAEMCAN_COD'),\n        ('gastric cancer', 'eFI2_Cancer'),\n        ('giant cell arteritis', 'GCA_COD'),\n        ('glioma', 'NHAEMCANMORPH_COD'),\n        ('gout', 'eFI2_InflammatoryArthritis'),\n        ('graft versus host disease', 'GVHD_COD'),\n        ('granulomatosis with polyangiitis', 'WEGENERVASC_COD'),\n        ('growth hormone deficiency', 'HYPOPITUITARY_COD'),\n        ('hand eczema', 'ECZEMA_COD'),\n        ('heart failure', 'eFI2_HeartFailure'),\n        ('hepatitis b', 'HEPBCVAC_COD'),\n        ('hepatocellular carcinoma', 'eFI2_Cancer'),\n        ('hiv', 'PREFLANG_COD'),\n        ('hodgkin lymphoma', 'HAEMCANMORPH_COD'),\n        ('hormone receptor', 'eFI2_ThyroidProblems'),\n        ('hypercholesterolaemia', 'CLASSFH_COD'),\n        ('immune thrombocytopenia', 'ITP_COD'),\n        ('influenza', 'FLUINVITE_COD'),\n        ('insomnia', 'eFI2_SleepProblems'),\n        ('irritable bowel syndrome', 'IBS_COD'),\n        ('ischaemic stroke', 'OSTR_COD'),\n        ('juvenile idiopathic arthritis', 'RARTHAD_COD'),\n        ('kidney transplant', 'RENALTRANSP_COD'),\n        ('leukaemia', 'eFI2_Cancer'),\n        ('lung cancer', 'FTCANREF_COD'),\n        ('lymphoma', 'C19HAEMCAN_COD'),\n        ('macular degeneration', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('macular oedema', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('major depressive episodes', 'eFI2_Depression'),\n        ('malignant melanoma', 'eFI2_Cancer'),\n        ('malignant pleural mesothelioma', 'LUNGCAN_COD'),\n        ('manic episode', 'MH_COD'),\n        ('mantle cell lymphoma', 'HAEMCANMORPH_COD'),\n        ('melanoma', 'eFI2_Cancer'),\n        ('merkel cell carcinoma', 'C19CAN_COD'),\n        ('migraine', 'eFI2_Headache'),\n        ('motor neurone disease', 'MND_COD'),\n        ('multiple myeloma', 'C19HAEMCAN_COD'),\n        ('multiple sclerosis', 'MS_COD'),\n        ('myelodysplastic', 'eFI2_AnaemiaEver'),\n        ('myelofibrosis', 'MDS_COD'),\n        ('myocardial infarction', 'eFI2_IschaemicHeartDisease'),\n        ('myotonia', 'CNDATRISK2_COD'),\n        ('narcolepsy', 'LD_COD'),\n        ('neuroendocrine tumour', 'LUNGCAN_COD'),\n        ('non-small cell lung cancer', 'LUNGCAN_COD'),\n        ('non-small-cell lung cancer', 'FTCANREF_COD'),\n        ('obesity', 'BMI30_COD'),\n        ('osteoarthritis', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('osteoporosis', 'eFI2_Osteoporosis'),\n        ('osteosarcoma', 'NHAEMCANMORPH_COD'),\n        ('ovarian cancer', 'C19CAN_COD'),\n        ('peripheral arterial disease', 'PADEXC_COD'),\n        ('plaque psoriasis', 'PSORIASIS_COD'),\n        ('polycystic kidney disease', 'EPPCONGMALF_COD'),\n        ('polycythaemia vera', 'C19HAEMCAN_COD'),\n        ('pregnancy', 'C19PREG_COD'),\n        ('primary biliary cholangitis', 'eFI2_LiverProblems'),\n        ('primary hypercholesterolaemia', 'FNFHYP_COD'),\n        ('prostate cancer', 'EPPSOLIDCAN_COD'),\n        ('psoriasis', 'PSORIASIS_COD'),\n        ('psoriatic arthritis', 'RARTHAD_COD'),\n        ('pulmonary embolism', 'eFI2_RespiratoryDiseaseTimeSensitive'),\n        ('pulmonary fibrosis', 'ILD_COD'),\n        ('relapsing multiple sclerosis', 'MS_COD'),\n        ('renal cell carcinoma', 'C19CAN_COD'),\n        ('renal transplantation', 'RENALTRANSP_COD'),\n        ('retinal vein occlusion', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('rheumatoid arthritis', 'eFI2_InflammatoryArthritis'),\n        ('rivaroxaban', 'DOACCON_COD'),\n        ('schizophrenia', 'MH_COD'),\n        ('seizures', 'LSZFREQ_COD'),\n        ('sepsis', 'C19ACTIVITY_COD'),\n        ('severe persistent allergic asthma', 'SEVAST_COD'),\n        ('sickle cell disease', 'SICKLE_COD'),\n        ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),\n        ('smoking cessation', 'SMOKINGINT_COD'),\n        ('soft tissue sarcoma', 'NHAEMCANMORPH_COD'),\n        ('spinal muscular atrophy', 'MND_COD'),\n        ('squamous cell', 'C19CAN_COD'),\n        ('squamous cell carcinoma', 'C19CAN_COD'),\n        ('stem cell transplant', 'ALLOTRANSP_COD'),\n        ('stroke', 'eFI2_Stroke'),\n        ('systemic lupus erythematosus', 'SLUPUS_COD'),\n        ('systemic mastocytosis', 'HAEMCANMORPH_COD'),\n        ('thrombocytopenic purpura', 'TTP_COD'),\n        ('thrombotic thrombocytopenic purpura', 'TTP_COD'),\n        ('thyroid cancer', 'C19CAN_COD'),\n        ('tophaceous gout', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('transitional cell carcinoma', 'C19CAN_COD'),\n        ('type 1 diabetes', 'DMTYPE1_COD'),\n        ('type 2 diabetes', 'DMTYPE2_COD'),\n        ('ulcerative colitis', 'eFI2_InflammatoryBowelDisease'),\n        ('urothelial carcinoma', 'NHAEMCANMORPH_COD'),\n        ('urticaria', 'XSAL_COD'),\n        ('uveitis', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('vascular disease', 'CVDINVITE_COD'),\n        ('vasculitis', 'CRYOGLOBVASC_COD')\n    ) AS t(Search_Term, Cluster_ID)\n),\n\nClusterCodes AS (\n    SELECT\n        stc.Search_Term,\n        c.\"SNOMEDCode\",\n        c.\"SNOMEDDescription\"\n    FROM SearchTermClusters stc\n    JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" c\n        ON stc.Cluster_ID = c.\"Cluster_ID\"\n    WHERE c.\"SNOMEDCode\" IS NOT NULL\n),\n\nExplicitCodes AS (\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription FROM (VALUES\n        ('acute coronary syndrome', '837091000000100', 'Manual mapping'),\n        ('ankylosing spondylitis', '162930007', 'Manual mapping'),\n        ('ankylosing spondylitis', '239805001', 'Manual mapping'),\n        ('ankylosing spondylitis', '239810002', 'Manual mapping'),\n        ('ankylosing spondylitis', '239811003', 'Manual mapping'),\n        ('ankylosing spondylitis', '394990003', 'Manual mapping'),\n        ('ankylosing spondylitis', '429712009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441562009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441680005', 'Manual mapping'),\n        ('ankylosing spondylitis', '441930001', 'Manual mapping'),\n        ('axial spondyloarthritis', '723116002', 'Manual mapping'),\n        ('choroidal neovascularisation', '380621000000102', 'Manual mapping'),\n        ('choroidal neovascularisation', '733124000', 'Manual mapping')\n    ) AS t(Search_Term, SNOMEDCode, SNOMEDDescription)\n),\n\nAllIndicationCodes AS (\n    SELECT Search_Term, \"SNOMEDCode\" AS SNOMEDCode, \"SNOMEDDescription\" AS SNOMEDDescription\n    FROM ClusterCodes\n    UNION ALL\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription\n    FROM ExplicitCodes\n)\n\"\"\"\n\n\ndef get_patient_indication_groups(\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups using Snowflake cluster query.\n\n    This function queries Snowflake directly using the embedded cluster CTE\n    (from snomed_indication_mapping_query.sql) to find patients with matching\n    GP diagnoses. This is the NEW approach replacing the old SQLite-based lookup.\n\n    The query:\n    1. Uses the cluster mapping CTE to get all Search_Term -> SNOMED code mappings\n    2. Joins with PrimaryCareClinicalCoding to find patients with matching codes\n    3. Returns the most recent match per patient (by EventDateTime)\n\n    Args:\n        patient_pseudonyms: List of PseudoNHSNoLinked values (matches PatientPseudonym in GP records)\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        batch_size: Number of patients per Snowflake query batch (default 500)\n\n    Returns:\n        DataFrame with columns:\n        - PatientPseudonym: The patient identifier (PseudoNHSNoLinked value)\n        - Search_Term: The matched indication (e.g., \"rheumatoid arthritis\")\n        - EventDateTime: Date of the GP diagnosis record\n\n        Patients not found in results have no matching GP diagnosis.\n    \"\"\"\n    import pandas as pd\n\n    logger.info(f\"Starting Snowflake-direct indication lookup for {len(patient_pseudonyms)} patients...\")\n\n    # Handle edge case: empty patient list\n    if not patient_pseudonyms:\n        logger.warning(\"Empty patient list provided\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    # Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.error(\"Snowflake connector not available - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if not is_snowflake_configured():\n        logger.error(\"Snowflake not configured - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Results list to collect all matches\n    all_results: list[dict] = []\n\n    # Process patients in batches\n    total_patients = len(patient_pseudonyms)\n    for batch_start in range(0, total_patients, batch_size):\n        batch_end = min(batch_start + batch_size, total_patients)\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n        batch_num = batch_start // batch_size + 1\n        total_batches = (total_patients + batch_size - 1) // batch_size\n\n        logger.info(f\"Batch {batch_num}/{total_batches}: patients {batch_start + 1} to {batch_end}\")\n\n        # Build patient IN clause placeholders\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Build the full query with cluster CTE\n        # This finds the most recent matching diagnosis for each patient\n        # Note: Column names must be aliased to ensure consistent casing in results\n        query = f\"\"\"\n{CLUSTER_MAPPING_SQL}\nSELECT\n    pc.\"PatientPseudonym\" AS \"PatientPseudonym\",\n    aic.Search_Term AS \"Search_Term\",\n    pc.\"EventDateTime\" AS \"EventDateTime\"\nFROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\nINNER JOIN AllIndicationCodes aic\n    ON pc.\"SNOMEDCode\" = aic.SNOMEDCode\nWHERE pc.\"PatientPseudonym\" IN ({patient_placeholders})\nQUALIFY ROW_NUMBER() OVER (\n    PARTITION BY pc.\"PatientPseudonym\"\n    ORDER BY pc.\"EventDateTime\" DESC\n) = 1\n\"\"\"\n\n        try:\n            results = connector.execute_dict(query, tuple(batch_pseudonyms))\n\n            for row in results:\n                all_results.append({\n                    'PatientPseudonym': row.get('PatientPseudonym'),\n                    'Search_Term': row.get('Search_Term'),\n                    'EventDateTime': row.get('EventDateTime'),\n                })\n\n            logger.debug(f\"Batch {batch_num}: found {len(results)} matches\")\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch {batch_num}: {e}\")\n            # Continue with other batches - partial results are better than none\n\n    # Build result DataFrame\n    result_df = pd.DataFrame(all_results)\n\n    # Log summary statistics\n    if len(result_df) > 0:\n        matched_count = len(result_df)\n        match_rate = 100 * matched_count / total_patients\n        unique_terms = result_df['Search_Term'].nunique()\n        logger.info(f\"Indication lookup complete:\")\n        logger.info(f\"  Total patients queried: {total_patients}\")\n        logger.info(f\"  Patients with GP match: {matched_count} ({match_rate:.1f}%)\")\n        logger.info(f\"  Unique Search_Terms found: {unique_terms}\")\n\n        # Log top Search_Terms\n        top_terms = result_df['Search_Term'].value_counts().head(5)\n        logger.info(f\"  Top 5 indications: {dict(top_terms)}\")\n    else:\n        logger.info(f\"Indication lookup complete: 0 matches from {total_patients} patients\")\n\n    return result_df\n\n\n# Export public API\n__all__ = [\n    # Dataclasses\n    \"ClusterSnomedCodes\",\n    \"IndicationValidationResult\",\n    \"DrugIndicationMatchRate\",\n    \"DrugSnomedMapping\",\n    \"DirectSnomedMatchResult\",\n    \"DirectorateAssignment\",\n    # Cluster-based lookup functions (existing)\n    \"get_drug_clusters\",\n    \"get_drug_cluster_ids\",\n    \"get_cluster_snomed_codes\",\n    \"patient_has_indication\",\n    \"validate_indication\",\n    \"get_indication_match_rate\",\n    \"batch_validate_indications\",\n    \"get_available_clusters\",\n    # Direct SNOMED lookup functions (new)\n    \"get_drug_snomed_codes\",\n    \"patient_has_indication_direct\",\n    # Diagnosis-based directorate assignment\n    \"get_directorate_from_diagnosis\",\n    # Batch lookup for indication groups\n    \"batch_lookup_indication_groups\",\n    # Drug-indication mapping from DimSearchTerm.csv\n    \"load_drug_indication_mapping\",\n    \"get_search_terms_for_drug\",\n    # Snowflake-direct indication lookup (new approach)\n    \"get_patient_indication_groups\",\n    \"CLUSTER_MAPPING_SQL\",\n]\n","structuredPatch":[{"oldStart":1198,"oldLines":7,"newStart":1198,"newLines":7,"lines":["         ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),","         ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),","         ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),","-        ('allergic asthma', 'AST_COD'),","+        ('asthma', 'AST_COD'),","         ('allergic rhinitis', 'MILDINTAST_COD'),","         ('alzheimer''s disease', 'DEMALZ_COD'),","         ('amyloidosis', 'AMYLOID_COD'),"]}],"userModified":false,"replaceAll":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_016m3NW2dvcs9Ntxw3kSwzNf","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01KoCsD9y8dUXacQ6kAqW3cv","name":"Edit","input":{"replace_all":false,"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","old_string":"        ('severe persistent allergic asthma', 'SEVAST_COD'),","new_string":"        ('asthma', 'SEVAST_COD'),"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":354,"cache_read_input_tokens":105576,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":354},"output_tokens":24,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"00ea270b-4a44-417d-9610-e3649b5095eb"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01KoCsD9y8dUXacQ6kAqW3cv","type":"tool_result","content":"The file C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py has been updated successfully."}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"85fa5c09-9c63-4eca-94b7-be09b5241477","tool_use_result":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","oldString":"        ('severe persistent allergic asthma', 'SEVAST_COD'),","newString":"        ('asthma', 'SEVAST_COD'),","originalFile":"\"\"\"\nDiagnosis lookup module for NHS Patient Pathway Analysis.\n\nProvides functions to validate patient indications by checking GP diagnosis records\nagainst SNOMED cluster codes. Uses the drug-to-cluster mapping from\ndrug_indication_clusters.csv and queries Snowflake for SNOMED codes and GP records.\n\nKey workflow:\n1. Get drug's valid indication clusters from local mapping\n2. Get all SNOMED codes for those clusters from Snowflake\n3. Check if patient has any of those SNOMED codes in GP records\n4. Report indication validation status\n\nIMPORTANT: HCD activity data indication codes are UNRELIABLE. This module uses\nGP/Primary Care data (PrimaryCareClinicalCoding) as the authoritative source.\n\"\"\"\n\nfrom dataclasses import dataclass, field\nfrom datetime import date, datetime\nfrom pathlib import Path\nfrom typing import Optional, Callable, Any, cast, TYPE_CHECKING\nimport csv\n\nif TYPE_CHECKING:\n    import pandas as pd\n\nfrom core.logging_config import get_logger\nfrom data_processing.database import DatabaseManager, default_db_manager\nfrom data_processing.snowflake_connector import (\n    SnowflakeConnector,\n    get_connector,\n    is_snowflake_available,\n    is_snowflake_configured,\n    SNOWFLAKE_AVAILABLE,\n)\nfrom data_processing.cache import get_cache, is_cache_enabled\n\nlogger = get_logger(__name__)\n\n\n@dataclass\nclass ClusterSnomedCodes:\n    \"\"\"SNOMED codes for a clinical coding cluster.\"\"\"\n    cluster_id: str\n    cluster_description: str\n    snomed_codes: list[str] = field(default_factory=list)\n    snomed_descriptions: dict[str, str] = field(default_factory=dict)\n\n    @property\n    def code_count(self) -> int:\n        return len(self.snomed_codes)\n\n\n@dataclass\nclass IndicationValidationResult:\n    \"\"\"Result of validating a patient's indication for a drug.\"\"\"\n    patient_pseudonym: str\n    drug_name: str\n    has_valid_indication: bool\n    matched_cluster_id: Optional[str] = None\n    matched_snomed_code: Optional[str] = None\n    matched_snomed_description: Optional[str] = None\n    checked_clusters: list[str] = field(default_factory=list)\n    total_codes_checked: int = 0\n    source: str = \"GP_SNOMED\"  # GP_SNOMED | NONE\n    error_message: Optional[str] = None\n\n\n@dataclass\nclass DrugIndicationMatchRate:\n    \"\"\"Match rate statistics for a drug's indication validation.\"\"\"\n    drug_name: str\n    total_patients: int\n    patients_with_indication: int\n    patients_without_indication: int\n    match_rate: float  # 0.0 to 1.0\n    clusters_checked: list[str] = field(default_factory=list)\n    sample_unmatched: list[str] = field(default_factory=list)  # Sample patient IDs\n\n\n@dataclass\nclass DrugSnomedMapping:\n    \"\"\"SNOMED code mapping for a drug from ref_drug_snomed_mapping.\"\"\"\n    snomed_code: str\n    snomed_description: str\n    search_term: str\n    primary_directorate: str\n    indication: str = \"\"\n    ta_id: str = \"\"\n\n\n@dataclass\nclass DirectSnomedMatchResult:\n    \"\"\"Result of direct SNOMED code lookup in GP records.\"\"\"\n    patient_pseudonym: str\n    matched: bool\n    snomed_code: Optional[str] = None\n    snomed_description: Optional[str] = None\n    search_term: Optional[str] = None\n    primary_directorate: Optional[str] = None\n    event_date: Optional[datetime] = None\n    source: str = \"DIRECT_SNOMED\"  # DIRECT_SNOMED | NONE\n\n\n@dataclass\nclass DirectorateAssignment:\n    \"\"\"Result of directorate assignment for a patient-drug combination.\"\"\"\n    upid: str\n    drug_name: str\n    directorate: Optional[str]\n    search_term: Optional[str] = None\n    source: str = \"FALLBACK\"  # DIAGNOSIS | FALLBACK\n    snomed_code: Optional[str] = None\n    event_date: Optional[datetime] = None\n\n\ndef get_drug_clusters(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[dict]:\n    \"\"\"\n    Get all SNOMED cluster mappings for a drug from local SQLite.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of dicts with keys: drug_name, indication, cluster_id,\n        cluster_description, nice_ta_reference\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT drug_name, indication, cluster_id, cluster_description, nice_ta_reference\n        FROM ref_drug_indication_clusters\n        WHERE UPPER(drug_name) = UPPER(?)\n        ORDER BY indication, cluster_id\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name,))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append({\n                    \"drug_name\": row[\"drug_name\"],\n                    \"indication\": row[\"indication\"],\n                    \"cluster_id\": row[\"cluster_id\"],\n                    \"cluster_description\": row[\"cluster_description\"],\n                    \"nice_ta_reference\": row[\"nice_ta_reference\"],\n                })\n\n            logger.debug(f\"Found {len(results)} cluster mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting clusters for drug '{drug_name}': {e}\")\n        return []\n\n\ndef get_drug_cluster_ids(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[str]:\n    \"\"\"\n    Get unique cluster IDs for a drug.\n\n    Args:\n        drug_name: Drug name to look up\n        db_manager: Optional DatabaseManager\n\n    Returns:\n        List of unique cluster IDs\n    \"\"\"\n    clusters = get_drug_clusters(drug_name, db_manager)\n    return list(set(c[\"cluster_id\"] for c in clusters))\n\n\ndef get_drug_snomed_codes(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[DrugSnomedMapping]:\n    \"\"\"\n    Get all SNOMED codes for a drug from local ref_drug_snomed_mapping table.\n\n    This uses the enriched mapping CSV data loaded into SQLite, which provides\n    direct SNOMED-to-drug mappings with Search_Term and PrimaryDirectorate.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive, matches cleaned_drug_name)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of DrugSnomedMapping with snomed_code, snomed_description,\n        search_term, primary_directorate, indication, ta_id\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT DISTINCT\n            snomed_code,\n            snomed_description,\n            search_term,\n            primary_directorate,\n            indication,\n            ta_id\n        FROM ref_drug_snomed_mapping\n        WHERE UPPER(cleaned_drug_name) = UPPER(?)\n           OR UPPER(drug_name) = UPPER(?)\n        ORDER BY search_term, snomed_code\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name, drug_name))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append(DrugSnomedMapping(\n                    snomed_code=row[\"snomed_code\"],\n                    snomed_description=row[\"snomed_description\"] or \"\",\n                    search_term=row[\"search_term\"] or \"\",\n                    primary_directorate=row[\"primary_directorate\"] or \"\",\n                    indication=row[\"indication\"] or \"\",\n                    ta_id=row[\"ta_id\"] or \"\",\n                ))\n\n            logger.debug(f\"Found {len(results)} SNOMED mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for drug '{drug_name}': {e}\")\n        return []\n\n\ndef patient_has_indication_direct(\n    patient_pseudonym: str,\n    drug_snomed_mappings: list[DrugSnomedMapping],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> DirectSnomedMatchResult:\n    \"\"\"\n    Check if patient has any of the SNOMED codes in their GP records.\n\n    This is the direct SNOMED lookup - it queries PrimaryCareClinicalCoding\n    for exact SNOMED code matches (not via cluster). Returns the most recent\n    match by EventDateTime if multiple matches exist.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_snomed_mappings: List of DrugSnomedMapping from get_drug_snomed_codes()\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectSnomedMatchResult with match details (most recent by EventDateTime)\n    \"\"\"\n    result = DirectSnomedMatchResult(\n        patient_pseudonym=patient_pseudonym,\n        matched=False,\n        source=\"NONE\",\n    )\n\n    if not drug_snomed_mappings:\n        return result\n\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot check GP records\")\n        return result\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build lookup dict for mapping snomed_code -> (search_term, primary_directorate, snomed_description)\n    snomed_lookup = {\n        m.snomed_code: (m.search_term, m.primary_directorate, m.snomed_description)\n        for m in drug_snomed_mappings\n    }\n\n    # Get unique SNOMED codes\n    snomed_codes = list(snomed_lookup.keys())\n\n    # Build placeholders for SNOMED codes\n    placeholders = \", \".join([\"%s\"] * len(snomed_codes))\n\n    # Query to find most recent matching SNOMED code in GP records\n    query = f'''\n        SELECT\n            \"SNOMEDCode\",\n            \"EventDateTime\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n        WHERE \"PatientPseudonym\" = %s\n            AND \"SNOMEDCode\" IN ({placeholders})\n    '''\n\n    params: list = [patient_pseudonym] + snomed_codes\n\n    if before_date:\n        query += ' AND \"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' ORDER BY \"EventDateTime\" DESC LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            matched_code = row.get(\"SNOMEDCode\")\n            event_dt = row.get(\"EventDateTime\")\n\n            if matched_code and matched_code in snomed_lookup:\n                search_term, primary_dir, snomed_desc = snomed_lookup[matched_code]\n\n                return DirectSnomedMatchResult(\n                    patient_pseudonym=patient_pseudonym,\n                    matched=True,\n                    snomed_code=matched_code,\n                    snomed_description=snomed_desc,\n                    search_term=search_term,\n                    primary_directorate=primary_dir,\n                    event_date=event_dt,\n                    source=\"DIRECT_SNOMED\",\n                )\n\n        return result\n\n    except Exception as e:\n        logger.error(f\"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}\")\n        return result\n\n\ndef get_directorate_from_diagnosis(\n    upid: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> DirectorateAssignment:\n    \"\"\"\n    Get directorate assignment for a patient-drug combination using diagnosis-based lookup.\n\n    This function attempts to assign a directorate based on the patient's GP records\n    (direct SNOMED code matching). If no match is found, it returns a FALLBACK result\n    indicating that the caller should use alternative assignment methods (e.g.,\n    department_identification() from tools/data.py).\n\n    Workflow:\n    1. Get all SNOMED codes for the drug from ref_drug_snomed_mapping\n    2. Query patient's GP records for matching SNOMED codes\n    3. If match found ÔåÆ return diagnosis-based directorate and search_term\n    4. If no match ÔåÆ return FALLBACK result (caller handles fallback logic)\n\n    Args:\n        upid: Patient's unique patient ID (Provider Code[:3] + PersonKey)\n        drug_name: Drug name to look up\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectorateAssignment with directorate, search_term, and source\n    \"\"\"\n    result = DirectorateAssignment(\n        upid=upid,\n        drug_name=drug_name,\n        directorate=None,\n        source=\"FALLBACK\",\n    )\n\n    # Step 1: Get SNOMED codes for the drug\n    drug_snomed_mappings = get_drug_snomed_codes(drug_name, db_manager)\n\n    if not drug_snomed_mappings:\n        logger.debug(f\"No SNOMED mappings found for drug '{drug_name}' - using fallback\")\n        return result\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.debug(\"Snowflake not available - using fallback\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.debug(\"Snowflake not configured - using fallback\")\n        return result\n\n    # Step 3: Get patient pseudonym from UPID\n    # UPID format is Provider Code (3 chars) + PersonKey\n    # We need to query Snowflake to get the PatientPseudonym for this PersonKey\n    # However, patient_has_indication_direct expects PatientPseudonym, not UPID\n    # For now, we'll use UPID as the identifier - the actual integration\n    # will need to happen at the DataFrame level where we have PersonKey\n    #\n    # NOTE: This function will be called from the pipeline where we have\n    # access to PatientPseudonym. The UPID is passed for logging/tracking.\n\n    # Actually, looking at the pipeline, we need PatientPseudonym, not UPID.\n    # The caller should pass the PatientPseudonym or we need to look it up.\n    # For now, let's assume the caller will use this in a batch context\n    # where they can map UPID -> PatientPseudonym.\n\n    # Let me reconsider: the function signature takes UPID but we need\n    # PatientPseudonym for Snowflake. In the pipeline context (fetch_and_transform_data),\n    # we'll have the PersonKey column which IS the PatientPseudonym.\n    # So UPID = ProviderCode[:3] + PersonKey, and PersonKey = PatientPseudonym.\n    #\n    # We can extract PatientPseudonym from UPID by removing the first 3 chars.\n    patient_pseudonym = upid[3:] if len(upid) > 3 else upid\n\n    # Step 4: Check patient's GP records for matching SNOMED codes\n    match_result = patient_has_indication_direct(\n        patient_pseudonym=patient_pseudonym,\n        drug_snomed_mappings=drug_snomed_mappings,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    if match_result.matched and match_result.primary_directorate:\n        return DirectorateAssignment(\n            upid=upid,\n            drug_name=drug_name,\n            directorate=match_result.primary_directorate,\n            search_term=match_result.search_term,\n            source=\"DIAGNOSIS\",\n            snomed_code=match_result.snomed_code,\n            event_date=match_result.event_date,\n        )\n\n    # No match found - return fallback result\n    return result\n\n\ndef get_cluster_snomed_codes(\n    cluster_id: str,\n    connector: Optional[SnowflakeConnector] = None,\n    use_cache: bool = True,\n) -> ClusterSnomedCodes:\n    \"\"\"\n    Get all SNOMED codes for a cluster from Snowflake.\n\n    Queries the ClinicalCodingClusterSnomedCodes table to get all SNOMED codes\n    that belong to the specified cluster.\n\n    Args:\n        cluster_id: Cluster ID to look up (e.g., 'RARTH_COD', 'PSORIASIS_COD')\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        use_cache: Whether to use cached results (default True)\n\n    Returns:\n        ClusterSnomedCodes with list of SNOMED codes and descriptions\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot get cluster codes\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    # Check cache first\n    cache_key = f\"cluster_snomed_{cluster_id}\"\n    if use_cache and is_cache_enabled():\n        cache = get_cache()\n        cached = cache.get(cache_key)\n        if cached is not None and len(cached) > 0:\n            logger.debug(f\"Using cached SNOMED codes for cluster '{cluster_id}'\")\n            cached_dict = cached[0]  # First element is our data dict\n            return ClusterSnomedCodes(\n                cluster_id=cluster_id,\n                cluster_description=str(cached_dict.get(\"description\", \"\")),\n                snomed_codes=list(cached_dict.get(\"codes\", [])),\n                snomed_descriptions=dict(cached_dict.get(\"descriptions\", {})),\n            )\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT DISTINCT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            \"SNOMEDCode\",\n            \"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        WHERE \"Cluster_ID\" = %s\n        ORDER BY \"SNOMEDCode\"\n    '''\n\n    try:\n        results = connector.execute_dict(query, (cluster_id,))\n\n        if not results:\n            logger.warning(f\"No SNOMED codes found for cluster '{cluster_id}'\")\n            return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n        codes = []\n        descriptions = {}\n        description = results[0].get(\"Cluster_Description\", \"\") if results else \"\"\n\n        for row in results:\n            code = row.get(\"SNOMEDCode\")\n            if code:\n                codes.append(code)\n                descriptions[code] = row.get(\"SNOMEDDescription\", \"\")\n\n        logger.info(f\"Found {len(codes)} SNOMED codes for cluster '{cluster_id}'\")\n\n        # Cache the results (using query-based cache with fake params)\n        if use_cache and is_cache_enabled():\n            cache = get_cache()\n            cache_data = [{\n                \"description\": description,\n                \"codes\": codes,\n                \"descriptions\": descriptions,\n            }]\n            cache.set(cache_key, None, cache_data)  # type: ignore[arg-type]\n\n        return ClusterSnomedCodes(\n            cluster_id=cluster_id,\n            cluster_description=description,\n            snomed_codes=codes,\n            snomed_descriptions=descriptions,\n        )\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for cluster '{cluster_id}': {e}\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n\ndef patient_has_indication(\n    patient_pseudonym: str,\n    cluster_ids: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:\n    \"\"\"\n    Check if a patient has any SNOMED codes from the specified clusters in GP records.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        cluster_ids: List of cluster IDs to check against\n        connector: Optional SnowflakeConnector\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        Tuple of (has_indication, matched_cluster_id, matched_snomed_code, matched_description)\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        return False, None, None, None\n\n    if not cluster_ids:\n        return False, None, None, None\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build placeholders for cluster IDs\n    placeholders = \", \".join([\"%s\"] * len(cluster_ids))\n\n    # Query to check if patient has any matching SNOMED code\n    query = f'''\n        SELECT\n            pc.\"SNOMEDCode\",\n            cc.\"Cluster_ID\",\n            cc.\"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\n        INNER JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" cc\n            ON pc.\"SNOMEDCode\" = cc.\"SNOMEDCode\"\n        WHERE pc.\"PatientPseudonym\" = %s\n            AND cc.\"Cluster_ID\" IN ({placeholders})\n    '''\n\n    params = [patient_pseudonym] + cluster_ids\n\n    if before_date:\n        query += ' AND pc.\"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            return (\n                True,\n                row.get(\"Cluster_ID\"),\n                row.get(\"SNOMEDCode\"),\n                row.get(\"SNOMEDDescription\"),\n            )\n\n        return False, None, None, None\n\n    except Exception as e:\n        logger.error(f\"Error checking indication for patient '{patient_pseudonym}': {e}\")\n        return False, None, None, None\n\n\ndef validate_indication(\n    patient_pseudonym: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> IndicationValidationResult:\n    \"\"\"\n    Validate that a patient has an appropriate indication for a drug.\n\n    Full validation workflow:\n    1. Get drug's valid indication clusters from local mapping\n    2. Check if patient has any matching SNOMED codes in GP records\n    3. Return detailed validation result\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_name: Drug name to validate indication for\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        IndicationValidationResult with validation details\n    \"\"\"\n    result = IndicationValidationResult(\n        patient_pseudonym=patient_pseudonym,\n        drug_name=drug_name,\n        has_valid_indication=False,\n    )\n\n    # Step 1: Get drug's cluster mappings\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    if not cluster_ids:\n        result.error_message = f\"No cluster mappings found for drug '{drug_name}'\"\n        result.source = \"NONE\"\n        return result\n\n    result.checked_clusters = cluster_ids\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        result.error_message = \"Snowflake connector not installed\"\n        result.source = \"NONE\"\n        return result\n\n    if not is_snowflake_configured():\n        result.error_message = \"Snowflake not configured\"\n        result.source = \"NONE\"\n        return result\n\n    # Step 3: Check patient GP records\n    has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n        patient_pseudonym=patient_pseudonym,\n        cluster_ids=cluster_ids,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    result.has_valid_indication = has_indication\n    result.matched_cluster_id = matched_cluster\n    result.matched_snomed_code = matched_code\n    result.matched_snomed_description = matched_desc\n    result.source = \"GP_SNOMED\" if has_indication else \"NONE\"\n\n    return result\n\n\ndef get_indication_match_rate(\n    drug_name: str,\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    sample_unmatched_count: int = 10,\n) -> DrugIndicationMatchRate:\n    \"\"\"\n    Calculate indication match rate for a drug across a list of patients.\n\n    Args:\n        drug_name: Drug name to check\n        patient_pseudonyms: List of patient pseudonymised NHS numbers\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        sample_unmatched_count: Number of unmatched patient IDs to include in sample\n\n    Returns:\n        DrugIndicationMatchRate with match statistics\n    \"\"\"\n    if connector is None and SNOWFLAKE_AVAILABLE and is_snowflake_configured():\n        connector = get_connector()\n\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    total = len(patient_pseudonyms)\n    matched = 0\n    unmatched = 0\n    sample_unmatched: list[str] = []\n\n    if not cluster_ids:\n        logger.warning(f\"No cluster mappings for drug '{drug_name}' - all patients will be unmatched\")\n        return DrugIndicationMatchRate(\n            drug_name=drug_name,\n            total_patients=total,\n            patients_with_indication=0,\n            patients_without_indication=total,\n            match_rate=0.0,\n            clusters_checked=[],\n            sample_unmatched=patient_pseudonyms[:sample_unmatched_count],\n        )\n\n    for i, pseudonym in enumerate(patient_pseudonyms):\n        if i > 0 and i % 100 == 0:\n            logger.info(f\"Validating indications: {i}/{total} ({100*i/total:.1f}%)\")\n\n        has_indication, _, _, _ = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        if has_indication:\n            matched += 1\n        else:\n            unmatched += 1\n            if len(sample_unmatched) < sample_unmatched_count:\n                sample_unmatched.append(pseudonym)\n\n    match_rate = matched / total if total > 0 else 0.0\n\n    logger.info(f\"Indication match rate for '{drug_name}': {100*match_rate:.1f}% ({matched}/{total})\")\n\n    return DrugIndicationMatchRate(\n        drug_name=drug_name,\n        total_patients=total,\n        patients_with_indication=matched,\n        patients_without_indication=unmatched,\n        match_rate=match_rate,\n        clusters_checked=cluster_ids,\n        sample_unmatched=sample_unmatched,\n    )\n\n\ndef batch_validate_indications(\n    patient_drug_pairs: list[tuple[str, str]],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    progress_callback: Optional[Callable[[int, int], None]] = None,\n) -> list[IndicationValidationResult]:\n    \"\"\"\n    Validate indications for multiple patient-drug pairs efficiently.\n\n    Args:\n        patient_drug_pairs: List of (patient_pseudonym, drug_name) tuples\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        progress_callback: Optional callback(current, total) for progress updates\n\n    Returns:\n        List of IndicationValidationResult for each pair\n    \"\"\"\n    results = []\n    total = len(patient_drug_pairs)\n\n    # Cache cluster lookups by drug\n    drug_clusters_cache = {}\n\n    for i, (pseudonym, drug_name) in enumerate(patient_drug_pairs):\n        if progress_callback:\n            progress_callback(i + 1, total)\n\n        # Get clusters from cache or lookup\n        drug_upper = drug_name.upper()\n        if drug_upper not in drug_clusters_cache:\n            drug_clusters_cache[drug_upper] = get_drug_cluster_ids(drug_name, db_manager)\n\n        cluster_ids = drug_clusters_cache[drug_upper]\n\n        if not cluster_ids:\n            results.append(IndicationValidationResult(\n                patient_pseudonym=pseudonym,\n                drug_name=drug_name,\n                has_valid_indication=False,\n                source=\"NONE\",\n                error_message=f\"No cluster mappings for drug '{drug_name}'\",\n            ))\n            continue\n\n        # Check patient indication\n        has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        results.append(IndicationValidationResult(\n            patient_pseudonym=pseudonym,\n            drug_name=drug_name,\n            has_valid_indication=has_indication,\n            matched_cluster_id=matched_cluster,\n            matched_snomed_code=matched_code,\n            matched_snomed_description=matched_desc,\n            checked_clusters=cluster_ids,\n            source=\"GP_SNOMED\" if has_indication else \"NONE\",\n        ))\n\n    matched_count = sum(1 for r in results if r.has_valid_indication)\n    logger.info(f\"Batch validation complete: {matched_count}/{total} ({100*matched_count/total:.1f}%) with valid indications\")\n\n    return results\n\n\ndef get_available_clusters(\n    connector: Optional[SnowflakeConnector] = None,\n) -> list[dict]:\n    \"\"\"\n    Get list of all available SNOMED clusters from Snowflake.\n\n    Returns:\n        List of dicts with cluster_id, cluster_description, code_count\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - cannot list clusters\")\n        return []\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            COUNT(DISTINCT \"SNOMEDCode\") as code_count\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        GROUP BY \"Cluster_ID\", \"Cluster_Description\"\n        ORDER BY \"Cluster_ID\"\n    '''\n\n    try:\n        results = connector.execute_dict(query)\n\n        clusters = []\n        for row in results:\n            clusters.append({\n                \"cluster_id\": row.get(\"Cluster_ID\"),\n                \"cluster_description\": row.get(\"Cluster_Description\"),\n                \"code_count\": row.get(\"code_count\", 0),\n            })\n\n        logger.info(f\"Found {len(clusters)} available SNOMED clusters\")\n        return clusters\n\n    except Exception as e:\n        logger.error(f\"Error getting available clusters: {e}\")\n        return []\n\n\ndef batch_lookup_indication_groups(\n    df: \"pd.DataFrame\",\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.\n\n    This is the efficient batch version of get_directorate_from_diagnosis().\n    Instead of querying Snowflake per patient, it batches the lookups for performance.\n\n    Strategy:\n    1. Get all unique (PersonKey, Drug Name) pairs from DataFrame\n    2. For each unique drug, get all SNOMED codes from local SQLite\n    3. Build batched Snowflake queries to check GP records\n    4. Return indication_df mapping UPID ÔåÆ Indication_Group\n\n    For unmatched patients, Indication_Group will be their Directory (with suffix).\n\n    Args:\n        df: DataFrame with columns: UPID, Drug Name, Directory, PersonKey\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        batch_size: Number of patients per Snowflake query batch\n\n    Returns:\n        DataFrame with columns: UPID, Indication_Group, Source\n        - Indication_Group: Search_Term (if matched) or \"Directory (no GP dx)\" (if not)\n        - Source: \"DIAGNOSIS\" or \"FALLBACK\"\n    \"\"\"\n    import pandas as pd\n\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    logger.info(f\"Starting batch indication lookup for {len(df)} records...\")\n\n    # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations\n    # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records\n    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records\n    if 'PseudoNHSNoLinked' not in df.columns:\n        logger.error(\"DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records\")\n        # Return fallback for all patients\n        result_df = df[['UPID', 'Directory']].drop_duplicates().copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']]\n\n    # Get unique patient-drug combinations (we need one lookup per patient-drug pair)\n    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()\n    logger.info(f\"Found {len(unique_pairs)} unique patient-drug combinations\")\n\n    # Step 2: Get all unique drugs and their SNOMED codes\n    unique_drugs = unique_pairs['Drug Name'].unique()\n    logger.info(f\"Building SNOMED lookup for {len(unique_drugs)} unique drugs...\")\n\n    # Build drug -> list of DrugSnomedMapping dict\n    drug_snomed_map: dict[str, list[DrugSnomedMapping]] = {}\n    all_snomed_codes: set[str] = set()\n    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}  # snomed -> [(drug, search_term, primary_dir), ...]\n\n    for drug_name in unique_drugs:\n        mappings = get_drug_snomed_codes(drug_name, db_manager)\n        drug_snomed_map[drug_name] = mappings\n\n        for m in mappings:\n            all_snomed_codes.add(m.snomed_code)\n            if m.snomed_code not in snomed_to_drug_searchterm:\n                snomed_to_drug_searchterm[m.snomed_code] = []\n            snomed_to_drug_searchterm[m.snomed_code].append(\n                (drug_name, m.search_term, m.primary_directorate)\n            )\n\n    logger.info(f\"Total SNOMED codes to check: {len(all_snomed_codes)}\")\n\n    # Step 3: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Step 4: Query GP records for all patients in batches\n    # The query finds the most recent matching SNOMED code for each patient\n\n    # Get unique PseudoNHSNoLinked values (each = one patient in GP records)\n    unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])\n    patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()\n\n    logger.info(f\"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...\")\n\n    # Results dict: PersonKey -> (snomed_code, event_date)\n    gp_matches: dict[str, tuple[str, Any]] = {}\n\n    # Convert SNOMED codes to list for query\n    snomed_list = list(all_snomed_codes)\n\n    if not snomed_list:\n        logger.warning(\"No SNOMED codes to check - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    # Build SNOMED IN clause (reused across batches)\n    snomed_placeholders = \", \".join([\"%s\"] * len(snomed_list))\n\n    # Process patients in batches\n    for batch_start in range(0, len(patient_pseudonyms), batch_size):\n        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n\n        logger.info(f\"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}\")\n\n        # Build patient IN clause\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Query to find all matching SNOMED codes for these patients\n        # We'll get all matches and pick the most recent per patient in Python\n        query = f'''\n            SELECT\n                \"PatientPseudonym\",\n                \"SNOMEDCode\",\n                \"EventDateTime\"\n            FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n            WHERE \"PatientPseudonym\" IN ({patient_placeholders})\n              AND \"SNOMEDCode\" IN ({snomed_placeholders})\n            ORDER BY \"PatientPseudonym\", \"EventDateTime\" DESC\n        '''\n\n        params = tuple(batch_pseudonyms) + tuple(snomed_list)\n\n        try:\n            results = connector.execute_dict(query, params)\n\n            # Process results - pick most recent per patient\n            for row in results:\n                person_key = row.get(\"PatientPseudonym\")\n                snomed_code = row.get(\"SNOMEDCode\")\n                event_date = row.get(\"EventDateTime\")\n\n                if person_key and snomed_code:\n                    # Keep only if we haven't seen this patient yet (first = most recent due to ORDER BY)\n                    if person_key not in gp_matches:\n                        gp_matches[person_key] = (snomed_code, event_date)\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch: {e}\")\n            # Continue with other batches\n\n    logger.info(f\"Found GP matches for {len(gp_matches)} patients\")\n\n    # Step 5: Build result DataFrame\n    # For each unique_pair, determine Indication_Group based on match status\n    results_list = []\n\n    # We need to dedupe by UPID - a patient might be on multiple drugs\n    # Strategy: For each UPID, use the most recent match (if any)\n    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)\n\n    for _, row in unique_pairs.iterrows():\n        upid = row['UPID']\n        drug_name = row['Drug Name']\n        patient_pseudonym = row['PseudoNHSNoLinked']\n        directory = row['Directory']\n\n        # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)\n        if patient_pseudonym in gp_matches:\n            matched_snomed, event_date = gp_matches[patient_pseudonym]\n\n            # Find the search_term for this SNOMED code and drug\n            # (A SNOMED code might map to multiple drugs with different search_terms)\n            if matched_snomed in snomed_to_drug_searchterm:\n                # Look for match with current drug first\n                search_term = None\n                for drug, st, pd in snomed_to_drug_searchterm[matched_snomed]:\n                    if drug.upper() == drug_name.upper():\n                        search_term = st\n                        break\n                # If no drug-specific match, use any match\n                if search_term is None:\n                    search_term = snomed_to_drug_searchterm[matched_snomed][0][1]\n\n                # Only update if we don't have a match for this UPID yet\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (search_term, \"DIAGNOSIS\")\n            else:\n                # Shouldn't happen but fallback just in case\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n        else:\n            # No GP match - use fallback\n            if upid not in upid_to_match:\n                upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n\n    # Build result DataFrame\n    for upid, (indication_group, source) in upid_to_match.items():\n        results_list.append({\n            'UPID': upid,\n            'Indication_Group': indication_group,\n            'Source': source,\n        })\n\n    result_df = pd.DataFrame(results_list)\n\n    # Log statistics\n    diagnosis_count = len([s for s in result_df['Source'] if s == \"DIAGNOSIS\"])\n    fallback_count = len([s for s in result_df['Source'] if s == \"FALLBACK\"])\n    total = len(result_df)\n\n    logger.info(f\"Indication lookup complete:\")\n    logger.info(f\"  Total unique patients: {total}\")\n    logger.info(f\"  DIAGNOSIS matches: {diagnosis_count} ({100*diagnosis_count/total:.1f}%)\")\n    logger.info(f\"  FALLBACK (no GP match): {fallback_count} ({100*fallback_count/total:.1f}%)\")\n\n    return result_df\n\n\n# === Drug-to-indication mapping from DimSearchTerm.csv ===\n\n\ndef load_drug_indication_mapping(\n    csv_path: Optional[str] = None,\n) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n    \"\"\"\n    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n\n    Builds two lookup dicts:\n    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n\n    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n\n    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n    Drug fragments from all rows for the same Search_Term are combined.\n\n    Args:\n        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n\n    Returns:\n        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n    \"\"\"\n    if csv_path is None:\n        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n\n    fragment_to_search_terms: dict[str, list[str]] = {}\n    search_term_to_fragments: dict[str, list[str]] = {}\n\n    try:\n        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.DictReader(f)\n            for row in reader:\n                search_term = row.get(\"Search_Term\", \"\").strip()\n                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n\n                if not search_term or not drug_names_raw:\n                    continue\n\n                fragments = [frag.strip().upper() for frag in drug_names_raw.split(\"|\") if frag.strip()]\n\n                # Build search_term -> fragments (accumulate for duplicate Search_Terms)\n                if search_term not in search_term_to_fragments:\n                    search_term_to_fragments[search_term] = []\n                for frag in fragments:\n                    if frag not in search_term_to_fragments[search_term]:\n                        search_term_to_fragments[search_term].append(frag)\n\n                # Build fragment -> search_terms\n                for frag in fragments:\n                    if frag not in fragment_to_search_terms:\n                        fragment_to_search_terms[frag] = []\n                    if search_term not in fragment_to_search_terms[frag]:\n                        fragment_to_search_terms[frag].append(search_term)\n\n        logger.info(\n            f\"Loaded drug-indication mapping: {len(search_term_to_fragments)} Search_Terms, \"\n            f\"{len(fragment_to_search_terms)} drug fragments\"\n        )\n\n    except FileNotFoundError:\n        logger.error(f\"DimSearchTerm.csv not found at {csv_path}\")\n    except Exception as e:\n        logger.error(f\"Error loading DimSearchTerm.csv: {e}\")\n\n    return fragment_to_search_terms, search_term_to_fragments\n\n\ndef get_search_terms_for_drug(\n    drug_name: str,\n    search_term_to_fragments: dict[str, list[str]],\n) -> list[str]:\n    \"\"\"\n    Get all Search_Terms that list a given drug using substring matching.\n\n    Checks if any drug fragment from DimSearchTerm is a SUBSTRING of the given\n    drug name (case-insensitive). This handles both exact matches (ADALIMUMAB)\n    and partial fragments (PEGYLATED, INHALED).\n\n    Args:\n        drug_name: HCD drug name (e.g., \"ADALIMUMAB 40MG\", \"PEGYLATED LIPOSOMAL DOXORUBICIN\")\n        search_term_to_fragments: Mapping of search_term -> list of drug fragments\n\n    Returns:\n        List of Search_Terms whose drug fragments match the drug name\n    \"\"\"\n    drug_name_upper = drug_name.upper()\n    matched_terms: list[str] = []\n\n    for search_term, fragments in search_term_to_fragments.items():\n        for frag in fragments:\n            if frag in drug_name_upper:\n                matched_terms.append(search_term)\n                break  # One matching fragment is enough for this Search_Term\n\n    return matched_terms\n\n\n# === NEW APPROACH: Query Snowflake directly using cluster CTE ===\n\n# The cluster query mapping (embedded from snomed_indication_mapping_query.sql)\n# This maps Search_Term -> Cluster_ID for ~148 clinical conditions\nCLUSTER_MAPPING_SQL = \"\"\"\nWITH SearchTermClusters AS (\n    SELECT Search_Term, Cluster_ID FROM (VALUES\n        ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),\n        ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),\n        ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),\n        ('asthma', 'AST_COD'),\n        ('allergic rhinitis', 'MILDINTAST_COD'),\n        ('alzheimer''s disease', 'DEMALZ_COD'),\n        ('amyloidosis', 'AMYLOID_COD'),\n        ('anaemia', 'eFI2_AnaemiaTimeSensitive'),\n        ('anaplastic large cell lymphoma', 'C19HAEMCAN_COD'),\n        ('apixaban', 'DOACCON_COD'),\n        ('aplastic anaemia', 'eFI2_AnaemiaEver'),\n        ('arthritis', 'eFI2_InflammatoryArthritis'),\n        ('asthma', 'eFI2_Asthma'),\n        ('atopic dermatitis', 'ATOPDERM_COD'),\n        ('atrial fibrillation', 'eFI2_AtrialFibrillation'),\n        ('attention deficit hyperactivity disorder', 'ADHD_COD'),\n        ('bipolar disorder', 'MH_COD'),\n        ('bladder', 'eFI2_UrinaryIncontinence'),\n        ('breast cancer', 'BRCANSCR_COD'),\n        ('cardiomyopathy', 'eFI2_HarmfulDrinking'),\n        ('cardiovascular disease', 'CVDRISKASS_COD'),\n        ('cervical cancer', 'CSDEC_COD'),\n        ('cholangiocarcinoma', 'eFI2_Cancer'),\n        ('chronic kidney disease', 'CKD_COD'),\n        ('chronic liver disease', 'eFI2_LiverProblems'),\n        ('chronic lymphocytic leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic myeloid leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic obstructive pulmonary disease', 'eFI2_COPD'),\n        ('colon cancer', 'eFI2_Cancer'),\n        ('colorectal cancer', 'GICANREF_COD'),\n        ('constipation', 'CHRONCONSTIP_COD'),\n        ('covid-19', 'POSSPOSTCOVID_COD'),\n        ('crohn''s disease', 'eFI2_InflammatoryBowelDisease'),\n        ('cutaneous t-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('cystic fibrosis', 'CUST_ICB_CYSTIC_FIBROSIS'),\n        ('deep vein thrombosis', 'VTE_COD'),\n        ('depression', 'eFI2_Depression'),\n        ('diabetes', 'eFI2_DiabetesEver'),\n        ('diabetic retinopathy', 'DRSELIGIBILITY_COD'),\n        ('diffuse large b-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('dravet syndrome', 'EPIL_COD'),\n        ('drug misuse', 'ILLSUBINT_COD'),\n        ('dyspepsia', 'eFI2_AbdominalPain'),\n        ('epilepsy', 'eFI2_Seizures'),\n        ('fallopian tube', 'STERIL_COD'),\n        ('follicular lymphoma', 'C19HAEMCAN_COD'),\n        ('gastric cancer', 'eFI2_Cancer'),\n        ('giant cell arteritis', 'GCA_COD'),\n        ('glioma', 'NHAEMCANMORPH_COD'),\n        ('gout', 'eFI2_InflammatoryArthritis'),\n        ('graft versus host disease', 'GVHD_COD'),\n        ('granulomatosis with polyangiitis', 'WEGENERVASC_COD'),\n        ('growth hormone deficiency', 'HYPOPITUITARY_COD'),\n        ('hand eczema', 'ECZEMA_COD'),\n        ('heart failure', 'eFI2_HeartFailure'),\n        ('hepatitis b', 'HEPBCVAC_COD'),\n        ('hepatocellular carcinoma', 'eFI2_Cancer'),\n        ('hiv', 'PREFLANG_COD'),\n        ('hodgkin lymphoma', 'HAEMCANMORPH_COD'),\n        ('hormone receptor', 'eFI2_ThyroidProblems'),\n        ('hypercholesterolaemia', 'CLASSFH_COD'),\n        ('immune thrombocytopenia', 'ITP_COD'),\n        ('influenza', 'FLUINVITE_COD'),\n        ('insomnia', 'eFI2_SleepProblems'),\n        ('irritable bowel syndrome', 'IBS_COD'),\n        ('ischaemic stroke', 'OSTR_COD'),\n        ('juvenile idiopathic arthritis', 'RARTHAD_COD'),\n        ('kidney transplant', 'RENALTRANSP_COD'),\n        ('leukaemia', 'eFI2_Cancer'),\n        ('lung cancer', 'FTCANREF_COD'),\n        ('lymphoma', 'C19HAEMCAN_COD'),\n        ('macular degeneration', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('macular oedema', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('major depressive episodes', 'eFI2_Depression'),\n        ('malignant melanoma', 'eFI2_Cancer'),\n        ('malignant pleural mesothelioma', 'LUNGCAN_COD'),\n        ('manic episode', 'MH_COD'),\n        ('mantle cell lymphoma', 'HAEMCANMORPH_COD'),\n        ('melanoma', 'eFI2_Cancer'),\n        ('merkel cell carcinoma', 'C19CAN_COD'),\n        ('migraine', 'eFI2_Headache'),\n        ('motor neurone disease', 'MND_COD'),\n        ('multiple myeloma', 'C19HAEMCAN_COD'),\n        ('multiple sclerosis', 'MS_COD'),\n        ('myelodysplastic', 'eFI2_AnaemiaEver'),\n        ('myelofibrosis', 'MDS_COD'),\n        ('myocardial infarction', 'eFI2_IschaemicHeartDisease'),\n        ('myotonia', 'CNDATRISK2_COD'),\n        ('narcolepsy', 'LD_COD'),\n        ('neuroendocrine tumour', 'LUNGCAN_COD'),\n        ('non-small cell lung cancer', 'LUNGCAN_COD'),\n        ('non-small-cell lung cancer', 'FTCANREF_COD'),\n        ('obesity', 'BMI30_COD'),\n        ('osteoarthritis', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('osteoporosis', 'eFI2_Osteoporosis'),\n        ('osteosarcoma', 'NHAEMCANMORPH_COD'),\n        ('ovarian cancer', 'C19CAN_COD'),\n        ('peripheral arterial disease', 'PADEXC_COD'),\n        ('plaque psoriasis', 'PSORIASIS_COD'),\n        ('polycystic kidney disease', 'EPPCONGMALF_COD'),\n        ('polycythaemia vera', 'C19HAEMCAN_COD'),\n        ('pregnancy', 'C19PREG_COD'),\n        ('primary biliary cholangitis', 'eFI2_LiverProblems'),\n        ('primary hypercholesterolaemia', 'FNFHYP_COD'),\n        ('prostate cancer', 'EPPSOLIDCAN_COD'),\n        ('psoriasis', 'PSORIASIS_COD'),\n        ('psoriatic arthritis', 'RARTHAD_COD'),\n        ('pulmonary embolism', 'eFI2_RespiratoryDiseaseTimeSensitive'),\n        ('pulmonary fibrosis', 'ILD_COD'),\n        ('relapsing multiple sclerosis', 'MS_COD'),\n        ('renal cell carcinoma', 'C19CAN_COD'),\n        ('renal transplantation', 'RENALTRANSP_COD'),\n        ('retinal vein occlusion', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('rheumatoid arthritis', 'eFI2_InflammatoryArthritis'),\n        ('rivaroxaban', 'DOACCON_COD'),\n        ('schizophrenia', 'MH_COD'),\n        ('seizures', 'LSZFREQ_COD'),\n        ('sepsis', 'C19ACTIVITY_COD'),\n        ('severe persistent allergic asthma', 'SEVAST_COD'),\n        ('sickle cell disease', 'SICKLE_COD'),\n        ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),\n        ('smoking cessation', 'SMOKINGINT_COD'),\n        ('soft tissue sarcoma', 'NHAEMCANMORPH_COD'),\n        ('spinal muscular atrophy', 'MND_COD'),\n        ('squamous cell', 'C19CAN_COD'),\n        ('squamous cell carcinoma', 'C19CAN_COD'),\n        ('stem cell transplant', 'ALLOTRANSP_COD'),\n        ('stroke', 'eFI2_Stroke'),\n        ('systemic lupus erythematosus', 'SLUPUS_COD'),\n        ('systemic mastocytosis', 'HAEMCANMORPH_COD'),\n        ('thrombocytopenic purpura', 'TTP_COD'),\n        ('thrombotic thrombocytopenic purpura', 'TTP_COD'),\n        ('thyroid cancer', 'C19CAN_COD'),\n        ('tophaceous gout', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('transitional cell carcinoma', 'C19CAN_COD'),\n        ('type 1 diabetes', 'DMTYPE1_COD'),\n        ('type 2 diabetes', 'DMTYPE2_COD'),\n        ('ulcerative colitis', 'eFI2_InflammatoryBowelDisease'),\n        ('urothelial carcinoma', 'NHAEMCANMORPH_COD'),\n        ('urticaria', 'XSAL_COD'),\n        ('uveitis', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('vascular disease', 'CVDINVITE_COD'),\n        ('vasculitis', 'CRYOGLOBVASC_COD')\n    ) AS t(Search_Term, Cluster_ID)\n),\n\nClusterCodes AS (\n    SELECT\n        stc.Search_Term,\n        c.\"SNOMEDCode\",\n        c.\"SNOMEDDescription\"\n    FROM SearchTermClusters stc\n    JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" c\n        ON stc.Cluster_ID = c.\"Cluster_ID\"\n    WHERE c.\"SNOMEDCode\" IS NOT NULL\n),\n\nExplicitCodes AS (\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription FROM (VALUES\n        ('acute coronary syndrome', '837091000000100', 'Manual mapping'),\n        ('ankylosing spondylitis', '162930007', 'Manual mapping'),\n        ('ankylosing spondylitis', '239805001', 'Manual mapping'),\n        ('ankylosing spondylitis', '239810002', 'Manual mapping'),\n        ('ankylosing spondylitis', '239811003', 'Manual mapping'),\n        ('ankylosing spondylitis', '394990003', 'Manual mapping'),\n        ('ankylosing spondylitis', '429712009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441562009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441680005', 'Manual mapping'),\n        ('ankylosing spondylitis', '441930001', 'Manual mapping'),\n        ('axial spondyloarthritis', '723116002', 'Manual mapping'),\n        ('choroidal neovascularisation', '380621000000102', 'Manual mapping'),\n        ('choroidal neovascularisation', '733124000', 'Manual mapping')\n    ) AS t(Search_Term, SNOMEDCode, SNOMEDDescription)\n),\n\nAllIndicationCodes AS (\n    SELECT Search_Term, \"SNOMEDCode\" AS SNOMEDCode, \"SNOMEDDescription\" AS SNOMEDDescription\n    FROM ClusterCodes\n    UNION ALL\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription\n    FROM ExplicitCodes\n)\n\"\"\"\n\n\ndef get_patient_indication_groups(\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups using Snowflake cluster query.\n\n    This function queries Snowflake directly using the embedded cluster CTE\n    (from snomed_indication_mapping_query.sql) to find patients with matching\n    GP diagnoses. This is the NEW approach replacing the old SQLite-based lookup.\n\n    The query:\n    1. Uses the cluster mapping CTE to get all Search_Term -> SNOMED code mappings\n    2. Joins with PrimaryCareClinicalCoding to find patients with matching codes\n    3. Returns the most recent match per patient (by EventDateTime)\n\n    Args:\n        patient_pseudonyms: List of PseudoNHSNoLinked values (matches PatientPseudonym in GP records)\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        batch_size: Number of patients per Snowflake query batch (default 500)\n\n    Returns:\n        DataFrame with columns:\n        - PatientPseudonym: The patient identifier (PseudoNHSNoLinked value)\n        - Search_Term: The matched indication (e.g., \"rheumatoid arthritis\")\n        - EventDateTime: Date of the GP diagnosis record\n\n        Patients not found in results have no matching GP diagnosis.\n    \"\"\"\n    import pandas as pd\n\n    logger.info(f\"Starting Snowflake-direct indication lookup for {len(patient_pseudonyms)} patients...\")\n\n    # Handle edge case: empty patient list\n    if not patient_pseudonyms:\n        logger.warning(\"Empty patient list provided\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    # Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.error(\"Snowflake connector not available - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if not is_snowflake_configured():\n        logger.error(\"Snowflake not configured - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Results list to collect all matches\n    all_results: list[dict] = []\n\n    # Process patients in batches\n    total_patients = len(patient_pseudonyms)\n    for batch_start in range(0, total_patients, batch_size):\n        batch_end = min(batch_start + batch_size, total_patients)\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n        batch_num = batch_start // batch_size + 1\n        total_batches = (total_patients + batch_size - 1) // batch_size\n\n        logger.info(f\"Batch {batch_num}/{total_batches}: patients {batch_start + 1} to {batch_end}\")\n\n        # Build patient IN clause placeholders\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Build the full query with cluster CTE\n        # This finds the most recent matching diagnosis for each patient\n        # Note: Column names must be aliased to ensure consistent casing in results\n        query = f\"\"\"\n{CLUSTER_MAPPING_SQL}\nSELECT\n    pc.\"PatientPseudonym\" AS \"PatientPseudonym\",\n    aic.Search_Term AS \"Search_Term\",\n    pc.\"EventDateTime\" AS \"EventDateTime\"\nFROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\nINNER JOIN AllIndicationCodes aic\n    ON pc.\"SNOMEDCode\" = aic.SNOMEDCode\nWHERE pc.\"PatientPseudonym\" IN ({patient_placeholders})\nQUALIFY ROW_NUMBER() OVER (\n    PARTITION BY pc.\"PatientPseudonym\"\n    ORDER BY pc.\"EventDateTime\" DESC\n) = 1\n\"\"\"\n\n        try:\n            results = connector.execute_dict(query, tuple(batch_pseudonyms))\n\n            for row in results:\n                all_results.append({\n                    'PatientPseudonym': row.get('PatientPseudonym'),\n                    'Search_Term': row.get('Search_Term'),\n                    'EventDateTime': row.get('EventDateTime'),\n                })\n\n            logger.debug(f\"Batch {batch_num}: found {len(results)} matches\")\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch {batch_num}: {e}\")\n            # Continue with other batches - partial results are better than none\n\n    # Build result DataFrame\n    result_df = pd.DataFrame(all_results)\n\n    # Log summary statistics\n    if len(result_df) > 0:\n        matched_count = len(result_df)\n        match_rate = 100 * matched_count / total_patients\n        unique_terms = result_df['Search_Term'].nunique()\n        logger.info(f\"Indication lookup complete:\")\n        logger.info(f\"  Total patients queried: {total_patients}\")\n        logger.info(f\"  Patients with GP match: {matched_count} ({match_rate:.1f}%)\")\n        logger.info(f\"  Unique Search_Terms found: {unique_terms}\")\n\n        # Log top Search_Terms\n        top_terms = result_df['Search_Term'].value_counts().head(5)\n        logger.info(f\"  Top 5 indications: {dict(top_terms)}\")\n    else:\n        logger.info(f\"Indication lookup complete: 0 matches from {total_patients} patients\")\n\n    return result_df\n\n\n# Export public API\n__all__ = [\n    # Dataclasses\n    \"ClusterSnomedCodes\",\n    \"IndicationValidationResult\",\n    \"DrugIndicationMatchRate\",\n    \"DrugSnomedMapping\",\n    \"DirectSnomedMatchResult\",\n    \"DirectorateAssignment\",\n    # Cluster-based lookup functions (existing)\n    \"get_drug_clusters\",\n    \"get_drug_cluster_ids\",\n    \"get_cluster_snomed_codes\",\n    \"patient_has_indication\",\n    \"validate_indication\",\n    \"get_indication_match_rate\",\n    \"batch_validate_indications\",\n    \"get_available_clusters\",\n    # Direct SNOMED lookup functions (new)\n    \"get_drug_snomed_codes\",\n    \"patient_has_indication_direct\",\n    # Diagnosis-based directorate assignment\n    \"get_directorate_from_diagnosis\",\n    # Batch lookup for indication groups\n    \"batch_lookup_indication_groups\",\n    # Drug-indication mapping from DimSearchTerm.csv\n    \"load_drug_indication_mapping\",\n    \"get_search_terms_for_drug\",\n    # Snowflake-direct indication lookup (new approach)\n    \"get_patient_indication_groups\",\n    \"CLUSTER_MAPPING_SQL\",\n]\n","structuredPatch":[{"oldStart":1313,"oldLines":7,"newStart":1313,"newLines":7,"lines":["         ('schizophrenia', 'MH_COD'),","         ('seizures', 'LSZFREQ_COD'),","         ('sepsis', 'C19ACTIVITY_COD'),","-        ('severe persistent allergic asthma', 'SEVAST_COD'),","+        ('asthma', 'SEVAST_COD'),","         ('sickle cell disease', 'SICKLE_COD'),","         ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),","         ('smoking cessation', 'SMOKINGINT_COD'),"]}],"userModified":false,"replaceAll":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_013eyJrnx3XyNxGh1x3zUM4a","type":"message","role":"assistant","content":[{"type":"text","text":"Good. Now the CLUSTER_MAPPING_SQL has 3 rows for `'asthma'` with Cluster_IDs: AST_COD, eFI2_Asthma, SEVAST_COD. The GP lookup will return `Search_Term = 'asthma'` for any of these clusters. `urticaria` with `XSAL_COD` stays separate.\n\nNow I need to update `load_drug_indication_mapping()` to merge the asthma variants. When loading DimSearchTerm.csv, rows with Search_Term = \"allergic asthma\" or \"severe persistent allergic asthma\" should have their drug fragments merged under a single \"asthma\" key."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":215,"cache_read_input_tokens":105930,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":215},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"7ade38e5-9de8-4e63-b925-5e17a3f748c6"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_013eyJrnx3XyNxGh1x3zUM4a","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01SNFmA47uTpVrnMkEDrHUEY","name":"TodoWrite","input":{"todos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"in_progress","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"pending","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}]}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":215,"cache_read_input_tokens":105930,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":215},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"bbc87fb2-4343-4961-bdf6-8536bb4e545c"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01SNFmA47uTpVrnMkEDrHUEY","type":"tool_result","content":"Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"5a04d12a-22d1-4b41-a3be-4efe1d001b3c","tool_use_result":{"oldTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"in_progress","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"pending","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"pending","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}],"newTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"in_progress","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"pending","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}]}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_017VX33JDuuC9r7foVy2WVBX","type":"message","role":"assistant","content":[{"type":"text","text":"I need to add a Search_Term normalization step in `load_drug_indication_mapping()`. The approach: define a mapping of asthma variant names ÔåÆ \"asthma\", and apply it when reading each row. This keeps the merge logic contained in one place."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":505,"cache_read_input_tokens":106145,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":505},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"6ecc7378-b2da-4a4d-bf37-26fd656f0c45"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_017VX33JDuuC9r7foVy2WVBX","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_012nLcjMSdbhPreuh8XuUc2n","name":"Edit","input":{"replace_all":false,"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","old_string":"def load_drug_indication_mapping(\n    csv_path: Optional[str] = None,\n) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n    \"\"\"\n    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n\n    Builds two lookup dicts:\n    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n\n    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n\n    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n    Drug fragments from all rows for the same Search_Term are combined.\n\n    Args:\n        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n\n    Returns:\n        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n    \"\"\"\n    if csv_path is None:\n        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n\n    fragment_to_search_terms: dict[str, list[str]] = {}\n    search_term_to_fragments: dict[str, list[str]] = {}\n\n    try:\n        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.DictReader(f)\n            for row in reader:\n                search_term = row.get(\"Search_Term\", \"\").strip()\n                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()","new_string":"# Merge related Search_Terms into canonical names.\n# Asthma variants are clinically the same condition at different severity levels.\n# Urticaria is a separate condition ÔÇö do NOT merge with asthma.\nSEARCH_TERM_MERGE_MAP: dict[str, str] = {\n    \"allergic asthma\": \"asthma\",\n    \"severe persistent allergic asthma\": \"asthma\",\n}\n\n\ndef load_drug_indication_mapping(\n    csv_path: Optional[str] = None,\n) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n    \"\"\"\n    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n\n    Builds two lookup dicts:\n    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n\n    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n\n    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n    Drug fragments from all rows for the same Search_Term are combined.\n\n    Asthma-related Search_Terms (\"allergic asthma\", \"severe persistent allergic asthma\")\n    are merged into \"asthma\" to match the CLUSTER_MAPPING_SQL normalization.\n    \"urticaria\" stays separate.\n\n    Args:\n        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n\n    Returns:\n        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n    \"\"\"\n    if csv_path is None:\n        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n\n    fragment_to_search_terms: dict[str, list[str]] = {}\n    search_term_to_fragments: dict[str, list[str]] = {}\n\n    try:\n        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.DictReader(f)\n            for row in reader:\n                search_term = row.get(\"Search_Term\", \"\").strip()\n                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n\n                # Normalize asthma variants to canonical \"asthma\"\n                search_term = SEARCH_TERM_MERGE_MAP.get(search_term, search_term)"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":505,"cache_read_input_tokens":106145,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":505},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"09e95519-ebe4-4a5d-b6ef-0209acc3b971"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_012nLcjMSdbhPreuh8XuUc2n","type":"tool_result","content":"The file C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py has been updated successfully."}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"6588e181-c493-4033-bf04-9547ff58f71d","tool_use_result":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","oldString":"def load_drug_indication_mapping(\n    csv_path: Optional[str] = None,\n) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n    \"\"\"\n    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n\n    Builds two lookup dicts:\n    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n\n    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n\n    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n    Drug fragments from all rows for the same Search_Term are combined.\n\n    Args:\n        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n\n    Returns:\n        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n    \"\"\"\n    if csv_path is None:\n        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n\n    fragment_to_search_terms: dict[str, list[str]] = {}\n    search_term_to_fragments: dict[str, list[str]] = {}\n\n    try:\n        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.DictReader(f)\n            for row in reader:\n                search_term = row.get(\"Search_Term\", \"\").strip()\n                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()","newString":"# Merge related Search_Terms into canonical names.\n# Asthma variants are clinically the same condition at different severity levels.\n# Urticaria is a separate condition ÔÇö do NOT merge with asthma.\nSEARCH_TERM_MERGE_MAP: dict[str, str] = {\n    \"allergic asthma\": \"asthma\",\n    \"severe persistent allergic asthma\": \"asthma\",\n}\n\n\ndef load_drug_indication_mapping(\n    csv_path: Optional[str] = None,\n) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n    \"\"\"\n    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n\n    Builds two lookup dicts:\n    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n\n    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n\n    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n    Drug fragments from all rows for the same Search_Term are combined.\n\n    Asthma-related Search_Terms (\"allergic asthma\", \"severe persistent allergic asthma\")\n    are merged into \"asthma\" to match the CLUSTER_MAPPING_SQL normalization.\n    \"urticaria\" stays separate.\n\n    Args:\n        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n\n    Returns:\n        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n    \"\"\"\n    if csv_path is None:\n        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n\n    fragment_to_search_terms: dict[str, list[str]] = {}\n    search_term_to_fragments: dict[str, list[str]] = {}\n\n    try:\n        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.DictReader(f)\n            for row in reader:\n                search_term = row.get(\"Search_Term\", \"\").strip()\n                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n\n                # Normalize asthma variants to canonical \"asthma\"\n                search_term = SEARCH_TERM_MERGE_MAP.get(search_term, search_term)","originalFile":"\"\"\"\nDiagnosis lookup module for NHS Patient Pathway Analysis.\n\nProvides functions to validate patient indications by checking GP diagnosis records\nagainst SNOMED cluster codes. Uses the drug-to-cluster mapping from\ndrug_indication_clusters.csv and queries Snowflake for SNOMED codes and GP records.\n\nKey workflow:\n1. Get drug's valid indication clusters from local mapping\n2. Get all SNOMED codes for those clusters from Snowflake\n3. Check if patient has any of those SNOMED codes in GP records\n4. Report indication validation status\n\nIMPORTANT: HCD activity data indication codes are UNRELIABLE. This module uses\nGP/Primary Care data (PrimaryCareClinicalCoding) as the authoritative source.\n\"\"\"\n\nfrom dataclasses import dataclass, field\nfrom datetime import date, datetime\nfrom pathlib import Path\nfrom typing import Optional, Callable, Any, cast, TYPE_CHECKING\nimport csv\n\nif TYPE_CHECKING:\n    import pandas as pd\n\nfrom core.logging_config import get_logger\nfrom data_processing.database import DatabaseManager, default_db_manager\nfrom data_processing.snowflake_connector import (\n    SnowflakeConnector,\n    get_connector,\n    is_snowflake_available,\n    is_snowflake_configured,\n    SNOWFLAKE_AVAILABLE,\n)\nfrom data_processing.cache import get_cache, is_cache_enabled\n\nlogger = get_logger(__name__)\n\n\n@dataclass\nclass ClusterSnomedCodes:\n    \"\"\"SNOMED codes for a clinical coding cluster.\"\"\"\n    cluster_id: str\n    cluster_description: str\n    snomed_codes: list[str] = field(default_factory=list)\n    snomed_descriptions: dict[str, str] = field(default_factory=dict)\n\n    @property\n    def code_count(self) -> int:\n        return len(self.snomed_codes)\n\n\n@dataclass\nclass IndicationValidationResult:\n    \"\"\"Result of validating a patient's indication for a drug.\"\"\"\n    patient_pseudonym: str\n    drug_name: str\n    has_valid_indication: bool\n    matched_cluster_id: Optional[str] = None\n    matched_snomed_code: Optional[str] = None\n    matched_snomed_description: Optional[str] = None\n    checked_clusters: list[str] = field(default_factory=list)\n    total_codes_checked: int = 0\n    source: str = \"GP_SNOMED\"  # GP_SNOMED | NONE\n    error_message: Optional[str] = None\n\n\n@dataclass\nclass DrugIndicationMatchRate:\n    \"\"\"Match rate statistics for a drug's indication validation.\"\"\"\n    drug_name: str\n    total_patients: int\n    patients_with_indication: int\n    patients_without_indication: int\n    match_rate: float  # 0.0 to 1.0\n    clusters_checked: list[str] = field(default_factory=list)\n    sample_unmatched: list[str] = field(default_factory=list)  # Sample patient IDs\n\n\n@dataclass\nclass DrugSnomedMapping:\n    \"\"\"SNOMED code mapping for a drug from ref_drug_snomed_mapping.\"\"\"\n    snomed_code: str\n    snomed_description: str\n    search_term: str\n    primary_directorate: str\n    indication: str = \"\"\n    ta_id: str = \"\"\n\n\n@dataclass\nclass DirectSnomedMatchResult:\n    \"\"\"Result of direct SNOMED code lookup in GP records.\"\"\"\n    patient_pseudonym: str\n    matched: bool\n    snomed_code: Optional[str] = None\n    snomed_description: Optional[str] = None\n    search_term: Optional[str] = None\n    primary_directorate: Optional[str] = None\n    event_date: Optional[datetime] = None\n    source: str = \"DIRECT_SNOMED\"  # DIRECT_SNOMED | NONE\n\n\n@dataclass\nclass DirectorateAssignment:\n    \"\"\"Result of directorate assignment for a patient-drug combination.\"\"\"\n    upid: str\n    drug_name: str\n    directorate: Optional[str]\n    search_term: Optional[str] = None\n    source: str = \"FALLBACK\"  # DIAGNOSIS | FALLBACK\n    snomed_code: Optional[str] = None\n    event_date: Optional[datetime] = None\n\n\ndef get_drug_clusters(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[dict]:\n    \"\"\"\n    Get all SNOMED cluster mappings for a drug from local SQLite.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of dicts with keys: drug_name, indication, cluster_id,\n        cluster_description, nice_ta_reference\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT drug_name, indication, cluster_id, cluster_description, nice_ta_reference\n        FROM ref_drug_indication_clusters\n        WHERE UPPER(drug_name) = UPPER(?)\n        ORDER BY indication, cluster_id\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name,))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append({\n                    \"drug_name\": row[\"drug_name\"],\n                    \"indication\": row[\"indication\"],\n                    \"cluster_id\": row[\"cluster_id\"],\n                    \"cluster_description\": row[\"cluster_description\"],\n                    \"nice_ta_reference\": row[\"nice_ta_reference\"],\n                })\n\n            logger.debug(f\"Found {len(results)} cluster mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting clusters for drug '{drug_name}': {e}\")\n        return []\n\n\ndef get_drug_cluster_ids(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[str]:\n    \"\"\"\n    Get unique cluster IDs for a drug.\n\n    Args:\n        drug_name: Drug name to look up\n        db_manager: Optional DatabaseManager\n\n    Returns:\n        List of unique cluster IDs\n    \"\"\"\n    clusters = get_drug_clusters(drug_name, db_manager)\n    return list(set(c[\"cluster_id\"] for c in clusters))\n\n\ndef get_drug_snomed_codes(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[DrugSnomedMapping]:\n    \"\"\"\n    Get all SNOMED codes for a drug from local ref_drug_snomed_mapping table.\n\n    This uses the enriched mapping CSV data loaded into SQLite, which provides\n    direct SNOMED-to-drug mappings with Search_Term and PrimaryDirectorate.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive, matches cleaned_drug_name)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of DrugSnomedMapping with snomed_code, snomed_description,\n        search_term, primary_directorate, indication, ta_id\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT DISTINCT\n            snomed_code,\n            snomed_description,\n            search_term,\n            primary_directorate,\n            indication,\n            ta_id\n        FROM ref_drug_snomed_mapping\n        WHERE UPPER(cleaned_drug_name) = UPPER(?)\n           OR UPPER(drug_name) = UPPER(?)\n        ORDER BY search_term, snomed_code\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name, drug_name))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append(DrugSnomedMapping(\n                    snomed_code=row[\"snomed_code\"],\n                    snomed_description=row[\"snomed_description\"] or \"\",\n                    search_term=row[\"search_term\"] or \"\",\n                    primary_directorate=row[\"primary_directorate\"] or \"\",\n                    indication=row[\"indication\"] or \"\",\n                    ta_id=row[\"ta_id\"] or \"\",\n                ))\n\n            logger.debug(f\"Found {len(results)} SNOMED mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for drug '{drug_name}': {e}\")\n        return []\n\n\ndef patient_has_indication_direct(\n    patient_pseudonym: str,\n    drug_snomed_mappings: list[DrugSnomedMapping],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> DirectSnomedMatchResult:\n    \"\"\"\n    Check if patient has any of the SNOMED codes in their GP records.\n\n    This is the direct SNOMED lookup - it queries PrimaryCareClinicalCoding\n    for exact SNOMED code matches (not via cluster). Returns the most recent\n    match by EventDateTime if multiple matches exist.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_snomed_mappings: List of DrugSnomedMapping from get_drug_snomed_codes()\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectSnomedMatchResult with match details (most recent by EventDateTime)\n    \"\"\"\n    result = DirectSnomedMatchResult(\n        patient_pseudonym=patient_pseudonym,\n        matched=False,\n        source=\"NONE\",\n    )\n\n    if not drug_snomed_mappings:\n        return result\n\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot check GP records\")\n        return result\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build lookup dict for mapping snomed_code -> (search_term, primary_directorate, snomed_description)\n    snomed_lookup = {\n        m.snomed_code: (m.search_term, m.primary_directorate, m.snomed_description)\n        for m in drug_snomed_mappings\n    }\n\n    # Get unique SNOMED codes\n    snomed_codes = list(snomed_lookup.keys())\n\n    # Build placeholders for SNOMED codes\n    placeholders = \", \".join([\"%s\"] * len(snomed_codes))\n\n    # Query to find most recent matching SNOMED code in GP records\n    query = f'''\n        SELECT\n            \"SNOMEDCode\",\n            \"EventDateTime\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n        WHERE \"PatientPseudonym\" = %s\n            AND \"SNOMEDCode\" IN ({placeholders})\n    '''\n\n    params: list = [patient_pseudonym] + snomed_codes\n\n    if before_date:\n        query += ' AND \"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' ORDER BY \"EventDateTime\" DESC LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            matched_code = row.get(\"SNOMEDCode\")\n            event_dt = row.get(\"EventDateTime\")\n\n            if matched_code and matched_code in snomed_lookup:\n                search_term, primary_dir, snomed_desc = snomed_lookup[matched_code]\n\n                return DirectSnomedMatchResult(\n                    patient_pseudonym=patient_pseudonym,\n                    matched=True,\n                    snomed_code=matched_code,\n                    snomed_description=snomed_desc,\n                    search_term=search_term,\n                    primary_directorate=primary_dir,\n                    event_date=event_dt,\n                    source=\"DIRECT_SNOMED\",\n                )\n\n        return result\n\n    except Exception as e:\n        logger.error(f\"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}\")\n        return result\n\n\ndef get_directorate_from_diagnosis(\n    upid: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> DirectorateAssignment:\n    \"\"\"\n    Get directorate assignment for a patient-drug combination using diagnosis-based lookup.\n\n    This function attempts to assign a directorate based on the patient's GP records\n    (direct SNOMED code matching). If no match is found, it returns a FALLBACK result\n    indicating that the caller should use alternative assignment methods (e.g.,\n    department_identification() from tools/data.py).\n\n    Workflow:\n    1. Get all SNOMED codes for the drug from ref_drug_snomed_mapping\n    2. Query patient's GP records for matching SNOMED codes\n    3. If match found ÔåÆ return diagnosis-based directorate and search_term\n    4. If no match ÔåÆ return FALLBACK result (caller handles fallback logic)\n\n    Args:\n        upid: Patient's unique patient ID (Provider Code[:3] + PersonKey)\n        drug_name: Drug name to look up\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectorateAssignment with directorate, search_term, and source\n    \"\"\"\n    result = DirectorateAssignment(\n        upid=upid,\n        drug_name=drug_name,\n        directorate=None,\n        source=\"FALLBACK\",\n    )\n\n    # Step 1: Get SNOMED codes for the drug\n    drug_snomed_mappings = get_drug_snomed_codes(drug_name, db_manager)\n\n    if not drug_snomed_mappings:\n        logger.debug(f\"No SNOMED mappings found for drug '{drug_name}' - using fallback\")\n        return result\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.debug(\"Snowflake not available - using fallback\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.debug(\"Snowflake not configured - using fallback\")\n        return result\n\n    # Step 3: Get patient pseudonym from UPID\n    # UPID format is Provider Code (3 chars) + PersonKey\n    # We need to query Snowflake to get the PatientPseudonym for this PersonKey\n    # However, patient_has_indication_direct expects PatientPseudonym, not UPID\n    # For now, we'll use UPID as the identifier - the actual integration\n    # will need to happen at the DataFrame level where we have PersonKey\n    #\n    # NOTE: This function will be called from the pipeline where we have\n    # access to PatientPseudonym. The UPID is passed for logging/tracking.\n\n    # Actually, looking at the pipeline, we need PatientPseudonym, not UPID.\n    # The caller should pass the PatientPseudonym or we need to look it up.\n    # For now, let's assume the caller will use this in a batch context\n    # where they can map UPID -> PatientPseudonym.\n\n    # Let me reconsider: the function signature takes UPID but we need\n    # PatientPseudonym for Snowflake. In the pipeline context (fetch_and_transform_data),\n    # we'll have the PersonKey column which IS the PatientPseudonym.\n    # So UPID = ProviderCode[:3] + PersonKey, and PersonKey = PatientPseudonym.\n    #\n    # We can extract PatientPseudonym from UPID by removing the first 3 chars.\n    patient_pseudonym = upid[3:] if len(upid) > 3 else upid\n\n    # Step 4: Check patient's GP records for matching SNOMED codes\n    match_result = patient_has_indication_direct(\n        patient_pseudonym=patient_pseudonym,\n        drug_snomed_mappings=drug_snomed_mappings,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    if match_result.matched and match_result.primary_directorate:\n        return DirectorateAssignment(\n            upid=upid,\n            drug_name=drug_name,\n            directorate=match_result.primary_directorate,\n            search_term=match_result.search_term,\n            source=\"DIAGNOSIS\",\n            snomed_code=match_result.snomed_code,\n            event_date=match_result.event_date,\n        )\n\n    # No match found - return fallback result\n    return result\n\n\ndef get_cluster_snomed_codes(\n    cluster_id: str,\n    connector: Optional[SnowflakeConnector] = None,\n    use_cache: bool = True,\n) -> ClusterSnomedCodes:\n    \"\"\"\n    Get all SNOMED codes for a cluster from Snowflake.\n\n    Queries the ClinicalCodingClusterSnomedCodes table to get all SNOMED codes\n    that belong to the specified cluster.\n\n    Args:\n        cluster_id: Cluster ID to look up (e.g., 'RARTH_COD', 'PSORIASIS_COD')\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        use_cache: Whether to use cached results (default True)\n\n    Returns:\n        ClusterSnomedCodes with list of SNOMED codes and descriptions\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot get cluster codes\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    # Check cache first\n    cache_key = f\"cluster_snomed_{cluster_id}\"\n    if use_cache and is_cache_enabled():\n        cache = get_cache()\n        cached = cache.get(cache_key)\n        if cached is not None and len(cached) > 0:\n            logger.debug(f\"Using cached SNOMED codes for cluster '{cluster_id}'\")\n            cached_dict = cached[0]  # First element is our data dict\n            return ClusterSnomedCodes(\n                cluster_id=cluster_id,\n                cluster_description=str(cached_dict.get(\"description\", \"\")),\n                snomed_codes=list(cached_dict.get(\"codes\", [])),\n                snomed_descriptions=dict(cached_dict.get(\"descriptions\", {})),\n            )\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT DISTINCT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            \"SNOMEDCode\",\n            \"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        WHERE \"Cluster_ID\" = %s\n        ORDER BY \"SNOMEDCode\"\n    '''\n\n    try:\n        results = connector.execute_dict(query, (cluster_id,))\n\n        if not results:\n            logger.warning(f\"No SNOMED codes found for cluster '{cluster_id}'\")\n            return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n        codes = []\n        descriptions = {}\n        description = results[0].get(\"Cluster_Description\", \"\") if results else \"\"\n\n        for row in results:\n            code = row.get(\"SNOMEDCode\")\n            if code:\n                codes.append(code)\n                descriptions[code] = row.get(\"SNOMEDDescription\", \"\")\n\n        logger.info(f\"Found {len(codes)} SNOMED codes for cluster '{cluster_id}'\")\n\n        # Cache the results (using query-based cache with fake params)\n        if use_cache and is_cache_enabled():\n            cache = get_cache()\n            cache_data = [{\n                \"description\": description,\n                \"codes\": codes,\n                \"descriptions\": descriptions,\n            }]\n            cache.set(cache_key, None, cache_data)  # type: ignore[arg-type]\n\n        return ClusterSnomedCodes(\n            cluster_id=cluster_id,\n            cluster_description=description,\n            snomed_codes=codes,\n            snomed_descriptions=descriptions,\n        )\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for cluster '{cluster_id}': {e}\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n\ndef patient_has_indication(\n    patient_pseudonym: str,\n    cluster_ids: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:\n    \"\"\"\n    Check if a patient has any SNOMED codes from the specified clusters in GP records.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        cluster_ids: List of cluster IDs to check against\n        connector: Optional SnowflakeConnector\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        Tuple of (has_indication, matched_cluster_id, matched_snomed_code, matched_description)\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        return False, None, None, None\n\n    if not cluster_ids:\n        return False, None, None, None\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build placeholders for cluster IDs\n    placeholders = \", \".join([\"%s\"] * len(cluster_ids))\n\n    # Query to check if patient has any matching SNOMED code\n    query = f'''\n        SELECT\n            pc.\"SNOMEDCode\",\n            cc.\"Cluster_ID\",\n            cc.\"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\n        INNER JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" cc\n            ON pc.\"SNOMEDCode\" = cc.\"SNOMEDCode\"\n        WHERE pc.\"PatientPseudonym\" = %s\n            AND cc.\"Cluster_ID\" IN ({placeholders})\n    '''\n\n    params = [patient_pseudonym] + cluster_ids\n\n    if before_date:\n        query += ' AND pc.\"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            return (\n                True,\n                row.get(\"Cluster_ID\"),\n                row.get(\"SNOMEDCode\"),\n                row.get(\"SNOMEDDescription\"),\n            )\n\n        return False, None, None, None\n\n    except Exception as e:\n        logger.error(f\"Error checking indication for patient '{patient_pseudonym}': {e}\")\n        return False, None, None, None\n\n\ndef validate_indication(\n    patient_pseudonym: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> IndicationValidationResult:\n    \"\"\"\n    Validate that a patient has an appropriate indication for a drug.\n\n    Full validation workflow:\n    1. Get drug's valid indication clusters from local mapping\n    2. Check if patient has any matching SNOMED codes in GP records\n    3. Return detailed validation result\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_name: Drug name to validate indication for\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        IndicationValidationResult with validation details\n    \"\"\"\n    result = IndicationValidationResult(\n        patient_pseudonym=patient_pseudonym,\n        drug_name=drug_name,\n        has_valid_indication=False,\n    )\n\n    # Step 1: Get drug's cluster mappings\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    if not cluster_ids:\n        result.error_message = f\"No cluster mappings found for drug '{drug_name}'\"\n        result.source = \"NONE\"\n        return result\n\n    result.checked_clusters = cluster_ids\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        result.error_message = \"Snowflake connector not installed\"\n        result.source = \"NONE\"\n        return result\n\n    if not is_snowflake_configured():\n        result.error_message = \"Snowflake not configured\"\n        result.source = \"NONE\"\n        return result\n\n    # Step 3: Check patient GP records\n    has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n        patient_pseudonym=patient_pseudonym,\n        cluster_ids=cluster_ids,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    result.has_valid_indication = has_indication\n    result.matched_cluster_id = matched_cluster\n    result.matched_snomed_code = matched_code\n    result.matched_snomed_description = matched_desc\n    result.source = \"GP_SNOMED\" if has_indication else \"NONE\"\n\n    return result\n\n\ndef get_indication_match_rate(\n    drug_name: str,\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    sample_unmatched_count: int = 10,\n) -> DrugIndicationMatchRate:\n    \"\"\"\n    Calculate indication match rate for a drug across a list of patients.\n\n    Args:\n        drug_name: Drug name to check\n        patient_pseudonyms: List of patient pseudonymised NHS numbers\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        sample_unmatched_count: Number of unmatched patient IDs to include in sample\n\n    Returns:\n        DrugIndicationMatchRate with match statistics\n    \"\"\"\n    if connector is None and SNOWFLAKE_AVAILABLE and is_snowflake_configured():\n        connector = get_connector()\n\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    total = len(patient_pseudonyms)\n    matched = 0\n    unmatched = 0\n    sample_unmatched: list[str] = []\n\n    if not cluster_ids:\n        logger.warning(f\"No cluster mappings for drug '{drug_name}' - all patients will be unmatched\")\n        return DrugIndicationMatchRate(\n            drug_name=drug_name,\n            total_patients=total,\n            patients_with_indication=0,\n            patients_without_indication=total,\n            match_rate=0.0,\n            clusters_checked=[],\n            sample_unmatched=patient_pseudonyms[:sample_unmatched_count],\n        )\n\n    for i, pseudonym in enumerate(patient_pseudonyms):\n        if i > 0 and i % 100 == 0:\n            logger.info(f\"Validating indications: {i}/{total} ({100*i/total:.1f}%)\")\n\n        has_indication, _, _, _ = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        if has_indication:\n            matched += 1\n        else:\n            unmatched += 1\n            if len(sample_unmatched) < sample_unmatched_count:\n                sample_unmatched.append(pseudonym)\n\n    match_rate = matched / total if total > 0 else 0.0\n\n    logger.info(f\"Indication match rate for '{drug_name}': {100*match_rate:.1f}% ({matched}/{total})\")\n\n    return DrugIndicationMatchRate(\n        drug_name=drug_name,\n        total_patients=total,\n        patients_with_indication=matched,\n        patients_without_indication=unmatched,\n        match_rate=match_rate,\n        clusters_checked=cluster_ids,\n        sample_unmatched=sample_unmatched,\n    )\n\n\ndef batch_validate_indications(\n    patient_drug_pairs: list[tuple[str, str]],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    progress_callback: Optional[Callable[[int, int], None]] = None,\n) -> list[IndicationValidationResult]:\n    \"\"\"\n    Validate indications for multiple patient-drug pairs efficiently.\n\n    Args:\n        patient_drug_pairs: List of (patient_pseudonym, drug_name) tuples\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        progress_callback: Optional callback(current, total) for progress updates\n\n    Returns:\n        List of IndicationValidationResult for each pair\n    \"\"\"\n    results = []\n    total = len(patient_drug_pairs)\n\n    # Cache cluster lookups by drug\n    drug_clusters_cache = {}\n\n    for i, (pseudonym, drug_name) in enumerate(patient_drug_pairs):\n        if progress_callback:\n            progress_callback(i + 1, total)\n\n        # Get clusters from cache or lookup\n        drug_upper = drug_name.upper()\n        if drug_upper not in drug_clusters_cache:\n            drug_clusters_cache[drug_upper] = get_drug_cluster_ids(drug_name, db_manager)\n\n        cluster_ids = drug_clusters_cache[drug_upper]\n\n        if not cluster_ids:\n            results.append(IndicationValidationResult(\n                patient_pseudonym=pseudonym,\n                drug_name=drug_name,\n                has_valid_indication=False,\n                source=\"NONE\",\n                error_message=f\"No cluster mappings for drug '{drug_name}'\",\n            ))\n            continue\n\n        # Check patient indication\n        has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        results.append(IndicationValidationResult(\n            patient_pseudonym=pseudonym,\n            drug_name=drug_name,\n            has_valid_indication=has_indication,\n            matched_cluster_id=matched_cluster,\n            matched_snomed_code=matched_code,\n            matched_snomed_description=matched_desc,\n            checked_clusters=cluster_ids,\n            source=\"GP_SNOMED\" if has_indication else \"NONE\",\n        ))\n\n    matched_count = sum(1 for r in results if r.has_valid_indication)\n    logger.info(f\"Batch validation complete: {matched_count}/{total} ({100*matched_count/total:.1f}%) with valid indications\")\n\n    return results\n\n\ndef get_available_clusters(\n    connector: Optional[SnowflakeConnector] = None,\n) -> list[dict]:\n    \"\"\"\n    Get list of all available SNOMED clusters from Snowflake.\n\n    Returns:\n        List of dicts with cluster_id, cluster_description, code_count\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - cannot list clusters\")\n        return []\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            COUNT(DISTINCT \"SNOMEDCode\") as code_count\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        GROUP BY \"Cluster_ID\", \"Cluster_Description\"\n        ORDER BY \"Cluster_ID\"\n    '''\n\n    try:\n        results = connector.execute_dict(query)\n\n        clusters = []\n        for row in results:\n            clusters.append({\n                \"cluster_id\": row.get(\"Cluster_ID\"),\n                \"cluster_description\": row.get(\"Cluster_Description\"),\n                \"code_count\": row.get(\"code_count\", 0),\n            })\n\n        logger.info(f\"Found {len(clusters)} available SNOMED clusters\")\n        return clusters\n\n    except Exception as e:\n        logger.error(f\"Error getting available clusters: {e}\")\n        return []\n\n\ndef batch_lookup_indication_groups(\n    df: \"pd.DataFrame\",\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.\n\n    This is the efficient batch version of get_directorate_from_diagnosis().\n    Instead of querying Snowflake per patient, it batches the lookups for performance.\n\n    Strategy:\n    1. Get all unique (PersonKey, Drug Name) pairs from DataFrame\n    2. For each unique drug, get all SNOMED codes from local SQLite\n    3. Build batched Snowflake queries to check GP records\n    4. Return indication_df mapping UPID ÔåÆ Indication_Group\n\n    For unmatched patients, Indication_Group will be their Directory (with suffix).\n\n    Args:\n        df: DataFrame with columns: UPID, Drug Name, Directory, PersonKey\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        batch_size: Number of patients per Snowflake query batch\n\n    Returns:\n        DataFrame with columns: UPID, Indication_Group, Source\n        - Indication_Group: Search_Term (if matched) or \"Directory (no GP dx)\" (if not)\n        - Source: \"DIAGNOSIS\" or \"FALLBACK\"\n    \"\"\"\n    import pandas as pd\n\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    logger.info(f\"Starting batch indication lookup for {len(df)} records...\")\n\n    # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations\n    # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records\n    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records\n    if 'PseudoNHSNoLinked' not in df.columns:\n        logger.error(\"DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records\")\n        # Return fallback for all patients\n        result_df = df[['UPID', 'Directory']].drop_duplicates().copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']]\n\n    # Get unique patient-drug combinations (we need one lookup per patient-drug pair)\n    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()\n    logger.info(f\"Found {len(unique_pairs)} unique patient-drug combinations\")\n\n    # Step 2: Get all unique drugs and their SNOMED codes\n    unique_drugs = unique_pairs['Drug Name'].unique()\n    logger.info(f\"Building SNOMED lookup for {len(unique_drugs)} unique drugs...\")\n\n    # Build drug -> list of DrugSnomedMapping dict\n    drug_snomed_map: dict[str, list[DrugSnomedMapping]] = {}\n    all_snomed_codes: set[str] = set()\n    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}  # snomed -> [(drug, search_term, primary_dir), ...]\n\n    for drug_name in unique_drugs:\n        mappings = get_drug_snomed_codes(drug_name, db_manager)\n        drug_snomed_map[drug_name] = mappings\n\n        for m in mappings:\n            all_snomed_codes.add(m.snomed_code)\n            if m.snomed_code not in snomed_to_drug_searchterm:\n                snomed_to_drug_searchterm[m.snomed_code] = []\n            snomed_to_drug_searchterm[m.snomed_code].append(\n                (drug_name, m.search_term, m.primary_directorate)\n            )\n\n    logger.info(f\"Total SNOMED codes to check: {len(all_snomed_codes)}\")\n\n    # Step 3: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Step 4: Query GP records for all patients in batches\n    # The query finds the most recent matching SNOMED code for each patient\n\n    # Get unique PseudoNHSNoLinked values (each = one patient in GP records)\n    unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])\n    patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()\n\n    logger.info(f\"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...\")\n\n    # Results dict: PersonKey -> (snomed_code, event_date)\n    gp_matches: dict[str, tuple[str, Any]] = {}\n\n    # Convert SNOMED codes to list for query\n    snomed_list = list(all_snomed_codes)\n\n    if not snomed_list:\n        logger.warning(\"No SNOMED codes to check - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    # Build SNOMED IN clause (reused across batches)\n    snomed_placeholders = \", \".join([\"%s\"] * len(snomed_list))\n\n    # Process patients in batches\n    for batch_start in range(0, len(patient_pseudonyms), batch_size):\n        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n\n        logger.info(f\"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}\")\n\n        # Build patient IN clause\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Query to find all matching SNOMED codes for these patients\n        # We'll get all matches and pick the most recent per patient in Python\n        query = f'''\n            SELECT\n                \"PatientPseudonym\",\n                \"SNOMEDCode\",\n                \"EventDateTime\"\n            FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n            WHERE \"PatientPseudonym\" IN ({patient_placeholders})\n              AND \"SNOMEDCode\" IN ({snomed_placeholders})\n            ORDER BY \"PatientPseudonym\", \"EventDateTime\" DESC\n        '''\n\n        params = tuple(batch_pseudonyms) + tuple(snomed_list)\n\n        try:\n            results = connector.execute_dict(query, params)\n\n            # Process results - pick most recent per patient\n            for row in results:\n                person_key = row.get(\"PatientPseudonym\")\n                snomed_code = row.get(\"SNOMEDCode\")\n                event_date = row.get(\"EventDateTime\")\n\n                if person_key and snomed_code:\n                    # Keep only if we haven't seen this patient yet (first = most recent due to ORDER BY)\n                    if person_key not in gp_matches:\n                        gp_matches[person_key] = (snomed_code, event_date)\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch: {e}\")\n            # Continue with other batches\n\n    logger.info(f\"Found GP matches for {len(gp_matches)} patients\")\n\n    # Step 5: Build result DataFrame\n    # For each unique_pair, determine Indication_Group based on match status\n    results_list = []\n\n    # We need to dedupe by UPID - a patient might be on multiple drugs\n    # Strategy: For each UPID, use the most recent match (if any)\n    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)\n\n    for _, row in unique_pairs.iterrows():\n        upid = row['UPID']\n        drug_name = row['Drug Name']\n        patient_pseudonym = row['PseudoNHSNoLinked']\n        directory = row['Directory']\n\n        # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)\n        if patient_pseudonym in gp_matches:\n            matched_snomed, event_date = gp_matches[patient_pseudonym]\n\n            # Find the search_term for this SNOMED code and drug\n            # (A SNOMED code might map to multiple drugs with different search_terms)\n            if matched_snomed in snomed_to_drug_searchterm:\n                # Look for match with current drug first\n                search_term = None\n                for drug, st, pd in snomed_to_drug_searchterm[matched_snomed]:\n                    if drug.upper() == drug_name.upper():\n                        search_term = st\n                        break\n                # If no drug-specific match, use any match\n                if search_term is None:\n                    search_term = snomed_to_drug_searchterm[matched_snomed][0][1]\n\n                # Only update if we don't have a match for this UPID yet\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (search_term, \"DIAGNOSIS\")\n            else:\n                # Shouldn't happen but fallback just in case\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n        else:\n            # No GP match - use fallback\n            if upid not in upid_to_match:\n                upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n\n    # Build result DataFrame\n    for upid, (indication_group, source) in upid_to_match.items():\n        results_list.append({\n            'UPID': upid,\n            'Indication_Group': indication_group,\n            'Source': source,\n        })\n\n    result_df = pd.DataFrame(results_list)\n\n    # Log statistics\n    diagnosis_count = len([s for s in result_df['Source'] if s == \"DIAGNOSIS\"])\n    fallback_count = len([s for s in result_df['Source'] if s == \"FALLBACK\"])\n    total = len(result_df)\n\n    logger.info(f\"Indication lookup complete:\")\n    logger.info(f\"  Total unique patients: {total}\")\n    logger.info(f\"  DIAGNOSIS matches: {diagnosis_count} ({100*diagnosis_count/total:.1f}%)\")\n    logger.info(f\"  FALLBACK (no GP match): {fallback_count} ({100*fallback_count/total:.1f}%)\")\n\n    return result_df\n\n\n# === Drug-to-indication mapping from DimSearchTerm.csv ===\n\n\ndef load_drug_indication_mapping(\n    csv_path: Optional[str] = None,\n) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n    \"\"\"\n    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n\n    Builds two lookup dicts:\n    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n\n    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n\n    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n    Drug fragments from all rows for the same Search_Term are combined.\n\n    Args:\n        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n\n    Returns:\n        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n    \"\"\"\n    if csv_path is None:\n        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n\n    fragment_to_search_terms: dict[str, list[str]] = {}\n    search_term_to_fragments: dict[str, list[str]] = {}\n\n    try:\n        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.DictReader(f)\n            for row in reader:\n                search_term = row.get(\"Search_Term\", \"\").strip()\n                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n\n                if not search_term or not drug_names_raw:\n                    continue\n\n                fragments = [frag.strip().upper() for frag in drug_names_raw.split(\"|\") if frag.strip()]\n\n                # Build search_term -> fragments (accumulate for duplicate Search_Terms)\n                if search_term not in search_term_to_fragments:\n                    search_term_to_fragments[search_term] = []\n                for frag in fragments:\n                    if frag not in search_term_to_fragments[search_term]:\n                        search_term_to_fragments[search_term].append(frag)\n\n                # Build fragment -> search_terms\n                for frag in fragments:\n                    if frag not in fragment_to_search_terms:\n                        fragment_to_search_terms[frag] = []\n                    if search_term not in fragment_to_search_terms[frag]:\n                        fragment_to_search_terms[frag].append(search_term)\n\n        logger.info(\n            f\"Loaded drug-indication mapping: {len(search_term_to_fragments)} Search_Terms, \"\n            f\"{len(fragment_to_search_terms)} drug fragments\"\n        )\n\n    except FileNotFoundError:\n        logger.error(f\"DimSearchTerm.csv not found at {csv_path}\")\n    except Exception as e:\n        logger.error(f\"Error loading DimSearchTerm.csv: {e}\")\n\n    return fragment_to_search_terms, search_term_to_fragments\n\n\ndef get_search_terms_for_drug(\n    drug_name: str,\n    search_term_to_fragments: dict[str, list[str]],\n) -> list[str]:\n    \"\"\"\n    Get all Search_Terms that list a given drug using substring matching.\n\n    Checks if any drug fragment from DimSearchTerm is a SUBSTRING of the given\n    drug name (case-insensitive). This handles both exact matches (ADALIMUMAB)\n    and partial fragments (PEGYLATED, INHALED).\n\n    Args:\n        drug_name: HCD drug name (e.g., \"ADALIMUMAB 40MG\", \"PEGYLATED LIPOSOMAL DOXORUBICIN\")\n        search_term_to_fragments: Mapping of search_term -> list of drug fragments\n\n    Returns:\n        List of Search_Terms whose drug fragments match the drug name\n    \"\"\"\n    drug_name_upper = drug_name.upper()\n    matched_terms: list[str] = []\n\n    for search_term, fragments in search_term_to_fragments.items():\n        for frag in fragments:\n            if frag in drug_name_upper:\n                matched_terms.append(search_term)\n                break  # One matching fragment is enough for this Search_Term\n\n    return matched_terms\n\n\n# === NEW APPROACH: Query Snowflake directly using cluster CTE ===\n\n# The cluster query mapping (embedded from snomed_indication_mapping_query.sql)\n# This maps Search_Term -> Cluster_ID for ~148 clinical conditions\nCLUSTER_MAPPING_SQL = \"\"\"\nWITH SearchTermClusters AS (\n    SELECT Search_Term, Cluster_ID FROM (VALUES\n        ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),\n        ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),\n        ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),\n        ('asthma', 'AST_COD'),\n        ('allergic rhinitis', 'MILDINTAST_COD'),\n        ('alzheimer''s disease', 'DEMALZ_COD'),\n        ('amyloidosis', 'AMYLOID_COD'),\n        ('anaemia', 'eFI2_AnaemiaTimeSensitive'),\n        ('anaplastic large cell lymphoma', 'C19HAEMCAN_COD'),\n        ('apixaban', 'DOACCON_COD'),\n        ('aplastic anaemia', 'eFI2_AnaemiaEver'),\n        ('arthritis', 'eFI2_InflammatoryArthritis'),\n        ('asthma', 'eFI2_Asthma'),\n        ('atopic dermatitis', 'ATOPDERM_COD'),\n        ('atrial fibrillation', 'eFI2_AtrialFibrillation'),\n        ('attention deficit hyperactivity disorder', 'ADHD_COD'),\n        ('bipolar disorder', 'MH_COD'),\n        ('bladder', 'eFI2_UrinaryIncontinence'),\n        ('breast cancer', 'BRCANSCR_COD'),\n        ('cardiomyopathy', 'eFI2_HarmfulDrinking'),\n        ('cardiovascular disease', 'CVDRISKASS_COD'),\n        ('cervical cancer', 'CSDEC_COD'),\n        ('cholangiocarcinoma', 'eFI2_Cancer'),\n        ('chronic kidney disease', 'CKD_COD'),\n        ('chronic liver disease', 'eFI2_LiverProblems'),\n        ('chronic lymphocytic leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic myeloid leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic obstructive pulmonary disease', 'eFI2_COPD'),\n        ('colon cancer', 'eFI2_Cancer'),\n        ('colorectal cancer', 'GICANREF_COD'),\n        ('constipation', 'CHRONCONSTIP_COD'),\n        ('covid-19', 'POSSPOSTCOVID_COD'),\n        ('crohn''s disease', 'eFI2_InflammatoryBowelDisease'),\n        ('cutaneous t-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('cystic fibrosis', 'CUST_ICB_CYSTIC_FIBROSIS'),\n        ('deep vein thrombosis', 'VTE_COD'),\n        ('depression', 'eFI2_Depression'),\n        ('diabetes', 'eFI2_DiabetesEver'),\n        ('diabetic retinopathy', 'DRSELIGIBILITY_COD'),\n        ('diffuse large b-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('dravet syndrome', 'EPIL_COD'),\n        ('drug misuse', 'ILLSUBINT_COD'),\n        ('dyspepsia', 'eFI2_AbdominalPain'),\n        ('epilepsy', 'eFI2_Seizures'),\n        ('fallopian tube', 'STERIL_COD'),\n        ('follicular lymphoma', 'C19HAEMCAN_COD'),\n        ('gastric cancer', 'eFI2_Cancer'),\n        ('giant cell arteritis', 'GCA_COD'),\n        ('glioma', 'NHAEMCANMORPH_COD'),\n        ('gout', 'eFI2_InflammatoryArthritis'),\n        ('graft versus host disease', 'GVHD_COD'),\n        ('granulomatosis with polyangiitis', 'WEGENERVASC_COD'),\n        ('growth hormone deficiency', 'HYPOPITUITARY_COD'),\n        ('hand eczema', 'ECZEMA_COD'),\n        ('heart failure', 'eFI2_HeartFailure'),\n        ('hepatitis b', 'HEPBCVAC_COD'),\n        ('hepatocellular carcinoma', 'eFI2_Cancer'),\n        ('hiv', 'PREFLANG_COD'),\n        ('hodgkin lymphoma', 'HAEMCANMORPH_COD'),\n        ('hormone receptor', 'eFI2_ThyroidProblems'),\n        ('hypercholesterolaemia', 'CLASSFH_COD'),\n        ('immune thrombocytopenia', 'ITP_COD'),\n        ('influenza', 'FLUINVITE_COD'),\n        ('insomnia', 'eFI2_SleepProblems'),\n        ('irritable bowel syndrome', 'IBS_COD'),\n        ('ischaemic stroke', 'OSTR_COD'),\n        ('juvenile idiopathic arthritis', 'RARTHAD_COD'),\n        ('kidney transplant', 'RENALTRANSP_COD'),\n        ('leukaemia', 'eFI2_Cancer'),\n        ('lung cancer', 'FTCANREF_COD'),\n        ('lymphoma', 'C19HAEMCAN_COD'),\n        ('macular degeneration', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('macular oedema', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('major depressive episodes', 'eFI2_Depression'),\n        ('malignant melanoma', 'eFI2_Cancer'),\n        ('malignant pleural mesothelioma', 'LUNGCAN_COD'),\n        ('manic episode', 'MH_COD'),\n        ('mantle cell lymphoma', 'HAEMCANMORPH_COD'),\n        ('melanoma', 'eFI2_Cancer'),\n        ('merkel cell carcinoma', 'C19CAN_COD'),\n        ('migraine', 'eFI2_Headache'),\n        ('motor neurone disease', 'MND_COD'),\n        ('multiple myeloma', 'C19HAEMCAN_COD'),\n        ('multiple sclerosis', 'MS_COD'),\n        ('myelodysplastic', 'eFI2_AnaemiaEver'),\n        ('myelofibrosis', 'MDS_COD'),\n        ('myocardial infarction', 'eFI2_IschaemicHeartDisease'),\n        ('myotonia', 'CNDATRISK2_COD'),\n        ('narcolepsy', 'LD_COD'),\n        ('neuroendocrine tumour', 'LUNGCAN_COD'),\n        ('non-small cell lung cancer', 'LUNGCAN_COD'),\n        ('non-small-cell lung cancer', 'FTCANREF_COD'),\n        ('obesity', 'BMI30_COD'),\n        ('osteoarthritis', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('osteoporosis', 'eFI2_Osteoporosis'),\n        ('osteosarcoma', 'NHAEMCANMORPH_COD'),\n        ('ovarian cancer', 'C19CAN_COD'),\n        ('peripheral arterial disease', 'PADEXC_COD'),\n        ('plaque psoriasis', 'PSORIASIS_COD'),\n        ('polycystic kidney disease', 'EPPCONGMALF_COD'),\n        ('polycythaemia vera', 'C19HAEMCAN_COD'),\n        ('pregnancy', 'C19PREG_COD'),\n        ('primary biliary cholangitis', 'eFI2_LiverProblems'),\n        ('primary hypercholesterolaemia', 'FNFHYP_COD'),\n        ('prostate cancer', 'EPPSOLIDCAN_COD'),\n        ('psoriasis', 'PSORIASIS_COD'),\n        ('psoriatic arthritis', 'RARTHAD_COD'),\n        ('pulmonary embolism', 'eFI2_RespiratoryDiseaseTimeSensitive'),\n        ('pulmonary fibrosis', 'ILD_COD'),\n        ('relapsing multiple sclerosis', 'MS_COD'),\n        ('renal cell carcinoma', 'C19CAN_COD'),\n        ('renal transplantation', 'RENALTRANSP_COD'),\n        ('retinal vein occlusion', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('rheumatoid arthritis', 'eFI2_InflammatoryArthritis'),\n        ('rivaroxaban', 'DOACCON_COD'),\n        ('schizophrenia', 'MH_COD'),\n        ('seizures', 'LSZFREQ_COD'),\n        ('sepsis', 'C19ACTIVITY_COD'),\n        ('asthma', 'SEVAST_COD'),\n        ('sickle cell disease', 'SICKLE_COD'),\n        ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),\n        ('smoking cessation', 'SMOKINGINT_COD'),\n        ('soft tissue sarcoma', 'NHAEMCANMORPH_COD'),\n        ('spinal muscular atrophy', 'MND_COD'),\n        ('squamous cell', 'C19CAN_COD'),\n        ('squamous cell carcinoma', 'C19CAN_COD'),\n        ('stem cell transplant', 'ALLOTRANSP_COD'),\n        ('stroke', 'eFI2_Stroke'),\n        ('systemic lupus erythematosus', 'SLUPUS_COD'),\n        ('systemic mastocytosis', 'HAEMCANMORPH_COD'),\n        ('thrombocytopenic purpura', 'TTP_COD'),\n        ('thrombotic thrombocytopenic purpura', 'TTP_COD'),\n        ('thyroid cancer', 'C19CAN_COD'),\n        ('tophaceous gout', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('transitional cell carcinoma', 'C19CAN_COD'),\n        ('type 1 diabetes', 'DMTYPE1_COD'),\n        ('type 2 diabetes', 'DMTYPE2_COD'),\n        ('ulcerative colitis', 'eFI2_InflammatoryBowelDisease'),\n        ('urothelial carcinoma', 'NHAEMCANMORPH_COD'),\n        ('urticaria', 'XSAL_COD'),\n        ('uveitis', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('vascular disease', 'CVDINVITE_COD'),\n        ('vasculitis', 'CRYOGLOBVASC_COD')\n    ) AS t(Search_Term, Cluster_ID)\n),\n\nClusterCodes AS (\n    SELECT\n        stc.Search_Term,\n        c.\"SNOMEDCode\",\n        c.\"SNOMEDDescription\"\n    FROM SearchTermClusters stc\n    JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" c\n        ON stc.Cluster_ID = c.\"Cluster_ID\"\n    WHERE c.\"SNOMEDCode\" IS NOT NULL\n),\n\nExplicitCodes AS (\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription FROM (VALUES\n        ('acute coronary syndrome', '837091000000100', 'Manual mapping'),\n        ('ankylosing spondylitis', '162930007', 'Manual mapping'),\n        ('ankylosing spondylitis', '239805001', 'Manual mapping'),\n        ('ankylosing spondylitis', '239810002', 'Manual mapping'),\n        ('ankylosing spondylitis', '239811003', 'Manual mapping'),\n        ('ankylosing spondylitis', '394990003', 'Manual mapping'),\n        ('ankylosing spondylitis', '429712009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441562009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441680005', 'Manual mapping'),\n        ('ankylosing spondylitis', '441930001', 'Manual mapping'),\n        ('axial spondyloarthritis', '723116002', 'Manual mapping'),\n        ('choroidal neovascularisation', '380621000000102', 'Manual mapping'),\n        ('choroidal neovascularisation', '733124000', 'Manual mapping')\n    ) AS t(Search_Term, SNOMEDCode, SNOMEDDescription)\n),\n\nAllIndicationCodes AS (\n    SELECT Search_Term, \"SNOMEDCode\" AS SNOMEDCode, \"SNOMEDDescription\" AS SNOMEDDescription\n    FROM ClusterCodes\n    UNION ALL\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription\n    FROM ExplicitCodes\n)\n\"\"\"\n\n\ndef get_patient_indication_groups(\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups using Snowflake cluster query.\n\n    This function queries Snowflake directly using the embedded cluster CTE\n    (from snomed_indication_mapping_query.sql) to find patients with matching\n    GP diagnoses. This is the NEW approach replacing the old SQLite-based lookup.\n\n    The query:\n    1. Uses the cluster mapping CTE to get all Search_Term -> SNOMED code mappings\n    2. Joins with PrimaryCareClinicalCoding to find patients with matching codes\n    3. Returns the most recent match per patient (by EventDateTime)\n\n    Args:\n        patient_pseudonyms: List of PseudoNHSNoLinked values (matches PatientPseudonym in GP records)\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        batch_size: Number of patients per Snowflake query batch (default 500)\n\n    Returns:\n        DataFrame with columns:\n        - PatientPseudonym: The patient identifier (PseudoNHSNoLinked value)\n        - Search_Term: The matched indication (e.g., \"rheumatoid arthritis\")\n        - EventDateTime: Date of the GP diagnosis record\n\n        Patients not found in results have no matching GP diagnosis.\n    \"\"\"\n    import pandas as pd\n\n    logger.info(f\"Starting Snowflake-direct indication lookup for {len(patient_pseudonyms)} patients...\")\n\n    # Handle edge case: empty patient list\n    if not patient_pseudonyms:\n        logger.warning(\"Empty patient list provided\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    # Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.error(\"Snowflake connector not available - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if not is_snowflake_configured():\n        logger.error(\"Snowflake not configured - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Results list to collect all matches\n    all_results: list[dict] = []\n\n    # Process patients in batches\n    total_patients = len(patient_pseudonyms)\n    for batch_start in range(0, total_patients, batch_size):\n        batch_end = min(batch_start + batch_size, total_patients)\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n        batch_num = batch_start // batch_size + 1\n        total_batches = (total_patients + batch_size - 1) // batch_size\n\n        logger.info(f\"Batch {batch_num}/{total_batches}: patients {batch_start + 1} to {batch_end}\")\n\n        # Build patient IN clause placeholders\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Build the full query with cluster CTE\n        # This finds the most recent matching diagnosis for each patient\n        # Note: Column names must be aliased to ensure consistent casing in results\n        query = f\"\"\"\n{CLUSTER_MAPPING_SQL}\nSELECT\n    pc.\"PatientPseudonym\" AS \"PatientPseudonym\",\n    aic.Search_Term AS \"Search_Term\",\n    pc.\"EventDateTime\" AS \"EventDateTime\"\nFROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\nINNER JOIN AllIndicationCodes aic\n    ON pc.\"SNOMEDCode\" = aic.SNOMEDCode\nWHERE pc.\"PatientPseudonym\" IN ({patient_placeholders})\nQUALIFY ROW_NUMBER() OVER (\n    PARTITION BY pc.\"PatientPseudonym\"\n    ORDER BY pc.\"EventDateTime\" DESC\n) = 1\n\"\"\"\n\n        try:\n            results = connector.execute_dict(query, tuple(batch_pseudonyms))\n\n            for row in results:\n                all_results.append({\n                    'PatientPseudonym': row.get('PatientPseudonym'),\n                    'Search_Term': row.get('Search_Term'),\n                    'EventDateTime': row.get('EventDateTime'),\n                })\n\n            logger.debug(f\"Batch {batch_num}: found {len(results)} matches\")\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch {batch_num}: {e}\")\n            # Continue with other batches - partial results are better than none\n\n    # Build result DataFrame\n    result_df = pd.DataFrame(all_results)\n\n    # Log summary statistics\n    if len(result_df) > 0:\n        matched_count = len(result_df)\n        match_rate = 100 * matched_count / total_patients\n        unique_terms = result_df['Search_Term'].nunique()\n        logger.info(f\"Indication lookup complete:\")\n        logger.info(f\"  Total patients queried: {total_patients}\")\n        logger.info(f\"  Patients with GP match: {matched_count} ({match_rate:.1f}%)\")\n        logger.info(f\"  Unique Search_Terms found: {unique_terms}\")\n\n        # Log top Search_Terms\n        top_terms = result_df['Search_Term'].value_counts().head(5)\n        logger.info(f\"  Top 5 indications: {dict(top_terms)}\")\n    else:\n        logger.info(f\"Indication lookup complete: 0 matches from {total_patients} patients\")\n\n    return result_df\n\n\n# Export public API\n__all__ = [\n    # Dataclasses\n    \"ClusterSnomedCodes\",\n    \"IndicationValidationResult\",\n    \"DrugIndicationMatchRate\",\n    \"DrugSnomedMapping\",\n    \"DirectSnomedMatchResult\",\n    \"DirectorateAssignment\",\n    # Cluster-based lookup functions (existing)\n    \"get_drug_clusters\",\n    \"get_drug_cluster_ids\",\n    \"get_cluster_snomed_codes\",\n    \"patient_has_indication\",\n    \"validate_indication\",\n    \"get_indication_match_rate\",\n    \"batch_validate_indications\",\n    \"get_available_clusters\",\n    # Direct SNOMED lookup functions (new)\n    \"get_drug_snomed_codes\",\n    \"patient_has_indication_direct\",\n    # Diagnosis-based directorate assignment\n    \"get_directorate_from_diagnosis\",\n    # Batch lookup for indication groups\n    \"batch_lookup_indication_groups\",\n    # Drug-indication mapping from DimSearchTerm.csv\n    \"load_drug_indication_mapping\",\n    \"get_search_terms_for_drug\",\n    # Snowflake-direct indication lookup (new approach)\n    \"get_patient_indication_groups\",\n    \"CLUSTER_MAPPING_SQL\",\n]\n","structuredPatch":[{"oldStart":1090,"oldLines":6,"newStart":1090,"newLines":15,"lines":[" # === Drug-to-indication mapping from DimSearchTerm.csv ==="," "," ","+# Merge related Search_Terms into canonical names.","+# Asthma variants are clinically the same condition at different severity levels.","+# Urticaria is a separate condition ÔÇö do NOT merge with asthma.","+SEARCH_TERM_MERGE_MAP: dict[str, str] = {","+    \"allergic asthma\": \"asthma\",","+    \"severe persistent allergic asthma\": \"asthma\",","+}","+","+"," def load_drug_indication_mapping(","     csv_path: Optional[str] = None,"," ) -> tuple[dict[str, list[str]], dict[str, list[str]]]:"]},{"oldStart":1107,"oldLines":6,"newStart":1116,"newLines":10,"lines":["     (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).","     Drug fragments from all rows for the same Search_Term are combined."," ","+    Asthma-related Search_Terms (\"allergic asthma\", \"severe persistent allergic asthma\")","+    are merged into \"asthma\" to match the CLUSTER_MAPPING_SQL normalization.","+    \"urticaria\" stays separate.","+","     Args:","         csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv."," "]},{"oldStart":1126,"oldLines":6,"newStart":1139,"newLines":9,"lines":["                 search_term = row.get(\"Search_Term\", \"\").strip()","                 drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()"," ","+                # Normalize asthma variants to canonical \"asthma\"","+                search_term = SEARCH_TERM_MERGE_MAP.get(search_term, search_term)","+","                 if not search_term or not drug_names_raw:","                     continue"," "]}],"userModified":false,"replaceAll":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01RFvFMdxRFYJeQEbTr7K5BY","type":"message","role":"assistant","content":[{"type":"text","text":"Now I need to update the `__all__` exports to include the new constant, and update the log message to note the merge.\n\nLet me also update the `__all__` list to include `SEARCH_TERM_MERGE_MAP` for potential use by other modules."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":1401,"cache_read_input_tokens":106650,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":1401},"output_tokens":5,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"677efa41-297a-4e82-aff5-d1490a278d1a"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01RFvFMdxRFYJeQEbTr7K5BY","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01LjkjDMp18dyVzkXZucnyhe","name":"Edit","input":{"replace_all":false,"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","old_string":"    # Drug-indication mapping from DimSearchTerm.csv\n    \"load_drug_indication_mapping\",\n    \"get_search_terms_for_drug\",","new_string":"    # Drug-indication mapping from DimSearchTerm.csv\n    \"SEARCH_TERM_MERGE_MAP\",\n    \"load_drug_indication_mapping\",\n    \"get_search_terms_for_drug\","}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":1401,"cache_read_input_tokens":106650,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":1401},"output_tokens":5,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"b077bbc0-2f18-4f75-bf68-20c41ee4e096"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01LjkjDMp18dyVzkXZucnyhe","type":"tool_result","content":"The file C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py has been updated successfully."}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"142e00c0-a12b-404e-8264-ee38939280a7","tool_use_result":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","oldString":"    # Drug-indication mapping from DimSearchTerm.csv\n    \"load_drug_indication_mapping\",\n    \"get_search_terms_for_drug\",","newString":"    # Drug-indication mapping from DimSearchTerm.csv\n    \"SEARCH_TERM_MERGE_MAP\",\n    \"load_drug_indication_mapping\",\n    \"get_search_terms_for_drug\",","originalFile":"\"\"\"\nDiagnosis lookup module for NHS Patient Pathway Analysis.\n\nProvides functions to validate patient indications by checking GP diagnosis records\nagainst SNOMED cluster codes. Uses the drug-to-cluster mapping from\ndrug_indication_clusters.csv and queries Snowflake for SNOMED codes and GP records.\n\nKey workflow:\n1. Get drug's valid indication clusters from local mapping\n2. Get all SNOMED codes for those clusters from Snowflake\n3. Check if patient has any of those SNOMED codes in GP records\n4. Report indication validation status\n\nIMPORTANT: HCD activity data indication codes are UNRELIABLE. This module uses\nGP/Primary Care data (PrimaryCareClinicalCoding) as the authoritative source.\n\"\"\"\n\nfrom dataclasses import dataclass, field\nfrom datetime import date, datetime\nfrom pathlib import Path\nfrom typing import Optional, Callable, Any, cast, TYPE_CHECKING\nimport csv\n\nif TYPE_CHECKING:\n    import pandas as pd\n\nfrom core.logging_config import get_logger\nfrom data_processing.database import DatabaseManager, default_db_manager\nfrom data_processing.snowflake_connector import (\n    SnowflakeConnector,\n    get_connector,\n    is_snowflake_available,\n    is_snowflake_configured,\n    SNOWFLAKE_AVAILABLE,\n)\nfrom data_processing.cache import get_cache, is_cache_enabled\n\nlogger = get_logger(__name__)\n\n\n@dataclass\nclass ClusterSnomedCodes:\n    \"\"\"SNOMED codes for a clinical coding cluster.\"\"\"\n    cluster_id: str\n    cluster_description: str\n    snomed_codes: list[str] = field(default_factory=list)\n    snomed_descriptions: dict[str, str] = field(default_factory=dict)\n\n    @property\n    def code_count(self) -> int:\n        return len(self.snomed_codes)\n\n\n@dataclass\nclass IndicationValidationResult:\n    \"\"\"Result of validating a patient's indication for a drug.\"\"\"\n    patient_pseudonym: str\n    drug_name: str\n    has_valid_indication: bool\n    matched_cluster_id: Optional[str] = None\n    matched_snomed_code: Optional[str] = None\n    matched_snomed_description: Optional[str] = None\n    checked_clusters: list[str] = field(default_factory=list)\n    total_codes_checked: int = 0\n    source: str = \"GP_SNOMED\"  # GP_SNOMED | NONE\n    error_message: Optional[str] = None\n\n\n@dataclass\nclass DrugIndicationMatchRate:\n    \"\"\"Match rate statistics for a drug's indication validation.\"\"\"\n    drug_name: str\n    total_patients: int\n    patients_with_indication: int\n    patients_without_indication: int\n    match_rate: float  # 0.0 to 1.0\n    clusters_checked: list[str] = field(default_factory=list)\n    sample_unmatched: list[str] = field(default_factory=list)  # Sample patient IDs\n\n\n@dataclass\nclass DrugSnomedMapping:\n    \"\"\"SNOMED code mapping for a drug from ref_drug_snomed_mapping.\"\"\"\n    snomed_code: str\n    snomed_description: str\n    search_term: str\n    primary_directorate: str\n    indication: str = \"\"\n    ta_id: str = \"\"\n\n\n@dataclass\nclass DirectSnomedMatchResult:\n    \"\"\"Result of direct SNOMED code lookup in GP records.\"\"\"\n    patient_pseudonym: str\n    matched: bool\n    snomed_code: Optional[str] = None\n    snomed_description: Optional[str] = None\n    search_term: Optional[str] = None\n    primary_directorate: Optional[str] = None\n    event_date: Optional[datetime] = None\n    source: str = \"DIRECT_SNOMED\"  # DIRECT_SNOMED | NONE\n\n\n@dataclass\nclass DirectorateAssignment:\n    \"\"\"Result of directorate assignment for a patient-drug combination.\"\"\"\n    upid: str\n    drug_name: str\n    directorate: Optional[str]\n    search_term: Optional[str] = None\n    source: str = \"FALLBACK\"  # DIAGNOSIS | FALLBACK\n    snomed_code: Optional[str] = None\n    event_date: Optional[datetime] = None\n\n\ndef get_drug_clusters(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[dict]:\n    \"\"\"\n    Get all SNOMED cluster mappings for a drug from local SQLite.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of dicts with keys: drug_name, indication, cluster_id,\n        cluster_description, nice_ta_reference\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT drug_name, indication, cluster_id, cluster_description, nice_ta_reference\n        FROM ref_drug_indication_clusters\n        WHERE UPPER(drug_name) = UPPER(?)\n        ORDER BY indication, cluster_id\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name,))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append({\n                    \"drug_name\": row[\"drug_name\"],\n                    \"indication\": row[\"indication\"],\n                    \"cluster_id\": row[\"cluster_id\"],\n                    \"cluster_description\": row[\"cluster_description\"],\n                    \"nice_ta_reference\": row[\"nice_ta_reference\"],\n                })\n\n            logger.debug(f\"Found {len(results)} cluster mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting clusters for drug '{drug_name}': {e}\")\n        return []\n\n\ndef get_drug_cluster_ids(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[str]:\n    \"\"\"\n    Get unique cluster IDs for a drug.\n\n    Args:\n        drug_name: Drug name to look up\n        db_manager: Optional DatabaseManager\n\n    Returns:\n        List of unique cluster IDs\n    \"\"\"\n    clusters = get_drug_clusters(drug_name, db_manager)\n    return list(set(c[\"cluster_id\"] for c in clusters))\n\n\ndef get_drug_snomed_codes(\n    drug_name: str,\n    db_manager: Optional[DatabaseManager] = None\n) -> list[DrugSnomedMapping]:\n    \"\"\"\n    Get all SNOMED codes for a drug from local ref_drug_snomed_mapping table.\n\n    This uses the enriched mapping CSV data loaded into SQLite, which provides\n    direct SNOMED-to-drug mappings with Search_Term and PrimaryDirectorate.\n\n    Args:\n        drug_name: Drug name to look up (case-insensitive, matches cleaned_drug_name)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n\n    Returns:\n        List of DrugSnomedMapping with snomed_code, snomed_description,\n        search_term, primary_directorate, indication, ta_id\n    \"\"\"\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    query = \"\"\"\n        SELECT DISTINCT\n            snomed_code,\n            snomed_description,\n            search_term,\n            primary_directorate,\n            indication,\n            ta_id\n        FROM ref_drug_snomed_mapping\n        WHERE UPPER(cleaned_drug_name) = UPPER(?)\n           OR UPPER(drug_name) = UPPER(?)\n        ORDER BY search_term, snomed_code\n    \"\"\"\n\n    try:\n        with db_manager.get_connection() as conn:\n            cursor = conn.execute(query, (drug_name, drug_name))\n            rows = cursor.fetchall()\n\n            results = []\n            for row in rows:\n                results.append(DrugSnomedMapping(\n                    snomed_code=row[\"snomed_code\"],\n                    snomed_description=row[\"snomed_description\"] or \"\",\n                    search_term=row[\"search_term\"] or \"\",\n                    primary_directorate=row[\"primary_directorate\"] or \"\",\n                    indication=row[\"indication\"] or \"\",\n                    ta_id=row[\"ta_id\"] or \"\",\n                ))\n\n            logger.debug(f\"Found {len(results)} SNOMED mappings for drug '{drug_name}'\")\n            return results\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for drug '{drug_name}': {e}\")\n        return []\n\n\ndef patient_has_indication_direct(\n    patient_pseudonym: str,\n    drug_snomed_mappings: list[DrugSnomedMapping],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> DirectSnomedMatchResult:\n    \"\"\"\n    Check if patient has any of the SNOMED codes in their GP records.\n\n    This is the direct SNOMED lookup - it queries PrimaryCareClinicalCoding\n    for exact SNOMED code matches (not via cluster). Returns the most recent\n    match by EventDateTime if multiple matches exist.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_snomed_mappings: List of DrugSnomedMapping from get_drug_snomed_codes()\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectSnomedMatchResult with match details (most recent by EventDateTime)\n    \"\"\"\n    result = DirectSnomedMatchResult(\n        patient_pseudonym=patient_pseudonym,\n        matched=False,\n        source=\"NONE\",\n    )\n\n    if not drug_snomed_mappings:\n        return result\n\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot check GP records\")\n        return result\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build lookup dict for mapping snomed_code -> (search_term, primary_directorate, snomed_description)\n    snomed_lookup = {\n        m.snomed_code: (m.search_term, m.primary_directorate, m.snomed_description)\n        for m in drug_snomed_mappings\n    }\n\n    # Get unique SNOMED codes\n    snomed_codes = list(snomed_lookup.keys())\n\n    # Build placeholders for SNOMED codes\n    placeholders = \", \".join([\"%s\"] * len(snomed_codes))\n\n    # Query to find most recent matching SNOMED code in GP records\n    query = f'''\n        SELECT\n            \"SNOMEDCode\",\n            \"EventDateTime\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n        WHERE \"PatientPseudonym\" = %s\n            AND \"SNOMEDCode\" IN ({placeholders})\n    '''\n\n    params: list = [patient_pseudonym] + snomed_codes\n\n    if before_date:\n        query += ' AND \"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' ORDER BY \"EventDateTime\" DESC LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            matched_code = row.get(\"SNOMEDCode\")\n            event_dt = row.get(\"EventDateTime\")\n\n            if matched_code and matched_code in snomed_lookup:\n                search_term, primary_dir, snomed_desc = snomed_lookup[matched_code]\n\n                return DirectSnomedMatchResult(\n                    patient_pseudonym=patient_pseudonym,\n                    matched=True,\n                    snomed_code=matched_code,\n                    snomed_description=snomed_desc,\n                    search_term=search_term,\n                    primary_directorate=primary_dir,\n                    event_date=event_dt,\n                    source=\"DIRECT_SNOMED\",\n                )\n\n        return result\n\n    except Exception as e:\n        logger.error(f\"Error checking direct SNOMED for patient '{patient_pseudonym}': {e}\")\n        return result\n\n\ndef get_directorate_from_diagnosis(\n    upid: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> DirectorateAssignment:\n    \"\"\"\n    Get directorate assignment for a patient-drug combination using diagnosis-based lookup.\n\n    This function attempts to assign a directorate based on the patient's GP records\n    (direct SNOMED code matching). If no match is found, it returns a FALLBACK result\n    indicating that the caller should use alternative assignment methods (e.g.,\n    department_identification() from tools/data.py).\n\n    Workflow:\n    1. Get all SNOMED codes for the drug from ref_drug_snomed_mapping\n    2. Query patient's GP records for matching SNOMED codes\n    3. If match found ÔåÆ return diagnosis-based directorate and search_term\n    4. If no match ÔåÆ return FALLBACK result (caller handles fallback logic)\n\n    Args:\n        upid: Patient's unique patient ID (Provider Code[:3] + PersonKey)\n        drug_name: Drug name to look up\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        DirectorateAssignment with directorate, search_term, and source\n    \"\"\"\n    result = DirectorateAssignment(\n        upid=upid,\n        drug_name=drug_name,\n        directorate=None,\n        source=\"FALLBACK\",\n    )\n\n    # Step 1: Get SNOMED codes for the drug\n    drug_snomed_mappings = get_drug_snomed_codes(drug_name, db_manager)\n\n    if not drug_snomed_mappings:\n        logger.debug(f\"No SNOMED mappings found for drug '{drug_name}' - using fallback\")\n        return result\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.debug(\"Snowflake not available - using fallback\")\n        return result\n\n    if not is_snowflake_configured():\n        logger.debug(\"Snowflake not configured - using fallback\")\n        return result\n\n    # Step 3: Get patient pseudonym from UPID\n    # UPID format is Provider Code (3 chars) + PersonKey\n    # We need to query Snowflake to get the PatientPseudonym for this PersonKey\n    # However, patient_has_indication_direct expects PatientPseudonym, not UPID\n    # For now, we'll use UPID as the identifier - the actual integration\n    # will need to happen at the DataFrame level where we have PersonKey\n    #\n    # NOTE: This function will be called from the pipeline where we have\n    # access to PatientPseudonym. The UPID is passed for logging/tracking.\n\n    # Actually, looking at the pipeline, we need PatientPseudonym, not UPID.\n    # The caller should pass the PatientPseudonym or we need to look it up.\n    # For now, let's assume the caller will use this in a batch context\n    # where they can map UPID -> PatientPseudonym.\n\n    # Let me reconsider: the function signature takes UPID but we need\n    # PatientPseudonym for Snowflake. In the pipeline context (fetch_and_transform_data),\n    # we'll have the PersonKey column which IS the PatientPseudonym.\n    # So UPID = ProviderCode[:3] + PersonKey, and PersonKey = PatientPseudonym.\n    #\n    # We can extract PatientPseudonym from UPID by removing the first 3 chars.\n    patient_pseudonym = upid[3:] if len(upid) > 3 else upid\n\n    # Step 4: Check patient's GP records for matching SNOMED codes\n    match_result = patient_has_indication_direct(\n        patient_pseudonym=patient_pseudonym,\n        drug_snomed_mappings=drug_snomed_mappings,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    if match_result.matched and match_result.primary_directorate:\n        return DirectorateAssignment(\n            upid=upid,\n            drug_name=drug_name,\n            directorate=match_result.primary_directorate,\n            search_term=match_result.search_term,\n            source=\"DIAGNOSIS\",\n            snomed_code=match_result.snomed_code,\n            event_date=match_result.event_date,\n        )\n\n    # No match found - return fallback result\n    return result\n\n\ndef get_cluster_snomed_codes(\n    cluster_id: str,\n    connector: Optional[SnowflakeConnector] = None,\n    use_cache: bool = True,\n) -> ClusterSnomedCodes:\n    \"\"\"\n    Get all SNOMED codes for a cluster from Snowflake.\n\n    Queries the ClinicalCodingClusterSnomedCodes table to get all SNOMED codes\n    that belong to the specified cluster.\n\n    Args:\n        cluster_id: Cluster ID to look up (e.g., 'RARTH_COD', 'PSORIASIS_COD')\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        use_cache: Whether to use cached results (default True)\n\n    Returns:\n        ClusterSnomedCodes with list of SNOMED codes and descriptions\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE:\n        logger.warning(\"Snowflake connector not available\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    if not is_snowflake_configured():\n        logger.warning(\"Snowflake not configured - cannot get cluster codes\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n    # Check cache first\n    cache_key = f\"cluster_snomed_{cluster_id}\"\n    if use_cache and is_cache_enabled():\n        cache = get_cache()\n        cached = cache.get(cache_key)\n        if cached is not None and len(cached) > 0:\n            logger.debug(f\"Using cached SNOMED codes for cluster '{cluster_id}'\")\n            cached_dict = cached[0]  # First element is our data dict\n            return ClusterSnomedCodes(\n                cluster_id=cluster_id,\n                cluster_description=str(cached_dict.get(\"description\", \"\")),\n                snomed_codes=list(cached_dict.get(\"codes\", [])),\n                snomed_descriptions=dict(cached_dict.get(\"descriptions\", {})),\n            )\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT DISTINCT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            \"SNOMEDCode\",\n            \"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        WHERE \"Cluster_ID\" = %s\n        ORDER BY \"SNOMEDCode\"\n    '''\n\n    try:\n        results = connector.execute_dict(query, (cluster_id,))\n\n        if not results:\n            logger.warning(f\"No SNOMED codes found for cluster '{cluster_id}'\")\n            return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n        codes = []\n        descriptions = {}\n        description = results[0].get(\"Cluster_Description\", \"\") if results else \"\"\n\n        for row in results:\n            code = row.get(\"SNOMEDCode\")\n            if code:\n                codes.append(code)\n                descriptions[code] = row.get(\"SNOMEDDescription\", \"\")\n\n        logger.info(f\"Found {len(codes)} SNOMED codes for cluster '{cluster_id}'\")\n\n        # Cache the results (using query-based cache with fake params)\n        if use_cache and is_cache_enabled():\n            cache = get_cache()\n            cache_data = [{\n                \"description\": description,\n                \"codes\": codes,\n                \"descriptions\": descriptions,\n            }]\n            cache.set(cache_key, None, cache_data)  # type: ignore[arg-type]\n\n        return ClusterSnomedCodes(\n            cluster_id=cluster_id,\n            cluster_description=description,\n            snomed_codes=codes,\n            snomed_descriptions=descriptions,\n        )\n\n    except Exception as e:\n        logger.error(f\"Error getting SNOMED codes for cluster '{cluster_id}': {e}\")\n        return ClusterSnomedCodes(cluster_id=cluster_id, cluster_description=\"\")\n\n\ndef patient_has_indication(\n    patient_pseudonym: str,\n    cluster_ids: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    before_date: Optional[date] = None,\n) -> tuple[bool, Optional[str], Optional[str], Optional[str]]:\n    \"\"\"\n    Check if a patient has any SNOMED codes from the specified clusters in GP records.\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        cluster_ids: List of cluster IDs to check against\n        connector: Optional SnowflakeConnector\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        Tuple of (has_indication, matched_cluster_id, matched_snomed_code, matched_description)\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        return False, None, None, None\n\n    if not cluster_ids:\n        return False, None, None, None\n\n    if connector is None:\n        connector = get_connector()\n\n    # Build placeholders for cluster IDs\n    placeholders = \", \".join([\"%s\"] * len(cluster_ids))\n\n    # Query to check if patient has any matching SNOMED code\n    query = f'''\n        SELECT\n            pc.\"SNOMEDCode\",\n            cc.\"Cluster_ID\",\n            cc.\"SNOMEDDescription\"\n        FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\n        INNER JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" cc\n            ON pc.\"SNOMEDCode\" = cc.\"SNOMEDCode\"\n        WHERE pc.\"PatientPseudonym\" = %s\n            AND cc.\"Cluster_ID\" IN ({placeholders})\n    '''\n\n    params = [patient_pseudonym] + cluster_ids\n\n    if before_date:\n        query += ' AND pc.\"EventDateTime\" < %s'\n        params.append(before_date.isoformat())\n\n    query += ' LIMIT 1'\n\n    try:\n        results = connector.execute_dict(query, tuple(params))\n\n        if results:\n            row = results[0]\n            return (\n                True,\n                row.get(\"Cluster_ID\"),\n                row.get(\"SNOMEDCode\"),\n                row.get(\"SNOMEDDescription\"),\n            )\n\n        return False, None, None, None\n\n    except Exception as e:\n        logger.error(f\"Error checking indication for patient '{patient_pseudonym}': {e}\")\n        return False, None, None, None\n\n\ndef validate_indication(\n    patient_pseudonym: str,\n    drug_name: str,\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    before_date: Optional[date] = None,\n) -> IndicationValidationResult:\n    \"\"\"\n    Validate that a patient has an appropriate indication for a drug.\n\n    Full validation workflow:\n    1. Get drug's valid indication clusters from local mapping\n    2. Check if patient has any matching SNOMED codes in GP records\n    3. Return detailed validation result\n\n    Args:\n        patient_pseudonym: Patient's pseudonymised NHS number\n        drug_name: Drug name to validate indication for\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        before_date: Optional date - only check diagnoses before this date\n\n    Returns:\n        IndicationValidationResult with validation details\n    \"\"\"\n    result = IndicationValidationResult(\n        patient_pseudonym=patient_pseudonym,\n        drug_name=drug_name,\n        has_valid_indication=False,\n    )\n\n    # Step 1: Get drug's cluster mappings\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    if not cluster_ids:\n        result.error_message = f\"No cluster mappings found for drug '{drug_name}'\"\n        result.source = \"NONE\"\n        return result\n\n    result.checked_clusters = cluster_ids\n\n    # Step 2: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        result.error_message = \"Snowflake connector not installed\"\n        result.source = \"NONE\"\n        return result\n\n    if not is_snowflake_configured():\n        result.error_message = \"Snowflake not configured\"\n        result.source = \"NONE\"\n        return result\n\n    # Step 3: Check patient GP records\n    has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n        patient_pseudonym=patient_pseudonym,\n        cluster_ids=cluster_ids,\n        connector=connector,\n        before_date=before_date,\n    )\n\n    result.has_valid_indication = has_indication\n    result.matched_cluster_id = matched_cluster\n    result.matched_snomed_code = matched_code\n    result.matched_snomed_description = matched_desc\n    result.source = \"GP_SNOMED\" if has_indication else \"NONE\"\n\n    return result\n\n\ndef get_indication_match_rate(\n    drug_name: str,\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    sample_unmatched_count: int = 10,\n) -> DrugIndicationMatchRate:\n    \"\"\"\n    Calculate indication match rate for a drug across a list of patients.\n\n    Args:\n        drug_name: Drug name to check\n        patient_pseudonyms: List of patient pseudonymised NHS numbers\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        sample_unmatched_count: Number of unmatched patient IDs to include in sample\n\n    Returns:\n        DrugIndicationMatchRate with match statistics\n    \"\"\"\n    if connector is None and SNOWFLAKE_AVAILABLE and is_snowflake_configured():\n        connector = get_connector()\n\n    cluster_ids = get_drug_cluster_ids(drug_name, db_manager)\n\n    total = len(patient_pseudonyms)\n    matched = 0\n    unmatched = 0\n    sample_unmatched: list[str] = []\n\n    if not cluster_ids:\n        logger.warning(f\"No cluster mappings for drug '{drug_name}' - all patients will be unmatched\")\n        return DrugIndicationMatchRate(\n            drug_name=drug_name,\n            total_patients=total,\n            patients_with_indication=0,\n            patients_without_indication=total,\n            match_rate=0.0,\n            clusters_checked=[],\n            sample_unmatched=patient_pseudonyms[:sample_unmatched_count],\n        )\n\n    for i, pseudonym in enumerate(patient_pseudonyms):\n        if i > 0 and i % 100 == 0:\n            logger.info(f\"Validating indications: {i}/{total} ({100*i/total:.1f}%)\")\n\n        has_indication, _, _, _ = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        if has_indication:\n            matched += 1\n        else:\n            unmatched += 1\n            if len(sample_unmatched) < sample_unmatched_count:\n                sample_unmatched.append(pseudonym)\n\n    match_rate = matched / total if total > 0 else 0.0\n\n    logger.info(f\"Indication match rate for '{drug_name}': {100*match_rate:.1f}% ({matched}/{total})\")\n\n    return DrugIndicationMatchRate(\n        drug_name=drug_name,\n        total_patients=total,\n        patients_with_indication=matched,\n        patients_without_indication=unmatched,\n        match_rate=match_rate,\n        clusters_checked=cluster_ids,\n        sample_unmatched=sample_unmatched,\n    )\n\n\ndef batch_validate_indications(\n    patient_drug_pairs: list[tuple[str, str]],\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    progress_callback: Optional[Callable[[int, int], None]] = None,\n) -> list[IndicationValidationResult]:\n    \"\"\"\n    Validate indications for multiple patient-drug pairs efficiently.\n\n    Args:\n        patient_drug_pairs: List of (patient_pseudonym, drug_name) tuples\n        connector: Optional SnowflakeConnector\n        db_manager: Optional DatabaseManager\n        progress_callback: Optional callback(current, total) for progress updates\n\n    Returns:\n        List of IndicationValidationResult for each pair\n    \"\"\"\n    results = []\n    total = len(patient_drug_pairs)\n\n    # Cache cluster lookups by drug\n    drug_clusters_cache = {}\n\n    for i, (pseudonym, drug_name) in enumerate(patient_drug_pairs):\n        if progress_callback:\n            progress_callback(i + 1, total)\n\n        # Get clusters from cache or lookup\n        drug_upper = drug_name.upper()\n        if drug_upper not in drug_clusters_cache:\n            drug_clusters_cache[drug_upper] = get_drug_cluster_ids(drug_name, db_manager)\n\n        cluster_ids = drug_clusters_cache[drug_upper]\n\n        if not cluster_ids:\n            results.append(IndicationValidationResult(\n                patient_pseudonym=pseudonym,\n                drug_name=drug_name,\n                has_valid_indication=False,\n                source=\"NONE\",\n                error_message=f\"No cluster mappings for drug '{drug_name}'\",\n            ))\n            continue\n\n        # Check patient indication\n        has_indication, matched_cluster, matched_code, matched_desc = patient_has_indication(\n            patient_pseudonym=pseudonym,\n            cluster_ids=cluster_ids,\n            connector=connector,\n        )\n\n        results.append(IndicationValidationResult(\n            patient_pseudonym=pseudonym,\n            drug_name=drug_name,\n            has_valid_indication=has_indication,\n            matched_cluster_id=matched_cluster,\n            matched_snomed_code=matched_code,\n            matched_snomed_description=matched_desc,\n            checked_clusters=cluster_ids,\n            source=\"GP_SNOMED\" if has_indication else \"NONE\",\n        ))\n\n    matched_count = sum(1 for r in results if r.has_valid_indication)\n    logger.info(f\"Batch validation complete: {matched_count}/{total} ({100*matched_count/total:.1f}%) with valid indications\")\n\n    return results\n\n\ndef get_available_clusters(\n    connector: Optional[SnowflakeConnector] = None,\n) -> list[dict]:\n    \"\"\"\n    Get list of all available SNOMED clusters from Snowflake.\n\n    Returns:\n        List of dicts with cluster_id, cluster_description, code_count\n    \"\"\"\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - cannot list clusters\")\n        return []\n\n    if connector is None:\n        connector = get_connector()\n\n    query = '''\n        SELECT\n            \"Cluster_ID\",\n            \"Cluster_Description\",\n            COUNT(DISTINCT \"SNOMEDCode\") as code_count\n        FROM DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\"\n        GROUP BY \"Cluster_ID\", \"Cluster_Description\"\n        ORDER BY \"Cluster_ID\"\n    '''\n\n    try:\n        results = connector.execute_dict(query)\n\n        clusters = []\n        for row in results:\n            clusters.append({\n                \"cluster_id\": row.get(\"Cluster_ID\"),\n                \"cluster_description\": row.get(\"Cluster_Description\"),\n                \"code_count\": row.get(\"code_count\", 0),\n            })\n\n        logger.info(f\"Found {len(clusters)} available SNOMED clusters\")\n        return clusters\n\n    except Exception as e:\n        logger.error(f\"Error getting available clusters: {e}\")\n        return []\n\n\ndef batch_lookup_indication_groups(\n    df: \"pd.DataFrame\",\n    connector: Optional[SnowflakeConnector] = None,\n    db_manager: Optional[DatabaseManager] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups for a DataFrame of patients.\n\n    This is the efficient batch version of get_directorate_from_diagnosis().\n    Instead of querying Snowflake per patient, it batches the lookups for performance.\n\n    Strategy:\n    1. Get all unique (PersonKey, Drug Name) pairs from DataFrame\n    2. For each unique drug, get all SNOMED codes from local SQLite\n    3. Build batched Snowflake queries to check GP records\n    4. Return indication_df mapping UPID ÔåÆ Indication_Group\n\n    For unmatched patients, Indication_Group will be their Directory (with suffix).\n\n    Args:\n        df: DataFrame with columns: UPID, Drug Name, Directory, PersonKey\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        db_manager: Optional DatabaseManager (defaults to default_db_manager)\n        batch_size: Number of patients per Snowflake query batch\n\n    Returns:\n        DataFrame with columns: UPID, Indication_Group, Source\n        - Indication_Group: Search_Term (if matched) or \"Directory (no GP dx)\" (if not)\n        - Source: \"DIAGNOSIS\" or \"FALLBACK\"\n    \"\"\"\n    import pandas as pd\n\n    if db_manager is None:\n        db_manager = default_db_manager\n\n    logger.info(f\"Starting batch indication lookup for {len(df)} records...\")\n\n    # Step 1: Get unique (UPID, Drug Name, PseudoNHSNoLinked, Directory) combinations\n    # We need PseudoNHSNoLinked to query Snowflake - this matches PatientPseudonym in GP records\n    # Note: PersonKey is LocalPatientID which is provider-specific and does NOT match GP records\n    if 'PseudoNHSNoLinked' not in df.columns:\n        logger.error(\"DataFrame missing 'PseudoNHSNoLinked' column - cannot lookup GP records\")\n        # Return fallback for all patients\n        result_df = df[['UPID', 'Directory']].drop_duplicates().copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']]\n\n    # Get unique patient-drug combinations (we need one lookup per patient-drug pair)\n    unique_pairs = df[['UPID', 'Drug Name', 'PseudoNHSNoLinked', 'Directory']].drop_duplicates()\n    logger.info(f\"Found {len(unique_pairs)} unique patient-drug combinations\")\n\n    # Step 2: Get all unique drugs and their SNOMED codes\n    unique_drugs = unique_pairs['Drug Name'].unique()\n    logger.info(f\"Building SNOMED lookup for {len(unique_drugs)} unique drugs...\")\n\n    # Build drug -> list of DrugSnomedMapping dict\n    drug_snomed_map: dict[str, list[DrugSnomedMapping]] = {}\n    all_snomed_codes: set[str] = set()\n    snomed_to_drug_searchterm: dict[str, list[tuple[str, str, str]]] = {}  # snomed -> [(drug, search_term, primary_dir), ...]\n\n    for drug_name in unique_drugs:\n        mappings = get_drug_snomed_codes(drug_name, db_manager)\n        drug_snomed_map[drug_name] = mappings\n\n        for m in mappings:\n            all_snomed_codes.add(m.snomed_code)\n            if m.snomed_code not in snomed_to_drug_searchterm:\n                snomed_to_drug_searchterm[m.snomed_code] = []\n            snomed_to_drug_searchterm[m.snomed_code].append(\n                (drug_name, m.search_term, m.primary_directorate)\n            )\n\n    logger.info(f\"Total SNOMED codes to check: {len(all_snomed_codes)}\")\n\n    # Step 3: Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE or not is_snowflake_configured():\n        logger.warning(\"Snowflake not available - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Step 4: Query GP records for all patients in batches\n    # The query finds the most recent matching SNOMED code for each patient\n\n    # Get unique PseudoNHSNoLinked values (each = one patient in GP records)\n    unique_patients = unique_pairs[['PseudoNHSNoLinked', 'UPID', 'Directory']].drop_duplicates(subset=['PseudoNHSNoLinked'])\n    patient_pseudonyms = unique_patients['PseudoNHSNoLinked'].tolist()\n\n    logger.info(f\"Querying GP records for {len(patient_pseudonyms)} unique patients in batches of {batch_size}...\")\n\n    # Results dict: PersonKey -> (snomed_code, event_date)\n    gp_matches: dict[str, tuple[str, Any]] = {}\n\n    # Convert SNOMED codes to list for query\n    snomed_list = list(all_snomed_codes)\n\n    if not snomed_list:\n        logger.warning(\"No SNOMED codes to check - returning fallback for all patients\")\n        result_df = unique_pairs[['UPID', 'Directory']].copy()\n        result_df['Indication_Group'] = result_df['Directory'] + \" (no GP dx)\"\n        result_df['Source'] = \"FALLBACK\"\n        return result_df[['UPID', 'Indication_Group', 'Source']].drop_duplicates(subset=['UPID'])\n\n    # Build SNOMED IN clause (reused across batches)\n    snomed_placeholders = \", \".join([\"%s\"] * len(snomed_list))\n\n    # Process patients in batches\n    for batch_start in range(0, len(patient_pseudonyms), batch_size):\n        batch_end = min(batch_start + batch_size, len(patient_pseudonyms))\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n\n        logger.info(f\"Batch {batch_start//batch_size + 1}: patients {batch_start} to {batch_end}\")\n\n        # Build patient IN clause\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Query to find all matching SNOMED codes for these patients\n        # We'll get all matches and pick the most recent per patient in Python\n        query = f'''\n            SELECT\n                \"PatientPseudonym\",\n                \"SNOMEDCode\",\n                \"EventDateTime\"\n            FROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\"\n            WHERE \"PatientPseudonym\" IN ({patient_placeholders})\n              AND \"SNOMEDCode\" IN ({snomed_placeholders})\n            ORDER BY \"PatientPseudonym\", \"EventDateTime\" DESC\n        '''\n\n        params = tuple(batch_pseudonyms) + tuple(snomed_list)\n\n        try:\n            results = connector.execute_dict(query, params)\n\n            # Process results - pick most recent per patient\n            for row in results:\n                person_key = row.get(\"PatientPseudonym\")\n                snomed_code = row.get(\"SNOMEDCode\")\n                event_date = row.get(\"EventDateTime\")\n\n                if person_key and snomed_code:\n                    # Keep only if we haven't seen this patient yet (first = most recent due to ORDER BY)\n                    if person_key not in gp_matches:\n                        gp_matches[person_key] = (snomed_code, event_date)\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch: {e}\")\n            # Continue with other batches\n\n    logger.info(f\"Found GP matches for {len(gp_matches)} patients\")\n\n    # Step 5: Build result DataFrame\n    # For each unique_pair, determine Indication_Group based on match status\n    results_list = []\n\n    # We need to dedupe by UPID - a patient might be on multiple drugs\n    # Strategy: For each UPID, use the most recent match (if any)\n    upid_to_match: dict[str, tuple[str, str]] = {}  # UPID -> (Indication_Group, Source)\n\n    for _, row in unique_pairs.iterrows():\n        upid = row['UPID']\n        drug_name = row['Drug Name']\n        patient_pseudonym = row['PseudoNHSNoLinked']\n        directory = row['Directory']\n\n        # Check if patient has GP match (using PseudoNHSNoLinked which matches PatientPseudonym in GP)\n        if patient_pseudonym in gp_matches:\n            matched_snomed, event_date = gp_matches[patient_pseudonym]\n\n            # Find the search_term for this SNOMED code and drug\n            # (A SNOMED code might map to multiple drugs with different search_terms)\n            if matched_snomed in snomed_to_drug_searchterm:\n                # Look for match with current drug first\n                search_term = None\n                for drug, st, pd in snomed_to_drug_searchterm[matched_snomed]:\n                    if drug.upper() == drug_name.upper():\n                        search_term = st\n                        break\n                # If no drug-specific match, use any match\n                if search_term is None:\n                    search_term = snomed_to_drug_searchterm[matched_snomed][0][1]\n\n                # Only update if we don't have a match for this UPID yet\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (search_term, \"DIAGNOSIS\")\n            else:\n                # Shouldn't happen but fallback just in case\n                if upid not in upid_to_match:\n                    upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n        else:\n            # No GP match - use fallback\n            if upid not in upid_to_match:\n                upid_to_match[upid] = (directory + \" (no GP dx)\", \"FALLBACK\")\n\n    # Build result DataFrame\n    for upid, (indication_group, source) in upid_to_match.items():\n        results_list.append({\n            'UPID': upid,\n            'Indication_Group': indication_group,\n            'Source': source,\n        })\n\n    result_df = pd.DataFrame(results_list)\n\n    # Log statistics\n    diagnosis_count = len([s for s in result_df['Source'] if s == \"DIAGNOSIS\"])\n    fallback_count = len([s for s in result_df['Source'] if s == \"FALLBACK\"])\n    total = len(result_df)\n\n    logger.info(f\"Indication lookup complete:\")\n    logger.info(f\"  Total unique patients: {total}\")\n    logger.info(f\"  DIAGNOSIS matches: {diagnosis_count} ({100*diagnosis_count/total:.1f}%)\")\n    logger.info(f\"  FALLBACK (no GP match): {fallback_count} ({100*fallback_count/total:.1f}%)\")\n\n    return result_df\n\n\n# === Drug-to-indication mapping from DimSearchTerm.csv ===\n\n\n# Merge related Search_Terms into canonical names.\n# Asthma variants are clinically the same condition at different severity levels.\n# Urticaria is a separate condition ÔÇö do NOT merge with asthma.\nSEARCH_TERM_MERGE_MAP: dict[str, str] = {\n    \"allergic asthma\": \"asthma\",\n    \"severe persistent allergic asthma\": \"asthma\",\n}\n\n\ndef load_drug_indication_mapping(\n    csv_path: Optional[str] = None,\n) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n    \"\"\"\n    Load the drug-to-Search_Term mapping from DimSearchTerm.csv.\n\n    Builds two lookup dicts:\n    - fragment_to_search_terms: drug fragment (UPPERCASE) -> list of Search_Terms containing it\n    - search_term_to_fragments: search_term -> list of drug fragments (UPPERCASE)\n\n    DimSearchTerm.csv has columns: Search_Term, CleanedDrugName, PrimaryDirectorate\n    CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\").\n\n    Note: A Search_Term can appear multiple times with different PrimaryDirectorates\n    (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n    Drug fragments from all rows for the same Search_Term are combined.\n\n    Asthma-related Search_Terms (\"allergic asthma\", \"severe persistent allergic asthma\")\n    are merged into \"asthma\" to match the CLUSTER_MAPPING_SQL normalization.\n    \"urticaria\" stays separate.\n\n    Args:\n        csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n\n    Returns:\n        Tuple of (fragment_to_search_terms, search_term_to_fragments)\n    \"\"\"\n    if csv_path is None:\n        csv_path = str(Path(__file__).parent.parent / \"data\" / \"DimSearchTerm.csv\")\n\n    fragment_to_search_terms: dict[str, list[str]] = {}\n    search_term_to_fragments: dict[str, list[str]] = {}\n\n    try:\n        with open(csv_path, \"r\", encoding=\"utf-8\") as f:\n            reader = csv.DictReader(f)\n            for row in reader:\n                search_term = row.get(\"Search_Term\", \"\").strip()\n                drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n\n                # Normalize asthma variants to canonical \"asthma\"\n                search_term = SEARCH_TERM_MERGE_MAP.get(search_term, search_term)\n\n                if not search_term or not drug_names_raw:\n                    continue\n\n                fragments = [frag.strip().upper() for frag in drug_names_raw.split(\"|\") if frag.strip()]\n\n                # Build search_term -> fragments (accumulate for duplicate Search_Terms)\n                if search_term not in search_term_to_fragments:\n                    search_term_to_fragments[search_term] = []\n                for frag in fragments:\n                    if frag not in search_term_to_fragments[search_term]:\n                        search_term_to_fragments[search_term].append(frag)\n\n                # Build fragment -> search_terms\n                for frag in fragments:\n                    if frag not in fragment_to_search_terms:\n                        fragment_to_search_terms[frag] = []\n                    if search_term not in fragment_to_search_terms[frag]:\n                        fragment_to_search_terms[frag].append(search_term)\n\n        logger.info(\n            f\"Loaded drug-indication mapping: {len(search_term_to_fragments)} Search_Terms, \"\n            f\"{len(fragment_to_search_terms)} drug fragments\"\n        )\n\n    except FileNotFoundError:\n        logger.error(f\"DimSearchTerm.csv not found at {csv_path}\")\n    except Exception as e:\n        logger.error(f\"Error loading DimSearchTerm.csv: {e}\")\n\n    return fragment_to_search_terms, search_term_to_fragments\n\n\ndef get_search_terms_for_drug(\n    drug_name: str,\n    search_term_to_fragments: dict[str, list[str]],\n) -> list[str]:\n    \"\"\"\n    Get all Search_Terms that list a given drug using substring matching.\n\n    Checks if any drug fragment from DimSearchTerm is a SUBSTRING of the given\n    drug name (case-insensitive). This handles both exact matches (ADALIMUMAB)\n    and partial fragments (PEGYLATED, INHALED).\n\n    Args:\n        drug_name: HCD drug name (e.g., \"ADALIMUMAB 40MG\", \"PEGYLATED LIPOSOMAL DOXORUBICIN\")\n        search_term_to_fragments: Mapping of search_term -> list of drug fragments\n\n    Returns:\n        List of Search_Terms whose drug fragments match the drug name\n    \"\"\"\n    drug_name_upper = drug_name.upper()\n    matched_terms: list[str] = []\n\n    for search_term, fragments in search_term_to_fragments.items():\n        for frag in fragments:\n            if frag in drug_name_upper:\n                matched_terms.append(search_term)\n                break  # One matching fragment is enough for this Search_Term\n\n    return matched_terms\n\n\n# === NEW APPROACH: Query Snowflake directly using cluster CTE ===\n\n# The cluster query mapping (embedded from snomed_indication_mapping_query.sql)\n# This maps Search_Term -> Cluster_ID for ~148 clinical conditions\nCLUSTER_MAPPING_SQL = \"\"\"\nWITH SearchTermClusters AS (\n    SELECT Search_Term, Cluster_ID FROM (VALUES\n        ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),\n        ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),\n        ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),\n        ('asthma', 'AST_COD'),\n        ('allergic rhinitis', 'MILDINTAST_COD'),\n        ('alzheimer''s disease', 'DEMALZ_COD'),\n        ('amyloidosis', 'AMYLOID_COD'),\n        ('anaemia', 'eFI2_AnaemiaTimeSensitive'),\n        ('anaplastic large cell lymphoma', 'C19HAEMCAN_COD'),\n        ('apixaban', 'DOACCON_COD'),\n        ('aplastic anaemia', 'eFI2_AnaemiaEver'),\n        ('arthritis', 'eFI2_InflammatoryArthritis'),\n        ('asthma', 'eFI2_Asthma'),\n        ('atopic dermatitis', 'ATOPDERM_COD'),\n        ('atrial fibrillation', 'eFI2_AtrialFibrillation'),\n        ('attention deficit hyperactivity disorder', 'ADHD_COD'),\n        ('bipolar disorder', 'MH_COD'),\n        ('bladder', 'eFI2_UrinaryIncontinence'),\n        ('breast cancer', 'BRCANSCR_COD'),\n        ('cardiomyopathy', 'eFI2_HarmfulDrinking'),\n        ('cardiovascular disease', 'CVDRISKASS_COD'),\n        ('cervical cancer', 'CSDEC_COD'),\n        ('cholangiocarcinoma', 'eFI2_Cancer'),\n        ('chronic kidney disease', 'CKD_COD'),\n        ('chronic liver disease', 'eFI2_LiverProblems'),\n        ('chronic lymphocytic leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic myeloid leukaemia', 'EPPHAEMCAN_COD'),\n        ('chronic obstructive pulmonary disease', 'eFI2_COPD'),\n        ('colon cancer', 'eFI2_Cancer'),\n        ('colorectal cancer', 'GICANREF_COD'),\n        ('constipation', 'CHRONCONSTIP_COD'),\n        ('covid-19', 'POSSPOSTCOVID_COD'),\n        ('crohn''s disease', 'eFI2_InflammatoryBowelDisease'),\n        ('cutaneous t-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('cystic fibrosis', 'CUST_ICB_CYSTIC_FIBROSIS'),\n        ('deep vein thrombosis', 'VTE_COD'),\n        ('depression', 'eFI2_Depression'),\n        ('diabetes', 'eFI2_DiabetesEver'),\n        ('diabetic retinopathy', 'DRSELIGIBILITY_COD'),\n        ('diffuse large b-cell lymphoma', 'C19HAEMCAN_COD'),\n        ('dravet syndrome', 'EPIL_COD'),\n        ('drug misuse', 'ILLSUBINT_COD'),\n        ('dyspepsia', 'eFI2_AbdominalPain'),\n        ('epilepsy', 'eFI2_Seizures'),\n        ('fallopian tube', 'STERIL_COD'),\n        ('follicular lymphoma', 'C19HAEMCAN_COD'),\n        ('gastric cancer', 'eFI2_Cancer'),\n        ('giant cell arteritis', 'GCA_COD'),\n        ('glioma', 'NHAEMCANMORPH_COD'),\n        ('gout', 'eFI2_InflammatoryArthritis'),\n        ('graft versus host disease', 'GVHD_COD'),\n        ('granulomatosis with polyangiitis', 'WEGENERVASC_COD'),\n        ('growth hormone deficiency', 'HYPOPITUITARY_COD'),\n        ('hand eczema', 'ECZEMA_COD'),\n        ('heart failure', 'eFI2_HeartFailure'),\n        ('hepatitis b', 'HEPBCVAC_COD'),\n        ('hepatocellular carcinoma', 'eFI2_Cancer'),\n        ('hiv', 'PREFLANG_COD'),\n        ('hodgkin lymphoma', 'HAEMCANMORPH_COD'),\n        ('hormone receptor', 'eFI2_ThyroidProblems'),\n        ('hypercholesterolaemia', 'CLASSFH_COD'),\n        ('immune thrombocytopenia', 'ITP_COD'),\n        ('influenza', 'FLUINVITE_COD'),\n        ('insomnia', 'eFI2_SleepProblems'),\n        ('irritable bowel syndrome', 'IBS_COD'),\n        ('ischaemic stroke', 'OSTR_COD'),\n        ('juvenile idiopathic arthritis', 'RARTHAD_COD'),\n        ('kidney transplant', 'RENALTRANSP_COD'),\n        ('leukaemia', 'eFI2_Cancer'),\n        ('lung cancer', 'FTCANREF_COD'),\n        ('lymphoma', 'C19HAEMCAN_COD'),\n        ('macular degeneration', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('macular oedema', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('major depressive episodes', 'eFI2_Depression'),\n        ('malignant melanoma', 'eFI2_Cancer'),\n        ('malignant pleural mesothelioma', 'LUNGCAN_COD'),\n        ('manic episode', 'MH_COD'),\n        ('mantle cell lymphoma', 'HAEMCANMORPH_COD'),\n        ('melanoma', 'eFI2_Cancer'),\n        ('merkel cell carcinoma', 'C19CAN_COD'),\n        ('migraine', 'eFI2_Headache'),\n        ('motor neurone disease', 'MND_COD'),\n        ('multiple myeloma', 'C19HAEMCAN_COD'),\n        ('multiple sclerosis', 'MS_COD'),\n        ('myelodysplastic', 'eFI2_AnaemiaEver'),\n        ('myelofibrosis', 'MDS_COD'),\n        ('myocardial infarction', 'eFI2_IschaemicHeartDisease'),\n        ('myotonia', 'CNDATRISK2_COD'),\n        ('narcolepsy', 'LD_COD'),\n        ('neuroendocrine tumour', 'LUNGCAN_COD'),\n        ('non-small cell lung cancer', 'LUNGCAN_COD'),\n        ('non-small-cell lung cancer', 'FTCANREF_COD'),\n        ('obesity', 'BMI30_COD'),\n        ('osteoarthritis', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('osteoporosis', 'eFI2_Osteoporosis'),\n        ('osteosarcoma', 'NHAEMCANMORPH_COD'),\n        ('ovarian cancer', 'C19CAN_COD'),\n        ('peripheral arterial disease', 'PADEXC_COD'),\n        ('plaque psoriasis', 'PSORIASIS_COD'),\n        ('polycystic kidney disease', 'EPPCONGMALF_COD'),\n        ('polycythaemia vera', 'C19HAEMCAN_COD'),\n        ('pregnancy', 'C19PREG_COD'),\n        ('primary biliary cholangitis', 'eFI2_LiverProblems'),\n        ('primary hypercholesterolaemia', 'FNFHYP_COD'),\n        ('prostate cancer', 'EPPSOLIDCAN_COD'),\n        ('psoriasis', 'PSORIASIS_COD'),\n        ('psoriatic arthritis', 'RARTHAD_COD'),\n        ('pulmonary embolism', 'eFI2_RespiratoryDiseaseTimeSensitive'),\n        ('pulmonary fibrosis', 'ILD_COD'),\n        ('relapsing multiple sclerosis', 'MS_COD'),\n        ('renal cell carcinoma', 'C19CAN_COD'),\n        ('renal transplantation', 'RENALTRANSP_COD'),\n        ('retinal vein occlusion', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('rheumatoid arthritis', 'eFI2_InflammatoryArthritis'),\n        ('rivaroxaban', 'DOACCON_COD'),\n        ('schizophrenia', 'MH_COD'),\n        ('seizures', 'LSZFREQ_COD'),\n        ('sepsis', 'C19ACTIVITY_COD'),\n        ('asthma', 'SEVAST_COD'),\n        ('sickle cell disease', 'SICKLE_COD'),\n        ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),\n        ('smoking cessation', 'SMOKINGINT_COD'),\n        ('soft tissue sarcoma', 'NHAEMCANMORPH_COD'),\n        ('spinal muscular atrophy', 'MND_COD'),\n        ('squamous cell', 'C19CAN_COD'),\n        ('squamous cell carcinoma', 'C19CAN_COD'),\n        ('stem cell transplant', 'ALLOTRANSP_COD'),\n        ('stroke', 'eFI2_Stroke'),\n        ('systemic lupus erythematosus', 'SLUPUS_COD'),\n        ('systemic mastocytosis', 'HAEMCANMORPH_COD'),\n        ('thrombocytopenic purpura', 'TTP_COD'),\n        ('thrombotic thrombocytopenic purpura', 'TTP_COD'),\n        ('thyroid cancer', 'C19CAN_COD'),\n        ('tophaceous gout', 'CUST_ICB_OSTEOARTHRITIS'),\n        ('transitional cell carcinoma', 'C19CAN_COD'),\n        ('type 1 diabetes', 'DMTYPE1_COD'),\n        ('type 2 diabetes', 'DMTYPE2_COD'),\n        ('ulcerative colitis', 'eFI2_InflammatoryBowelDisease'),\n        ('urothelial carcinoma', 'NHAEMCANMORPH_COD'),\n        ('urticaria', 'XSAL_COD'),\n        ('uveitis', 'CUST_ICB_VISUAL_IMPAIRMENT'),\n        ('vascular disease', 'CVDINVITE_COD'),\n        ('vasculitis', 'CRYOGLOBVASC_COD')\n    ) AS t(Search_Term, Cluster_ID)\n),\n\nClusterCodes AS (\n    SELECT\n        stc.Search_Term,\n        c.\"SNOMEDCode\",\n        c.\"SNOMEDDescription\"\n    FROM SearchTermClusters stc\n    JOIN DATA_HUB.PHM.\"ClinicalCodingClusterSnomedCodes\" c\n        ON stc.Cluster_ID = c.\"Cluster_ID\"\n    WHERE c.\"SNOMEDCode\" IS NOT NULL\n),\n\nExplicitCodes AS (\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription FROM (VALUES\n        ('acute coronary syndrome', '837091000000100', 'Manual mapping'),\n        ('ankylosing spondylitis', '162930007', 'Manual mapping'),\n        ('ankylosing spondylitis', '239805001', 'Manual mapping'),\n        ('ankylosing spondylitis', '239810002', 'Manual mapping'),\n        ('ankylosing spondylitis', '239811003', 'Manual mapping'),\n        ('ankylosing spondylitis', '394990003', 'Manual mapping'),\n        ('ankylosing spondylitis', '429712009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441562009', 'Manual mapping'),\n        ('ankylosing spondylitis', '441680005', 'Manual mapping'),\n        ('ankylosing spondylitis', '441930001', 'Manual mapping'),\n        ('axial spondyloarthritis', '723116002', 'Manual mapping'),\n        ('choroidal neovascularisation', '380621000000102', 'Manual mapping'),\n        ('choroidal neovascularisation', '733124000', 'Manual mapping')\n    ) AS t(Search_Term, SNOMEDCode, SNOMEDDescription)\n),\n\nAllIndicationCodes AS (\n    SELECT Search_Term, \"SNOMEDCode\" AS SNOMEDCode, \"SNOMEDDescription\" AS SNOMEDDescription\n    FROM ClusterCodes\n    UNION ALL\n    SELECT Search_Term, SNOMEDCode, SNOMEDDescription\n    FROM ExplicitCodes\n)\n\"\"\"\n\n\ndef get_patient_indication_groups(\n    patient_pseudonyms: list[str],\n    connector: Optional[SnowflakeConnector] = None,\n    batch_size: int = 500,\n) -> \"pd.DataFrame\":\n    \"\"\"\n    Batch lookup GP diagnosis-based indication groups using Snowflake cluster query.\n\n    This function queries Snowflake directly using the embedded cluster CTE\n    (from snomed_indication_mapping_query.sql) to find patients with matching\n    GP diagnoses. This is the NEW approach replacing the old SQLite-based lookup.\n\n    The query:\n    1. Uses the cluster mapping CTE to get all Search_Term -> SNOMED code mappings\n    2. Joins with PrimaryCareClinicalCoding to find patients with matching codes\n    3. Returns the most recent match per patient (by EventDateTime)\n\n    Args:\n        patient_pseudonyms: List of PseudoNHSNoLinked values (matches PatientPseudonym in GP records)\n        connector: Optional SnowflakeConnector (defaults to singleton)\n        batch_size: Number of patients per Snowflake query batch (default 500)\n\n    Returns:\n        DataFrame with columns:\n        - PatientPseudonym: The patient identifier (PseudoNHSNoLinked value)\n        - Search_Term: The matched indication (e.g., \"rheumatoid arthritis\")\n        - EventDateTime: Date of the GP diagnosis record\n\n        Patients not found in results have no matching GP diagnosis.\n    \"\"\"\n    import pandas as pd\n\n    logger.info(f\"Starting Snowflake-direct indication lookup for {len(patient_pseudonyms)} patients...\")\n\n    # Handle edge case: empty patient list\n    if not patient_pseudonyms:\n        logger.warning(\"Empty patient list provided\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    # Check Snowflake availability\n    if not SNOWFLAKE_AVAILABLE:\n        logger.error(\"Snowflake connector not available - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if not is_snowflake_configured():\n        logger.error(\"Snowflake not configured - cannot lookup GP records\")\n        return pd.DataFrame(columns=['PatientPseudonym', 'Search_Term', 'EventDateTime'])\n\n    if connector is None:\n        connector = get_connector()\n\n    # Results list to collect all matches\n    all_results: list[dict] = []\n\n    # Process patients in batches\n    total_patients = len(patient_pseudonyms)\n    for batch_start in range(0, total_patients, batch_size):\n        batch_end = min(batch_start + batch_size, total_patients)\n        batch_pseudonyms = patient_pseudonyms[batch_start:batch_end]\n        batch_num = batch_start // batch_size + 1\n        total_batches = (total_patients + batch_size - 1) // batch_size\n\n        logger.info(f\"Batch {batch_num}/{total_batches}: patients {batch_start + 1} to {batch_end}\")\n\n        # Build patient IN clause placeholders\n        patient_placeholders = \", \".join([\"%s\"] * len(batch_pseudonyms))\n\n        # Build the full query with cluster CTE\n        # This finds the most recent matching diagnosis for each patient\n        # Note: Column names must be aliased to ensure consistent casing in results\n        query = f\"\"\"\n{CLUSTER_MAPPING_SQL}\nSELECT\n    pc.\"PatientPseudonym\" AS \"PatientPseudonym\",\n    aic.Search_Term AS \"Search_Term\",\n    pc.\"EventDateTime\" AS \"EventDateTime\"\nFROM DATA_HUB.PHM.\"PrimaryCareClinicalCoding\" pc\nINNER JOIN AllIndicationCodes aic\n    ON pc.\"SNOMEDCode\" = aic.SNOMEDCode\nWHERE pc.\"PatientPseudonym\" IN ({patient_placeholders})\nQUALIFY ROW_NUMBER() OVER (\n    PARTITION BY pc.\"PatientPseudonym\"\n    ORDER BY pc.\"EventDateTime\" DESC\n) = 1\n\"\"\"\n\n        try:\n            results = connector.execute_dict(query, tuple(batch_pseudonyms))\n\n            for row in results:\n                all_results.append({\n                    'PatientPseudonym': row.get('PatientPseudonym'),\n                    'Search_Term': row.get('Search_Term'),\n                    'EventDateTime': row.get('EventDateTime'),\n                })\n\n            logger.debug(f\"Batch {batch_num}: found {len(results)} matches\")\n\n        except Exception as e:\n            logger.error(f\"Error querying GP records for batch {batch_num}: {e}\")\n            # Continue with other batches - partial results are better than none\n\n    # Build result DataFrame\n    result_df = pd.DataFrame(all_results)\n\n    # Log summary statistics\n    if len(result_df) > 0:\n        matched_count = len(result_df)\n        match_rate = 100 * matched_count / total_patients\n        unique_terms = result_df['Search_Term'].nunique()\n        logger.info(f\"Indication lookup complete:\")\n        logger.info(f\"  Total patients queried: {total_patients}\")\n        logger.info(f\"  Patients with GP match: {matched_count} ({match_rate:.1f}%)\")\n        logger.info(f\"  Unique Search_Terms found: {unique_terms}\")\n\n        # Log top Search_Terms\n        top_terms = result_df['Search_Term'].value_counts().head(5)\n        logger.info(f\"  Top 5 indications: {dict(top_terms)}\")\n    else:\n        logger.info(f\"Indication lookup complete: 0 matches from {total_patients} patients\")\n\n    return result_df\n\n\n# Export public API\n__all__ = [\n    # Dataclasses\n    \"ClusterSnomedCodes\",\n    \"IndicationValidationResult\",\n    \"DrugIndicationMatchRate\",\n    \"DrugSnomedMapping\",\n    \"DirectSnomedMatchResult\",\n    \"DirectorateAssignment\",\n    # Cluster-based lookup functions (existing)\n    \"get_drug_clusters\",\n    \"get_drug_cluster_ids\",\n    \"get_cluster_snomed_codes\",\n    \"patient_has_indication\",\n    \"validate_indication\",\n    \"get_indication_match_rate\",\n    \"batch_validate_indications\",\n    \"get_available_clusters\",\n    # Direct SNOMED lookup functions (new)\n    \"get_drug_snomed_codes\",\n    \"patient_has_indication_direct\",\n    # Diagnosis-based directorate assignment\n    \"get_directorate_from_diagnosis\",\n    # Batch lookup for indication groups\n    \"batch_lookup_indication_groups\",\n    # Drug-indication mapping from DimSearchTerm.csv\n    \"load_drug_indication_mapping\",\n    \"get_search_terms_for_drug\",\n    # Snowflake-direct indication lookup (new approach)\n    \"get_patient_indication_groups\",\n    \"CLUSTER_MAPPING_SQL\",\n]\n","structuredPatch":[{"oldStart":1546,"oldLines":6,"newStart":1546,"newLines":7,"lines":["     # Batch lookup for indication groups","     \"batch_lookup_indication_groups\",","     # Drug-indication mapping from DimSearchTerm.csv","+    \"SEARCH_TERM_MERGE_MAP\",","     \"load_drug_indication_mapping\",","     \"get_search_terms_for_drug\",","     # Snowflake-direct indication lookup (new approach)"]}],"userModified":false,"replaceAll":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_012rigWpXaiELSaWXgqa8XKP","type":"message","role":"assistant","content":[{"type":"text","text":"Now let me validate the changes."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":333,"cache_read_input_tokens":108051,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":333},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"4c4765d0-309f-4cbf-bc79-5544156a819a"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_012rigWpXaiELSaWXgqa8XKP","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_0154XcV9VMs1PY8grZ3SbzC8","name":"TodoWrite","input":{"todos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"completed","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"in_progress","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}]}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":333,"cache_read_input_tokens":108051,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":333},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"00680fc7-ff94-46b0-9260-e9b494442f95"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_0154XcV9VMs1PY8grZ3SbzC8","type":"tool_result","content":"Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"0200a8f1-1dd7-4d17-831f-7dca79cd8626","tool_use_result":{"oldTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"in_progress","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"pending","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}],"newTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"completed","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"in_progress","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}]}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01TmXKzLR7NVFQMVJqzPNjZg","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01A73g7h8XzpeejYjXDvnjTw","name":"Bash","input":{"command":"cd \"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\" && python -m py_compile data_processing/diagnosis_lookup.py","description":"Syntax check diagnosis_lookup.py"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":345,"cache_read_input_tokens":108384,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":345},"output_tokens":26,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"35ed5939-f4e7-4061-8443-e557d10ac84f"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01A73g7h8XzpeejYjXDvnjTw","type":"tool_result","content":"","is_error":false}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"6bf356ab-e359-4b92-b82f-0917bba35939","tool_use_result":{"stdout":"","stderr":"","interrupted":false,"isImage":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01NBqg4w9DPSWhLqjabDrbsg","type":"message","role":"assistant","content":[{"type":"text","text":"Syntax check passed. Now let me test the import and verify the asthma merge works correctly."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":138,"cache_read_input_tokens":108729,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":138},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"9867fec2-5969-458e-ab9a-60da1cbbf4b8"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01NBqg4w9DPSWhLqjabDrbsg","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01PdFPxf4DYK6AkAckPDq899","name":"Bash","input":{"command":"cd \"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\" && python -c \"\nfrom data_processing.diagnosis_lookup import load_drug_indication_mapping, get_search_terms_for_drug, SEARCH_TERM_MERGE_MAP\n\nprint('SEARCH_TERM_MERGE_MAP:', SEARCH_TERM_MERGE_MAP)\nprint()\n\nfrag_to_st, st_to_frag = load_drug_indication_mapping()\n\n# Check asthma merge\nprint('=== Asthma merge verification ===')\nprint('asthma fragments:', st_to_frag.get('asthma', []))\nprint()\nprint('allergic asthma still exists?', 'allergic asthma' in st_to_frag)\nprint('severe persistent allergic asthma still exists?', 'severe persistent allergic asthma' in st_to_frag)\nprint()\n\n# Check urticaria is separate\nprint('urticaria fragments:', st_to_frag.get('urticaria', []))\nprint()\n\n# Check OMALIZUMAB maps to both asthma and urticaria\nprint('OMALIZUMAB maps to:', [st for st, frags in st_to_frag.items() if 'OMALIZUMAB' in frags])\nprint()\n\n# Check drug matching still works\nprint('=== Drug matching verification ===')\nprint('OMALIZUMAB matches:', get_search_terms_for_drug('OMALIZUMAB', st_to_frag))\nprint('BENRALIZUMAB matches:', get_search_terms_for_drug('BENRALIZUMAB', st_to_frag))\nprint('DUPILUMAB matches:', get_search_terms_for_drug('DUPILUMAB', st_to_frag))\nprint('ADALIMUMAB matches:', get_search_terms_for_drug('ADALIMUMAB', st_to_frag))\nprint()\n\n# Check total counts\nprint('Total Search_Terms:', len(st_to_frag))\nprint('Total fragments:', len(frag_to_st))\n\"\n","description":"Verify asthma merge in load_drug_indication_mapping"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":138,"cache_read_input_tokens":108729,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":138},"output_tokens":2,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"4ae3db35-ee5d-495f-a0db-62841c4d2a5b"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01PdFPxf4DYK6AkAckPDq899","type":"tool_result","content":"SEARCH_TERM_MERGE_MAP: {'allergic asthma': 'asthma', 'severe persistent allergic asthma': 'asthma'}\r\n\r\n=== Asthma merge verification ===\r\nasthma fragments: ['OMALIZUMAB', 'BENRALIZUMAB', 'DUPILUMAB', 'INHALED', 'MEPOLIZUMAB', 'RESLIZUMAB']\r\n\r\nallergic asthma still exists? False\r\nsevere persistent allergic asthma still exists? False\r\n\r\nurticaria fragments: ['OMALIZUMAB']\r\n\r\nOMALIZUMAB maps to: ['asthma', 'urticaria']\r\n\r\n=== Drug matching verification ===\r\nOMALIZUMAB matches: ['asthma', 'urticaria']\r\nBENRALIZUMAB matches: ['asthma']\r\nDUPILUMAB matches: ['asthma', 'atopic dermatitis']\r\nADALIMUMAB matches: ['ankylosing spondylitis', 'axial spondyloarthritis', 'plaque psoriasis', 'psoriatic arthritis', 'rheumatoid arthritis', 'ulcerative colitis', 'uveitis']\r\n\r\nTotal Search_Terms: 162\r\nTotal fragments: 346","is_error":false}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"281c3a6f-30c4-4e19-b4b9-a71fa6b9980b","tool_use_result":{"stdout":"SEARCH_TERM_MERGE_MAP: {'allergic asthma': 'asthma', 'severe persistent allergic asthma': 'asthma'}\r\n\r\n=== Asthma merge verification ===\r\nasthma fragments: ['OMALIZUMAB', 'BENRALIZUMAB', 'DUPILUMAB', 'INHALED', 'MEPOLIZUMAB', 'RESLIZUMAB']\r\n\r\nallergic asthma still exists? False\r\nsevere persistent allergic asthma still exists? False\r\n\r\nurticaria fragments: ['OMALIZUMAB']\r\n\r\nOMALIZUMAB maps to: ['asthma', 'urticaria']\r\n\r\n=== Drug matching verification ===\r\nOMALIZUMAB matches: ['asthma', 'urticaria']\r\nBENRALIZUMAB matches: ['asthma']\r\nDUPILUMAB matches: ['asthma', 'atopic dermatitis']\r\nADALIMUMAB matches: ['ankylosing spondylitis', 'axial spondyloarthritis', 'plaque psoriasis', 'psoriatic arthritis', 'rheumatoid arthritis', 'ulcerative colitis', 'uveitis']\r\n\r\nTotal Search_Terms: 162\r\nTotal fragments: 346\r","stderr":"","interrupted":false,"isImage":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_0115a5UaqYuR9hnvR31ARAZs","type":"message","role":"assistant","content":[{"type":"text","text":"All validations pass:\n\n1. **Asthma merge works**: `asthma` now has the combined drug list: OMALIZUMAB, BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, RESLIZUMAB (from all 3 original rows)\n2. **Variants removed**: `allergic asthma` and `severe persistent allergic asthma` no longer exist as separate entries\n3. **Urticaria separate**: `urticaria` ÔåÆ `['OMALIZUMAB']` ÔÇö correctly kept separate\n4. **OMALIZUMAB maps correctly**: to both `asthma` and `urticaria` (not just one)\n5. **Drug matching still works**: ADALIMUMAB maps to 7 indications as expected\n6. **Count changes**: 162 Search_Terms (was 164 ÔÇö 2 fewer because 3 asthma entries merged into 1)\n7. **Fragment count unchanged**: 346 (no fragments lost, just redistributed)\n\nNow let me also verify the CLUSTER_MAPPING_SQL changes by checking the specific lines."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":935,"cache_read_input_tokens":108867,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":935},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"921c3451-5015-4ad6-ab8e-5869e4a8db9b"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_0115a5UaqYuR9hnvR31ARAZs","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_0173KDqp1wFwHz1P7jaUQHGY","name":"Grep","input":{"pattern":"asthma|urticaria","path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\data_processing\\diagnosis_lookup.py","output_mode":"content","-n":true}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":935,"cache_read_input_tokens":108867,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":935},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"99d11296-042c-47b9-ba12-0105ca214eb4"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_0173KDqp1wFwHz1P7jaUQHGY","type":"tool_result","content":"1095:# Urticaria is a separate condition ÔÇö do NOT merge with asthma.\r\n1097:    \"allergic asthma\": \"asthma\",\r\n1098:    \"severe persistent allergic asthma\": \"asthma\",\r\n1119:    Asthma-related Search_Terms (\"allergic asthma\", \"severe persistent allergic asthma\")\r\n1120:    are merged into \"asthma\" to match the CLUSTER_MAPPING_SQL normalization.\r\n1121:    \"urticaria\" stays separate.\r\n1142:                # Normalize asthma variants to canonical \"asthma\"\r\n1217:        ('asthma', 'AST_COD'),\r\n1226:        ('asthma', 'eFI2_Asthma'),\r\n1332:        ('asthma', 'SEVAST_COD'),\r\n1353:        ('urticaria', 'XSAL_COD'),"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"40eeff22-8de9-4521-b890-77ea3b726304","tool_use_result":{"mode":"content","numFiles":0,"filenames":[],"content":"1095:# Urticaria is a separate condition ÔÇö do NOT merge with asthma.\r\n1097:    \"allergic asthma\": \"asthma\",\r\n1098:    \"severe persistent allergic asthma\": \"asthma\",\r\n1119:    Asthma-related Search_Terms (\"allergic asthma\", \"severe persistent allergic asthma\")\r\n1120:    are merged into \"asthma\" to match the CLUSTER_MAPPING_SQL normalization.\r\n1121:    \"urticaria\" stays separate.\r\n1142:                # Normalize asthma variants to canonical \"asthma\"\r\n1217:        ('asthma', 'AST_COD'),\r\n1226:        ('asthma', 'eFI2_Asthma'),\r\n1332:        ('asthma', 'SEVAST_COD'),\r\n1353:        ('urticaria', 'XSAL_COD'),","numLines":11}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01RmnowXDGcfZwuxqwo977zw","type":"message","role":"assistant","content":[{"type":"text","text":"CLUSTER_MAPPING_SQL now has 3 entries for `'asthma'`:\n- `('asthma', 'AST_COD')` ÔÇö was `allergic asthma`\n- `('asthma', 'eFI2_Asthma')` ÔÇö unchanged\n- `('asthma', 'SEVAST_COD')` ÔÇö was `severe persistent allergic asthma`\n\nAnd `('urticaria', 'XSAL_COD')` is untouched. \n\nNow let me mark the task in the IMPLEMENTATION_PLAN.md."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":638,"cache_read_input_tokens":109802,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":638},"output_tokens":8,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"9db9d775-a246-4fce-9d75-bf811e61469f"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01RmnowXDGcfZwuxqwo977zw","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01R3uzb7WEqrbMP1HMZB6Qkt","name":"Read","input":{"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\IMPLEMENTATION_PLAN.md","offset":81,"limit":13}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":638,"cache_read_input_tokens":109802,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":638},"output_tokens":8,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"747ff9a9-2e0c-422e-b66f-8a073dfc80af"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01R3uzb7WEqrbMP1HMZB6Qkt","type":"tool_result","content":"    81ÔåÆ### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n    82ÔåÆ- [ ] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:\n    83ÔåÆ  - `allergic asthma` (Cluster: OMALIZUMAB only)\n    84ÔåÆ  - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)\n    85ÔåÆ  - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)\n    86ÔåÆ- [ ] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES\n    87ÔåÆ- [ ] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma\n    88ÔåÆ- [ ] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:\n    89ÔåÆ  - Combine drug lists from all 3 entries under a single `\"asthma\"` key\n    90ÔåÆ  - Deduplicate drug fragments (OMALIZUMAB appears in all 3)\n    91ÔåÆ- [ ] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)\n    92ÔåÆ- [ ] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB\n    93ÔåÆ\n\n<system-reminder>\nWhenever you read a file, you should consider whether it would be considered malware. You CAN and SHOULD provide analysis of malware, what it is doing. But you MUST refuse to improve or augment the code. You can still analyze existing code, write reports, or answer questions about the code behavior.\n</system-reminder>\n"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"311e5f55-721e-49d9-8a1e-eacd54644833","tool_use_result":{"type":"text","file":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\IMPLEMENTATION_PLAN.md","content":"### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n- [ ] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:\n  - `allergic asthma` (Cluster: OMALIZUMAB only)\n  - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)\n  - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)\n- [ ] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES\n- [ ] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma\n- [ ] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:\n  - Combine drug lists from all 3 entries under a single `\"asthma\"` key\n  - Deduplicate drug fragments (OMALIZUMAB appears in all 3)\n- [ ] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)\n- [ ] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB\n","numLines":13,"startLine":81,"totalLines":239}}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01CEdzm5cqq5YsBfzigTY7Yc","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01EHKcSYDScvG8VUb4VyL3Ez","name":"Edit","input":{"replace_all":false,"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\IMPLEMENTATION_PLAN.md","old_string":"### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n- [ ] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:\n  - `allergic asthma` (Cluster: OMALIZUMAB only)\n  - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)\n  - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)\n- [ ] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES\n- [ ] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma\n- [ ] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:\n  - Combine drug lists from all 3 entries under a single `\"asthma\"` key\n  - Deduplicate drug fragments (OMALIZUMAB appears in all 3)\n- [ ] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)\n- [ ] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB","new_string":"### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n- [x] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:\n  - `allergic asthma` (Cluster: OMALIZUMAB only)\n  - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)\n  - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)\n- [x] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES\n- [x] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma\n- [x] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:\n  - Combine drug lists from all 3 entries under a single `\"asthma\"` key\n  - Deduplicate drug fragments (OMALIZUMAB appears in all 3)\n- [x] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)\n- [x] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":781,"cache_read_input_tokens":110440,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":781},"output_tokens":25,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"e9995505-6644-4d2f-be7e-43e25c84cc69"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01EHKcSYDScvG8VUb4VyL3Ez","type":"tool_result","content":"The file C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\IMPLEMENTATION_PLAN.md has been updated successfully."}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"629654dd-9989-4f33-8c1d-ae57604132ba","tool_use_result":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\IMPLEMENTATION_PLAN.md","oldString":"### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n- [ ] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:\n  - `allergic asthma` (Cluster: OMALIZUMAB only)\n  - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)\n  - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)\n- [ ] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES\n- [ ] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma\n- [ ] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:\n  - Combine drug lists from all 3 entries under a single `\"asthma\"` key\n  - Deduplicate drug fragments (OMALIZUMAB appears in all 3)\n- [ ] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)\n- [ ] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB","newString":"### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n- [x] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:\n  - `allergic asthma` (Cluster: OMALIZUMAB only)\n  - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)\n  - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)\n- [x] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES\n- [x] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma\n- [x] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:\n  - Combine drug lists from all 3 entries under a single `\"asthma\"` key\n  - Deduplicate drug fragments (OMALIZUMAB appears in all 3)\n- [x] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)\n- [x] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB","originalFile":"# Implementation Plan - Drug-Aware Indication Matching\n\n## Project Overview\n\nUpdate the indication-based pathway charts so that patient indications are matched **per drug**, not just per patient. Currently, each patient gets ONE indication (most recent GP diagnosis match). This ignores which drugs the patient is actually taking.\n\n### The Problem\n\nA patient on ADALIMUMAB + OMALIZUMAB currently gets assigned a single indication (e.g., \"rheumatoid arthritis\" ÔÇö the most recent GP match). But:\n- ADALIMUMAB is used for rheumatoid arthritis, axial spondyloarthritis, crohn's disease, etc.\n- OMALIZUMAB is used for asthma, allergic asthma, urticaria\n\nThese are different clinical pathways and should be treated as separate treatment journeys.\n\n### The Solution\n\nMatch each drug to an indication by cross-referencing:\n1. **GP diagnosis** ÔÇö which Search_Terms the patient has matching SNOMED codes for\n2. **Drug mapping** ÔÇö which Search_Terms list each drug (from `DimSearchTerm.csv`)\n\nOnly assign a drug to an indication if BOTH conditions are met. If a patient's drugs map to different indications, they become separate pathways (via modified UPID).\n\n### Key Design Decisions\n\n| Aspect | Decision |\n|--------|----------|\n| Drug-indication source | `data/DimSearchTerm.csv` ÔÇö Search_Term ÔåÆ CleanedDrugName mapping |\n| UPID modification | `{original_UPID}\\|{search_term}` for drugs with matched indication |\n| GP diagnosis matching | Return ALL matches per patient (not just most recent) |\n| Drug matching | Substring match: HCD drug name contains DimSearchTerm fragment |\n| Multiple indication matches per drug | Use highest GP code frequency as tiebreaker (COUNT of matching SNOMED codes per Search_Term) |\n| GP code time range | Only codes from MIN(Intervention Date) onwards ÔÇö restricts to HCD data window |\n| No indication match | Fallback to directory (same as current behavior) |\n| Same patient, different indications | Separate pathways via different modified UPIDs |\n\n### Examples\n\n**Patient on ADALIMUMAB + GOLIMUMAB, GP dx: axial spondyloarthritis + asthma**\n- axial spondyloarthritis drug list includes both ADALIMUMAB and GOLIMUMAB\n- ÔåÆ Both drugs grouped under \"axial spondyloarthritis\", single pathway\n- Modified UPID: `RMV12345|axial spondyloarthritis`\n\n**Patient on ADALIMUMAB + OMALIZUMAB, GP dx: axial spondyloarthritis + asthma**\n- axial spondyloarthritis lists ADALIMUMAB but not OMALIZUMAB\n- asthma lists OMALIZUMAB but not ADALIMUMAB\n- ÔåÆ Two separate pathways:\n  - `RMV12345|axial spondyloarthritis` with ADALIMUMAB\n  - `RMV12345|asthma` with OMALIZUMAB\n\n**Patient on ADALIMUMAB, GP dx: rheumatoid arthritis (47 codes) + crohn's disease (2 codes)**\n- Both Search_Terms list ADALIMUMAB AND patient has GP dx for both\n- ÔåÆ Tiebreaker: highest code frequency ÔÇö rheumatoid arthritis has 47 matching SNOMED codes vs 2 for crohn's\n- ÔåÆ Single pathway under rheumatoid arthritis (more clinical activity = more likely the treatment indication)\n\n---\n\n## Phase 1: Update Snowflake Query & Drug Mapping\n\n### 1.1 Update `get_patient_indication_groups()` to return ALL matches with frequency\n- [ ] Modify the Snowflake query in `get_patient_indication_groups()` (diagnosis_lookup.py):\n  - Remove `QUALIFY ROW_NUMBER() OVER (PARTITION BY ... ORDER BY EventDateTime DESC) = 1`\n  - Return ALL matching Search_Terms per patient with code frequency:\n    ```sql\n    SELECT pc.\"PatientPseudonym\" AS \"PatientPseudonym\",\n           aic.Search_Term AS \"Search_Term\",\n           COUNT(*) AS \"code_frequency\"\n    FROM PrimaryCareClinicalCoding pc\n    JOIN AllIndicationCodes aic ON pc.\"SNOMEDCode\" = aic.SNOMEDCode\n    WHERE pc.\"PatientPseudonym\" IN (...)\n      AND pc.\"EventDateTime\" >= :earliest_hcd_date\n    GROUP BY pc.\"PatientPseudonym\", aic.Search_Term\n    ```\n  - `code_frequency` = number of matching SNOMED codes per Search_Term per patient\n  - Higher frequency = more clinical activity = stronger signal for tiebreaker\n  - `earliest_hcd_date` = `MIN(Intervention Date)` from the HCD DataFrame ÔÇö restricts GP codes to the HCD data window, reducing noise from old/irrelevant diagnoses\n- [ ] Accept `earliest_hcd_date` parameter in `get_patient_indication_groups()` and pass to query\n- [ ] Keep batch processing (500 patients per query)\n- [ ] Update return type: DataFrame now has multiple rows per patient (PatientPseudonym, Search_Term, code_frequency)\n- [ ] Verify: Query returns more rows than before (patients with multiple matching diagnoses)\n\n### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n- [ ] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:\n  - `allergic asthma` (Cluster: OMALIZUMAB only)\n  - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)\n  - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)\n- [ ] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES\n- [ ] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma\n- [ ] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:\n  - Combine drug lists from all 3 entries under a single `\"asthma\"` key\n  - Deduplicate drug fragments (OMALIZUMAB appears in all 3)\n- [ ] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)\n- [ ] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB\n\n### 1.3 Build drug-to-Search_Term lookup from DimSearchTerm.csv\n- [x] Add function `load_drug_indication_mapping()` to `diagnosis_lookup.py`:\n  - Loads `data/DimSearchTerm.csv`\n  - Builds dict: `drug_fragment (uppercase) ÔåÆ list[Search_Term]`\n  - Also builds reverse: `search_term ÔåÆ list[drug_fragments]`\n  - CleanedDrugName is pipe-separated (e.g., \"ADALIMUMAB|GOLIMUMAB|IXEKIZUMAB\")\n- [x] Add function `get_search_terms_for_drug(drug_name, search_term_to_fragments) -> list[str]`:\n  - Returns all Search_Terms whose drug fragments are substrings of the drug name (case-insensitive)\n  - More practical than per-term boolean check ÔÇö returns all matches at once for Phase 2 use\n- [x] Verify: ADALIMUMAB matches \"axial spondyloarthritis\", OMALIZUMAB matches \"asthma\"\n\n---\n\n## Phase 2: Drug-Aware Indication Matching Logic\n\n### 2.1 Create `assign_drug_indications()` function\n- [ ] Add to `diagnosis_lookup.py` or `pathway_pipeline.py`:\n  ```\n  def assign_drug_indications(\n      df: pd.DataFrame,              # HCD data with UPID, Drug Name columns\n      gp_matches_df: pd.DataFrame,   # PatientPseudonym ÔåÆ list of matched Search_Terms\n      drug_mapping: dict,             # From load_drug_indication_mapping()\n  ) -> tuple[pd.DataFrame, pd.DataFrame]:\n      Returns: (modified_df, indication_df)\n      - modified_df: HCD data with UPID replaced by {UPID}|{indication}\n      - indication_df: mapping modified_UPID ÔåÆ Search_Term\n  ```\n- [ ] Logic per UPID + Drug Name pair:\n  1. Get patient's GP-matched Search_Terms with code_frequency (from gp_matches_df via PseudoNHSNoLinked)\n  2. Get which Search_Terms include this drug (from drug_mapping)\n  3. Intersection = valid indications for this drug-patient pair\n  4. If 1 match: use it\n  5. If multiple matches: use highest code_frequency as tiebreaker (most GP coding activity = most likely treatment indication)\n  6. If 0 matches: use fallback directory\n- [ ] Modify UPID in df rows: `{original_UPID}|{matched_search_term}`\n- [ ] Build indication_df: `{modified_UPID}` ÔåÆ `Search_Term` (or fallback label)\n- [ ] Verify: Function compiles, handles edge cases (no GP match, no drug match)\n\n### 2.2 Handle tiebreaker for multiple indication matches\n- [ ] When a drug matches multiple Search_Terms AND patient has GP dx for multiple:\n  - Use `code_frequency` from the GP query (COUNT of matching SNOMED codes per Search_Term)\n  - Higher code_frequency = more clinical activity for that condition = more likely treatment indication\n  - E.g., patient with 47 RA codes and 2 crohn's codes ÔåÆ ADALIMUMAB assigned to RA\n  - code_frequency is already returned by the updated query in Task 1.1\n- [ ] Verify: Tiebreaker logic correctly picks highest-frequency diagnosis\n- [ ] Verify: Tie on frequency (rare but possible) falls back to alphabetical Search_Term for determinism\n\n---\n\n## Phase 3: Pipeline Integration\n\n### 3.1 Update `refresh_pathways.py` indication processing\n- [ ] In the `elif current_chart_type == \"indication\":` block:\n  1. Call `get_patient_indication_groups()` as before (but now returns ALL matches)\n  2. Load drug mapping: `drug_mapping = load_drug_indication_mapping()`\n  3. Call `assign_drug_indications(df, gp_matches_df, drug_mapping)`\n  4. Use modified_df (with indication-aware UPIDs) for pathway processing\n  5. Use indication_df for the indication mapping\n- [ ] Pass modified_df (not original df) to `process_indication_pathway_for_date_filter()`\n- [ ] Verify: Pipeline compiles, `python -m py_compile cli/refresh_pathways.py`\n\n### 3.2 Test with dry run\n- [ ] Run `python -m cli.refresh_pathways --chart-type indication --dry-run -v`\n- [ ] Verify:\n  - Modified UPIDs appear in pipeline log (e.g., `RMV12345|rheumatoid arthritis`)\n  - Patient counts are reasonable (will be higher than before since same patient can appear under multiple indications)\n  - Drug-indication matching is logged (match rate, fallback rate)\n  - Pathway hierarchy shows drug-specific grouping under correct indications\n\n---\n\n## Phase 4: Full Refresh & Validation\n\n### 4.1 Full refresh with both chart types\n- [ ] Run `python -m cli.refresh_pathways --chart-type all`\n- [ ] Verify:\n  - Both chart types generate data\n  - Directory charts unchanged (no modified UPIDs)\n  - Indication charts reflect drug-aware matching\n\n### 4.2 Validate indication chart correctness\n- [ ] Check that drugs under an indication all appear in that Search_Term's drug list\n- [ ] Verify that a patient on drugs for different indications creates separate pathway branches\n- [ ] Verify that drugs sharing an indication are grouped in the same pathway\n- [ ] Log: patient count comparison (old vs new approach)\n\n### 4.3 Validate Reflex UI\n- [ ] Run `python -m reflex compile` to verify app compiles\n- [ ] Verify chart type toggle still works\n- [ ] Verify indication chart shows correct hierarchy\n\n---\n\n## Completion Criteria\n\nAll tasks marked `[x]` AND:\n- [ ] App compiles without errors (`reflex compile` succeeds)\n- [ ] Both chart types generate pathway data\n- [ ] Indication charts show drug-specific indication matching\n- [ ] Drugs under the same indication for the same patient are in one pathway\n- [ ] Drugs under different indications for the same patient create separate pathways\n- [ ] Fallback works for drugs with no indication match\n- [ ] Full refresh completes successfully\n- [ ] Existing directory charts are unaffected\n\n---\n\n## Reference\n\n### DimSearchTerm.csv Structure\n```\nSearch_Term,CleanedDrugName,PrimaryDirectorate\nrheumatoid arthritis,ABATACEPT|ADALIMUMAB|ANAKINRA|BARICITINIB|...,RHEUMATOLOGY\nasthma,BENRALIZUMAB|DUPILUMAB|INHALED|MEPOLIZUMAB|OMALIZUMAB|RESLIZUMAB,THORACIC MEDICINE\n```\n\n### Modified UPID Format\n```\nOriginal:  RMV12345\nModified:  RMV12345|rheumatoid arthritis\nFallback:  RMV12345|RHEUMATOLOGY (no GP dx)\n```\n\n### Current vs New Indication Flow\n```\nCURRENT:\n  Patient ÔåÆ GP dx (most recent) ÔåÆ single Search_Term ÔåÆ one pathway\n\nNEW:\n  Patient + Drug A ÔåÆ GP dx matching Drug A ÔåÆ Search_Term X\n  Patient + Drug B ÔåÆ GP dx matching Drug B ÔåÆ Search_Term Y\n  ÔåÆ If X == Y: one pathway under X\n  ÔåÆ If X != Y: two pathways (modified UPIDs)\n```\n\n### Key Files\n\n| File | Changes |\n|------|---------|\n| `data_processing/diagnosis_lookup.py` | Update query, add drug mapping functions |\n| `data_processing/pathway_pipeline.py` | Possibly minor changes for modified UPIDs |\n| `cli/refresh_pathways.py` | Integrate drug-aware matching into pipeline |\n| `data/DimSearchTerm.csv` | Reference data (read-only) |\n| `analysis/pathway_analyzer.py` | No changes expected (UPID changes are transparent) |\n| `pathways_app/pathways_app.py` | No changes expected |\n","structuredPatch":[{"oldStart":79,"oldLines":17,"newStart":79,"newLines":17,"lines":[" - [ ] Verify: Query returns more rows than before (patients with multiple matching diagnoses)"," "," ### 1.2 Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL","-- [ ] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:","+- [x] In `CLUSTER_MAPPING_SQL` (diagnosis_lookup.py), merge these 3 Search_Terms into one `\"asthma\"` entry:","   - `allergic asthma` (Cluster: OMALIZUMAB only)","   - `asthma` (Cluster: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB)","   - `severe persistent allergic asthma` (Cluster: OMALIZUMAB only)","-- [ ] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES","-- [ ] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma","-- [ ] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:","+- [x] Map all 3 Cluster_IDs to `Search_Term = 'asthma'` in the CTE VALUES","+- [x] `urticaria` (OMALIZUMAB, DERMATOLOGY) stays SEPARATE ÔÇö do NOT merge with asthma","+- [x] Also update `load_drug_indication_mapping()` to apply the same merge when loading DimSearchTerm.csv:","   - Combine drug lists from all 3 entries under a single `\"asthma\"` key","   - Deduplicate drug fragments (OMALIZUMAB appears in all 3)","-- [ ] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)","-- [ ] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB","+- [x] Verify: GP code lookup returns `\"asthma\"` (not `\"allergic asthma\"` or `\"severe persistent allergic asthma\"`)","+- [x] Verify: Drug mapping for `\"asthma\"` includes full combined drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB"," "," ### 1.3 Build drug-to-Search_Term lookup from DimSearchTerm.csv"," - [x] Add function `load_drug_indication_mapping()` to `diagnosis_lookup.py`:"]}],"userModified":false,"replaceAll":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01C3C1W8Z14HqmNM7iVGDW2T","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01WeDNzR5GcJ7cM6mGhcsbw9","name":"TodoWrite","input":{"todos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"completed","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"completed","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"in_progress","activeForm":"Committing and updating progress"}]}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":975,"cache_read_input_tokens":111221,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":975},"output_tokens":25,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"476dcb0c-a487-4182-9722-29381f395a9f"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01WeDNzR5GcJ7cM6mGhcsbw9","type":"tool_result","content":"Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"d7b4523a-1597-4f16-b6d9-1adc446c4e97","tool_use_result":{"oldTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"completed","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"in_progress","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"pending","activeForm":"Committing and updating progress"}],"newTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"completed","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"completed","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"in_progress","activeForm":"Committing and updating progress"}]}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_013zmaoDmTwQsstYDymEsf38","type":"message","role":"assistant","content":[{"type":"text","text":"Now let me commit the changes and update progress.txt."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":337,"cache_read_input_tokens":112196,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":337},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"439b9ae3-a16d-45fc-8a52-011240e4ed01"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_013zmaoDmTwQsstYDymEsf38","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_016sM9kW5ksAaJcANhAGz1EM","name":"Bash","input":{"command":"cd \"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\" && git status","description":"Show working tree status"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":337,"cache_read_input_tokens":112196,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":337},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"aa37981a-18d5-4ac6-b69b-4b4ecd5cbe80"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_016sM9kW5ksAaJcANhAGz1EM","type":"tool_result","content":"On branch feature/snomed-indication-mapping\nChanges not staged for commit:\n  (use \"git add/rm <file>...\" to update what will be committed)\n  (use \"git restore <file>...\" to discard changes in working directory)\n\tmodified:   .claude/settings.local.json\n\tmodified:   DESIGN_SYSTEM.md\n\tmodified:   IMPLEMENTATION_PLAN.md\n\tmodified:   RALPH_PROMPT.md\n\tmodified:   data_processing/diagnosis_lookup.py\n\tmodified:   guardrails.md\n\tmodified:   pathways_app/__init__.py\n\tdeleted:    pathways_app/app_v2.py\n\tmodified:   progress.txt\n\tmodified:   ralph.ps1\n\tmodified:   uv.lock\n\nUntracked files:\n  (use \"git add <file>...\" to include in what will be committed)\n\tdata/DimSearchTerm.csv\n\tdata/LookupSearchTermCleanedDrugName.csv\n\tdata/condition_directorate_mapping.csv\n\tdata/drug_indication_clusters.csv\n\tdata/drug_snomed_mapping_enriched.csv\n\tdata/na_directory_rows.csv\n\tpathways_app/pathways_app_old.py.bak\n\tsnomed_indication_mapping_query.sql\n\nno changes added to commit (use \"git add\" and/or \"git commit -a\")","is_error":false}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"52334450-8f97-4041-bbd2-8c829f37bd40","tool_use_result":{"stdout":"On branch feature/snomed-indication-mapping\nChanges not staged for commit:\n  (use \"git add/rm <file>...\" to update what will be committed)\n  (use \"git restore <file>...\" to discard changes in working directory)\n\tmodified:   .claude/settings.local.json\n\tmodified:   DESIGN_SYSTEM.md\n\tmodified:   IMPLEMENTATION_PLAN.md\n\tmodified:   RALPH_PROMPT.md\n\tmodified:   data_processing/diagnosis_lookup.py\n\tmodified:   guardrails.md\n\tmodified:   pathways_app/__init__.py\n\tdeleted:    pathways_app/app_v2.py\n\tmodified:   progress.txt\n\tmodified:   ralph.ps1\n\tmodified:   uv.lock\n\nUntracked files:\n  (use \"git add <file>...\" to include in what will be committed)\n\tdata/DimSearchTerm.csv\n\tdata/LookupSearchTermCleanedDrugName.csv\n\tdata/condition_directorate_mapping.csv\n\tdata/drug_indication_clusters.csv\n\tdata/drug_snomed_mapping_enriched.csv\n\tdata/na_directory_rows.csv\n\tpathways_app/pathways_app_old.py.bak\n\tsnomed_indication_mapping_query.sql\n\nno changes added to commit (use \"git add\" and/or \"git commit -a\")\r","stderr":"","interrupted":false,"isImage":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01QWJqcsPMXKczNKvBoUNsUJ","type":"message","role":"assistant","content":[{"type":"text","text":"I see many modified files from prior work. Let me check what my specific changes are in the files I modified."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":459,"cache_read_input_tokens":112533,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":459},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"acdd5697-096e-4da5-9479-45d1a86e0a98"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01QWJqcsPMXKczNKvBoUNsUJ","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01BjWaev9cnJyCym6qPcs9bJ","name":"Bash","input":{"command":"cd \"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\" && git diff data_processing/diagnosis_lookup.py","description":"Show changes to diagnosis_lookup.py"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":459,"cache_read_input_tokens":112533,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":459},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"e863c5e2-d87b-4191-879b-1eb2ec93a7c1"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01BjWaev9cnJyCym6qPcs9bJ","type":"tool_result","content":"diff --git a/data_processing/diagnosis_lookup.py b/data_processing/diagnosis_lookup.py\nindex 9b41339..eb796b7 100644\n--- a/data_processing/diagnosis_lookup.py\n+++ b/data_processing/diagnosis_lookup.py\n@@ -1090,6 +1090,15 @@ def batch_lookup_indication_groups(\n # === Drug-to-indication mapping from DimSearchTerm.csv ===\n \n \n+# Merge related Search_Terms into canonical names.\n+# Asthma variants are clinically the same condition at different severity levels.\n+# Urticaria is a separate condition ÔÇö do NOT merge with asthma.\n+SEARCH_TERM_MERGE_MAP: dict[str, str] = {\n+    \"allergic asthma\": \"asthma\",\n+    \"severe persistent allergic asthma\": \"asthma\",\n+}\n+\n+\n def load_drug_indication_mapping(\n     csv_path: Optional[str] = None,\n ) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n@@ -1107,6 +1116,10 @@ def load_drug_indication_mapping(\n     (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n     Drug fragments from all rows for the same Search_Term are combined.\n \n+    Asthma-related Search_Terms (\"allergic asthma\", \"severe persistent allergic asthma\")\n+    are merged into \"asthma\" to match the CLUSTER_MAPPING_SQL normalization.\n+    \"urticaria\" stays separate.\n+\n     Args:\n         csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n \n@@ -1126,6 +1139,9 @@ def load_drug_indication_mapping(\n                 search_term = row.get(\"Search_Term\", \"\").strip()\n                 drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n \n+                # Normalize asthma variants to canonical \"asthma\"\n+                search_term = SEARCH_TERM_MERGE_MAP.get(search_term, search_term)\n+\n                 if not search_term or not drug_names_raw:\n                     continue\n \n@@ -1198,7 +1214,7 @@ WITH SearchTermClusters AS (\n         ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),\n         ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),\n         ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),\n-        ('allergic asthma', 'AST_COD'),\n+        ('asthma', 'AST_COD'),\n         ('allergic rhinitis', 'MILDINTAST_COD'),\n         ('alzheimer''s disease', 'DEMALZ_COD'),\n         ('amyloidosis', 'AMYLOID_COD'),\n@@ -1313,7 +1329,7 @@ WITH SearchTermClusters AS (\n         ('schizophrenia', 'MH_COD'),\n         ('seizures', 'LSZFREQ_COD'),\n         ('sepsis', 'C19ACTIVITY_COD'),\n-        ('severe persistent allergic asthma', 'SEVAST_COD'),\n+        ('asthma', 'SEVAST_COD'),\n         ('sickle cell disease', 'SICKLE_COD'),\n         ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),\n         ('smoking cessation', 'SMOKINGINT_COD'),\n@@ -1530,6 +1546,7 @@ __all__ = [\n     # Batch lookup for indication groups\n     \"batch_lookup_indication_groups\",\n     # Drug-indication mapping from DimSearchTerm.csv\n+    \"SEARCH_TERM_MERGE_MAP\",\n     \"load_drug_indication_mapping\",\n     \"get_search_terms_for_drug\",\n     # Snowflake-direct indication lookup (new approach)","is_error":false}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"ce72a939-d670-49c8-bff7-bec6dcc166fa","tool_use_result":{"stdout":"diff --git a/data_processing/diagnosis_lookup.py b/data_processing/diagnosis_lookup.py\nindex 9b41339..eb796b7 100644\n--- a/data_processing/diagnosis_lookup.py\n+++ b/data_processing/diagnosis_lookup.py\n@@ -1090,6 +1090,15 @@ def batch_lookup_indication_groups(\n # === Drug-to-indication mapping from DimSearchTerm.csv ===\n \n \n+# Merge related Search_Terms into canonical names.\n+# Asthma variants are clinically the same condition at different severity levels.\n+# Urticaria is a separate condition ÔÇö do NOT merge with asthma.\n+SEARCH_TERM_MERGE_MAP: dict[str, str] = {\n+    \"allergic asthma\": \"asthma\",\n+    \"severe persistent allergic asthma\": \"asthma\",\n+}\n+\n+\n def load_drug_indication_mapping(\n     csv_path: Optional[str] = None,\n ) -> tuple[dict[str, list[str]], dict[str, list[str]]]:\n@@ -1107,6 +1116,10 @@ def load_drug_indication_mapping(\n     (e.g., \"diabetes\" appears under both DIABETIC MEDICINE and OPHTHALMOLOGY).\n     Drug fragments from all rows for the same Search_Term are combined.\n \n+    Asthma-related Search_Terms (\"allergic asthma\", \"severe persistent allergic asthma\")\n+    are merged into \"asthma\" to match the CLUSTER_MAPPING_SQL normalization.\n+    \"urticaria\" stays separate.\n+\n     Args:\n         csv_path: Path to DimSearchTerm.csv. Defaults to data/DimSearchTerm.csv.\n \n@@ -1126,6 +1139,9 @@ def load_drug_indication_mapping(\n                 search_term = row.get(\"Search_Term\", \"\").strip()\n                 drug_names_raw = row.get(\"CleanedDrugName\", \"\").strip()\n \n+                # Normalize asthma variants to canonical \"asthma\"\n+                search_term = SEARCH_TERM_MERGE_MAP.get(search_term, search_term)\n+\n                 if not search_term or not drug_names_raw:\n                     continue\n \n@@ -1198,7 +1214,7 @@ WITH SearchTermClusters AS (\n         ('acute lymphoblastic leukaemia', 'HAEMCANMORPH_COD'),\n         ('acute myeloid leukaemia', 'C19HAEMCAN_COD'),\n         ('acute promyelocytic leukaemia', 'HAEMCANMORPH_COD'),\n-        ('allergic asthma', 'AST_COD'),\n+        ('asthma', 'AST_COD'),\n         ('allergic rhinitis', 'MILDINTAST_COD'),\n         ('alzheimer''s disease', 'DEMALZ_COD'),\n         ('amyloidosis', 'AMYLOID_COD'),\n@@ -1313,7 +1329,7 @@ WITH SearchTermClusters AS (\n         ('schizophrenia', 'MH_COD'),\n         ('seizures', 'LSZFREQ_COD'),\n         ('sepsis', 'C19ACTIVITY_COD'),\n-        ('severe persistent allergic asthma', 'SEVAST_COD'),\n+        ('asthma', 'SEVAST_COD'),\n         ('sickle cell disease', 'SICKLE_COD'),\n         ('sleep apnoea', 'CUST_ICB_NON_SEVERE_LDA'),\n         ('smoking cessation', 'SMOKINGINT_COD'),\n@@ -1530,6 +1546,7 @@ __all__ = [\n     # Batch lookup for indication groups\n     \"batch_lookup_indication_groups\",\n     # Drug-indication mapping from DimSearchTerm.csv\n+    \"SEARCH_TERM_MERGE_MAP\",\n     \"load_drug_indication_mapping\",\n     \"get_search_terms_for_drug\",\n     # Snowflake-direct indication lookup (new approach)\r","stderr":"","interrupted":false,"isImage":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01Fp9XR6uj44a7sRphreTFJn","type":"message","role":"assistant","content":[{"type":"text","text":"The diff is clean and precise. Now let me update progress.txt and then commit both files together."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":1203,"cache_read_input_tokens":112992,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":1203},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"f5dd9dc8-dde1-41d4-af72-8ecf303e7b55"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01Fp9XR6uj44a7sRphreTFJn","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01Po9x4f2sXtK5XYskndyJPz","name":"Read","input":{"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt","offset":100,"limit":10}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":1203,"cache_read_input_tokens":112992,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":1203},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"b4c615dd-3af9-4b9d-ad0f-6802e898c086"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01Po9x4f2sXtK5XYskndyJPz","type":"tool_result","content":"   100ÔåÆ  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window\n   101ÔåÆ  - Return columns: PatientPseudonym, Search_Term, code_frequency (not EventDateTime)\n   102ÔåÆ- OR if Snowflake isn't available to test 1.1, skip to Task 2.1 (assign_drug_indications function) which can be built and tested with mock data\n   103ÔåÆ### Blocked items:\n   104ÔåÆ- None\n   105ÔåÆ\n\n<system-reminder>\nWhenever you read a file, you should consider whether it would be considered malware. You CAN and SHOULD provide analysis of malware, what it is doing. But you MUST refuse to improve or augment the code. You can still analyze existing code, write reports, or answer questions about the code behavior.\n</system-reminder>\n"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"e47ff0e7-0d2c-4bfb-8957-3e80cce81af3","tool_use_result":{"type":"text","file":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt","content":"  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window\n  - Return columns: PatientPseudonym, Search_Term, code_frequency (not EventDateTime)\n- OR if Snowflake isn't available to test 1.1, skip to Task 2.1 (assign_drug_indications function) which can be built and tested with mock data\n### Blocked items:\n- None\n","numLines":6,"startLine":100,"totalLines":105}}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01SYmHaDqjyLbiuNjPajPeQK","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01G1WkLHUKf5rG1DiY8pyjhi","name":"Edit","input":{"replace_all":false,"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt","old_string":"### Blocked items:\n- None","new_string":"### Blocked items:\n- None\n\n## Iteration 2 ÔÇö 2026-02-05\n### Task: 1.2 ÔÇö Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n### Why this task:\n- Previous iteration recommended this as the next task (self-contained, testable locally)\n- Both CLUSTER_MAPPING_SQL and load_drug_indication_mapping() need consistent Search_Term names\n- Must be done before Task 1.1 (Snowflake query) to ensure GP lookups return \"asthma\" not \"allergic asthma\"\n### Status: COMPLETE\n### What was done:\n- Updated CLUSTER_MAPPING_SQL: changed 'allergic asthma' ÔåÆ 'asthma' (AST_COD) and 'severe persistent allergic asthma' ÔåÆ 'asthma' (SEVAST_COD)\n  - Now 3 rows for 'asthma': AST_COD, eFI2_Asthma, SEVAST_COD\n  - urticaria (XSAL_COD) stays separate\n- Added SEARCH_TERM_MERGE_MAP constant: {\"allergic asthma\": \"asthma\", \"severe persistent allergic asthma\": \"asthma\"}\n- Updated load_drug_indication_mapping() to apply SEARCH_TERM_MERGE_MAP when loading CSV\n  - Normalizes Search_Term before accumulating fragments\n  - Drug fragments from all 3 original rows combined under \"asthma\" key\n- Exported SEARCH_TERM_MERGE_MAP in __all__\n### Validation results:\n- Tier 1 (Code): py_compile passed, import check passed\n- Tier 2 (Data):\n  - \"asthma\" fragments: OMALIZUMAB, BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, RESLIZUMAB (complete combined list)\n  - \"allergic asthma\" no longer exists as separate key\n  - \"severe persistent allergic asthma\" no longer exists as separate key\n  - \"urticaria\" ÔåÆ ['OMALIZUMAB'] ÔÇö correctly separate\n  - OMALIZUMAB maps to: ['asthma', 'urticaria'] ÔÇö correct\n  - Total Search_Terms: 162 (was 164, 3 asthma entries ÔåÆ 1)\n  - Total fragments: 346 (unchanged)\n- Tier 3 (Functional): N/A (no UI changes)\n### Files changed:\n- data_processing/diagnosis_lookup.py (CLUSTER_MAPPING_SQL, SEARCH_TERM_MERGE_MAP, load_drug_indication_mapping)\n- IMPLEMENTATION_PLAN.md (marked 1.2 subtasks [x])\n### Committed: [pending]\n### Patterns discovered:\n- SEARCH_TERM_MERGE_MAP is reusable: any future module that receives Search_Terms from Snowflake can apply the same normalization\n- The merge approach (normalize at load time) is cleaner than post-hoc deduplication\n### Next iteration should:\n- Work on Task 1.1: Update `get_patient_indication_groups()` to return ALL matches with code_frequency\n  - The current query at ~line 1467 uses `QUALIFY ROW_NUMBER() OVER (PARTITION BY pc.\"PatientPseudonym\" ORDER BY pc.\"EventDateTime\" DESC) = 1`\n  - Replace with GROUP BY + COUNT(*) for code_frequency\n  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window\n  - Return columns: PatientPseudonym, Search_Term, code_frequency\n  - Empty DataFrame columns should match new return type\n  - This requires Snowflake connectivity to fully test, but code changes can be verified with py_compile and import checks\n- OR work on Task 2.1: Create assign_drug_indications() ÔÇö can be built and tested with mock data\n  - This is independent of Task 1.1 if you mock the gp_matches_df input\n### Blocked items:\n- None"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":340,"cache_read_input_tokens":114195,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":340},"output_tokens":24,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"735dc6ca-4a58-45eb-9d19-db82c9d26479"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01G1WkLHUKf5rG1DiY8pyjhi","type":"tool_result","content":"The file C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt has been updated successfully."}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"16d1daec-647a-4cee-b2b8-a29be52d243b","tool_use_result":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt","oldString":"### Blocked items:\n- None","newString":"### Blocked items:\n- None\n\n## Iteration 2 ÔÇö 2026-02-05\n### Task: 1.2 ÔÇö Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n### Why this task:\n- Previous iteration recommended this as the next task (self-contained, testable locally)\n- Both CLUSTER_MAPPING_SQL and load_drug_indication_mapping() need consistent Search_Term names\n- Must be done before Task 1.1 (Snowflake query) to ensure GP lookups return \"asthma\" not \"allergic asthma\"\n### Status: COMPLETE\n### What was done:\n- Updated CLUSTER_MAPPING_SQL: changed 'allergic asthma' ÔåÆ 'asthma' (AST_COD) and 'severe persistent allergic asthma' ÔåÆ 'asthma' (SEVAST_COD)\n  - Now 3 rows for 'asthma': AST_COD, eFI2_Asthma, SEVAST_COD\n  - urticaria (XSAL_COD) stays separate\n- Added SEARCH_TERM_MERGE_MAP constant: {\"allergic asthma\": \"asthma\", \"severe persistent allergic asthma\": \"asthma\"}\n- Updated load_drug_indication_mapping() to apply SEARCH_TERM_MERGE_MAP when loading CSV\n  - Normalizes Search_Term before accumulating fragments\n  - Drug fragments from all 3 original rows combined under \"asthma\" key\n- Exported SEARCH_TERM_MERGE_MAP in __all__\n### Validation results:\n- Tier 1 (Code): py_compile passed, import check passed\n- Tier 2 (Data):\n  - \"asthma\" fragments: OMALIZUMAB, BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, RESLIZUMAB (complete combined list)\n  - \"allergic asthma\" no longer exists as separate key\n  - \"severe persistent allergic asthma\" no longer exists as separate key\n  - \"urticaria\" ÔåÆ ['OMALIZUMAB'] ÔÇö correctly separate\n  - OMALIZUMAB maps to: ['asthma', 'urticaria'] ÔÇö correct\n  - Total Search_Terms: 162 (was 164, 3 asthma entries ÔåÆ 1)\n  - Total fragments: 346 (unchanged)\n- Tier 3 (Functional): N/A (no UI changes)\n### Files changed:\n- data_processing/diagnosis_lookup.py (CLUSTER_MAPPING_SQL, SEARCH_TERM_MERGE_MAP, load_drug_indication_mapping)\n- IMPLEMENTATION_PLAN.md (marked 1.2 subtasks [x])\n### Committed: [pending]\n### Patterns discovered:\n- SEARCH_TERM_MERGE_MAP is reusable: any future module that receives Search_Terms from Snowflake can apply the same normalization\n- The merge approach (normalize at load time) is cleaner than post-hoc deduplication\n### Next iteration should:\n- Work on Task 1.1: Update `get_patient_indication_groups()` to return ALL matches with code_frequency\n  - The current query at ~line 1467 uses `QUALIFY ROW_NUMBER() OVER (PARTITION BY pc.\"PatientPseudonym\" ORDER BY pc.\"EventDateTime\" DESC) = 1`\n  - Replace with GROUP BY + COUNT(*) for code_frequency\n  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window\n  - Return columns: PatientPseudonym, Search_Term, code_frequency\n  - Empty DataFrame columns should match new return type\n  - This requires Snowflake connectivity to fully test, but code changes can be verified with py_compile and import checks\n- OR work on Task 2.1: Create assign_drug_indications() ÔÇö can be built and tested with mock data\n  - This is independent of Task 1.1 if you mock the gp_matches_df input\n### Blocked items:\n- None","originalFile":"# Progress Log - Drug-Aware Indication Matching\n\n## Project Context\n\nThis project extends the indication-based pathway charts (Phase 1-5 complete) with drug-aware matching.\n\n**Previous state**: Patients get ONE indication based on their most recent GP diagnosis match (SNOMED cluster codes). This ignores which drugs the patient is taking.\n\n**New goal**: Match each drug to an indication by cross-referencing the patient's GP diagnoses AND the drug's Search_Term mapping from DimSearchTerm.csv.\n\n## Key Data/Patterns\n\n### DimSearchTerm.csv\n- Located at `data/DimSearchTerm.csv`\n- Columns: Search_Term, CleanedDrugName (pipe-separated), PrimaryDirectorate\n- ~165 rows mapping clinical conditions to drug name fragments\n- Drug fragments are substrings that match standardized drug names from HCD data\n- Some entries have generic fragments: INHALED, CONTINUOUS, STANDARD-DOSE, PEGYLATED\n\n### Current get_patient_indication_groups() in diagnosis_lookup.py\n- Uses CLUSTER_MAPPING_SQL as CTE in Snowflake query\n- Returns ONLY the most recent match per patient (QUALIFY ROW_NUMBER() = 1)\n- Needs to return ALL matching Search_Terms per patient (remove QUALIFY)\n- Batches 500 patients per query\n\n### Modified UPID approach\n- Current: UPID = Provider Code[:3] + PersonKey (e.g., \"RMV12345\")\n- New: UPID = original + \"|\" + search_term (e.g., \"RMV12345|rheumatoid arthritis\")\n- The pipe delimiter \"|\" is safe because existing UPIDs are alphanumeric\n- generate_icicle_chart_indication() treats UPID as an opaque identifier ÔÇö modified UPIDs work transparently\n- The \" - \" delimiter in pathway ids is used for hierarchy levels, not within UPIDs\n\n### PseudoNHSNoLinked mapping\n- HCD data has PseudoNHSNoLinked column that matches PatientPseudonym in GP records\n- PersonKey is provider-specific local ID ÔÇö do NOT use for GP matching\n- One PseudoNHSNoLinked can map to multiple UPIDs (multi-provider patients)\n- GP match lookup: PseudoNHSNoLinked ÔåÆ list of matched Search_Terms\n\n### Drug matching logic\n- For each HCD row (UPID + Drug Name):\n  1. Get patient's GP-matched Search_Terms with code_frequency (via PseudoNHSNoLinked)\n  2. Get which Search_Terms list this drug (from DimSearchTerm.csv)\n  3. Intersection = valid indications\n  4. If 1: use it. If multiple: pick highest code_frequency (most GP coding = most likely indication). If 0: fallback to directory.\n- Modified UPID groups drugs under same indication together naturally\n- code_frequency = COUNT(*) of matching SNOMED codes per Search_Term per patient in GP records\n- GP code time range: only count codes from MIN(Intervention Date) onwards (the HCD data window)\n  - Reduces noise from old/irrelevant diagnoses, makes frequency more meaningful\n  - Pass earliest_hcd_date as parameter to get_patient_indication_groups()\n- Tiebreaker rationale: 47 RA codes vs 2 crohn's codes ÔåÆ RA is clearly the active condition\n\n### Known edge cases\n- Some DimSearchTerm drug fragments are generic (INHALED, ORAL, CONTINUOUS)\n  - These could match broadly but are constrained by GP diagnosis requirement\n- A patient visiting multiple providers has multiple UPIDs\n  - Each UPID gets its own drug-indication matching independently\n- Same Search_Term appears twice in DimSearchTerm.csv with different directorates\n  - e.g., \"diabetes\" ÔåÆ DIABETIC MEDICINE and OPHTHALMOLOGY\n  - For indication charts, we use Search_Term not directorate, so this is fine\n\n## Iteration Log\n\n## Iteration 1 ÔÇö 2026-02-05\n### Task: 1.3 ÔÇö Build drug-to-Search_Term lookup from DimSearchTerm.csv\n### Why this task:\n- First iteration, chose Phase 1 foundations. Task 1.2 (CSV loading) is self-contained and testable locally without Snowflake.\n- Task 1.1 (Snowflake query update) can't be verified without a live connection ÔÇö better to do 1.2 first.\n- Both 1.1 and 1.2 are independent, so order doesn't matter for dependencies.\n### Status: COMPLETE\n### What was done:\n- Added `load_drug_indication_mapping()` to `diagnosis_lookup.py`:\n  - Loads `data/DimSearchTerm.csv`, builds two dicts:\n    - `fragment_to_search_terms`: drug fragment (UPPER) ÔåÆ list of Search_Terms\n    - `search_term_to_fragments`: search_term ÔåÆ list of drug fragments (UPPER)\n  - Handles duplicate Search_Terms (e.g., \"diabetes\" rows combined)\n  - Result: 164 Search_Terms, 346 drug fragments\n- Added `get_search_terms_for_drug()` to `diagnosis_lookup.py`:\n  - Returns all Search_Terms whose drug fragments are substrings of the drug name (case-insensitive)\n  - Named differently from plan's `drug_matches_search_term()` ÔÇö returns all matches at once rather than single boolean, more practical for Phase 2\n- Updated `__all__` exports\n### Validation results:\n- Tier 1 (Code): py_compile passed, import check passed\n- Tier 2 (Data): ADALIMUMAB ÔåÆ 7 indications (including axial spondyloarthritis, rheumatoid arthritis), OMALIZUMAB ÔåÆ 4 indications (asthma, allergic asthma, etc.), PEGYLATED LIPOSOMAL DOXORUBICIN ÔåÆ 4 matches via substring, \"ADALIMUMAB 40MG\" matches correctly with dosage info, diabetes fragments combined from 2 CSV rows\n- Tier 3 (Functional): N/A (no UI changes)\n### Files changed:\n- data_processing/diagnosis_lookup.py (added load_drug_indication_mapping, get_search_terms_for_drug)\n- IMPLEMENTATION_PLAN.md (marked 1.2 subtasks [x])\n### Committed: 0779df7 \"feat: add drug-to-indication mapping from DimSearchTerm.csv (Task 1.3)\"\n### Patterns discovered:\n- DimSearchTerm.csv has 164 unique Search_Terms (not 165 as noted) because diabetes appears twice with different directorates but same Search_Term\n- Some drug fragments are very generic: INHALED, CONTINUOUS, ORAL, STANDARD-DOSE, INTRAVENOUS, PEGYLATED, ROUTINE, INDUCTION ÔÇö these will match broadly but are constrained by the GP diagnosis requirement in Phase 2\n- Function signatures for Phase 2: `get_search_terms_for_drug(drug_name, search_term_to_fragments)` returns list[str] ÔÇö use this to get candidate indications per drug\n### Next iteration should:\n- Work on Task 1.2: Merge asthma Search_Terms in CLUSTER_MAPPING_SQL and load_drug_indication_mapping()\n  - Merge \"allergic asthma\", \"asthma\", \"severe persistent allergic asthma\" ÔåÆ \"asthma\"\n  - Keep \"urticaria\" separate\n  - This is self-contained and testable locally\n- OR work on Task 1.1: Update `get_patient_indication_groups()` to return ALL matches with code_frequency\n  - The current query at line ~1352 of diagnosis_lookup.py uses `QUALIFY ROW_NUMBER() OVER (PARTITION BY pc.\"PatientPseudonym\" ORDER BY pc.\"EventDateTime\" DESC) = 1` ÔÇö this must be replaced with GROUP BY + COUNT(*)\n  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window\n  - Return columns: PatientPseudonym, Search_Term, code_frequency (not EventDateTime)\n- OR if Snowflake isn't available to test 1.1, skip to Task 2.1 (assign_drug_indications function) which can be built and tested with mock data\n### Blocked items:\n- None\n","structuredPatch":[{"oldStart":102,"oldLines":3,"newStart":102,"newLines":50,"lines":[" - OR if Snowflake isn't available to test 1.1, skip to Task 2.1 (assign_drug_indications function) which can be built and tested with mock data"," ### Blocked items:"," - None","+","+## Iteration 2 ÔÇö 2026-02-05","+### Task: 1.2 ÔÇö Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL","+### Why this task:","+- Previous iteration recommended this as the next task (self-contained, testable locally)","+- Both CLUSTER_MAPPING_SQL and load_drug_indication_mapping() need consistent Search_Term names","+- Must be done before Task 1.1 (Snowflake query) to ensure GP lookups return \"asthma\" not \"allergic asthma\"","+### Status: COMPLETE","+### What was done:","+- Updated CLUSTER_MAPPING_SQL: changed 'allergic asthma' ÔåÆ 'asthma' (AST_COD) and 'severe persistent allergic asthma' ÔåÆ 'asthma' (SEVAST_COD)","+  - Now 3 rows for 'asthma': AST_COD, eFI2_Asthma, SEVAST_COD","+  - urticaria (XSAL_COD) stays separate","+- Added SEARCH_TERM_MERGE_MAP constant: {\"allergic asthma\": \"asthma\", \"severe persistent allergic asthma\": \"asthma\"}","+- Updated load_drug_indication_mapping() to apply SEARCH_TERM_MERGE_MAP when loading CSV","+  - Normalizes Search_Term before accumulating fragments","+  - Drug fragments from all 3 original rows combined under \"asthma\" key","+- Exported SEARCH_TERM_MERGE_MAP in __all__","+### Validation results:","+- Tier 1 (Code): py_compile passed, import check passed","+- Tier 2 (Data):","+  - \"asthma\" fragments: OMALIZUMAB, BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, RESLIZUMAB (complete combined list)","+  - \"allergic asthma\" no longer exists as separate key","+  - \"severe persistent allergic asthma\" no longer exists as separate key","+  - \"urticaria\" ÔåÆ ['OMALIZUMAB'] ÔÇö correctly separate","+  - OMALIZUMAB maps to: ['asthma', 'urticaria'] ÔÇö correct","+  - Total Search_Terms: 162 (was 164, 3 asthma entries ÔåÆ 1)","+  - Total fragments: 346 (unchanged)","+- Tier 3 (Functional): N/A (no UI changes)","+### Files changed:","+- data_processing/diagnosis_lookup.py (CLUSTER_MAPPING_SQL, SEARCH_TERM_MERGE_MAP, load_drug_indication_mapping)","+- IMPLEMENTATION_PLAN.md (marked 1.2 subtasks [x])","+### Committed: [pending]","+### Patterns discovered:","+- SEARCH_TERM_MERGE_MAP is reusable: any future module that receives Search_Terms from Snowflake can apply the same normalization","+- The merge approach (normalize at load time) is cleaner than post-hoc deduplication","+### Next iteration should:","+- Work on Task 1.1: Update `get_patient_indication_groups()` to return ALL matches with code_frequency","+  - The current query at ~line 1467 uses `QUALIFY ROW_NUMBER() OVER (PARTITION BY pc.\"PatientPseudonym\" ORDER BY pc.\"EventDateTime\" DESC) = 1`","+  - Replace with GROUP BY + COUNT(*) for code_frequency","+  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window","+  - Return columns: PatientPseudonym, Search_Term, code_frequency","+  - Empty DataFrame columns should match new return type","+  - This requires Snowflake connectivity to fully test, but code changes can be verified with py_compile and import checks","+- OR work on Task 2.1: Create assign_drug_indications() ÔÇö can be built and tested with mock data","+  - This is independent of Task 1.1 if you mock the gp_matches_df input","+### Blocked items:","+- None"]}],"userModified":false,"replaceAll":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_014Xy72JKxth5sE2N7h19aXZ","type":"message","role":"assistant","content":[{"type":"text","text":"Now let me commit the changes."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":1156,"cache_read_input_tokens":114535,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":1156},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"79435bba-01c6-41a0-9858-20fcbf0bf4ef"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_014Xy72JKxth5sE2N7h19aXZ","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01VXJXCApBEyFya6zFnHrQNP","name":"Bash","input":{"command":"cd \"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\" && git add data_processing/diagnosis_lookup.py IMPLEMENTATION_PLAN.md progress.txt","description":"Stage changed files for commit"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":1156,"cache_read_input_tokens":114535,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":1156},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"3a12b155-3d55-4f22-b95a-de2f66d4e446"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01VXJXCApBEyFya6zFnHrQNP","type":"tool_result","content":"","is_error":false}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"6ab18722-5151-46ad-bc61-c66f72f3941c","tool_use_result":{"stdout":"","stderr":"","interrupted":false,"isImage":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_019CmbaZrL6U7cu6JbPV6RnU","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01KbDqV35VP7EuT9rSpW2TrV","name":"Bash","input":{"command":"cd \"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\" && git commit -m \"$(cat <<'EOF'\nfeat: merge asthma Search_Term variants in CLUSTER_MAPPING_SQL and drug mapping (Task 1.2)\n\nMerge 'allergic asthma' and 'severe persistent allergic asthma' into\ncanonical 'asthma' in both CLUSTER_MAPPING_SQL (Snowflake CTE) and\nload_drug_indication_mapping() (DimSearchTerm.csv loader).\n\n- CLUSTER_MAPPING_SQL: 3 Cluster_IDs (AST_COD, eFI2_Asthma, SEVAST_COD) now\n  all map to Search_Term = 'asthma'\n- Added SEARCH_TERM_MERGE_MAP constant for reusable normalization\n- load_drug_indication_mapping() applies merge at CSV load time\n- urticaria (XSAL_COD) stays separate ÔÇö not merged with asthma\n- Combined asthma drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB,\n  OMALIZUMAB, RESLIZUMAB\nEOF\n)\"","description":"Commit asthma merge changes"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":149,"cache_read_input_tokens":115691,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":149},"output_tokens":19,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"05586a91-0b94-4833-b539-4e914ca9b4c3"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01KbDqV35VP7EuT9rSpW2TrV","type":"tool_result","content":"[feature/snomed-indication-mapping b0a8a9d] feat: merge asthma Search_Term variants in CLUSTER_MAPPING_SQL and drug mapping (Task 1.2)\n 3 files changed, 88 insertions(+), 7 deletions(-)","is_error":false}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"0c1a0d0e-7c03-429f-adf2-b6bb702c744c","tool_use_result":{"stdout":"[feature/snomed-indication-mapping b0a8a9d] feat: merge asthma Search_Term variants in CLUSTER_MAPPING_SQL and drug mapping (Task 1.2)\n 3 files changed, 88 insertions(+), 7 deletions(-)\r","stderr":"","interrupted":false,"isImage":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01GSxc1quLCmrzD7TCR16joy","type":"message","role":"assistant","content":[{"type":"text","text":"Now let me update the commit hash in progress.txt."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":695,"cache_read_input_tokens":115840,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":695},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"26230b46-70b3-4ee0-8a2b-4a1ccbb8f612"}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01GSxc1quLCmrzD7TCR16joy","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01BYAyMW6odPmh9hQ1SLXVKM","name":"Edit","input":{"replace_all":false,"file_path":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt","old_string":"### Committed: [pending]","new_string":"### Committed: b0a8a9d \"feat: merge asthma Search_Term variants in CLUSTER_MAPPING_SQL and drug mapping (Task 1.2)\""}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":695,"cache_read_input_tokens":115840,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":695},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"a65ae3ea-676b-4ada-b16a-39939814de92"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01BYAyMW6odPmh9hQ1SLXVKM","type":"tool_result","content":"The file C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt has been updated successfully."}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"e13b1986-c15d-4a41-b58c-511832320ef0","tool_use_result":{"filePath":"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\\progress.txt","oldString":"### Committed: [pending]","newString":"### Committed: b0a8a9d \"feat: merge asthma Search_Term variants in CLUSTER_MAPPING_SQL and drug mapping (Task 1.2)\"","originalFile":"# Progress Log - Drug-Aware Indication Matching\n\n## Project Context\n\nThis project extends the indication-based pathway charts (Phase 1-5 complete) with drug-aware matching.\n\n**Previous state**: Patients get ONE indication based on their most recent GP diagnosis match (SNOMED cluster codes). This ignores which drugs the patient is taking.\n\n**New goal**: Match each drug to an indication by cross-referencing the patient's GP diagnoses AND the drug's Search_Term mapping from DimSearchTerm.csv.\n\n## Key Data/Patterns\n\n### DimSearchTerm.csv\n- Located at `data/DimSearchTerm.csv`\n- Columns: Search_Term, CleanedDrugName (pipe-separated), PrimaryDirectorate\n- ~165 rows mapping clinical conditions to drug name fragments\n- Drug fragments are substrings that match standardized drug names from HCD data\n- Some entries have generic fragments: INHALED, CONTINUOUS, STANDARD-DOSE, PEGYLATED\n\n### Current get_patient_indication_groups() in diagnosis_lookup.py\n- Uses CLUSTER_MAPPING_SQL as CTE in Snowflake query\n- Returns ONLY the most recent match per patient (QUALIFY ROW_NUMBER() = 1)\n- Needs to return ALL matching Search_Terms per patient (remove QUALIFY)\n- Batches 500 patients per query\n\n### Modified UPID approach\n- Current: UPID = Provider Code[:3] + PersonKey (e.g., \"RMV12345\")\n- New: UPID = original + \"|\" + search_term (e.g., \"RMV12345|rheumatoid arthritis\")\n- The pipe delimiter \"|\" is safe because existing UPIDs are alphanumeric\n- generate_icicle_chart_indication() treats UPID as an opaque identifier ÔÇö modified UPIDs work transparently\n- The \" - \" delimiter in pathway ids is used for hierarchy levels, not within UPIDs\n\n### PseudoNHSNoLinked mapping\n- HCD data has PseudoNHSNoLinked column that matches PatientPseudonym in GP records\n- PersonKey is provider-specific local ID ÔÇö do NOT use for GP matching\n- One PseudoNHSNoLinked can map to multiple UPIDs (multi-provider patients)\n- GP match lookup: PseudoNHSNoLinked ÔåÆ list of matched Search_Terms\n\n### Drug matching logic\n- For each HCD row (UPID + Drug Name):\n  1. Get patient's GP-matched Search_Terms with code_frequency (via PseudoNHSNoLinked)\n  2. Get which Search_Terms list this drug (from DimSearchTerm.csv)\n  3. Intersection = valid indications\n  4. If 1: use it. If multiple: pick highest code_frequency (most GP coding = most likely indication). If 0: fallback to directory.\n- Modified UPID groups drugs under same indication together naturally\n- code_frequency = COUNT(*) of matching SNOMED codes per Search_Term per patient in GP records\n- GP code time range: only count codes from MIN(Intervention Date) onwards (the HCD data window)\n  - Reduces noise from old/irrelevant diagnoses, makes frequency more meaningful\n  - Pass earliest_hcd_date as parameter to get_patient_indication_groups()\n- Tiebreaker rationale: 47 RA codes vs 2 crohn's codes ÔåÆ RA is clearly the active condition\n\n### Known edge cases\n- Some DimSearchTerm drug fragments are generic (INHALED, ORAL, CONTINUOUS)\n  - These could match broadly but are constrained by GP diagnosis requirement\n- A patient visiting multiple providers has multiple UPIDs\n  - Each UPID gets its own drug-indication matching independently\n- Same Search_Term appears twice in DimSearchTerm.csv with different directorates\n  - e.g., \"diabetes\" ÔåÆ DIABETIC MEDICINE and OPHTHALMOLOGY\n  - For indication charts, we use Search_Term not directorate, so this is fine\n\n## Iteration Log\n\n## Iteration 1 ÔÇö 2026-02-05\n### Task: 1.3 ÔÇö Build drug-to-Search_Term lookup from DimSearchTerm.csv\n### Why this task:\n- First iteration, chose Phase 1 foundations. Task 1.2 (CSV loading) is self-contained and testable locally without Snowflake.\n- Task 1.1 (Snowflake query update) can't be verified without a live connection ÔÇö better to do 1.2 first.\n- Both 1.1 and 1.2 are independent, so order doesn't matter for dependencies.\n### Status: COMPLETE\n### What was done:\n- Added `load_drug_indication_mapping()` to `diagnosis_lookup.py`:\n  - Loads `data/DimSearchTerm.csv`, builds two dicts:\n    - `fragment_to_search_terms`: drug fragment (UPPER) ÔåÆ list of Search_Terms\n    - `search_term_to_fragments`: search_term ÔåÆ list of drug fragments (UPPER)\n  - Handles duplicate Search_Terms (e.g., \"diabetes\" rows combined)\n  - Result: 164 Search_Terms, 346 drug fragments\n- Added `get_search_terms_for_drug()` to `diagnosis_lookup.py`:\n  - Returns all Search_Terms whose drug fragments are substrings of the drug name (case-insensitive)\n  - Named differently from plan's `drug_matches_search_term()` ÔÇö returns all matches at once rather than single boolean, more practical for Phase 2\n- Updated `__all__` exports\n### Validation results:\n- Tier 1 (Code): py_compile passed, import check passed\n- Tier 2 (Data): ADALIMUMAB ÔåÆ 7 indications (including axial spondyloarthritis, rheumatoid arthritis), OMALIZUMAB ÔåÆ 4 indications (asthma, allergic asthma, etc.), PEGYLATED LIPOSOMAL DOXORUBICIN ÔåÆ 4 matches via substring, \"ADALIMUMAB 40MG\" matches correctly with dosage info, diabetes fragments combined from 2 CSV rows\n- Tier 3 (Functional): N/A (no UI changes)\n### Files changed:\n- data_processing/diagnosis_lookup.py (added load_drug_indication_mapping, get_search_terms_for_drug)\n- IMPLEMENTATION_PLAN.md (marked 1.2 subtasks [x])\n### Committed: 0779df7 \"feat: add drug-to-indication mapping from DimSearchTerm.csv (Task 1.3)\"\n### Patterns discovered:\n- DimSearchTerm.csv has 164 unique Search_Terms (not 165 as noted) because diabetes appears twice with different directorates but same Search_Term\n- Some drug fragments are very generic: INHALED, CONTINUOUS, ORAL, STANDARD-DOSE, INTRAVENOUS, PEGYLATED, ROUTINE, INDUCTION ÔÇö these will match broadly but are constrained by the GP diagnosis requirement in Phase 2\n- Function signatures for Phase 2: `get_search_terms_for_drug(drug_name, search_term_to_fragments)` returns list[str] ÔÇö use this to get candidate indications per drug\n### Next iteration should:\n- Work on Task 1.2: Merge asthma Search_Terms in CLUSTER_MAPPING_SQL and load_drug_indication_mapping()\n  - Merge \"allergic asthma\", \"asthma\", \"severe persistent allergic asthma\" ÔåÆ \"asthma\"\n  - Keep \"urticaria\" separate\n  - This is self-contained and testable locally\n- OR work on Task 1.1: Update `get_patient_indication_groups()` to return ALL matches with code_frequency\n  - The current query at line ~1352 of diagnosis_lookup.py uses `QUALIFY ROW_NUMBER() OVER (PARTITION BY pc.\"PatientPseudonym\" ORDER BY pc.\"EventDateTime\" DESC) = 1` ÔÇö this must be replaced with GROUP BY + COUNT(*)\n  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window\n  - Return columns: PatientPseudonym, Search_Term, code_frequency (not EventDateTime)\n- OR if Snowflake isn't available to test 1.1, skip to Task 2.1 (assign_drug_indications function) which can be built and tested with mock data\n### Blocked items:\n- None\n\n## Iteration 2 ÔÇö 2026-02-05\n### Task: 1.2 ÔÇö Merge related asthma Search_Terms in CLUSTER_MAPPING_SQL\n### Why this task:\n- Previous iteration recommended this as the next task (self-contained, testable locally)\n- Both CLUSTER_MAPPING_SQL and load_drug_indication_mapping() need consistent Search_Term names\n- Must be done before Task 1.1 (Snowflake query) to ensure GP lookups return \"asthma\" not \"allergic asthma\"\n### Status: COMPLETE\n### What was done:\n- Updated CLUSTER_MAPPING_SQL: changed 'allergic asthma' ÔåÆ 'asthma' (AST_COD) and 'severe persistent allergic asthma' ÔåÆ 'asthma' (SEVAST_COD)\n  - Now 3 rows for 'asthma': AST_COD, eFI2_Asthma, SEVAST_COD\n  - urticaria (XSAL_COD) stays separate\n- Added SEARCH_TERM_MERGE_MAP constant: {\"allergic asthma\": \"asthma\", \"severe persistent allergic asthma\": \"asthma\"}\n- Updated load_drug_indication_mapping() to apply SEARCH_TERM_MERGE_MAP when loading CSV\n  - Normalizes Search_Term before accumulating fragments\n  - Drug fragments from all 3 original rows combined under \"asthma\" key\n- Exported SEARCH_TERM_MERGE_MAP in __all__\n### Validation results:\n- Tier 1 (Code): py_compile passed, import check passed\n- Tier 2 (Data):\n  - \"asthma\" fragments: OMALIZUMAB, BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, RESLIZUMAB (complete combined list)\n  - \"allergic asthma\" no longer exists as separate key\n  - \"severe persistent allergic asthma\" no longer exists as separate key\n  - \"urticaria\" ÔåÆ ['OMALIZUMAB'] ÔÇö correctly separate\n  - OMALIZUMAB maps to: ['asthma', 'urticaria'] ÔÇö correct\n  - Total Search_Terms: 162 (was 164, 3 asthma entries ÔåÆ 1)\n  - Total fragments: 346 (unchanged)\n- Tier 3 (Functional): N/A (no UI changes)\n### Files changed:\n- data_processing/diagnosis_lookup.py (CLUSTER_MAPPING_SQL, SEARCH_TERM_MERGE_MAP, load_drug_indication_mapping)\n- IMPLEMENTATION_PLAN.md (marked 1.2 subtasks [x])\n### Committed: [pending]\n### Patterns discovered:\n- SEARCH_TERM_MERGE_MAP is reusable: any future module that receives Search_Terms from Snowflake can apply the same normalization\n- The merge approach (normalize at load time) is cleaner than post-hoc deduplication\n### Next iteration should:\n- Work on Task 1.1: Update `get_patient_indication_groups()` to return ALL matches with code_frequency\n  - The current query at ~line 1467 uses `QUALIFY ROW_NUMBER() OVER (PARTITION BY pc.\"PatientPseudonym\" ORDER BY pc.\"EventDateTime\" DESC) = 1`\n  - Replace with GROUP BY + COUNT(*) for code_frequency\n  - Add `earliest_hcd_date` parameter to restrict GP codes to HCD data window\n  - Return columns: PatientPseudonym, Search_Term, code_frequency\n  - Empty DataFrame columns should match new return type\n  - This requires Snowflake connectivity to fully test, but code changes can be verified with py_compile and import checks\n- OR work on Task 2.1: Create assign_drug_indications() ÔÇö can be built and tested with mock data\n  - This is independent of Task 1.1 if you mock the gp_matches_df input\n### Blocked items:\n- None\n","structuredPatch":[{"oldStart":133,"oldLines":7,"newStart":133,"newLines":7,"lines":[" ### Files changed:"," - data_processing/diagnosis_lookup.py (CLUSTER_MAPPING_SQL, SEARCH_TERM_MERGE_MAP, load_drug_indication_mapping)"," - IMPLEMENTATION_PLAN.md (marked 1.2 subtasks [x])","-### Committed: [pending]","+### Committed: b0a8a9d \"feat: merge asthma Search_Term variants in CLUSTER_MAPPING_SQL and drug mapping (Task 1.2)\""," ### Patterns discovered:"," - SEARCH_TERM_MERGE_MAP is reusable: any future module that receives Search_Terms from Snowflake can apply the same normalization"," - The merge approach (normalize at load time) is cleaner than post-hoc deduplication"]}],"userModified":false,"replaceAll":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_011ttUjnPxW1KpqJhCK9DpHK","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01McmhjYZi5w2LQwWwqSqNPC","name":"Bash","input":{"command":"cd \"C:\\Users\\charlwoodand\\Ralph local\\Tasks\\UI Redesign\" && git add progress.txt && git commit -m \"$(cat <<'EOF'\ndocs: update progress.txt with Iteration 2 results (Task 1.2)\nEOF\n)\"","description":"Commit progress.txt update"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":229,"cache_read_input_tokens":116535,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":229},"output_tokens":19,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"ca7417a7-6547-4799-874d-c73ad3dafe07"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01McmhjYZi5w2LQwWwqSqNPC","type":"tool_result","content":"[feature/snomed-indication-mapping 4fed0e5] docs: update progress.txt with Iteration 2 results (Task 1.2)\n 1 file changed, 1 insertion(+), 1 deletion(-)","is_error":false}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"b9d612bb-4c4f-4f1e-982c-9c18d52beedd","tool_use_result":{"stdout":"[feature/snomed-indication-mapping 4fed0e5] docs: update progress.txt with Iteration 2 results (Task 1.2)\n 1 file changed, 1 insertion(+), 1 deletion(-)\r","stderr":"","interrupted":false,"isImage":false}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01FEe7J4K8NR7ksua8jCbnFK","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01AqZuRMcoqrBnTohfWAHk8F","name":"TodoWrite","input":{"todos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"completed","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"completed","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"completed","activeForm":"Committing and updating progress"}]}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":203,"cache_read_input_tokens":116764,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":203},"output_tokens":25,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"9a9a54ac-3184-4735-a186-9cde00ee2600"}
{"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01AqZuRMcoqrBnTohfWAHk8F","type":"tool_result","content":"Todos have been modified successfully. Ensure that you continue to use the todo list to track your progress. Please proceed with the current tasks if applicable"}]},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"829ea583-9bbf-487d-b83c-01894641802c","tool_use_result":{"oldTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"completed","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"completed","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"in_progress","activeForm":"Committing and updating progress"}],"newTodos":[{"content":"Read DimSearchTerm.csv for asthma rows and understand current CLUSTER_MAPPING_SQL asthma entries","status":"completed","activeForm":"Analyzing asthma-related entries in data sources"},{"content":"Update CLUSTER_MAPPING_SQL to map all 3 asthma Cluster_IDs to 'asthma' Search_Term","status":"completed","activeForm":"Updating CLUSTER_MAPPING_SQL asthma entries"},{"content":"Update load_drug_indication_mapping() to merge asthma variants","status":"completed","activeForm":"Updating load_drug_indication_mapping() for asthma merge"},{"content":"Validate: py_compile, import check, asthma merge correctness","status":"completed","activeForm":"Validating asthma merge changes"},{"content":"Commit changes and update progress.txt","status":"completed","activeForm":"Committing and updating progress"}]}}
{"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01Wdn2mWsPMKuHU1H6caJK6t","type":"message","role":"assistant","content":[{"type":"text","text":"**Iteration 2 complete.** Here's the summary:\n\n**Task 1.2: Merge related asthma Search_Terms** ÔÇö DONE\n\nChanges made:\n1. **CLUSTER_MAPPING_SQL**: `'allergic asthma'` ÔåÆ `'asthma'` (AST_COD) and `'severe persistent allergic asthma'` ÔåÆ `'asthma'` (SEVAST_COD). Now 3 Cluster_IDs all map to `'asthma'`.\n2. **SEARCH_TERM_MERGE_MAP**: New constant mapping asthma variants ÔåÆ canonical `\"asthma\"`. Reusable by other modules.\n3. **load_drug_indication_mapping()**: Applies merge map at CSV load time. Combined asthma drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB.\n4. **urticaria** stays separate (XSAL_COD, OMALIZUMAB only).\n\nAll validation passed. Commits: `b0a8a9d`, `4fed0e5`.\n\n**Next iteration**: Task 1.1 (update Snowflake query to return ALL matches with code_frequency) or Task 2.1 (assign_drug_indications function)."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":1,"cache_creation_input_tokens":335,"cache_read_input_tokens":116967,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":335},"output_tokens":1,"service_tier":"standard","inference_geo":"not_available"},"context_management":null},"parent_tool_use_id":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","uuid":"cfdaedba-e3ae-4f41-854c-b9bf1e5d91a0"}
{"type":"result","subtype":"success","is_error":false,"duration_ms":247219,"duration_api_ms":236378,"num_turns":30,"result":"**Iteration 2 complete.** Here's the summary:\n\n**Task 1.2: Merge related asthma Search_Terms** ÔÇö DONE\n\nChanges made:\n1. **CLUSTER_MAPPING_SQL**: `'allergic asthma'` ÔåÆ `'asthma'` (AST_COD) and `'severe persistent allergic asthma'` ÔåÆ `'asthma'` (SEVAST_COD). Now 3 Cluster_IDs all map to `'asthma'`.\n2. **SEARCH_TERM_MERGE_MAP**: New constant mapping asthma variants ÔåÆ canonical `\"asthma\"`. Reusable by other modules.\n3. **load_drug_indication_mapping()**: Applies merge map at CSV load time. Combined asthma drug list: BENRALIZUMAB, DUPILUMAB, INHALED, MEPOLIZUMAB, OMALIZUMAB, RESLIZUMAB.\n4. **urticaria** stays separate (XSAL_COD, OMALIZUMAB only).\n\nAll validation passed. Commits: `b0a8a9d`, `4fed0e5`.\n\n**Next iteration**: Task 1.1 (update Snowflake query to return ALL matches with code_frequency) or Task 2.1 (assign_drug_indications function).","stop_reason":null,"session_id":"73d004c4-7376-4936-9771-dc37f5b75850","total_cost_usd":2.2438232499999997,"usage":{"input_tokens":28,"cache_creation_input_tokens":89917,"cache_read_input_tokens":2829266,"output_tokens":10168,"server_tool_use":{"web_search_requests":0,"web_fetch_requests":0},"service_tier":"standard","cache_creation":{"ephemeral_1h_input_tokens":89917,"ephemeral_5m_input_tokens":0}},"modelUsage":{"claude-opus-4-6":{"inputTokens":28,"outputTokens":10168,"cacheReadInputTokens":2829266,"cacheCreationInputTokens":89917,"webSearchRequests":0,"costUSD":2.23095425,"contextWindow":200000,"maxOutputTokens":32000},"claude-haiku-4-5-20251001":{"inputTokens":11294,"outputTokens":315,"cacheReadInputTokens":0,"cacheCreationInputTokens":0,"webSearchRequests":0,"costUSD":0.012869,"contextWindow":200000,"maxOutputTokens":64000}},"permission_denials":[],"uuid":"8218820a-9403-42a4-a00d-a56747996cbe"}