YanBoChen committed · Commit d37f4b2 · 1 Parent(s): 2ee61dc
WIP: while pre-processing the dataset, we found anomalies during dataset_treatment exploration, so we are adding test scripts to identify the problem
Browse files
- dataset/analysis/keyword_matching_test_results.json +151 -0
- dataset/keywords/treatment_keywords.txt +91 -104
- dataset/scripts/02_filter_treatment.py +77 -23
- dataset/scripts/check_subset_integrity.py +178 -0
- dataset/scripts/keyword_Match_Clean_for_subset_filter.txt +85 -0
- dataset/scripts/test_keyword_matching.py +175 -0
dataset/analysis/keyword_matching_test_results.json
ADDED
@@ -0,0 +1,151 @@
+{
+  "special_terms_matching": [
+    {
+      "clean_text": "Patient needs an x-ray of the chest",
+      "category": "x-ray variants",
+      "matched": "x-ray"
+    },
+    {
+      "clean_text": "Ordered chest xray",
+      "category": "x-ray variants",
+      "matched": "xray"
+    },
+    {
+      "clean_text": "X ray shows pneumonia",
+      "category": "x-ray variants",
+      "matched": "X ray"
+    },
+    {
+      "clean_text": "XRAY negative",
+      "category": "x-ray variants",
+      "matched": "XRAY"
+    },
+    {
+      "clean_text": "CT scan reveals nodule",
+      "category": "ct-scan variants",
+      "matched": "CT scan"
+    },
+    {
+      "clean_text": "CT-scan indicates mass",
+      "category": "ct-scan variants",
+      "matched": "CT-scan"
+    },
+    {
+      "clean_text": "Requires ctscan urgently",
+      "category": "ct-scan variants",
+      "matched": "ctscan"
+    },
+    {
+      "clean_text": "CTSCAN of abdomen",
+      "category": "ct-scan variants",
+      "matched": "CTSCAN"
+    },
+    {
+      "clean_text": "Point-of-care testing needed",
+      "category": "point-of-care variants",
+      "matched": "Point-of-care"
+    },
+    {
+      "clean_text": "Point of care ultrasound",
+      "category": "point-of-care variants",
+      "matched": "Point of care"
+    },
+    {
+      "clean_text": "POC testing results",
+      "category": "point-of-care variants",
+      "matched": ""
+    },
+    {
+      "clean_text": "Ordered both x-ray and CT scan",
+      "category": "mixed cases",
+      "matched": "x-ray|CT scan"
+    },
+    {
+      "clean_text": "XRAY and CTSCAN negative",
+      "category": "mixed cases",
+      "matched": "XRAY|CTSCAN"
+    },
+    {
+      "clean_text": "Multiple point-of-care tests with x-ray",
+      "category": "mixed cases",
+      "matched": "point-of-care|x-ray"
+    },
+    {
+      "clean_text": "No imaging mentioned",
+      "category": "negative cases",
+      "matched": ""
+    },
+    {
+      "clean_text": "Regular examination only",
+      "category": "negative cases",
+      "matched": ""
+    },
+    {
+      "clean_text": "Laboratory tests pending",
+      "category": "negative cases",
+      "matched": ""
+    }
+  ],
+  "basic_matching": [
+    {
+      "clean_text": "Emergency treatment required",
+      "category": "simple matches",
+      "matched": "Emergency"
+    },
+    {
+      "clean_text": "Acute condition observed",
+      "category": "simple matches",
+      "matched": "Acute"
+    },
+    {
+      "clean_text": "Urgent care needed",
+      "category": "simple matches",
+      "matched": "Urgent"
+    },
+    {
+      "clean_text": "EMERGENCY situation",
+      "category": "case variations",
+      "matched": "EMERGENCY"
+    },
+    {
+      "clean_text": "Acute RESPIRATORY failure",
+      "category": "case variations",
+      "matched": "Acute"
+    },
+    {
+      "clean_text": "URgent surgical intervention",
+      "category": "case variations",
+      "matched": "URgent"
+    },
+    {
+      "clean_text": "Emergency treatment for acute condition",
+      "category": "multiple matches",
+      "matched": "Emergency|acute"
+    },
+    {
+      "clean_text": "Urgent care in emergency department",
+      "category": "multiple matches",
+      "matched": "Urgent|emergency"
+    },
+    {
+      "clean_text": "Acute respiratory emergency",
+      "category": "multiple matches",
+      "matched": "Acute|emergency"
+    },
+    {
+      "clean_text": "Non-emergency situation",
+      "category": "partial words",
+      "matched": "emergency"
+    },
+    {
+      "clean_text": "Subacute condition",
+      "category": "partial words",
+      "matched": ""
+    },
+    {
+      "clean_text": "Emergency-related",
+      "category": "partial words",
+      "matched": "Emergency"
+    }
+  ]
+}
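Note the "partial words" rows above: with plain `\b` word boundaries a hyphen counts as a boundary, so "Non-emergency situation" and "Emergency-related" both match `emergency`, while "Subacute condition" correctly fails to match `acute`. This looks like the abnormality the commit message refers to. A minimal standalone sketch of the behavior:

```python
import re

# Hyphens are non-word characters, so \b fires on each side of "-".
pattern = re.compile(r"\b(?:emergency|acute)\b", re.IGNORECASE)

print(pattern.findall("Non-emergency situation"))  # ['emergency'] - hyphen counts as a boundary
print(pattern.findall("Emergency-related"))        # ['Emergency'] - same reason
print(pattern.findall("Subacute condition"))       # [] - no boundary inside "Subacute"
```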
dataset/keywords/treatment_keywords.txt
CHANGED
@@ -1,118 +1,105 @@
-
-
-
-
-normal saline
-crystalloids
-vasopressors
-Vasoconstrictor Agents
-Epinephrine
-Ondansetron
-Ibuprofen
-Morphine
-Lidocaine
+ACLS
+administer
+Adrenaline
+Advanced Cardiac Life Support
 Airway Management
-
-
-
-
-
-
-
-
-bipap
+alpha blocker
+Amiodarone
+analgesia
+Anesthesia Procedural
+Anti-Bacterial Agents
+antibiotic
+arterial line placement
+beta blocker
 Bi-level Positive Airway Pressure
-
-
+bipap
+Blood Transfusion
+Bosmin
 Cardiopulmonary Resuscitation
-ACLS
-Advanced Cardiac Life Support
-Defibrillation
 Cardioversion
-Blood Transfusion
-transfusion
-hemodynamic monitoring
-Hemodynamics
-central line placement
-Catheterization Central Venous
-arterial line placement
 Catheterization Arterial
-
-
-Wound Management
-Suturing
-Suture
-Tourniquet
+Catheterization Central Venous
+central line placement
 compression dressing
-
-
-
-
-Radiography
-point of care ultrasound
-POCUS
-Ultrasonography Point of Care
-ultrasound
-x-ray
-Radiography
+Computed Tomography
+cpap
+cpr
+crystalloids
 ct scan
-
-
-
-
-
-
-
-
-
-
-
-
-
-Supportive Care
-monitoring
-Patient Monitoring
-vital signs monitoring
-Vital Signs
+Defibrillation
+Dopamine
+Dosage Forms
+dose
+Drug Administration Routes
+Drug Therapy
+Epinephrine
+fluid
+fluid resuscitation
+hemodynamic monitoring
+Hemodynamics
+Hemostasis
+Ibuprofen
 icu transfer
-
-
-
+Insulin
+intervention
+intubation
+Intratracheal Intubation
+Intravenous Infusion
+iv fluids
+laboratory techniques
+laboratory testing
+levophed
+Lidocaine
 manage
-Patient Management
 management
-Patient Management
-intervention
-Therapeutic Intervention
-Therapy
 medication
-
-procedure
-Surgical Procedures Operative
-resuscitation
-Cardiopulmonary Resuscitation
-administer
-Drug Administration Routes
-dose
-Dosage Forms
+midazolam
 monitor
-
-
-
-Infusion Intravenous
-surgery
-Surgical Procedures
-antibiotic
-Anti-Bacterial Agents
-Dopamine
-Amiodarone
-levophed
-Norepinephrine
-Epinephrine
-Bosmin
-Adrenaline
-Insulin
+monitoring
+Morphine
+Nebulization
 nitroglycerin
 NTG
-
-
+Norepinephrine
+normal saline
+Ondansetron
+Oxygen
+Oxygen Inhalation Therapy
+oxygen therapy
+Patient Management
+Patient Monitoring
+POCUS
+point of care ultrasound
+procedural sedation
+procedure
+radiologic imaging
+Radiography
+resuscitation
+Sedation
+splinting
+Splints
+supportive care
+surgical procedures
+Surgical Procedures Operative
+surgery
+Suture
+Suturing
+Therapeutic Intervention
+Therapeutics
+Therapy
+tourniquet
+transfusion
+treat
+treatment
+Ultrasonography Point of Care
+ultrasound
+Vasoconstrictor Agents
+vasopressors
+ventilation support
+Ventilators
+Vital Signs
+vital signs monitoring
+wound care
+Wound Dressing
+Wound Management
+X-Ray
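The rewritten keyword list above is essentially the old vocabulary de-duplicated (duplicates such as the repeated `Radiography`, `Epinephrine`, and `Patient Management` entries are removed) and sorted case-insensitively, with new terms folded in; a few entries such as `surgery` sit slightly out of order. A sketch of how such a normalization could be produced — an assumed workflow, since the commit only shows the resulting file:

```python
# Hypothetical normalization pass: dedupe case-insensitively, then sort.
# (Assumed workflow - the commit only contains the resulting keyword file.)
with open("treatment_keywords.txt", encoding="utf-8") as f:
    raw = [line.strip() for line in f if line.strip()]

seen = {}
for kw in raw:
    seen.setdefault(kw.lower(), kw)  # keep the first spelling of each keyword

with open("treatment_keywords.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(seen.values(), key=str.lower)) + "\n")
```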
dataset/scripts/02_filter_treatment.py
CHANGED
@@ -4,31 +4,60 @@ import os
 import re
 import pandas as pd
 
-
-
-print(f"📥 Loading keywords from: {
-
-
-
-
-
-
+def preprocess_keywords(keywords_file):
+    """Load and preprocess treatment keywords"""
+    print(f"📥 Loading keywords from: {keywords_file}")
+
+    # Special medical terms with common variants
+    special_terms = {
+        'x-ray': ['x-ray', 'x ray', 'xray'],
+        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
+        'point-of-care': ['point-of-care', 'point of care']
+    }
+
+    # Read and preprocess keywords
+    with open(keywords_file, "r", encoding="utf-8") as f:
+        keywords = [line.strip().lower() for line in f if line.strip()]
+
+    # Process keywords and handle special terms
+    processed_keywords = []
+    for kw in keywords:
+        if kw in special_terms:
+            processed_keywords.extend(special_terms[kw])
+        else:
+            processed_keywords.append(kw)
+
+    print(f"   Loaded {len(keywords)} base keywords")
+    print(f"   Processed into {len(processed_keywords)} keyword variants")
+    return processed_keywords
+
+def create_regex_pattern(keywords):
+    """Create compiled regex pattern with word boundaries"""
+    pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+    return re.compile(pattern, re.IGNORECASE)
+
+# Step 1: Read source data
 print("1️⃣ Reading emergency subset...")
 emergency_path = "../dataset/emergency/emergency_subset.jsonl"
 df = pd.read_json(emergency_path, lines=True)
 print(f"   Loaded {len(df)} emergency records")
+print(f"   Contains emergency keywords in 'matched' column")
+
+# Step 2: Load treatment keywords and match
+print("2️⃣ Loading treatment keywords and matching...")
+treatment_keywords = preprocess_keywords("../keywords/treatment_keywords.txt")
+pattern = create_regex_pattern(treatment_keywords)
 
-# Step
-print("
-
-
+# Step 3: Process text and match keywords
+print("3️⃣ Processing text and matching keywords...")
+# Create lowercase version of text for matching
+df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
 
-# Match treatment keywords and add metadata
+# Match treatment keywords and add metadata columns
+# Note: Preserving original 'matched' column from emergency subset
 df["treatment_matched"] = (
-    df["
-
-    .str.findall(pattern, flags=re.IGNORECASE)
-    .apply(lambda lst: "|".join(lst) if lst else "")
+    df["clean_text_lower"]
+    .apply(lambda text: "|".join(pattern.findall(text)) or "")
 )
 df["has_treatment"] = df["treatment_matched"].str.len() > 0
 
@@ -36,14 +65,39 @@ df["has_treatment"] = df["treatment_matched"].str.len() > 0
 df["type"] = "treatment"  # Document type identifier
 df["condition"] = ""  # Reserved for future condition mapping
 
+# Verify columns
+print("   Verifying columns...")
+print(f"   - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
+print(f"   - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")
+
+# Calculate statistics
 cnt_treat = df["has_treatment"].sum()
-
+avg_matches = (
+    df[df["has_treatment"]]["treatment_matched"]
+    .str.count(r"\|")
+    .add(1)
+    .mean()
+)
 
-
-print("
+print(f"   Found {cnt_treat} treatment-related records")
+print(f"   Average treatment keywords per record: {avg_matches:.2f}")
+
+# Step 4: Save treatment subset
+print("4️⃣ Saving treatment subset...")
 out_dir = "../dataset/emergency_treatment"
 os.makedirs(out_dir, exist_ok=True)
-
+
+# Select records with treatment keywords
+subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning
+
+# Verify final subset columns
+print("   Final subset columns:")
+print(f"   - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
+print(f"   - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")
+
 subset.to_json(f"{out_dir}/emergency_treatment_subset.jsonl", orient="records", lines=True)
 subset.to_csv(f"{out_dir}/emergency_treatment_subset.csv", index=False)
-
+
+print(f"✅ Generated treatment subset with {len(subset)} records")
+print(f"   Saved in: {out_dir}")
+print(f"   Contains both emergency and treatment keywords")
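Behavioral note on the rewrite above: the old code ran `str.findall` with `re.IGNORECASE` over the original text, so matches kept their original casing (as seen in the saved test results); the new code matches a pre-lowered `clean_text_lower` column with a precompiled pattern, so matched strings now come back lowercased. A minimal sketch of the difference, under that assumption:

```python
import re

pattern = re.compile(r"\b(?:x-ray|xray)\b", re.IGNORECASE)
text = "XRAY and x-ray ordered"

# Old approach: match the original text, so output keeps its casing.
print(pattern.findall(text))          # ['XRAY', 'x-ray']

# New approach: match a lowercased copy, so output is lowercased too.
print(pattern.findall(text.lower()))  # ['xray', 'x-ray']
```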
dataset/scripts/check_subset_integrity.py
ADDED
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+# /scripts/check_subset_integrity.py
+
+import pandas as pd
+import json
+from pathlib import Path
+from tqdm import tqdm
+
+def check_subset_sample(file_path, sample_size=100):
+    """
+    Check the first N rows of the subset file
+    """
+    print(f"\n{'='*60}")
+    print(f"📊 Sampling Analysis (first {sample_size} rows)")
+    print(f"{'='*60}")
+
+    # Read sample
+    print(f"\n1️⃣ Reading sample from: {file_path}")
+    sample_df = pd.read_csv(file_path, nrows=sample_size)
+
+    # Basic information
+    print("\n2️⃣ Basic Information:")
+    print(f"   Columns present: {', '.join(sample_df.columns.tolist())}")
+
+    # Check matched columns
+    print("\n3️⃣ Matched Columns Status:")
+    matched_stats = {
+        'matched': {
+            'non_null': int(sample_df['matched'].notna().sum()),
+            'non_empty': int((sample_df['matched'].str.len() > 0).sum()),
+            'unique_values': sample_df['matched'].nunique()
+        },
+        'treatment_matched': {
+            'non_null': int(sample_df['treatment_matched'].notna().sum()),
+            'non_empty': int((sample_df['treatment_matched'].str.len() > 0).sum()),
+            'unique_values': sample_df['treatment_matched'].nunique()
+        }
+    }
+
+    for col, stats in matched_stats.items():
+        print(f"\n   {col}:")
+        print(f"   - Non-null count: {stats['non_null']}/{sample_size}")
+        print(f"   - Non-empty count: {stats['non_empty']}/{sample_size}")
+        print(f"   - Unique values: {stats['unique_values']}")
+
+    # Sample rows with both matches (parenthesized: & binds tighter than >)
+    print("\n4️⃣ Sample Rows with Both Matches:")
+    both_matched = sample_df[
+        (sample_df['matched'].notna() & (sample_df['matched'].str.len() > 0)) &
+        (sample_df['treatment_matched'].notna() & (sample_df['treatment_matched'].str.len() > 0))
+    ].head(3)
+
+    for idx, row in both_matched.iterrows():
+        print(f"\n   Row {idx}:")
+        print(f"   - Emergency keywords: {row['matched']}")
+        print(f"   - Treatment keywords: {row['treatment_matched']}")
+
+    return matched_stats
+
+def analyze_large_file(file_path, chunk_size=1000):
+    """
+    Analyze the entire file in chunks
+    """
+    print(f"\n{'='*60}")
+    print(f"📈 Full File Analysis (chunk size: {chunk_size})")
+    print(f"{'='*60}")
+
+    stats = {
+        'total_rows': 0,
+        'matched_stats': {
+            'non_null': 0,
+            'non_empty': 0
+        },
+        'treatment_matched_stats': {
+            'non_null': 0,
+            'non_empty': 0
+        },
+        'both_matched': 0
+    }
+
+    print("\n1️⃣ Processing file in chunks...")
+    chunks = pd.read_csv(file_path, chunksize=chunk_size)
+
+    for chunk in tqdm(chunks, desc="Analyzing chunks"):
+        # Update total rows
+        stats['total_rows'] += len(chunk)
+
+        # Update matched stats
+        stats['matched_stats']['non_null'] += chunk['matched'].notna().sum()
+        stats['matched_stats']['non_empty'] += (chunk['matched'].str.len() > 0).sum()
+
+        # Update treatment_matched stats
+        stats['treatment_matched_stats']['non_null'] += chunk['treatment_matched'].notna().sum()
+        stats['treatment_matched_stats']['non_empty'] += (chunk['treatment_matched'].str.len() > 0).sum()
+
+        # Update both matched count (parenthesized: & binds tighter than >)
+        stats['both_matched'] += (
+            (chunk['matched'].notna() & (chunk['matched'].str.len() > 0)) &
+            (chunk['treatment_matched'].notna() & (chunk['treatment_matched'].str.len() > 0))
+        ).sum()
+
+    return stats
+
+def generate_report(sample_stats, full_stats, output_dir):
+    """
+    Generate and save analysis report
+    """
+    print(f"\n{'='*60}")
+    print(f"📝 Generating Report")
+    print(f"{'='*60}")
+
+    report = {
+        'sample_analysis': sample_stats,
+        'full_file_analysis': {
+            'total_records': int(full_stats['total_rows']),
+            'matched_column': {
+                'non_null_count': int(full_stats['matched_stats']['non_null']),
+                'non_empty_count': int(full_stats['matched_stats']['non_empty']),
+                'null_percentage': float(
+                    (full_stats['total_rows'] - full_stats['matched_stats']['non_null'])
+                    / full_stats['total_rows'] * 100
+                )
+            },
+            'treatment_matched_column': {
+                'non_null_count': int(full_stats['treatment_matched_stats']['non_null']),
+                'non_empty_count': int(full_stats['treatment_matched_stats']['non_empty']),
+                'null_percentage': float(
+                    (full_stats['total_rows'] - full_stats['treatment_matched_stats']['non_null'])
+                    / full_stats['total_rows'] * 100
+                )
+            },
+            'both_matched_count': int(full_stats['both_matched']),
+            'both_matched_percentage': float(
+                full_stats['both_matched'] / full_stats['total_rows'] * 100
+            )
+        }
+    }
+
+    # Create output directory
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Save report
+    report_file = output_dir / 'integrity_check_report.json'
+    with open(report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2, ensure_ascii=False)
+
+    print(f"\nReport saved to: {report_file}")
+
+    # Print summary
+    print("\n📊 Summary:")
+    print(f"Total records: {report['full_file_analysis']['total_records']}")
+    print(f"Records with both matches: {report['full_file_analysis']['both_matched_count']} "
+          f"({report['full_file_analysis']['both_matched_percentage']:.2f}%)")
+
+    return report
+
+def main():
+    """
+    Main execution function
+    """
+    # Configuration
+    input_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
+    output_dir = "../analysis/integrity_check"
+
+    print(f"\n🔍 Starting Subset Integrity Check")
+    print(f"Input file: {input_file}")
+    print(f"Output directory: {output_dir}")
+
+    # Run analysis
+    sample_stats = check_subset_sample(input_file)
+    full_stats = analyze_large_file(input_file)
+    report = generate_report(sample_stats, full_stats, output_dir)
+
+    print("\n✅ Integrity check complete!")
+
+if __name__ == "__main__":
+    main()
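Note the parenthesization in the `both_matched` filters above: in pandas expressions `&` binds tighter than `>`, so the unparenthesized form `col.notna() & col.str.len() > 0` would parse as `(col.notna() & col.str.len()) > 0`. A minimal sketch of the pitfall (behavior checked against typical pandas versions; exact dtype handling can vary):

```python
import pandas as pd

s = pd.Series(["ab", "abc", ""])

# Without parentheses this parses as (s.notna() & s.str.len()) > 0:
# a bitwise AND of the boolean mask (1) with the integer length,
# which silently drops any even-length string ("ab" -> 1 & 2 == 0).
buggy = s.notna() & s.str.len() > 0
fixed = s.notna() & (s.str.len() > 0)

print(buggy.tolist())  # [False, True, False] - "ab" wrongly excluded
print(fixed.tolist())  # [True, True, False]
```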
dataset/scripts/keyword_Match_Clean_for_subset_filter.txt
ADDED
@@ -0,0 +1,85 @@
+# Keyword Matching and Text Cleaning Logic for Subset Filtering
+
+## 1. Keyword Preprocessing
+```python
+def preprocess_keywords(keywords_file):
+    # Handle special medical term variants
+    special_terms = {
+        'x-ray': ['x-ray', 'x ray', 'xray'],
+        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
+        'point-of-care': ['point-of-care', 'point of care']
+    }
+
+    # Read and preprocess keywords
+    with open(keywords_file, "r", encoding="utf-8") as f:
+        keywords = [
+            line.strip()  # Remove whitespace
+            .lower()      # Convert to lowercase
+            for line in f
+            if line.strip()
+        ]
+
+    # Process special term variants
+    processed_keywords = []
+    for kw in keywords:
+        if kw in special_terms:
+            processed_keywords.extend(special_terms[kw])
+        else:
+            processed_keywords.append(kw)
+
+    return processed_keywords
+```
+
+## 2. Regex Pattern Processing
+```python
+def create_regex_pattern(keywords):
+    # Simple word boundary matching
+    pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+    return re.compile(pattern, re.IGNORECASE)
+```
+
+### Regex Pattern Explanation:
+- `\b`: Word boundary matching
+- `(?:...)`: Non-capturing group
+- `re.escape()`: Escape special characters
+- `re.IGNORECASE`: Case-insensitive matching
+
+## 3. Text Preprocessing and Matching
+```python
+# Create lowercase version of text
+df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
+
+# Match keywords
+df["treatment_matched"] = (
+    df["clean_text_lower"]
+    .apply(lambda text: "|".join(pattern.findall(text)) or "")
+)
+```
+
+## 4. Processing Logic Details
+
+### 4.1 Special Term Handling Rationale
+- Common variants in medical literature
+- Maintain semantic consistency
+- Improve matching accuracy
+
+### 4.2 Regex Matching Strategy
+- Word boundary matching for complete terms
+- Precompiled patterns for performance
+- Case-insensitive matching for flexibility
+
+### 4.3 Text Preprocessing Steps
+1. Fill null values (fillna)
+2. Convert to lowercase (str.lower)
+3. Create dedicated lowercase column to avoid repeated conversions
+
+## 5. Output Format
+- matched column: Pipe-separated matched keywords
+- type column: Document type identifier ("emergency" or "treatment")
+- condition column: Reserved for future condition mapping
+
+## 6. Important Considerations
+1. Regular maintenance required for special term variants
+2. Precompiled regex patterns for performance optimization
+3. Dedicated text preprocessing storage to avoid redundant computations
+4. Maintain consistent column structure between emergency and treatment subsets
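Putting the documented pieces together, a toy end-to-end run (standalone sketch; the short inline keyword list stands in for `treatment_keywords.txt`):

```python
import re
import pandas as pd

# Stand-in for the preprocessed keyword variants.
keywords = ['x-ray', 'x ray', 'xray', 'ct scan']
pattern = re.compile(r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b", re.IGNORECASE)

df = pd.DataFrame({'clean_text': ["Ordered chest XRAY and CT scan", None]})
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
df['treatment_matched'] = df['clean_text_lower'].apply(
    lambda text: "|".join(pattern.findall(text)) or ""
)
print(df['treatment_matched'].tolist())  # ['xray|ct scan', '']
```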
dataset/scripts/test_keyword_matching.py
ADDED
@@ -0,0 +1,175 @@
+import pandas as pd
+import re
+from pathlib import Path
+import json
+
+def test_special_terms_matching():
+    """
+    Test special medical term matching logic
+    """
+    # Test cases for different scenarios
+    test_cases = {
+        "x-ray variants": [
+            "Patient needs an x-ray of the chest",
+            "Ordered chest xray",
+            "X ray shows pneumonia",
+            "XRAY negative"
+        ],
+        "ct-scan variants": [
+            "CT scan reveals nodule",
+            "CT-scan indicates mass",
+            "Requires ctscan urgently",
+            "CTSCAN of abdomen"
+        ],
+        "point-of-care variants": [
+            "Point-of-care testing needed",
+            "Point of care ultrasound",
+            "POC testing results"
+        ],
+        "mixed cases": [
+            "Ordered both x-ray and CT scan",
+            "XRAY and CTSCAN negative",
+            "Multiple point-of-care tests with x-ray"
+        ],
+        "negative cases": [
+            "No imaging mentioned",
+            "Regular examination only",
+            "Laboratory tests pending"
+        ]
+    }
+
+    # Special terms dictionary (from keyword_Match_Clean_for_subset_filter.txt)
+    special_terms = {
+        'x-ray': ['x-ray', 'x ray', 'xray'],
+        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
+        'point-of-care': ['point-of-care', 'point of care']
+    }
+
+    # Create test DataFrame
+    test_df = pd.DataFrame({
+        'clean_text': [text for cases in test_cases.values() for text in cases],
+        'category': [cat for cat, texts in test_cases.items() for _ in texts]
+    })
+
+    # Process keywords
+    processed_keywords = []
+    for term, variants in special_terms.items():
+        processed_keywords.extend(variants)
+
+    # Create regex pattern
+    pattern = r"\b(?:" + "|".join(map(re.escape, processed_keywords)) + r")\b"
+
+    # Apply matching logic
+    test_df['matched'] = (
+        test_df['clean_text']
+        .fillna("")
+        .str.findall(pattern, flags=re.IGNORECASE)
+        .apply(lambda lst: "|".join(lst) if lst else "")
+    )
+
+    return test_df
+
+def test_basic_matching():
+    """
+    Test basic keyword matching functionality
+    """
+    # Basic test cases
+    test_cases = {
+        "simple matches": [
+            "Emergency treatment required",
+            "Acute condition observed",
+            "Urgent care needed"
+        ],
+        "case variations": [
+            "EMERGENCY situation",
+            "Acute RESPIRATORY failure",
+            "URgent surgical intervention"
+        ],
+        "multiple matches": [
+            "Emergency treatment for acute condition",
+            "Urgent care in emergency department",
+            "Acute respiratory emergency"
+        ],
+        "partial words": [
+            "Non-emergency situation",
+            "Subacute condition",
+            "Emergency-related"
+        ]
+    }
+
+    # Create test DataFrame
+    test_df = pd.DataFrame({
+        'clean_text': [text for cases in test_cases.values() for text in cases],
+        'category': [cat for cat, texts in test_cases.items() for _ in texts]
+    })
+
+    # Test keywords
+    test_keywords = ['emergency', 'acute', 'urgent']
+    pattern = r"\b(?:" + "|".join(map(re.escape, test_keywords)) + r")\b"
+
+    # Apply matching logic
+    test_df['matched'] = (
+        test_df['clean_text']
+        .fillna("")
+        .str.findall(pattern, flags=re.IGNORECASE)
+        .apply(lambda lst: "|".join(lst) if lst else "")
+    )
+
+    return test_df
+
+def save_test_results(results_dict):
+    """
+    Save test results to JSON file
+    """
+    output_dir = Path("../analysis")
+    output_dir.mkdir(exist_ok=True)
+
+    output_file = output_dir / "keyword_matching_test_results.json"
+
+    # Convert DataFrame results to dictionary
+    for key, df in results_dict.items():
+        results_dict[key] = df.to_dict(orient='records')
+
+    with open(output_file, 'w') as f:
+        json.dump(results_dict, f, indent=2)
+
+    print(f"Results saved to: {output_file}")
+
+def run_tests():
+    """
+    Run all tests and output results
+    """
+    print("🧪 Running keyword matching tests...")
+
+    # Run tests
+    special_terms_results = test_special_terms_matching()
+    basic_matching_results = test_basic_matching()
+
+    # Print results
+    print("\n📊 Special Terms Matching Results:")
+    for category in special_terms_results['category'].unique():
+        print(f"\n{category}:")
+        subset = special_terms_results[special_terms_results['category'] == category]
+        for _, row in subset.iterrows():
+            print(f"Text: {row['clean_text']}")
+            print(f"Matched: {row['matched'] or 'No matches'}")
+            print("-" * 50)
+
+    print("\n📊 Basic Matching Results:")
+    for category in basic_matching_results['category'].unique():
+        print(f"\n{category}:")
+        subset = basic_matching_results[basic_matching_results['category'] == category]
+        for _, row in subset.iterrows():
+            print(f"Text: {row['clean_text']}")
+            print(f"Matched: {row['matched'] or 'No matches'}")
+            print("-" * 50)
+
+    # Save results
+    results_dict = {
+        'special_terms_matching': special_terms_results,
+        'basic_matching': basic_matching_results
+    }
+    save_test_results(results_dict)
+
+if __name__ == "__main__":
+    run_tests()
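The test script prints comparisons rather than asserting them; a hedged pytest-style sketch of how one anomalous case could be pinned down as a hard assertion (hypothetical test, assuming the script is importable from the working directory):

```python
# Hypothetical pytest-style check built on the script's test_basic_matching();
# it pins down the hyphen anomaly the saved JSON results revealed.
from test_keyword_matching import test_basic_matching

def test_hyphen_counts_as_word_boundary():
    df = test_basic_matching()
    row = df[df['clean_text'] == "Non-emergency situation"].iloc[0]
    # \b treats "-" as a boundary, so "Non-emergency" still matches "emergency".
    assert row['matched'] == "emergency"
```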