YanBoChen committed · Commit d37f4b2 · 1 Parent(s): 2ee61dc
WIP: while pre-processing the dataset, we found anomalies during dataset_treatment exploration, so we are adding test scripts to identify the problem
Browse files
- dataset/analysis/keyword_matching_test_results.json +151 -0
- dataset/keywords/treatment_keywords.txt +91 -104
- dataset/scripts/02_filter_treatment.py +77 -23
- dataset/scripts/check_subset_integrity.py +178 -0
- dataset/scripts/keyword_Match_Clean_for_subset_filter.txt +85 -0
- dataset/scripts/test_keyword_matching.py +175 -0
dataset/analysis/keyword_matching_test_results.json
ADDED
@@ -0,0 +1,151 @@
+{
+  "special_terms_matching": [
+    {
+      "clean_text": "Patient needs an x-ray of the chest",
+      "category": "x-ray variants",
+      "matched": "x-ray"
+    },
+    {
+      "clean_text": "Ordered chest xray",
+      "category": "x-ray variants",
+      "matched": "xray"
+    },
+    {
+      "clean_text": "X ray shows pneumonia",
+      "category": "x-ray variants",
+      "matched": "X ray"
+    },
+    {
+      "clean_text": "XRAY negative",
+      "category": "x-ray variants",
+      "matched": "XRAY"
+    },
+    {
+      "clean_text": "CT scan reveals nodule",
+      "category": "ct-scan variants",
+      "matched": "CT scan"
+    },
+    {
+      "clean_text": "CT-scan indicates mass",
+      "category": "ct-scan variants",
+      "matched": "CT-scan"
+    },
+    {
+      "clean_text": "Requires ctscan urgently",
+      "category": "ct-scan variants",
+      "matched": "ctscan"
+    },
+    {
+      "clean_text": "CTSCAN of abdomen",
+      "category": "ct-scan variants",
+      "matched": "CTSCAN"
+    },
+    {
+      "clean_text": "Point-of-care testing needed",
+      "category": "point-of-care variants",
+      "matched": "Point-of-care"
+    },
+    {
+      "clean_text": "Point of care ultrasound",
+      "category": "point-of-care variants",
+      "matched": "Point of care"
+    },
+    {
+      "clean_text": "POC testing results",
+      "category": "point-of-care variants",
+      "matched": ""
+    },
+    {
+      "clean_text": "Ordered both x-ray and CT scan",
+      "category": "mixed cases",
+      "matched": "x-ray|CT scan"
+    },
+    {
+      "clean_text": "XRAY and CTSCAN negative",
+      "category": "mixed cases",
+      "matched": "XRAY|CTSCAN"
+    },
+    {
+      "clean_text": "Multiple point-of-care tests with x-ray",
+      "category": "mixed cases",
+      "matched": "point-of-care|x-ray"
+    },
+    {
+      "clean_text": "No imaging mentioned",
+      "category": "negative cases",
+      "matched": ""
+    },
+    {
+      "clean_text": "Regular examination only",
+      "category": "negative cases",
+      "matched": ""
+    },
+    {
+      "clean_text": "Laboratory tests pending",
+      "category": "negative cases",
+      "matched": ""
+    }
+  ],
+  "basic_matching": [
+    {
+      "clean_text": "Emergency treatment required",
+      "category": "simple matches",
+      "matched": "Emergency"
+    },
+    {
+      "clean_text": "Acute condition observed",
+      "category": "simple matches",
+      "matched": "Acute"
+    },
+    {
+      "clean_text": "Urgent care needed",
+      "category": "simple matches",
+      "matched": "Urgent"
+    },
+    {
+      "clean_text": "EMERGENCY situation",
+      "category": "case variations",
+      "matched": "EMERGENCY"
+    },
+    {
+      "clean_text": "Acute RESPIRATORY failure",
+      "category": "case variations",
+      "matched": "Acute"
+    },
+    {
+      "clean_text": "URgent surgical intervention",
+      "category": "case variations",
+      "matched": "URgent"
+    },
+    {
+      "clean_text": "Emergency treatment for acute condition",
+      "category": "multiple matches",
+      "matched": "Emergency|acute"
+    },
+    {
+      "clean_text": "Urgent care in emergency department",
+      "category": "multiple matches",
+      "matched": "Urgent|emergency"
+    },
+    {
+      "clean_text": "Acute respiratory emergency",
+      "category": "multiple matches",
+      "matched": "Acute|emergency"
+    },
+    {
+      "clean_text": "Non-emergency situation",
+      "category": "partial words",
+      "matched": "emergency"
+    },
+    {
+      "clean_text": "Subacute condition",
+      "category": "partial words",
+      "matched": ""
+    },
+    {
+      "clean_text": "Emergency-related",
+      "category": "partial words",
+      "matched": "Emergency"
+    }
+  ]
+}
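Note the "partial words" rows above: with plain `\b` word boundaries a hyphen counts as a boundary, so "Non-emergency situation" and "Emergency-related" both match `emergency`, while "Subacute condition" correctly fails to match `acute`. This looks like the abnormality the commit message refers to. A minimal standalone sketch of the behavior:

```python
import re

# Hyphens are non-word characters, so \b fires on each side of "-".
pattern = re.compile(r"\b(?:emergency|acute)\b", re.IGNORECASE)

print(pattern.findall("Non-emergency situation"))  # ['emergency'] - hyphen counts as a boundary
print(pattern.findall("Emergency-related"))        # ['Emergency'] - same reason
print(pattern.findall("Subacute condition"))       # [] - no boundary inside "Subacute"
```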
dataset/keywords/treatment_keywords.txt
CHANGED
@@ -1,118 +1,105 @@
-
-
-
-
-normal saline
-crystalloids
-vasopressors
-Vasoconstrictor Agents
-Epinephrine
-Ondansetron
-Ibuprofen
-Morphine
-Lidocaine
+ACLS
+administer
+Adrenaline
+Advanced Cardiac Life Support
 Airway Management
-
-
-
-
-
-
-
-
-bipap
+alpha blocker
+Amiodarone
+analgesia
+Anesthesia Procedural
+Anti-Bacterial Agents
+antibiotic
+arterial line placement
+beta blocker
 Bi-level Positive Airway Pressure
-
-
+bipap
+Blood Transfusion
+Bosmin
 Cardiopulmonary Resuscitation
-ACLS
-Advanced Cardiac Life Support
-Defibrillation
 Cardioversion
-Blood Transfusion
-transfusion
-hemodynamic monitoring
-Hemodynamics
-central line placement
-Catheterization Central Venous
-arterial line placement
 Catheterization Arterial
-
-
-Wound Management
-Suturing
-Suture
-Tourniquet
+Catheterization Central Venous
+central line placement
 compression dressing
-
-
-
-
-Radiography
-point of care ultrasound
-POCUS
-Ultrasonography Point of Care
-ultrasound
-x-ray
-Radiography
+Computed Tomography
+cpap
+cpr
+crystalloids
 ct scan
-
-
-
-
-
-
-
-
-
-
-
-
-
-Supportive Care
-monitoring
-Patient Monitoring
-vital signs monitoring
-Vital Signs
+Defibrillation
+Dopamine
+Dosage Forms
+dose
+Drug Administration Routes
+Drug Therapy
+Epinephrine
+fluid
+fluid resuscitation
+hemodynamic monitoring
+Hemodynamics
+Hemostasis
+Ibuprofen
 icu transfer
-
-
-
+Insulin
+intervention
+intubation
+Intratracheal Intubation
+Intravenous Infusion
+iv fluids
+laboratory techniques
+laboratory testing
+levophed
+Lidocaine
 manage
-Patient Management
 management
-Patient Management
-intervention
-Therapeutic Intervention
-Therapy
 medication
-
-procedure
-Surgical Procedures Operative
-resuscitation
-Cardiopulmonary Resuscitation
-administer
-Drug Administration Routes
-dose
-Dosage Forms
+midazolam
 monitor
-
-
-
-Infusion Intravenous
-surgery
-Surgical Procedures
-antibiotic
-Anti-Bacterial Agents
-Dopamine
-Amiodarone
-levophed
-Norepinephrine
-Epinephrine
-Bosmin
-Adrenaline
-Insulin
+monitoring
+Morphine
+Nebulization
 nitroglycerin
 NTG
-
-
+Norepinephrine
+normal saline
+Ondansetron
+Oxygen
+Oxygen Inhalation Therapy
+oxygen therapy
+Patient Management
+Patient Monitoring
+POCUS
+point of care ultrasound
+procedural sedation
+procedure
+radiologic imaging
+Radiography
+resuscitation
+Sedation
+splinting
+Splints
+supportive care
+surgical procedures
+Surgical Procedures Operative
+surgery
+Suture
+Suturing
+Therapeutic Intervention
+Therapeutics
+Therapy
+tourniquet
+transfusion
+treat
+treatment
+Ultrasonography Point of Care
+ultrasound
+Vasoconstrictor Agents
+vasopressors
+ventilation support
+Ventilators
+Vital Signs
+vital signs monitoring
+wound care
+Wound Dressing
+Wound Management
+X-Ray
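The rewritten keyword list above is essentially the old vocabulary de-duplicated (duplicates such as the repeated `Radiography`, `Epinephrine`, and `Patient Management` entries are removed) and sorted case-insensitively, with new terms folded in; a few entries such as `surgery` sit slightly out of order. A sketch of how such a normalization could be produced — an assumed workflow, since the commit only shows the resulting file:

```python
# Hypothetical normalization pass: dedupe case-insensitively, then sort.
# (Assumed workflow - the commit only contains the resulting keyword file.)
with open("treatment_keywords.txt", encoding="utf-8") as f:
    raw = [line.strip() for line in f if line.strip()]

seen = {}
for kw in raw:
    seen.setdefault(kw.lower(), kw)  # keep the first spelling of each keyword

with open("treatment_keywords.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(seen.values(), key=str.lower)) + "\n")
```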
dataset/scripts/02_filter_treatment.py
CHANGED
@@ -4,31 +4,60 @@ import os
 import re
 import pandas as pd
 
-
-
-print(f"📥 Loading keywords from: {
-
-
-
-
-
-
+def preprocess_keywords(keywords_file):
+    """Load and preprocess treatment keywords"""
+    print(f"📥 Loading keywords from: {keywords_file}")
+
+    # Special medical terms with common variants
+    special_terms = {
+        'x-ray': ['x-ray', 'x ray', 'xray'],
+        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
+        'point-of-care': ['point-of-care', 'point of care']
+    }
+
+    # Read and preprocess keywords
+    with open(keywords_file, "r", encoding="utf-8") as f:
+        keywords = [line.strip().lower() for line in f if line.strip()]
+
+    # Process keywords and handle special terms
+    processed_keywords = []
+    for kw in keywords:
+        if kw in special_terms:
+            processed_keywords.extend(special_terms[kw])
+        else:
+            processed_keywords.append(kw)
+
+    print(f"   Loaded {len(keywords)} base keywords")
+    print(f"   Processed into {len(processed_keywords)} keyword variants")
+    return processed_keywords
+
+def create_regex_pattern(keywords):
+    """Create compiled regex pattern with word boundaries"""
+    pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+    return re.compile(pattern, re.IGNORECASE)
+
+# Step 1: Read source data
 print("1️⃣ Reading emergency subset...")
 emergency_path = "../dataset/emergency/emergency_subset.jsonl"
 df = pd.read_json(emergency_path, lines=True)
 print(f"   Loaded {len(df)} emergency records")
+print(f"   Contains emergency keywords in 'matched' column")
+
+# Step 2: Load treatment keywords and match
+print("2️⃣ Loading treatment keywords and matching...")
+treatment_keywords = preprocess_keywords("../keywords/treatment_keywords.txt")
+pattern = create_regex_pattern(treatment_keywords)
 
-# Step
-print("
-
-
+# Step 3: Process text and match keywords
+print("3️⃣ Processing text and matching keywords...")
+# Create lowercase version of text for matching
+df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
 
-# Match treatment keywords and add metadata
+# Match treatment keywords and add metadata columns
+# Note: Preserving original 'matched' column from emergency subset
 df["treatment_matched"] = (
-    df["
-
-    .str.findall(pattern, flags=re.IGNORECASE)
-    .apply(lambda lst: "|".join(lst) if lst else "")
+    df["clean_text_lower"]
+    .apply(lambda text: "|".join(pattern.findall(text)) or "")
 )
 df["has_treatment"] = df["treatment_matched"].str.len() > 0
 
@@ -36,14 +65,39 @@ df["has_treatment"] = df["treatment_matched"].str.len() > 0
 df["type"] = "treatment"  # Document type identifier
 df["condition"] = ""  # Reserved for future condition mapping
 
+# Verify columns
+print("   Verifying columns...")
+print(f"   - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
+print(f"   - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")
+
+# Calculate statistics
 cnt_treat = df["has_treatment"].sum()
-
+avg_matches = (
+    df[df["has_treatment"]]["treatment_matched"]
+    .str.count(r"\|")
+    .add(1)
+    .mean()
+)
 
-
-print("
+print(f"   Found {cnt_treat} treatment-related records")
+print(f"   Average treatment keywords per record: {avg_matches:.2f}")
+
+# Step 4: Save treatment subset
+print("4️⃣ Saving treatment subset...")
 out_dir = "../dataset/emergency_treatment"
 os.makedirs(out_dir, exist_ok=True)
-
+
+# Select records with treatment keywords
+subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning
+
+# Verify final subset columns
+print("   Final subset columns:")
+print(f"   - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
+print(f"   - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")
+
 subset.to_json(f"{out_dir}/emergency_treatment_subset.jsonl", orient="records", lines=True)
 subset.to_csv(f"{out_dir}/emergency_treatment_subset.csv", index=False)
-
+
+print(f"✅ Generated treatment subset with {len(subset)} records")
+print(f"   Saved in: {out_dir}")
+print(f"   Contains both emergency and treatment keywords")
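Behavioral note on the rewrite above: the old code ran `str.findall` with `re.IGNORECASE` over the original text, so matches kept their original casing (as seen in the saved test results); the new code matches a pre-lowered `clean_text_lower` column with a precompiled pattern, so matched strings now come back lowercased. A minimal sketch of the difference, under that assumption:

```python
import re

pattern = re.compile(r"\b(?:x-ray|xray)\b", re.IGNORECASE)
text = "XRAY and x-ray ordered"

# Old approach: match the original text, so output keeps its casing.
print(pattern.findall(text))          # ['XRAY', 'x-ray']

# New approach: match a lowercased copy, so output is lowercased too.
print(pattern.findall(text.lower()))  # ['xray', 'x-ray']
```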
dataset/scripts/check_subset_integrity.py
ADDED
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+# /scripts/check_subset_integrity.py
+
+import pandas as pd
+import json
+from pathlib import Path
+from tqdm import tqdm
+
+def check_subset_sample(file_path, sample_size=100):
+    """
+    Check the first N rows of the subset file
+    """
+    print(f"\n{'='*60}")
+    print(f"📊 Sampling Analysis (first {sample_size} rows)")
+    print(f"{'='*60}")
+
+    # Read sample
+    print(f"\n1️⃣ Reading sample from: {file_path}")
+    sample_df = pd.read_csv(file_path, nrows=sample_size)
+
+    # Basic information
+    print("\n2️⃣ Basic Information:")
+    print(f"   Columns present: {', '.join(sample_df.columns.tolist())}")
+
+    # Check matched columns
+    print("\n3️⃣ Matched Columns Status:")
+    matched_stats = {
+        'matched': {
+            'non_null': int(sample_df['matched'].notna().sum()),
+            'non_empty': int((sample_df['matched'].str.len() > 0).sum()),
+            'unique_values': sample_df['matched'].nunique()
+        },
+        'treatment_matched': {
+            'non_null': int(sample_df['treatment_matched'].notna().sum()),
+            'non_empty': int((sample_df['treatment_matched'].str.len() > 0).sum()),
+            'unique_values': sample_df['treatment_matched'].nunique()
+        }
+    }
+
+    for col, stats in matched_stats.items():
+        print(f"\n   {col}:")
+        print(f"   - Non-null count: {stats['non_null']}/{sample_size}")
+        print(f"   - Non-empty count: {stats['non_empty']}/{sample_size}")
+        print(f"   - Unique values: {stats['unique_values']}")
+
+    # Sample rows with both matches (parenthesized: & binds tighter than >)
+    print("\n4️⃣ Sample Rows with Both Matches:")
+    both_matched = sample_df[
+        (sample_df['matched'].notna() & (sample_df['matched'].str.len() > 0)) &
+        (sample_df['treatment_matched'].notna() & (sample_df['treatment_matched'].str.len() > 0))
+    ].head(3)
+
+    for idx, row in both_matched.iterrows():
+        print(f"\n   Row {idx}:")
+        print(f"   - Emergency keywords: {row['matched']}")
+        print(f"   - Treatment keywords: {row['treatment_matched']}")
+
+    return matched_stats
+
+def analyze_large_file(file_path, chunk_size=1000):
+    """
+    Analyze the entire file in chunks
+    """
+    print(f"\n{'='*60}")
+    print(f"📈 Full File Analysis (chunk size: {chunk_size})")
+    print(f"{'='*60}")
+
+    stats = {
+        'total_rows': 0,
+        'matched_stats': {
+            'non_null': 0,
+            'non_empty': 0
+        },
+        'treatment_matched_stats': {
+            'non_null': 0,
+            'non_empty': 0
+        },
+        'both_matched': 0
+    }
+
+    print("\n1️⃣ Processing file in chunks...")
+    chunks = pd.read_csv(file_path, chunksize=chunk_size)
+
+    for chunk in tqdm(chunks, desc="Analyzing chunks"):
+        # Update total rows
+        stats['total_rows'] += len(chunk)
+
+        # Update matched stats
+        stats['matched_stats']['non_null'] += chunk['matched'].notna().sum()
+        stats['matched_stats']['non_empty'] += (chunk['matched'].str.len() > 0).sum()
+
+        # Update treatment_matched stats
+        stats['treatment_matched_stats']['non_null'] += chunk['treatment_matched'].notna().sum()
+        stats['treatment_matched_stats']['non_empty'] += (chunk['treatment_matched'].str.len() > 0).sum()
+
+        # Update both matched count (parenthesized: & binds tighter than >)
+        stats['both_matched'] += (
+            (chunk['matched'].notna() & (chunk['matched'].str.len() > 0)) &
+            (chunk['treatment_matched'].notna() & (chunk['treatment_matched'].str.len() > 0))
+        ).sum()
+
+    return stats
+
+def generate_report(sample_stats, full_stats, output_dir):
+    """
+    Generate and save analysis report
+    """
+    print(f"\n{'='*60}")
+    print(f"📝 Generating Report")
+    print(f"{'='*60}")
+
+    report = {
+        'sample_analysis': sample_stats,
+        'full_file_analysis': {
+            'total_records': int(full_stats['total_rows']),
+            'matched_column': {
+                'non_null_count': int(full_stats['matched_stats']['non_null']),
+                'non_empty_count': int(full_stats['matched_stats']['non_empty']),
+                'null_percentage': float(
+                    (full_stats['total_rows'] - full_stats['matched_stats']['non_null'])
+                    / full_stats['total_rows'] * 100
+                )
+            },
+            'treatment_matched_column': {
+                'non_null_count': int(full_stats['treatment_matched_stats']['non_null']),
+                'non_empty_count': int(full_stats['treatment_matched_stats']['non_empty']),
+                'null_percentage': float(
+                    (full_stats['total_rows'] - full_stats['treatment_matched_stats']['non_null'])
+                    / full_stats['total_rows'] * 100
+                )
+            },
+            'both_matched_count': int(full_stats['both_matched']),
+            'both_matched_percentage': float(
+                full_stats['both_matched'] / full_stats['total_rows'] * 100
+            )
+        }
+    }
+
+    # Create output directory
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Save report
+    report_file = output_dir / 'integrity_check_report.json'
+    with open(report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2, ensure_ascii=False)
+
+    print(f"\nReport saved to: {report_file}")
+
+    # Print summary
+    print("\n📊 Summary:")
+    print(f"Total records: {report['full_file_analysis']['total_records']}")
+    print(f"Records with both matches: {report['full_file_analysis']['both_matched_count']} "
+          f"({report['full_file_analysis']['both_matched_percentage']:.2f}%)")
+
+    return report
+
+def main():
+    """
+    Main execution function
+    """
+    # Configuration
+    input_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
+    output_dir = "../analysis/integrity_check"
+
+    print(f"\n🔍 Starting Subset Integrity Check")
+    print(f"Input file: {input_file}")
+    print(f"Output directory: {output_dir}")
+
+    # Run analysis
+    sample_stats = check_subset_sample(input_file)
+    full_stats = analyze_large_file(input_file)
+    report = generate_report(sample_stats, full_stats, output_dir)
+
+    print("\n✅ Integrity check complete!")
+
+if __name__ == "__main__":
+    main()
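Note the parenthesization in the `both_matched` filters above: in pandas expressions `&` binds tighter than `>`, so the unparenthesized form `col.notna() & col.str.len() > 0` would parse as `(col.notna() & col.str.len()) > 0`. A minimal sketch of the pitfall (behavior checked against typical pandas versions; exact dtype handling can vary):

```python
import pandas as pd

s = pd.Series(["ab", "abc", ""])

# Without parentheses this parses as (s.notna() & s.str.len()) > 0:
# a bitwise AND of the boolean mask (1) with the integer length,
# which silently drops any even-length string ("ab" -> 1 & 2 == 0).
buggy = s.notna() & s.str.len() > 0
fixed = s.notna() & (s.str.len() > 0)

print(buggy.tolist())  # [False, True, False] - "ab" wrongly excluded
print(fixed.tolist())  # [True, True, False]
```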
dataset/scripts/keyword_Match_Clean_for_subset_filter.txt
ADDED
@@ -0,0 +1,85 @@
+# Keyword Matching and Text Cleaning Logic for Subset Filtering
+
+## 1. Keyword Preprocessing
+```python
+def preprocess_keywords(keywords_file):
+    # Handle special medical term variants
+    special_terms = {
+        'x-ray': ['x-ray', 'x ray', 'xray'],
+        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
+        'point-of-care': ['point-of-care', 'point of care']
+    }
+
+    # Read and preprocess keywords
+    with open(keywords_file, "r", encoding="utf-8") as f:
+        keywords = [
+            line.strip()  # Remove whitespace
+            .lower()      # Convert to lowercase
+            for line in f
+            if line.strip()
+        ]
+
+    # Process special term variants
+    processed_keywords = []
+    for kw in keywords:
+        if kw in special_terms:
+            processed_keywords.extend(special_terms[kw])
+        else:
+            processed_keywords.append(kw)
+
+    return processed_keywords
+```
+
+## 2. Regex Pattern Processing
+```python
+def create_regex_pattern(keywords):
+    # Simple word boundary matching
+    pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+    return re.compile(pattern, re.IGNORECASE)
+```
+
+### Regex Pattern Explanation:
+- `\b`: Word boundary matching
+- `(?:...)`: Non-capturing group
+- `re.escape()`: Escape special characters
+- `re.IGNORECASE`: Case-insensitive matching
+
+## 3. Text Preprocessing and Matching
+```python
+# Create lowercase version of text
+df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
+
+# Match keywords
+df["treatment_matched"] = (
+    df["clean_text_lower"]
+    .apply(lambda text: "|".join(pattern.findall(text)) or "")
+)
+```
+
+## 4. Processing Logic Details
+
+### 4.1 Special Term Handling Rationale
+- Common variants in medical literature
+- Maintain semantic consistency
+- Improve matching accuracy
+
+### 4.2 Regex Matching Strategy
+- Word boundary matching for complete terms
+- Precompiled patterns for performance
+- Case-insensitive matching for flexibility
+
+### 4.3 Text Preprocessing Steps
+1. Fill null values (fillna)
+2. Convert to lowercase (str.lower)
+3. Create dedicated lowercase column to avoid repeated conversions
+
+## 5. Output Format
+- matched column: Pipe-separated matched keywords
+- type column: Document type identifier ("emergency" or "treatment")
+- condition column: Reserved for future condition mapping
+
+## 6. Important Considerations
+1. Regular maintenance required for special term variants
+2. Precompiled regex patterns for performance optimization
+3. Dedicated text preprocessing storage to avoid redundant computations
+4. Maintain consistent column structure between emergency and treatment subsets
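Putting the documented pieces together, a toy end-to-end run (standalone sketch; the short inline keyword list stands in for `treatment_keywords.txt`):

```python
import re
import pandas as pd

# Stand-in for the preprocessed keyword variants.
keywords = ['x-ray', 'x ray', 'xray', 'ct scan']
pattern = re.compile(r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b", re.IGNORECASE)

df = pd.DataFrame({'clean_text': ["Ordered chest XRAY and CT scan", None]})
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
df['treatment_matched'] = df['clean_text_lower'].apply(
    lambda text: "|".join(pattern.findall(text)) or ""
)
print(df['treatment_matched'].tolist())  # ['xray|ct scan', '']
```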
dataset/scripts/test_keyword_matching.py
ADDED
@@ -0,0 +1,175 @@
+import pandas as pd
+import re
+from pathlib import Path
+import json
+
+def test_special_terms_matching():
+    """
+    Test special medical term matching logic
+    """
+    # Test cases for different scenarios
+    test_cases = {
+        "x-ray variants": [
+            "Patient needs an x-ray of the chest",
+            "Ordered chest xray",
+            "X ray shows pneumonia",
+            "XRAY negative"
+        ],
+        "ct-scan variants": [
+            "CT scan reveals nodule",
+            "CT-scan indicates mass",
+            "Requires ctscan urgently",
+            "CTSCAN of abdomen"
+        ],
+        "point-of-care variants": [
+            "Point-of-care testing needed",
+            "Point of care ultrasound",
+            "POC testing results"
+        ],
+        "mixed cases": [
+            "Ordered both x-ray and CT scan",
+            "XRAY and CTSCAN negative",
+            "Multiple point-of-care tests with x-ray"
+        ],
+        "negative cases": [
+            "No imaging mentioned",
+            "Regular examination only",
+            "Laboratory tests pending"
+        ]
+    }
+
+    # Special terms dictionary (from keyword_Match_Clean_for_subset_filter.txt)
+    special_terms = {
+        'x-ray': ['x-ray', 'x ray', 'xray'],
+        'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
+        'point-of-care': ['point-of-care', 'point of care']
+    }
+
+    # Create test DataFrame
+    test_df = pd.DataFrame({
+        'clean_text': [text for cases in test_cases.values() for text in cases],
+        'category': [cat for cat, texts in test_cases.items() for _ in texts]
+    })
+
+    # Process keywords
+    processed_keywords = []
+    for term, variants in special_terms.items():
+        processed_keywords.extend(variants)
+
+    # Create regex pattern
+    pattern = r"\b(?:" + "|".join(map(re.escape, processed_keywords)) + r")\b"
+
+    # Apply matching logic
+    test_df['matched'] = (
+        test_df['clean_text']
+        .fillna("")
+        .str.findall(pattern, flags=re.IGNORECASE)
+        .apply(lambda lst: "|".join(lst) if lst else "")
+    )
+
+    return test_df
+
+def test_basic_matching():
+    """
+    Test basic keyword matching functionality
+    """
+    # Basic test cases
+    test_cases = {
+        "simple matches": [
+            "Emergency treatment required",
+            "Acute condition observed",
+            "Urgent care needed"
+        ],
+        "case variations": [
+            "EMERGENCY situation",
+            "Acute RESPIRATORY failure",
+            "URgent surgical intervention"
+        ],
+        "multiple matches": [
+            "Emergency treatment for acute condition",
+            "Urgent care in emergency department",
+            "Acute respiratory emergency"
+        ],
+        "partial words": [
+            "Non-emergency situation",
+            "Subacute condition",
+            "Emergency-related"
+        ]
+    }
+
+    # Create test DataFrame
+    test_df = pd.DataFrame({
+        'clean_text': [text for cases in test_cases.values() for text in cases],
+        'category': [cat for cat, texts in test_cases.items() for _ in texts]
+    })
+
+    # Test keywords
+    test_keywords = ['emergency', 'acute', 'urgent']
+    pattern = r"\b(?:" + "|".join(map(re.escape, test_keywords)) + r")\b"
+
+    # Apply matching logic
+    test_df['matched'] = (
+        test_df['clean_text']
+        .fillna("")
+        .str.findall(pattern, flags=re.IGNORECASE)
+        .apply(lambda lst: "|".join(lst) if lst else "")
+    )
+
+    return test_df
+
+def save_test_results(results_dict):
+    """
+    Save test results to JSON file
+    """
+    output_dir = Path("../analysis")
+    output_dir.mkdir(exist_ok=True)
+
+    output_file = output_dir / "keyword_matching_test_results.json"
+
+    # Convert DataFrame results to dictionary
+    for key, df in results_dict.items():
+        results_dict[key] = df.to_dict(orient='records')
+
+    with open(output_file, 'w') as f:
+        json.dump(results_dict, f, indent=2)
+
+    print(f"Results saved to: {output_file}")
+
+def run_tests():
+    """
+    Run all tests and output results
+    """
+    print("🧪 Running keyword matching tests...")
+
+    # Run tests
+    special_terms_results = test_special_terms_matching()
+    basic_matching_results = test_basic_matching()
+
+    # Print results
+    print("\n📊 Special Terms Matching Results:")
+    for category in special_terms_results['category'].unique():
+        print(f"\n{category}:")
+        subset = special_terms_results[special_terms_results['category'] == category]
+        for _, row in subset.iterrows():
+            print(f"Text: {row['clean_text']}")
+            print(f"Matched: {row['matched'] or 'No matches'}")
+            print("-" * 50)
+
+    print("\n📊 Basic Matching Results:")
+    for category in basic_matching_results['category'].unique():
+        print(f"\n{category}:")
+        subset = basic_matching_results[basic_matching_results['category'] == category]
+        for _, row in subset.iterrows():
+            print(f"Text: {row['clean_text']}")
+            print(f"Matched: {row['matched'] or 'No matches'}")
+            print("-" * 50)
+
+    # Save results
+    results_dict = {
+        'special_terms_matching': special_terms_results,
+        'basic_matching': basic_matching_results
+    }
+    save_test_results(results_dict)
+
+if __name__ == "__main__":
+    run_tests()
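The test script prints comparisons rather than asserting them; a hedged pytest-style sketch of how one anomalous case could be pinned down as a hard assertion (hypothetical test, assuming the script is importable from the working directory):

```python
# Hypothetical pytest-style check built on the script's test_basic_matching();
# it pins down the hyphen anomaly the saved JSON results revealed.
from test_keyword_matching import test_basic_matching

def test_hyphen_counts_as_word_boundary():
    df = test_basic_matching()
    row = df[df['clean_text'] == "Non-emergency situation"].iloc[0]
    # \b treats "-" as a boundary, so "Non-emergency" still matches "emergency".
    assert row['matched'] == "emergency"
```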