		refactor: migrate special terms to JSON configuration

BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files

1. Create New Configuration Files:
- Add special_terms_emergency.json
  - Organize emergency terms by categories (cardiac, respiratory, etc.)
  - Include all existing mappings with standardized structure
- Add special_terms_treatment.json
  - Organize treatment terms by categories (imaging, medications, etc.)
  - Maintain all existing term variants
2. Update Processing Scripts:
- Modify 01_filter_emergency_opt.py:
  - Load terms from JSON configuration
  - Add term standardization
  - Implement deduplication
  - Preserve category information
- Modify 02_filter_treatment_opt.py:
  - Similar updates for treatment terms
  - Maintain consistent processing logic
3. New Features:
- Term standardization: Convert variants to standard form
- Deduplication: Remove repeated terms while preserving order
- Category-aware: Support for term categorization
- Improved maintainability: Configuration separated from code
4. Technical Details:
- Use pathlib for file path handling
- JSON structure supports hierarchical organization
- Maintain backward compatibility
- Add type hints for better code clarity
Testing:
- Verify JSON format
- Confirm all mappings migrated correctly
- Check term standardization
- Validate deduplication logic

Files changed:
- dataset/keywords/special_terms_emergency.json +26 -0
- dataset/keywords/special_terms_treatment.json +25 -0
- dataset/scripts/01_filter_emergency_opt.py +37 -27
- dataset/scripts/02_filter_treatment_opt.py +131 -0
- dataset/scripts/commit_message_20250726_special_terms.txt +39 -0
- dataset/scripts/compare_subsets_opt.py +124 -0
- dataset/scripts/data_explorer_opt.py +118 -0
- dataset/scripts/data_explorer_treatment_opt.py +263 -0
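
Before the per-file diffs, a minimal sketch of the standardize-then-deduplicate flow that this commit introduces (see the process_matches method added in 01_filter_emergency_opt.py below). The inline term map is a two-entry excerpt of special_terms_emergency.json and the sample matches are invented for illustration:

    # Minimal sketch of the new standardize-then-deduplicate behavior.
    # The term map is a small excerpt of special_terms_emergency.json;
    # the input matches are made up.
    special_terms = {
        "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
        "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"],
    }

    def standardize_term(term: str) -> str:
        term_lower = term.lower()
        for standard, variants in special_terms.items():
            if term_lower in [v.lower() for v in variants]:
                return standard
        return term

    def process_matches(matches: list) -> str:
        if not matches:
            return ""
        standardized = [standardize_term(m) for m in matches]
        seen, unique = set(), []
        for term in standardized:
            if term.lower() not in seen:
                unique.append(term)
                seen.add(term.lower())
        return "|".join(unique)

    print(process_matches(["MI", "myocardial infarction", "stroke"]))  # -> "mi|cva"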

dataset/keywords/special_terms_emergency.json
@@ -0,0 +1,26 @@
{
    "cardiac": {
        "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
        "acs": ["acs", "ACS", "acute coronary syndrome"]
    },
    "respiratory": {
        "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
        "respiratory_failure": ["respiratory failure", "resp failure", "RF"]
    },
    "neurological": {
        "loc": ["loc", "LOC", "loss of consciousness"],
        "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"]
    },
    "shock": {
        "shock": ["shock", "circulatory failure"],
        "septic_shock": ["septic shock", "sepsis induced shock"]
    },
    "bleeding": {
        "gi_bleed": ["gi bleed", "gi bleeding", "gastrointestinal hemorrhage", "GI hemorrhage"],
        "hemorrhage": ["hemorrhage", "bleeding", "blood loss"]
    },
    "vital_signs": {
        "hypotension": ["hypotension", "low bp", "low blood pressure"],
        "tachycardia": ["tachycardia", "elevated heart rate", "fast heart rate"]
    }
}

dataset/keywords/special_terms_treatment.json
@@ -0,0 +1,25 @@
{
    "imaging": {
        "x-ray": ["x-ray", "x ray", "xray", "XR"],
        "ct": ["ct", "ct-scan", "cat scan", "computed tomography"],
        "us": ["us", "u/s", "ultrasound", "sonography"]
    },
    "medications": {
        "iv": ["iv", "i.v.", "intravenous"],
        "im": ["im", "i.m.", "intramuscular"],
        "po": ["po", "p.o.", "per os", "by mouth"]
    },
    "procedures": {
        "cpr": ["cpr", "CPR", "cardiopulmonary resuscitation"],
        "intubation": ["intubation", "ETT", "endotracheal tube"],
        "cardioversion": ["cardioversion", "electrical cardioversion"]
    },
    "monitoring": {
        "ecg": ["ecg", "ekg", "electrocardiogram"],
        "monitoring": ["monitoring", "continuous observation"]
    },
    "ventilation": {
        "bipap": ["bipap", "BiPAP", "bi-level positive airway pressure"],
        "cpap": ["cpap", "CPAP", "continuous positive airway pressure"]
    }
}
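
The Testing notes above call for verifying the JSON format and confirming that all mappings migrated. A small standalone check along these lines could look like the sketch below; it is not part of this commit, and the file paths and the helper itself are assumptions:

    # Hypothetical check script (not in this commit): verifies that each
    # special-terms file parses and follows the category -> term -> variants shape.
    import json
    from pathlib import Path

    def check_terms_file(path: Path) -> None:
        data = json.loads(path.read_text(encoding="utf-8"))
        assert isinstance(data, dict), f"{path}: top level must be a JSON object"
        for category, terms in data.items():
            assert isinstance(terms, dict), f"{path}: '{category}' must map terms to variant lists"
            for term, variants in terms.items():
                assert isinstance(variants, list) and variants, f"{path}: '{term}' needs a non-empty variant list"
        print(f"OK: {path.name} ({sum(len(t) for t in data.values())} standard terms)")

    # Assumed location of the new configuration files.
    for name in ("special_terms_emergency.json", "special_terms_treatment.json"):
        check_terms_file(Path("dataset/keywords") / name)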

dataset/scripts/01_filter_emergency_opt.py
@@ -1,36 +1,20 @@
 import os
 import re
+import json
 import pandas as pd
+from pathlib import Path
 
-# Medical term processor class for handling special terms
 class MedicalTermProcessor:
     def __init__(self):
-        self.emergency_special_terms = {
-            # Cardiac
-            'mi': ['mi', 'm.i.', 'myocardial infarction', 'MI'],
-            'acs': ['acs', 'ACS', 'acute coronary syndrome'],
+        # Load emergency special terms from JSON
+        keywords_dir = Path("../keywords")
+        with open(keywords_dir / "special_terms_emergency.json", "r") as f:
+            self.emergency_terms_by_category = json.load(f)
 
-            # Respiratory
-            'ards': ['ards', 'ARDS', 'acute respiratory distress syndrome'],
-            'respiratory_failure': ['respiratory failure', 'resp failure', 'RF'],
-
-            # Neurological
-            'loc': ['loc', 'LOC', 'loss of consciousness'],
-            'cva': ['cva', 'CVA', 'stroke', 'cerebrovascular accident'],
-
-            # Shock States
-            'shock': ['shock', 'circulatory failure'],
-            'septic_shock': ['septic shock', 'sepsis induced shock'],
-
-            # Bleeding
-            'gi_bleed': ['gi bleed', 'gi bleeding', 'gastrointestinal hemorrhage', 'GI hemorrhage'],
-            'hemorrhage': ['hemorrhage', 'bleeding', 'blood loss'],
-
-            # Vital Signs
-            'hypotension': ['hypotension', 'low bp', 'low blood pressure'],
-            'tachycardia': ['tachycardia', 'elevated heart rate', 'fast heart rate']
-        }
+        # Flatten the nested structure for easy lookup
+        self.emergency_special_terms = {}
+        for category in self.emergency_terms_by_category.values():
+            self.emergency_special_terms.update(category)
 
     def get_all_variants(self):
         """Get all term variants including special terms"""
@@ -39,6 +23,32 @@ class MedicalTermProcessor:
             variants.extend(term_list)
         return variants
 
+    def standardize_term(self, term: str) -> str:
+        """Convert a term to its standard form if it's a variant"""
+        term_lower = term.lower()
+        for standard_term, variants in self.emergency_special_terms.items():
+            if term_lower in [v.lower() for v in variants]:
+                return standard_term
+        return term
+
+    def process_matches(self, matches: list) -> str:
+        """Process matches to standardize terms and remove duplicates"""
+        if not matches:
+            return ""
+
+        # Standardize terms
+        standardized = [self.standardize_term(match) for match in matches]
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_matches = []
+        for term in standardized:
+            if term.lower() not in seen:
+                unique_matches.append(term)
+                seen.add(term.lower())
+
+        return "|".join(unique_matches)
+
 # Function: Load keywords and print progress
 def load_keywords(path, processor):
     print(f"📥 Loading keywords from: {path}")
@@ -70,7 +80,7 @@ df["matched"] = (
     df["clean_text"]
       .fillna("")  # Convert NaN to empty string
      .str.findall(pattern, flags=re.IGNORECASE)
-      .apply(lambda
+      .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
 )
 df["has_emergency"] = df["matched"].str.len() > 0
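
For downstream consumers, the practical effect of the change above is that the matched column now holds deduplicated standard terms joined with "|". A toy illustration of that shape; the rows are made up, and the matched_terms helper column is hypothetical, not something the pipeline adds:

    import pandas as pd

    # Made-up rows; in the real pipeline "matched" is filled by str.findall + process_matches.
    df = pd.DataFrame({"matched": ["mi|hypotension", "", "cva"]})
    df["has_emergency"] = df["matched"].str.len() > 0
    # Hypothetical helper column: split the pipe-joined string back into a list.
    df["matched_terms"] = df["matched"].apply(lambda s: s.split("|") if s else [])
    print(df)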

dataset/scripts/02_filter_treatment_opt.py
@@ -0,0 +1,131 @@
import os
import re
import json
import pandas as pd
from pathlib import Path

class MedicalTermProcessor:
    def __init__(self):
        # Load treatment special terms from JSON
        keywords_dir = Path("../keywords")
        with open(keywords_dir / "special_terms_treatment.json", "r") as f:
            self.treatment_terms_by_category = json.load(f)

        # Flatten the nested structure for easy lookup
        self.treatment_special_terms = {}
        for category in self.treatment_terms_by_category.values():
            self.treatment_special_terms.update(category)

    def get_all_variants(self):
        """Get all term variants including special terms"""
        variants = []
        for term_list in self.treatment_special_terms.values():
            variants.extend(term_list)
        return variants

    def standardize_term(self, term: str) -> str:
        """Convert a term to its standard form if it's a variant"""
        term_lower = term.lower()
        for standard_term, variants in self.treatment_special_terms.items():
            if term_lower in [v.lower() for v in variants]:
                return standard_term
        return term

    def process_matches(self, matches: list) -> str:
        """Process matches to standardize terms and remove duplicates"""
        if not matches:
            return ""

        # Standardize terms
        standardized = [self.standardize_term(match) for match in matches]

        # Remove duplicates while preserving order
        seen = set()
        unique_matches = []
        for term in standardized:
            if term.lower() not in seen:
                unique_matches.append(term)
                seen.add(term.lower())

        return "|".join(unique_matches)

def load_keywords(path, processor):
    """Load and preprocess treatment keywords"""
    print(f"📥 Loading keywords from: {path}")

    # Load basic keywords
    with open(path, "r", encoding="utf-8") as f:
        basic_kws = [line.strip() for line in f if line.strip()]

    # Add special term variants
    special_kws = processor.get_all_variants()
    all_kws = list(set(basic_kws + special_kws))  # Remove duplicates

    print(f"   Loaded {len(all_kws)} keywords (including variants)")
    return all_kws

# Step 1: Read optimized emergency subset
print("1️⃣ Reading optimized emergency subset...")
emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
df = pd.read_json(emergency_path, lines=True)
print(f"   Loaded {len(df)} emergency records")
print(f"   Contains emergency keywords in 'matched' column")

# Step 2: Load treatment keywords and match
print("2️⃣ Loading treatment keywords and matching...")
processor = MedicalTermProcessor()
keywords = load_keywords("../keywords/treatment_keywords.txt", processor)
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"

# Step 3: Process text and match keywords
print("3️⃣ Processing text and matching keywords...")
# Match treatment keywords and add metadata columns
df["treatment_matched"] = (
    df["clean_text"]
      .fillna("")  # Convert NaN to empty string
      .str.findall(pattern, flags=re.IGNORECASE)
      .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
)
df["has_treatment"] = df["treatment_matched"].str.len() > 0

# Add metadata columns for future use
df["type"] = "treatment"  # Document type identifier
df["condition"] = ""      # Reserved for future condition mapping

# Verify columns
print("   Verifying columns...")
print(f"   - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
print(f"   - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")

# Calculate statistics
cnt_treat = df["has_treatment"].sum()
avg_matches = (
    df[df["has_treatment"]]["treatment_matched"]
      .str.count(r"\|")
      .add(1)
      .mean()
)

print(f"   Found {cnt_treat} treatment-related records")
print(f"   Average treatment keywords per record: {avg_matches:.2f}")

# Step 4: Save treatment subset
print("4️⃣ Saving treatment subset...")
out_dir = "../dataset/emergency_treatment"
os.makedirs(out_dir, exist_ok=True)

# Select records with treatment keywords
subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning

# Verify final subset columns
print("   Final subset columns:")
print(f"   - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
print(f"   - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")

# Save with _opt suffix
subset.to_json(f"{out_dir}/emergency_treatment_subset_opt.jsonl", orient="records", lines=True)
subset.to_csv(f"{out_dir}/emergency_treatment_subset_opt.csv", index=False)

print(f"✅ Generated optimized treatment subset with {len(subset)} records")
print(f"   Saved in: {out_dir}")
print(f"   Contains both emergency and treatment keywords")
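
One detail worth noting in the statistics block above: counting "|" separators and adding 1 equals the number of matched terms only because rows are first filtered to has_treatment, so no empty strings enter the average. A small worked check with made-up values:

    import pandas as pd

    # Made-up values, already filtered to non-empty (mirrors the has_treatment filter).
    s = pd.Series(["iv|ct", "cpr", "iv|po|ecg"])
    avg = s.str.count(r"\|").add(1).mean()
    print(avg)  # (2 + 1 + 3) / 3 = 2.0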

dataset/scripts/commit_message_20250726_special_terms.txt
@@ -0,0 +1,39 @@
refactor: migrate special terms to JSON configuration

BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files

1. Create New Configuration Files:
- Add special_terms_emergency.json
  - Organize emergency terms by categories (cardiac, respiratory, etc.)
  - Include all existing mappings with standardized structure
- Add special_terms_treatment.json
  - Organize treatment terms by categories (imaging, medications, etc.)
  - Maintain all existing term variants

2. Update Processing Scripts:
- Modify 01_filter_emergency_opt.py:
  - Load terms from JSON configuration
  - Add term standardization
  - Implement deduplication
  - Preserve category information
- Modify 02_filter_treatment_opt.py:
  - Similar updates for treatment terms
  - Maintain consistent processing logic

3. New Features:
- Term standardization: Convert variants to standard form
- Deduplication: Remove repeated terms while preserving order
- Category-aware: Support for term categorization
- Improved maintainability: Configuration separated from code

4. Technical Details:
- Use pathlib for file path handling
- JSON structure supports hierarchical organization
- Maintain backward compatibility
- Add type hints for better code clarity

Testing:
- Verify JSON format
- Confirm all mappings migrated correctly
- Check term standardization
- Validate deduplication logic

dataset/scripts/compare_subsets_opt.py
@@ -0,0 +1,124 @@
# /scripts/compare_subsets_opt.py
import pandas as pd
from pathlib import Path
from datetime import datetime

def load_and_compare_subsets(format_type='csv'):
    """
    Load and compare the first 10 records from both optimized subsets

    Args:
        format_type (str): 'csv' or 'jsonl'
    """
    # Prepare output file
    output_dir = Path("../analysis")
    output_dir.mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = output_dir / f"subset_comparison_first10_records_{timestamp}.md"

    # Initialize markdown content
    md_content = []
    md_content.append("# Optimized Subsets Comparison Report\n")
    md_content.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    md_content.append(f"File format: {format_type.upper()}\n")

    # Set file paths based on format
    if format_type == 'csv':
        emergency_path = "../dataset/emergency/emergency_subset_opt.csv"
        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
        # Load CSV files
        emergency_df = pd.read_csv(emergency_path)
        treatment_df = pd.read_csv(treatment_path)
    else:  # jsonl
        emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.jsonl"
        # Load JSONL files
        emergency_df = pd.read_json(emergency_path, lines=True)
        treatment_df = pd.read_json(treatment_path, lines=True)

    # Print and save basic statistics
    print("\n📊 Basic Statistics:")
    print("-" * 40)
    md_content.append("\n## Basic Statistics\n")

    stats = [
        f"- Emergency subset total records: {len(emergency_df)}",
        f"- Emergency+Treatment subset total records: {len(treatment_df)}",
        f"- Avg Emergency Text Length: {emergency_df['clean_text'].str.len().mean():.2f}",
        f"- Avg Treatment Text Length: {treatment_df['clean_text'].str.len().mean():.2f}"
    ]

    # Calculate average keywords using pattern
    pattern = r'\|'
    emergency_avg = emergency_df['matched'].str.count(pattern).add(1).mean()
    treatment_avg = treatment_df['matched'].str.count(pattern).add(1).mean()

    stats.extend([
        f"- Avg Emergency Keywords: {emergency_avg:.2f}",
        f"- Avg Treatment Keywords: {treatment_avg:.2f}"
    ])

    # Print to console and add to markdown
    for stat in stats:
        print(stat.replace("- ", ""))
    md_content.extend(stats)

    # Compare first 10 records from Emergency subset
    print("\n🔍 First 10 records from Emergency Subset:")
    print("-" * 80)
    md_content.append("\n## Emergency Subset (First 10 Records)\n")

    for idx, row in emergency_df.head(10).iterrows():
        print(f"\nRecord #{idx+1}")
        print(f"Text preview: {row['clean_text'][:100]}...")
        print(f"Matched keywords: {row['matched']}")
        print(f"Text length: {len(row['clean_text'])}")
        print("-" * 40)

        md_content.extend([
            f"\n### Record {idx+1}",
            "```",
            f"Text preview: {row['clean_text'][:100]}...",
            f"Matched keywords: {row['matched']}",
            f"Text length: {len(row['clean_text'])}",
            "```\n"
        ])

    # Compare first 10 records from Emergency+Treatment subset
    print("\n🔍 First 10 records from Emergency+Treatment Subset:")
    print("-" * 80)
    md_content.append("\n## Emergency+Treatment Subset (First 10 Records)\n")

    for idx, row in treatment_df.head(10).iterrows():
        print(f"\nRecord #{idx+1}")
        print(f"Text preview: {row['clean_text'][:100]}...")
        print(f"Emergency keywords: {row['matched']}")
        print(f"Treatment keywords: {row['treatment_matched']}")
        print(f"Text length: {len(row['clean_text'])}")
        print("-" * 40)

        md_content.extend([
            f"\n### Record {idx+1}",
            "```",
            f"Text preview: {row['clean_text'][:100]}...",
            f"Emergency keywords: {row['matched']}",
            f"Treatment keywords: {row['treatment_matched']}",
            f"Text length: {len(row['clean_text'])}",
            "```\n"
        ])

    # Save markdown content
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(md_content))

    print(f"\n✅ Comparison complete!")
    print(f"Report saved to: {output_file}")

if __name__ == "__main__":
    # Compare using CSV format
    print("\nComparing CSV files...")
    load_and_compare_subsets('csv')

    # Compare using JSONL format
    print("\nComparing JSONL files...")
    load_and_compare_subsets('jsonl')

dataset/scripts/data_explorer_opt.py
@@ -0,0 +1,118 @@
# /scripts/data_explorer_opt.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import json

def analyze_subset(file_path, keywords_path, output_dir="analysis", subset_name="emergency"):
    """Analyze subset data quality and distribution"""
    print(f"\n{'='*50}")
    print(f"Starting optimized dataset analysis: {file_path}")
    print(f"Using keywords file: {keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*50}\n")

    # Load data
    print("1️⃣ Loading data...")
    df = pd.read_csv(file_path)
    output_dir = Path(output_dir)

    # 1. Basic statistics
    print("\n2️⃣ Calculating basic statistics...")
    total = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_len = df['text_length'].mean()
    print(f"Total records: {total}")
    print(f"Average text length: {avg_len:.2f}")

    # Initialize statistics dictionary with native Python types
    stats = {
        'basic_statistics': {
            'total_records': int(total),
            'avg_length': float(avg_len)
        },
        'keyword_statistics': {}
    }

    # 2. Keyword analysis
    print("\n3️⃣ Performing keyword analysis...")
    with open(keywords_path, 'r') as f:
        keywords = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(keywords)} keywords")

    # Count keywords and store in stats
    for keyword in keywords:
        cnt = df['clean_text'].str.contains(keyword, case=False).sum()
        stats['keyword_statistics'][keyword] = int(cnt)
        print(f"  - {keyword}: {cnt} records")

    # 3. Visualization
    print("\n4️⃣ Generating visualizations...")
    output_path = Path(output_dir) / "plots"
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Charts will be saved in: {output_path}")

    # 3.1 Keyword distribution chart
    print("  - Generating keyword distribution chart...")
    plt.figure(figsize=(15, 8))
    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
    plt.xticks(rotation=45, ha='right')
    plt.title(f'Keyword Distribution for {subset_name.capitalize()} Subset (Optimized)')
    plt.xlabel('Keywords')
    plt.ylabel('Match Count')
    plt.savefig(output_path / f"keyword_distribution_{subset_name}_subset_opt.png", bbox_inches='tight')
    plt.close()

    # 3.2 Text length distribution
    print("  - Generating text length distribution...")
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title(f'Text Length Distribution ({subset_name.capitalize()} Subset - Optimized)')
    plt.xlabel('Text Length')
    plt.ylabel('Frequency')
    plt.savefig(output_path / f"text_length_dist_{subset_name}_subset_opt.png", bbox_inches='tight')
    plt.close()

    # 3.3 Keyword co-occurrence analysis
    print("  - Generating keyword co-occurrence heatmap...")
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    for text in df['clean_text']:
        present_keywords = [k for k in keywords if k.lower() in text.lower()]
        for i, k1 in enumerate(present_keywords):
            for j, k2 in enumerate(present_keywords):
                if i != j:
                    cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1

    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix,
                xticklabels=keywords,
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title(f'Keyword Co-occurrence Heatmap ({subset_name.capitalize()} Subset - Optimized)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(output_path / f"keyword_cooccurrence_{subset_name}_subset_opt.png", bbox_inches='tight')
    plt.close()

    # 4. Save statistics
    print("\n5️⃣ Saving statistics...")
    stats_path = Path(output_dir) / "stats"
    stats_path.mkdir(parents=True, exist_ok=True)
    stats_file = stats_path / f"analysis_stats_{subset_name}_subset_opt.json"

    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to: {stats_file}")

    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")

if __name__ == "__main__":
    # Set file paths for optimized version
    emergency_subset = "../dataset/emergency/emergency_subset_opt.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    output_dir = "../analysis"

    # Run analysis
    analyze_subset(emergency_subset, emergency_keywords, output_dir, "emergency")
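
The co-occurrence heatmap in the script above counts, per record, every pair of distinct keywords found in the text; each unordered pair contributes one count in each direction, so the matrix is symmetric. A toy illustration of that counting with made-up keywords and texts:

    # Toy illustration of what the co-occurrence matrix encodes; keywords and
    # texts are made up, the counting loop mirrors the script above.
    import numpy as np

    keywords = ["shock", "hypotension", "cpr"]
    texts = ["septic shock with hypotension", "cpr started for shock"]

    matrix = np.zeros((len(keywords), len(keywords)))
    for text in texts:
        present = [k for k in keywords if k in text.lower()]
        for i, k1 in enumerate(present):
            for j, k2 in enumerate(present):
                if i != j:
                    matrix[keywords.index(k1)][keywords.index(k2)] += 1
    print(matrix)  # symmetric: [[0,1,1],[1,0,0],[1,0,0]]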
| @@ -0,0 +1,263 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # /scripts/data_explorer_treatment_opt.py
         | 
| 2 | 
            +
            import pandas as pd
         | 
| 3 | 
            +
            import matplotlib.pyplot as plt
         | 
| 4 | 
            +
            import seaborn as sns
         | 
| 5 | 
            +
            import numpy as np
         | 
| 6 | 
            +
            from pathlib import Path
         | 
| 7 | 
            +
            import json
         | 
| 8 | 
            +
            import numpy as np
         | 
| 9 | 
            +
            from tqdm import tqdm
         | 
| 10 | 
            +
            import re
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            def calculate_density(matches, text_length):
         | 
| 13 | 
            +
                """
         | 
| 14 | 
            +
                Calculate keyword density per 1000 words
         | 
| 15 | 
            +
                
         | 
| 16 | 
            +
                Args:
         | 
| 17 | 
            +
                    matches: Number of keyword matches
         | 
| 18 | 
            +
                    text_length: Total text length
         | 
| 19 | 
            +
                    
         | 
| 20 | 
            +
                Returns:
         | 
| 21 | 
            +
                    float: Density per 1000 words
         | 
| 22 | 
            +
                """
         | 
| 23 | 
            +
                return (matches / text_length) * 1000
         | 
| 24 | 
            +
             | 
def analyze_treatment_subset(
    treatment_file_path,
    emergency_keywords_path,
    treatment_keywords_path,
    output_dir="analysis_treatment_opt"  # Updated default output directory
):
    """
    Specialized analysis for the optimized treatment subset, focusing on:
    1. Dual keyword analysis (emergency + treatment)
    2. Path B effectiveness validation
    3. Condition mapping data preparation
    4. RAG readiness assessment
    """
    print(f"\n{'='*60}")
    print("Treatment Subset Analysis (Optimized Version)")
    print(f"Treatment file: {treatment_file_path}")
    print(f"Emergency keywords: {emergency_keywords_path}")
    print(f"Treatment keywords: {treatment_keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*60}\n")

    # Load data
    print("1️⃣ Loading optimized treatment subset data...")
    df = pd.read_csv(treatment_file_path)
    output_dir = Path(output_dir)

    # Load keyword lists (one keyword per line, blank lines ignored)
    print("2️⃣ Loading keyword lists...")
    with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
        emergency_keywords = [line.strip() for line in f if line.strip()]

    with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
        treatment_keywords = [line.strip() for line in f if line.strip()]

    print(f"   Emergency keywords: {len(emergency_keywords)}")
    print(f"   Treatment keywords: {len(treatment_keywords)}")

    # Basic statistics
    print("\n3️⃣ Computing basic statistics...")
    total_records = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_length = df['text_length'].mean()

    print(f"   Total treatment records: {total_records}")
    print(f"   Average text length: {avg_length:.2f} characters")

    # Initialize comprehensive statistics
    stats = {
        'basic_statistics': {
            'total_records': int(total_records),
            'avg_text_length': float(avg_length),
            'emergency_keywords_count': len(emergency_keywords),
            'treatment_keywords_count': len(treatment_keywords)
        },
        'emergency_keyword_stats': {},
        'treatment_keyword_stats': {},
        'cooccurrence_analysis': {},
        'path_b_validation': {},
        'condition_mapping_candidates': {}
    }

    # Emergency keyword analysis in treatment subset
    print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
    for keyword in emergency_keywords:
        # Literal, case-insensitive substring match (regex=False avoids
        # misinterpreting keywords that contain regex metacharacters)
        count = df['clean_text'].str.contains(keyword, case=False, regex=False, na=False).sum()
        stats['emergency_keyword_stats'][keyword] = int(count)
        print(f"   Emergency: {keyword} -> {count} records")

    # Treatment keyword analysis
    print("\n5️⃣ Analyzing treatment keywords...")
    for keyword in treatment_keywords:
        count = df['clean_text'].str.contains(keyword, case=False, regex=False, na=False).sum()
        stats['treatment_keyword_stats'][keyword] = int(count)
        print(f"   Treatment: {keyword} -> {count} records")

    # Step 6: Co-occurrence analysis
    print("\n6️⃣ Computing keyword co-occurrence patterns...")

    # Initialize boolean indicator matrices for the full dataset
    # (rows = records, columns = keywords)
    emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
    treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)

    # Pre-process text
    print("   Pre-processing text...")
    df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()

    # Process all emergency keywords (whole-word regex match)
    print("\n   Processing all emergency keywords...")
    for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
        matches = emergency_matrix[:, i].sum()
        print(f"   - {keyword}: {matches} matches")

    # Process all treatment keywords (whole-word regex match)
    print("\n   Processing all treatment keywords...")
    for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
        matches = treatment_matrix[:, i].sum()
        print(f"   - {keyword}: {matches} matches")

    # Compute co-occurrence matrix
    print("\n   Computing co-occurrence matrix...")
    cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
    print("   Computation completed successfully")

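    # The (i, j) entry of cooc_matrix counts records whose text contains both
    # emergency keyword i and treatment keyword j: each boolean column marks the
    # records that mention one keyword, so the dot product of two columns is the
    # size of their intersection. For example, if an emergency keyword occurs in
    # records {0, 2} and a treatment keyword in records {0, 1, 2}, their entry is 2.
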
    # Extract results
    print("   Extracting co-occurrence pairs...")
    cooccurrence_pairs = []
    for i, em_kw in enumerate(emergency_keywords):
        for j, tr_kw in enumerate(treatment_keywords):
            count = int(cooc_matrix[i, j])
            if count > 0:
                cooccurrence_pairs.append({
                    'emergency_keyword': em_kw,
                    'treatment_keyword': tr_kw,
                    'cooccurrence_count': count,
                    'percentage': float(count / len(df) * 100)
                })

    # Sort and store results
    cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
    stats['cooccurrence_analysis'] = cooccurrence_pairs[:20]  # Top 20 pairs

    print(f"   Found {len(cooccurrence_pairs)} co-occurrence pairs")
    print("   Top 5 co-occurrence pairs:")
    for i, pair in enumerate(cooccurrence_pairs[:5]):
        print(f"     {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")

    # Step 7: Path B validation metrics
    print("\n7️⃣ Validating Path B strategy effectiveness...")

    # Compute keyword density with progress bar
    print("   Computing keyword density...")
    with tqdm(total=2, desc="Density calculation") as pbar:
        emergency_density = calculate_density(
            emergency_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)

        treatment_density = calculate_density(
            treatment_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)

    # Store density in dataframe for visualization
    df['emergency_keyword_density'] = emergency_density
    df['treatment_keyword_density'] = treatment_density

    # Calculate statistics with the new density metrics
    stats['path_b_validation'] = {
        'avg_emergency_density': float(np.mean(emergency_density)),
        'avg_treatment_density': float(np.mean(treatment_density)),
        'high_density_records': int(sum(
            (emergency_density >= np.percentile(emergency_density, 75)) &
            (treatment_density >= np.percentile(treatment_density, 75))
        )),
        'precision_estimate': float(sum(
            (emergency_density > 0) & (treatment_density > 0)
        ) / len(df))
    }

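    # precision_estimate is the fraction of records that contain at least one
    # emergency keyword and at least one treatment keyword (a rough proxy for how
    # well the Path B filter retains dual-topic documents); high_density_records
    # counts documents in the top quartile of both density distributions.
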
    # Print detailed results
    print("\n   Results:")
    print(f"   - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
    print(f"   - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
    print(f"   - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
    print(f"   - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")

    # Sample distribution analysis
    print("\n   Density Distribution:")
    density_counts = pd.DataFrame({
        'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
        'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
    }).value_counts().head()
    print("   Top 5 density combinations (emergency, treatment):")
    for (em, tr), count in density_counts.items():
        print(f"   - {count} documents have {em} emergency and {tr} treatment density")

    # Visualization
    print("\n8️⃣ Generating visualizations...")
    output_plots = output_dir / "plots"
    output_plots.mkdir(parents=True, exist_ok=True)

    # 1. Keyword density scatter plot with improved visualization
    plt.figure(figsize=(12, 8))
    plt.scatter(
        emergency_density,
        treatment_density,
        alpha=0.6,
        c=np.log1p(df['text_length']),
        cmap='viridis'
    )
    plt.colorbar(label='Log Text Length')
    plt.xlabel('Emergency Keyword Density (per 1000 words)')
    plt.ylabel('Treatment Keyword Density (per 1000 words)')
    plt.title('Emergency vs Treatment Keyword Density (Optimized)')
    plt.grid(True, alpha=0.3)

    # Add mean lines
    plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
    plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
    plt.legend()

    plt.savefig(output_plots / "keyword_density_scatter_opt.png", bbox_inches='tight', dpi=300)
    plt.close()

    # Save comprehensive statistics
    print("\n9️⃣ Saving analysis results...")
    stats_dir = output_dir / "stats"
    stats_dir.mkdir(parents=True, exist_ok=True)

    with open(stats_dir / "treatment_analysis_comprehensive_opt.json", 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print("✅ Treatment subset analysis complete! (Optimized Version)")
    print(f"   Results saved to: {output_dir}")
    print(f"   Plots: {output_plots}")
    print(f"   Statistics: {stats_dir}")

    return stats

if __name__ == "__main__":
    # Configuration for optimized version
    treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    treatment_keywords = "../keywords/treatment_keywords.txt"
    output_directory = "../analysis_treatment_opt"

    # Run analysis
    results = analyze_treatment_subset(
        treatment_file,
        emergency_keywords,
        treatment_keywords,
        output_directory
    )
