# /scripts/data_explorer_treatment.py
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import json
from tqdm import tqdm
import re


def calculate_density(matches, text_length):
    """
    Calculate keyword density per 1,000 characters.

    Note: text_length is a character count (length of clean_text), so the
    resulting density is per 1,000 characters; texts are assumed non-empty.

    Args:
        matches: Number of keyword matches per record
        text_length: Total text length in characters

    Returns:
        float: Density per 1,000 characters
    """
    return (matches / text_length) * 1000
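
# Hypothetical variant (not called by the pipeline below): this script
# measures text_length in characters, so densities are per 1,000 characters.
# If per-1,000-word density were ever needed, a minimal sketch assuming
# whitespace tokenization could look like this:
def calculate_density_per_1000_words(matches, word_counts):
    """Keyword density per 1,000 whitespace-delimited words (illustrative)."""
    return (matches / word_counts) * 1000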


def analyze_treatment_subset(
    treatment_file_path,
    emergency_keywords_path,
    treatment_keywords_path,
    output_dir="analysis_treatment"
):
    """
    Specialized analysis for the treatment subset focusing on:

    1. Dual keyword analysis (emergency + treatment)
    2. Path B effectiveness validation
    3. Condition mapping data preparation
    4. RAG readiness assessment
    """
    print(f"\n{'='*60}")
    print("Treatment Subset Analysis")
    print(f"Treatment file: {treatment_file_path}")
    print(f"Emergency keywords: {emergency_keywords_path}")
    print(f"Treatment keywords: {treatment_keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*60}\n")

    # Load data
    print("1️⃣ Loading treatment subset data...")
    df = pd.read_csv(treatment_file_path)
    output_dir = Path(output_dir)

    # Load keyword lists (one keyword per line, blank lines ignored)
    print("2️⃣ Loading keyword lists...")
    with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
        emergency_keywords = [line.strip() for line in f if line.strip()]
    with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
        treatment_keywords = [line.strip() for line in f if line.strip()]
    print(f"   Emergency keywords: {len(emergency_keywords)}")
    print(f"   Treatment keywords: {len(treatment_keywords)}")

    # Basic statistics
    print("\n3️⃣ Computing basic statistics...")
    total_records = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_length = df['text_length'].mean()
    print(f"   Total treatment records: {total_records}")
    print(f"   Average text length: {avg_length:.2f} characters")

    # Initialize comprehensive statistics
    stats = {
        'basic_statistics': {
            'total_records': int(total_records),
            'avg_text_length': float(avg_length),
            'emergency_keywords_count': len(emergency_keywords),
            'treatment_keywords_count': len(treatment_keywords)
        },
        'emergency_keyword_stats': {},
        'treatment_keyword_stats': {},
        'cooccurrence_analysis': {},
        'path_b_validation': {},
        'condition_mapping_candidates': {}
    }

    # Emergency keyword analysis in treatment subset
    print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
    for keyword in emergency_keywords:
        # regex=False: match keywords literally, so regex metacharacters
        # in a keyword are not interpreted as patterns
        count = df['clean_text'].str.contains(
            keyword, case=False, na=False, regex=False).sum()
        stats['emergency_keyword_stats'][keyword] = int(count)
        print(f"   Emergency: {keyword} -> {count} records")

    # Treatment keyword analysis
    print("\n5️⃣ Analyzing treatment keywords...")
    for keyword in treatment_keywords:
        count = df['clean_text'].str.contains(
            keyword, case=False, na=False, regex=False).sum()
        stats['treatment_keyword_stats'][keyword] = int(count)
        print(f"   Treatment: {keyword} -> {count} records")

    # Step 6: Co-occurrence analysis
    print("\n6️⃣ Computing keyword co-occurrence patterns...")

    # Boolean presence matrices: one row per record, one column per keyword
    emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
    treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)

    # Pre-process text
    print("   Pre-processing text...")
    df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()

    # Process all emergency keywords
    print("\n   Processing all emergency keywords...")
    for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
        # Using word boundaries instead of negative lookbehind/lookahead
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        emergency_matrix[:, i] = df['clean_text_lower'].str.contains(
            pattern, regex=True, na=False)
        matches = emergency_matrix[:, i].sum()
        print(f"   - {keyword}: {matches} matches")

    # Process all treatment keywords
    print("\n   Processing all treatment keywords...")
    for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
        # Using word boundaries instead of negative lookbehind/lookahead
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        treatment_matrix[:, i] = df['clean_text_lower'].str.contains(
            pattern, regex=True, na=False)
        matches = treatment_matrix[:, i].sum()
        print(f"   - {keyword}: {matches} matches")

    # Compute co-occurrence matrix: entry [i, j] counts the records that
    # contain both emergency keyword i and treatment keyword j
    print("\n   Computing co-occurrence matrix...")
    cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
    print("   Computation completed successfully")

    # Extract results
    print("   Extracting co-occurrence pairs...")
    cooccurrence_pairs = []
    for i, em_kw in enumerate(emergency_keywords):
        for j, tr_kw in enumerate(treatment_keywords):
            count = int(cooc_matrix[i, j])
            if count > 0:
                cooccurrence_pairs.append({
                    'emergency_keyword': em_kw,
                    'treatment_keyword': tr_kw,
                    'cooccurrence_count': count,
                    'percentage': float(count / len(df) * 100)
                })

    # Sort and store results
    cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
    stats['cooccurrence_analysis'] = cooccurrence_pairs[:20]  # Top 20 pairs

    print(f"   Found {len(cooccurrence_pairs)} co-occurrence pairs")
    print("   Top 5 co-occurrence pairs:")
    for i, pair in enumerate(cooccurrence_pairs[:5]):
        print(f"   {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: "
              f"{pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")

    # Step 7: Path B validation metrics
    print("\n7️⃣ Validating Path B strategy effectiveness...")

    # Compute keyword density with a progress bar
    print("   Computing keyword density...")
    with tqdm(total=2, desc="Density calculation") as pbar:
        # Density of distinct keyword hits per 1,000 characters,
        # for emergency and treatment keywords
        emergency_density = calculate_density(
            emergency_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)
        treatment_density = calculate_density(
            treatment_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)

    # Store densities in the dataframe for visualization
    df['emergency_keyword_density'] = emergency_density
    df['treatment_keyword_density'] = treatment_density

    # Calculate statistics with the new density metrics
    stats['path_b_validation'] = {
        'avg_emergency_density': float(np.mean(emergency_density)),
        'avg_treatment_density': float(np.mean(treatment_density)),
        'high_density_records': int(np.sum(
            (emergency_density >= np.percentile(emergency_density, 75)) &
            (treatment_density >= np.percentile(treatment_density, 75))
        )),
        'precision_estimate': float(np.sum(
            (emergency_density > 0) & (treatment_density > 0)
        ) / len(df))
    }

    # Print detailed results
    print("\n   Results:")
    print(f"   - Average emergency keyword density (per 1,000 characters): "
          f"{stats['path_b_validation']['avg_emergency_density']:.2f}")
    print(f"   - Average treatment keyword density (per 1,000 characters): "
          f"{stats['path_b_validation']['avg_treatment_density']:.2f}")
    print(f"   - High-density records (top 25% in both): "
          f"{stats['path_b_validation']['high_density_records']}")
    print(f"   - Precision estimate: "
          f"{stats['path_b_validation']['precision_estimate']:.2f}")

    # Sample distribution analysis
    print("\n   Density Distribution:")
    quartile_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
    density_counts = pd.DataFrame({
        'emergency': pd.qcut(emergency_density, q=4, labels=quartile_labels),
        'treatment': pd.qcut(treatment_density, q=4, labels=quartile_labels)
    }).value_counts().head()

    print("   Top 5 density combinations (emergency, treatment):")
    for (em, tr), count in density_counts.items():
        print(f"   - {count} documents have {em} emergency and {tr} treatment density")

    # Visualization
    print("\n8️⃣ Generating visualizations...")
    output_plots = output_dir / "plots"
    output_plots.mkdir(parents=True, exist_ok=True)

    # 1. Keyword density scatter plot
    plt.figure(figsize=(12, 8))
    plt.scatter(
        emergency_density,
        treatment_density,
        alpha=0.6,
        c=np.log1p(df['text_length']),  # Color by log text length
        cmap='viridis'
    )
    plt.colorbar(label='Log Text Length')
    plt.xlabel('Emergency Keyword Density (per 1,000 characters)')
    plt.ylabel('Treatment Keyword Density (per 1,000 characters)')
    plt.title('Emergency vs Treatment Keyword Density')
    plt.grid(True, alpha=0.3)

    # Add mean lines
    plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--',
                alpha=0.5, label='Mean Emergency Density')
    plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--',
                alpha=0.5, label='Mean Treatment Density')
    plt.legend()
    plt.savefig(output_plots / "keyword_density_scatter.png",
                bbox_inches='tight', dpi=300)
    plt.close()

    # Save comprehensive statistics
    print("\n9️⃣ Saving analysis results...")
    stats_dir = output_dir / "stats"
    stats_dir.mkdir(parents=True, exist_ok=True)
    with open(stats_dir / "treatment_analysis_comprehensive.json", 'w',
              encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print("✅ Treatment subset analysis complete!")
    print(f"   Results saved to: {output_dir}")
    print(f"   Plots: {output_plots}")
    print(f"   Statistics: {stats_dir}")

    return stats
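

# Illustrative sketch of the step-6 co-occurrence identity (not part of the
# pipeline): for boolean presence matrices E (records x emergency keywords)
# and T (records x treatment keywords), E.T @ T counts, for each keyword
# pair, the records containing both. The toy data here is hypothetical.
def _cooccurrence_demo():
    E = np.array([[1, 0],
                  [1, 1],
                  [0, 1]])  # 3 records x 2 emergency keywords
    T = np.array([[1],
                  [1],
                  [0]])     # 3 records x 1 treatment keyword
    # Entry [i, j] = number of records containing emergency keyword i
    # and treatment keyword j
    return E.T @ T          # -> [[2], [1]]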


if __name__ == "__main__":
    # Configuration
    treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    treatment_keywords = "../keywords/treatment_keywords.txt"
    output_directory = "../analysis_treatment"

    # Run analysis
    results = analyze_treatment_subset(
        treatment_file,
        emergency_keywords,
        treatment_keywords,
        output_directory
    )
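
# Usage note: the relative paths above assume the script is run from the
# scripts/ directory, e.g. `python data_explorer_treatment.py`. Each keyword
# file is plain text with one keyword per line (blank lines are ignored);
# the sample entries below are hypothetical:
#     cardiac arrest
#     anaphylaxis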