# /scripts/data_explorer.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import json
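# Expected inputs, inferred from the code below:
#   - file_path: CSV containing a `clean_text` column of preprocessed text
#   - keywords_path: plain-text file with one keyword per line, e.g.
#         ambulance
#         triage
#     (hypothetical entries shown; supply the subset's real keyword list)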

def analyze_subset(file_path, keywords_path, output_dir="analysis"):
    """Analyze subset data quality and distribution.

    Expects a CSV with a `clean_text` column and a newline-delimited
    keywords file; writes plots and JSON statistics under `output_dir`.
    """
    print(f"\n{'='*50}")
    print(f"Starting dataset analysis: {file_path}")
    print(f"Using keywords file: {keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*50}\n")
    
    # Load data
    print("1️⃣ Loading data...")
    df = pd.read_csv(file_path)
    output_dir = Path(output_dir)
    subset_name = Path(file_path).stem  # e.g. "emergency_subset"; used in titles and filenames
    
    # 1. Basic statistics
    print("\n2️⃣ Calculating basic statistics...")
    total = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_len = df['text_length'].mean()
    print(f"Total records: {total}")
    print(f"Average text length: {avg_len:.2f}")
    
    # Initialize statistics dictionary with native Python types
    stats = {
        'basic_statistics': {
            'total_records': int(total),
            'avg_length': float(avg_len)
        },
        'keyword_statistics': {}
    }
    
    # 2. Keyword analysis
    print("\n3️⃣ Performing keyword analysis...")
    with open(keywords_path, 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(keywords)} keywords")
    
    # Count keyword matches and store in stats. Plain substring matching
    # (regex=False) avoids treating keywords as regex patterns; na=False
    # skips rows with missing text. Note this also counts partial-word hits.
    for keyword in keywords:
        cnt = df['clean_text'].str.contains(keyword, case=False, regex=False, na=False).sum()
        stats['keyword_statistics'][keyword] = int(cnt)
        print(f"  - {keyword}: {cnt} records")
    
    # 3. Visualization
    print("\n4️⃣ Generating visualizations...")
    output_path = output_dir / "plots"
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Charts will be saved in: {output_path}")
    
    # 3.1 Keyword distribution chart
    print("  - Generating keyword distribution chart...")
    plt.figure(figsize=(15, 8))
    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
    plt.xticks(rotation=45, ha='right')
    plt.title(f'Keyword Distribution for {subset_name}')
    plt.xlabel('Keywords')
    plt.ylabel('Match Count')
    plt.savefig(output_path / f"keyword_distribution_{subset_name}.png", bbox_inches='tight')
    plt.close()
    
    # 3.2 Text length distribution
    print("  - Generating text length distribution...")
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title('Text Length Distribution')
    plt.xlabel('Text Length')
    plt.ylabel('Frequency')
    plt.savefig(output_path / f"text_length_dist_{subset_name}.png", bbox_inches='tight')
    plt.close()
    
    # 3.3 Keyword co-occurrence analysis
    print("  - Generating keyword co-occurrence heatmap...")
    # Build a symmetric co-occurrence matrix: cell (i, j) counts how many
    # texts contain both keyword i and keyword j. Lowercase once up front
    # and skip missing text to avoid errors on NaN rows.
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    keywords_lower = [k.lower() for k in keywords]
    for text in df['clean_text'].dropna().str.lower():
        present = [idx for idx, k in enumerate(keywords_lower) if k in text]
        for i in present:
            for j in present:
                if i != j:
                    cooccurrence_matrix[i][j] += 1
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix, 
                xticklabels=keywords, 
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(output_path / f"keyword_cooccurrence_{subset_name}.png", bbox_inches='tight')
    plt.close()
    
    # 4. Save statistics
    print("\n5️⃣ Saving statistics...")
    stats_path = output_dir / "stats"
    stats_path.mkdir(parents=True, exist_ok=True)
    stats_file = stats_path / f"analysis_stats_{subset_name}.json"
    
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to: {stats_file}")
    
    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")

if __name__ == "__main__":
    # Set file paths
    emergency_subset = "../dataset/emergency/emergency_subset.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    output_dir = "../analysis"
    
    # Run analysis
    analyze_subset(emergency_subset, emergency_keywords, output_dir)
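
    # To analyze another subset, point at its CSV/keywords pair
    # (hypothetical paths, assuming the same directory layout):
    # analyze_subset("../dataset/fire/fire_subset.csv",
    #                "../keywords/fire_keywords.txt", output_dir)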