# /scripts/data_explorer.py
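"""Exploratory analysis for a keyword-filtered dataset subset.

Computes basic record statistics and per-keyword match counts, renders three
charts (keyword distribution, text-length histogram, keyword co-occurrence
heatmap), and writes the collected statistics to a JSON file.
"""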
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import json


def analyze_subset(file_path, keywords_path, output_dir="analysis"):
    """Analyze subset data quality and distribution."""
    print(f"\n{'='*50}")
    print(f"Starting dataset analysis: {file_path}")
    print(f"Using keywords file: {keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*50}\n")

    # Load data
    print("1️⃣ Loading data...")
    df = pd.read_csv(file_path)
    output_dir = Path(output_dir)

    # 1. Basic statistics
    print("\n2️⃣ Calculating basic statistics...")
    total = len(df)
    df['text_length'] = df['clean_text'].str.len()
    avg_len = df['text_length'].mean()
    print(f"Total records: {total}")
    print(f"Average text length: {avg_len:.2f}")

    # Initialize statistics dictionary with native Python types
    stats = {
        'basic_statistics': {
            'total_records': int(total),
            'avg_length': float(avg_len)
        },
        'keyword_statistics': {}
    }

    # 2. Keyword analysis
    print("\n3️⃣ Performing keyword analysis...")
    with open(keywords_path, 'r') as f:
        keywords = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(keywords)} keywords")

    # Count keywords and store in stats; regex=False matches each keyword as a
    # literal substring (not a regex pattern) and na=False skips missing text
    for keyword in keywords:
        cnt = df['clean_text'].str.contains(keyword, case=False, na=False, regex=False).sum()
        stats['keyword_statistics'][keyword] = int(cnt)
        print(f" - {keyword}: {cnt} records")

    # 3. Visualization
    print("\n4️⃣ Generating visualizations...")
    output_path = output_dir / "plots"
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Charts will be saved in: {output_path}")

    # 3.1 Keyword distribution chart
    print(" - Generating keyword distribution chart...")
    plt.figure(figsize=(15, 8))
    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
    plt.xticks(rotation=45, ha='right')
    # TODO: change the title to the name of the subset
    plt.title('Keyword Distribution for Emergency Subset')
    plt.xlabel('Keywords')
    plt.ylabel('Match Count')
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_distribution_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 3.2 Text length distribution
    print(" - Generating text length distribution...")
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title('Text Length Distribution')
    plt.xlabel('Text Length')
    plt.ylabel('Frequency')
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "text_length_dist_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 3.3 Keyword co-occurrence analysis
    print(" - Generating keyword co-occurrence heatmap...")
    keyword_index = {k: i for i, k in enumerate(keywords)}  # avoids repeated list.index() scans
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    for text in df['clean_text'].dropna():  # dropna() guards against NaN cells
        present_keywords = [k for k in keywords if k.lower() in text.lower()]
        for i, k1 in enumerate(present_keywords):
            for j, k2 in enumerate(present_keywords):
                if i != j:
                    cooccurrence_matrix[keyword_index[k1]][keyword_index[k2]] += 1
    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix,
                xticklabels=keywords,
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_cooccurrence_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 4. Save statistics
    print("\n5️⃣ Saving statistics...")
    stats_path = output_dir / "stats"
    stats_path.mkdir(parents=True, exist_ok=True)
    # TODO: change the name of the file to the name of the subset
    stats_file = stats_path / "analysis_stats_emergency_subset.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"Statistics saved to: {stats_file}")
    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")


if __name__ == "__main__":
    # Set file paths
    emergency_subset = "../dataset/emergency/emergency_subset.csv"
    emergency_keywords = "../keywords/emergency_keywords.txt"
    output_dir = "../analysis"

    # Run analysis
    analyze_subset(emergency_subset, emergency_keywords, output_dir)
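
# Expected inputs (an assumption inferred from the code above; the actual data
# files are not shown here): the CSV must contain a 'clean_text' string column,
# and the keywords file holds one keyword per line, blank lines ignored.
# Results are written to <output_dir>/plots/ (PNG charts) and
# <output_dir>/stats/ (the JSON statistics file).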