# /scripts/data_explorer_treatment_opt.py
import json
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm


def calculate_density(matches, text_length):
    """
    Calculate keyword density per 1000 characters.

    Args:
        matches: Array of keyword match counts, one per record
        text_length: Array of text lengths in characters, one per record

    Returns:
        np.ndarray: Density per 1000 characters for each record
    """
    matches = np.asarray(matches, dtype=float)
    lengths = np.asarray(text_length, dtype=float)
    # Zero-length texts get density 0 instead of triggering a division by zero
    return np.divide(matches * 1000.0, lengths,
                     out=np.zeros_like(matches), where=lengths > 0)
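
# Example: calculate_density(3, 1500) yields 2.0 matches per 1000 characters
# for a record with 3 keyword hits in a 1,500-character text.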


def analyze_treatment_subset(
    treatment_file_path,
    emergency_keywords_path,
    treatment_keywords_path,
    output_dir="analysis_treatment_opt"  # default output directory
):
    """
    Specialized analysis for the optimized treatment subset, focusing on:
    1. Dual keyword analysis (emergency + treatment)
    2. Path B effectiveness validation
    3. Condition mapping data preparation
    4. RAG readiness assessment
    """
    print(f"\n{'='*60}")
    print("Treatment Subset Analysis (Optimized Version)")
    print(f"Treatment file: {treatment_file_path}")
    print(f"Emergency keywords: {emergency_keywords_path}")
    print(f"Treatment keywords: {treatment_keywords_path}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*60}\n")

    # Load data
    print("1️⃣ Loading optimized treatment subset data...")
    df = pd.read_csv(treatment_file_path)
    output_dir = Path(output_dir)

    # Load keyword lists
    print("2️⃣ Loading keyword lists...")
    with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
        emergency_keywords = [line.strip() for line in f if line.strip()]
    with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
        treatment_keywords = [line.strip() for line in f if line.strip()]
    print(f"   Emergency keywords: {len(emergency_keywords)}")
    print(f"   Treatment keywords: {len(treatment_keywords)}")

    # Basic statistics
    print("\n3️⃣ Computing basic statistics...")
    total_records = len(df)
    # fillna('') so records with missing text get length 0 rather than NaN,
    # which would otherwise propagate into the density metrics below
    df['text_length'] = df['clean_text'].fillna('').str.len()
    avg_length = df['text_length'].mean()
    print(f"   Total treatment records: {total_records}")
    print(f"   Average text length: {avg_length:.2f} characters")

    # Initialize comprehensive statistics
    stats = {
        'basic_statistics': {
            'total_records': int(total_records),
            'avg_text_length': float(avg_length),
            'emergency_keywords_count': len(emergency_keywords),
            'treatment_keywords_count': len(treatment_keywords)
        },
        'emergency_keyword_stats': {},
        'treatment_keyword_stats': {},
        'cooccurrence_analysis': {},
        'path_b_validation': {},
        'condition_mapping_candidates': {}
    }

    # Emergency keyword analysis in treatment subset
    print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
    for keyword in emergency_keywords:
        # regex=False: match keywords literally, so regex metacharacters
        # in a keyword cannot break or distort the search
        count = df['clean_text'].str.contains(keyword, case=False, regex=False, na=False).sum()
        stats['emergency_keyword_stats'][keyword] = int(count)
        print(f"   Emergency: {keyword} -> {count} records")

    # Treatment keyword analysis
    print("\n5️⃣ Analyzing treatment keywords...")
    for keyword in treatment_keywords:
        count = df['clean_text'].str.contains(keyword, case=False, regex=False, na=False).sum()
        stats['treatment_keyword_stats'][keyword] = int(count)
        print(f"   Treatment: {keyword} -> {count} records")

    # Step 6: Co-occurrence analysis
    print("\n6️⃣ Computing keyword co-occurrence patterns...")
    # Initialize matrices for full dataset
    emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
    treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
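    # One row per record, one column per keyword: entry (r, k) is True when
    # keyword k appears in record r. This layout lets a single matrix product
    # produce all pairwise co-occurrence counts below.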

    # Pre-process text once so each keyword scan works on lowercased strings
    print("   Pre-processing text...")
    df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()

    # Process all emergency keywords
    print("\n   Processing all emergency keywords...")
    for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
        # \b-anchored pattern: match the keyword as a whole word only
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
        matches = emergency_matrix[:, i].sum()
        tqdm.write(f"   - {keyword}: {matches} matches")  # keeps the progress bar intact

    # Process all treatment keywords
    print("\n   Processing all treatment keywords...")
    for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
        treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
        matches = treatment_matrix[:, i].sum()
        tqdm.write(f"   - {keyword}: {matches} matches")

    # Compute co-occurrence matrix
    print("\n   Computing co-occurrence matrix...")
    cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
    print("   Computation completed successfully")

    # Extract results
    print("   Extracting co-occurrence pairs...")
    cooccurrence_pairs = []
    for i, em_kw in enumerate(emergency_keywords):
        for j, tr_kw in enumerate(treatment_keywords):
            count = int(cooc_matrix[i, j])
            if count > 0:
                cooccurrence_pairs.append({
                    'emergency_keyword': em_kw,
                    'treatment_keyword': tr_kw,
                    'cooccurrence_count': count,
                    'percentage': float(count / len(df) * 100)
                })

    # Sort and store results
    cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
    stats['cooccurrence_analysis'] = cooccurrence_pairs[:20]  # top 20 pairs
    print(f"   Found {len(cooccurrence_pairs)} co-occurrence pairs")
    print("   Top 5 co-occurrence pairs:")
    for i, pair in enumerate(cooccurrence_pairs[:5]):
        print(f"   {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: "
              f"{pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")

    # Step 7: Path B validation metrics
    print("\n7️⃣ Validating Path B strategy effectiveness...")
    # Compute keyword density with progress bar
    print("   Computing keyword density...")
    with tqdm(total=2, desc="Density calculation") as pbar:
        emergency_density = calculate_density(
            emergency_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)
        treatment_density = calculate_density(
            treatment_matrix.sum(axis=1),
            df['text_length']
        )
        pbar.update(1)

    # Store density in dataframe for visualization
    df['emergency_keyword_density'] = emergency_density
    df['treatment_keyword_density'] = treatment_density

    # Calculate statistics with the new density metrics
    stats['path_b_validation'] = {
        'avg_emergency_density': float(np.mean(emergency_density)),
        'avg_treatment_density': float(np.mean(treatment_density)),
        'high_density_records': int((
            (emergency_density >= np.percentile(emergency_density, 75)) &
            (treatment_density >= np.percentile(treatment_density, 75))
        ).sum()),
        'precision_estimate': float((
            (emergency_density > 0) & (treatment_density > 0)
        ).sum() / len(df))
    }
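    # precision_estimate is the share of records containing at least one
    # emergency and at least one treatment keyword; high_density_records counts
    # records in the top quartile of both density distributions.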

    # Print detailed results
    print("\n   Results:")
    print(f"   - Average emergency keyword density (per 1000 characters): {stats['path_b_validation']['avg_emergency_density']:.2f}")
    print(f"   - Average treatment keyword density (per 1000 characters): {stats['path_b_validation']['avg_treatment_density']:.2f}")
    print(f"   - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
    print(f"   - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")

    # Sample distribution analysis
    print("\n   Density Distribution:")
    # Rank before qcut so ties (e.g. many zero-density records) cannot
    # produce duplicate bin edges, which would make qcut raise
    quartile_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
    density_counts = pd.DataFrame({
        'emergency': pd.qcut(pd.Series(emergency_density).rank(method='first'),
                             q=4, labels=quartile_labels),
        'treatment': pd.qcut(pd.Series(treatment_density).rank(method='first'),
                             q=4, labels=quartile_labels)
    }).value_counts().head()
    print("   Top 5 density combinations (emergency, treatment):")
    for (em, tr), count in density_counts.items():
        print(f"   - {count} documents have {em} emergency and {tr} treatment density")

    # Visualization
    print("\n8️⃣ Generating visualizations...")
    output_plots = output_dir / "plots"
    output_plots.mkdir(parents=True, exist_ok=True)

    # 1. Keyword density scatter plot, colored by log text length
    plt.figure(figsize=(12, 8))
    plt.scatter(
        emergency_density,
        treatment_density,
        alpha=0.6,
        c=np.log1p(df['text_length']),
        cmap='viridis'
    )
    plt.colorbar(label='log(1 + text length)')
    plt.xlabel('Emergency Keyword Density (per 1000 characters)')
    plt.ylabel('Treatment Keyword Density (per 1000 characters)')
    plt.title('Emergency vs Treatment Keyword Density (Optimized)')
    plt.grid(True, alpha=0.3)
    # Add mean lines
    plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
    plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
    plt.legend()
    plt.savefig(output_plots / "keyword_density_scatter_opt.png", bbox_inches='tight', dpi=300)
    plt.close()

    # Save comprehensive statistics
    print("\n9️⃣ Saving analysis results...")
    stats_dir = output_dir / "stats"
    stats_dir.mkdir(parents=True, exist_ok=True)
    with open(stats_dir / "treatment_analysis_comprehensive_opt.json", 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print("✅ Treatment subset analysis complete! (Optimized Version)")
    print(f"   Results saved to: {output_dir}")
    print(f"   Plots: {output_plots}")
    print(f"   Statistics: {stats_dir}")
    return stats
| if __name__ == "__main__": | |
| # Configuration for optimized version | |
| treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv" | |
| emergency_keywords = "../keywords/emergency_keywords.txt" | |
| treatment_keywords = "../keywords/treatment_keywords.txt" | |
| output_directory = "../analysis_treatment_opt" | |
| # Run analysis | |
| results = analyze_treatment_subset( | |
| treatment_file, | |
| emergency_keywords, | |
| treatment_keywords, | |
| output_directory | |
| ) |
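    # Note: the paths above are relative, so this script presumably runs from
    # its own directory (e.g. `python data_explorer_treatment_opt.py` inside
    # scripts/); adjust the paths if your project layout differs.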