Spaces:
Sleeping
Sleeping
YanBoChen
committed on
Commit
·
a5bcfa7
1
Parent(s):
7d8970e
WIP: add dual keyword and text length distribution plots for treatment subset analysis
Browse files
dataset/scripts/data_explorer_treatment.py
CHANGED
|
@@ -84,11 +84,10 @@ def analyze_treatment_subset(
|
|
| 84 |
stats['treatment_keyword_stats'][keyword] = int(count)
|
| 85 |
print(f" Treatment: {keyword} -> {count} records")
|
| 86 |
|
| 87 |
-
# Co-occurrence analysis
|
| 88 |
print("\n6️⃣ Computing keyword co-occurrence patterns...")
|
| 89 |
-
print(" Creating boolean matrices...")
|
| 90 |
|
| 91 |
-
# Initialize
|
| 92 |
emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
|
| 93 |
treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
|
| 94 |
|
|
@@ -96,35 +95,26 @@ def analyze_treatment_subset(
|
|
| 96 |
print(" Pre-processing text...")
|
| 97 |
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
|
| 98 |
|
| 99 |
-
#
|
| 100 |
-
print(" Processing emergency keywords...")
|
| 101 |
for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
|
| 102 |
-
pattern = r'
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
regex=True,
|
| 107 |
-
na=False
|
| 108 |
-
).values
|
| 109 |
-
except Exception as e:
|
| 110 |
-
print(f" Warning: Error processing keyword '{keyword}': {str(e)}")
|
| 111 |
|
| 112 |
-
#
|
| 113 |
-
print(" Processing treatment keywords...")
|
| 114 |
for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
|
| 115 |
-
pattern = r'
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
regex=True,
|
| 120 |
-
na=False
|
| 121 |
-
).values
|
| 122 |
-
except Exception as e:
|
| 123 |
-
print(f" Warning: Error processing keyword '{keyword}': {str(e)}")
|
| 124 |
|
| 125 |
-
# Compute co-occurrence
|
| 126 |
-
print(" Computing co-occurrence matrix...")
|
| 127 |
-
cooc_matrix = emergency_matrix.T @ treatment_matrix
|
|
|
|
| 128 |
|
| 129 |
# Extract results
|
| 130 |
print(" Extracting co-occurrence pairs...")
|
|
@@ -137,7 +127,7 @@ def analyze_treatment_subset(
|
|
| 137 |
'emergency_keyword': em_kw,
|
| 138 |
'treatment_keyword': tr_kw,
|
| 139 |
'cooccurrence_count': count,
|
| 140 |
-
'percentage': float(count /
|
| 141 |
})
|
| 142 |
|
| 143 |
# Sort and store results
|
|
@@ -149,35 +139,45 @@ def analyze_treatment_subset(
|
|
| 149 |
for i, pair in enumerate(cooccurrence_pairs[:5]):
|
| 150 |
print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
|
| 151 |
|
| 152 |
-
# Path B validation metrics
|
| 153 |
print("\n7️⃣ Validating Path B strategy effectiveness...")
|
| 154 |
|
| 155 |
-
#
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
tr_matches = sum(1 for kw in treatment_keywords if kw.lower() in text)
|
| 163 |
-
|
| 164 |
-
emergency_density.append(em_matches)
|
| 165 |
-
treatment_density.append(tr_matches)
|
| 166 |
|
|
|
|
| 167 |
df['emergency_keyword_density'] = emergency_density
|
| 168 |
df['treatment_keyword_density'] = treatment_density
|
| 169 |
|
|
|
|
| 170 |
stats['path_b_validation'] = {
|
| 171 |
'avg_emergency_density': float(np.mean(emergency_density)),
|
| 172 |
'avg_treatment_density': float(np.mean(treatment_density)),
|
| 173 |
-
'high_density_records': int(sum(
|
| 174 |
-
'precision_estimate': float(sum(
|
| 175 |
}
|
| 176 |
|
| 177 |
-
|
| 178 |
-
print(
|
| 179 |
-
print(f"
|
| 180 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
# Condition mapping candidates
|
| 183 |
print("\n8️⃣ Preparing condition mapping candidates...")
|
|
|
|
| 84 |
stats['treatment_keyword_stats'][keyword] = int(count)
|
| 85 |
print(f" Treatment: {keyword} -> {count} records")
|
| 86 |
|
| 87 |
+
# Step 6: Co-occurrence analysis
|
| 88 |
print("\n6️⃣ Computing keyword co-occurrence patterns...")
|
|
|
|
| 89 |
|
| 90 |
+
# Initialize matrices for full dataset
|
| 91 |
emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
|
| 92 |
treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
|
| 93 |
|
|
|
|
| 95 |
print(" Pre-processing text...")
|
| 96 |
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
|
| 97 |
|
| 98 |
+
# Process all emergency keywords
|
| 99 |
+
print("\n Processing all emergency keywords...")
|
| 100 |
for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
|
| 101 |
+
pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
|
| 102 |
+
emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
| 103 |
+
matches = emergency_matrix[:, i].sum()
|
| 104 |
+
print(f" - {keyword}: {matches} matches")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
+
# Process all treatment keywords
|
| 107 |
+
print("\n Processing all treatment keywords...")
|
| 108 |
for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
|
| 109 |
+
pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
|
| 110 |
+
treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
| 111 |
+
matches = treatment_matrix[:, i].sum()
|
| 112 |
+
print(f" - {keyword}: {matches} matches")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
+
# Compute co-occurrence matrix
|
| 115 |
+
print("\n Computing co-occurrence matrix...")
|
| 116 |
+
cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
|
| 117 |
+
print(" Computation completed successfully")
|
| 118 |
|
| 119 |
# Extract results
|
| 120 |
print(" Extracting co-occurrence pairs...")
|
|
|
|
| 127 |
'emergency_keyword': em_kw,
|
| 128 |
'treatment_keyword': tr_kw,
|
| 129 |
'cooccurrence_count': count,
|
| 130 |
+
'percentage': float(count / len(df) * 100)
|
| 131 |
})
|
| 132 |
|
| 133 |
# Sort and store results
|
|
|
|
| 139 |
for i, pair in enumerate(cooccurrence_pairs[:5]):
|
| 140 |
print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
|
| 141 |
|
| 142 |
+
# Step 7: Path B validation metrics
|
| 143 |
print("\n7️⃣ Validating Path B strategy effectiveness...")
|
| 144 |
|
| 145 |
+
# Compute keyword density with progress bar
|
| 146 |
+
print(" Computing keyword density...")
|
| 147 |
+
with tqdm(total=2, desc="Density calculation") as pbar:
|
| 148 |
+
emergency_density = emergency_matrix.sum(axis=1)
|
| 149 |
+
pbar.update(1)
|
| 150 |
+
treatment_density = treatment_matrix.sum(axis=1)
|
| 151 |
+
pbar.update(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
+
# Store density in dataframe
|
| 154 |
df['emergency_keyword_density'] = emergency_density
|
| 155 |
df['treatment_keyword_density'] = treatment_density
|
| 156 |
|
| 157 |
+
# Calculate statistics
|
| 158 |
stats['path_b_validation'] = {
|
| 159 |
'avg_emergency_density': float(np.mean(emergency_density)),
|
| 160 |
'avg_treatment_density': float(np.mean(treatment_density)),
|
| 161 |
+
'high_density_records': int(sum((emergency_density >= 2) & (treatment_density >= 2))),
|
| 162 |
+
'precision_estimate': float(sum((emergency_density >= 1) & (treatment_density >= 1)) / len(df))
|
| 163 |
}
|
| 164 |
|
| 165 |
+
# Print detailed results
|
| 166 |
+
print("\n Results:")
|
| 167 |
+
print(f" - Average emergency keyword density: {stats['path_b_validation']['avg_emergency_density']:.2f}")
|
| 168 |
+
print(f" - Average treatment keyword density: {stats['path_b_validation']['avg_treatment_density']:.2f}")
|
| 169 |
+
print(f" - High-density records (≥2 each): {stats['path_b_validation']['high_density_records']}")
|
| 170 |
+
print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
|
| 171 |
+
|
| 172 |
+
# Sample distribution analysis
|
| 173 |
+
print("\n Density Distribution:")
|
| 174 |
+
density_counts = pd.DataFrame({
|
| 175 |
+
'emergency': emergency_density,
|
| 176 |
+
'treatment': treatment_density
|
| 177 |
+
}).value_counts().head()
|
| 178 |
+
print(" Top 5 density combinations (emergency, treatment):")
|
| 179 |
+
for (em, tr), count in density_counts.items():
|
| 180 |
+
print(f" - {count} documents have {em} emergency and {tr} treatment keywords")
|
| 181 |
|
| 182 |
# Condition mapping candidates
|
| 183 |
print("\n8️⃣ Preparing condition mapping candidates...")
|