feat(dataset): Implement emergency subset extraction with enhanced matching

Implement initial data preprocessing pipeline for RAG system evaluation.
Key Changes:
- Enhance keyword matching with findall and non-capturing groups
- Add matched column for tracking all keyword occurrences
- Implement basic statistics calculation
- Prepare for data exploration phase
Technical Details:
1. Keyword Matching Enhancement:
- Use non-capturing groups (?:...) to handle multiple matches
- Implement proper regex pattern with word boundaries
- Handle NaN values explicitly
2. Data Flow:
```
Raw Data (guidelines_source_filtered.jsonl)
│
▼
Keyword Matching (emergency_keywords.txt)
│ ┌─ Pattern: \b(?:keyword1|keyword2)\b
│ └─ Flags: re.IGNORECASE
▼
Multiple Match Extraction
│ ┌─ Use str.findall
│ └─ Join multiple matches with |
▼
Subset Creation
│ ┌─ matched column: "keyword1|keyword2"
│ └─ has_emergency flag
▼
Output Files
├─ emergency_subset.jsonl
└─ emergency_subset.csv
```
3. Next Steps:
- Run data_explorer.py for detailed analysis
- Evaluate subset quality against draft_offlineSubsetbuilding.md
- Consider implementing treatment subset with similar approach
Performance Metrics:
- Capture all keyword matches (not just first occurrence)
- Calculate average keywords per document
- Prepare for co-occurrence analysis
This approach aligns with the RAG system requirements:
1. Maintain semantic relationships (multiple keyword tracking)
2. Enable detailed analysis (matched column)
3. Support future enhancements (treatment subset)
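
For reference, the enhanced matching described above reduces to the minimal sketch below (illustrative rows and a three-keyword list only; the column names mirror the scripts in this commit, and `re.escape` is added here as a precaution that the actual script does not apply):

```python
import re
import pandas as pd

# Illustrative keywords; the real list lives in emergency_keywords.txt
keywords = ["cardiac arrest", "sepsis", "hypotension"]

# Word-bounded, non-capturing alternation, matched case-insensitively
pattern = r"\b(?:" + "|".join(re.escape(k) for k in keywords) + r")\b"

df = pd.DataFrame({"clean_text": ["Suspected sepsis with hypotension.", None, "Routine follow-up."]})

# findall keeps every occurrence; NaN is handled explicitly via fillna
df["matched"] = (
    df["clean_text"]
    .fillna("")
    .str.findall(pattern, flags=re.IGNORECASE)
    .apply("|".join)
)
df["has_emergency"] = df["matched"].str.len() > 0

# Average keywords per matched document (the pipe must be escaped in str.count)
avg = df.loc[df["has_emergency"], "matched"].str.count(r"\|").add(1).mean()
print(df[["matched", "has_emergency"]])
print(f"Average keywords per matched document: {avg:.2f}")
```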
- dataset/check_source.py +18 -0
- dataset/filter_guidelines.py +31 -0
- dataset/keywords/emergency_keywords.txt +44 -0
- dataset/keywords/treatment_keywords.txt +113 -0
- dataset/scripts/01_filter_emergency.py +54 -0
- dataset/scripts/02_filter_treatment.py +37 -0
- dataset/scripts/20250722_datesetA_emergency_subset_preprocessing_commit_message.txt +52 -0
- dataset/scripts/data_explorer.py +92 -0

dataset/check_source.py (new file, +18 lines)

```python
import pandas as pd

# Load the JSONL file that was just downloaded and filtered
df = pd.read_json("dataset/guidelines_source_filtered.jsonl", lines=True)

# Show how many records each source contributes
print("📊 Record counts per source:")
print(df["source"].value_counts())

# Verify that only the nine approved sources are present
expected_sources = {"cco", "cdc", "cma", "icrc", "nice", "pubmed", "spor", "who", "wikidoc"}
actual_sources = set(df["source"].unique())

# Report the verification result
if actual_sources == expected_sources:
    print("✅ Sources match expectations exactly; no unexpected sources.")
else:
    print(f"❌ Unexpected sources found: {actual_sources - expected_sources}")
```

dataset/filter_guidelines.py (new file, +31 lines)

```python
# filter_guidelines.py

from datasets import load_dataset
import pandas as pd
import os

# ✅ Trusted source abbreviations (the `source` field in the Hugging Face dataset)
approved_sources = ["cco", "cdc", "cma", "icrc", "nice", "pubmed", "spor", "who", "wikidoc"]

# Step 1: Load the dataset from Hugging Face
print("⏳ Loading dataset...")
ds = load_dataset("epfl-llm/guidelines", split="train")

# Step 2: Filter on the source field
print("🔍 Filtering for trusted sources...")
ds_filtered = ds.filter(lambda ex: ex["source"] in approved_sources)
print(f"✅ Filtering complete: {len(ds_filtered)} records in total.")

# Step 3: Convert to a pandas DataFrame
print("📄 Converting to DataFrame...")
df = ds_filtered.to_pandas()

# Step 4: Create the dataset/ folder if it does not exist
os.makedirs("dataset", exist_ok=True)

# Step 5: Save as JSONL and CSV under dataset/
print("💾 Saving to the dataset/ folder...")
df.to_json("dataset/guidelines_source_filtered.jsonl", orient="records", lines=True)
df.to_csv("dataset/guidelines_source_filtered.csv", index=False)

print("🎉 Done! Records from trusted sources have been saved.")
```

dataset/keywords/emergency_keywords.txt (new file, +44 lines)

```text
Acute abdomen
Acute bleeding
Acute Coronary Syndrome
Acute Kidney Injury
Acute pancreatitis
Acute respiratory distress syndrome
Acute stroke
Anaphylaxis
Anaphylactic Shock
Arrhythmia
Atrial fibrillation
Bradycardia
Cardiac arrest
Cardiogenic Shock
Chest pain
Dyspnea
Fever
Gastrointestinal Hemorrhage (GI bleeding)
Hemorrhage
Hemorrhagic stroke
Hyperthermia
Hypovolemic Shock
Hypotension
Hypothermia
Internal bleeding
Intracranial Hemorrhages
Ischemic stroke
Loss of consciousness
Myocardial Infarction
MI
Pulmonary Edema
Pulmonary Embolism
Respiratory distress
Respiratory failure
Sepsis
Sepsis, Severe
Septic Shock
Shock
Status Epilepticus
Syncope
Tachycardia
Tachypnea
Traumatic Brain Injury
Ventricular Tachycardia
```

dataset/keywords/treatment_keywords.txt (new file, +113 lines)

```text
iv fluids
Infusion Intravenous
fluid resuscitation
Infusion Intravenous
normal saline
Infusion Intravenous
crystalloids
Infusion Intravenous
vasopressors
Vasoconstrictor Agents
Epinephrine
Ondansetron
Ibuprofen
Morphine
Lidocaine
Airway Management
intubation
Intubation Intratracheal
ventilation support
Ventilators
oxygen therapy
Oxygen Inhalation Therapy
cpap
Continuous Positive Airway Pressure
bipap
Bi-level Positive Airway Pressure
Nebulization
cpr
Cardiopulmonary Resuscitation
ACLS
Advanced Cardiac Life Support
Defibrillation
Cardioversion
Blood Transfusion
transfusion
hemodynamic monitoring
Hemodynamics
central line placement
Catheterization Central Venous
arterial line placement
Catheterization Arterial
Hemostasis
wound care
Wound Management
Suturing
Tourniquet
compression dressing
Wound Dressing
splinting
Splints
radiologic imaging
Radiography
point-of-care ultrasound
POCUS
Ultrasonography Point-of-Care
x-ray
Radiography
ct scan
Tomography X-Ray Computed
laboratory testing
Laboratory Techniques
Sedation
analgesia
Analgesia
procedural sedation
Anesthesia Procedural
ketamine
Ketamine
midazolam
Midazolam
supportive care
Supportive Care
monitoring
Patient Monitoring
vital signs monitoring
Vital Signs
icu transfer
Intensive Care Units
treatment
Therapeutics
manage
Patient Management
management
Patient Management
intervention
Therapeutic Intervention
Therapy
medication
Drug Therapy
procedure
Surgical Procedures Operative
resuscitation
Cardiopulmonary Resuscitation
administer
Drug Administration Routes
dose
Dosage Forms
monitor
Patient Monitoring
Oxygen
fluid
Infusion Intravenous
surgery
Surgical Procedures
antibiotic
Anti-Bacterial Agents
Dopamine
Amiodarone
levophed
Norepinephrine
Epinephrine
Bosmin
Adrenaline
```

dataset/scripts/01_filter_emergency.py (new file, +54 lines)

```python
# scripts/01_filter_emergency.py

import os
import re
import pandas as pd

# Helper: load keywords and report progress
def load_keywords(path):
    print(f"📥 Loading keywords: {path}")
    with open(path, "r", encoding="utf-8") as f:
        kws = [line.strip() for line in f if line.strip()]
    print(f"   Loaded {len(kws)} keywords")
    return kws

# Step 1: Read the source data
print("1️⃣ Reading source data…")
source_path = "../dataset/guidelines_source_filtered.jsonl"
df = pd.read_json(source_path, lines=True)
print(f"   Read {len(df)} records")

# Step 2: Load emergency keywords and match them
print("2️⃣ Loading emergency keywords and matching…")
keywords = load_keywords("../keywords/emergency_keywords.txt")
pattern = r"\b(?:" + "|".join(keywords) + r")\b"  # non-capturing group (?:...)
# Note: keywords are joined unescaped, so entries containing regex
# metacharacters (e.g. parentheses) are interpreted as regex syntax.

# Match the keywords
df["matched"] = (
    df["clean_text"]
    .fillna("")  # turn NaN into ""
    .str.findall(pattern, flags=re.IGNORECASE)
    .apply(lambda lst: "|".join(lst) if lst else "")
)
df["has_emergency"] = df["matched"].str.len() > 0
cnt_em = df["has_emergency"].sum()

# Average number of keyword matches per matched record (mind the escaping)
avg_matches = (
    df[df["has_emergency"]]["matched"]
    .str.count(r"\|")  # the pipe must be escaped here
    .add(1)
    .mean()
)

print(f"   Matched {cnt_em} emergency-related records")
print(f"   Each matched record contains {avg_matches:.2f} keywords on average")

# Step 3: Save the emergency subset
print("3️⃣ Saving emergency subset…")
out_dir = "../dataset/emergency"
os.makedirs(out_dir, exist_ok=True)
subset = df[df["has_emergency"]]
subset.to_json(f"{out_dir}/emergency_subset.jsonl", orient="records", lines=True)
subset.to_csv(f"{out_dir}/emergency_subset.csv", index=False)
print(f"✅ Done! Emergency subset with {len(subset)} records saved to `{out_dir}`")
```

dataset/scripts/02_filter_treatment.py (new file, +37 lines)

```python
# scripts/02_filter_treatment.py

import os
import pandas as pd

# Helper: load keywords
def load_keywords(path):
    print(f"📥 Loading keywords: {path}")
    with open(path, "r") as f:
        kws = [line.strip() for line in f if line.strip()]
    print(f"   Loaded {len(kws)} keywords")
    return kws

# Step 1: Load the emergency subset
print("1️⃣ Reading emergency subset…")
emergency_path = "../dataset/emergency/emergency_subset.jsonl"
df = pd.read_json(emergency_path, lines=True)
print(f"   Read {len(df)} emergency-related records")

# Step 2: Load treatment/management keywords and filter
print("2️⃣ Reading treatment/management keywords and filtering…")
treatment_keywords = load_keywords("../keywords/treatment_keywords.txt")
pattern2 = "|".join(treatment_keywords)
df["has_treatment"] = df["clean_text"].str.contains(pattern2, case=False, na=False)
cnt_treat = df["has_treatment"].sum()
print(f"   Matched {cnt_treat} records containing treatment/management descriptions")

# Step 3: Save the emergency + treatment subset
print("3️⃣ Saving emergency + treatment subset…")
out_dir = "../dataset/emergency_treatment"
os.makedirs(out_dir, exist_ok=True)
subset2 = df[df["has_treatment"]]
subset2.to_json(f"{out_dir}/emergency_treatment_subset.jsonl", orient="records", lines=True)
subset2.to_csv(f"{out_dir}/emergency_treatment_subset.csv", index=False)
print(f"   Saved {len(subset2)} records to `{out_dir}`")

print("✅ Done! Emergency + treatment subset generated.")
```

dataset/scripts/20250722_datesetA_emergency_subset_preprocessing_commit_message.txt (new file, +52 lines)

````text
feat(dataset): Implement emergency subset extraction with enhanced matching

Implement initial data preprocessing pipeline for RAG system evaluation.

Key Changes:
- Enhance keyword matching with findall and non-capturing groups
- Add matched column for tracking all keyword occurrences
- Implement basic statistics calculation
- Prepare for data exploration phase

Technical Details:
1. Keyword Matching Enhancement:
- Use non-capturing groups (?:...) to handle multiple matches
- Implement proper regex pattern with word boundaries
- Handle NaN values explicitly

2. Data Flow:
```
Raw Data (guidelines_source_filtered.jsonl)
│
▼
Keyword Matching (emergency_keywords.txt)
│ ┌─ Pattern: \b(?:keyword1|keyword2)\b
│ └─ Flags: re.IGNORECASE
▼
Multiple Match Extraction
│ ┌─ Use str.findall
│ └─ Join multiple matches with |
▼
Subset Creation
│ ┌─ matched column: "keyword1|keyword2"
│ └─ has_emergency flag
▼
Output Files
├─ emergency_subset.jsonl
└─ emergency_subset.csv
```

3. Next Steps:
- Run data_explorer.py for detailed analysis
- Evaluate subset quality against draft_offlineSubsetbuilding.md
- Consider implementing treatment subset with similar approach

Performance Metrics:
- Capture all keyword matches (not just first occurrence)
- Calculate average keywords per document
- Prepare for co-occurrence analysis

This approach aligns with the RAG system requirements:
1. Maintain semantic relationships (multiple keyword tracking)
2. Enable detailed analysis (matched column)
3. Support future enhancements (treatment subset)
````

dataset/scripts/data_explorer.py (new file, +92 lines)

```python
# /scripts/data_explorer.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # added
import numpy as np  # added
from pathlib import Path
import json  # added

def analyze_subset(file_path, keywords_path, output_dir="analysis"):
    """Analyze the quality and distribution of a subset."""
    print(f"Analyzing: {file_path}")

    # Load the data
    df = pd.read_csv(file_path)
    output_dir = Path(output_dir)

    # 1. Basic statistics (kept from before)
    print(f"Total records: {len(df)}")
    df['text_length'] = df['clean_text'].str.len()  # moved here
    print(f"Average text length: {df['text_length'].mean():.2f}")

    # 2. Keyword analysis (kept from before)
    with open(keywords_path, 'r') as f:
        keywords = [line.strip() for line in f if line.strip()]

    keyword_stats = {}
    for keyword in keywords:
        count = df['clean_text'].str.contains(keyword, case=False).sum()
        keyword_stats[keyword] = int(count)  # cast so the stats serialize to JSON
        print(f"{keyword}: {count} records")

    # 3. Visualization
    output_path = Path(output_dir) / "plots"
    output_path.mkdir(parents=True, exist_ok=True)

    # 3.1 Keyword distribution plot (kept from before)
    plt.figure(figsize=(15, 8))
    plt.bar(keyword_stats.keys(), keyword_stats.values())
    plt.xticks(rotation=45, ha='right')
    plt.title('Keyword match distribution')
    plt.xlabel('Keyword')
    plt.ylabel('Match count')
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_distribution_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 3.2 Text length distribution (new)
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title('Text length distribution')
    plt.xlabel('Text length')
    plt.ylabel('Frequency')
    plt.savefig(output_path / "text_length_dist.png", bbox_inches='tight')
    plt.close()

    # 3.3 Keyword co-occurrence analysis (new)
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    for text in df['clean_text']:
        present_keywords = [k for k in keywords if k.lower() in text.lower()]
        for i, k1 in enumerate(present_keywords):
            for j, k2 in enumerate(present_keywords):
                if i != j:
                    cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1

    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix,
                xticklabels=keywords,
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title('Keyword co-occurrence heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_cooccurrence_emergency_subset.png", bbox_inches='tight')
    plt.close()

    # 4. Save the statistics (extended from before)
    stats_path = Path(output_dir) / "stats"
    stats_path.mkdir(parents=True, exist_ok=True)

    stats = {
        'basic_stats': {
            'total_records': len(df),
            'avg_text_length': float(df['text_length'].mean()),
            'text_length_quantiles': {k: float(v) for k, v in df['text_length'].describe().items()}
        },
        'keyword_stats': keyword_stats
    }

    # TODO: change the name of the file to the name of the subset
    with open(stats_path / "analysis_stats_emergency_subset.json", 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
```
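
data_explorer.py only defines analyze_subset and never calls it; a hypothetical invocation, assuming the module is importable and reusing the relative paths from the scripts above, might look like this:

```python
# Hypothetical usage sketch: assumes data_explorer.py is importable and that the
# emergency subset has already been generated at these (assumed) paths.
from data_explorer import analyze_subset

analyze_subset(
    file_path="../dataset/emergency/emergency_subset.csv",
    keywords_path="../keywords/emergency_keywords.txt",
    output_dir="analysis",
)
```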
|