ciyidogan committed
Commit 28d6858 · verified · 1 Parent(s): 48bf505

Update tokenize_and_upload.py

Files changed (1)
  1. tokenize_and_upload.py +120 -0
tokenize_and_upload.py CHANGED
@@ -0,0 +1,120 @@
+ import os
+ import sys
+ import pandas as pd
+ from datasets import Dataset
+ from transformers import AutoTokenizer, AutoConfig
+ from datetime import datetime
+ from huggingface_hub import HfApi, create_repo, upload_folder, hf_hub_download
+ import traceback
+ import threading
+ import uvicorn
+ import time
+ from fastapi import FastAPI
+ from fastapi.responses import JSONResponse
+
+ # === Constants ===
+ MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ SOURCE_DATASET_ID = "UcsTurkey/turkish-general-culture-chunks"
+ TRAIN_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-tokenized"
+ BUFFER_SIZE = 5  # parquet files to accumulate before each upload
+ START_CHUNK_NUMBER = 0  # index of the first source CSV to process
+ PROCESS_CHUNK_COUNT = 10  # number of source CSVs to process in this run
+
+ CHUNK_FOLDER = "/data/chunks"
+ PARQUET_FOLDER = "/data/tokenized_chunks"
+ CACHE_DIR = "/data/.hf_cache"
+
+ os.makedirs(CHUNK_FOLDER, exist_ok=True)
+ os.makedirs(PARQUET_FOLDER, exist_ok=True)
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
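+ # A Hugging Face Space expects an HTTP server on port 7860; without one the
+ # container is treated as unhealthy, so a tiny endpoint runs in the background
+ # while the batch job below does the real work.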
+ # ✅ Health-check server
+ app = FastAPI()
+
+ @app.get("/")
+ def health():
+     return JSONResponse(content={"status": "ok"})
+
+ def run_health_server():
+     uvicorn.run(app, host="0.0.0.0", port=7860)
+
+ threading.Thread(target=run_health_server, daemon=True).start()
+
+ # 🕒 Timestamped log helper
+ def log(message):
+     timestamp = datetime.now().strftime("%H:%M:%S")
+     print(f"[{timestamp}] {message}")
+     sys.stdout.flush()  # flush immediately so logs appear in the Space console
+
+ # === Tokenizer ===
+ os.environ["HF_HOME"] = CACHE_DIR
+ log(f"🔁 Loading tokenizer: {MODEL_NAME}")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, cache_dir=CACHE_DIR)
+ if tokenizer.pad_token is None:
+     log("ℹ️ pad_token is not defined; assigning eos_token.")
+     tokenizer.pad_token = tokenizer.eos_token
+
+ config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
+ MAX_LEN = getattr(config, "max_position_embeddings", 2048)  # fall back to 2048 if unset
+
+ # === Hugging Face API ===
+ api = HfApi()
+ files = api.list_repo_files(repo_id=SOURCE_DATASET_ID, repo_type="dataset", token=HF_TOKEN)
+ csv_files = sorted([f for f in files if f.endswith(".csv")])
+ selected_files = csv_files[START_CHUNK_NUMBER:START_CHUNK_NUMBER + PROCESS_CHUNK_COUNT]
+
+ buffer_counter = 0
+
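+ # Each row becomes a "SORU:/CEVAP:" (question/answer) prompt, padded to MAX_LEN;
+ # padded positions get label -100 so the cross-entropy loss ignores them.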
+ def tokenize(example):
+     prompt = f"SORU: {example['instruction']}\nCEVAP: {example['output']}"
+     tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
+     tokenized["labels"] = [
+         -100 if token_id == tokenizer.pad_token_id else token_id
+         for token_id in tokenized["input_ids"]
+     ]
+     return tokenized
+
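+ # Flush the local parquet buffer to the target dataset in a single commit,
+ # then clear the folder and reset the counter.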
+ def upload_if_ready():
+     global buffer_counter
+     if os.listdir(PARQUET_FOLDER):
+         log(f"⬆️ Uploading buffered files to Hugging Face: {TRAIN_TARGET_DATASET_ID}")
+         create_repo(TRAIN_TARGET_DATASET_ID, repo_type="dataset", token=HF_TOKEN, exist_ok=True)
+         upload_folder(repo_id=TRAIN_TARGET_DATASET_ID, folder_path=PARQUET_FOLDER, repo_type="dataset", token=HF_TOKEN)
+         log("🧹 Cleaning up the folder after upload...")
+         for f in os.listdir(PARQUET_FOLDER):
+             os.remove(os.path.join(PARQUET_FOLDER, f))
+         buffer_counter = 0
+
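+ # Main loop: download each selected CSV, drop empty rows, tokenize, and
+ # buffer the parquet output until BUFFER_SIZE files are ready to upload.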
+ for idx, filename in enumerate(selected_files):
+     log(f"\n📄 Processing {idx+1}/{len(selected_files)} → {filename}...")
+     try:
+         local_path = os.path.join(CHUNK_FOLDER, os.path.basename(filename))
+         hf_hub_download(
+             repo_id=SOURCE_DATASET_ID,
+             filename=filename,
+             local_dir=CHUNK_FOLDER,
+             token=HF_TOKEN,
+             repo_type="dataset"
+         )
+         df = pd.read_csv(local_path).dropna()
+         df = df[df["question"].str.strip().astype(bool) & df["answer"].str.strip().astype(bool)]
+         df = df.rename(columns={"question": "instruction", "answer": "output"})
+         log(f"✅ Valid row count: {len(df)}")
+
+         dataset = Dataset.from_pandas(df[["instruction", "output"]])
+         tokenized_dataset = dataset.map(tokenize)
+         parquet_path = os.path.join(PARQUET_FOLDER, os.path.basename(filename).replace(".csv", ".parquet"))
+         tokenized_dataset.to_parquet(parquet_path, compression="snappy")
+         log(f"🎯 Tokenized parquet saved: {parquet_path}")
+         buffer_counter += 1
+         if buffer_counter >= BUFFER_SIZE:
+             upload_if_ready()
+     except Exception as e:
+         log(f"❌ Error while processing {filename} → {e}")
+         traceback.print_exc()
+         continue
+
+ # Upload whatever is left in the buffer after the loop finishes.
+ upload_if_ready()
+
+ log("✅ All work completed. Service is in standby mode...")
+ while True:
+     time.sleep(60)  # keep the process (and its health endpoint) alive