Update tokenize_and_upload_mistral.py
tokenize_and_upload_mistral.py  CHANGED  (+43 −23)

This commit generalizes the pipeline: instead of always writing tokenized training parquet to a single folder and repo, the script now has two independently toggleable outputs, a tokenized training set and a raw instruction/output RAG set, each with its own folder, buffer counter, and target dataset repo. The committed defaults enable only the RAG path.
@@ -16,16 +16,22 @@ MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
 HF_TOKEN = os.getenv("HF_TOKEN")
 SOURCE_DATASET_ID = "UcsTurkey/turkish-general-culture-chunks"
 TRAIN_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-tokenized"
+RAG_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-rag"
 BUFFER_SIZE = 5
-START_CHUNK_NUMBER = …
-PROCESS_CHUNK_COUNT = …
+START_CHUNK_NUMBER = 0
+PROCESS_CHUNK_COUNT = 776
+
+GENERATE_TRAIN_DATA = False
+GENERATE_RAG_DATA = True
 
 CHUNK_FOLDER = "/data/chunks"
-PARQUET_FOLDER = …
+TRAIN_FOLDER = "/data/tokenized_chunks"
+RAG_FOLDER = "/data/rag_chunks"
 CACHE_DIR = "/data/.hf_cache"
 
 os.makedirs(CHUNK_FOLDER, exist_ok=True)
-os.makedirs(PARQUET_FOLDER, exist_ok=True)
+os.makedirs(TRAIN_FOLDER, exist_ok=True)
+os.makedirs(RAG_FOLDER, exist_ok=True)
 os.makedirs(CACHE_DIR, exist_ok=True)
 
 # ✅ Health check sunucusu
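The trailing context comment ("Health check sunucusu", i.e. health-check server) refers to code outside this hunk. On Hugging Face Spaces, long-running jobs like this one commonly keep a tiny HTTP server alive in a daemon thread so the container passes liveness probes while the main loop works. A sketch under that assumption, not the commit's actual server code:

import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

class HealthHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # Any 200 response satisfies the platform's liveness probe.
        self.send_response(200)
        self.end_headers()
        self.wfile.write(b"ok")

# Port 7860 is the conventional Spaces port; an assumption here.
threading.Thread(
    target=lambda: HTTPServer(("0.0.0.0", 7860), HealthHandler).serve_forever(),
    daemon=True,
).start()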
@@ -63,7 +69,8 @@ files = api.list_repo_files(repo_id=SOURCE_DATASET_ID, repo_type="dataset", token=HF_TOKEN)
 csv_files = sorted([f for f in files if f.endswith(".csv")])
 selected_files = csv_files[START_CHUNK_NUMBER:START_CHUNK_NUMBER + PROCESS_CHUNK_COUNT]
 
-…
+buffer_counter_train = 0
+buffer_counter_rag = 0
 
 def tokenize(example):
     prompt = f"SORU: {example['instruction']}\nCEVAP: {example['output']}"
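The substantive change here replaces the single upload buffer with one counter per output path. The hunk then ends just inside `tokenize` (the prompt reads "QUESTION: …\nANSWER: …"): the diff shows only the prompt format, a closing `]`, and the final `return tokenized`. A sketch of a causal-LM tokenization step consistent with those fragments; the padding strategy, `max_length=512`, and the `-100` label masking are assumptions, not the commit's code:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token  # Mistral ships no pad token

def tokenize(example):
    prompt = f"SORU: {example['instruction']}\nCEVAP: {example['output']}"
    # Assumed body: pad/truncate to a fixed length, then mask pad
    # positions out of the loss with -100, as is typical for causal-LM
    # fine-tuning.
    tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = [
        -100 if tok == tokenizer.pad_token_id else tok
        for tok in tokenized["input_ids"]
    ]
    return tokenized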
@@ -73,16 +80,16 @@ def tokenize(example):
     ]
     return tokenized
 
-def upload_if_ready():
-    …
-    …
-    …
-    …
-        upload_folder(repo_id=TRAIN_TARGET_DATASET_ID, folder_path=PARQUET_FOLDER, repo_type="dataset", token=HF_TOKEN)
+def upload_if_ready(folder_path, target_repo):
+    if os.listdir(folder_path):
+        log(f"⬆️ BUFFER doldu. Hugging Face'e yükleniyor: {target_repo}")
+        create_repo(target_repo, repo_type="dataset", token=HF_TOKEN, exist_ok=True)
+        upload_folder(repo_id=target_repo, folder_path=folder_path, repo_type="dataset", token=HF_TOKEN)
         log("🧹 Upload sonrası klasör temizleniyor...")
-        for f in os.listdir(PARQUET_FOLDER):
-            os.remove(os.path.join(PARQUET_FOLDER, f))
-        …
+        for f in os.listdir(folder_path):
+            os.remove(os.path.join(folder_path, f))
+        return 0
+    return 0
 
 for idx, filename in enumerate(selected_files):
     log(f"\n📄 {idx+1}/{len(selected_files)} → {filename} işleniyor...")
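Both exits of the new `upload_if_ready` return 0, so a call site can unconditionally assign the result back to its buffer counter; when the folder is empty the upload is skipped entirely, and `exist_ok=True` makes the `create_repo` call idempotent across flushes (the logs read "BUFFER full. Uploading to Hugging Face: …" and "Cleaning folder after upload..."). A self-contained sketch of that counter/flush contract with the Hub upload stubbed out; every name here is illustrative:

import os
import tempfile

BUFFER_SIZE = 5

def flush(folder_path):
    # Stand-in for the upload: empty the folder, hand back a reset counter.
    for f in os.listdir(folder_path):
        os.remove(os.path.join(folder_path, f))
    return 0

folder = tempfile.mkdtemp()
counter = 0
for i in range(12):
    open(os.path.join(folder, f"chunk_{i}.parquet"), "wb").close()
    counter += 1
    if counter >= BUFFER_SIZE:
        counter = flush(folder)  # fires after files 5 and 10; the last 2 stay buffered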
@@ -100,20 +107,33 @@ for idx, filename in enumerate(selected_files):
         df = df.rename(columns={"question": "instruction", "answer": "output"})
         log(f"✅ Geçerli satır sayısı: {len(df)}")
 
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        if GENERATE_RAG_DATA:
+            rag_dataset = Dataset.from_pandas(df[["instruction", "output"]])
+            rag_path = os.path.join(RAG_FOLDER, filename.replace(".csv", ".parquet"))
+            rag_dataset.to_parquet(rag_path, compression="brotli")
+            log(f"📦 RAG parquet kaydedildi: {rag_path}")
+            buffer_counter_rag += 1
+            if buffer_counter_rag >= BUFFER_SIZE:
+                buffer_counter_rag = upload_if_ready(RAG_FOLDER, RAG_TARGET_DATASET_ID)
+
+        if GENERATE_TRAIN_DATA:
+            train_dataset = Dataset.from_pandas(df[["instruction", "output"]])
+            tokenized_dataset = train_dataset.map(tokenize)
+            parquet_path = os.path.join(TRAIN_FOLDER, filename.replace(".csv", ".parquet"))
+            tokenized_dataset.to_parquet(parquet_path, compression="snappy")
+            log(f"🎯 Tokenized parquet kaydedildi: {parquet_path}")
+            buffer_counter_train += 1
+            if buffer_counter_train >= BUFFER_SIZE:
+                buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
     except Exception as e:
         log(f"❌ Hata oluştu: {filename} → {e}")
         traceback.print_exc()
         continue
 
-…
+if GENERATE_TRAIN_DATA:
+    buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
+if GENERATE_RAG_DATA:
+    buffer_counter_rag = upload_if_ready(RAG_FOLDER, RAG_TARGET_DATASET_ID)
 
 log("✅ Tüm işlemler tamamlandı. Servis bekleme modunda...")
 while True:
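One detail worth noticing in this hunk: the two outputs pick different parquet codecs, brotli for the RAG shards (denser files at some write-speed cost) and snappy for the tokenized training shards (faster to write and re-read). The kwarg is forwarded by `datasets` to the pyarrow parquet writer; a minimal sketch with illustrative data:

from datasets import Dataset

ds = Dataset.from_dict({"instruction": ["example question"], "output": ["example answer"]})
# compression is passed through to pyarrow's parquet writer
ds.to_parquet("/tmp/rag_demo.parquet", compression="brotli")    # denser archive
ds.to_parquet("/tmp/train_demo.parquet", compression="snappy")  # faster I/O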
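After the loop, each enabled path gets one final `upload_if_ready` call so a partially filled buffer still ships; the script then logs "All operations complete. Service in standby mode..." and parks in the `while True` loop that keeps the container alive. Once the uploads land, both targets are ordinary parquet dataset repos and can be read back with the `datasets` library. A usage sketch; the split name and the repos actually containing data are assumptions:

from datasets import load_dataset

# Parquet shards uploaded to a dataset repo load directly by repo id.
rag = load_dataset("UcsTurkey/turkish-general-culture-rag", split="train")
print(rag[0]["instruction"])
print(rag[0]["output"])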