File size: 4,396 Bytes
28d6858 a09dc5b 28d6858 f69e27e 28d6858 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoConfig
from datetime import datetime
from huggingface_hub import HfApi, create_repo, upload_folder, hf_hub_download
import traceback
import threading
import uvicorn
import time
from fastapi import FastAPI
from fastapi.responses import JSONResponse
# === Constants ===
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hub token taken from the environment (None when unset)
SOURCE_DATASET_ID = "UcsTurkey/turkish-general-culture-chunks"
TRAIN_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-tokenized"
BUFFER_SIZE = 5  # number of parquet files written before an upload is triggered
START_CHUNK_NUMBER = 0  # index of the first CSV chunk to process
PROCESS_CHUNK_COUNT = 776  # how many CSV chunks to process from that index
CHUNK_FOLDER = "/data/chunks"
PARQUET_FOLDER = "/data/tokenized_chunks"
CACHE_DIR = "/data/.hf_cache"

# Create every working directory up front; already-existing ones are fine.
for _folder in (CHUNK_FOLDER, PARQUET_FOLDER, CACHE_DIR):
    os.makedirs(_folder, exist_ok=True)
# ✅ Health-check server
app = FastAPI()


@app.get("/")
def health():
    """Answer GET / with a fixed JSON body reporting that the process is up."""
    payload = {"status": "ok"}
    return JSONResponse(content=payload)


def run_health_server():
    """Run ``app`` under uvicorn on 0.0.0.0:7860; blocks the calling thread."""
    uvicorn.run(app, host="0.0.0.0", port=7860)


# Started as a daemon thread so the server runs next to the main script
# and does not keep the interpreter alive by itself.
_health_thread = threading.Thread(target=run_health_server, daemon=True)
_health_thread.start()
# 🕒 Timestamped log helper
def log(message):
    """Print *message* to stdout prefixed with the current ``[HH:MM:SS]`` time.

    The line is flushed immediately so it appears promptly even when stdout
    is block-buffered.

    Args:
        message: Any object; it is interpolated into the line with ``str()``
            semantics via an f-string.
    """
    timestamp = datetime.now().strftime("%H:%M:%S")
    # flush=True replaces the former ``os.sys.stdout.flush()``, which only
    # worked because the ``os`` module happens to import ``sys`` internally.
    print(f"[{timestamp}] {message}", flush=True)
# === Tokenizer ===
# NOTE(review): the Hugging Face libraries are imported at the top of the
# file, before this assignment runs. They appear to read HF_HOME at import
# time, so this line may not redirect their default cache -- confirm, and
# consider setting it before the imports. The explicit ``cache_dir``
# arguments below do not depend on it.
os.environ["HF_HOME"] = CACHE_DIR

log(f"🔁 Tokenizer yükleniyor: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=False,
    cache_dir=CACHE_DIR,
)

# Padding needs a pad token; fall back to the EOS token when none is defined.
if tokenizer.pad_token is None:
    log("ℹ️ pad_token tanımlı değil, eos_token atanıyor.")
    tokenizer.pad_token = tokenizer.eos_token

# Sequence length used for truncation/padding: the model's declared
# max_position_embeddings, or 2048 when the config does not define it.
# NOTE(review): every example is padded to this length in tokenize();
# verify the resulting value is the intended training length.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
)
MAX_LEN = getattr(config, "max_position_embeddings", 2048)
# === Hugging Face API ===
api = HfApi()
files = api.list_repo_files(
    repo_id=SOURCE_DATASET_ID,
    repo_type="dataset",
    token=HF_TOKEN,
)

# Keep only the CSV files, ordered lexicographically by repo path, then take
# the configured window of chunks.
csv_files = sorted(name for name in files if name.endswith(".csv"))
_window_end = START_CHUNK_NUMBER + PROCESS_CHUNK_COUNT
selected_files = csv_files[START_CHUNK_NUMBER:_window_end]

# Number of parquet files written since the last upload.
buffer_counter = 0
def tokenize(example):
    """Tokenize one instruction/output pair into a fixed-length training sample.

    The pair is rendered as ``"SORU: <instruction>\\nCEVAP: <output>"``,
    truncated and padded to ``MAX_LEN`` tokens, and given a ``labels`` list
    that mirrors ``input_ids`` with padding positions replaced by ``-100``.

    Args:
        example: Mapping with ``"instruction"`` and ``"output"`` entries.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    prompt = f"SORU: {example['instruction']}\nCEVAP: {example['output']}"
    tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
    # Mask by attention_mask rather than by comparing with pad_token_id:
    # pad_token may be aliased to eos_token, in which case an id comparison
    # would also hide genuine EOS tokens from the loss.
    tokenized["labels"] = [
        token_id if attended else -100
        for token_id, attended in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized
def upload_if_ready():
    """Upload the contents of PARQUET_FOLDER to the target dataset, then empty it.

    Does nothing when the folder is empty. Otherwise the target dataset repo
    is created if missing, the whole folder is uploaded, every uploaded entry
    is deleted locally and ``buffer_counter`` is reset to 0.

    Raises:
        Any exception from the Hub calls or the file removal propagates. When
        the upload fails, the local files and ``buffer_counter`` are left
        untouched.
    """
    global buffer_counter
    import shutil  # local import: only needed for directory cleanup

    pending = os.listdir(PARQUET_FOLDER)
    if not pending:
        return

    log(f"⬆️ BUFFER doldu. Hugging Face'e yükleniyor: {TRAIN_TARGET_DATASET_ID}")
    create_repo(TRAIN_TARGET_DATASET_ID, repo_type="dataset", token=HF_TOKEN, exist_ok=True)
    upload_folder(
        repo_id=TRAIN_TARGET_DATASET_ID,
        folder_path=PARQUET_FOLDER,
        repo_type="dataset",
        token=HF_TOKEN,
    )

    log("🧹 Upload sonrası klasör temizleniyor...")
    for entry in pending:
        entry_path = os.path.join(PARQUET_FOLDER, entry)
        # Entries can be directories when a source filename contains a
        # sub-path; os.remove() alone would fail on those.
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)
    buffer_counter = 0
# Main pipeline: download each selected CSV chunk, clean it, tokenize it and
# write it as parquet; upload every BUFFER_SIZE files. A failure on one chunk
# is logged and the loop moves on to the next chunk.
for idx, filename in enumerate(selected_files, start=1):
    log(f"\n📄 {idx}/{len(selected_files)} → {filename} işleniyor...")
    try:
        # Use the path returned by hf_hub_download instead of rebuilding it
        # from the basename: with local_dir the repo-relative path is kept,
        # so files that live in a sub-folder would otherwise not be found.
        local_path = hf_hub_download(
            repo_id=SOURCE_DATASET_ID,
            filename=filename,
            local_dir=CHUNK_FOLDER,
            token=HF_TOKEN,
            repo_type="dataset",
        )

        # Drop rows with missing values, then rows whose question or answer
        # is blank after stripping whitespace.
        df = pd.read_csv(local_path).dropna()
        has_question = df["question"].str.strip().astype(bool)
        has_answer = df["answer"].str.strip().astype(bool)
        df = df[has_question & has_answer]
        df = df.rename(columns={"question": "instruction", "answer": "output"})
        log(f"✅ Geçerli satır sayısı: {len(df)}")

        if df.empty:
            # Nothing to tokenize; an empty chunk would likely yield a parquet
            # file lacking the tokenized columns, so it is skipped.
            log(f"⚠️ Geçerli satır yok, atlanıyor: {filename}")
            continue

        # preserve_index=False: after filtering, the pandas index is no longer
        # a plain range and would otherwise be stored as an extra
        # "__index_level_0__" column in some chunks but not in others.
        dataset = Dataset.from_pandas(df[["instruction", "output"]], preserve_index=False)
        tokenized_dataset = dataset.map(tokenize)

        parquet_path = os.path.join(PARQUET_FOLDER, filename.replace(".csv", ".parquet"))
        # filename may contain a sub-path; make sure its folder exists.
        os.makedirs(os.path.dirname(parquet_path), exist_ok=True)
        tokenized_dataset.to_parquet(parquet_path, compression="snappy")
        log(f"🎯 Tokenized parquet kaydedildi: {parquet_path}")

        buffer_counter += 1
        if buffer_counter >= BUFFER_SIZE:
            upload_if_ready()
    except Exception as e:
        # Top-level boundary for one chunk: report and move on to the next.
        log(f"❌ Hata oluştu: {filename} → {e}")
        traceback.print_exc()

# Flush whatever is left in the buffer after the last chunk.
upload_if_ready()
log("✅ Tüm işlemler tamamlandı. Servis bekleme modunda...")

# Keep the main thread alive so the daemon health-check thread keeps serving.
while True:
    time.sleep(60)
|