import sys, os, zipfile, time, traceback, threading, uvicorn

from fastapi import FastAPI
from fastapi.responses import JSONResponse
from datetime import datetime
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
import torch
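
# Configuration: which pre-tokenized parquet chunks to train on, the base
# model, the source/target Hub repos, and the training hyperparameters.
# MAX_LENGTH is kept for reference only; the chunks arrive already tokenized,
# so it is never applied below.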
START_NUMBER = 0
END_NUMBER = 9
MODEL_NAME = "TURKCELL/Turkcell-LLM-7b-v1"
TOKENIZED_DATASET_ID = "UcsTurkey/turkish-train-tokenized"
ZIP_UPLOAD_REPO = "UcsTurkey/trained-zips"
HF_TOKEN = os.environ.get("HF_TOKEN")
BATCH_SIZE = 1
EPOCHS = 2
MAX_LENGTH = 2048
OUTPUT_DIR = "/data/output"
ZIP_FOLDER = "/data/zip_temp"

zip_name = f"trained_model_{START_NUMBER:03d}_{END_NUMBER:03d}.zip"
ZIP_PATH = os.path.join(ZIP_FOLDER, zip_name)
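
# Lightweight FastAPI health endpoint, run from a daemon thread because
# training blocks the main thread. Port 7860 is the one Hugging Face Spaces
# probes, which suggests this script is meant to run as a Space.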
app = FastAPI()


@app.get("/")
def health():
    return JSONResponse(content={"status": "ok"})


def run_health_server():
    uvicorn.run(app, host="0.0.0.0", port=7860)


threading.Thread(target=run_health_server, daemon=True).start()
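
# Timestamped logger; flushing stdout on every call makes messages show up
# immediately in container logs.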
def log(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}")
    sys.stdout.flush()
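
# Load the tokenizer and ensure a pad token exists; causal LMs often ship
# without one, and reusing EOS for padding is the standard fallback.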
log("🛠️ Preparing environment...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
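
# Load the base model in bfloat16 and mirror the pad token id into its config.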
log("🧠 Downloading model...")
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)
base_model.config.pad_token_id = tokenizer.pad_token_id
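
# Wrap the base model with a LoRA adapter. No target_modules are specified,
# so PEFT falls back to the default target modules it has registered for the
# base model's architecture.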
log("🎯 Applying LoRA adapter...")
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    fan_in_fan_out=False
)
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()
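
# List the dataset repo and select the contiguous slice of chunk files
# [START_NUMBER, END_NUMBER], inclusive.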
log("📦 Listing parquet files...")
api = HfApi()
files = api.list_repo_files(repo_id=TOKENIZED_DATASET_ID, repo_type="dataset", token=HF_TOKEN)
selected_files = sorted([f for f in files if f.startswith("chunk_") and f.endswith(".parquet")])[START_NUMBER:END_NUMBER + 1]

if not selected_files:
    log("⚠️ No parquet files found. Training aborted.")
    sys.exit(0)
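
# One TrainingArguments instance shared by every chunk: bf16 to match the
# model dtype, per-epoch checkpoints capped at two, and no external reporters.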
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2e-4,
    disable_tqdm=True,
    logging_strategy="steps",
    logging_steps=10,
    report_to=[],
    bf16=True,
    fp16=False
)
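
# Causal-LM collator: with mlm=False the labels are the input ids themselves,
# and the model applies the causal shift internally.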
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
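
# Train sequentially over the selected chunks; a failure in one chunk is
# logged and skipped so the remaining chunks still get trained.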
for file in selected_files:
    try:
        log(f"\n📄 Loading: {file}")
        dataset = load_dataset(
            path=TOKENIZED_DATASET_ID,
            data_files={"train": file},
            split="train",
            token=HF_TOKEN
        )
        log(f"🔍 {len(dataset)} samples")
        if len(dataset) == 0:
            continue
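
        # Sanity check: decode the first pre-tokenized row to confirm the
        # chunk contains the expected prompts.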
        first_row = dataset[0]
        decoded_prompt = tokenizer.decode(first_row["input_ids"], skip_special_tokens=True)
        log(f"📌 Sample prompt: {decoded_prompt[:200]}...")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset,
            data_collator=collator
        )
        log("🚀 Training starting...")
        trainer.train()
        log("✅ Training done.")
    except Exception as e:
        log(f"❌ Error: {file} → {e}")
        traceback.print_exc()
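
# Persist the result. save_pretrained on a PeftModel writes only the LoRA
# adapter weights and config (not the merged base model); everything is
# zipped under an "output/" prefix inside the archive.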
log("📦 Zipping model...")
try:
    tmp_dir = os.path.join(ZIP_FOLDER, "temp_save")
    os.makedirs(tmp_dir, exist_ok=True)
    model.save_pretrained(tmp_dir)
    tokenizer.save_pretrained(tmp_dir)

    with zipfile.ZipFile(ZIP_PATH, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, _, filenames in os.walk(tmp_dir):
            for filename in filenames:
                filepath = os.path.join(root, filename)
                arcname = os.path.relpath(filepath, tmp_dir)
                zipf.write(filepath, arcname=os.path.join("output", arcname))
    log(f"✅ Zip created: {ZIP_PATH}")
except Exception as e:
    log(f"❌ Zip error: {e}")
    traceback.print_exc()
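
# Upload the archive as a single file into the target Hub repo.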
try:
    log("☁️ Uploading to Hugging Face...")
    api.upload_file(
        path_or_fileobj=ZIP_PATH,
        path_in_repo=zip_name,
        repo_id=ZIP_UPLOAD_REPO,
        repo_type="model",
        token=HF_TOKEN
    )
    log("✅ Upload done.")
except Exception as e:
    log(f"❌ Upload error: {e}")
    traceback.print_exc()
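
# Keep the main thread alive so the daemonized health server keeps responding
# after training finishes.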
log("⏸️ Training complete. Service is idling...")
while True:
    time.sleep(60)