import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoConfig
from datetime import datetime
from huggingface_hub import HfApi, create_repo, upload_folder, hf_hub_download
import traceback
import threading
import uvicorn
import time
from fastapi import FastAPI
from fastapi.responses import JSONResponse
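# Overview: stream CSV chunk files from a source Hugging Face dataset, tokenize
# each question/answer pair for Mistral-7B-Instruct, and upload the resulting
# Parquet shards to a target dataset repo in buffered batches.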

# === Constants ===
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
HF_TOKEN = os.getenv("HF_TOKEN")
SOURCE_DATASET_ID = "UcsTurkey/turkish-general-culture-chunks"
TRAIN_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-tokenized"
BUFFER_SIZE = 5
START_CHUNK_NUMBER = 0
PROCESS_CHUNK_COUNT = 776

CHUNK_FOLDER = "/data/chunks"
PARQUET_FOLDER = "/data/tokenized_chunks"
CACHE_DIR = "/data/.hf_cache"

os.makedirs(CHUNK_FOLDER, exist_ok=True)
os.makedirs(PARQUET_FOLDER, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)

# ✅ Health check server
app = FastAPI()

@app.get("/")
def health():
    return JSONResponse(content={"status": "ok"})

def run_health_server():
    uvicorn.run(app, host="0.0.0.0", port=7860)

threading.Thread(target=run_health_server, daemon=True).start()
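# Note: 7860 is the default application port for Hugging Face Spaces, so this
# tiny FastAPI app presumably keeps the hosting container reporting healthy
# while the long-running tokenization loop below does its work.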

# 🕒 Timestamped log helper
def log(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{timestamp}] {message}", flush=True)

# === Tokenizer ===
os.environ["HF_HOME"] = CACHE_DIR
log(f"🔁 Tokenizer yükleniyor: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False, cache_dir=CACHE_DIR)
if tokenizer.pad_token is None:
    log("ℹ️ pad_token tanımlı değil, eos_token atanıyor.")
    tokenizer.pad_token = tokenizer.eos_token

config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
MAX_LEN = getattr(config, "max_position_embeddings", 2048)
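# Note: for Mistral-7B-Instruct-v0.2, max_position_embeddings is 32768, so every
# example below is padded to that full length; the 2048 default only applies if
# the config lacks the attribute.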

# === Hugging Face API ===
api = HfApi()
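# List the CSV chunk files in the source dataset and select the window defined
# by START_CHUNK_NUMBER and PROCESS_CHUNK_COUNT.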
files = api.list_repo_files(repo_id=SOURCE_DATASET_ID, repo_type="dataset", token=HF_TOKEN)
csv_files = sorted([f for f in files if f.endswith(".csv")])
selected_files = csv_files[START_CHUNK_NUMBER:START_CHUNK_NUMBER + PROCESS_CHUNK_COUNT]

buffer_counter = 0

def tokenize(example):
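    # Build the dataset's Turkish SORU/CEVAP (question/answer) prompt for one
    # record and tokenize it to a fixed length so every example has the same shape.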
    prompt = f"SORU: {example['instruction']}\nCEVAP: {example['output']}"
    tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
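    # Copy input_ids into labels, masking padding positions with -100 so they
    # are ignored by the cross-entropy loss during fine-tuning.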
    tokenized["labels"] = [
        -100 if token_id == tokenizer.pad_token_id else token_id for token_id in tokenized["input_ids"]
    ]
    return tokenized

def upload_if_ready():
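    # Upload every buffered Parquet shard to the target dataset repo, then clear
    # the local folder and reset the buffer counter.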
    global buffer_counter
    if os.listdir(PARQUET_FOLDER):
        log(f"⬆️ BUFFER doldu. Hugging Face'e yükleniyor: {TRAIN_TARGET_DATASET_ID}")
        create_repo(TRAIN_TARGET_DATASET_ID, repo_type="dataset", token=HF_TOKEN, exist_ok=True)
        upload_folder(repo_id=TRAIN_TARGET_DATASET_ID, folder_path=PARQUET_FOLDER, repo_type="dataset", token=HF_TOKEN)
        log("🧹 Upload sonrası klasör temizleniyor...")
        for f in os.listdir(PARQUET_FOLDER):
            os.remove(os.path.join(PARQUET_FOLDER, f))
        buffer_counter = 0

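# Main loop: download each selected CSV chunk, tokenize it, write a local
# Parquet shard, and upload whenever BUFFER_SIZE shards have accumulated.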
for idx, filename in enumerate(selected_files):
    log(f"\n📄 {idx+1}/{len(selected_files)}{filename} işleniyor...")
    try:
        local_path = os.path.join(CHUNK_FOLDER, os.path.basename(filename))
        hf_hub_download(
            repo_id=SOURCE_DATASET_ID,
            filename=filename,
            local_dir=CHUNK_FOLDER,
            token=HF_TOKEN,
            repo_type="dataset"
        )
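        # Drop rows with missing or blank question/answer values, then rename the
        # columns to the instruction/output names expected by tokenize().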
        df = pd.read_csv(local_path).dropna()
        df = df[df["question"].str.strip().astype(bool) & df["answer"].str.strip().astype(bool)]
        df = df.rename(columns={"question": "instruction", "answer": "output"})
        log(f"✅ Geçerli satır sayısı: {len(df)}")

        dataset = Dataset.from_pandas(df[["instruction", "output"]])
        tokenized_dataset = dataset.map(tokenize)
        parquet_path = os.path.join(PARQUET_FOLDER, filename.replace(".csv", ".parquet"))
        tokenized_dataset.to_parquet(parquet_path, compression="snappy")
        log(f"🎯 Tokenized parquet kaydedildi: {parquet_path}")
        buffer_counter += 1
        if buffer_counter >= BUFFER_SIZE:
            upload_if_ready()
    except Exception as e:
        log(f"❌ Hata oluştu: {filename}{e}")
        traceback.print_exc()
        continue

upload_if_ready()

log("✅ Tüm işlemler tamamlandı. Servis bekleme modunda...")
while True:
    time.sleep(60)