ciyidogan committed on
Commit
1d5c35d
·
verified ·
1 Parent(s): c00e5c0

Update tokenize_and_upload_mistral.py

Browse files
Files changed (1) hide show
  1. tokenize_and_upload_mistral.py +43 -23
tokenize_and_upload_mistral.py CHANGED
@@ -16,16 +16,22 @@ MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
16
  HF_TOKEN = os.getenv("HF_TOKEN")
17
  SOURCE_DATASET_ID = "UcsTurkey/turkish-general-culture-chunks"
18
  TRAIN_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-tokenized"
 
19
  BUFFER_SIZE = 5
20
- START_CHUNK_NUMBER = 776
21
- PROCESS_CHUNK_COUNT = 2
 
 
 
22
 
23
  CHUNK_FOLDER = "/data/chunks"
24
- PARQUET_FOLDER = "/data/tokenized_chunks"
 
25
  CACHE_DIR = "/data/.hf_cache"
26
 
27
  os.makedirs(CHUNK_FOLDER, exist_ok=True)
28
- os.makedirs(PARQUET_FOLDER, exist_ok=True)
 
29
  os.makedirs(CACHE_DIR, exist_ok=True)
30
 
31
  # ✅ Health check sunucusu
@@ -63,7 +69,8 @@ files = api.list_repo_files(repo_id=SOURCE_DATASET_ID, repo_type="dataset", toke
63
  csv_files = sorted([f for f in files if f.endswith(".csv")])
64
  selected_files = csv_files[START_CHUNK_NUMBER:START_CHUNK_NUMBER + PROCESS_CHUNK_COUNT]
65
 
66
- buffer_counter = 0
 
67
 
68
  def tokenize(example):
69
  prompt = f"SORU: {example['instruction']}\nCEVAP: {example['output']}"
@@ -73,16 +80,16 @@ def tokenize(example):
73
  ]
74
  return tokenized
75
 
76
- def upload_if_ready():
77
- global buffer_counter
78
- if os.listdir(PARQUET_FOLDER):
79
- log(f"⬆️ BUFFER doldu. Hugging Face'e yükleniyor: {TRAIN_TARGET_DATASET_ID}")
80
- create_repo(TRAIN_TARGET_DATASET_ID, repo_type="dataset", token=HF_TOKEN, exist_ok=True)
81
- upload_folder(repo_id=TRAIN_TARGET_DATASET_ID, folder_path=PARQUET_FOLDER, repo_type="dataset", token=HF_TOKEN)
82
  log("🧹 Upload sonrası klasör temizleniyor...")
83
- for f in os.listdir(PARQUET_FOLDER):
84
- os.remove(os.path.join(PARQUET_FOLDER, f))
85
- buffer_counter = 0
 
86
 
87
  for idx, filename in enumerate(selected_files):
88
  log(f"\n📄 {idx+1}/{len(selected_files)} → {filename} işleniyor...")
@@ -100,20 +107,33 @@ for idx, filename in enumerate(selected_files):
100
  df = df.rename(columns={"question": "instruction", "answer": "output"})
101
  log(f"✅ Geçerli satır sayısı: {len(df)}")
102
 
103
- dataset = Dataset.from_pandas(df[["instruction", "output"]])
104
- tokenized_dataset = dataset.map(tokenize)
105
- parquet_path = os.path.join(PARQUET_FOLDER, filename.replace(".csv", ".parquet"))
106
- tokenized_dataset.to_parquet(parquet_path, compression="snappy")
107
- log(f"🎯 Tokenized parquet kaydedildi: {parquet_path}")
108
- buffer_counter += 1
109
- if buffer_counter >= BUFFER_SIZE:
110
- upload_if_ready()
 
 
 
 
 
 
 
 
 
 
111
  except Exception as e:
112
  log(f"❌ Hata oluştu: {filename} → {e}")
113
  traceback.print_exc()
114
  continue
115
 
116
- upload_if_ready()
 
 
 
117
 
118
  log("✅ Tüm işlemler tamamlandı. Servis bekleme modunda...")
119
  while True:
 
16
  HF_TOKEN = os.getenv("HF_TOKEN")
17
  SOURCE_DATASET_ID = "UcsTurkey/turkish-general-culture-chunks"
18
  TRAIN_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-tokenized"
19
+ RAG_TARGET_DATASET_ID = "UcsTurkey/turkish-general-culture-rag"
20
  BUFFER_SIZE = 5
21
+ START_CHUNK_NUMBER = 0
22
+ PROCESS_CHUNK_COUNT = 776
23
+
24
+ GENERATE_TRAIN_DATA = False
25
+ GENERATE_RAG_DATA = True
26
 
27
  CHUNK_FOLDER = "/data/chunks"
28
+ TRAIN_FOLDER = "/data/tokenized_chunks"
29
+ RAG_FOLDER = "/data/rag_chunks"
30
  CACHE_DIR = "/data/.hf_cache"
31
 
32
  os.makedirs(CHUNK_FOLDER, exist_ok=True)
33
+ os.makedirs(TRAIN_FOLDER, exist_ok=True)
34
+ os.makedirs(RAG_FOLDER, exist_ok=True)
35
  os.makedirs(CACHE_DIR, exist_ok=True)
36
 
37
  # ✅ Health check sunucusu
 
69
  csv_files = sorted([f for f in files if f.endswith(".csv")])
70
  selected_files = csv_files[START_CHUNK_NUMBER:START_CHUNK_NUMBER + PROCESS_CHUNK_COUNT]
71
 
72
+ buffer_counter_train = 0
73
+ buffer_counter_rag = 0
74
 
75
  def tokenize(example):
76
  prompt = f"SORU: {example['instruction']}\nCEVAP: {example['output']}"
 
80
  ]
81
  return tokenized
82
 
83
def upload_if_ready(folder_path, target_repo):
    """Upload the files buffered in ``folder_path`` and then empty the folder.

    Does nothing when the folder is empty. Otherwise the target dataset
    repo is created if it does not exist yet, the whole folder is uploaded
    to it, and the buffered files are deleted locally.

    Args:
        folder_path: Local directory holding the files waiting for upload.
        target_repo: Hugging Face dataset repo id that receives the files.

    Returns:
        Always ``0`` so callers can assign the result straight back to
        their buffer counter.
    """
    # Snapshot the folder once: only entries known to exist before the
    # upload started are deleted afterwards, so anything that shows up
    # later is never removed without having been uploaded.
    pending = os.listdir(folder_path)
    if not pending:
        return 0

    log(f"⬆️ BUFFER doldu. Hugging Face'e yükleniyor: {target_repo}")
    create_repo(target_repo, repo_type="dataset", token=HF_TOKEN, exist_ok=True)
    upload_folder(repo_id=target_repo, folder_path=folder_path, repo_type="dataset", token=HF_TOKEN)

    log("🧹 Upload sonrası klasör temizleniyor...")
    for name in pending:
        path = os.path.join(folder_path, name)
        # os.remove() raises on directories and on entries that are already
        # gone; skipping them keeps one stray entry from aborting the cleanup.
        if os.path.isfile(path):
            os.remove(path)
    return 0
93
 
94
  for idx, filename in enumerate(selected_files):
95
  log(f"\n📄 {idx+1}/{len(selected_files)} → {filename} işleniyor...")
 
107
  df = df.rename(columns={"question": "instruction", "answer": "output"})
108
  log(f"✅ Geçerli satır sayısı: {len(df)}")
109
 
110
+ if GENERATE_RAG_DATA:
111
+ rag_dataset = Dataset.from_pandas(df[["instruction", "output"]])
112
+ rag_path = os.path.join(RAG_FOLDER, filename.replace(".csv", ".parquet"))
113
+ rag_dataset.to_parquet(rag_path, compression="brotli")
114
+ log(f"📦 RAG parquet kaydedildi: {rag_path}")
115
+ buffer_counter_rag += 1
116
+ if buffer_counter_rag >= BUFFER_SIZE:
117
+ buffer_counter_rag = upload_if_ready(RAG_FOLDER, RAG_TARGET_DATASET_ID)
118
+
119
+ if GENERATE_TRAIN_DATA:
120
+ train_dataset = Dataset.from_pandas(df[["instruction", "output"]])
121
+ tokenized_dataset = train_dataset.map(tokenize)
122
+ parquet_path = os.path.join(TRAIN_FOLDER, filename.replace(".csv", ".parquet"))
123
+ tokenized_dataset.to_parquet(parquet_path, compression="snappy")
124
+ log(f"🎯 Tokenized parquet kaydedildi: {parquet_path}")
125
+ buffer_counter_train += 1
126
+ if buffer_counter_train >= BUFFER_SIZE:
127
+ buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
128
  except Exception as e:
129
  log(f"❌ Hata oluştu: {filename} → {e}")
130
  traceback.print_exc()
131
  continue
132
 
133
+ if GENERATE_TRAIN_DATA:
134
+ buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
135
+ if GENERATE_RAG_DATA:
136
+ buffer_counter_rag = upload_if_ready(RAG_FOLDER, RAG_TARGET_DATASET_ID)
137
 
138
  log("✅ Tüm işlemler tamamlandı. Servis bekleme modunda...")
139
  while True: