Spaces:
Paused
Paused
# coding=utf-8 | |
import os | |
import json | |
import time | |
from datasets import load_dataset | |
from googletrans import Translator | |
def _load_checkpoint(path):
    """Return the records previously saved at *path*, or ``[]`` if it does not exist."""
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8", newline="\n") as f:
        return json.load(f)


def _save_checkpoint(path, records):
    """Write *records* to *path* as indented, non-ASCII-escaped UTF-8 JSON.

    The data is written to a sibling temporary file and then moved into place
    with ``os.replace`` so that an interruption mid-write cannot leave a
    truncated checkpoint that would break the next resume.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8", newline="\n") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, path)


def main():
    """Translate the ``train`` split of ``../data/hh_rlhf_en`` from English to Chinese.

    Each dataset row is expected to provide an ``"instruction"`` string, an
    ``"output"`` sequence of strings and a ``"history"`` sequence of
    two-element pairs; every string is translated and the result is appended
    to ``train.json`` in the current working directory.

    The run is resumable: records already present in ``train.json`` are kept
    and translation continues at the first index not yet stored. Progress is
    checkpointed every 10 records and once more when the function exits
    (normally, after giving up, or on interruption), so no translated record
    is lost.

    A record that keeps failing is retried with a 50 second pause; after more
    than 10 consecutive failures the function prints ``Stop`` and returns.
    """
    split = "train"
    checkpoint_path = f"{split}.json"
    translator = Translator()

    def translate(text: str) -> str:
        """Return *text* translated from English to Simplified Chinese.

        Empty input yields ``""``, and strings starting with ``http`` or
        ``Reddit.com`` are returned unchanged. The request is attempted up to
        5 times with a 10 second pause after each failure.

        Raises:
            RuntimeError: if all 5 attempts fail; the last underlying error
                is attached as the cause.
        """
        if not text:
            return ""
        # Links are passed through verbatim rather than sent to the translator.
        if text.startswith(("http", "Reddit.com")):
            return text
        last_error = None
        for _ in range(5):
            try:
                result = translator.translate(text, dest="zh-cn", src="en")
                print("translate: {} -> {}".format(text, result.text))
                time.sleep(1)  # throttle successful requests
                return result.text
            except Exception as err:
                last_error = err
                print(f"Error occurred while translating {text}, retrying...")
                time.sleep(10)
        raise RuntimeError(
            f"Failed to translate {text!r} after 5 attempts"
        ) from last_error

    dataset = load_dataset("../data/hh_rlhf_en", split=split)
    jsondata = _load_checkpoint(checkpoint_path)

    saved_count = len(jsondata)  # number of records known to be on disk
    global_patience = 0
    i = len(jsondata)  # resume right after the last stored record
    try:
        while i < len(dataset):
            try:
                row = dataset[i]  # fetch the row once instead of per field
                jsondata.append({
                    "instruction": translate(row["instruction"]),
                    "output": [translate(output) for output in row["output"]],
                    "history": [
                        [translate(hist[0]), translate(hist[1])]
                        for hist in row["history"]
                    ],
                })
                i += 1
                global_patience = 0
                if i % 10 == 0:
                    _save_checkpoint(checkpoint_path, jsondata)
                    saved_count = len(jsondata)
            except Exception:
                print(f"Error occurred at {i}-th data, retrying...")
                global_patience += 1
                time.sleep(50)
                if global_patience > 10:
                    print("Stop")
                    return
    finally:
        # Flush whatever was translated since the last periodic checkpoint:
        # the tail of the dataset, or partial progress on abort/interrupt.
        if len(jsondata) != saved_count:
            _save_checkpoint(checkpoint_path, jsondata)
# Run the translation job only when executed as a script, not on import.
if __name__ == "__main__":
    main()