Spaces:
Build error
Build error
| from tqdm import tqdm | |
| from deep_translator import GoogleTranslator | |
| from itertools import chain | |
| import copy | |
| from .language_configuration import fix_code_language, INVERTED_LANGUAGES | |
| from .logging_setup import logger | |
| import re | |
| import json | |
| import time | |
# Back-ends selectable for segment translation ("_batch" variants send
# many lines per request; "disable_translation" passes segments through).
TRANSLATION_PROCESS_OPTIONS = [
    "google_translator_batch",
    "google_translator",
    "gpt-3.5-turbo-0125_batch",
    "gpt-3.5-turbo-0125",
    "gpt-4-turbo-preview_batch",
    "gpt-4-turbo-preview",
    "disable_translation",
]
# Subset offered for document translation (no batch modes).
DOCS_TRANSLATION_PROCESS_OPTIONS = [
    "google_translator",
    "gpt-3.5-turbo-0125",
    "gpt-4-turbo-preview",
    "disable_translation",
]
def translate_iterative(segments, target, source=None):
    """
    Translate each text segment individually into the target language.

    Parameters:
    - segments (list): Dictionaries holding the segment text under 'text'.
    - target (str): Target language code.
    - source (str, optional): Source language code; autodetected when falsy.

    Returns:
    - list: Deep copy of the segments with translated 'text' values.

    Notes:
    - One Google Translate request per segment.

    Example:
        segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
        translated_segments = translate_iterative(segments, 'es')
    """
    translated = copy.deepcopy(segments)
    if not source:
        logger.debug("No source language")
        source = "auto"
    translator = GoogleTranslator(source=source, target=target)
    for segment in tqdm(translated):
        segment["text"] = translator.translate(segment["text"].strip())
    return translated
def verify_translate(
    segments,
    segments_copy,
    translated_lines,
    target,
    source
):
    """
    Write translated lines back into the segments when counts match;
    otherwise fall back to iterative Google translation.
    """
    # Length mismatch means the batched translation lost or merged lines.
    if len(segments) != len(translated_lines):
        logger.error(
            "The translation failed, switching to google_translate iterative. "
            f"{len(segments), len(translated_lines)}"
        )
        return translate_iterative(segments, target, source)

    for idx, translated in enumerate(translated_lines):
        logger.debug(
            f"{segments_copy[idx]['text']} >> "
            f"{translated.strip()}"
        )
        segments_copy[idx]["text"] = translated.replace(
            "\t", "").replace("\n", "").strip()
    return segments_copy
def translate_batch(segments, target, chunk_size=2000, source=None):
    """
    Translate a batch of text segments into the specified language in chunks,
    respecting the character limit.

    Parameters:
    - segments (list): List of dictionaries with 'text' as a key for segment
        text.
    - target (str): Target language code.
    - chunk_size (int, optional): Maximum character limit for each translation
        chunk (default is 2000; max 5000).
    - source (str, optional): Source language code. Defaults to None.

    Returns:
    - list: Translated text segments in the target language.

    Notes:
    - Splits input segments into chunks respecting the character limit for
        translation.
    - Translates the chunks using Google Translate.
    - If chunked translation fails, switches to iterative translation using
        `translate_iterative()`.

    Example:
        segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
        translated = translate_batch(segments, 'es', chunk_size=4000, source='en')
    """
    segments_copy = copy.deepcopy(segments)
    if (
        not source
    ):
        logger.debug("No source language")
        source = "auto"
    # Get text
    text_lines = []
    for line in range(len(segments_copy)):
        text = segments_copy[line]["text"].strip()
        text_lines.append(text)
    # chunk limit: accumulate lines into chunks of <= chunk_size chars,
    # joined by a " ||||| " sentinel so the translated chunk can be
    # split back into the original lines afterwards.
    text_merge = []          # completed chunk strings
    actual_chunk = ""        # chunk currently being built
    global_text_list = []    # per-chunk lists of the original lines
    actual_text_list = []    # original lines of the chunk being built
    for one_line in text_lines:
        # Empty lines become a single space so the sentinel count is kept.
        one_line = " " if not one_line else one_line
        if (len(actual_chunk) + len(one_line)) <= chunk_size:
            if actual_chunk:
                actual_chunk += " ||||| "
            actual_chunk += one_line
            actual_text_list.append(one_line)
        else:
            # Chunk is full: flush it and start a new one with this line.
            text_merge.append(actual_chunk)
            actual_chunk = one_line
            global_text_list.append(actual_text_list)
            actual_text_list = [one_line]
    if actual_chunk:
        # Flush the trailing (partial) chunk.
        text_merge.append(actual_chunk)
        global_text_list.append(actual_text_list)
    # translate chunks
    progress_bar = tqdm(total=len(segments), desc="Translating")
    translator = GoogleTranslator(source=source, target=target)
    split_list = []
    try:
        for text, text_iterable in zip(text_merge, global_text_list):
            translated_line = translator.translate(text.strip())
            # Recover the individual lines via the sentinel.
            split_text = translated_line.split("|||||")
            if len(split_text) == len(text_iterable):
                progress_bar.update(len(split_text))
            else:
                # Sentinels were mangled by the translator: retranslate
                # this chunk line by line instead.
                logger.debug(
                    "Chunk fixing iteratively. Len chunk: "
                    f"{len(split_text)}, expected: {len(text_iterable)}"
                )
                split_text = []
                for txt_iter in text_iterable:
                    translated_txt = translator.translate(txt_iter.strip())
                    split_text.append(translated_txt)
                    progress_bar.update(1)
            split_list.append(split_text)
        progress_bar.close()
    except Exception as error:
        progress_bar.close()
        logger.error(str(error))
        logger.warning(
            "The translation in chunks failed, switching to iterative."
            " Related: too many request"
        )  # use proxy or less chunk size
        return translate_iterative(segments, target, source)
    # un chunk: flatten the per-chunk lists back into one line list.
    translated_lines = list(chain.from_iterable(split_list))
    return verify_translate(
        segments, segments_copy, translated_lines, target, source
    )
def call_gpt_translate(
    client,
    model,
    system_prompt,
    user_prompt,
    original_text=None,
    batch_lines=None,
):
    """
    Query an OpenAI chat model in JSON mode and extract the translation.

    Parameters:
    - client: OpenAI client instance.
    - model (str): Chat model name.
    - system_prompt (str): System role content.
    - user_prompt (str): User role content.
    - original_text (dict, optional): The source batch ({"conversation":
        [...]}) used to detect an untranslated echo when batching.
    - batch_lines (int, optional): Expected number of translated lines;
        enables batch mode.

    Returns:
    - list | str: In batch mode, a list of {speaker_code: text} dicts;
        otherwise the translated string.

    Raises:
    - ValueError: When no valid string response is received (non-batch).
    - Exception: Re-raises the JSON decoding error if no JSON object can
        be recovered from the raw response.
    """
    # https://platform.openai.com/docs/guides/text-generation/json-mode
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    result = response.choices[0].message.content
    logger.debug(f"Result: {str(result)}")
    try:
        translation = json.loads(result)
    except Exception as error:
        # Fix: greedy match with re.DOTALL so a nested, multi-line JSON
        # object is captured whole. The previous non-greedy r'\{.*?\}'
        # stopped at the first '}' and could not span newlines, yielding
        # truncated, unparseable JSON for any structured response.
        match_result = re.search(r'\{.*\}', result, re.DOTALL)
        if match_result:
            logger.error(str(error))
            json_str = match_result.group(0)
            translation = json.loads(json_str)
        else:
            raise error

    # Get valid data
    if batch_lines:
        for conversation in translation.values():
            if isinstance(conversation, dict):
                # Unwrap one level of nesting, e.g. {"translated_conversation": [...]}.
                conversation = list(conversation.values())[0]
            if (
                list(
                    original_text["conversation"][0].values()
                )[0].strip() ==
                list(conversation[0].values())[0].strip()
            ):
                # First line is an untranslated echo of the input; try the
                # next candidate value.
                continue
            if len(conversation) == batch_lines:
                break

        # Flatten to one dict per line, tolerating merged entries.
        fix_conversation_length = []
        for line in conversation:
            for speaker_code, text_tr in line.items():
                fix_conversation_length.append({speaker_code: text_tr})

        logger.debug(f"Data batch: {str(fix_conversation_length)}")
        logger.debug(
            f"Lines Received: {len(fix_conversation_length)},"
            f" expected: {batch_lines}"
        )

        return fix_conversation_length
    else:
        # Unwrap dict/list/set wrappers until a plain string remains.
        if isinstance(translation, dict):
            translation = list(translation.values())[0]
        if isinstance(translation, list):
            translation = translation[0]
        if isinstance(translation, set):
            translation = list(translation)[0]
        if not isinstance(translation, str):
            raise ValueError(f"No valid response received: {str(translation)}")
        return translation
def gpt_sequential(segments, model, target, source=None):
    """
    Translate segments one at a time with an OpenAI chat model.

    Any segment whose request fails is retranslated with Google Translate
    instead, so the returned list is always fully populated.
    """
    from openai import OpenAI

    result_segments = copy.deepcopy(segments)
    client = OpenAI()
    progress_bar = tqdm(total=len(segments), desc="Translating")

    # Human-readable language names without parenthesised qualifiers.
    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
    lang_sc = (
        re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()
        if source
        else ""
    )

    fixed_target = fix_code_language(target)
    fixed_source = fix_code_language(source) if source else "auto"

    system_prompt = "Machine translation designed to output the translated_text JSON."

    for index, segment in enumerate(result_segments):
        text = segment["text"].strip()
        start = segment["start"]
        user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}"

        time.sleep(0.5)  # light rate limiting between requests

        try:
            translated_text = call_gpt_translate(
                client,
                model,
                system_prompt,
                user_prompt,
            )
        except Exception as error:
            logger.error(
                f"{str(error)} >> The text of segment {start} "
                "is being corrected with Google Translate"
            )
            fallback = GoogleTranslator(
                source=fixed_source, target=fixed_target
            )
            translated_text = fallback.translate(text.strip())

        result_segments[index]["text"] = translated_text.strip()
        progress_bar.update(1)

    progress_bar.close()
    return result_segments
def gpt_batch(segments, model, target, token_batch_limit=900, source=None):
    """
    Translate segments in token-limited batches with an OpenAI chat model.

    Segments are accumulated until roughly `token_batch_limit` tokens are
    reached, then sent as one JSON "conversation". A failed batch is
    retranslated line by line with Google Translate.

    Parameters:
    - segments (list): Dicts with 'text', 'speaker' and 'start' keys.
    - model (str): OpenAI chat model name.
    - target (str): Target language code.
    - token_batch_limit (int, optional): Approximate token budget per batch
        (halved internally to leave room for the translated output).
    - source (str, optional): Source language code; autodetected when None.

    Returns:
    - list: Segments with translated 'text' values.
    """
    from openai import OpenAI
    import tiktoken

    # Reserve headroom for the prompt scaffolding and the response.
    token_batch_limit = max(100, (token_batch_limit - 40) // 2)

    progress_bar = tqdm(total=len(segments), desc="Translating")
    segments_copy = copy.deepcopy(segments)
    encoding = tiktoken.get_encoding("cl100k_base")
    client = OpenAI()

    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
    lang_sc = ""
    if source:
        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()
    fixed_target = fix_code_language(target)
    fixed_source = fix_code_language(source) if source else "auto"

    name_speaker = "ABCDEFGHIJKL"

    translated_lines = []
    text_data_dict = []
    num_tokens = 0
    count_sk = {char: 0 for char in "ABCDEFGHIJKL"}

    for i, line in enumerate(segments_copy):
        text = line["text"]
        speaker = line["speaker"]
        last_start = line["start"]
        # Build a per-batch speaker code like "A1", "B2" from the numeric
        # suffix of the speaker label (assumes e.g. "SPEAKER_00" — TODO
        # confirm against the diarization output).
        index_sk = int(speaker[-2:])
        character_sk = name_speaker[index_sk]
        count_sk[character_sk] += 1
        code_sk = character_sk + str(count_sk[character_sk])
        text_data_dict.append({code_sk: text})
        num_tokens += len(encoding.encode(text)) + 7
        if num_tokens >= token_batch_limit or i == len(segments_copy) - 1:
            batch_lines = len(text_data_dict)
            batch_conversation = {
                "conversation": copy.deepcopy(text_data_dict)
            }
            # Reset accumulators for the next batch
            num_tokens = 0
            text_data_dict = []
            count_sk = {char: 0 for char in "ABCDEFGHIJKL"}
            # Checkpoint so a partially appended batch can be rolled back
            # before the fallback (fixes duplicated lines when an
            # exception fired mid-loop).
            checkpoint = len(translated_lines)
            try:
                # Process translation
                # https://arxiv.org/pdf/2309.03409.pdf
                system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items."
                user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}"
                logger.debug(f"Prompt: {str(user_prompt)}")

                conversation = call_gpt_translate(
                    client,
                    model,
                    system_prompt,
                    user_prompt,
                    original_text=batch_conversation,
                    batch_lines=batch_lines,
                )

                if len(conversation) < batch_lines:
                    raise ValueError(
                        "Incomplete result received. Batch lines: "
                        f"{len(conversation)}, expected: {batch_lines}"
                    )

                # Fix: use a distinct index — the original reused `i`,
                # shadowing the outer segment index needed below.
                for pos, translated_text in enumerate(conversation):
                    if pos + 1 > batch_lines:
                        break
                    translated_lines.append(
                        list(translated_text.values())[0]
                    )

                progress_bar.update(batch_lines)
            except Exception as error:
                logger.error(str(error))
                # Discard any partial results from the failed batch so the
                # fallback does not duplicate lines.
                del translated_lines[checkpoint:]
                first_start = segments_copy[max(0, i - (batch_lines - 1))]["start"]
                logger.warning(
                    f"The batch from {first_start} to {last_start} "
                    "failed, is being corrected with Google Translate"
                )
                translator = GoogleTranslator(
                    source=fixed_source,
                    target=fixed_target
                )
                for txt_source in batch_conversation["conversation"]:
                    translated_txt = translator.translate(
                        list(txt_source.values())[0].strip()
                    )
                    translated_lines.append(translated_txt.strip())
                    progress_bar.update(1)

    progress_bar.close()

    return verify_translate(
        segments, segments_copy, translated_lines, fixed_target, fixed_source
    )
def translate_text(
    segments,
    target,
    translation_process="google_translator_batch",
    chunk_size=4500,
    source=None,
    token_batch_limit=1000,
):
    """Translates text segments using a specified process."""
    if translation_process == "google_translator_batch":
        return translate_batch(
            segments,
            fix_code_language(target),
            chunk_size,
            fix_code_language(source)
        )

    if translation_process == "google_translator":
        return translate_iterative(
            segments,
            fix_code_language(target),
            fix_code_language(source)
        )

    if translation_process in ("gpt-3.5-turbo-0125", "gpt-4-turbo-preview"):
        return gpt_sequential(segments, translation_process, target, source)

    if translation_process in (
        "gpt-3.5-turbo-0125_batch",
        "gpt-4-turbo-preview_batch",
    ):
        return gpt_batch(
            segments,
            translation_process.replace("_batch", ""),
            target,
            token_batch_limit,
            source
        )

    if translation_process == "disable_translation":
        return segments

    raise ValueError("No valid translation process")