Spaces:
Sleeping
Sleeping
| """ | |
| Text translation module using the NLLB-200 3.3B model (English to Simplified Chinese). | |
| Handles text segmentation and chunk-by-chunk translation. | |
| """ | |
| import logging | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
logger = logging.getLogger(__name__)

# Hugging Face checkpoint and NLLB language codes used for every translation.
_MODEL_NAME = "facebook/nllb-200-3.3B"
_SOURCE_LANG = "eng_Latn"
_TARGET_LANG = "zho_Hans"

# Upper bound on characters per chunk (same value the module has always used).
# The tokenizer call below still truncates at 1024 tokens as a safety net.
_MAX_CHUNK_LENGTH = 1000

# Preferred places to end a chunk. Each marker includes trailing whitespace so
# that dots inside tokens such as "3.3B" are not mistaken for sentence ends.
_SENTENCE_ENDINGS = (". ", "! ", "? ", "\n")

# (tokenizer, model) pair, populated on first use and reused by later calls so
# the multi-gigabyte checkpoint is not reloaded for every translation.
# NOTE: the cache is not guarded by a lock.
_translator = None


def _get_translator():
    """Return the cached (tokenizer, model) pair, loading it on first use."""
    global _translator
    if _translator is None:
        logger.info("Loading NLLB model")
        tokenizer = AutoTokenizer.from_pretrained(
            _MODEL_NAME,
            src_lang=_SOURCE_LANG,  # Specify source language
        )
        model = AutoModelForSeq2SeqLM.from_pretrained(_MODEL_NAME)
        logger.info("Translation model loaded")
        _translator = (tokenizer, model)
    return _translator


def _split_text(text, max_chunk_length):
    """Split text into non-empty chunks of at most max_chunk_length characters.

    A chunk ends, in order of preference, after the last sentence ending in
    the window, at the last whitespace in the window, or (when the window
    contains neither) at exactly max_chunk_length characters. Whitespace at
    chunk edges is dropped.
    """
    chunks = []
    remaining = text.strip()
    while len(remaining) > max_chunk_length:
        window = remaining[:max_chunk_length]

        # Prefer to cut right after the last complete sentence in the window.
        cut = 0
        for mark in _SENTENCE_ENDINGS:
            pos = window.rfind(mark)
            if pos != -1:
                cut = max(cut, pos + len(mark))

        if cut == 0:
            # No sentence boundary: cut before the last word instead.
            # `remaining` never starts with whitespace, so the head length is
            # a valid index into `window`.
            pieces = window.rsplit(None, 1)
            if len(pieces) == 2:
                cut = len(pieces[0])

        if cut == 0:
            # A single unbroken run of characters: fall back to a hard cut.
            cut = max_chunk_length

        piece = remaining[:cut].strip()
        if piece:
            chunks.append(piece)
        remaining = remaining[cut:].lstrip()

    if remaining:
        chunks.append(remaining)
    return chunks


def translate_text(text: str) -> str:
    """
    Translate English text to Simplified Chinese

    The text is split into chunks of at most 1000 characters, preferring
    sentence and word boundaries, and each chunk is translated in turn. The
    model is loaded on the first call that has something to translate and
    reused afterwards. Empty or whitespace-only input returns an empty
    string without loading the model.

    Args:
        text: Input English text
    Returns:
        Translated Chinese text (chunk translations concatenated)
    Raises:
        Exception: Any error raised while loading the model or translating
            is logged with its traceback and re-raised unchanged.
    """
    logger.info(f"Starting translation for text length: {len(text)}")
    try:
        # Text processing
        text_chunks = _split_text(text, _MAX_CHUNK_LENGTH)
        logger.info(f"Split text into {len(text_chunks)} chunks")
        if not text_chunks:
            # Nothing to translate; skip the expensive model load entirely.
            logger.info("Translation completed. Total length: 0")
            return ""

        tokenizer, model = _get_translator()
        # The target-language token id is the same for every chunk.
        target_token_id = tokenizer.convert_tokens_to_ids(_TARGET_LANG)

        translated_chunks = []
        total = len(text_chunks)
        for i, chunk in enumerate(text_chunks, start=1):
            logger.info(f"Processing chunk {i}/{total}")
            # Tokenize with source language specification
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                max_length=1024,
                truncation=True,
            )
            # Generate translation with target language specification
            outputs = model.generate(
                **inputs,
                forced_bos_token_id=target_token_id,
                max_new_tokens=1024,
            )
            translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
            translated_chunks.append(translated)
            logger.info(f"Chunk {i} translated successfully")

        result = "".join(translated_chunks)
        logger.info(f"Translation completed. Total length: {len(result)}")
        return result
    except Exception as e:
        logger.error(f"Translation failed: {str(e)}", exc_info=True)
        raise