from fastapi import FastAPI, HTTPException
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import langdetect
import logging
import os

# Set the Hugging Face cache directories
os.environ["HF_HOME"] = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/cache"

app = FastAPI()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    logger.info("Loading translation model...")
    # Load the M2M100 model and tokenizer
    model_name = "facebook/m2m100_418M"
    model = M2M100ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = M2M100Tokenizer.from_pretrained(model_name)
    logger.info("Model loaded successfully")
except Exception as e:
    logger.error(f"Failed to load model: {str(e)}")
    raise Exception(f"Model initialization failed: {str(e)}")

def detect_language(text: str) -> str:
    """Deteksi bahasa dari teks menggunakan langdetect."""
    try:
        lang = langdetect.detect(text)
        # Map detected language codes to the codes M2M100 expects
        lang_map = {
            "id": "id",     # Indonesian
            "fr": "fr",     # French
            "es": "es",     # Spanish
            "de": "de",     # German
            "ja": "ja",     # Japanese
            "zh-cn": "zh",  # Chinese (langdetect reports zh-cn/zh-tw; M2M100 uses zh)
            "zh-tw": "zh",
            "ru": "ru",     # Russian
            "th": "th",     # Thai
        }
        return lang_map.get(lang, "en")  # Default to English if the language is not recognized
    except Exception as e:
        logger.warning(f"Language detection failed: {str(e)}, defaulting to English")
        return "en"

def translate_to_english(text: str, source_lang: str) -> str:
    """Terjemahkan teks ke Inggris menggunakan M2M100."""
    try:
        # Set bahasa sumber
        tokenizer.src_lang = source_lang
        # Encode teks
        encoded = tokenizer(text, return_tensors="pt")
        # Generate terjemahan (ke Inggris)
        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id("en"))
        # Decode hasil
        translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        return translated_text
    except Exception as e:
        logger.error(f"Translation failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")

@app.post("/translate")
async def translate(text: str):
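    # Note (added annotation): because `text` is a plain `str` parameter rather than a
    # Pydantic model or Body(...), FastAPI exposes it as a query parameter, so the
    # endpoint is invoked as POST /translate?text=...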
    if not text:
        raise HTTPException(status_code=400, detail="Text input is required")
    try:
        # Detect the source language
        source_lang = detect_language(text)
        logger.info(f"Detected source language: {source_lang}")
        # If the text is already in English, return it unchanged
        if source_lang == "en":
            return {"translated_text": text}
        # Translate to English
        translated_text = translate_to_english(text, source_lang)
        return {"translated_text": translated_text}
    except Exception as e:
        logger.error(f"Processing failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
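
# Added sketch, not part of the original service: run the API with uvicorn when this file
# is executed directly. uvicorn being installed and the 0.0.0.0:7860 host/port pair are
# assumptions; adjust them to the actual deployment.
if __name__ == "__main__":
    import uvicorn

    # Start the ASGI server hosting this FastAPI app
    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example call (hypothetical host/port):
#   curl -X POST "http://localhost:7860/translate?text=Selamat%20pagi"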