File size: 1,607 Bytes
ae6c842
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2d3314
1c4dccf
e059b03
ae6c842
e059b03
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import io
import re

import docx
import PyPDF2
from fastapi import FastAPI, UploadFile, File
from transformers import MarianMTModel, MarianTokenizer

# FastAPI application instance; the route decorators further down register
# their handlers on it.
app = FastAPI()

# Load the MarianMT model used for translation (e.g. English -> French).
# The tokenizer and model are created once, at import time, and shared by
# translate_text for every request.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)

# Sentence boundary: whitespace that follows terminal punctuation.
_SENTENCE_END = re.compile(r"(?<=[.!?])\s+")
# Heuristic upper bound on chunk size (characters), chosen to stay well under
# the model's maximum input length so that truncation rarely has to kick in.
_MAX_CHUNK_CHARS = 400
# Number of chunks sent to the model per generate() call, to bound memory use.
_TRANSLATION_BATCH_SIZE = 8


def _split_into_chunks(text, max_chars=_MAX_CHUNK_CHARS):
    """Split *text* into sentence-aligned chunks of at most ~max_chars characters."""
    text = text.strip()
    if len(text) <= max_chars:
        return [text] if text else []

    chunks = []
    current = ""
    for sentence in _SENTENCE_END.split(text):
        # A single over-long sentence is broken at word boundaries instead.
        pieces = [sentence] if len(sentence) <= max_chars else sentence.split()
        for piece in pieces:
            candidate = f"{current} {piece}" if current else piece
            if len(candidate) <= max_chars:
                current = candidate
                continue
            if current:
                chunks.append(current)
            current = piece
    if current:
        chunks.append(current)
    return chunks


def translate_text(text: str, src_lang: str = "en", tgt_lang: str = "fr") -> str:
    """Translate *text* with the module-level MarianMT model.

    The text is split into sentence-aligned chunks that are translated in
    small batches and re-joined with single spaces. Passing the whole
    document as one input would let ``truncation=True`` silently discard
    everything beyond the model's maximum input length.

    Args:
        text: Text to translate.
        src_lang: Accepted for interface compatibility but currently unused;
            the translation direction is fixed by the model loaded from
            ``MODEL_NAME``.
        tgt_lang: Accepted for interface compatibility but currently unused
            (see ``src_lang``).

    Returns:
        The translated text, or ``""`` when *text* is empty or whitespace only.
    """
    if not text or not text.strip():
        # Nothing to translate; avoid running the model on an empty input.
        return ""

    chunks = _split_into_chunks(text)
    translated_parts = []
    for start in range(0, len(chunks), _TRANSLATION_BATCH_SIZE):
        batch = chunks[start:start + _TRANSLATION_BATCH_SIZE]
        # truncation stays on as a backstop for a chunk that still tokenizes
        # to more than the model accepts (e.g. one very long unbroken token).
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**inputs)
        translated_parts.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )
    return " ".join(translated_parts)

def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and join it with spaces.

    Args:
        pdf_file: Source handed to ``PyPDF2.PdfReader``; the upload endpoint
            passes an in-memory ``io.BytesIO``.

    Returns:
        The non-empty page texts joined by single spaces, or ``""`` when no
        page yields any text.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Extract each page exactly once: text extraction is the expensive step,
    # and filtering on a second extract_text() call would double the work.
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(page_text for page_text in page_texts if page_text)

def extract_text_from_docx(docx_file):
    """Return the text of all paragraphs of a DOCX document, space-separated.

    Args:
        docx_file: Source handed to ``docx.Document``.

    Returns:
        Every paragraph's text (including empty paragraphs) joined by single
        spaces.
    """
    document = docx.Document(docx_file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)

@app.post("/translate/")
async def translate_file(file: UploadFile = File(...), src_lang: str = "en", tgt_lang: str = "fr"):
    """Translate the text content of an uploaded PDF or DOCX file.

    Args:
        file: The uploaded document; its type is chosen from the filename
            extension (matched case-insensitively).
        src_lang: Source language, forwarded to ``translate_text``.
        tgt_lang: Target language, forwarded to ``translate_text``.

    Returns:
        ``{"translated_text": ...}`` on success, or ``{"error": ...}`` when
        the file is neither a PDF nor a DOCX.
    """
    # The filename may be missing on an upload, and extensions are often
    # upper-cased (".PDF"); normalise before matching.
    filename = (file.filename or "").lower()
    if filename.endswith(".pdf"):
        extract_text = extract_text_from_pdf
    elif filename.endswith(".docx"):
        extract_text = extract_text_from_docx
    else:
        return {"error": "Format non supporté. Utilise PDF ou DOCX."}

    # Both extractors need a seekable binary stream, so buffer the upload.
    text = extract_text(io.BytesIO(await file.read()))
    translated_text = translate_text(text, src_lang, tgt_lang)
    return {"translated_text": translated_text}

@app.get("/")
def home():
    """Return the welcome payload served at the API root."""
    welcome = {"message": "Bienvenue sur l'API de traduction de fichiers !"}
    return welcome