import io
import re

import docx
import PyPDF2
from fastapi import FastAPI, File, UploadFile
from fastapi.concurrency import run_in_threadpool
from transformers import MarianMTModel, MarianTokenizer


# ASGI application object; the route decorators in this module register on it.
app = FastAPI()

# Hugging Face Hub id of the MarianMT checkpoint used for every translation.
# The "en-fr" suffix and the default language arguments used in this module
# indicate an English-to-French model.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-fr"

# Tokenizer and model are loaded once, at import time, and shared by all
# requests.
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)


# Upper bound on the characters sent to the model as one segment. Kept well
# below the tokenizer's truncation limit so that, in practice, no segment is
# cut; ``truncation=True`` stays on as a safety net for pathological input.
_MAX_SEGMENT_CHARS = 400

# Number of segments translated per ``model.generate`` call.
_BATCH_SIZE = 8

# Whitespace that follows sentence-ending punctuation.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")


def _split_into_segments(text, max_chars=_MAX_SEGMENT_CHARS):
    """Split ``text`` into non-empty, whitespace-normalized segments.

    The text is first cut at sentence boundaries; a sentence longer than
    ``max_chars`` is further cut between words. A single word longer than
    ``max_chars`` is kept whole as its own segment.
    """
    segments = []
    for sentence in _SENTENCE_BOUNDARY.split(text.strip()):
        current_words = []
        current_len = 0
        for word in sentence.split():
            # "+ 1" accounts for the space that will join the words.
            if current_words and current_len + 1 + len(word) > max_chars:
                segments.append(" ".join(current_words))
                current_words, current_len = [], 0
            current_len += len(word) + (1 if current_words else 0)
            current_words.append(word)
        if current_words:
            segments.append(" ".join(current_words))
    return segments


def translate_text(text, src_lang="en", tgt_lang="fr"):
    """Translate ``text`` with the module-level MarianMT tokenizer and model.

    The text is cut into sentence-sized segments that are translated in
    small batches and re-joined with single spaces. Translating the whole
    document as one truncated sequence would silently drop everything past
    the model's maximum input length.

    Args:
        text: Source text to translate.
        src_lang: Accepted for interface compatibility but not used: the
            language pair is fixed by the checkpoint loaded at module level.
        tgt_lang: Accepted for interface compatibility but not used, for the
            same reason as ``src_lang``.

    Returns:
        The translated text, or ``""`` when ``text`` is empty, ``None`` or
        only whitespace (the model is not called in that case).
    """
    if not (text and text.strip()):
        return ""

    translated_segments = []
    segments = _split_into_segments(text)
    for start in range(0, len(segments), _BATCH_SIZE):
        batch = segments[start:start + _BATCH_SIZE]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**inputs)
        translated_segments.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )
    return " ".join(translated_segments)


def extract_text_from_pdf(pdf_file):
    """Return the text of all pages of a PDF, joined with single spaces.

    Args:
        pdf_file: Input handed to ``PyPDF2.PdfReader``; the upload endpoint
            in this module passes an ``io.BytesIO``.

    Returns:
        The concatenated page texts. Pages for which extraction yields
        nothing (empty string or ``None``) are skipped.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Run extraction exactly once per page; the result is both the filter
    # condition and the value that gets joined.
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(page_text for page_text in page_texts if page_text)


def extract_text_from_docx(docx_file):
    """Return the text of a .docx document as one space-separated string.

    Args:
        docx_file: Input handed to ``docx.Document``; the upload endpoint in
            this module passes an ``io.BytesIO``.

    Returns:
        The ``text`` of every entry in ``Document.paragraphs``, in order,
        joined with single spaces (empty paragraphs are kept).
    """
    document = docx.Document(docx_file)
    return " ".join(paragraph.text for paragraph in document.paragraphs)


@app.post("/translate/")
async def translate_file(file: UploadFile = File(...), src_lang: str = "en", tgt_lang: str = "fr"):
    """Translate the text content of an uploaded PDF or DOCX file.

    Args:
        file: The uploaded document. Its type is decided from the filename
            extension (``.pdf`` or ``.docx``, matched case-insensitively).
        src_lang: Source language code, forwarded to ``translate_text``.
        tgt_lang: Target language code, forwarded to ``translate_text``.

    Returns:
        ``{"translated_text": ...}`` on success, or ``{"error": ...}`` when
        the file extension is not supported.
    """
    # The filename can be absent on an upload; treat that as "unsupported"
    # instead of failing on ``None.endswith``.
    filename = (file.filename or "").lower()
    if filename.endswith(".pdf"):
        extract_text = extract_text_from_pdf
    elif filename.endswith(".docx"):
        extract_text = extract_text_from_docx
    else:
        return {"error": "Format non supporté. Utilise PDF ou DOCX."}

    payload = io.BytesIO(await file.read())

    # Document parsing and model inference are blocking calls; run them in a
    # worker thread so the event loop can keep serving other requests.
    text = await run_in_threadpool(extract_text, payload)
    if not text.strip():
        # Nothing extractable (e.g. an image-only document): skip the model.
        return {"translated_text": ""}

    translated_text = await run_in_threadpool(translate_text, text, src_lang, tgt_lang)
    return {"translated_text": translated_text}


@app.get("/")
def home():
    """Return the welcome message served at the API root."""
    welcome = {"message": "Bienvenue sur l'API de traduction de fichiers !"}
    return welcome