"""FastAPI service that translates the text of uploaded PDF/DOCX files with MarianMT."""

import functools
import io
import logging
import re

import docx
import PyPDF2
from fastapi import FastAPI, File, UploadFile
from fastapi.concurrency import run_in_threadpool
from transformers import MarianMTModel, MarianTokenizer

logger = logging.getLogger(__name__)

app = FastAPI()

# Load the default MarianMT translation model once at import time
# (e.g. English -> French).
MODEL_NAME = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)

# Language pair served by the module-level ``tokenizer`` / ``model``.
_DEFAULT_PAIR = ("en", "fr")
# Language codes are interpolated into a model identifier, so they are
# restricted to short alphabetic codes before being used.
_LANG_CODE_RE = re.compile(r"[a-z]{2,3}")
# A sentence ends at ., ! or ? followed by whitespace.
_SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.!?])\s+")
# Upper bound on segment length, chosen to keep each segment comfortably
# below the model's maximum input length so nothing is truncated.
_MAX_SEGMENT_CHARS = 400
# Number of segments sent to the model per ``generate`` call.
_BATCH_SIZE = 8


class UnsupportedLanguagePairError(ValueError):
    """Raised when no translation model is available for a language pair."""


def _normalize_lang(code):
    """Return *code* stripped and lower-cased, or raise if it is not a valid code."""
    normalized = str(code).strip().lower()
    if not _LANG_CODE_RE.fullmatch(normalized):
        raise UnsupportedLanguagePairError(f"Code de langue invalide : {code!r}.")
    return normalized


@functools.lru_cache(maxsize=4)
def _load_model(src_lang, tgt_lang):
    """Return the ``(tokenizer, model)`` pair for translating src_lang -> tgt_lang.

    The default pair reuses the objects loaded at import time; any other pair
    is loaded on first use and kept in a small bounded cache.
    """
    if (src_lang, tgt_lang) == _DEFAULT_PAIR:
        return tokenizer, model
    name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    return MarianTokenizer.from_pretrained(name), MarianMTModel.from_pretrained(name)


def _split_into_segments(text, max_chars=_MAX_SEGMENT_CHARS):
    """Split *text* into sentence-like segments of at most *max_chars* characters."""
    if not text:
        return []
    segments = []
    # Collapse all whitespace runs first so segment lengths are predictable.
    normalized = " ".join(text.split())
    for sentence in _SENTENCE_BOUNDARY_RE.split(normalized):
        # Over-long sentences are cut at the last space that fits, or hard-cut
        # when there is no space at all.
        while len(sentence) > max_chars:
            cut = sentence.rfind(" ", 0, max_chars + 1)
            if cut <= 0:
                cut = max_chars
            segments.append(sentence[:cut])
            sentence = sentence[cut:].lstrip()
        if sentence:
            segments.append(sentence)
    return segments


def translate_text(text, src_lang="en", tgt_lang="fr"):
    """Translate *text* from *src_lang* to *tgt_lang* and return the result.

    The text is split into sentence-sized segments that are translated in
    batches, so long documents are translated in full instead of being cut
    off at the model's maximum input length.

    Args:
        text: Text to translate. Empty or whitespace-only text yields "".
        src_lang: Source language code (default "en").
        tgt_lang: Target language code (default "fr").

    Returns:
        The translated segments joined with single spaces.

    Raises:
        UnsupportedLanguagePairError: If a language code is malformed or no
            model could be loaded for the requested pair.
    """
    src = _normalize_lang(src_lang)
    tgt = _normalize_lang(tgt_lang)

    segments = _split_into_segments(text)
    if not segments:
        return ""

    try:
        pair_tokenizer, pair_model = _load_model(src, tgt)
    except OSError as err:
        # ``from_pretrained`` raises OSError when the checkpoint is unavailable.
        raise UnsupportedLanguagePairError(
            f"Paire de langues non supportée : {src} -> {tgt}."
        ) from err

    translated = []
    for start in range(0, len(segments), _BATCH_SIZE):
        batch = segments[start:start + _BATCH_SIZE]
        # ``truncation=True`` stays as a safety net; segments are already short.
        inputs = pair_tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True
        )
        generated = pair_model.generate(**inputs)
        translated.extend(
            pair_tokenizer.batch_decode(generated, skip_special_tokens=True)
        )
    return " ".join(translated)


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, joined with spaces.

    Pages for which no text is extracted are skipped.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Extract each page once; extraction is the expensive step.
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(page_text for page_text in page_texts if page_text)


def extract_text_from_docx(docx_file):
    """Return the text of every paragraph of *docx_file*, joined with spaces."""
    doc = docx.Document(docx_file)
    return " ".join(paragraph.text for paragraph in doc.paragraphs)


@app.post("/translate/")
async def translate_file(
    file: UploadFile = File(...), src_lang: str = "en", tgt_lang: str = "fr"
):
    """Translate the text content of an uploaded PDF or DOCX file.

    Returns ``{"translated_text": ...}`` on success, or ``{"error": ...}``
    when the format is unsupported, the file cannot be parsed, or the
    requested language pair is unavailable.
    """
    # The filename may be missing, and the extension may be in any case.
    filename = (file.filename or "").lower()
    if filename.endswith(".pdf"):
        extractor = extract_text_from_pdf
    elif filename.endswith(".docx"):
        extractor = extract_text_from_docx
    else:
        return {"error": "Format non supporté. Utilise PDF ou DOCX."}

    payload = await file.read()

    # Parsing and model inference are CPU-bound; run them in the threadpool so
    # the event loop stays free to serve other requests.
    try:
        text = await run_in_threadpool(extractor, io.BytesIO(payload))
    except Exception:
        # Request boundary: a malformed upload must not crash the handler.
        logger.exception("Failed to extract text from %r", file.filename)
        return {"error": "Impossible de lire le fichier. Vérifie qu'il n'est pas corrompu."}

    try:
        translated_text = await run_in_threadpool(
            translate_text, text, src_lang, tgt_lang
        )
    except UnsupportedLanguagePairError as err:
        return {"error": str(err)}
    return {"translated_text": translated_text}


@app.get("/")
def home():
    """Return a welcome message for the API root."""
    return {"message": "Bienvenue sur l'API de traduction de fichiers !"}