import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
from datetime import datetime
import langid
import os
import tempfile
import requests
from io import BytesIO
from pydub import AudioSegment
import speech_recognition as sr
import warnings
warnings.filterwarnings("ignore", message="Recommended: pip install sacremoses.")
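# Restrict langid to the app's three supported languages; this makes
# detection far more reliable on short inputs.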
langid.set_languages(['en', 'fr', 'sw'])
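# Maps a UI direction label to its Helsinki-NLP Opus-MT checkpoint.
# A list value means there is no direct model: translation pivots
# through English in two hops (see translate()).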
MODEL_MAP = {
    "English → Swahili": "Helsinki-NLP/opus-mt-en-sw",
    "English → French": "Helsinki-NLP/opus-mt-en-fr",
    "French → English": "Helsinki-NLP/opus-mt-fr-en",
    "French → Swahili (via English)": ["Helsinki-NLP/opus-mt-fr-en", "Helsinki-NLP/opus-mt-en-sw"]
}
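# Tone prefixes prepended to the source text before translation.
# Caveat: MarianMT checkpoints are plain seq2seq translators, not
# instruction-tuned, so these English prefixes are themselves translated
# literally rather than interpreted as style directions.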
TONE_MODIFIERS = {
    "Neutral": "",
    "Romantic": "Express this romantically: ",
    "Formal": "Translate this in a formal tone: ",
    "Casual": "Make this sound casual: "
}
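# ElevenLabs premade voice IDs used by the TTS endpoint below.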
VOICE_IDS = {
    "Rachel (Female)": "21m00Tcm4TlvDq8ikWAM",
    "Adam (Male)": "pNInz6obpgDQGcFmaJgB"
}
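# Lazy-loading cache: each checkpoint is downloaded and instantiated once,
# then reused, keyed by model name.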
loaded_models = {}
def load_model(model_name):
    if model_name not in loaded_models:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        loaded_models[model_name] = (tokenizer, model)
    return loaded_models[model_name]
def detect_language(text):
    try:
        lang, score = langid.classify(text)
        return lang
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit are not swallowed.
    except Exception:
        return "unknown"
def translate(text, direction, tone):
    detected_lang = detect_language(text)
    expected_src = direction.split(" → ")[0].lower()
    warning = ""
    if expected_src.startswith("english") and detected_lang != "en":
        warning = f"⚠️ Detected language is '{detected_lang}', but you selected English as source."
    elif expected_src.startswith("french") and detected_lang != "fr":
        warning = f"⚠️ Detected language is '{detected_lang}', but you selected French as source."
    elif expected_src.startswith("swahili") and detected_lang != "sw":
        warning = f"⚠️ Detected language is '{detected_lang}', but you selected Swahili as source."
    prompt = TONE_MODIFIERS[tone] + text
    model_info = MODEL_MAP[direction]
    if isinstance(model_info, list):
        tokenizer1, model1 = load_model(model_info[0])
        encoded1 = tokenizer1(prompt, return_tensors="pt", padding=True, truncation=True)
        intermediate = model1.generate(**encoded1)
        intermediate_text = tokenizer1.decode(intermediate[0], skip_special_tokens=True)
        tokenizer2, model2 = load_model(model_info[1])
        encoded2 = tokenizer2(intermediate_text, return_tensors="pt", padding=True, truncation=True)
        final = model2.generate(**encoded2)
        translation = tokenizer2.decode(final[0], skip_special_tokens=True)
    else:
        tokenizer, model = load_model(model_info)
        encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**encoded)
        translation = tokenizer.decode(generated[0], skip_special_tokens=True)
    with open("translation_log.txt", "a", encoding="utf-8") as f:
        f.write(f"[{datetime.now()}] {direction} | Tone: {tone}\n")
        f.write(f"Input: {text}\nOutput: {translation}\n\n")
    return f"{warning}\n{translation}" if warning else translation
def tts_via_api(text, voice_choice):
    api_key = os.getenv("ELEVENLABS_API_KEY")
    voice_id = VOICE_IDS.get(voice_choice, "21m00Tcm4TlvDq8ikWAM")
    if not api_key:
        return None
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    headers = {
        "xi-api-key": api_key,
        "Content-Type": "application/json",
        "accept": "audio/mpeg"
    }
    payload = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75
        }
    }
    response = requests.post(url, headers=headers, json=payload)
    if response.status_code == 200:
        mp3_audio = BytesIO(response.content)
        audio = AudioSegment.from_file(mp3_audio, format="mp3")
        # gr.Audio expects a file path (or a sample-rate/array tuple), not raw
        # bytes, so export the converted WAV to a temp file and return its path.
        wav_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        audio.export(wav_path, format="wav")
        return wav_path
    else:
        print("TTS API Error:", response.status_code, response.text)
        return None
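# Transcribe a recorded WAV via speech_recognition's Google Web Speech
# backend, then reuse translate() on the transcript.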
def transcribe_and_translate(audio_path, direction, tone):
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_path) as source:
            audio = recognizer.record(source)
        if len(audio.frame_data) < 10000:
            return "⚠️ Audio too short or empty. Please try again."
        text = recognizer.recognize_google(audio)
        return translate(text, direction, tone)
    except Exception as e:
        return f"⚠️ Could not transcribe audio: {e}"
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🌍 EAC Translator")
    gr.Markdown("Supports English, French, and Swahili. Includes tone control, language detection, voice input, and speech playback.")
    with gr.Tabs():
        with gr.Tab("📝 Text Translation"):
            input_text = gr.Textbox(label="Text to Translate", lines=3)
            direction = gr.Dropdown(choices=list(MODEL_MAP.keys()), label="Translation Direction", value="English → Swahili")
            tone = gr.Radio(choices=list(TONE_MODIFIERS.keys()), label="Tone", value="Neutral")
            output_text = gr.Textbox(label="Translated Text", lines=3)
            voice_select = gr.Dropdown(choices=list(VOICE_IDS.keys()), label="Voice", value="Rachel (Female)")
            translate_btn = gr.Button("Translate")
            speak_btn = gr.Button("🔊 Speak Translation")
            audio_output = gr.Audio(label="Playback", interactive=False)
        with gr.Tab("🎙️ Voice Translation"):
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak Now")
            direction_voice = gr.Dropdown(choices=list(MODEL_MAP.keys()), label="Translation Direction", value="English → Swahili")
            tone_voice = gr.Radio(choices=list(TONE_MODIFIERS.keys()), label="Tone", value="Neutral")
            voice_output = gr.Textbox(label="Translated Text")
            voice_select2 = gr.Dropdown(choices=list(VOICE_IDS.keys()), label="Voice", value="Rachel (Female)")
            voice_translate_btn = gr.Button("Transcribe & Translate")
            voice_speak_btn = gr.Button("🔊 Speak Translation")
            audio_output2 = gr.Audio(label="Playback", interactive=False)
    translate_btn.click(fn=translate, inputs=[input_text, direction, tone], outputs=output_text)
    speak_btn.click(fn=tts_via_api, inputs=[output_text, voice_select], outputs=audio_output)
    voice_translate_btn.click(fn=transcribe_and_translate, inputs=[audio_input, direction_voice, tone_voice], outputs=voice_output)
    voice_speak_btn.click(fn=tts_via_api, inputs=[voice_output, voice_select2], outputs=audio_output2)
    gr.Markdown(
        """<div style='text-align: center;'>
        <a href='https://eng-jobbers.vercel.app/' target='_blank' style='text-decoration: none; font-weight: bold;'>
        Built with ❤️ by Eng. Jobbers – Qtrinova Inc
        </a>
        </div>""",
        elem_id="footer"
    )
demo.launch()