Athspi committed on
Commit
9dbf879
·
verified ·
1 Parent(s): 413a70d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -21
app.py CHANGED
@@ -4,6 +4,8 @@ from faster_whisper import WhisperModel
4
  import google.generativeai as genai
5
  from gtts import gTTS, lang
6
  import tempfile
 
 
7
 
8
  # Configure Gemini API (use environment variable for Hugging Face Spaces)
9
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
@@ -19,6 +21,19 @@ except ValueError:
19
  print("Float16 not supported, falling back to int8 on CPU")
20
  whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # Function to transcribe audio using faster-whisper
23
  def transcribe_audio(audio_file):
24
  try:
@@ -40,20 +55,36 @@ def translate_text(text, target_language):
40
  except Exception as e:
41
  return None, f"Translation error: {str(e)}"
42
 
43
- # Function to convert text to speech using gTTS with full language support
44
- def text_to_speech(text, language):
45
  try:
46
- lang_map = lang.tts_langs()
47
- tts_lang = language.lower() if language.lower() in lang_map else "en"
48
- tts = gTTS(text=text, lang=tts_lang, slow=False)
49
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
50
- tts.save(fp.name)
51
- return fp.name, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  except Exception as e:
53
  return None, f"TTS error: {str(e)}"
54
 
55
  # Main function to process audio input and return outputs
56
- def process_audio(audio_file, target_language):
57
  if audio_file is None:
58
  return "Please upload an audio file or record audio.", None, None, None
59
 
@@ -65,9 +96,7 @@ def process_audio(audio_file, target_language):
65
  if error:
66
  return error, transcription, None, None
67
 
68
- lang_map = lang.tts_langs()
69
- lang_key = next((k for k, v in lang_map.items() if v.lower() == target_language.lower()), "en")
70
- audio_output, error = text_to_speech(translated_text, lang_key)
71
  if error:
72
  return error, transcription, translated_text, None
73
 
@@ -76,18 +105,23 @@ def process_audio(audio_file, target_language):
76
  # Gradio interface
77
  with gr.Blocks(title="AI Audio Translator") as demo:
78
  gr.Markdown("# AI Audio Translator")
79
- gr.Markdown("Upload an audio file or record via microphone, select a target language, and get the transcription, translation, and translated audio!")
80
 
81
- supported_langs = {v: k for k, v in lang.tts_langs().items()}
82
- language_choices = list(supported_langs.keys())
83
 
84
  with gr.Row():
85
  audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
86
- target_lang = gr.Dropdown(
87
- choices=sorted(language_choices),
88
- value="Spanish",
89
- label="Target Language"
90
- )
 
 
 
 
 
 
91
 
92
  submit_btn = gr.Button("Translate")
93
 
@@ -99,7 +133,7 @@ with gr.Blocks(title="AI Audio Translator") as demo:
99
 
100
  submit_btn.click(
101
  fn=process_audio,
102
- inputs=[audio_input, target_lang],
103
  outputs=[error_output, transcription_output, translation_output, audio_output]
104
  )
105
 
 
4
  import google.generativeai as genai
5
  from gtts import gTTS, lang
6
  import tempfile
7
+ import soundfile as sf
8
+ from kokoro import KPipeline
9
 
10
  # Configure Gemini API (use environment variable for Hugging Face Spaces)
11
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 
21
  print("Float16 not supported, falling back to int8 on CPU")
22
  whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
23
 
24
# Language codes for Kokoro TTS.
# Maps the human-readable language name shown in the UI dropdown to the
# single-letter code that Kokoro's KPipeline expects.
KOKORO_LANGUAGES = dict([
    ("American English", "a"),
    ("British English", "b"),
    ("Japanese", "j"),
    ("Mandarin Chinese", "z"),
    ("Spanish", "e"),
    ("French", "f"),
    ("Hindi", "h"),
    ("Italian", "i"),
    ("Brazilian Portuguese", "p"),
])
36
+
37
  # Function to transcribe audio using faster-whisper
38
  def transcribe_audio(audio_file):
39
  try:
 
55
  except Exception as e:
56
  return None, f"Translation error: {str(e)}"
57
 
58
# Function to convert text to speech using Kokoro or gTTS
def text_to_speech(text, language, tts_engine):
    """Synthesize *text* as speech and return a (audio_path, error) pair.

    Uses Kokoro when ``tts_engine == "Kokoro"`` and *language* is in
    KOKORO_LANGUAGES; otherwise falls back to gTTS. Exactly one of the
    two returned values is None: on success ``(path, None)``, on failure
    ``(None, "TTS error: ...")``.
    """
    try:
        if tts_engine == "Kokoro" and language in KOKORO_LANGUAGES:
            # Use Kokoro TTS.
            # NOTE(review): constructing a KPipeline on every call is
            # expensive — consider caching one pipeline per lang_code.
            lang_code = KOKORO_LANGUAGES[language]
            pipeline = KPipeline(lang_code=lang_code)
            generator = pipeline(text, voice="af_heart", speed=1, split_pattern=r'\n+')
            # Take only the FIRST generated segment for simplicity
            # (short inputs typically produce a single segment).
            first_segment = next(iter(generator), None)
            if first_segment is None:
                raise ValueError("No audio generated by Kokoro")
            _graphemes, _phonemes, audio_data = first_segment
            # Create the temp file, close it, THEN write: writing through a
            # second handle while the NamedTemporaryFile is still open
            # fails on Windows due to file locking.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
                out_path = fp.name
            sf.write(out_path, audio_data, 24000)  # Kokoro emits 24 kHz audio
            return out_path, None
        else:
            # Fallback to gTTS: map the human-readable language name to a
            # gTTS language code, defaulting to English when unknown.
            lang_map = lang.tts_langs()
            tts_lang = next((k for k, v in lang_map.items() if v.lower() == language.lower()), "en")
            tts = gTTS(text=text, lang=tts_lang, slow=False)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
                out_path = fp.name
            tts.save(out_path)
            return out_path, None
    except Exception as e:
        return None, f"TTS error: {str(e)}"
85
 
86
  # Main function to process audio input and return outputs
87
+ def process_audio(audio_file, target_language, tts_engine):
88
  if audio_file is None:
89
  return "Please upload an audio file or record audio.", None, None, None
90
 
 
96
  if error:
97
  return error, transcription, None, None
98
 
99
+ audio_output, error = text_to_speech(translated_text, target_language, tts_engine)
 
 
100
  if error:
101
  return error, transcription, translated_text, None
102
 
 
105
  # Gradio interface
106
  with gr.Blocks(title="AI Audio Translator") as demo:
107
  gr.Markdown("# AI Audio Translator")
108
+ gr.Markdown("Upload an audio file or record via microphone, select a target language and TTS engine, and get the transcription, translation, and translated audio!")
109
 
110
+ supported_langs = list(set(list(KOKORO_LANGUAGES.keys()) + list({v: k for k, v in lang.tts_langs().items()}.keys())))
 
111
 
112
  with gr.Row():
113
  audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio")
114
+ with gr.Column():
115
+ target_lang = gr.Dropdown(
116
+ choices=sorted(supported_langs),
117
+ value="Spanish",
118
+ label="Target Language"
119
+ )
120
+ tts_engine = gr.Radio(
121
+ choices=["Kokoro", "gTTS"],
122
+ value="gTTS",
123
+ label="Text-to-Speech Engine"
124
+ )
125
 
126
  submit_btn = gr.Button("Translate")
127
 
 
133
 
134
  submit_btn.click(
135
  fn=process_audio,
136
+ inputs=[audio_input, target_lang, tts_engine],
137
  outputs=[error_output, transcription_output, translation_output, audio_output]
138
  )
139