Athspi committed on
Commit
e51d62b
·
verified ·
1 Parent(s): 280b5d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -92
app.py CHANGED
@@ -12,140 +12,175 @@ from gtts import gTTS, lang
12
  from kokoro import KPipeline
13
 
14
  import google.generativeai as genai
15
- from google.generativeai.types import GenerateContentConfig, SpeechConfig, VoiceConfig, PrebuiltVoiceConfig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- # Load API key
18
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
19
  if not GEMINI_API_KEY:
20
- raise ValueError("GEMINI_API_KEY environment variable not set")
21
 
22
  genai.configure(api_key=GEMINI_API_KEY)
 
23
 
24
- # Flask app setup
25
- app = Flask(__name__, static_folder='static')
26
- CORS(app)
27
-
28
- # Supported languages
29
  KOKORO_LANGUAGES = {
30
- "American English": "a", "British English": "b", "Mandarin Chinese": "z",
31
- "Spanish": "e", "French": "f", "Hindi": "h", "Italian": "i", "Brazilian Portuguese": "p"
 
 
 
 
 
 
32
  }
33
  GTTS_LANGUAGES = lang.tts_langs()
34
- GTTS_LANGUAGES['ja'] = 'Japanese'
35
- SUPPORTED_LANGUAGES = sorted(set(KOKORO_LANGUAGES.keys()) | set(GTTS_LANGUAGES.values()))
 
 
 
36
 
37
- # Voice name for Gemini TTS
38
- DEFAULT_GEMINI_VOICE = "Kore"
39
 
40
- def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
 
 
 
 
 
41
  with wave.open(filename, "wb") as wf:
42
  wf.setnchannels(channels)
43
  wf.setsampwidth(sample_width)
44
  wf.setframerate(rate)
45
  wf.writeframes(pcm)
46
 
47
- @app.route('/')
 
 
 
 
 
 
 
 
48
  def serve_index():
49
- return send_from_directory(app.static_folder, 'index.html')
 
50
 
51
- @app.route('/languages')
52
- def get_languages():
 
53
  return jsonify(SUPPORTED_LANGUAGES)
54
 
55
- @app.route('/translate', methods=['POST'])
 
56
  def translate_audio():
57
  try:
58
- if 'audio' not in request.files:
59
- return jsonify({'error': 'No audio file uploaded'}), 400
 
60
 
61
- audio_file = request.files['audio']
62
- target_language = request.form.get('language', 'English')
63
 
64
- if not audio_file or audio_file.filename == '':
65
- return jsonify({'error': 'Invalid audio file'}), 400
66
 
67
- allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
68
- if audio_file.mimetype not in allowed_mime_types:
69
- return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
70
 
71
- # Transcribe audio with Gemini
72
- model = genai.GenerativeModel("models/gemini-1.5-flash")
73
- audio_blob = {
74
- 'mime_type': audio_file.mimetype,
75
- 'data': audio_file.read()
76
- }
77
 
78
  convo = model.start_chat()
79
- convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim.")
80
- response = convo.send_message(audio_blob)
81
- transcription = response.text.strip()
 
 
82
 
83
- # Translate
84
- prompt = f"Translate the following text to {target_language}:\n\n{transcription}"
85
- translation_response = model.generate_content(prompt)
86
- translated_text = translation_response.text.strip()
87
 
88
- # Try Gemini 2.5 TTS
89
  try:
90
- tts_response = genai.generate_content(
91
- model="models/gemini-2.5-flash-preview-tts",
92
  contents=translated_text,
93
- generation_config=GenerateContentConfig(
94
- response_mime_type="audio/wav"
 
 
 
 
 
 
 
95
  ),
96
- speech_config=SpeechConfig(
97
- voice_config=VoiceConfig(
98
- prebuilt_voice=PrebuiltVoiceConfig(voice_name=DEFAULT_GEMINI_VOICE)
99
- )
100
- )
101
  )
102
-
103
- data = tts_response.candidates[0].content.parts[0].inline_data.data
104
- temp_output_path = os.path.join(tempfile.gettempdir(), "tts_gemini.wav")
105
- wave_file(temp_output_path, data)
106
-
107
- except Exception as gemini_tts_error:
108
- app.logger.warning(f"Gemini TTS failed: {gemini_tts_error}")
109
- # Fallback to Kokoro or gTTS
110
- if target_language in KOKORO_LANGUAGES:
111
- lang_code = KOKORO_LANGUAGES[target_language]
112
- pipeline = KPipeline(lang_code=lang_code)
113
  generator = pipeline(translated_text, voice="af_heart", speed=1)
114
 
115
- audio_segments = [audio for _, _, audio in generator if audio is not None]
116
- if audio_segments:
117
- audio_data = np.concatenate(audio_segments)
118
- temp_output_path = os.path.join(tempfile.gettempdir(), "tts_kokoro.wav")
119
- sf.write(temp_output_path, audio_data, 24000)
120
  else:
121
- raise ValueError("No audio generated by Kokoro.")
122
- else:
123
- lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
124
- tts = gTTS(translated_text, lang=lang_code)
125
- temp_output_path = os.path.join(tempfile.gettempdir(), "tts_gtts.mp3")
126
- tts.save(temp_output_path)
127
 
128
- return jsonify({
129
- 'transcription': transcription,
130
- 'translation': translated_text,
131
- 'audio_url': f'/download/{os.path.basename(temp_output_path)}'
132
- })
 
 
 
 
 
 
 
133
 
134
  except Exception as e:
135
- app.logger.error(f"Processing error: {str(e)}")
136
- return jsonify({'error': str(e)}), 500
 
137
 
138
- @app.route('/download/<filename>')
139
  def download_file(filename):
140
- try:
141
- return send_file(
142
- os.path.join(tempfile.gettempdir(), filename),
143
- mimetype="audio/mpeg",
144
- as_attachment=True,
145
- download_name=f"translated_{filename}"
146
- )
147
- except FileNotFoundError:
148
- return jsonify({'error': 'File not found'}), 404
149
 
150
- if __name__ == '__main__':
151
  app.run(host="0.0.0.0", port=7860)
 
12
  from kokoro import KPipeline
13
 
14
  import google.generativeai as genai
15
+ from google.generativeai.types import (
16
+ GenerateContentConfig,
17
+ SpeechConfig,
18
+ VoiceConfig,
19
+ PrebuiltVoiceConfig,
20
+ )
21
+
22
+ # -----------------------------------------------------------------------------
23
+ # Configuration
24
+ # -----------------------------------------------------------------------------
25
+
26
+ # 1) Make sure you've run:
27
+ # pip install --upgrade google-generativeai gTTS soundfile kokoro flask flask-cors werkzeug
28
+ #
29
+ # 2) Set your Gemini API key in the environment:
30
+ # export GEMINI_API_KEY="your_real_api_key_here"
31
 
 
# Fail fast at import time when the Gemini credential is missing.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("GEMINI_API_KEY environment variable not set")

genai.configure(api_key=GEMINI_API_KEY)
# NOTE(review): `genai` is imported here as `google.generativeai`; `Client`
# looks like it belongs to the newer `google-genai` SDK (`from google import
# genai`). Confirm the installed package actually exposes `genai.Client`.
client = genai.Client(api_key=GEMINI_API_KEY)

# Display name -> language code handed to `KPipeline(lang_code=...)`.
KOKORO_LANGUAGES = {
    "American English": "a", "British English": "b",
    "Mandarin Chinese": "z", "Spanish": "e",
    "French": "f", "Hindi": "h",
    "Italian": "i", "Brazilian Portuguese": "p",
}

# gTTS language code -> display name, with Japanese registered explicitly.
GTTS_LANGUAGES = lang.tts_langs()
GTTS_LANGUAGES.update(ja="Japanese")

# Every language name offered to clients, de-duplicated and alphabetised.
SUPPORTED_LANGUAGES = sorted({*KOKORO_LANGUAGES, *GTTS_LANGUAGES.values()})

# Prebuilt voice requested from the Gemini TTS preview model.
GEMINI_VOICE_NAME = "Kore"
59
 
60
+ # -----------------------------------------------------------------------------
61
+ # Helpers
62
+ # -----------------------------------------------------------------------------
63
+
64
def wave_file(filename: str, pcm: bytes, channels=1, rate=24000, sample_width=2):
    """Store raw PCM frames as an uncompressed WAV file at *filename*.

    Args:
        filename: Destination path; an existing file is overwritten.
        pcm: Raw PCM frame bytes, written verbatim after the header.
        channels: Number of audio channels recorded in the header.
        rate: Sampling rate in Hz recorded in the header.
        sample_width: Bytes per sample recorded in the header.
    """
    writer = wave.open(filename, "wb")
    with writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # the frame count is patched by the wave module once data is written.
        header = (channels, sample_width, rate, 0, "NONE", "not compressed")
        writer.setparams(header)
        writer.writeframes(pcm)
71
 
72
+ # -----------------------------------------------------------------------------
73
+ # Flask App
74
+ # -----------------------------------------------------------------------------
75
+
76
+ app = Flask(__name__, static_folder="static")
77
+ CORS(app)
78
+
79
+
80
@app.route("/")
def serve_index():
    """Return the front-end entry page, `index.html`, from the app's static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, "index.html")
84
 
85
+
86
@app.route("/languages")
def list_languages():
    """Return the sorted list of supported target-language names as JSON."""
    language_names = SUPPORTED_LANGUAGES
    return jsonify(language_names)
89
 
90
+
91
@app.route("/translate", methods=["POST"])
def translate_audio():
    """Transcribe an uploaded audio clip, translate it, and synthesize speech.

    Expects a multipart form with an ``audio`` file and an optional
    ``language`` field (defaults to ``"English"``).

    Returns:
        JSON with ``transcription``, ``translation`` and ``audio_url`` on
        success; ``{"error": ...}`` with status 400 for a missing, unnamed or
        unsupported upload, or status 500 for any processing failure.
    """
    # Local import keeps this block self-contained; uuid is standard library.
    import uuid

    try:
        # 1. Receive file + target language
        if "audio" not in request.files:
            return jsonify(error="No audio file uploaded"), 400

        audio_file = request.files["audio"]
        target_lang = request.form.get("language", "English")

        if not audio_file or audio_file.filename == "":
            return jsonify(error="Invalid audio file"), 400

        # 2. Validate MIME type
        if audio_file.mimetype not in ("audio/wav", "audio/mpeg", "audio/mp4", "audio/webm"):
            return jsonify(error=f"Unsupported file type: {audio_file.mimetype}"), 400

        # 3. Transcribe with Gemini
        model = genai.GenerativeModel("gemini-2.0-flash")
        blob = {"mime_type": audio_file.mimetype, "data": audio_file.read()}

        convo = model.start_chat()
        convo.send_message(
            "You are a professional transcriber. Transcribe this audio accurately, verbatim."
        )
        resp = convo.send_message(blob)
        transcription = resp.text.strip()

        # 4. Translate with Gemini
        prompt = f"Translate the following text to {target_lang}, preserving meaning and cultural nuances:\n\n{transcription}"
        translation_resp = model.generate_content(prompt)
        translated_text = translation_resp.text.strip()

        # Each request writes to its own file. Fixed names in the shared temp
        # directory let concurrent requests overwrite each other's audio
        # before the client had downloaded it.
        temp_dir = tempfile.gettempdir()
        request_id = uuid.uuid4().hex

        # 5. Try Gemini TTS 2.5 preview
        try:
            # NOTE(review): `client` is the module-level `genai.Client(...)`;
            # confirm the installed Gemini SDK provides it. If it does not,
            # every request silently lands in the fallback branch below.
            tts_resp = client.models.generate_content(
                model="gemini-2.5-flash-preview-tts",
                contents=translated_text,
                config=GenerateContentConfig(
                    response_modalities=["AUDIO"],
                    speech_config=SpeechConfig(
                        voice_config=VoiceConfig(
                            prebuilt_voice_config=PrebuiltVoiceConfig(
                                voice_name=GEMINI_VOICE_NAME
                            )
                        )
                    ),
                ),
            )
            pcm_data = tts_resp.candidates[0].content.parts[0].inline_data.data
            out_path = os.path.join(temp_dir, f"tts_gemini_{request_id}.wav")
            wave_file(out_path, pcm_data)

        except Exception:
            # Best-effort: any Gemini TTS failure falls back to a local
            # engine, but the cause is logged so it is not lost.
            app.logger.warning(
                "Gemini TTS failed; falling back to Kokoro/gTTS", exc_info=True
            )

            # Fallback: Kokoro
            if target_lang in KOKORO_LANGUAGES:
                code = KOKORO_LANGUAGES[target_lang]
                pipeline = KPipeline(lang_code=code)
                generator = pipeline(translated_text, voice="af_heart", speed=1)

                segments = [audio for _, _, audio in generator if audio is not None]
                if segments:
                    arr = np.concatenate(segments)
                    out_path = os.path.join(temp_dir, f"tts_kokoro_{request_id}.wav")
                    sf.write(out_path, arr, 24000)
                else:
                    raise RuntimeError("Kokoro produced no audio")

            # Final fallback: gTTS
            else:
                # Reverse lookup (display name -> gTTS code); unknown names
                # fall back to English.
                gtts_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_lang), "en")
                tts = gTTS(translated_text, lang=gtts_code)
                out_path = os.path.join(temp_dir, f"tts_gtts_{request_id}.mp3")
                tts.save(out_path)

        return jsonify(
            transcription=transcription,
            translation=translated_text,
            audio_url=f"/download/{os.path.basename(out_path)}",
        )

    except Exception as e:
        # Top-level boundary: log the traceback and report the message as JSON.
        app.logger.exception("Error in /translate")
        return jsonify(error=str(e)), 500
175
+
176
 
177
@app.route("/download/<filename>")
def download_file(filename):
    """Send a previously synthesized audio file as an attachment.

    Args:
        filename: Bare file name taken from the URL, as returned in the
            ``audio_url`` field of the ``/translate`` response.

    Returns:
        The file as a download named ``translated_<filename>``, or a JSON
        ``{"error": "File not found"}`` with status 404.
    """
    # The temp directory is shared with other programs, so only serve files
    # this app writes: bare names (no directory component) starting with the
    # "tts_" prefix used by /translate. Anything else is reported as missing
    # so the endpoint does not reveal what else exists on disk.
    is_bare_name = os.path.basename(filename) == filename
    if not is_bare_name or not filename.startswith("tts_"):
        return jsonify(error="File not found"), 404

    path = os.path.join(tempfile.gettempdir(), filename)
    if not os.path.isfile(path):
        return jsonify(error="File not found"), 404
    return send_file(path, as_attachment=True, download_name=f"translated_{filename}")
183
+
 
 
 
 
184
 
185
+ if __name__ == "__main__":
186
  app.run(host="0.0.0.0", port=7860)