Athspi committed on
Commit
9e7d27b
·
verified ·
1 Parent(s): e51d62b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -150
app.py CHANGED
@@ -1,186 +1,152 @@
1
  import os
2
  import tempfile
3
- import wave
4
  import numpy as np
5
  import soundfile as sf
 
6
 
7
  from flask import Flask, request, jsonify, send_file, send_from_directory
8
  from flask_cors import CORS
9
- from werkzeug.utils import secure_filename
10
-
11
- from gtts import gTTS, lang
12
- from kokoro import KPipeline
13
-
14
  import google.generativeai as genai
15
- from google.generativeai.types import (
16
- GenerateContentConfig,
17
- SpeechConfig,
18
- VoiceConfig,
19
- PrebuiltVoiceConfig,
20
- )
21
-
22
- # -----------------------------------------------------------------------------
23
- # Configuration
24
- # -----------------------------------------------------------------------------
25
-
26
- # 1) Make sure you've run:
27
- # pip install --upgrade google-generativeai gTTS soundfile kokoro flask flask-cors werkzeug
28
- #
29
- # 2) Set your Gemini API key in the environment:
30
- # export GEMINI_API_KEY="your_real_api_key_here"
31
 
 
 
 
 
 
32
# Gemini credentials come from the environment; the module refuses to load
# without them.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("GEMINI_API_KEY environment variable not set")

genai.configure(api_key=GEMINI_API_KEY)
# NOTE(review): `Client` looks like it belongs to the newer `google-genai`
# SDK rather than `google.generativeai` as imported here - confirm.
client = genai.Client(api_key=GEMINI_API_KEY)

# Kokoro language table: display name -> single-letter Kokoro language code.
KOKORO_LANGUAGES = dict(
    (
        ("American English", "a"),
        ("British English", "b"),
        ("Mandarin Chinese", "z"),
        ("Spanish", "e"),
        ("French", "f"),
        ("Hindi", "h"),
        ("Italian", "i"),
        ("Brazilian Portuguese", "p"),
    )
)

# gTTS language table as reported by gTTS itself, plus an explicit Japanese entry.
GTTS_LANGUAGES = lang.tts_langs()
GTTS_LANGUAGES["ja"] = "Japanese"

# Every language name either engine can speak, de-duplicated and sorted.
SUPPORTED_LANGUAGES = sorted({*KOKORO_LANGUAGES, *GTTS_LANGUAGES.values()})

# Voice name for Gemini TTS preview
GEMINI_VOICE_NAME = "Kore"
59
-
60
- # -----------------------------------------------------------------------------
61
- # Helpers
62
- # -----------------------------------------------------------------------------
63
-
64
def wave_file(filename: str, pcm: bytes, channels=1, rate=24000, sample_width=2):
    """Store raw PCM bytes as an uncompressed WAV file at *filename*.

    Args:
        filename: Destination path; an existing file is overwritten.
        pcm: Raw PCM frame data written verbatim as the audio payload.
        channels: Number of audio channels recorded in the header.
        rate: Sampling rate in Hz recorded in the header.
        sample_width: Bytes per sample recorded in the header.
    """
    # nframes is left at 0; the wave module fixes the header up from the
    # amount of data actually written.
    header = (channels, sample_width, rate, 0, "NONE", "not compressed")
    with wave.open(filename, "wb") as writer:
        writer.setparams(header)
        writer.writeframes(pcm)
71
-
72
- # -----------------------------------------------------------------------------
73
- # Flask App
74
- # -----------------------------------------------------------------------------
75
-
76
# The WSGI application; static assets are served from ./static.
app = Flask(import_name=__name__, static_folder="static")
# Enable cross-origin requests with flask-cors defaults.
CORS(app)
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
@app.route("/")
def serve_index():
    """Serve ``index.html`` from the application's static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, "index.html")
84
-
85
-
86
@app.route("/languages")
def list_languages():
    """Return the module-level ``SUPPORTED_LANGUAGES`` list as a JSON response."""
    payload = jsonify(SUPPORTED_LANGUAGES)
    return payload
89
 
 
 
 
90
 
91
def _new_tts_output_path(prefix, suffix):
    """Create a uniquely named empty file in the temp directory and return its path."""
    fd, path = tempfile.mkstemp(prefix=prefix, suffix=suffix)
    # Only the path is needed; the audio writers reopen the file by name.
    os.close(fd)
    return path


@app.route("/translate", methods=["POST"])
def translate_audio():
    """Transcribe an uploaded audio clip, translate it, and synthesise speech.

    Expects a multipart POST with an ``audio`` file and an optional
    ``language`` form field (defaults to ``"English"``).

    Speech is produced by Gemini TTS when possible. If that call fails, the
    failure is logged and the handler falls back to Kokoro (for languages in
    ``KOKORO_LANGUAGES``) or gTTS (everything else). Each request writes to
    its own uniquely named file in the temp directory, so concurrent requests
    cannot overwrite one another's audio.

    Returns:
        JSON with ``transcription``, ``translation`` and ``audio_url`` on
        success; ``{"error": ...}`` with status 400 for a bad upload or 500
        for any processing failure.
    """
    try:
        # 1. Receive file + target language
        if "audio" not in request.files:
            return jsonify(error="No audio file uploaded"), 400

        audio_file = request.files["audio"]
        target_lang = request.form.get("language", "English")

        if not audio_file or audio_file.filename == "":
            return jsonify(error="Invalid audio file"), 400

        # 2. Validate MIME type
        if audio_file.mimetype not in ("audio/wav", "audio/mpeg", "audio/mp4", "audio/webm"):
            return jsonify(error=f"Unsupported file type: {audio_file.mimetype}"), 400

        # 3. Transcribe with Gemini
        model = genai.GenerativeModel("gemini-2.0-flash")
        blob = {"mime_type": audio_file.mimetype, "data": audio_file.read()}

        convo = model.start_chat()
        convo.send_message(
            "You are a professional transcriber. Transcribe this audio accurately, verbatim."
        )
        resp = convo.send_message(blob)
        transcription = resp.text.strip()

        # 4. Translate with Gemini
        prompt = f"Translate the following text to {target_lang}, preserving meaning and cultural nuances:\n\n{transcription}"
        translation_resp = model.generate_content(prompt)
        translated_text = translation_resp.text.strip()

        # 5. Try Gemini TTS 2.5 preview
        try:
            tts_resp = client.models.generate_content(
                model="gemini-2.5-flash-preview-tts",
                contents=translated_text,
                config=GenerateContentConfig(
                    response_modalities=["AUDIO"],
                    speech_config=SpeechConfig(
                        voice_config=VoiceConfig(
                            prebuilt_voice_config=PrebuiltVoiceConfig(
                                voice_name=GEMINI_VOICE_NAME
                            )
                        )
                    ),
                ),
            )
            pcm_data = tts_resp.candidates[0].content.parts[0].inline_data.data
            out_path = _new_tts_output_path("tts_gemini_", ".wav")
            wave_file(out_path, pcm_data)

        except Exception:
            # Deliberate best-effort fallback, but record why Gemini TTS was skipped.
            app.logger.warning(
                "Gemini TTS failed; falling back to local TTS", exc_info=True
            )

            # Fallback: Kokoro
            if target_lang in KOKORO_LANGUAGES:
                code = KOKORO_LANGUAGES[target_lang]
                pipeline = KPipeline(lang_code=code)
                generator = pipeline(translated_text, voice="af_heart", speed=1)

                segments = [audio for _, _, audio in generator if audio is not None]
                if not segments:
                    raise RuntimeError("Kokoro produced no audio")
                arr = np.concatenate(segments)
                out_path = _new_tts_output_path("tts_kokoro_", ".wav")
                sf.write(out_path, arr, 24000)

            # Final fallback: gTTS
            else:
                gtts_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_lang), "en")
                tts = gTTS(translated_text, lang=gtts_code)
                out_path = _new_tts_output_path("tts_gtts_", ".mp3")
                tts.save(out_path)

        return jsonify(
            transcription=transcription,
            translation=translated_text,
            audio_url=f"/download/{os.path.basename(out_path)}",
        )

    except Exception as e:
        app.logger.exception("Error in /translate")
        return jsonify(error=str(e)), 500
175
 
 
 
 
 
 
 
 
176
 
177
@app.route("/download/<filename>")
def download_file(filename):
    """Send a previously generated audio file from the temp directory.

    Args:
        filename: Base name of the file, as handed out in ``audio_url`` by
            the ``/translate`` endpoint.

    Returns:
        The file as an attachment named ``translated_<filename>``, or a JSON
        ``{"error": "File not found"}`` with status 404 when the name is not
        a plain file name or no such file exists.
    """
    # Untrusted input: reject any name that secure_filename would rewrite
    # (path separators, leading dots, etc.) instead of joining it to a path.
    if secure_filename(filename) != filename:
        return jsonify(error="File not found"), 404

    path = os.path.join(tempfile.gettempdir(), filename)
    if not os.path.isfile(path):
        return jsonify(error="File not found"), 404
    return send_file(path, as_attachment=True, download_name=f"translated_{filename}")
 
 
 
183
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
if __name__ == "__main__":
    # Script entry point: listen on all interfaces, port 7860.
    app.run(port=7860, host="0.0.0.0")
 
1
  import os
2
  import tempfile
 
3
  import numpy as np
4
  import soundfile as sf
5
+ import wave
6
 
7
  from flask import Flask, request, jsonify, send_file, send_from_directory
8
  from flask_cors import CORS
 
 
 
 
 
9
  import google.generativeai as genai
10
+ from google.generativeai import types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
# Flask application; static assets are served from ./static and
# cross-origin requests are enabled with flask-cors defaults.
app = Flask(import_name=__name__, static_folder='static')
CORS(app)

# Gemini credentials come from the environment; the module refuses to load
# without them.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set")

genai.configure(api_key=GEMINI_API_KEY)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
# Display name -> BCP-47 language tag for each language offered to clients.
# Insertion order is the order in which the names are listed.
SUPPORTED_LANGUAGES = dict(
    (
        ("Arabic (Egyptian)", "ar-EG"),
        ("German (Germany)", "de-DE"),
        ("English (US)", "en-US"),
        ("Spanish (US)", "es-US"),
        ("French (France)", "fr-FR"),
        ("Hindi (India)", "hi-IN"),
        ("Indonesian (Indonesia)", "id-ID"),
        ("Italian (Italy)", "it-IT"),
        ("Japanese (Japan)", "ja-JP"),
        ("Korean (Korea)", "ko-KR"),
        ("Portuguese (Brazil)", "pt-BR"),
        ("Russian (Russia)", "ru-RU"),
        ("Dutch (Netherlands)", "nl-NL"),
        ("Polish (Poland)", "pl-PL"),
        ("Thai (Thailand)", "th-TH"),
        ("Turkish (Turkey)", "tr-TR"),
        ("Vietnamese (Vietnam)", "vi-VN"),
        ("Romanian (Romania)", "ro-RO"),
        ("Ukrainian (Ukraine)", "uk-UA"),
        ("Bengali (Bangladesh)", "bn-BD"),
        ("English (India)", "en-IN"),
        ("Marathi (India)", "mr-IN"),
        ("Tamil (India)", "ta-IN"),
        ("Telugu (India)", "te-IN"),
    )
)
49
 
50
@app.route('/')
def serve_index():
    """Serve ``index.html`` from the application's static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, 'index.html')
 
 
 
 
 
 
53
 
54
@app.route('/languages')
def get_languages():
    """Return the display names of all supported languages as a JSON array."""
    language_names = [name for name in SUPPORTED_LANGUAGES]
    return jsonify(language_names)
57
 
58
@app.route('/translate', methods=['POST'])
def translate_audio():
    """Transcribe an uploaded audio clip, translate it, and synthesise speech.

    Expects a multipart POST with an ``audio`` file and an optional
    ``language`` form field (defaults to ``'English (US)'``). The clip is
    transcribed and translated with ``gemini-2.0-flash``, then spoken with
    ``gemini-2.5-flash-preview-tts``. The returned PCM is written as a mono,
    16-bit, 24 kHz WAV file in the temp directory.

    Returns:
        JSON with ``transcription``, ``translation`` and ``audio_url`` on
        success; ``{'error': ...}`` with status 400 for a bad upload or 500
        for any processing failure.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file uploaded'}), 400

        audio_file = request.files['audio']
        target_language = request.form.get('language', 'English (US)')

        if not audio_file or audio_file.filename == '':
            return jsonify({'error': 'Invalid audio file'}), 400

        # Validate MIME type
        allowed_mime_types = ('audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm')
        if audio_file.mimetype not in allowed_mime_types:
            return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400

        # Read audio data
        audio_data = audio_file.read()

        # Transcribe audio using Gemini
        model = genai.GenerativeModel("gemini-2.0-flash")
        audio_blob = {
            'mime_type': audio_file.mimetype,
            'data': audio_data,
        }

        convo = model.start_chat()
        convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
        transcription = convo.send_message(audio_blob).text.strip()

        # Translate text using Gemini
        prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
        translated_text = model.generate_content(prompt).text.strip()

        # Generate TTS using Gemini
        # NOTE(review): `genai.Client` and the `types.*Config` classes look
        # like they belong to the newer `google-genai` SDK rather than
        # `google.generativeai` as imported by this file - confirm which SDK
        # is installed.
        client = genai.Client(api_key=GEMINI_API_KEY)
        tts_response = client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=translated_text,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name='Kore'  # You can change the voice as needed
                        )
                    )
                ),
            ),
        )

        # Extract audio data
        audio_output = tts_response.candidates[0].content.parts[0].inline_data.data

        # Save audio to a uniquely named temporary file. mkstemp returns an
        # open descriptor; close it right away so it is not leaked on every
        # request - wave.open() reopens the file by path.
        temp_fd, temp_output_path = tempfile.mkstemp(suffix=".wav")
        os.close(temp_fd)
        with wave.open(temp_output_path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(24000)
            wf.writeframes(audio_output)

        return jsonify({
            'transcription': transcription,
            'translation': translated_text,
            'audio_url': f'/download/{os.path.basename(temp_output_path)}'
        })

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        app.logger.exception(f"Error processing request: {str(e)}")
        return jsonify({'error': str(e)}), 500
138
 
139
@app.route('/download/<filename>')
def download_file(filename):
    """Send a previously generated WAV file from the temp directory.

    Args:
        filename: Base name of the file, as handed out in ``audio_url`` by
            the ``/translate`` endpoint.

    Returns:
        The file as an ``audio/wav`` attachment named
        ``translated_<filename>``, or a JSON ``{'error': 'File not found'}``
        with status 404 when the name is not a plain ``.wav`` file name or no
        such file exists.
    """
    # Untrusted input: accept only a bare file name with the .wav suffix that
    # /translate produces, so "..", directories and unrelated temp files are
    # never served.
    if filename != os.path.basename(filename) or not filename.endswith('.wav'):
        return jsonify({'error': 'File not found'}), 404

    file_path = os.path.join(tempfile.gettempdir(), filename)
    if not os.path.isfile(file_path):
        return jsonify({'error': 'File not found'}), 404

    try:
        return send_file(
            file_path,
            mimetype="audio/wav",
            as_attachment=True,
            download_name=f"translated_{filename}"
        )
    except FileNotFoundError:
        # The file can disappear between the check above and the send.
        return jsonify({'error': 'File not found'}), 404
150
 
151
if __name__ == '__main__':
    # Script entry point: listen on all interfaces, port 7860.
    app.run(port=7860, host="0.0.0.0")