Athspi committed on
Commit
132c026
·
verified ·
1 Parent(s): edc2fb4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -64
app.py CHANGED
@@ -1,13 +1,15 @@
1
  import os
2
- import numpy as np
3
  from flask import Flask, request, jsonify, send_file, send_from_directory
4
  import google.generativeai as genai
5
- from gtts import gTTS, lang
 
6
  import tempfile
7
- import soundfile as sf
8
- from kokoro import KPipeline
9
  from werkzeug.utils import secure_filename
10
  from flask_cors import CORS
 
11
 
12
  app = Flask(__name__, static_folder='static')
13
  CORS(app)
@@ -18,24 +20,53 @@ if not GEMINI_API_KEY:
18
  raise ValueError("GEMINI_API_KEY environment variable not set")
19
  genai.configure(api_key=GEMINI_API_KEY)
20
 
21
- # Language configurations
22
- KOKORO_LANGUAGES = {
23
- "American English": "a",
24
- "British English": "b",
25
- "Mandarin Chinese": "z",
26
- "Spanish": "e",
27
- "French": "f",
28
- "Hindi": "h",
29
- "Italian": "i",
30
- "Brazilian Portuguese": "p"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
 
33
- GTTS_LANGUAGES = lang.tts_langs()
34
- GTTS_LANGUAGES['ja'] = 'Japanese' # Explicit Japanese support
35
 
36
- SUPPORTED_LANGUAGES = sorted(
37
- list(set(list(KOKORO_LANGUAGES.keys()) + list(GTTS_LANGUAGES.values())))
38
- )
 
 
 
 
 
39
 
40
  @app.route('/')
41
  def serve_index():
@@ -52,60 +83,63 @@ def translate_audio():
52
  return jsonify({'error': 'No audio file uploaded'}), 400
53
 
54
  audio_file = request.files['audio']
55
- target_language = request.form.get('language', 'English')
56
 
57
  if not audio_file or audio_file.filename == '':
58
  return jsonify({'error': 'Invalid audio file'}), 400
59
 
60
- # Validate MIME type
61
- allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
62
  if audio_file.mimetype not in allowed_mime_types:
63
- return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
64
 
65
- # Transcribe audio using Gemini
66
- model = genai.GenerativeModel("gemini-2.0-flash")
67
 
68
- # Create proper audio blob
69
- audio_blob = {
70
- 'mime_type': audio_file.mimetype,
71
- 'data': audio_file.read()
72
- }
73
 
74
  # Get transcription
75
- convo = model.start_chat()
76
- convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
77
- response = convo.send_message(audio_blob)
 
 
 
78
  transcription = response.text.strip()
79
 
80
  # Translate text using Gemini
81
- prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
82
- response = model.generate_content(prompt)
83
  translated_text = response.text.strip()
84
 
85
- # Generate TTS
86
- if target_language in KOKORO_LANGUAGES:
87
- lang_code = KOKORO_LANGUAGES[target_language]
88
- pipeline = KPipeline(lang_code=lang_code)
89
- generator = pipeline(translated_text, voice="af_heart", speed=1)
90
-
91
- # Collect all audio segments
92
- audio_segments = []
93
- for _, _, audio in generator:
94
- if audio is not None:
95
- audio_segments.append(audio)
96
-
97
- if audio_segments:
98
- audio_data = np.concatenate(audio_segments)
99
- _, temp_output_path = tempfile.mkstemp(suffix=".wav")
100
- sf.write(temp_output_path, audio_data, 24000)
101
- else:
102
- raise ValueError("No audio generated by Kokoro")
103
- else:
104
- # Standard gTTS handling
105
- lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
106
- tts = gTTS(translated_text, lang=lang_code)
107
- _, temp_output_path = tempfile.mkstemp(suffix=".mp3")
108
- tts.save(temp_output_path)
 
 
 
109
 
110
  return jsonify({
111
  'transcription': transcription,
@@ -114,20 +148,27 @@ def translate_audio():
114
  })
115
 
116
  except Exception as e:
117
- app.logger.error(f"Error processing request: {str(e)}")
118
  return jsonify({'error': str(e)}), 500
119
 
120
  @app.route('/download/<filename>')
121
  def download_file(filename):
122
  try:
 
 
123
  return send_file(
124
- os.path.join(tempfile.gettempdir(), filename),
125
- mimetype="audio/mpeg",
126
  as_attachment=True,
127
- download_name=f"translated_{filename}"
128
  )
129
  except FileNotFoundError:
130
  return jsonify({'error': 'File not found'}), 404
 
 
 
 
131
 
132
  if __name__ == '__main__':
133
- app.run(host="0.0.0.0", port=7860)
 
 
1
  import os
2
+ # import numpy as np # No longer needed for TTS
3
  from flask import Flask, request, jsonify, send_file, send_from_directory
4
  import google.generativeai as genai
5
+ from google.generativeai import types as genai_types # For clarity if needed, or use genai.types
6
+ # from gtts import gTTS, lang # Removed
7
  import tempfile
8
+ # import soundfile as sf # Removed, using wave module instead
9
+ # from kokoro import KPipeline # Removed
10
  from werkzeug.utils import secure_filename
11
  from flask_cors import CORS
12
+ import wave # Added for saving WAV files
13
 
14
  app = Flask(__name__, static_folder='static')
15
  CORS(app)
 
20
  raise ValueError("GEMINI_API_KEY environment variable not set")
21
  genai.configure(api_key=GEMINI_API_KEY)
22
 
23
+ # Transcription and Translation Model
24
+ TRANSCRIPTION_TRANSLATION_MODEL_NAME = "gemini-2.0-flash" # Same model the previous revision hard-coded ("gemini-2.0-flash")
25
+ # Text-to-Speech Model
26
+ TTS_MODEL_NAME = "gemini-2.5-flash-preview-tts" # Preview model selected for audio (speech) output.
27
+ # NOTE: it has not been confirmed that this model accepts the
28
+ # response_mime_type="audio/wav" setting used for TTS below; verify against the SDK.
29
+
30
+ # Gemini TTS Supported Languages (Display Name: BCP-47 Code)
31
+ # Based on the user-provided list. The TTS API auto-detects language from text.
32
+ # This list is primarily for the frontend language selector.
33
+ GEMINI_TTS_LANGUAGES = {
34
+ "Arabic (Egyptian)": "ar-EG",
35
+ "German (Germany)": "de-DE",
36
+ "English (US)": "en-US",
37
+ "Spanish (US)": "es-US",
38
+ "French (France)": "fr-FR",
39
+ "Hindi (India)": "hi-IN",
40
+ "Indonesian (Indonesia)": "id-ID",
41
+ "Italian (Italy)": "it-IT",
42
+ "Japanese (Japan)": "ja-JP",
43
+ "Korean (Korea)": "ko-KR",
44
+ "Portuguese (Brazil)": "pt-BR",
45
+ "Russian (Russia)": "ru-RU",
46
+ "Dutch (Netherlands)": "nl-NL",
47
+ "Polish (Poland)": "pl-PL",
48
+ "Thai (Thailand)": "th-TH",
49
+ "Turkish (Turkey)": "tr-TR",
50
+ "Vietnamese (Vietnam)": "vi-VN",
51
+ "Romanian (Romania)": "ro-RO",
52
+ "Ukrainian (Ukraine)": "uk-UA",
53
+ "Bengali (Bangladesh)": "bn-BD",
54
+ "English (India)": "en-IN",
55
+ "Marathi (India)": "mr-IN",
56
+ "Tamil (India)": "ta-IN",
57
+ "Telugu (India)": "te-IN"
58
  }
59
 
60
+ SUPPORTED_LANGUAGES = sorted(list(GEMINI_TTS_LANGUAGES.keys()))
 
61
 
62
+ # Helper function to save PCM data as a WAV file
63
def save_wave_file(
    filename: "str | os.PathLike[str] | BinaryIO",
    pcm_data: bytes,
    channels: int = 1,
    sample_width: int = 2,
    frame_rate: int = 24000,
) -> None:
    """Write raw PCM audio to ``filename`` as an uncompressed WAV file.

    Args:
        filename: Destination. May be a string path, an ``os.PathLike``
            object (e.g. ``pathlib.Path``), or an already-open binary
            file object, which is handed to ``wave.open`` unchanged.
        pcm_data: Raw PCM sample bytes (bytes-like).
        channels: Number of interleaved channels.
        sample_width: Bytes per sample (2 means 16-bit audio).
        frame_rate: Sampling rate in frames per second.

    Raises:
        wave.Error: If the ``wave`` module rejects the given parameters.
        OSError: If the destination cannot be opened for writing.
    """
    # wave.open() only opens the target itself when it is a plain ``str``;
    # anything else is treated as a file object. Convert path-like objects
    # explicitly so that pathlib.Path destinations work as expected.
    target = os.fspath(filename) if isinstance(filename, os.PathLike) else filename

    with wave.open(target, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)  # Bytes per sample
        wf.setframerate(frame_rate)
        # writeframes() also patches the frame count in the header.
        wf.writeframes(pcm_data)
70
 
71
  @app.route('/')
72
  def serve_index():
 
83
  return jsonify({'error': 'No audio file uploaded'}), 400
84
 
85
  audio_file = request.files['audio']
86
+ target_language_display_name = request.form.get('language', 'English (US)') # Default to a common one
87
 
88
  if not audio_file or audio_file.filename == '':
89
  return jsonify({'error': 'Invalid audio file'}), 400
90
 
91
+ # Validate MIME type for transcription
92
+ allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp3', 'audio/ogg', 'audio/flac', 'audio/mp4', 'audio/webm', 'audio/amr']
93
  if audio_file.mimetype not in allowed_mime_types:
94
+ return jsonify({'error': f'Unsupported file type for transcription: {audio_file.mimetype}'}), 400
95
 
96
+ # Initialize Gemini model for transcription and translation
97
+ model = genai.GenerativeModel(TRANSCRIPTION_TRANSLATION_MODEL_NAME)
98
 
99
+ audio_data_bytes = audio_file.read()
100
+ audio_blob = genai_types.Blob(mime_type=audio_file.mimetype, data=audio_data_bytes)
 
 
 
101
 
102
  # Get transcription
103
+ # Forcing transcription to be in original language can be tricky if the model tends to translate.
104
+ # A more robust prompt might be needed if issues arise.
105
+ transcription_prompt = "You are a professional transcriber. Transcribe this audio accurately and verbatim in its original spoken language. Respond only with the transcription."
106
+
107
+ # Using genai.upload_file for larger files if needed, but for direct blob:
108
+ response = model.generate_content([transcription_prompt, audio_blob])
109
  transcription = response.text.strip()
110
 
111
  # Translate text using Gemini
112
+ translation_prompt = f"Translate the following text to {target_language_display_name}. Preserve meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
113
+ response = model.generate_content(translation_prompt)
114
  translated_text = response.text.strip()
115
 
116
+ # Generate TTS using Gemini
117
+ tts_model = genai.GenerativeModel(TTS_MODEL_NAME)
118
+
119
+ # Gemini TTS detects language from the text.
120
+ # The voice selection is typically handled by the model or default voice for the detected language.
121
+ # The user's snippet for `speech_config` and `voice_name='Kore'` is not directly compatible
122
+ # with the current `google-generativeai` SDK's `GenerativeModel.generate_content` method
123
+ # in a straightforward way. This method uses `response_mime_type` for audio output.
124
+
125
+ tts_generation_config = genai_types.GenerationConfig(
126
+ response_mime_type="audio/wav" # Gemini will output WAV audio
127
+ )
128
+
129
+ # The content for TTS is just the translated text.
130
+ tts_response = tts_model.generate_content(
131
+ contents=[translated_text], # Make sure contents is an iterable of Parts or strings
132
+ generation_config=tts_generation_config
133
+ )
134
+
135
+ if not (tts_response.candidates and tts_response.candidates[0].content.parts):
136
+ raise ValueError("Gemini TTS did not return audio data.")
137
+
138
+ audio_pcm_data = tts_response.candidates[0].content.parts[0].inline_data.data
139
+
140
+ _, temp_output_path = tempfile.mkstemp(suffix=".wav")
141
+ # Default parameters from the user's example: rate=24000, sample_width=2 (16-bit), channels=1
142
+ save_wave_file(temp_output_path, audio_pcm_data, channels=1, sample_width=2, frame_rate=24000)
143
 
144
  return jsonify({
145
  'transcription': transcription,
 
148
  })
149
 
150
  except Exception as e:
151
+ app.logger.error(f"Error processing request: {str(e)}", exc_info=True)
152
  return jsonify({'error': str(e)}), 500
153
 
154
  @app.route('/download/<filename>')
155
  def download_file(filename):
156
  try:
157
+ # tempfile.gettempdir() is the directory where mkstemp creates files
158
+ file_path = os.path.join(tempfile.gettempdir(), filename)
159
  return send_file(
160
+ file_path,
161
+ mimetype="audio/wav", # Changed from mpeg to wav
162
  as_attachment=True,
163
+ download_name=f"translated_{filename.replace(tempfile.gettempdir(), '')}" # Cleaner name
164
  )
165
  except FileNotFoundError:
166
  return jsonify({'error': 'File not found'}), 404
167
+ except Exception as e:
168
+ app.logger.error(f"Error downloading file: {str(e)}", exc_info=True)
169
+ return jsonify({'error': f"Error downloading file: {str(e)}"}), 500
170
+
171
 
172
  if __name__ == '__main__':
173
+ # Consider adding an environment variable for debug mode for production
174
+ app.run(host="0.0.0.0", port=7860) # debug is not passed here, so Flask's default applies