Athspi committed on
Commit
9ffbfd1
·
verified ·
1 Parent(s): 132c026

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -105
app.py CHANGED
@@ -1,15 +1,13 @@
1
  import os
2
- # import numpy as np # No longer needed for TTS
3
  from flask import Flask, request, jsonify, send_file, send_from_directory
4
  import google.generativeai as genai
5
- from google.generativeai import types as genai_types # For clarity if needed, or use genai.types
6
- # from gtts import gTTS, lang # Removed
7
  import tempfile
8
- # import soundfile as sf # Removed, using wave module instead
9
- # from kokoro import KPipeline # Removed
10
  from werkzeug.utils import secure_filename
11
  from flask_cors import CORS
12
- import wave # Added for saving WAV files
13
 
14
  app = Flask(__name__, static_folder='static')
15
  CORS(app)
@@ -20,53 +18,24 @@ if not GEMINI_API_KEY:
20
  raise ValueError("GEMINI_API_KEY environment variable not set")
21
  genai.configure(api_key=GEMINI_API_KEY)
22
 
23
- # Transcription and Translation Model
24
- TRANSCRIPTION_TRANSLATION_MODEL_NAME = "gemini-2.0-flash" # Using 1.5 flash as it's common, was "gemini-2.0-flash"
25
- # Text-to-Speech Model
26
- TTS_MODEL_NAME = "gemini-2.5-flash-preview-tts" # Using a model known to support audio output modality.
27
- # The user's example mentioned "gemini-2.5-flash-preview-tts".
28
- # If that specific model works with response_mime_type, it can be used.
29
-
30
- # Gemini TTS Supported Languages (Display Name: BCP-47 Code)
31
- # Based on the user-provided list. The TTS API auto-detects language from text.
32
- # This list is primarily for the frontend language selector.
33
- GEMINI_TTS_LANGUAGES = {
34
- "Arabic (Egyptian)": "ar-EG",
35
- "German (Germany)": "de-DE",
36
- "English (US)": "en-US",
37
- "Spanish (US)": "es-US",
38
- "French (France)": "fr-FR",
39
- "Hindi (India)": "hi-IN",
40
- "Indonesian (Indonesia)": "id-ID",
41
- "Italian (Italy)": "it-IT",
42
- "Japanese (Japan)": "ja-JP",
43
- "Korean (Korea)": "ko-KR",
44
- "Portuguese (Brazil)": "pt-BR",
45
- "Russian (Russia)": "ru-RU",
46
- "Dutch (Netherlands)": "nl-NL",
47
- "Polish (Poland)": "pl-PL",
48
- "Thai (Thailand)": "th-TH",
49
- "Turkish (Turkey)": "tr-TR",
50
- "Vietnamese (Vietnam)": "vi-VN",
51
- "Romanian (Romania)": "ro-RO",
52
- "Ukrainian (Ukraine)": "uk-UA",
53
- "Bengali (Bangladesh)": "bn-BD",
54
- "English (India)": "en-IN",
55
- "Marathi (India)": "mr-IN",
56
- "Tamil (India)": "ta-IN",
57
- "Telugu (India)": "te-IN"
58
  }
59
 
60
- SUPPORTED_LANGUAGES = sorted(list(GEMINI_TTS_LANGUAGES.keys()))
 
61
 
62
- # Helper function to save PCM data as a WAV file
63
- def save_wave_file(filename, pcm_data, channels=1, sample_width=2, frame_rate=24000):
64
- """Saves PCM audio data to a WAV file."""
65
- with wave.open(filename, "wb") as wf:
66
- wf.setnchannels(channels)
67
- wf.setsampwidth(sample_width) # Bytes per sample
68
- wf.setframerate(frame_rate)
69
- wf.writeframes(pcm_data)
70
 
71
  @app.route('/')
72
  def serve_index():
@@ -83,63 +52,60 @@ def translate_audio():
83
  return jsonify({'error': 'No audio file uploaded'}), 400
84
 
85
  audio_file = request.files['audio']
86
- target_language_display_name = request.form.get('language', 'English (US)') # Default to a common one
87
 
88
  if not audio_file or audio_file.filename == '':
89
  return jsonify({'error': 'Invalid audio file'}), 400
90
 
91
- # Validate MIME type for transcription
92
- allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp3', 'audio/ogg', 'audio/flac', 'audio/mp4', 'audio/webm', 'audio/amr']
93
  if audio_file.mimetype not in allowed_mime_types:
94
- return jsonify({'error': f'Unsupported file type for transcription: {audio_file.mimetype}'}), 400
95
 
96
- # Initialize Gemini model for transcription and translation
97
- model = genai.GenerativeModel(TRANSCRIPTION_TRANSLATION_MODEL_NAME)
98
 
99
- audio_data_bytes = audio_file.read()
100
- audio_blob = genai_types.Blob(mime_type=audio_file.mimetype, data=audio_data_bytes)
 
 
 
101
 
102
  # Get transcription
103
- # Forcing transcription to be in original language can be tricky if the model tends to translate.
104
- # A more robust prompt might be needed if issues arise.
105
- transcription_prompt = "You are a professional transcriber. Transcribe this audio accurately and verbatim in its original spoken language. Respond only with the transcription."
106
-
107
- # Using genai.upload_file for larger files if needed, but for direct blob:
108
- response = model.generate_content([transcription_prompt, audio_blob])
109
  transcription = response.text.strip()
110
 
111
  # Translate text using Gemini
112
- translation_prompt = f"Translate the following text to {target_language_display_name}. Preserve meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
113
- response = model.generate_content(translation_prompt)
114
  translated_text = response.text.strip()
115
 
116
- # Generate TTS using Gemini
117
- tts_model = genai.GenerativeModel(TTS_MODEL_NAME)
118
-
119
- # Gemini TTS detects language from the text.
120
- # The voice selection is typically handled by the model or default voice for the detected language.
121
- # The user's snippet for `speech_config` and `voice_name='Kore'` is not directly compatible
122
- # with the current `google-generativeai` SDK's `GenerativeModel.generate_content` method
123
- # in a straightforward way. This method uses `response_mime_type` for audio output.
124
-
125
- tts_generation_config = genai_types.GenerationConfig(
126
- response_mime_type="audio/wav" # Gemini will output WAV audio
127
- )
128
-
129
- # The content for TTS is just the translated text.
130
- tts_response = tts_model.generate_content(
131
- contents=[translated_text], # Make sure contents is an iterable of Parts or strings
132
- generation_config=tts_generation_config
133
- )
134
-
135
- if not (tts_response.candidates and tts_response.candidates[0].content.parts):
136
- raise ValueError("Gemini TTS did not return audio data.")
137
-
138
- audio_pcm_data = tts_response.candidates[0].content.parts[0].inline_data.data
139
-
140
- _, temp_output_path = tempfile.mkstemp(suffix=".wav")
141
- # Default parameters from the user's example: rate=24000, sample_width=2 (16-bit), channels=1
142
- save_wave_file(temp_output_path, audio_pcm_data, channels=1, sample_width=2, frame_rate=24000)
143
 
144
  return jsonify({
145
  'transcription': transcription,
@@ -148,27 +114,20 @@ def translate_audio():
148
  })
149
 
150
  except Exception as e:
151
- app.logger.error(f"Error processing request: {str(e)}", exc_info=True)
152
  return jsonify({'error': str(e)}), 500
153
 
154
  @app.route('/download/<filename>')
155
  def download_file(filename):
156
  try:
157
- # tempfile.gettempdir() is the directory where mkstemp creates files
158
- file_path = os.path.join(tempfile.gettempdir(), filename)
159
  return send_file(
160
- file_path,
161
- mimetype="audio/wav", # Changed from mpeg to wav
162
  as_attachment=True,
163
- download_name=f"translated_{filename.replace(tempfile.gettempdir(), '')}" # Cleaner name
164
  )
165
  except FileNotFoundError:
166
  return jsonify({'error': 'File not found'}), 404
167
- except Exception as e:
168
- app.logger.error(f"Error downloading file: {str(e)}", exc_info=True)
169
- return jsonify({'error': f"Error downloading file: {str(e)}"}), 500
170
-
171
 
172
  if __name__ == '__main__':
173
- # Consider adding an environment variable for debug mode for production
174
- app.run(host="0.0.0.0", port=7860) # Added debug=True for development
 
1
  import os
2
+ import numpy as np
3
  from flask import Flask, request, jsonify, send_file, send_from_directory
4
  import google.generativeai as genai
5
+ from gtts import gTTS, lang
 
6
  import tempfile
7
+ import soundfile as sf
8
+ from kokoro import KPipeline
9
  from werkzeug.utils import secure_filename
10
  from flask_cors import CORS
 
11
 
12
  app = Flask(__name__, static_folder='static')
13
  CORS(app)
 
18
  raise ValueError("GEMINI_API_KEY environment variable not set")
19
  genai.configure(api_key=GEMINI_API_KEY)
20
 
21
# Language configurations
# Display name -> language code handed to Kokoro's KPipeline.
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}

# gTTS language table (code -> display name), with Japanese set explicitly.
GTTS_LANGUAGES = {**lang.tts_langs(), 'ja': 'Japanese'}

# Every selectable display name from both engines, de-duplicated and sorted.
SUPPORTED_LANGUAGES = sorted({*KOKORO_LANGUAGES, *GTTS_LANGUAGES.values()})
 
 
 
 
 
39
 
40
  @app.route('/')
41
  def serve_index():
 
52
  return jsonify({'error': 'No audio file uploaded'}), 400
53
 
54
  audio_file = request.files['audio']
55
+ target_language = request.form.get('language', 'English')
56
 
57
  if not audio_file or audio_file.filename == '':
58
  return jsonify({'error': 'Invalid audio file'}), 400
59
 
60
+ # Validate MIME type
61
+ allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
62
  if audio_file.mimetype not in allowed_mime_types:
63
+ return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
64
 
65
+ # Transcribe audio using Gemini
66
+ model = genai.GenerativeModel("gemini-2.0-flash")
67
 
68
+ # Create proper audio blob
69
+ audio_blob = {
70
+ 'mime_type': audio_file.mimetype,
71
+ 'data': audio_file.read()
72
+ }
73
 
74
  # Get transcription
75
+ convo = model.start_chat()
76
+ convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
77
+ response = convo.send_message(audio_blob)
 
 
 
78
  transcription = response.text.strip()
79
 
80
  # Translate text using Gemini
81
+ prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
82
+ response = model.generate_content(prompt)
83
  translated_text = response.text.strip()
84
 
85
+ # Generate TTS
86
+ if target_language in KOKORO_LANGUAGES:
87
+ lang_code = KOKORO_LANGUAGES[target_language]
88
+ pipeline = KPipeline(lang_code=lang_code)
89
+ generator = pipeline(translated_text, voice="af_heart", speed=1)
90
+
91
+ # Collect all audio segments
92
+ audio_segments = []
93
+ for _, _, audio in generator:
94
+ if audio is not None:
95
+ audio_segments.append(audio)
96
+
97
+ if audio_segments:
98
+ audio_data = np.concatenate(audio_segments)
99
+ _, temp_output_path = tempfile.mkstemp(suffix=".wav")
100
+ sf.write(temp_output_path, audio_data, 24000)
101
+ else:
102
+ raise ValueError("No audio generated by Kokoro")
103
+ else:
104
+ # Standard gTTS handling
105
+ lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
106
+ tts = gTTS(translated_text, lang=lang_code)
107
+ _, temp_output_path = tempfile.mkstemp(suffix=".mp3")
108
+ tts.save(temp_output_path)
 
 
 
109
 
110
  return jsonify({
111
  'transcription': transcription,
 
114
  })
115
 
116
  except Exception as e:
117
+ app.logger.error(f"Error processing request: {str(e)}")
118
  return jsonify({'error': str(e)}), 500
119
 
120
  @app.route('/download/<filename>')
121
  def download_file(filename):
122
  try:
 
 
123
  return send_file(
124
+ os.path.join(tempfile.gettempdir(), filename),
125
+ mimetype="audio/mpeg",
126
  as_attachment=True,
127
+ download_name=f"translated_{filename}"
128
  )
129
  except FileNotFoundError:
130
  return jsonify({'error': 'File not found'}), 404
 
 
 
 
131
 
132
  if __name__ == '__main__':
133
+ app.run(host="0.0.0.0", port=7860)