Athspi committed on
Commit
385365a
·
verified ·
1 Parent(s): f49c906

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -64
app.py CHANGED
@@ -1,41 +1,45 @@
1
  import os
 
 
2
  import numpy as np
 
 
3
  from flask import Flask, request, jsonify, send_file, send_from_directory
4
- import google.generativeai as genai
 
 
5
  from gtts import gTTS, lang
6
- import tempfile
7
- import soundfile as sf
8
  from kokoro import KPipeline
9
- from werkzeug.utils import secure_filename
10
- from flask_cors import CORS
11
 
12
- app = Flask(__name__, static_folder='static')
13
- CORS(app)
14
 
15
- # Configure Gemini API
16
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
17
  if not GEMINI_API_KEY:
18
  raise ValueError("GEMINI_API_KEY environment variable not set")
19
- genai.configure(api_key=GEMINI_API_KEY)
20
-
21
- # Language configurations
22
- KOKORO_LANGUAGES = {
23
- "American English": "a",
24
- "British English": "b",
25
- "Mandarin Chinese": "z",
26
- "Spanish": "e",
27
- "French": "f",
28
- "Hindi": "h",
29
- "Italian": "i",
30
- "Brazilian Portuguese": "p"
31
- }
32
 
 
 
 
 
 
 
 
 
 
33
  GTTS_LANGUAGES = lang.tts_langs()
34
- GTTS_LANGUAGES['ja'] = 'Japanese' # Explicit Japanese support
 
35
 
36
- SUPPORTED_LANGUAGES = sorted(
37
- list(set(list(KOKORO_LANGUAGES.keys()) + list(GTTS_LANGUAGES.values())))
38
- )
 
 
 
 
 
39
 
40
  @app.route('/')
41
  def serve_index():
@@ -50,71 +54,80 @@ def translate_audio():
50
  try:
51
  if 'audio' not in request.files:
52
  return jsonify({'error': 'No audio file uploaded'}), 400
53
-
54
  audio_file = request.files['audio']
55
  target_language = request.form.get('language', 'English')
56
-
57
  if not audio_file or audio_file.filename == '':
58
  return jsonify({'error': 'Invalid audio file'}), 400
59
 
60
- # Validate MIME type
61
  allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
62
  if audio_file.mimetype not in allowed_mime_types:
63
  return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
64
 
65
- # Transcribe audio using Gemini
66
  model = genai.GenerativeModel("gemini-2.0-flash")
67
-
68
- # Create proper audio blob
69
  audio_blob = {
70
  'mime_type': audio_file.mimetype,
71
  'data': audio_file.read()
72
  }
73
 
74
- # Get transcription
75
  convo = model.start_chat()
76
- convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim in the original language. Respond only with the transcription.")
77
  response = convo.send_message(audio_blob)
78
  transcription = response.text.strip()
79
 
80
- # Translate text using Gemini
81
- prompt = f"Translate the following text to {target_language} preserving meaning and cultural nuances. Respond only with the translation:\n\n{transcription}"
82
- response = model.generate_content(prompt)
83
- translated_text = response.text.strip()
84
-
85
- # Generate TTS
86
- if target_language in KOKORO_LANGUAGES:
87
- lang_code = KOKORO_LANGUAGES[target_language]
88
- pipeline = KPipeline(lang_code=lang_code)
89
- generator = pipeline(translated_text, voice="af_heart", speed=1)
90
-
91
- # Collect all audio segments
92
- audio_segments = []
93
- for _, _, audio in generator:
94
- if audio is not None:
95
- audio_segments.append(audio)
96
-
97
- if audio_segments:
98
- audio_data = np.concatenate(audio_segments)
99
- _, temp_output_path = tempfile.mkstemp(suffix=".wav")
100
- sf.write(temp_output_path, audio_data, 24000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  else:
102
- raise ValueError("No audio generated by Kokoro")
103
- else:
104
- # Standard gTTS handling
105
- lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
106
- tts = gTTS(translated_text, lang=lang_code)
107
- _, temp_output_path = tempfile.mkstemp(suffix=".mp3")
108
- tts.save(temp_output_path)
109
-
110
  return jsonify({
111
  'transcription': transcription,
112
  'translation': translated_text,
113
  'audio_url': f'/download/{os.path.basename(temp_output_path)}'
114
  })
115
-
116
  except Exception as e:
117
- app.logger.error(f"Error processing request: {str(e)}")
118
  return jsonify({'error': str(e)}), 500
119
 
120
  @app.route('/download/<filename>')
 
1
  import os
2
+ import tempfile
3
+ import wave
4
  import numpy as np
5
+ import soundfile as sf
6
+
7
  from flask import Flask, request, jsonify, send_file, send_from_directory
8
+ from flask_cors import CORS
9
+ from werkzeug.utils import secure_filename
10
+
11
  from gtts import gTTS, lang
 
 
12
  from kokoro import KPipeline
 
 
13
 
14
+ from google import genai
15
+ from google.genai import types
16
 
17
# Gemini credentials are read from the environment; importing this module
# fails fast when the key is absent.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY environment variable not set")

client = genai.Client(api_key=GEMINI_API_KEY)

# Flask application; static assets live in ./static and CORS is applied to it.
app = Flask(__name__, static_folder='static')
CORS(app)

# Display name -> single-letter Kokoro language code.
KOKORO_LANGUAGES = {
    "American English": "a",
    "British English": "b",
    "Mandarin Chinese": "z",
    "Spanish": "e",
    "French": "f",
    "Hindi": "h",
    "Italian": "i",
    "Brazilian Portuguese": "p",
}

# gTTS language code -> display name, with Japanese registered explicitly.
GTTS_LANGUAGES = lang.tts_langs()
GTTS_LANGUAGES['ja'] = 'Japanese'

# Alphabetised, de-duplicated union of every language name either engine offers.
SUPPORTED_LANGUAGES = sorted({*KOKORO_LANGUAGES, *GTTS_LANGUAGES.values()})

# Prebuilt voice name passed to the Gemini TTS request.
GEMINI_VOICE = "Kore"
36
+
37
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM audio to a WAV file.

    Args:
        filename: Destination as a ``str`` path, an ``os.PathLike`` object
            (e.g. ``pathlib.Path``), or an already-open binary file object.
        pcm: Raw PCM frames as a bytes-like object.
        channels: Channel count stored in the WAV header.
        rate: Frame rate in Hz stored in the WAV header.
        sample_width: Sample width in bytes stored in the WAV header.

    Raises:
        TypeError: If ``pcm`` is not bytes-like. The check runs before the
            destination is opened, so no partial file is left behind.
    """
    # wave.open only opens real ``str`` paths itself; convert path-like
    # objects so pathlib.Path destinations work as well.
    if isinstance(filename, os.PathLike):
        filename = os.fspath(filename)

    # Validate the payload up front: opening the file first and failing in
    # writeframes() would leave a header-only WAV on disk.
    try:
        memoryview(pcm)
    except TypeError as err:
        raise TypeError(
            f"pcm must be a bytes-like object, got {type(pcm).__name__}"
        ) from err

    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(pcm)
43
 
44
  @app.route('/')
45
  def serve_index():
 
54
  try:
55
  if 'audio' not in request.files:
56
  return jsonify({'error': 'No audio file uploaded'}), 400
57
+
58
  audio_file = request.files['audio']
59
  target_language = request.form.get('language', 'English')
60
+
61
  if not audio_file or audio_file.filename == '':
62
  return jsonify({'error': 'Invalid audio file'}), 400
63
 
 
64
  allowed_mime_types = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'audio/webm']
65
  if audio_file.mimetype not in allowed_mime_types:
66
  return jsonify({'error': f'Unsupported file type: {audio_file.mimetype}'}), 400
67
 
 
68
  model = genai.GenerativeModel("gemini-2.0-flash")
69
+
 
70
  audio_blob = {
71
  'mime_type': audio_file.mimetype,
72
  'data': audio_file.read()
73
  }
74
 
 
75
  convo = model.start_chat()
76
+ convo.send_message("You are a professional transcriber. Transcribe this audio accurately and verbatim.")
77
  response = convo.send_message(audio_blob)
78
  transcription = response.text.strip()
79
 
80
+ # Translate
81
+ prompt = f"Translate the following text to {target_language}:\n\n{transcription}"
82
+ translation_response = model.generate_content(prompt)
83
+ translated_text = translation_response.text.strip()
84
+
85
+ # Try Gemini 2.5 TTS
86
+ try:
87
+ response = client.models.generate_content(
88
+ model="gemini-2.5-flash-preview-tts",
89
+ contents=translated_text,
90
+ config=types.GenerateContentConfig(
91
+ response_modalities=["AUDIO"],
92
+ speech_config=types.SpeechConfig(
93
+ voice_config=types.VoiceConfig(
94
+ prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=GEMINI_VOICE)
95
+ )
96
+ )
97
+ )
98
+ )
99
+ data = response.candidates[0].content.parts[0].inline_data.data
100
+ temp_output_path = os.path.join(tempfile.gettempdir(), "tts_gemini.wav")
101
+ wave_file(temp_output_path, data)
102
+ except Exception:
103
+ # Fallback: Kokoro or gTTS
104
+ if target_language in KOKORO_LANGUAGES:
105
+ lang_code = KOKORO_LANGUAGES[target_language]
106
+ pipeline = KPipeline(lang_code=lang_code)
107
+ generator = pipeline(translated_text, voice="af_heart", speed=1)
108
+
109
+ audio_segments = [audio for _, _, audio in generator if audio is not None]
110
+
111
+ if audio_segments:
112
+ audio_data = np.concatenate(audio_segments)
113
+ temp_output_path = os.path.join(tempfile.gettempdir(), "tts_kokoro.wav")
114
+ sf.write(temp_output_path, audio_data, 24000)
115
+ else:
116
+ raise ValueError("No audio generated by Kokoro")
117
  else:
118
+ lang_code = next((k for k, v in GTTS_LANGUAGES.items() if v == target_language), 'en')
119
+ tts = gTTS(translated_text, lang=lang_code)
120
+ temp_output_path = os.path.join(tempfile.gettempdir(), "tts_gtts.mp3")
121
+ tts.save(temp_output_path)
122
+
 
 
 
123
  return jsonify({
124
  'transcription': transcription,
125
  'translation': translated_text,
126
  'audio_url': f'/download/{os.path.basename(temp_output_path)}'
127
  })
128
+
129
  except Exception as e:
130
+ app.logger.error(f"Error: {str(e)}")
131
  return jsonify({'error': str(e)}), 500
132
 
133
  @app.route('/download/<filename>')