Update app.py
Browse files
app.py
CHANGED
|
@@ -8,7 +8,6 @@ import base64
|
|
| 8 |
import logging
|
| 9 |
import torch
|
| 10 |
import librosa
|
| 11 |
-
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
| 12 |
from pathlib import Path
|
| 13 |
import magic # For MIME type detection
|
| 14 |
from pydub import AudioSegment
|
|
@@ -17,7 +16,7 @@ from pydub import AudioSegment
|
|
| 17 |
from asr import transcribe, ASR_LANGUAGES
|
| 18 |
from tts import synthesize, TTS_LANGUAGES
|
| 19 |
from lid import identify
|
| 20 |
-
from asr import ASR_SAMPLING_RATE
|
| 21 |
|
| 22 |
# Configure logging
|
| 23 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -78,11 +77,27 @@ async def transcribe_audio(request: AudioRequest):
|
|
| 78 |
@app.post("/synthesize")
|
| 79 |
async def synthesize_speech(request: TTSRequest):
|
| 80 |
try:
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
# Convert numpy array to bytes
|
| 83 |
buffer = io.BytesIO()
|
| 84 |
-
sf.write(buffer, audio,
|
| 85 |
buffer.seek(0)
|
|
|
|
| 86 |
return FileResponse(
|
| 87 |
buffer,
|
| 88 |
media_type="audio/wav",
|
|
@@ -117,4 +132,4 @@ async def get_tts_languages():
|
|
| 117 |
return JSONResponse(content=TTS_LANGUAGES)
|
| 118 |
except Exception as e:
|
| 119 |
logger.error(f"Error in get_tts_languages: {str(e)}")
|
| 120 |
-
raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
|
|
|
|
| 8 |
import logging
|
| 9 |
import torch
|
| 10 |
import librosa
|
|
|
|
| 11 |
from pathlib import Path
|
| 12 |
import magic # For MIME type detection
|
| 13 |
from pydub import AudioSegment
|
|
|
|
| 16 |
from asr import transcribe, ASR_LANGUAGES
|
| 17 |
from tts import synthesize, TTS_LANGUAGES
|
| 18 |
from lid import identify
|
| 19 |
+
from asr import ASR_SAMPLING_RATE
|
| 20 |
|
| 21 |
# Configure logging
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 77 |
@app.post("/synthesize")
|
| 78 |
async def synthesize_speech(request: TTSRequest):
|
| 79 |
try:
|
| 80 |
+
logger.info(f"Synthesizing speech for text: {request.text}, language: {request.language}, speed: {request.speed}")
|
| 81 |
+
result, filtered_text = synthesize(request.text, request.language, request.speed)
|
| 82 |
+
logger.info(f"Synthesis complete. Filtered text: {filtered_text}")
|
| 83 |
+
|
| 84 |
+
sample_rate, audio = result
|
| 85 |
+
logger.info(f"Sample rate: {sample_rate}, Audio shape: {audio.shape}, Audio dtype: {audio.dtype}")
|
| 86 |
+
|
| 87 |
+
# Ensure audio is a numpy array with the correct dtype
|
| 88 |
+
audio = np.array(audio, dtype=np.float32)
|
| 89 |
+
|
| 90 |
+
# Normalize audio to [-1, 1] range
|
| 91 |
+
audio = audio / np.max(np.abs(audio))
|
| 92 |
+
|
| 93 |
+
# Convert to int16 for WAV file
|
| 94 |
+
audio = (audio * 32767).astype(np.int16)
|
| 95 |
+
|
| 96 |
# Convert numpy array to bytes
|
| 97 |
buffer = io.BytesIO()
|
| 98 |
+
sf.write(buffer, audio, sample_rate, format='wav')
|
| 99 |
buffer.seek(0)
|
| 100 |
+
|
| 101 |
return FileResponse(
|
| 102 |
buffer,
|
| 103 |
media_type="audio/wav",
|
|
|
|
| 132 |
return JSONResponse(content=TTS_LANGUAGES)
|
| 133 |
except Exception as e:
|
| 134 |
logger.error(f"Error in get_tts_languages: {str(e)}")
|
| 135 |
+
raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
|