Spaces: hashhac committed on
Commit · dbf60e3 · 1 Parent(s): de7876c
updates!
Browse files:
- app.py +144 -55
- requirements.txt +2 -1
app.py
CHANGED
@@ -52,32 +52,38 @@ def load_llm_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"

+    # First load the tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)

-    # …
-    …
-    … (tokenizer.pad_token == tokenizer.eos_token))
-    …
-
-    # …
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch_dtype,
-        low_cpu_mem_usage=True
-    )
+    # Print current token configuration
+    print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")
+
+    # Load the model first
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True
+    )
+
+    # Set pad token if needed
+    if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
+        # Add a new special token as padding token
+        special_tokens = {'pad_token': '[PAD]'}
+        num_added = tokenizer.add_special_tokens(special_tokens)
+
+        # Must resize the token embeddings when adding tokens
         model.resize_token_embeddings(len(tokenizer))
+
+        # Update the model's config to explicitly set the pad token ID
+        model.config.pad_token_id = tokenizer.pad_token_id
+
+        print(f"Added pad token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+        print(f"Different from EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
     else:
-        print(f"Pad token …
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            low_cpu_mem_usage=True
-        )
+        print(f"Pad token already set: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+        print(f"EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")

+    # Move model to the right device
     model.to(device)

     return model, tokenizer
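The pad-token handling is the heart of this hunk: some tokenizers ship without a pad token, or reuse EOS for padding, which makes attention masks ambiguous during generation; the commit pairs the new [PAD] token with resize_token_embeddings so the model's embedding table stays in sync. A minimal standalone sketch of the same guard (not part of the commit; assumes only transformers is installed):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
# Same check as the hunk above: add a dedicated [PAD] token only when
# padding is missing or collides with EOS.
if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# After the guard, padding never aliases end-of-sequence.
assert tokenizer.pad_token_id != tokenizer.eos_token_id
print(f"pad={tokenizer.pad_token_id}, eos={tokenizer.eos_token_id}")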
@@ -85,72 +91,150 @@ def load_llm_model():
 # Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
 def gtts_text_to_speech(text):
     """Convert text to speech using gTTS and ensure proper WAV format."""
-    # Create temporary files
-    mp3_fd, mp3_filename = tempfile.mkstemp(suffix='.mp3')
-    os.close(mp3_fd)
-    wav_fd, wav_filename = tempfile.mkstemp(suffix='.wav')
-    os.close(wav_fd)
+    # Create absolute paths for temporary files
+    temp_dir = tempfile.gettempdir()
+    mp3_filename = os.path.join(temp_dir, f"tts_temp_{os.getpid()}_{time.time()}.mp3")
+    wav_filename = os.path.join(temp_dir, f"tts_temp_{os.getpid()}_{time.time()}.wav")

     try:
-        # …
+        # Make sure text is not empty
+        if not text or text.isspace():
+            text = "I don't have a response for that."
+
+        # Create gTTS object and save to MP3
         tts = gTTS(text=text, lang='en', slow=False)
         tts.save(mp3_filename)

-        …
+        print(f"MP3 file created: {mp3_filename}, size: {os.path.getsize(mp3_filename)}")
+
+        # Try multiple methods to convert MP3 to WAV
+        wav_created = False
+
+        # Method 1: Try ffmpeg (most reliable)
         try:
             import subprocess
+            cmd = ['ffmpeg', '-y', '-i', mp3_filename, '-acodec', 'pcm_s16le', '-ar', '24000', '-ac', '1', wav_filename]
+            print(f"Running ffmpeg command: {' '.join(cmd)}")
+
             result = subprocess.run(
-                …
-                stdout=subprocess.PIPE,
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
                 check=True
             )

-            …
+            if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                print(f"WAV file successfully created with ffmpeg: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                wav_created = True
+            else:
+                print(f"ffmpeg ran but WAV file is missing or too small: {wav_filename}")
+
+        except Exception as e:
+            print(f"ffmpeg conversion failed: {str(e)}")

-        # …
-        if …
-            # Read the WAV file with scipy
+        # Method 2: Try pydub if ffmpeg failed
+        if not wav_created:
+            try:
+                from pydub import AudioSegment
+                print("Converting MP3 to WAV using pydub...")
+                sound = AudioSegment.from_mp3(mp3_filename)
+                sound = sound.set_frame_rate(24000).set_channels(1)
+                sound.export(wav_filename, format="wav")
+
+                if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                    print(f"WAV file successfully created with pydub: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                    wav_created = True
+                else:
+                    print(f"pydub ran but WAV file is missing or too small")
+
+            except Exception as e:
+                print(f"pydub conversion failed: {str(e)}")
+
+        # Method 3: Direct WAV creation with gTTS-like library (last resort)
+        if not wav_created:
+            try:
+                import numpy as np
+                from scipy.io import wavfile
+
+                print("Generating synthetic speech directly...")
+                # Generate a simple speech-like tone pattern
+                sample_rate = 24000
+                duration = len(text) * 0.075  # Approx timing
+                t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
+
+                # Create a speech-like tone with some variation
+                frequencies = [220, 440, 330, 550]
+                audio = np.zeros_like(t)
+                for i, freq in enumerate(frequencies):
+                    audio += 0.2 * np.sin(2 * np.pi * freq * t + i)
+
+                # Add some envelope
+                envelope = np.ones_like(t)
+                attack = int(0.01 * sample_rate)
+                release = int(0.1 * sample_rate)
+                envelope[:attack] = np.linspace(0, 1, attack)
+                envelope[-release:] = np.linspace(1, 0, release)
+                audio = audio * envelope
+
+                # Normalize and convert to int16
+                audio = audio / np.max(np.abs(audio))
+                audio = (audio * 32767).astype(np.int16)
+
+                # Save as WAV
+                wavfile.write(wav_filename, sample_rate, audio)
+
+                if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                    print(f"WAV file successfully created directly: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                    wav_created = True
+
+            except Exception as e:
+                print(f"Direct WAV creation failed: {str(e)}")
+
+        # Read the WAV file if it was created
+        if wav_created:
             try:
+                # Add a small delay to ensure the file is fully written
+                time.sleep(0.1)
+
+                # Read WAV file with scipy
+                print(f"Reading WAV file: {wav_filename}")
                 sample_rate, audio_data = wavfile.read(wav_filename)
+
                 # Convert to expected format
                 audio_data = audio_data.reshape(1, -1).astype(np.int16)
+                print(f"WAV file read successfully, shape: {audio_data.shape}, sample rate: {sample_rate}")
                 return (sample_rate, audio_data)
+
             except Exception as e:
-                print(f"Error reading WAV file …
-                # Try alternative approach with pydub
-                try:
-                    from pydub import AudioSegment
-                    sound = AudioSegment.from_file(wav_filename, format="wav")
-                    audio_data = np.array(sound.get_array_of_samples(), dtype=np.int16)
-                    audio_data = audio_data.reshape(1, -1)
-                    return (sound.frame_rate, audio_data)
-                except Exception as e2:
-                    print(f"Error with pydub fallback: {e2}")
+                print(f"Error reading WAV file: {str(e)}")

         # If all else fails, generate a simple tone
-        print("Falling back to synthetic audio tone")
+        print("All methods failed. Falling back to synthetic audio tone")
         sample_rate = 24000
-        duration_sec = len(text) * 0.1
+        duration_sec = max(1, len(text) * 0.1)
         tone_length = int(sample_rate * duration_sec)
         audio_data = np.sin(2 * np.pi * np.arange(tone_length) * 440 / sample_rate)
         audio_data = (audio_data * 32767).astype(np.int16)
         audio_data = audio_data.reshape(1, -1)
         return (sample_rate, audio_data)

+    except Exception as e:
+        print(f"Unexpected error in text-to-speech: {str(e)}")
+        # Generate a simple tone as last resort
+        sample_rate = 24000
+        audio_data = np.sin(2 * np.pi * np.arange(sample_rate) * 440 / sample_rate)
+        audio_data = (audio_data * 32767).astype(np.int16)
+        audio_data = audio_data.reshape(1, -1)
+        return (sample_rate, audio_data)
+
     finally:
         # Clean up temporary files
         for filename in [mp3_filename, wav_filename]:
             try:
                 if os.path.exists(filename):
                     os.remove(filename)
-            except:
-                pass
+            except Exception as e:
+                print(f"Failed to remove temporary file {filename}: {str(e)}")

 # Initialize models
 print("Loading ASR model...")
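Whichever conversion method wins, every code path in this hunk returns the same shape: a (sample_rate, int16 array of shape (1, N)) pair. A quick sanity check one could run against the function (a sketch, not part of the commit; the import path is hypothetical and assumes ffmpeg or pydub is available):

import numpy as np
from app import gtts_text_to_speech  # hypothetical import of the function above

sample_rate, audio = gtts_text_to_speech("Testing one two three.")
# Every branch returns mono int16 samples shaped (1, N).
assert audio.dtype == np.int16 and audio.ndim == 2 and audio.shape[0] == 1
print(f"{audio.shape[1] / sample_rate:.2f}s of audio at {sample_rate} Hz")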
@@ -183,10 +267,13 @@ def generate_response(prompt):
     full_prompt += "Assistant: "

     # Generate response with proper attention mask
+    # Ensure padding is done correctly with explicit parameters
     tokenized_inputs = llm_tokenizer(
         full_prompt,
         return_tensors="pt",
-        padding=…
+        padding="max_length",
+        max_length=512,  # Fixed length helps with attention masks
+        truncation=True,
         return_attention_mask=True
     )

@@ -194,7 +281,7 @@ def generate_response(prompt):
     input_ids = tokenized_inputs["input_ids"].to(device)
     attention_mask = tokenized_inputs["attention_mask"].to(device)

-    # Generate response
+    # Generate response - explicitly pass all needed parameters
     with torch.no_grad():
         output = llm_model.generate(
             input_ids=input_ids,
@@ -202,7 +289,9 @@ def generate_response(prompt):
             max_new_tokens=128,
             do_sample=True,
             temperature=0.7,
-            top_p=0.9
+            top_p=0.9,
+            pad_token_id=llm_tokenizer.pad_token_id,  # Explicitly set pad token ID
+            eos_token_id=llm_tokenizer.eos_token_id  # Explicitly set EOS token ID
         )

     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
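Taken together, these three generate_response hunks pin down every input generate() needs: fixed-length padding with an explicit attention mask, and explicit pad/eos IDs so transformers stops guessing (and stops warning) now that the pad token may have been remapped in load_llm_model. The combined pattern, as a sketch (not part of the commit; llm_model, llm_tokenizer, and device assumed loaded as in app.py):

import torch

inputs = llm_tokenizer(
    "User: Hello\nAssistant: ",
    return_tensors="pt",
    padding="max_length",
    max_length=512,
    truncation=True,
    return_attention_mask=True,
)
with torch.no_grad():
    output = llm_model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=llm_tokenizer.pad_token_id,
        eos_token_id=llm_tokenizer.eos_token_id,
    )
print(llm_tokenizer.decode(output[0], skip_special_tokens=True))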
requirements.txt
CHANGED
@@ -10,4 +10,5 @@ fastrtc[vad,tts]
 torchaudio
 gtts
 pydub
-scipy
+scipy
+time