hashhac committed · Commit dbf60e3 · 1 Parent(s): de7876c

updates!

Files changed:
- app.py (+144 -55)
- requirements.txt (+2 -1)
app.py
CHANGED
@@ -52,32 +52,38 @@ def load_asr_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"
 
+    # First load the tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # ...
-    if (tokenizer.pad_token is None or
-            (tokenizer.pad_token == tokenizer.eos_token)):
-        ...
-        # ...
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            low_cpu_mem_usage=True
-        )
+    # Print current token configuration
+    print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")
+
+    # Load the model first
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True
+    )
+
+    # Set pad token if needed
+    if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
+        # Add a new special token as padding token
+        special_tokens = {'pad_token': '[PAD]'}
+        num_added = tokenizer.add_special_tokens(special_tokens)
+
+        # Must resize the token embeddings when adding tokens
         model.resize_token_embeddings(len(tokenizer))
+
+        # Update the model's config to explicitly set the pad token ID
+        model.config.pad_token_id = tokenizer.pad_token_id
+
+        print(f"Added pad token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+        print(f"Different from EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
     else:
-        print(f"Pad token ...")
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            low_cpu_mem_usage=True
-        )
+        print(f"Pad token already set: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+        print(f"EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
 
+    # Move model to the right device
     model.to(device)
 
     return model, tokenizer
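Note: the pad-token recipe above is easy to get subtly wrong; the token must be added to the tokenizer before the embedding matrix is resized, and the model config must then record the new ID. A minimal standalone sketch of the same recipe (gpt2 is assumed here purely because it is small and ships without a pad token; the commit itself targets facebook/opt-1.3b):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # gpt2 stands in for facebook/opt-1.3b; it defines no pad token at all
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # grow the vocab first
        model.resize_token_embeddings(len(tokenizer))         # then grow the embeddings
        model.config.pad_token_id = tokenizer.pad_token_id    # then record the new ID

    assert tokenizer.pad_token_id != tokenizer.eos_token_id
    print(tokenizer.pad_token_id, model.get_input_embeddings().weight.shape[0])

Whether the branch fires depends on the checkpoint: OPT tokenizers ship with a distinct <pad> token, so for facebook/opt-1.3b the else branch should be the path actually taken.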
@@ -85,72 +91,150 @@ def load_llm_model():
 
 # Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
 def gtts_text_to_speech(text):
     """Convert text to speech using gTTS and ensure proper WAV format."""
-    # Create temporary files
-    mp3_fd, mp3_filename = tempfile.mkstemp(suffix='.mp3')
-    os.close(mp3_fd)
-
-    wav_fd, wav_filename = tempfile.mkstemp(suffix='.wav')
-    os.close(wav_fd)
+    # Create absolute paths for temporary files
+    temp_dir = tempfile.gettempdir()
+    mp3_filename = os.path.join(temp_dir, f"tts_temp_{os.getpid()}_{time.time()}.mp3")
+    wav_filename = os.path.join(temp_dir, f"tts_temp_{os.getpid()}_{time.time()}.wav")
 
     try:
-        # ...
+        # Make sure text is not empty
+        if not text or text.isspace():
+            text = "I don't have a response for that."
+
+        # Create gTTS object and save to MP3
         tts = gTTS(text=text, lang='en', slow=False)
         tts.save(mp3_filename)
 
-        ...
+        print(f"MP3 file created: {mp3_filename}, size: {os.path.getsize(mp3_filename)}")
+
+        # Try multiple methods to convert MP3 to WAV
+        wav_created = False
+
+        # Method 1: Try ffmpeg (most reliable)
         try:
             import subprocess
+            cmd = ['ffmpeg', '-y', '-i', mp3_filename, '-acodec', 'pcm_s16le', '-ar', '24000', '-ac', '1', wav_filename]
+            print(f"Running ffmpeg command: {' '.join(cmd)}")
+
             result = subprocess.run(
-                ...,
+                cmd,
                 stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
                 check=True
             )
 
-            ...
+            if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                print(f"WAV file successfully created with ffmpeg: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                wav_created = True
+            else:
+                print(f"ffmpeg ran but WAV file is missing or too small: {wav_filename}")
+
+        except Exception as e:
+            print(f"ffmpeg conversion failed: {str(e)}")
 
-        # ...
-        if ...:
-            # Read the WAV file with scipy
-            try:
-                sample_rate, audio_data = wavfile.read(wav_filename)
-                # Convert to expected format
-                audio_data = audio_data.reshape(1, -1).astype(np.int16)
-                return (sample_rate, audio_data)
-            except Exception as e:
-                print(f"Error reading WAV file ...")
-                # Try alternative approach with pydub
-                try:
-                    from pydub import AudioSegment
-                    sound = AudioSegment.from_file(wav_filename, format="wav")
-                    audio_data = np.array(sound.get_array_of_samples(), dtype=np.int16)
-                    audio_data = audio_data.reshape(1, -1)
-                    return (sound.frame_rate, audio_data)
-                except Exception as e2:
-                    print(f"Error with pydub fallback: {e2}")
+        # Method 2: Try pydub if ffmpeg failed
+        if not wav_created:
+            try:
+                from pydub import AudioSegment
+                print("Converting MP3 to WAV using pydub...")
+                sound = AudioSegment.from_mp3(mp3_filename)
+                sound = sound.set_frame_rate(24000).set_channels(1)
+                sound.export(wav_filename, format="wav")
+
+                if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                    print(f"WAV file successfully created with pydub: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                    wav_created = True
+                else:
+                    print(f"pydub ran but WAV file is missing or too small")
+
+            except Exception as e:
+                print(f"pydub conversion failed: {str(e)}")
+
+        # Method 3: Direct WAV creation with gTTS-like library (last resort)
+        if not wav_created:
+            try:
+                import numpy as np
+                from scipy.io import wavfile
+
+                print("Generating synthetic speech directly...")
+                # Generate a simple speech-like tone pattern
+                sample_rate = 24000
+                duration = len(text) * 0.075  # Approx timing
+                t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
+
+                # Create a speech-like tone with some variation
+                frequencies = [220, 440, 330, 550]
+                audio = np.zeros_like(t)
+                for i, freq in enumerate(frequencies):
+                    audio += 0.2 * np.sin(2 * np.pi * freq * t + i)
+
+                # Add some envelope
+                envelope = np.ones_like(t)
+                attack = int(0.01 * sample_rate)
+                release = int(0.1 * sample_rate)
+                envelope[:attack] = np.linspace(0, 1, attack)
+                envelope[-release:] = np.linspace(1, 0, release)
+                audio = audio * envelope
+
+                # Normalize and convert to int16
+                audio = audio / np.max(np.abs(audio))
+                audio = (audio * 32767).astype(np.int16)
+
+                # Save as WAV
+                wavfile.write(wav_filename, sample_rate, audio)
+
+                if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                    print(f"WAV file successfully created directly: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                    wav_created = True
+
+            except Exception as e:
+                print(f"Direct WAV creation failed: {str(e)}")
+
+        # Read the WAV file if it was created
+        if wav_created:
+            try:
+                # Add a small delay to ensure the file is fully written
+                time.sleep(0.1)
+
+                # Read WAV file with scipy
+                print(f"Reading WAV file: {wav_filename}")
+                sample_rate, audio_data = wavfile.read(wav_filename)
+
+                # Convert to expected format
+                audio_data = audio_data.reshape(1, -1).astype(np.int16)
+                print(f"WAV file read successfully, shape: {audio_data.shape}, sample rate: {sample_rate}")
+                return (sample_rate, audio_data)
+
+            except Exception as e:
+                print(f"Error reading WAV file: {str(e)}")
 
         # If all else fails, generate a simple tone
-        print("Falling back to synthetic audio tone")
+        print("All methods failed. Falling back to synthetic audio tone")
         sample_rate = 24000
-        duration_sec = len(text) * 0.1
+        duration_sec = max(1, len(text) * 0.1)
         tone_length = int(sample_rate * duration_sec)
         audio_data = np.sin(2 * np.pi * np.arange(tone_length) * 440 / sample_rate)
         audio_data = (audio_data * 32767).astype(np.int16)
         audio_data = audio_data.reshape(1, -1)
         return (sample_rate, audio_data)
 
+    except Exception as e:
+        print(f"Unexpected error in text-to-speech: {str(e)}")
+        # Generate a simple tone as last resort
+        sample_rate = 24000
+        audio_data = np.sin(2 * np.pi * np.arange(sample_rate) * 440 / sample_rate)
+        audio_data = (audio_data * 32767).astype(np.int16)
+        audio_data = audio_data.reshape(1, -1)
+        return (sample_rate, audio_data)
+
     finally:
         # Clean up temporary files
         for filename in [mp3_filename, wav_filename]:
             try:
                 if os.path.exists(filename):
                     os.remove(filename)
-            except:
-                pass
+            except Exception as e:
+                print(f"Failed to remove temporary file {filename}: {str(e)}")
 
 # Initialize models
 print("Loading ASR model...")
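Note: the conversion chain tries ffmpeg, then pydub, then raw synthesis. The ffmpeg leg can be exercised on its own with a sketch like the following; it assumes ffmpeg is on PATH and that gTTS can reach Google's TTS endpoint:

    import os
    import subprocess
    import tempfile

    from gtts import gTTS
    from scipy.io import wavfile

    mp3 = os.path.join(tempfile.gettempdir(), "tts_check.mp3")
    wav = os.path.join(tempfile.gettempdir(), "tts_check.wav")

    gTTS(text="hello world", lang="en").save(mp3)

    # Same flags as the commit: 16-bit PCM, 24 kHz, mono
    subprocess.run(
        ["ffmpeg", "-y", "-i", mp3, "-acodec", "pcm_s16le", "-ar", "24000", "-ac", "1", wav],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )

    rate, data = wavfile.read(wav)
    print(rate, data.dtype, data.shape)  # expect: 24000 int16 (n_samples,)

One design caveat: pydub itself shells out to ffmpeg for MP3 decoding, so on a machine where Method 1 fails because ffmpeg is absent, Method 2 will usually fail too, and the synthetic fallback is what actually runs.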
@@ -183,10 +267,13 @@ def generate_response(prompt):
     full_prompt += "Assistant: "
 
     # Generate response with proper attention mask
+    # Ensure padding is done correctly with explicit parameters
     tokenized_inputs = llm_tokenizer(
         full_prompt,
         return_tensors="pt",
-        padding=...,
+        padding="max_length",
+        max_length=512,  # Fixed length helps with attention masks
+        truncation=True,
         return_attention_mask=True
     )
 
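Note: to see what the explicit padding arguments produce, a short check (gpt2 again stands in for the real checkpoint, and max_length=16 instead of 512 keeps the printout readable):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token  # illustration only; app.py adds a dedicated [PAD]

    enc = tok(
        "User: Hello\nAssistant: ",
        return_tensors="pt",
        padding="max_length",
        max_length=16,
        truncation=True,
        return_attention_mask=True,
    )
    print(enc["input_ids"].shape)    # torch.Size([1, 16])
    print(enc["attention_mask"][0])  # 1s over real tokens, 0s over the padding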
@@ -194,7 +281,7 @@ def generate_response(prompt):
     input_ids = tokenized_inputs["input_ids"].to(device)
     attention_mask = tokenized_inputs["attention_mask"].to(device)
 
-    # Generate response
+    # Generate response - explicitly pass all needed parameters
     with torch.no_grad():
         output = llm_model.generate(
             input_ids=input_ids,
@@ -202,7 +289,9 @@ def generate_response(prompt):
             max_new_tokens=128,
             do_sample=True,
             temperature=0.7,
-            top_p=0.9
+            top_p=0.9,
+            pad_token_id=llm_tokenizer.pad_token_id,  # Explicitly set pad token ID
+            eos_token_id=llm_tokenizer.eos_token_id   # Explicitly set EOS token ID
         )
 
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
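Note: passing pad_token_id and eos_token_id to generate() explicitly also silences the familiar "Setting pad_token_id to eos_token_id" warning. A self-contained sketch of the same call shape (gpt2 assumed, as above):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token  # illustration only

    enc = tok("User: Hi\nAssistant: ", return_tensors="pt", return_attention_mask=True)

    with torch.no_grad():
        out = model.generate(
            input_ids=enc["input_ids"],
            attention_mask=enc["attention_mask"],
            max_new_tokens=32,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
        )

    print(tok.decode(out[0], skip_special_tokens=True))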
requirements.txt
CHANGED
@@ -10,4 +10,5 @@ fastrtc[vad,tts]
 torchaudio
 gtts
 pydub
-scipy
+scipy
+time
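Note: time is a Python standard-library module, not a PyPI package, so this new last line will most likely make pip install -r requirements.txt fail to resolve. The time.time() and time.sleep() calls added in app.py only need an import time at the top of that file; the requirements entry can almost certainly be dropped.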