Update app.py
Browse files
app.py
CHANGED
@@ -1729,7 +1729,6 @@ def extract_candidate_details(file_path):
|
|
1729 |
# )
|
1730 |
|
1731 |
# demo.launch(debug=True)
|
1732 |
-
|
1733 |
import gradio as gr
|
1734 |
import time
|
1735 |
import tempfile
|
@@ -1757,14 +1756,26 @@ bark_voice_preset = "v2/en_speaker_9"
|
|
1757 |
# Thread pool for async operations
|
1758 |
executor = ThreadPoolExecutor(max_workers=2)
|
1759 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1760 |
def load_models_lazy():
|
1761 |
"""Load models only when needed"""
|
1762 |
global model_bark, processor_bark, whisper_model
|
1763 |
|
|
|
|
|
|
|
1764 |
if model_bark is None:
|
1765 |
print("π Loading Bark model...")
|
1766 |
-
model_bark = BarkModel.from_pretrained("suno/bark").to(
|
1767 |
-
print("✅ Bark model loaded")
|
1768 |
|
1769 |
if processor_bark is None:
|
1770 |
print("π Loading Bark processor...")
|
@@ -1773,16 +1784,22 @@ def load_models_lazy():
|
|
1773 |
|
1774 |
if whisper_model is None:
|
1775 |
print("π Loading Whisper model...")
|
1776 |
-
whisper_model = whisper.load_model("base", device=
|
1777 |
-
print("✅ Whisper model loaded")
|
1778 |
|
1779 |
def bark_tts_async(text):
|
1780 |
"""Async TTS generation"""
|
1781 |
def _generate():
|
1782 |
load_models_lazy() # Load only when needed
|
1783 |
print(f"π Synthesizing TTS for: {text}")
|
|
|
|
|
|
|
|
|
|
|
1784 |
inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
|
1785 |
-
input_ids = inputs["input_ids"].to(
|
|
|
1786 |
start = time.time()
|
1787 |
with torch.no_grad():
|
1788 |
speech_values = model_bark.generate(
|
@@ -1791,7 +1808,8 @@ def bark_tts_async(text):
|
|
1791 |
fine_temperature=0.4,
|
1792 |
coarse_temperature=0.8
|
1793 |
)
|
1794 |
-
print(f"✅ Bark finished in {round(time.time() - start, 2)}s")
|
|
|
1795 |
speech = speech_values.cpu().numpy().squeeze()
|
1796 |
speech = (speech * 32767).astype(np.int16)
|
1797 |
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
@@ -1806,6 +1824,11 @@ def whisper_stt(audio_path):
|
|
1806 |
return ""
|
1807 |
|
1808 |
load_models_lazy() # Load only when needed
|
|
|
|
|
|
|
|
|
|
|
1809 |
result = whisper_model.transcribe(audio_path)
|
1810 |
return result["text"]
|
1811 |
|
@@ -1942,9 +1965,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
1942 |
return gr.update(), gr.update(), tts_future_obj
|
1943 |
|
1944 |
start_interview_final_btn.click(
|
1945 |
-
start_interview_immediate,
|
1946 |
-
[user_data],
|
1947 |
-
[interview_state, interview_pre_section, interview_section, question_audio, question_text]
|
|
|
|
|
|
|
|
|
|
|
|
|
1948 |
)
|
1949 |
|
1950 |
def transcribe(audio_path):
|
@@ -2030,5 +2059,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
2030 |
lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, stt_transcript]
|
2031 |
)
|
2032 |
|
2033 |
-
demo.launch(debug=True)
|
2034 |
-
|
|
|
1729 |
# )
|
1730 |
|
1731 |
# demo.launch(debug=True)
|
|
|
1732 |
import gradio as gr
|
1733 |
import time
|
1734 |
import tempfile
|
|
|
1756 |
# Thread pool for async operations
|
1757 |
executor = ThreadPoolExecutor(max_workers=2)
|
1758 |
|
1759 |
# Startup device report: log GPU availability so deployment logs show which
# device the Bark/Whisper models will run on, and pin GPU 0 as the default
# CUDA device when one is present.
# NOTE(review): the original log strings were mojibake-corrupted emoji
# (UTF-8 bytes rendered as Greek letters); restored as 🔥/⚠️ from the
# surviving byte pattern.
if torch.cuda.is_available():
    print(f"🔥 CUDA Available: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for readability.
    print(f"🔥 CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    # Set default device so later .to("cuda") / model loads land on GPU 0.
    torch.cuda.set_device(0)
else:
    print("⚠️ CUDA not available, using CPU")
|
1767 |
+
|
1768 |
def load_models_lazy():
|
1769 |
"""Load models only when needed"""
|
1770 |
global model_bark, processor_bark, whisper_model
|
1771 |
|
1772 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
1773 |
+
print(f"π Using device: {device}")
|
1774 |
+
|
1775 |
if model_bark is None:
|
1776 |
print("π Loading Bark model...")
|
1777 |
+
model_bark = BarkModel.from_pretrained("suno/bark").to(device)
|
1778 |
+
print(f"✅ Bark model loaded on {device}")
|
1779 |
|
1780 |
if processor_bark is None:
|
1781 |
print("π Loading Bark processor...")
|
|
|
1784 |
|
1785 |
if whisper_model is None:
|
1786 |
print("π Loading Whisper model...")
|
1787 |
+
whisper_model = whisper.load_model("base", device=device)
|
1788 |
+
print(f"✅ Whisper model loaded on {device}")
|
1789 |
|
1790 |
def bark_tts_async(text):
|
1791 |
"""Async TTS generation"""
|
1792 |
def _generate():
|
1793 |
load_models_lazy() # Load only when needed
|
1794 |
print(f"π Synthesizing TTS for: {text}")
|
1795 |
+
|
1796 |
+
# Ensure we're using the correct device
|
1797 |
+
device = next(model_bark.parameters()).device
|
1798 |
+
print(f"π Bark model is on device: {device}")
|
1799 |
+
|
1800 |
inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
|
1801 |
+
input_ids = inputs["input_ids"].to(device) # Move to same device as model
|
1802 |
+
|
1803 |
start = time.time()
|
1804 |
with torch.no_grad():
|
1805 |
speech_values = model_bark.generate(
|
|
|
1808 |
fine_temperature=0.4,
|
1809 |
coarse_temperature=0.8
|
1810 |
)
|
1811 |
+
print(f"✅ Bark finished in {round(time.time() - start, 2)}s on {device}")
|
1812 |
+
|
1813 |
speech = speech_values.cpu().numpy().squeeze()
|
1814 |
speech = (speech * 32767).astype(np.int16)
|
1815 |
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
|
|
1824 |
return ""
|
1825 |
|
1826 |
load_models_lazy() # Load only when needed
|
1827 |
+
|
1828 |
+
# Check what device Whisper is actually using
|
1829 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
1830 |
+
print(f"π Whisper transcribing on {device}")
|
1831 |
+
|
1832 |
result = whisper_model.transcribe(audio_path)
|
1833 |
return result["text"]
|
1834 |
|
|
|
1965 |
return gr.update(), gr.update(), tts_future_obj
|
1966 |
|
1967 |
start_interview_final_btn.click(
|
1968 |
+
start_interview_immediate,
|
1969 |
+
[user_data],
|
1970 |
+
[interview_state, tts_future, interview_pre_section, interview_section, loading_status, question_audio, question_text]
|
1971 |
+
).then(
|
1972 |
+
# Check TTS status every 500ms
|
1973 |
+
check_tts_ready,
|
1974 |
+
[interview_state, tts_future],
|
1975 |
+
[question_audio, loading_status, tts_future],
|
1976 |
+
every=0.5
|
1977 |
)
|
1978 |
|
1979 |
def transcribe(audio_path):
|
|
|
2059 |
lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, stt_transcript]
|
2060 |
)
|
2061 |
|
2062 |
+
demo.launch(debug=True)
|
|