Update app.py
Browse files
app.py
CHANGED
|
@@ -1729,7 +1729,6 @@ def extract_candidate_details(file_path):
|
|
| 1729 |
# )
|
| 1730 |
|
| 1731 |
# demo.launch(debug=True)
|
| 1732 |
-
|
| 1733 |
import gradio as gr
|
| 1734 |
import time
|
| 1735 |
import tempfile
|
|
@@ -1757,14 +1756,26 @@ bark_voice_preset = "v2/en_speaker_9"
|
|
| 1757 |
# Thread pool for async operations
|
| 1758 |
executor = ThreadPoolExecutor(max_workers=2)
|
| 1759 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1760 |
def load_models_lazy():
|
| 1761 |
"""Load models only when needed"""
|
| 1762 |
global model_bark, processor_bark, whisper_model
|
| 1763 |
|
|
|
|
|
|
|
|
|
|
| 1764 |
if model_bark is None:
|
| 1765 |
print("π Loading Bark model...")
|
| 1766 |
-
model_bark = BarkModel.from_pretrained("suno/bark").to(
|
| 1767 |
-
print("β
Bark model loaded")
|
| 1768 |
|
| 1769 |
if processor_bark is None:
|
| 1770 |
print("π Loading Bark processor...")
|
|
@@ -1773,16 +1784,22 @@ def load_models_lazy():
|
|
| 1773 |
|
| 1774 |
if whisper_model is None:
|
| 1775 |
print("π Loading Whisper model...")
|
| 1776 |
-
whisper_model = whisper.load_model("base", device=
|
| 1777 |
-
print("β
Whisper model loaded")
|
| 1778 |
|
| 1779 |
def bark_tts_async(text):
|
| 1780 |
"""Async TTS generation"""
|
| 1781 |
def _generate():
|
| 1782 |
load_models_lazy() # Load only when needed
|
| 1783 |
print(f"π Synthesizing TTS for: {text}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1784 |
inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
|
| 1785 |
-
input_ids = inputs["input_ids"].to(
|
|
|
|
| 1786 |
start = time.time()
|
| 1787 |
with torch.no_grad():
|
| 1788 |
speech_values = model_bark.generate(
|
|
@@ -1791,7 +1808,8 @@ def bark_tts_async(text):
|
|
| 1791 |
fine_temperature=0.4,
|
| 1792 |
coarse_temperature=0.8
|
| 1793 |
)
|
| 1794 |
-
print(f"β
Bark finished in {round(time.time() - start, 2)}s")
|
|
|
|
| 1795 |
speech = speech_values.cpu().numpy().squeeze()
|
| 1796 |
speech = (speech * 32767).astype(np.int16)
|
| 1797 |
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
|
@@ -1806,6 +1824,11 @@ def whisper_stt(audio_path):
|
|
| 1806 |
return ""
|
| 1807 |
|
| 1808 |
load_models_lazy() # Load only when needed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1809 |
result = whisper_model.transcribe(audio_path)
|
| 1810 |
return result["text"]
|
| 1811 |
|
|
@@ -1942,9 +1965,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 1942 |
return gr.update(), gr.update(), tts_future_obj
|
| 1943 |
|
| 1944 |
start_interview_final_btn.click(
|
| 1945 |
-
start_interview_immediate,
|
| 1946 |
-
[user_data],
|
| 1947 |
-
[interview_state, interview_pre_section, interview_section, question_audio, question_text]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1948 |
)
|
| 1949 |
|
| 1950 |
def transcribe(audio_path):
|
|
@@ -2030,5 +2059,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 2030 |
lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, stt_transcript]
|
| 2031 |
)
|
| 2032 |
|
| 2033 |
-
demo.launch(debug=True)
|
| 2034 |
-
|
|
|
|
| 1729 |
# )
|
| 1730 |
|
| 1731 |
# demo.launch(debug=True)
|
|
|
|
| 1732 |
import gradio as gr
|
| 1733 |
import time
|
| 1734 |
import tempfile
|
|
|
|
| 1756 |
# Thread pool for async operations
|
| 1757 |
executor = ThreadPoolExecutor(max_workers=2)
|
| 1758 |
|
| 1759 |
+
# Startup GPU probe: report CUDA availability once at import time and pin
# device 0 when a GPU is present. (Reconstructed from a diff-view paste;
# indentation re-applied.)
# NOTE(review): the emoji literals below appear mojibake'd in this paste —
# they are kept byte-for-byte here; verify the originals against the repo.
_cuda_ok = torch.cuda.is_available()
if not _cuda_ok:
    print("β οΈ CUDA not available, using CPU")
else:
    print(f"π₯ CUDA Available: {torch.cuda.get_device_name(0)}")
    print(f"π₯ CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    # Make GPU 0 the default device for all subsequent CUDA work.
    torch.cuda.set_device(0)
|
| 1767 |
+
|
| 1768 |
def load_models_lazy():
|
| 1769 |
"""Load models only when needed"""
|
| 1770 |
global model_bark, processor_bark, whisper_model
|
| 1771 |
|
| 1772 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 1773 |
+
print(f"π Using device: {device}")
|
| 1774 |
+
|
| 1775 |
if model_bark is None:
|
| 1776 |
print("π Loading Bark model...")
|
| 1777 |
+
model_bark = BarkModel.from_pretrained("suno/bark").to(device)
|
| 1778 |
+
print(f"β
Bark model loaded on {device}")
|
| 1779 |
|
| 1780 |
if processor_bark is None:
|
| 1781 |
print("π Loading Bark processor...")
|
|
|
|
| 1784 |
|
| 1785 |
if whisper_model is None:
|
| 1786 |
print("π Loading Whisper model...")
|
| 1787 |
+
whisper_model = whisper.load_model("base", device=device)
|
| 1788 |
+
print(f"β
Whisper model loaded on {device}")
|
| 1789 |
|
| 1790 |
def bark_tts_async(text):
|
| 1791 |
"""Async TTS generation"""
|
| 1792 |
def _generate():
|
| 1793 |
load_models_lazy() # Load only when needed
|
| 1794 |
print(f"π Synthesizing TTS for: {text}")
|
| 1795 |
+
|
| 1796 |
+
# Ensure we're using the correct device
|
| 1797 |
+
device = next(model_bark.parameters()).device
|
| 1798 |
+
print(f"π Bark model is on device: {device}")
|
| 1799 |
+
|
| 1800 |
inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
|
| 1801 |
+
input_ids = inputs["input_ids"].to(device) # Move to same device as model
|
| 1802 |
+
|
| 1803 |
start = time.time()
|
| 1804 |
with torch.no_grad():
|
| 1805 |
speech_values = model_bark.generate(
|
|
|
|
| 1808 |
fine_temperature=0.4,
|
| 1809 |
coarse_temperature=0.8
|
| 1810 |
)
|
| 1811 |
+
print(f"β
Bark finished in {round(time.time() - start, 2)}s on {device}")
|
| 1812 |
+
|
| 1813 |
speech = speech_values.cpu().numpy().squeeze()
|
| 1814 |
speech = (speech * 32767).astype(np.int16)
|
| 1815 |
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
|
|
|
| 1824 |
return ""
|
| 1825 |
|
| 1826 |
load_models_lazy() # Load only when needed
|
| 1827 |
+
|
| 1828 |
+
# Check what device Whisper is actually using
|
| 1829 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 1830 |
+
print(f"π Whisper transcribing on {device}")
|
| 1831 |
+
|
| 1832 |
result = whisper_model.transcribe(audio_path)
|
| 1833 |
return result["text"]
|
| 1834 |
|
|
|
|
| 1965 |
return gr.update(), gr.update(), tts_future_obj
|
| 1966 |
|
| 1967 |
start_interview_final_btn.click(
|
| 1968 |
+
start_interview_immediate,
|
| 1969 |
+
[user_data],
|
| 1970 |
+
[interview_state, tts_future, interview_pre_section, interview_section, loading_status, question_audio, question_text]
|
| 1971 |
+
).then(
|
| 1972 |
+
# Check TTS status every 500ms
|
| 1973 |
+
check_tts_ready,
|
| 1974 |
+
[interview_state, tts_future],
|
| 1975 |
+
[question_audio, loading_status, tts_future],
|
| 1976 |
+
every=0.5
|
| 1977 |
)
|
| 1978 |
|
| 1979 |
def transcribe(audio_path):
|
|
|
|
| 2059 |
lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, stt_transcript]
|
| 2060 |
)
|
| 2061 |
|
| 2062 |
+
demo.launch(debug=True)
|
|
|