husseinelsaadi committed on
Commit
fa6324d
·
verified ·
1 Parent(s): 5468de9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -12
app.py CHANGED
@@ -1729,7 +1729,6 @@ def extract_candidate_details(file_path):
1729
  # )
1730
 
1731
  # demo.launch(debug=True)
1732
-
1733
  import gradio as gr
1734
  import time
1735
  import tempfile
@@ -1757,14 +1756,26 @@ bark_voice_preset = "v2/en_speaker_9"
1757
  # Thread pool for async operations
1758
  executor = ThreadPoolExecutor(max_workers=2)
1759
 
 
 
 
 
 
 
 
 
 
1760
  def load_models_lazy():
1761
  """Load models only when needed"""
1762
  global model_bark, processor_bark, whisper_model
1763
 
 
 
 
1764
  if model_bark is None:
1765
  print("πŸ” Loading Bark model...")
1766
- model_bark = BarkModel.from_pretrained("suno/bark").to("cuda" if torch.cuda.is_available() else "cpu")
1767
- print("βœ… Bark model loaded")
1768
 
1769
  if processor_bark is None:
1770
  print("πŸ” Loading Bark processor...")
@@ -1773,16 +1784,22 @@ def load_models_lazy():
1773
 
1774
  if whisper_model is None:
1775
  print("πŸ” Loading Whisper model...")
1776
- whisper_model = whisper.load_model("base", device="cuda")
1777
- print("βœ… Whisper model loaded")
1778
 
1779
  def bark_tts_async(text):
1780
  """Async TTS generation"""
1781
  def _generate():
1782
  load_models_lazy() # Load only when needed
1783
  print(f"πŸ” Synthesizing TTS for: {text}")
 
 
 
 
 
1784
  inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
1785
- input_ids = inputs["input_ids"].to(model_bark.device)
 
1786
  start = time.time()
1787
  with torch.no_grad():
1788
  speech_values = model_bark.generate(
@@ -1791,7 +1808,8 @@ def bark_tts_async(text):
1791
  fine_temperature=0.4,
1792
  coarse_temperature=0.8
1793
  )
1794
- print(f"βœ… Bark finished in {round(time.time() - start, 2)}s")
 
1795
  speech = speech_values.cpu().numpy().squeeze()
1796
  speech = (speech * 32767).astype(np.int16)
1797
  temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
@@ -1806,6 +1824,11 @@ def whisper_stt(audio_path):
1806
  return ""
1807
 
1808
  load_models_lazy() # Load only when needed
 
 
 
 
 
1809
  result = whisper_model.transcribe(audio_path)
1810
  return result["text"]
1811
 
@@ -1942,9 +1965,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
1942
  return gr.update(), gr.update(), tts_future_obj
1943
 
1944
  start_interview_final_btn.click(
1945
- start_interview_immediate,
1946
- [user_data],
1947
- [interview_state, interview_pre_section, interview_section, question_audio, question_text]
 
 
 
 
 
 
1948
  )
1949
 
1950
  def transcribe(audio_path):
@@ -2030,5 +2059,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
2030
  lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, stt_transcript]
2031
  )
2032
 
2033
- demo.launch(debug=True)
2034
-
 
1729
  # )
1730
 
1731
  # demo.launch(debug=True)
 
1732
  import gradio as gr
1733
  import time
1734
  import tempfile
 
1756
  # Thread pool for async operations
1757
  executor = ThreadPoolExecutor(max_workers=2)
1758
 
1759
+ # Add after your imports
1760
+ if torch.cuda.is_available():
1761
+ print(f"πŸ”₯ CUDA Available: {torch.cuda.get_device_name(0)}")
1762
+ print(f"πŸ”₯ CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
1763
+ # Set default device
1764
+ torch.cuda.set_device(0)
1765
+ else:
1766
+ print("⚠️ CUDA not available, using CPU")
1767
+
1768
  def load_models_lazy():
1769
  """Load models only when needed"""
1770
  global model_bark, processor_bark, whisper_model
1771
 
1772
+ device = "cuda" if torch.cuda.is_available() else "cpu"
1773
+ print(f"πŸ” Using device: {device}")
1774
+
1775
  if model_bark is None:
1776
  print("πŸ” Loading Bark model...")
1777
+ model_bark = BarkModel.from_pretrained("suno/bark").to(device)
1778
+ print(f"βœ… Bark model loaded on {device}")
1779
 
1780
  if processor_bark is None:
1781
  print("πŸ” Loading Bark processor...")
 
1784
 
1785
  if whisper_model is None:
1786
  print("πŸ” Loading Whisper model...")
1787
+ whisper_model = whisper.load_model("base", device=device)
1788
+ print(f"βœ… Whisper model loaded on {device}")
1789
 
1790
  def bark_tts_async(text):
1791
  """Async TTS generation"""
1792
  def _generate():
1793
  load_models_lazy() # Load only when needed
1794
  print(f"πŸ” Synthesizing TTS for: {text}")
1795
+
1796
+ # Ensure we're using the correct device
1797
+ device = next(model_bark.parameters()).device
1798
+ print(f"πŸ” Bark model is on device: {device}")
1799
+
1800
  inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
1801
+ input_ids = inputs["input_ids"].to(device) # Move to same device as model
1802
+
1803
  start = time.time()
1804
  with torch.no_grad():
1805
  speech_values = model_bark.generate(
 
1808
  fine_temperature=0.4,
1809
  coarse_temperature=0.8
1810
  )
1811
+ print(f"βœ… Bark finished in {round(time.time() - start, 2)}s on {device}")
1812
+
1813
  speech = speech_values.cpu().numpy().squeeze()
1814
  speech = (speech * 32767).astype(np.int16)
1815
  temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
 
1824
  return ""
1825
 
1826
  load_models_lazy() # Load only when needed
1827
+
1828
+ # Check what device Whisper is actually using
1829
+ device = "cuda" if torch.cuda.is_available() else "cpu"
1830
+ print(f"πŸ” Whisper transcribing on {device}")
1831
+
1832
  result = whisper_model.transcribe(audio_path)
1833
  return result["text"]
1834
 
 
1965
  return gr.update(), gr.update(), tts_future_obj
1966
 
1967
  start_interview_final_btn.click(
1968
+ start_interview_immediate,
1969
+ [user_data],
1970
+ [interview_state, tts_future, interview_pre_section, interview_section, loading_status, question_audio, question_text]
1971
+ ).then(
1972
+ # Check TTS status every 500ms
1973
+ check_tts_ready,
1974
+ [interview_state, tts_future],
1975
+ [question_audio, loading_status, tts_future],
1976
+ every=0.5
1977
  )
1978
 
1979
  def transcribe(audio_path):
 
2059
  lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, stt_transcript]
2060
  )
2061
 
2062
+ demo.launch(debug=True)