bluenevus committed on
Commit
170241f
·
verified ·
1 Parent(s): 256795b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -10
app.py CHANGED
@@ -11,6 +11,10 @@ from spellchecker import SpellChecker
11
  from pydub import AudioSegment
12
  import librosa
13
  import numpy as np
 
 
 
 
14
 
15
  # Check if CUDA is available and set the device
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -70,27 +74,35 @@ def format_transcript(transcript):
70
 
71
  def transcribe_audio(audio_file):
72
  try:
73
- # Load the entire audio file
 
 
 
74
  audio_input, sr = librosa.load(audio_file, sr=16000)
75
 
76
  # Convert to float32 numpy array
77
  audio_input = audio_input.astype(np.float32)
78
 
79
- # Process in chunks of 30 seconds with overlap
80
- chunk_length = 30 * sr
81
- overlap = 5 * sr # 5 seconds overlap
82
  transcriptions = []
 
83
 
84
- for i in range(0, len(audio_input), chunk_length - overlap):
85
- chunk = audio_input[i:i+chunk_length]
 
 
 
86
  input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
87
  predicted_ids = model.generate(input_features)
88
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
89
- transcriptions.extend(transcription)
 
 
 
 
 
 
90
 
91
- # Join all transcriptions
92
  full_transcription = " ".join(transcriptions)
93
-
94
  print(f"Full transcription length: {len(full_transcription)} characters")
95
  return full_transcription
96
  except Exception as e:
 
11
  from pydub import AudioSegment
12
  import librosa
13
  import numpy as np
14
+ from pyannote.audio import Pipeline
15
+
16
+ # Initialize the speaker diarization pipeline
17
+ pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
18
 
19
  # Check if CUDA is available and set the device
20
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
74
 
75
  def transcribe_audio(audio_file):
76
  try:
77
+ # Perform speaker diarization
78
+ diarization = pipeline(audio_file)
79
+
80
+ # Load the audio file
81
  audio_input, sr = librosa.load(audio_file, sr=16000)
82
 
83
  # Convert to float32 numpy array
84
  audio_input = audio_input.astype(np.float32)
85
 
 
 
 
86
  transcriptions = []
87
+ current_speaker = None
88
 
89
+ for turn, _, speaker in diarization.itertracks(yield_label=True):
90
+ start_sample = int(turn.start * sr)
91
+ end_sample = int(turn.end * sr)
92
+
93
+ chunk = audio_input[start_sample:end_sample]
94
  input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
95
  predicted_ids = model.generate(input_features)
96
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
97
+
98
+ if speaker != current_speaker:
99
+ if current_speaker is not None:
100
+ transcriptions.append("\n\n") # Add line break for new speaker
101
+ current_speaker = speaker
102
+
103
+ transcriptions.append(f"Speaker {speaker}: {transcription}")
104
 
 
105
  full_transcription = " ".join(transcriptions)
 
106
  print(f"Full transcription length: {len(full_transcription)} characters")
107
  return full_transcription
108
  except Exception as e: