bluenevus committed
Commit f1f904a (verified)
1 Parent(s): 28b8cb5

Update app.py

Files changed (1)
  1. app.py +22 -19
app.py CHANGED
@@ -79,36 +79,39 @@ def format_transcript(transcript):
 
 def transcribe_audio(audio_file):
     try:
-        # Perform speaker diarization
-        diarization = pipeline(audio_file)
-
-        # Load the audio file
+        # Load the entire audio file
+        print("Loading audio file...")
         audio_input, sr = librosa.load(audio_file, sr=16000)
 
         # Convert to float32 numpy array
         audio_input = audio_input.astype(np.float32)
 
+        print(f"Audio duration: {len(audio_input) / sr:.2f} seconds")
+
+        # Process in chunks of 30 seconds with overlap
+        chunk_length = 30 * sr
+        overlap = 5 * sr  # 5 seconds overlap
         transcriptions = []
-        current_speaker = None
 
-        for turn, _, speaker in diarization.itertracks(yield_label=True):
-            start_sample = int(turn.start * sr)
-            end_sample = int(turn.end * sr)
-
-            chunk = audio_input[start_sample:end_sample]
+        print("Starting transcription...")
+        for i in range(0, len(audio_input), chunk_length - overlap):
+            chunk = audio_input[i:i+chunk_length]
             input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
             predicted_ids = model.generate(input_features)
-            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-
-            if speaker != current_speaker:
-                if current_speaker is not None:
-                    transcriptions.append("\n\n")  # Add line break for new speaker
-                current_speaker = speaker
-
-            transcriptions.append(f"Speaker {speaker}: {transcription}")
+            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+            transcriptions.extend(transcription)
+            print(f"Processed {i / sr:.2f} to {(i + chunk_length) / sr:.2f} seconds")
 
+        # Join all transcriptions
         full_transcription = " ".join(transcriptions)
-        print(f"Full transcription length: {len(full_transcription)} characters")
+
+        print(f"Transcription complete. Full transcription length: {len(full_transcription)} characters")
+
+        # Apply spelling correction and formatting
+        print("Applying spelling correction and formatting...")
+        full_transcription = correct_spelling(full_transcription)
+        full_transcription = format_transcript(full_transcription)
+
         return full_transcription
     except Exception as e:
         print(f"Error in transcribe_audio: {str(e)}")