reab5555 committed
Commit b9f27c7 · verified · 1 Parent(s): 55553bb

Update diarization.py

Files changed (1)
  1. diarization.py +24 -4
diarization.py CHANGED
@@ -43,9 +43,13 @@ class LazyTranscriptionPipeline:
             model=self.model,
             tokenizer=self.processor.tokenizer,
             feature_extractor=self.processor.feature_extractor,
+            max_new_tokens=128,
             chunk_length_s=30,
+            batch_size=1,
             return_timestamps=True,
-            device=torch.device("cuda")
+            torch_dtype=torch.float16,
+            device=torch.device("cuda"),
+            generate_kwargs={"language": language}
         )
         return self.pipe
 
@@ -74,6 +78,9 @@ def transcribe_audio(audio_path, language):
         end = min((i + 1) * 30 * sr, len(audio))
         audio_chunk = audio[start:end]
 
+        # Convert the audio chunk to float32 numpy array
+        audio_chunk = (audio_chunk * 32767).astype(np.float32)
+
         result = pipe(audio_chunk)
         transcription_txt += result["text"]
         for chunk in result["chunks"]:
@@ -84,6 +91,8 @@ def transcribe_audio(audio_path, language):
                 "text": chunk["text"]
             })
 
+        print(f"Transcription Progress: {int(((i + 1) / n_chunks) * 100)}%")
+
     return transcription_txt, transcription_chunks
 
 def create_combined_srt(transcription_chunks, diarization, output_path):
@@ -102,19 +111,21 @@ def create_combined_srt(transcription_chunks, diarization, output_path):
             start_time, end_time = chunk["start"], chunk["end"]
             text = chunk["text"]
 
+            # Find the corresponding speaker
             current_speaker = "Unknown"
             for seg_start, seg_end, speaker in speaker_segments:
                 if seg_start <= start_time < seg_end:
                     current_speaker = speaker
                     break
 
+            # Format timecodes as h:mm:ss (without leading zeros for hours)
             start_str = format_timestamp(start_time).split('.')[0].lstrip('0')
             end_str = format_timestamp(end_time).split('.')[0].lstrip('0')
 
             srt_file.write(f"{i}\n")
-            srt_file.write(f"{start_str} --> {end_str}\n")
-            srt_file.write(f"{current_speaker}: {text}\n\n")
+            srt_file.write(f"{{{current_speaker}}}\n time: ({start_str} --> {end_str})\n text: {text}\n\n")
 
+        # Add dominant speaker information
         speaker_durations = defaultdict(float)
         for seg_start, seg_end, speaker in speaker_segments:
             speaker_durations[speaker] += seg_end - seg_start
@@ -132,14 +143,23 @@ def process_video(video_path, diarization_access_token, language):
     audio_path = f"{base_name}.wav"
     extract_audio(video_path, audio_path)
 
+    # Diarization
+    print("Performing diarization...")
     pipeline = lazy_diarization_pipeline.get_pipeline(diarization_access_token)
     diarization = pipeline(audio_path)
+    print("Diarization complete.")
 
+    # Transcription
+    print("Performing transcription...")
     transcription, chunks = transcribe_audio(audio_path, language)
+    print("Transcription complete.")
 
+    # Create combined SRT file
     combined_srt_path = f"{base_name}_combined.srt"
     create_combined_srt(chunks, diarization, combined_srt_path)
+    print(f"Combined SRT file created and saved to {combined_srt_path}")
 
+    # Clean up
     os.remove(audio_path)
 
-    return combined_srt_path
+    return combined_srt_path
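For readers who want to reproduce the reconfigured ASR pipeline from the first hunk outside this repo, here is a minimal, self-contained sketch. Only the keyword arguments come from this commit; the Whisper checkpoint id, the processor/model loading, and the hard-coded language value are illustrative assumptions.

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Assumption: the commit does not name a checkpoint; any Whisper model id works here.
model_id = "openai/whisper-large-v3"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float16)

language = "en"  # assumption: in the repo this arrives from the pipeline's caller

asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,          # cap tokens generated per chunk
    chunk_length_s=30,           # Whisper's native 30 s window
    batch_size=1,                # one chunk at a time keeps GPU memory low
    return_timestamps=True,      # timestamps are needed to align with diarization
    torch_dtype=torch.float16,   # half precision, matching the loaded weights
    device=torch.device("cuda"),
    generate_kwargs={"language": language},  # pin the decoding language
)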
 
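And a short usage sketch of the module's entry point as it stands after this commit. `process_video` and its signature come from the diff above; the file name, token, and language code below are placeholders.

from diarization import process_video  # assumes diarization.py is on the import path

# Placeholder arguments: an input video, an access token for the gated
# diarization pipeline, and the language code forwarded to Whisper.
srt_path = process_video("interview.mp4", "hf_xxxxxxxx", "en")
print(f"Combined subtitle file: {srt_path}")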