Rename diarization.py to transcription_diarization.py
diarization.py → transcription_diarization.py (RENAMED)
@@ -1,7 +1,7 @@
 import os
 import torch
 import math
-from moviepy.editor import VideoFileClip
+from moviepy.editor import VideoFileClip
 from pyannote.audio import Pipeline
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import librosa
@@ -21,8 +21,6 @@ class LazyDiarizationPipeline:
         self.pipeline = self.pipeline.to(torch.device("cuda"))
         return self.pipeline
 
-lazy_diarization_pipeline = LazyDiarizationPipeline()
-
 class LazyTranscriptionPipeline:
     def __init__(self):
         self.model = None
@@ -53,6 +51,7 @@ class LazyTranscriptionPipeline:
         )
         return self.pipe
 
+lazy_diarization_pipeline = LazyDiarizationPipeline()
 lazy_transcription_pipeline = LazyTranscriptionPipeline()
 
 def extract_audio(video_path, audio_path):
@@ -77,8 +76,6 @@ def transcribe_audio(audio_path, language):
         start = i * 30 * sr
         end = min((i + 1) * 30 * sr, len(audio))
         audio_chunk = audio[start:end]
-
-        # Convert the audio chunk to float32 numpy array
         audio_chunk = (audio_chunk * 32767).astype(np.float32)
 
         result = pipe(audio_chunk)
@@ -99,87 +96,57 @@ def create_combined_srt(transcription_chunks, diarization, output_path):
     speaker_segments = []
     speaker_durations = defaultdict(float)
 
-    # First pass: calculate durations
     for segment, _, speaker in diarization.itertracks(yield_label=True):
         speaker_durations[speaker] += segment.end - segment.start
         speaker_segments.append((segment.start, segment.end, speaker))
 
-    # Sort speakers by duration
     sorted_speakers = sorted(speaker_durations.items(), key=lambda x: x[1], reverse=True)
 
-    # Create mapping of original labels to Speaker 1, Speaker 2, etc.
     speaker_map = {}
     for i, (speaker, _) in enumerate(sorted_speakers, start=1):
         speaker_map[speaker] = f"Speaker {i}"
 
-    # Write the SRT content
     with open(output_path, 'w', encoding='utf-8') as srt_file:
         for i, chunk in enumerate(transcription_chunks, 1):
             start_time, end_time = chunk["start"], chunk["end"]
             text = chunk["text"]
 
-            # Find the corresponding speaker
             current_speaker = "Unknown"
             for seg_start, seg_end, speaker in speaker_segments:
                 if seg_start <= start_time < seg_end:
                     current_speaker = speaker_map[speaker]
                     break
 
-            # Format timecodes as h:mm:ss (without leading zeros for hours)
             start_str = format_timestamp(start_time).split('.')[0].lstrip('0')
             end_str = format_timestamp(end_time).split('.')[0].lstrip('0')
 
            srt_file.write(f"{i}\n")
            srt_file.write(f"{{{current_speaker}}}\n time: ({start_str} --> {end_str})\n text: {text}\n\n")
 
-    # Add information about the two most frequent speakers
     with open(output_path, 'a', encoding='utf-8') as srt_file:
         for i, (speaker, duration) in enumerate(sorted_speakers[:2], start=1):
             duration_str = format_timestamp(duration).split('.')[0].lstrip('0')
             srt_file.write(f"Speaker {i} (originally {speaker}): total duration {duration_str}\n")
 
-    # Calculate speaker durations
-    speaker_durations = defaultdict(float)
-    for seg_start, seg_end, speaker in speaker_segments:
-        speaker_durations[speaker] += seg_end - seg_start
-
-    # Find the two most frequent speakers
-    sorted_speakers = sorted(speaker_durations.items(), key=lambda x: x[1], reverse=True)
-
-    dominant_speaker, dominant_duration = sorted_speakers[0]
-    second_speaker, second_duration = sorted_speakers[1] if len(sorted_speakers) > 1 else (None, 0)
-
-    with open(output_path, 'a', encoding='utf-8') as srt_file:
-        dominant_duration_str = format_timestamp(dominant_duration).split('.')[0].lstrip('0')
-        srt_file.write(f"\nMost dominant speaker: {dominant_speaker} with total duration {dominant_duration_str}\n")
-
-        if second_speaker:
-            second_duration_str = format_timestamp(second_duration).split('.')[0].lstrip('0')
-            srt_file.write(f"Second most frequent speaker: {second_speaker} with total duration {second_duration_str}\n")
-
 @spaces.GPU(duration=100)
 def process_video(video_path, diarization_access_token, language):
     base_name = os.path.splitext(video_path)[0]
     audio_path = f"{base_name}.wav"
     extract_audio(video_path, audio_path)
 
-    # Diarization
     print("Performing diarization...")
     pipeline = lazy_diarization_pipeline.get_pipeline(diarization_access_token)
     diarization = pipeline(audio_path)
     print("Diarization complete.")
 
-    # Transcription
     print("Performing transcription...")
    transcription, chunks = transcribe_audio(audio_path, language)
     print("Transcription complete.")
 
-    # Create combined SRT file
     combined_srt_path = f"{base_name}_combined.srt"
     create_combined_srt(chunks, diarization, combined_srt_path)
     print(f"Combined SRT file created and saved to {combined_srt_path}")
 
-    # Clean up
     os.remove(audio_path)
 
     return combined_srt_path
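Since the module file was renamed, any script that imports it under the old name needs its import updated. Below is a minimal sketch of such a caller, assuming the module is imported directly by its new file name; the caller script, file path, token, and language value are illustrative assumptions, not part of this commit.

# Hypothetical caller update after the rename; path, token, and language are placeholders.
# Before the rename: from diarization import process_video
from transcription_diarization import process_video

srt_path = process_video(
    "example_video.mp4",                 # video to process (placeholder path)
    diarization_access_token="hf_xxx",   # Hugging Face token passed to the pyannote pipeline (placeholder)
    language="en",                       # language code forwarded to transcribe_audio
)
print(f"Speaker-labelled SRT written to {srt_path}")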