Spaces:

reab5555
/

Multiple-Speakers-Personality-Analyzer

Runtime error

App Files Files Community

reab5555 committed on Aug 10, 2024

Commit

818cd17

verified ·

1 Parent(s): 117dfa6

Update transcription_diarization.py

Browse files

Files changed (1) hide show

transcription_diarization.py +115 -184

transcription_diarization.py CHANGED Viewed

@@ -1,192 +1,123 @@
 import os
-import torch
-import gc
-from moviepy.editor import VideoFileClip
-from pyannote.audio import Pipeline
-import datetime
-from collections import defaultdict
-from openai import OpenAI
-from config import openai_api_key, hf_token
-from pydub import AudioSegment, silence
-import math
-client = OpenAI(api_key=openai_api_key)
-class LazyDiarizationPipeline:
-    def __init__(self):
-        self.pipeline = None
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    def get_pipeline(self, hf_token):
-        if self.pipeline is None:
-            self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
-                                                     use_auth_token=hf_token)
-            self.pipeline = self.pipeline.to(self.device)
-        return self.pipeline
-lazy_diarization_pipeline = LazyDiarizationPipeline()
-def extract_audio(video_path):
-    base_name = os.path.splitext(video_path)[0]
-    audio_path = f"{base_name}.wav"
-    video = VideoFileClip(video_path)
-    audio = video.audio
-    # Reduce audio quality to keep file size smaller
-    audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000, nbytes=2)
-    return audio_path
-def format_timestamp(seconds):
-    return str(datetime.timedelta(seconds=round(seconds))).zfill(8)
-def diarize_audio(audio_path, pipeline, max_speakers):
-    diarization = pipeline(audio_path, num_speakers=max_speakers)
-    return diarization
-def split_audio_on_silence(audio_path, min_silence_len=500, silence_thresh=-40, keep_silence=500):
-    audio = AudioSegment.from_wav(audio_path)
-    chunks = silence.split_on_silence(
-        audio,
-        min_silence_len=min_silence_len,
-        silence_thresh=silence_thresh,
-        keep_silence=keep_silence
     )
-    chunk_paths = []
-    for i, chunk in enumerate(chunks):
-        chunk_path = f"{audio_path[:-4]}_chunk_{i}.wav"
-        chunk.export(chunk_path, format="wav")
-        chunk_paths.append(chunk_path)
-    return chunk_paths
-def transcribe_audio(audio_path, language):
-    with open(audio_path, "rb") as audio_file:
-        transcript = client.audio.transcriptions.create(
-            file=audio_file,
-            model="whisper-1",
-            language=language,
-            response_format="verbose_json"
-        )
-    if not isinstance(transcript, dict):
-        transcript = transcript.model_dump()
-    transcription_txt = transcript.get("text", "")
-    transcription_chunks = []
-    for segment in transcript.get("segments", []):
-        transcription_chunks.append({
-            "start": segment.get("start", 0),
-            "end": segment.get("end", 0),
-            "text": segment.get("text", "")
         })
-    return transcription_txt, transcription_chunks
-def transcribe_large_audio(audio_path, language):
-    chunk_paths = split_audio_on_silence(audio_path)
-    transcription_txt = ""
-    transcription_chunks = []
-    for chunk_path in chunk_paths:
-        chunk_transcription_txt, chunk_transcription_chunks = transcribe_audio(chunk_path, language)
-        transcription_txt += chunk_transcription_txt
-        transcription_chunks.extend(chunk_transcription_chunks)
-        os.remove(chunk_path)
-    return transcription_txt, transcription_chunks
-def create_combined_srt(transcription_chunks, diarization, output_path, max_speakers):
-    speaker_segments = []
-    speaker_durations = defaultdict(float)
-    for segment, _, speaker in diarization.itertracks(yield_label=True):
-        speaker_durations[speaker] += segment.end - segment.start
-        speaker_segments.append((segment.start, segment.end, speaker))
-    sorted_speakers = sorted(speaker_durations.items(), key=lambda x: x[1], reverse=True)[:max_speakers]
-    speaker_map = {}
-    for i, (speaker, _) in enumerate(sorted_speakers, start=1):
-        speaker_map[speaker] = f"Speaker {i}"
-    with open(output_path, 'w', encoding='utf-8') as srt_file:
-        current_speaker = "Unknown"
-        current_text = ""
-        current_start = 0
-        current_end = 0
-        entry_count = 0
-        def write_entry():
-            nonlocal entry_count, current_speaker, current_start, current_end, current_text
-            if current_text:
-                entry_count += 1
-                start_str = format_timestamp(current_start)
-                end_str = format_timestamp(current_end)
-                srt_file.write(f"[{entry_count}. {current_speaker} | time: ({start_str} --> {end_str}) | text: {current_text.strip()}]\n\n")
-        for chunk in transcription_chunks:
-            start_time, end_time = chunk["start"], chunk["end"]
-            text = chunk["text"]
-            # Avoid splitting mid-sentence
-            if current_text and (text[0].isupper() or text.startswith(('.', '?', '!', '...'))):
-                write_entry()
-                current_speaker = "Unknown"
-                for seg_start, seg_end, speaker in speaker_segments:
-                    if seg_start <= start_time < seg_end:
-                        current_speaker = speaker_map.get(speaker, "Unknown")
-                        break
-                current_text = ""
-                current_start = start_time
-            current_text += " " + text
-            current_end = end_time
-            # Write entry if sentence ends with a punctuation mark
-            if current_text.strip().endswith(('.', '?', '!', '...')):
-                write_entry()
-                current_text = ""
-                current_start = end_time
-        write_entry()
-    with open(output_path, 'a', encoding='utf-8') as srt_file:
-        for i, (speaker, duration) in enumerate(sorted_speakers, start=1):
-            duration_str = format_timestamp(duration)
-            srt_file.write(f"Speaker {i} (originally {speaker}): total duration {duration_str}\n")
-def process_video(video_path, hf_token, language, max_speakers=3):
-    audio_path = extract_audio(video_path)
-    pipeline = lazy_diarization_pipeline.get_pipeline(hf_token)
-    diarization = diarize_audio(audio_path, pipeline, max_speakers)
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    gc.collect()
-    transcription, chunks = transcribe_large_audio(audio_path, language)
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    gc.collect()
-    combined_srt_path = f"{os.path.splitext(video_path)[0]}_combined.srt"
-    create_combined_srt(chunks, diarization, combined_srt_path, max_speakers)
-    os.remove(audio_path)
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    gc.collect()
-    # Convert the diarization results to a readable format
-    diarization_output = ""
-    for turn, _, speaker in diarization.itertracks(yield_label=True):
-        start_time = format_timestamp(turn.start)
-        end_time = format_timestamp(turn.end)
-        diarization_output += f"Speaker {speaker}: {start_time} --> {end_time}\n"
-    return combined_srt_path, diarization_output

+import boto3
+import time
+import json
 import os
+from config import aws_access_key_id, aws_secret_access_key
def upload_to_s3(local_file_path, bucket_name, s3_file_key):
    """Upload a local file to S3 and return the resulting ``s3://`` URI.

    Args:
        local_file_path: Path of the file on local disk.
        bucket_name: Name of the destination bucket.
        s3_file_key: Object key to store the file under.

    Returns:
        A string of the form ``s3://<bucket_name>/<s3_file_key>``.
    """
    # Credentials come from the project's config module; the region is fixed.
    client_options = {
        'aws_access_key_id': aws_access_key_id,
        'aws_secret_access_key': aws_secret_access_key,
        'region_name': 'eu-central-1',
    }
    uploader = boto3.client('s3', **client_options)
    uploader.upload_file(local_file_path, bucket_name, s3_file_key)
    return 's3://{}/{}'.format(bucket_name, s3_file_key)
def transcribe_video(file_uri, job_name, max_speakers, poll_interval=30):
    """Start an Amazon Transcribe job with speaker labels and wait for it.

    Args:
        file_uri: URI of the media file to transcribe (as returned by the
            upload step).
        job_name: Name to give the transcription job.
        max_speakers: Value passed as ``MaxSpeakerLabels``.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The ``TranscriptFileUri`` reported for the job when it completes,
        or ``None`` when the job ends in the ``FAILED`` state.
    """
    # MediaFormat values accepted by StartTranscriptionJob. The format is
    # taken from the file extension so non-mp4 uploads are not rejected for a
    # format mismatch; unknown extensions keep the previous 'mp4' default.
    supported_formats = {'amr', 'flac', 'm4a', 'mp3', 'mp4', 'ogg', 'wav', 'webm'}
    extension = os.path.splitext(file_uri)[1].lstrip('.').lower()
    media_format = extension if extension in supported_formats else 'mp4'

    transcribe = boto3.client('transcribe',
                              aws_access_key_id=aws_access_key_id,
                              aws_secret_access_key=aws_secret_access_key,
                              region_name='eu-central-1')
    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_uri},
        MediaFormat=media_format,
        IdentifyLanguage=True,
        Settings={
            'ShowSpeakerLabels': True,
            'MaxSpeakerLabels': max_speakers
        }
    )

    # Poll until the job reaches a terminal state.
    while True:
        job = transcribe.get_transcription_job(
            TranscriptionJobName=job_name)['TranscriptionJob']
        job_status = job['TranscriptionJobStatus']
        if job_status in ('COMPLETED', 'FAILED'):
            break
        print("Waiting for transcription to complete...")
        time.sleep(poll_interval)

    if job_status == 'COMPLETED':
        transcript_url = job['Transcript']['TranscriptFileUri']
        print("Transcription completed successfully!")
        return transcript_url

    print("Transcription failed.")
    # Surface the service's explanation, when present, to ease debugging.
    failure_reason = job.get('FailureReason')
    if failure_reason:
        print(f"Failure reason: {failure_reason}")
    return None
def download_transcript(transcript_url):
    """Fetch a transcript JSON document and return it as parsed data.

    Three URI shapes are handled:

    * ``http(s)://...?<query>`` - treated as a pre-signed, temporary link
      (what Amazon Transcribe returns when the job did not specify an output
      bucket) and downloaded with a plain HTTP GET.
    * ``http(s)://<host>/<bucket>/<key>`` without a query string - assumed
      to be a path-style S3 URL for a caller-owned bucket (TODO confirm
      against the URIs actually returned); read with ``get_object``.
    * ``s3://<bucket>/<key>`` - read with ``get_object``.

    Args:
        transcript_url: Location of the transcript file.

    Returns:
        The decoded JSON content of the transcript.

    Raises:
        ValueError: If the URI scheme is not ``http``, ``https`` or ``s3``.
    """
    from urllib.parse import unquote, urlparse
    from urllib.request import urlopen

    parsed = urlparse(transcript_url)
    scheme = parsed.scheme.lower()

    if scheme in ('http', 'https') and parsed.query:
        # The signature lives in the query string, so the URL must be
        # requested as-is rather than decomposed into bucket and key.
        with urlopen(transcript_url) as response:
            return json.loads(response.read().decode('utf-8'))

    if scheme == 's3':
        bucket_name = parsed.netloc
        key = parsed.path.lstrip('/')
    elif scheme in ('http', 'https'):
        # The host is the S3 endpoint; the bucket is the first path segment.
        bucket_name, _, key = unquote(parsed.path).lstrip('/').partition('/')
    else:
        raise ValueError(f"Unsupported transcript URI: {transcript_url!r}")

    s3_client = boto3.client('s3',
                             aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key,
                             region_name='eu-central-1')
    response = s3_client.get_object(Bucket=bucket_name, Key=key)
    transcript_content = response['Body'].read().decode('utf-8')
    return json.loads(transcript_content)
def extract_transcriptions_with_speakers(transcript_data):
    """Group transcript words into consecutive per-speaker utterances.

    Each ``pronunciation`` item is assigned to the first speaker segment
    whose time range fully contains it. A new utterance starts whenever the
    matched speaker differs from the current one; words with no matching
    segment stay with the current speaker. ``punctuation`` items are glued
    onto the preceding word.

    Args:
        transcript_data: Parsed transcript JSON with ``results.items`` and,
            optionally, ``results.speaker_labels.segments``.

    Returns:
        A list of ``{'speaker': <label or None>, 'text': <str>}`` dicts in
        transcript order.
    """
    results = transcript_data['results']
    # Convert segment bounds once instead of re-parsing them for every word.
    # A transcript without speaker labels simply yields no segments.
    segments = [
        (float(seg['start_time']), float(seg['end_time']), seg['speaker_label'])
        for seg in results.get('speaker_labels', {}).get('segments', [])
    ]

    current_speaker = None
    current_text = []
    transcriptions = []

    for item in results['items']:
        if item['type'] == 'pronunciation':
            start_time = float(item['start_time'])
            end_time = float(item['end_time'])
            content = item['alternatives'][0]['content']
            speaker = next(
                (label for seg_start, seg_end, label in segments
                 if seg_start <= start_time and seg_end >= end_time),
                None,
            )
            if speaker is not None and speaker != current_speaker:
                # Speaker changed: close the running utterance first.
                if current_text:
                    transcriptions.append({
                        'speaker': current_speaker,
                        'text': ' '.join(current_text)
                    })
                    current_text = []
                current_speaker = speaker
            current_text.append(content)
        elif item['type'] == 'punctuation':
            # Punctuation with no preceding word has nothing to attach to;
            # skip it rather than raising IndexError on an empty list.
            if current_text:
                current_text[-1] += item['alternatives'][0]['content']

    if current_text:
        transcriptions.append({
            'speaker': current_speaker,
            'text': ' '.join(current_text)
        })
    return transcriptions
def process_video(video_path, bucket_name, max_speakers):
    """Upload a video, transcribe it with speaker labels, and format the result.

    Args:
        video_path: Local path of the video file.
        bucket_name: S3 bucket the video is uploaded to.
        max_speakers: Maximum number of speakers to label.

    Returns:
        One ``[<n>. <speaker> | text: <text>]`` entry per utterance, separated
        by blank lines, or the string ``"Transcription failed."`` when no
        transcript URL was produced.
    """
    import uuid

    # A timestamp alone repeats when two requests arrive within the same
    # second; the random suffix keeps job names unique.
    job_name = f'transcription_job_{int(time.time())}_{uuid.uuid4().hex}'

    # Upload video to S3 under a per-job prefix so files sharing a basename
    # do not overwrite one another.
    s3_file_key = f'{job_name}/{os.path.basename(video_path)}'
    file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)

    # Start transcription job
    transcript_url = transcribe_video(file_uri, job_name, max_speakers)
    if not transcript_url:
        return "Transcription failed."

    # Download and process transcript
    transcript_data = download_transcript(transcript_url)
    transcriptions = extract_transcriptions_with_speakers(transcript_data)

    # Create combined SRT-like output
    return '\n'.join(
        f"[{i}. {trans['speaker']} | text: {trans['text']}]\n"
        for i, trans in enumerate(transcriptions, 1)
    )
# This function will be called from the Gradio app
def diarize_audio(video_path, max_speakers, bucket_name='transcriptionjobbucket'):
    """Transcribe a video with speaker labels via ``process_video``.

    Args:
        video_path: Local path of the video file.
        max_speakers: Maximum number of speakers to label.
        bucket_name: S3 bucket used for the upload. The default is the
            previously hard-coded placeholder; pass your actual bucket name
            to override it.

    Returns:
        Whatever ``process_video`` returns for these arguments.
    """
    return process_video(video_path, bucket_name, max_speakers)