reab5555 committed on
Commit
a9fd016
·
verified ·
1 Parent(s): 6cd715f

Update transcription_diarization.py

Browse files
Files changed (1) hide show
  1. transcription_diarization.py +9 -53
transcription_diarization.py CHANGED
@@ -39,54 +39,18 @@ def transcribe_video(file_uri, job_name, max_speakers):
39
  time.sleep(30)
40
 
41
  if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED':
42
- transcript_url = status['TranscriptionJob']['Transcript']['TranscriptFileUri']
43
- return transcript_url
44
  else:
45
  return None
46
 
47
  def download_transcript(transcript_url):
48
- print(f"Attempting to download transcript from URL: {transcript_url}")
49
-
50
  try:
51
- # Try to download directly using requests
52
  response = requests.get(transcript_url)
53
- response.raise_for_status() # Raises an HTTPError for bad responses
54
- transcript_content = response.text
55
- return json.loads(transcript_content)
56
- except requests.RequestException as e:
57
- print(f"Failed to download transcript directly: {e}")
58
-
59
- # If direct download fails, try using S3 client
60
- try:
61
- s3_client = boto3.client('s3',
62
- aws_access_key_id=aws_access_key_id,
63
- aws_secret_access_key=aws_secret_access_key,
64
- region_name='eu-central-1')
65
-
66
- # Parse the URL
67
- parsed_url = urllib.parse.urlparse(transcript_url)
68
-
69
- # Extract bucket name and key
70
- bucket_name = parsed_url.netloc.split('.')[0]
71
- key = urllib.parse.unquote(parsed_url.path.lstrip('/'))
72
-
73
- response = s3_client.get_object(Bucket=bucket_name, Key=key)
74
- transcript_content = response['Body'].read().decode('utf-8')
75
- return json.loads(transcript_content)
76
- except ClientError as e:
77
- error_code = e.response['Error']['Code']
78
- error_message = e.response['Error']['Message']
79
- print(f"S3 ClientError: {error_code} - {error_message}")
80
- if error_code == 'AccessDenied':
81
- print("Access Denied. Please check your AWS credentials and bucket permissions.")
82
- elif error_code == 'NoSuchKey':
83
- print(f"The file {key} does not exist in the bucket {bucket_name}")
84
- else:
85
- print(f"An unexpected error occurred: {e}")
86
- except Exception as e:
87
- print(f"An unexpected error occurred: {e}")
88
-
89
- return None
90
 
91
  def extract_transcriptions_with_speakers(transcript_data):
92
  segments = transcript_data['results']['speaker_labels']['segments']
@@ -125,33 +89,25 @@ def extract_transcriptions_with_speakers(transcript_data):
125
 
126
  return transcriptions
127
 
128
- def process_video(video_path, bucket_name, max_speakers):
129
- # Upload video to S3
130
  s3_file_key = os.path.basename(video_path)
131
  file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
132
 
133
- # Start transcription job
134
  job_name = f'transcription_job_{int(time.time())}'
135
  transcript_url = transcribe_video(file_uri, job_name, max_speakers)
136
 
137
  if transcript_url:
138
- # Download and process transcript
139
  transcript_data = download_transcript(transcript_url)
140
  if transcript_data is None:
141
  return "Failed to download transcript."
142
 
143
  transcriptions = extract_transcriptions_with_speakers(transcript_data)
144
 
145
- # Create combined SRT-like output
146
  output = []
147
  for i, trans in enumerate(transcriptions, 1):
148
  output.append(f"[{i}. {trans['speaker']} | text: {trans['text']}]\n")
149
 
150
  return '\n'.join(output)
151
  else:
152
- return "Transcription failed."
153
-
154
- # This function will be called from the Gradio app
155
- def diarize_audio(video_path, max_speakers):
156
- bucket_name = 'transcriptionjobbucket' # Replace with your actual S3 bucket name
157
- return process_video(video_path, bucket_name, max_speakers)
 
39
  time.sleep(30)
40
 
41
  if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED':
42
+ return status['TranscriptionJob']['Transcript']['TranscriptFileUri']
 
43
  else:
44
  return None
45
 
46
  def download_transcript(transcript_url):
 
 
47
  try:
 
48
  response = requests.get(transcript_url)
49
+ response.raise_for_status()
50
+ return json.loads(response.text)
51
+ except Exception as e:
52
+ print(f"Error downloading transcript: {e}")
53
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  def extract_transcriptions_with_speakers(transcript_data):
56
  segments = transcript_data['results']['speaker_labels']['segments']
 
89
 
90
  return transcriptions
91
 
92
+ def diarize_audio(video_path, max_speakers):
93
+ bucket_name = 'transcriptionjobbucket'
94
  s3_file_key = os.path.basename(video_path)
95
  file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
96
 
 
97
  job_name = f'transcription_job_{int(time.time())}'
98
  transcript_url = transcribe_video(file_uri, job_name, max_speakers)
99
 
100
  if transcript_url:
 
101
  transcript_data = download_transcript(transcript_url)
102
  if transcript_data is None:
103
  return "Failed to download transcript."
104
 
105
  transcriptions = extract_transcriptions_with_speakers(transcript_data)
106
 
 
107
  output = []
108
  for i, trans in enumerate(transcriptions, 1):
109
  output.append(f"[{i}. {trans['speaker']} | text: {trans['text']}]\n")
110
 
111
  return '\n'.join(output)
112
  else:
113
+ return "Transcription failed."