reab5555 committed on
Commit
3ad4c21
·
verified ·
1 Parent(s): 260acfd

Update transcription_diarization.py

Browse files
Files changed (1) hide show
  1. transcription_diarization.py +11 -32
transcription_diarization.py CHANGED
@@ -15,30 +15,22 @@ def upload_to_s3(local_file_path, bucket_name, s3_file_key):
15
  s3_client.upload_file(local_file_path, bucket_name, s3_file_key)
16
  return f's3://{bucket_name}/{s3_file_key}'
17
 
18
- def transcribe_video(file_uri, job_name, language):
19
  transcribe = boto3.client('transcribe',
20
  aws_access_key_id=aws_access_key_id,
21
  aws_secret_access_key=aws_secret_access_key,
22
  region_name='eu-central-1')
23
 
24
- language_code = get_language_code(language)
25
-
26
- job_params = {
27
- 'TranscriptionJobName': job_name,
28
- 'Media': {'MediaFileUri': file_uri},
29
- 'MediaFormat': 'mp4',
30
- 'Settings': {
31
  'ShowSpeakerLabels': True,
32
  'MaxSpeakerLabels': 4
33
  }
34
- }
35
-
36
- if language_code:
37
- job_params['LanguageCode'] = language_code
38
- else:
39
- job_params['IdentifyLanguage'] = True
40
-
41
- transcribe.start_transcription_job(**job_params)
42
 
43
  while True:
44
  status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
@@ -108,27 +100,14 @@ def extract_transcriptions_with_speakers(transcript_data):
108
 
109
  return transcriptions
110
 
111
- def get_language_code(language):
112
- language_codes = {
113
- "English": "en-US",
114
- "Hebrew": "he-IL",
115
- "Arabic": "ar-SA",
116
- "French": "fr-FR",
117
- "German": "de-DE",
118
- "Italian": "it-IT",
119
- "Japanese": "ja-JP",
120
- "Chinese": "zh-CN",
121
- "Auto-detect": None
122
- }
123
- return language_codes.get(language, "en-US")
124
-
125
- def diarize_audio(video_path, language):
126
  bucket_name = 'transcriptionjobbucket'
127
  s3_file_key = os.path.basename(video_path)
128
  file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
129
 
130
  job_name = f'transcription_job_{int(time.time())}'
131
- transcript_url = transcribe_video(file_uri, job_name, language)
132
 
133
  if transcript_url:
134
  transcript_data = download_transcript(transcript_url)
 
15
  s3_client.upload_file(local_file_path, bucket_name, s3_file_key)
16
  return f's3://{bucket_name}/{s3_file_key}'
17
 
18
+ def transcribe_video(file_uri, job_name):
19
  transcribe = boto3.client('transcribe',
20
  aws_access_key_id=aws_access_key_id,
21
  aws_secret_access_key=aws_secret_access_key,
22
  region_name='eu-central-1')
23
 
24
+ transcribe.start_transcription_job(
25
+ TranscriptionJobName=job_name,
26
+ Media={'MediaFileUri': file_uri},
27
+ MediaFormat='mp4',
28
+ IdentifyLanguage=True,
29
+ Settings={
 
30
  'ShowSpeakerLabels': True,
31
  'MaxSpeakerLabels': 4
32
  }
33
+ )
 
 
 
 
 
 
 
34
 
35
  while True:
36
  status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
 
100
 
101
  return transcriptions
102
 
103
+
104
+ def diarize_audio(video_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  bucket_name = 'transcriptionjobbucket'
106
  s3_file_key = os.path.basename(video_path)
107
  file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
108
 
109
  job_name = f'transcription_job_{int(time.time())}'
110
+ transcript_url = transcribe_video(file_uri, job_name)
111
 
112
  if transcript_url:
113
  transcript_data = download_transcript(transcript_url)