reab5555 committed on
Commit
260acfd
·
verified ·
1 Parent(s): a141f18

Update transcription_diarization.py

Browse files
Files changed (1) hide show
  1. transcription_diarization.py +32 -11
transcription_diarization.py CHANGED
@@ -15,22 +15,30 @@ def upload_to_s3(local_file_path, bucket_name, s3_file_key):
15
  s3_client.upload_file(local_file_path, bucket_name, s3_file_key)
16
  return f's3://{bucket_name}/{s3_file_key}'
17
 
18
- def transcribe_video(file_uri, job_name):
19
  transcribe = boto3.client('transcribe',
20
  aws_access_key_id=aws_access_key_id,
21
  aws_secret_access_key=aws_secret_access_key,
22
  region_name='eu-central-1')
23
 
24
- transcribe.start_transcription_job(
25
- TranscriptionJobName=job_name,
26
- Media={'MediaFileUri': file_uri},
27
- MediaFormat='mp4',
28
- IdentifyLanguage=True,
29
- Settings={
 
30
  'ShowSpeakerLabels': True,
31
  'MaxSpeakerLabels': 4
32
  }
33
- )
 
 
 
 
 
 
 
34
 
35
  while True:
36
  status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
@@ -100,14 +108,27 @@ def extract_transcriptions_with_speakers(transcript_data):
100
 
101
  return transcriptions
102
 
103
-
104
- def diarize_audio(video_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  bucket_name = 'transcriptionjobbucket'
106
  s3_file_key = os.path.basename(video_path)
107
  file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
108
 
109
  job_name = f'transcription_job_{int(time.time())}'
110
- transcript_url = transcribe_video(file_uri, job_name)
111
 
112
  if transcript_url:
113
  transcript_data = download_transcript(transcript_url)
 
15
  s3_client.upload_file(local_file_path, bucket_name, s3_file_key)
16
  return f's3://{bucket_name}/{s3_file_key}'
17
 
18
+ def transcribe_video(file_uri, job_name, language):
19
  transcribe = boto3.client('transcribe',
20
  aws_access_key_id=aws_access_key_id,
21
  aws_secret_access_key=aws_secret_access_key,
22
  region_name='eu-central-1')
23
 
24
+ language_code = get_language_code(language)
25
+
26
+ job_params = {
27
+ 'TranscriptionJobName': job_name,
28
+ 'Media': {'MediaFileUri': file_uri},
29
+ 'MediaFormat': 'mp4',
30
+ 'Settings': {
31
  'ShowSpeakerLabels': True,
32
  'MaxSpeakerLabels': 4
33
  }
34
+ }
35
+
36
+ if language_code:
37
+ job_params['LanguageCode'] = language_code
38
+ else:
39
+ job_params['IdentifyLanguage'] = True
40
+
41
+ transcribe.start_transcription_job(**job_params)
42
 
43
  while True:
44
  status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
 
108
 
109
  return transcriptions
110
 
111
+ def get_language_code(language):
112
+ language_codes = {
113
+ "English": "en-US",
114
+ "Hebrew": "he-IL",
115
+ "Arabic": "ar-SA",
116
+ "French": "fr-FR",
117
+ "German": "de-DE",
118
+ "Italian": "it-IT",
119
+ "Japanese": "ja-JP",
120
+ "Chinese": "zh-CN",
121
+ "Auto-detect": None
122
+ }
123
+ return language_codes.get(language, "en-US")
124
+
125
+ def diarize_audio(video_path, language):
126
  bucket_name = 'transcriptionjobbucket'
127
  s3_file_key = os.path.basename(video_path)
128
  file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
129
 
130
  job_name = f'transcription_job_{int(time.time())}'
131
+ transcript_url = transcribe_video(file_uri, job_name, language)
132
 
133
  if transcript_url:
134
  transcript_data = download_transcript(transcript_url)