Spaces:
Runtime error
Runtime error
Update transcription_diarization.py
Browse files- transcription_diarization.py +32 -11
transcription_diarization.py
CHANGED
@@ -15,22 +15,30 @@ def upload_to_s3(local_file_path, bucket_name, s3_file_key):
|
|
15 |
s3_client.upload_file(local_file_path, bucket_name, s3_file_key)
|
16 |
return f's3://{bucket_name}/{s3_file_key}'
|
17 |
|
18 |
-
def transcribe_video(file_uri, job_name):
|
19 |
transcribe = boto3.client('transcribe',
|
20 |
aws_access_key_id=aws_access_key_id,
|
21 |
aws_secret_access_key=aws_secret_access_key,
|
22 |
region_name='eu-central-1')
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
30 |
'ShowSpeakerLabels': True,
|
31 |
'MaxSpeakerLabels': 4
|
32 |
}
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
while True:
|
36 |
status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
|
@@ -100,14 +108,27 @@ def extract_transcriptions_with_speakers(transcript_data):
|
|
100 |
|
101 |
return transcriptions
|
102 |
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
bucket_name = 'transcriptionjobbucket'
|
106 |
s3_file_key = os.path.basename(video_path)
|
107 |
file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
|
108 |
|
109 |
job_name = f'transcription_job_{int(time.time())}'
|
110 |
-
transcript_url = transcribe_video(file_uri, job_name)
|
111 |
|
112 |
if transcript_url:
|
113 |
transcript_data = download_transcript(transcript_url)
|
|
|
15 |
s3_client.upload_file(local_file_path, bucket_name, s3_file_key)
|
16 |
return f's3://{bucket_name}/{s3_file_key}'
|
17 |
|
18 |
+
def transcribe_video(file_uri, job_name, language):
|
19 |
transcribe = boto3.client('transcribe',
|
20 |
aws_access_key_id=aws_access_key_id,
|
21 |
aws_secret_access_key=aws_secret_access_key,
|
22 |
region_name='eu-central-1')
|
23 |
|
24 |
+
language_code = get_language_code(language)
|
25 |
+
|
26 |
+
job_params = {
|
27 |
+
'TranscriptionJobName': job_name,
|
28 |
+
'Media': {'MediaFileUri': file_uri},
|
29 |
+
'MediaFormat': 'mp4',
|
30 |
+
'Settings': {
|
31 |
'ShowSpeakerLabels': True,
|
32 |
'MaxSpeakerLabels': 4
|
33 |
}
|
34 |
+
}
|
35 |
+
|
36 |
+
if language_code:
|
37 |
+
job_params['LanguageCode'] = language_code
|
38 |
+
else:
|
39 |
+
job_params['IdentifyLanguage'] = True
|
40 |
+
|
41 |
+
transcribe.start_transcription_job(**job_params)
|
42 |
|
43 |
while True:
|
44 |
status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
|
|
|
108 |
|
109 |
return transcriptions
|
110 |
|
111 |
+
def get_language_code(language):
    """Translate a human-readable language name into a transcription language code.

    The lookup ignores letter case and surrounding whitespace, so
    "hebrew", "Hebrew" and " HEBREW " all resolve to "he-IL".

    Args:
        language: Display name of the language (e.g. "English", "Hebrew"),
            or "Auto-detect".

    Returns:
        The matching language code string, or None for "Auto-detect"
        (transcribe_video treats a falsy code as a request to set
        IdentifyLanguage instead of LanguageCode). Names that are not in
        the table, and non-string inputs, fall back to "en-US".
    """
    default_code = "en-US"

    # Keys are stored casefolded so the lookup below can be case-insensitive.
    language_codes = {
        "english": "en-US",
        "hebrew": "he-IL",
        "arabic": "ar-SA",
        "french": "fr-FR",
        "german": "de-DE",
        "italian": "it-IT",
        "japanese": "ja-JP",
        "chinese": "zh-CN",
        "auto-detect": None,
    }

    # Non-string input (e.g. None from an unset UI field) cannot name a
    # language; keep the historical default rather than raising.
    if not isinstance(language, str):
        return default_code

    return language_codes.get(language.strip().casefold(), default_code)
|
124 |
+
|
125 |
+
def diarize_audio(video_path, language):
|
126 |
bucket_name = 'transcriptionjobbucket'
|
127 |
s3_file_key = os.path.basename(video_path)
|
128 |
file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
|
129 |
|
130 |
job_name = f'transcription_job_{int(time.time())}'
|
131 |
+
transcript_url = transcribe_video(file_uri, job_name, language)
|
132 |
|
133 |
if transcript_url:
|
134 |
transcript_data = download_transcript(transcript_url)
|