Spaces:
Runtime error
Runtime error
Update transcription_diarization.py
Browse files
- transcription_diarization.py +11 -32
transcription_diarization.py
CHANGED
|
@@ -15,30 +15,22 @@ def upload_to_s3(local_file_path, bucket_name, s3_file_key):
|
|
| 15 |
s3_client.upload_file(local_file_path, bucket_name, s3_file_key)
|
| 16 |
return f's3://{bucket_name}/{s3_file_key}'
|
| 17 |
|
| 18 |
-
def transcribe_video(file_uri, job_name
|
| 19 |
transcribe = boto3.client('transcribe',
|
| 20 |
aws_access_key_id=aws_access_key_id,
|
| 21 |
aws_secret_access_key=aws_secret_access_key,
|
| 22 |
region_name='eu-central-1')
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
'
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
'Settings': {
|
| 31 |
'ShowSpeakerLabels': True,
|
| 32 |
'MaxSpeakerLabels': 4
|
| 33 |
}
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
if language_code:
|
| 37 |
-
job_params['LanguageCode'] = language_code
|
| 38 |
-
else:
|
| 39 |
-
job_params['IdentifyLanguage'] = True
|
| 40 |
-
|
| 41 |
-
transcribe.start_transcription_job(**job_params)
|
| 42 |
|
| 43 |
while True:
|
| 44 |
status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
|
|
@@ -108,27 +100,14 @@ def extract_transcriptions_with_speakers(transcript_data):
|
|
| 108 |
|
| 109 |
return transcriptions
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
"English": "en-US",
|
| 114 |
-
"Hebrew": "he-IL",
|
| 115 |
-
"Arabic": "ar-SA",
|
| 116 |
-
"French": "fr-FR",
|
| 117 |
-
"German": "de-DE",
|
| 118 |
-
"Italian": "it-IT",
|
| 119 |
-
"Japanese": "ja-JP",
|
| 120 |
-
"Chinese": "zh-CN",
|
| 121 |
-
"Auto-detect": None
|
| 122 |
-
}
|
| 123 |
-
return language_codes.get(language, "en-US")
|
| 124 |
-
|
| 125 |
-
def diarize_audio(video_path, language):
|
| 126 |
bucket_name = 'transcriptionjobbucket'
|
| 127 |
s3_file_key = os.path.basename(video_path)
|
| 128 |
file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
|
| 129 |
|
| 130 |
job_name = f'transcription_job_{int(time.time())}'
|
| 131 |
-
transcript_url = transcribe_video(file_uri, job_name
|
| 132 |
|
| 133 |
if transcript_url:
|
| 134 |
transcript_data = download_transcript(transcript_url)
|
|
|
|
| 15 |
s3_client.upload_file(local_file_path, bucket_name, s3_file_key)
|
| 16 |
return f's3://{bucket_name}/{s3_file_key}'
|
| 17 |
|
| 18 |
+
def transcribe_video(file_uri, job_name):
|
| 19 |
transcribe = boto3.client('transcribe',
|
| 20 |
aws_access_key_id=aws_access_key_id,
|
| 21 |
aws_secret_access_key=aws_secret_access_key,
|
| 22 |
region_name='eu-central-1')
|
| 23 |
|
| 24 |
+
transcribe.start_transcription_job(
|
| 25 |
+
TranscriptionJobName=job_name,
|
| 26 |
+
Media={'MediaFileUri': file_uri},
|
| 27 |
+
MediaFormat='mp4',
|
| 28 |
+
IdentifyLanguage=True,
|
| 29 |
+
Settings={
|
|
|
|
| 30 |
'ShowSpeakerLabels': True,
|
| 31 |
'MaxSpeakerLabels': 4
|
| 32 |
}
|
| 33 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
while True:
|
| 36 |
status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
|
|
|
|
| 100 |
|
| 101 |
return transcriptions
|
| 102 |
|
| 103 |
+
|
| 104 |
+
def diarize_audio(video_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
bucket_name = 'transcriptionjobbucket'
|
| 106 |
s3_file_key = os.path.basename(video_path)
|
| 107 |
file_uri = upload_to_s3(video_path, bucket_name, s3_file_key)
|
| 108 |
|
| 109 |
job_name = f'transcription_job_{int(time.time())}'
|
| 110 |
+
transcript_url = transcribe_video(file_uri, job_name)
|
| 111 |
|
| 112 |
if transcript_url:
|
| 113 |
transcript_data = download_transcript(transcript_url)
|