Roger Condori committed
Commit: e6b8403
Parent(s): e67b186

add base app
Browse files
- .github/workflows/main.yml +23 -0
- README.md +12 -2
- app.py +470 -0
- requirements.txt +11 -0
- soni_translate/audio_segments.py +27 -0
- soni_translate/text_to_speech.py +30 -0
- soni_translate/translate_segments.py +10 -0
.github/workflows/main.yml
ADDED
@@ -0,0 +1,23 @@
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Add remote
        env:
          HF: ${{ secrets.HF }}
        run: git remote add space https://r3gm:$HF@huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content
      - name: Push to hub
        env:
          HF: ${{ secrets.HF }}
        run: git push --force https://r3gm:$HF@huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content main
README.md
CHANGED
@@ -1,2 +1,12 @@
---
title: SoniTranslate_translate_audio_of_a_video_content
emoji: 🦀
colorFrom: indigo
colorTo: green
sdk: gradio
sdk_version: 3.35.2
app_file: app.py
pinned: true
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,470 @@
#os.system("git clone https://github.com/R3gm/SoniTranslate")
# pip install -r requirements.txt
import numpy as np
import gradio as gr
import whisperx
import torch
from gtts import gTTS
import librosa
import edge_tts
import asyncio
import gc
from pydub import AudioSegment
from tqdm import tqdm
from deep_translator import GoogleTranslator
import os
from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import make_voice_gradio
from soni_translate.translate_segments import translate_text
#from soni_translate import test

title = "<center><strong><font size='7'>📽️ SoniTranslate 🈷️</font></strong></center>"

news = """ ## 📖 News
🔥 2023/07/01: Support (Thanks for [text](https://github.com)).
"""

description = """ ## Translate the audio of a video from one language to another while preserving synchronization.

This is a demo of the GitHub project 📽️ [SoniTranslate](https://github.com/R3gm/SoniTranslate).

📼 You can upload a video or provide a video link. The generation is **limited to 10 seconds** to prevent errors with the queue on CPU. If you use a GPU, you won't have any of these limitations.

🚀 To **translate a video of any duration** and get faster results, you can use the Colab notebook with a GPU.

[](https://github.com/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)

"""

tutorial = """ # 🔰 Instructions for use.

1. Upload a video on the first tab or use a video link on the second tab.

2. Choose the language into which you want to translate the video.

3. Specify the number of people speaking in the video and assign each one a text-to-speech voice suitable for the translation language.

4. Press the 'Translate' button to obtain the results.

"""


if not os.path.exists('audio'):
    os.makedirs('audio')

if not os.path.exists('audio2/audio'):
    os.makedirs('audio2/audio')

# Check GPU
if torch.cuda.is_available():
    device = "cuda"
    list_compute_type = ['float16', 'float32']
    compute_type_default = 'float16'
    whisper_model_default = 'large-v1'
else:
    device = "cpu"
    list_compute_type = ['float32']
    compute_type_default = 'float32'
    whisper_model_default = 'base'
print('Working in: ', device)


# Download an audio
#url = "https://www.youtube.com/watch?v=Rdi-SNhe2v4"

### INIT
list_tts = ['af-ZA-AdriNeural-Female', 'af-ZA-WillemNeural-Male', 'am-ET-AmehaNeural-Male', 'am-ET-MekdesNeural-Female', 'ar-AE-FatimaNeural-Female', 'ar-AE-HamdanNeural-Male', 'ar-BH-AliNeural-Male', 'ar-BH-LailaNeural-Female', 'ar-DZ-AminaNeural-Female', 'ar-DZ-IsmaelNeural-Male', 'ar-EG-SalmaNeural-Female', 'ar-EG-ShakirNeural-Male', 'ar-IQ-BasselNeural-Male', 'ar-IQ-RanaNeural-Female', 'ar-JO-SanaNeural-Female', 'ar-JO-TaimNeural-Male', 'ar-KW-FahedNeural-Male', 'ar-KW-NouraNeural-Female', 'ar-LB-LaylaNeural-Female', 'ar-LB-RamiNeural-Male', 'ar-LY-ImanNeural-Female', 'ar-LY-OmarNeural-Male', 'ar-MA-JamalNeural-Male', 'ar-MA-MounaNeural-Female', 'ar-OM-AbdullahNeural-Male', 'ar-OM-AyshaNeural-Female', 'ar-QA-AmalNeural-Female', 'ar-QA-MoazNeural-Male', 'ar-SA-HamedNeural-Male', 'ar-SA-ZariyahNeural-Female', 'ar-SY-AmanyNeural-Female', 'ar-SY-LaithNeural-Male', 'ar-TN-HediNeural-Male', 'ar-TN-ReemNeural-Female', 'ar-YE-MaryamNeural-Female', 'ar-YE-SalehNeural-Male', 'az-AZ-BabekNeural-Male', 'az-AZ-BanuNeural-Female', 'bg-BG-BorislavNeural-Male', 'bg-BG-KalinaNeural-Female', 'bn-BD-NabanitaNeural-Female', 'bn-BD-PradeepNeural-Male', 'bn-IN-BashkarNeural-Male', 'bn-IN-TanishaaNeural-Female', 'bs-BA-GoranNeural-Male', 'bs-BA-VesnaNeural-Female', 'ca-ES-EnricNeural-Male', 'ca-ES-JoanaNeural-Female', 'cs-CZ-AntoninNeural-Male', 'cs-CZ-VlastaNeural-Female', 'cy-GB-AledNeural-Male', 'cy-GB-NiaNeural-Female', 'da-DK-ChristelNeural-Female', 'da-DK-JeppeNeural-Male', 'de-AT-IngridNeural-Female', 'de-AT-JonasNeural-Male', 'de-CH-JanNeural-Male', 'de-CH-LeniNeural-Female', 'de-DE-AmalaNeural-Female', 'de-DE-ConradNeural-Male', 'de-DE-KatjaNeural-Female', 'de-DE-KillianNeural-Male', 'el-GR-AthinaNeural-Female', 'el-GR-NestorasNeural-Male', 'en-AU-NatashaNeural-Female', 'en-AU-WilliamNeural-Male', 'en-CA-ClaraNeural-Female', 'en-CA-LiamNeural-Male', 'en-GB-LibbyNeural-Female', 'en-GB-MaisieNeural-Female', 'en-GB-RyanNeural-Male', 'en-GB-SoniaNeural-Female', 'en-GB-ThomasNeural-Male', 'en-HK-SamNeural-Male', 'en-HK-YanNeural-Female', 'en-IE-ConnorNeural-Male', 'en-IE-EmilyNeural-Female', 'en-IN-NeerjaExpressiveNeural-Female', 'en-IN-NeerjaNeural-Female', 'en-IN-PrabhatNeural-Male', 'en-KE-AsiliaNeural-Female', 'en-KE-ChilembaNeural-Male', 'en-NG-AbeoNeural-Male', 'en-NG-EzinneNeural-Female', 'en-NZ-MitchellNeural-Male', 'en-NZ-MollyNeural-Female', 'en-PH-JamesNeural-Male', 'en-PH-RosaNeural-Female', 'en-SG-LunaNeural-Female', 'en-SG-WayneNeural-Male', 'en-TZ-ElimuNeural-Male', 'en-TZ-ImaniNeural-Female', 'en-US-AnaNeural-Female', 'en-US-AriaNeural-Female', 'en-US-ChristopherNeural-Male', 'en-US-EricNeural-Male', 'en-US-GuyNeural-Male', 'en-US-JennyNeural-Female', 'en-US-MichelleNeural-Female', 'en-US-RogerNeural-Male', 'en-US-SteffanNeural-Male', 'en-ZA-LeahNeural-Female', 'en-ZA-LukeNeural-Male', 'es-AR-ElenaNeural-Female', 'es-AR-TomasNeural-Male', 'es-BO-MarceloNeural-Male', 'es-BO-SofiaNeural-Female', 'es-CL-CatalinaNeural-Female', 'es-CL-LorenzoNeural-Male', 'es-CO-GonzaloNeural-Male', 'es-CO-SalomeNeural-Female', 'es-CR-JuanNeural-Male', 'es-CR-MariaNeural-Female', 'es-CU-BelkysNeural-Female', 'es-CU-ManuelNeural-Male', 'es-DO-EmilioNeural-Male', 'es-DO-RamonaNeural-Female', 'es-EC-AndreaNeural-Female', 'es-EC-LuisNeural-Male', 'es-ES-AlvaroNeural-Male', 'es-ES-ElviraNeural-Female', 'es-GQ-JavierNeural-Male', 'es-GQ-TeresaNeural-Female', 'es-GT-AndresNeural-Male', 'es-GT-MartaNeural-Female', 'es-HN-CarlosNeural-Male', 'es-HN-KarlaNeural-Female', 'es-MX-DaliaNeural-Female', 
'es-MX-JorgeNeural-Male', 'es-NI-FedericoNeural-Male', 'es-NI-YolandaNeural-Female', 'es-PA-MargaritaNeural-Female', 'es-PA-RobertoNeural-Male', 'es-PE-AlexNeural-Male', 'es-PE-CamilaNeural-Female', 'es-PR-KarinaNeural-Female', 'es-PR-VictorNeural-Male', 'es-PY-MarioNeural-Male', 'es-PY-TaniaNeural-Female', 'es-SV-LorenaNeural-Female', 'es-SV-RodrigoNeural-Male', 'es-US-AlonsoNeural-Male', 'es-US-PalomaNeural-Female', 'es-UY-MateoNeural-Male', 'es-UY-ValentinaNeural-Female', 'es-VE-PaolaNeural-Female', 'es-VE-SebastianNeural-Male', 'et-EE-AnuNeural-Female', 'et-EE-KertNeural-Male', 'fa-IR-DilaraNeural-Female', 'fa-IR-FaridNeural-Male', 'fi-FI-HarriNeural-Male', 'fi-FI-NooraNeural-Female', 'fil-PH-AngeloNeural-Male', 'fil-PH-BlessicaNeural-Female', 'fr-BE-CharlineNeural-Female', 'fr-BE-GerardNeural-Male', 'fr-CA-AntoineNeural-Male', 'fr-CA-JeanNeural-Male', 'fr-CA-SylvieNeural-Female', 'fr-CH-ArianeNeural-Female', 'fr-CH-FabriceNeural-Male', 'fr-FR-DeniseNeural-Female', 'fr-FR-EloiseNeural-Female', 'fr-FR-HenriNeural-Male', 'ga-IE-ColmNeural-Male', 'ga-IE-OrlaNeural-Female', 'gl-ES-RoiNeural-Male', 'gl-ES-SabelaNeural-Female', 'gu-IN-DhwaniNeural-Female', 'gu-IN-NiranjanNeural-Male', 'he-IL-AvriNeural-Male', 'he-IL-HilaNeural-Female', 'hi-IN-MadhurNeural-Male', 'hi-IN-SwaraNeural-Female', 'hr-HR-GabrijelaNeural-Female', 'hr-HR-SreckoNeural-Male', 'hu-HU-NoemiNeural-Female', 'hu-HU-TamasNeural-Male', 'id-ID-ArdiNeural-Male', 'id-ID-GadisNeural-Female', 'is-IS-GudrunNeural-Female', 'is-IS-GunnarNeural-Male', 'it-IT-DiegoNeural-Male', 'it-IT-ElsaNeural-Female', 'it-IT-IsabellaNeural-Female', 'ja-JP-KeitaNeural-Male', 'ja-JP-NanamiNeural-Female', 'jv-ID-DimasNeural-Male', 'jv-ID-SitiNeural-Female', 'ka-GE-EkaNeural-Female', 'ka-GE-GiorgiNeural-Male', 'kk-KZ-AigulNeural-Female', 'kk-KZ-DauletNeural-Male', 'km-KH-PisethNeural-Male', 'km-KH-SreymomNeural-Female', 'kn-IN-GaganNeural-Male', 'kn-IN-SapnaNeural-Female', 'ko-KR-InJoonNeural-Male', 'ko-KR-SunHiNeural-Female', 'lo-LA-ChanthavongNeural-Male', 'lo-LA-KeomanyNeural-Female', 'lt-LT-LeonasNeural-Male', 'lt-LT-OnaNeural-Female', 'lv-LV-EveritaNeural-Female', 'lv-LV-NilsNeural-Male', 'mk-MK-AleksandarNeural-Male', 'mk-MK-MarijaNeural-Female', 'ml-IN-MidhunNeural-Male', 'ml-IN-SobhanaNeural-Female', 'mn-MN-BataaNeural-Male', 'mn-MN-YesuiNeural-Female', 'mr-IN-AarohiNeural-Female', 'mr-IN-ManoharNeural-Male', 'ms-MY-OsmanNeural-Male', 'ms-MY-YasminNeural-Female', 'mt-MT-GraceNeural-Female', 'mt-MT-JosephNeural-Male', 'my-MM-NilarNeural-Female', 'my-MM-ThihaNeural-Male', 'nb-NO-FinnNeural-Male', 'nb-NO-PernilleNeural-Female', 'ne-NP-HemkalaNeural-Female', 'ne-NP-SagarNeural-Male', 'nl-BE-ArnaudNeural-Male', 'nl-BE-DenaNeural-Female', 'nl-NL-ColetteNeural-Female', 'nl-NL-FennaNeural-Female', 'nl-NL-MaartenNeural-Male', 'pl-PL-MarekNeural-Male', 'pl-PL-ZofiaNeural-Female', 'ps-AF-GulNawazNeural-Male', 'ps-AF-LatifaNeural-Female', 'pt-BR-AntonioNeural-Male', 'pt-BR-FranciscaNeural-Female', 'pt-PT-DuarteNeural-Male', 'pt-PT-RaquelNeural-Female', 'ro-RO-AlinaNeural-Female', 'ro-RO-EmilNeural-Male', 'ru-RU-DmitryNeural-Male', 'ru-RU-SvetlanaNeural-Female', 'si-LK-SameeraNeural-Male', 'si-LK-ThiliniNeural-Female', 'sk-SK-LukasNeural-Male', 'sk-SK-ViktoriaNeural-Female', 'sl-SI-PetraNeural-Female', 'sl-SI-RokNeural-Male', 'so-SO-MuuseNeural-Male', 'so-SO-UbaxNeural-Female', 'sq-AL-AnilaNeural-Female', 'sq-AL-IlirNeural-Male', 'sr-RS-NicholasNeural-Male', 'sr-RS-SophieNeural-Female', 'su-ID-JajangNeural-Male', 'su-ID-TutiNeural-Female', 
'sv-SE-MattiasNeural-Male', 'sv-SE-SofieNeural-Female', 'sw-KE-RafikiNeural-Male', 'sw-KE-ZuriNeural-Female', 'sw-TZ-DaudiNeural-Male', 'sw-TZ-RehemaNeural-Female', 'ta-IN-PallaviNeural-Female', 'ta-IN-ValluvarNeural-Male', 'ta-LK-KumarNeural-Male', 'ta-LK-SaranyaNeural-Female', 'ta-MY-KaniNeural-Female', 'ta-MY-SuryaNeural-Male', 'ta-SG-AnbuNeural-Male', 'ta-SG-VenbaNeural-Female', 'te-IN-MohanNeural-Male', 'te-IN-ShrutiNeural-Female', 'th-TH-NiwatNeural-Male', 'th-TH-PremwadeeNeural-Female', 'tr-TR-AhmetNeural-Male', 'tr-TR-EmelNeural-Female', 'uk-UA-OstapNeural-Male', 'uk-UA-PolinaNeural-Female', 'ur-IN-GulNeural-Female', 'ur-IN-SalmanNeural-Male', 'ur-PK-AsadNeural-Male', 'ur-PK-UzmaNeural-Female', 'uz-UZ-MadinaNeural-Female', 'uz-UZ-SardorNeural-Male', 'vi-VN-HoaiMyNeural-Female', 'vi-VN-NamMinhNeural-Male', 'zh-CN-XiaoxiaoNeural-Female', 'zh-CN-XiaoyiNeural-Female', 'zh-CN-YunjianNeural-Male', 'zh-CN-YunxiNeural-Male', 'zh-CN-YunxiaNeural-Male', 'zh-CN-YunyangNeural-Male', 'zh-CN-liaoning-XiaobeiNeural-Female', 'zh-CN-shaanxi-XiaoniNeural-Female']
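
Each entry in list_tts follows a locale-VoiceName-Gender pattern, so, for example, the voices offered in the dropdowns further below could be narrowed to a single target language with a simple prefix filter; a small illustrative snippet:

# Illustrative only: pick out the Spanish-locale edge-tts voices from the list above.
es_voices = [v for v in list_tts if v.startswith("es-")]
print(len(es_voices), es_voices[:3])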


def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
                         TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
                         tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04, tts_voice05):

    YOUR_HF_TOKEN = os.getenv("My_hf_token")

    OutputFile = 'Video.mp4'
    audio_wav = "audio.wav"
    Output_name_file = "audio_dub_solo.wav"
    mix_audio = "audio_mix.mp3"
    video_output = "diar_output.mp4"

    os.system(f"rm {Output_name_file}")
    os.system("rm Video.mp4")
    #os.system("rm diar_output.mp4")
    os.system("rm audio.wav")

    if os.path.exists(video):
        if device == 'cpu':
            # limited duration on CPU
            print('10 s. Limited for CPU ')
            os.system(f"ffmpeg -y -i {video} -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4")
        else:
            os.system(f"ffmpeg -y -i {video} -c:v libx264 -c:a aac -strict experimental Video.mp4")

        os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
    else:
        if device == 'cpu':
            # limited duration on CPU
            print('10 s. Limited for CPU ')
            #https://github.com/yt-dlp/yt-dlp/issues/2220
            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 1 audio.wav"
        else:
            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'

        os.system(mp4_)
        os.system(wav_)

    print("Set file complete.")

    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model(
        WHISPER_MODEL_SIZE,
        device,
        compute_type=compute_type
        )
    audio = whisperx.load_audio(audio_wav)
    result = model.transcribe(audio, batch_size=batch_size)
    gc.collect(); torch.cuda.empty_cache(); del model
    print("Transcript complete")

    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device
        )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=True,
        )
    gc.collect(); torch.cuda.empty_cache(); del model_a
    print("Align complete")

    # 3. Assign speaker labels
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
    diarize_segments = diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers)
    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    gc.collect(); torch.cuda.empty_cache(); del diarize_model
    print("Diarize complete")

    result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
    print("Translation complete")

    audio_files = []

    # Mapping speakers to voice variables
    speaker_to_voice = {
        'SPEAKER_00': tts_voice00,
        'SPEAKER_01': tts_voice01,
        'SPEAKER_02': tts_voice02,
        'SPEAKER_03': tts_voice03,
        'SPEAKER_04': tts_voice04,
        'SPEAKER_05': tts_voice05
        }

    for segment in result_diarize['segments']:

        text = segment['text']
        start = segment['start']
        end = segment['end']

        try:
            speaker = segment['speaker']
        except KeyError:
            segment['speaker'] = "SPEAKER_99"
            speaker = segment['speaker']
            print("NO SPEAKER DETECT IN SEGMENT")

        # make the tts audio
        filename = f"audio/{start}.ogg"

        if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
            make_voice_gradio(text, speaker_to_voice[speaker], filename)
        elif speaker == "SPEAKER_99":
            try:
                tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Using GTTS')
            except:
                tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('ERROR AUDIO GTTS')

        # duration
        duration_true = end - start
        duration_tts = librosa.get_duration(filename=filename)

        # ratio of TTS duration to the original segment duration
        porcentaje = duration_tts / duration_true

        if porcentaje > 2.1:
            porcentaje = 2.1
        elif porcentaje <= 1.2 and porcentaje >= 0.8:
            porcentaje = 1.0
        elif porcentaje <= 0.79:
            porcentaje = 0.8

        # Smooth and round
        porcentaje = round(porcentaje+0.0, 1)

        # apply acceleration (or the opposite) to the audio file, writing it into the audio2 folder
        os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")

        duration_create = librosa.get_duration(filename=f"audio2/{filename}")
        audio_files.append(filename)

    # replace the files with the accelerated ones
    os.system("mv -f audio2/audio/*.ogg audio/")

    os.system(f"rm {Output_name_file}")

    create_translated_audio(result_diarize, audio_files, Output_name_file)

    os.system("rm audio_dub_stereo.wav")
    os.system("ffmpeg -i audio_dub_solo.wav -ac 1 audio_dub_stereo.wav")

    #os.system(f"ffmpeg -i Video.mp4 -i {Output_name_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")

    os.system(f"rm {mix_audio}")
    #os.system(f'''ffmpeg -i {audio_wav} -i audio_dub_stereo.wav -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}''')
    #os.system(f'ffmpeg -y -i {audio_wav} -i audio_dub_stereo.wav -filter_complex "[0:0][1:0] amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
    os.system(f'ffmpeg -y -i audio.wav -i audio_dub_stereo.wav -filter_complex "[0:0]volume=0.25[a];[1:0]volume=1.85[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')

    os.system(f"rm {video_output}")
    os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")

    return video_output

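To make the synchronization step inside the loop concrete: the code compares each TTS clip's length with its original segment and clamps the speed-up ratio before passing it to ffmpeg's atempo filter. A minimal standalone sketch of that clamping (the helper name tempo_factor is illustrative; the thresholds mirror the code above):

def tempo_factor(segment_duration, tts_duration):
    # Same clamping as translate_from_video: cap at 2.1x, snap near-1 ratios to 1.0,
    # and never go below 0.8x.
    ratio = tts_duration / segment_duration
    if ratio > 2.1:
        ratio = 2.1
    elif 0.8 <= ratio <= 1.2:
        ratio = 1.0
    elif ratio <= 0.79:
        ratio = 0.8
    return round(ratio, 1)

# Example: a 4.0 s segment whose dubbed audio lasts 6.0 s is played back at 1.5x.
print(tempo_factor(4.0, 6.0))  # 1.5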
import sys

class Logger:
    def __init__(self, filename):
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        return False

sys.stdout = Logger("output.log")

def read_logs():
    sys.stdout.flush()
    with open("output.log", "r") as f:
        return f.read()


with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown(tutorial)

    with gr.Tab("Translate audio from video"):
        with gr.Row():
            with gr.Column():
                video_input = gr.Video()  # height=300,width=300

                gr.Markdown("Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
                TRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en', label='Translate audio to')

                gr.Markdown("Select how many people are speaking in the video.")
                min_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1)
                max_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers", step=1)

                gr.Markdown("Select the voice you want for each speaker.")
                tts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label='TTS Speaker 1')
                tts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label='TTS Speaker 2')
                tts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label='TTS Speaker 3')
                tts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label='TTS Speaker 4')
                tts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label='TTS Speaker 5')
                tts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label='TTS Speaker 6')

                gr.Markdown("Default configuration of Whisper.")
                WHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
                batch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
                compute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")

            with gr.Column(variant='compact'):
                with gr.Row():
                    video_button = gr.Button("Translate audio of video", )
                with gr.Row():
                    video_output = gr.Video()

                gr.Examples(
                    examples=[
                        [
                            "./assets/Video_subtitled.mp4",
                            "base",
                            16,
                            "float32",
                            "en",
                            1,
                            2,
                            'en-AU-WilliamNeural-Male',
                            'en-CA-ClaraNeural-Female',
                            'en-GB-ThomasNeural-Male',
                            'en-GB-SoniaNeural-Female',
                            'en-NZ-MitchellNeural-Male',
                            'en-GB-MaisieNeural-Female',
                        ],
                    ],
                    fn=translate_from_video,
                    inputs=[
                        video_input,
                        WHISPER_MODEL_SIZE,
                        batch_size,
                        compute_type,
                        TRANSLATE_AUDIO_TO,
                        min_speakers,
                        max_speakers,
                        tts_voice00,
                        tts_voice01,
                        tts_voice02,
                        tts_voice03,
                        tts_voice04,
                        tts_voice05,
                    ],
                    outputs=[video_output],
                    #cache_examples=True,
                )


    with gr.Tab("Translate audio from video link"):
        with gr.Row():
            with gr.Column():

                link_input = gr.Textbox(label="Media link. Example: www.youtube.com/watch?v=g_9rPvbENUw", placeholder="URL goes here...")
                #filename = gr.Textbox(label="File name", placeholder="best-vid")

                gr.Markdown("Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
                bTRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en', label='Translate audio to')

                gr.Markdown("Select how many people are speaking in the video.")
                bmin_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1)
                bmax_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers", step=1)

                gr.Markdown("Select the voice you want for each speaker.")
                btts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label='TTS Speaker 1')
                btts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label='TTS Speaker 2')
                btts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label='TTS Speaker 3')
                btts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label='TTS Speaker 4')
                btts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label='TTS Speaker 5')
                btts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label='TTS Speaker 6')

                gr.Markdown("Default configuration of Whisper.")
                bWHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
                bbatch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
                bcompute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")

                # text_button = gr.Button("Translate audio of video")
                # link_output = gr.Video() #gr.outputs.File(label="Download!")


            with gr.Column(variant='compact'):
                with gr.Row():
                    text_button = gr.Button("Translate audio of video")
                with gr.Row():
                    link_output = gr.Video()  #gr.outputs.File(label="Download!") # gr.Video()

                gr.Examples(
                    examples=[
                        [
                            "https://www.youtube.com/watch?v=5ZeHtRKHl7Y",
                            "base",
                            16,
                            "float32",
                            "en",
                            1,
                            2,
                            'en-CA-ClaraNeural-Female',
                            'en-AU-WilliamNeural-Male',
                            'en-GB-ThomasNeural-Male',
                            'en-GB-SoniaNeural-Female',
                            'en-NZ-MitchellNeural-Male',
                            'en-GB-MaisieNeural-Female',
                        ],
                    ],
                    fn=translate_from_video,
                    inputs=[
                        link_input,
                        bWHISPER_MODEL_SIZE,
                        bbatch_size,
                        bcompute_type,
                        bTRANSLATE_AUDIO_TO,
                        bmin_speakers,
                        bmax_speakers,
                        btts_voice00,
                        btts_voice01,
                        btts_voice02,
                        btts_voice03,
                        btts_voice04,
                        btts_voice05,
                    ],
                    outputs=[video_output],
                    #cache_examples=True,
                )


    with gr.Accordion("Logs"):
        logs = gr.Textbox()
        demo.load(read_logs, None, logs, every=1)

    # run
    video_button.click(translate_from_video, inputs=[
        video_input,
        WHISPER_MODEL_SIZE,
        batch_size,
        compute_type,
        TRANSLATE_AUDIO_TO,
        min_speakers,
        max_speakers,
        tts_voice00,
        tts_voice01,
        tts_voice02,
        tts_voice03,
        tts_voice04,
        tts_voice05,], outputs=video_output)
    text_button.click(translate_from_video, inputs=[
        link_input,
        bWHISPER_MODEL_SIZE,
        bbatch_size,
        bcompute_type,
        bTRANSLATE_AUDIO_TO,
        bmin_speakers,
        bmax_speakers,
        btts_voice00,
        btts_voice01,
        btts_voice02,
        btts_voice03,
        btts_voice04,
        btts_voice05,], outputs=link_output)


demo.launch(enable_queue=True)
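
Since the click handlers above simply forward the UI values to translate_from_video, the pipeline can also be exercised without the Gradio interface. A minimal sketch, reusing the example values from gr.Examples above (the video path is illustrative, and diarization still expects the My_hf_token environment variable to be set):

result_path = translate_from_video(
    "./assets/Video_subtitled.mp4",   # local video file, or a media URL
    "base",                           # WHISPER_MODEL_SIZE
    16,                               # batch_size
    "float32",                        # compute_type
    "en",                             # TRANSLATE_AUDIO_TO
    1, 2,                             # min_speakers, max_speakers
    'en-AU-WilliamNeural-Male', 'en-CA-ClaraNeural-Female',
    'en-GB-ThomasNeural-Male', 'en-GB-SoniaNeural-Female',
    'en-NZ-MitchellNeural-Male', 'en-GB-MaisieNeural-Female',
)
print(result_path)  # "diar_output.mp4", the dubbed video written next to app.py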
requirements.txt
ADDED
@@ -0,0 +1,11 @@
torch
torchvision
git+https://github.com/m-bain/whisperx.git
yt-dlp
gTTS
pydub
edge_tts
deep_translator
torchaudio==2.0.0
gradio
nest_asyncio
soni_translate/audio_segments.py
ADDED
@@ -0,0 +1,27 @@
from pydub import AudioSegment
from tqdm import tqdm
import os

def create_translated_audio(result_diarize, audio_files, Output_name_file):
    total_duration = result_diarize['segments'][-1]['end']  # in seconds

    # silent audio with total_duration
    combined_audio = AudioSegment.silent(duration=int(total_duration * 1000))
    print(round((total_duration / 60), 2), 'minutes of video')

    for line, audio_file in tqdm(zip(result_diarize['segments'], audio_files)):
        start = float(line['start'])

        # Overlay each audio at the corresponding time
        try:
            audio = AudioSegment.from_file(audio_file)
            ###audio_a = audio.speedup(playback_speed=1.5)
            start_time = start * 1000  # to ms
            combined_audio = combined_audio.overlay(audio, position=start_time)
        except:
            print(f'ERROR AUDIO FILE {audio_file}')

    os.system("rm -rf audio/*")

    # combined audio as a file
    combined_audio.export(Output_name_file, format="wav")
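
create_translated_audio only needs segment start/end times in seconds and one rendered TTS file per segment, in the same order. A minimal sketch with made-up values, mirroring how app.py names the clips; note that the function also clears the audio/ folder when it finishes:

# Hypothetical input: two segments and their already-synthesized clips.
result_diarize = {"segments": [
    {"start": 0.0, "end": 3.2, "text": "Hello there."},
    {"start": 3.5, "end": 7.0, "text": "How are you?"},
]}
audio_files = ["audio/0.0.ogg", "audio/3.5.ogg"]
create_translated_audio(result_diarize, audio_files, "audio_dub_solo.wav")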
soni_translate/text_to_speech.py
ADDED
@@ -0,0 +1,30 @@
from gtts import gTTS
import edge_tts
import asyncio
import nest_asyncio

def make_voice(tts_text, tts_voice, filename):
    try:
        nest_asyncio.apply()
        asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
    except Exception:
        try:
            tts = gTTS(tts_text, lang=TRANSLATE_AUDIO_TO)
            tts.save(filename)
            print('USE GTTS')
        except Exception:
            tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
            tts.save(filename)
            print('REPLACE AUDIO GTTS')

def make_voice_gradio(tts_text, tts_voice, filename):
    print(tts_text, filename)
    try:
        asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
    except Exception:
        try:
            tts = gTTS(tts_text, lang=TRANSLATE_AUDIO_TO)
            tts.save(filename)
            print('USE GTTS')
        except Exception:
            tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
            tts.save(filename)
            print('REPLACE AUDIO GTTS')
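
The voice labels passed in from the UI end in -Male or -Female; both helpers strip that last field to recover the edge-tts voice id before synthesis. A minimal sketch (the output path is illustrative, and the call needs network access to the edge-tts service):

label = "en-GB-SoniaNeural-Female"
voice_id = "-".join(label.split('-')[:-1])   # -> "en-GB-SoniaNeural"
make_voice_gradio("Hello from SoniTranslate", label, "audio/example.ogg")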
soni_translate/translate_segments.py
ADDED
@@ -0,0 +1,10 @@
from tqdm import tqdm
from deep_translator import GoogleTranslator

def translate_text(segments, TRANSLATE_AUDIO_TO):
    for line in tqdm(range(len(segments))):
        text = segments[line]['text']
        translator = GoogleTranslator(source='auto', target=TRANSLATE_AUDIO_TO)
        translated_line = translator.translate(text.strip())
        segments[line]['text'] = translated_line
    return segments
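
translate_text rewrites each segment's text field in place through deep_translator's GoogleTranslator and returns the same list, so it slots directly into the whisperx output. A minimal sketch with a made-up segment:

segments = [{"start": 0.0, "end": 2.0, "text": "Hola, ¿cómo estás?"}]
segments = translate_text(segments, "en")
print(segments[0]["text"])  # e.g. "Hello, how are you?"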