Update app.py

app.py CHANGED
@@ -1,6 +1,7 @@
 import numpy as np
 import cvxpy as cp
 import re
+import copy
 import concurrent.futures
 import gradio as gr
 from datetime import datetime
@@ -556,7 +557,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
     try:
         segment_audio_path = f"segment_{i}_voiceover.wav"
         desired_duration = entry["end"] - entry["start"]
-        desired_speed = calibrated_speed(entry['translated'], desired_duration)
+        desired_speed = entry['speed'] # calibrated_speed(entry['translated'], desired_duration)
 
         speaker = entry.get("speaker", "default")
         speaker_wav_path = f"speaker_{speaker}_sample.wav"
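
The changed line assumes each segment now arrives with a precomputed pacing field instead of calling calibrated_speed per entry. A hypothetical segment shape, based on the fields apply_adaptive_speed attaches later in this commit (all values invented for illustration):

    # Hypothetical segment, showing the fields the new process_entry code
    # expects; "speed" and "target_duration" are attached upstream by
    # apply_adaptive_speed. Values are invented.
    entry = {
        "start": 3.2,
        "end": 6.0,
        "original": "original-language text",
        "translated": "translated text",
        "speaker": "default",
        "speed": 4.85,           # chars/sec, precomputed upstream
        "target_duration": 5.6,  # seconds, estimated for the TTS output
    }
    desired_duration = entry["end"] - entry["start"]  # 2.8 s
    desired_speed = entry["speed"]                    # no per-entry recomputation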
@@ -608,7 +609,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
-                   for i, entry in enumerate(translated_json)]
+                   for i, entry in enumerate(translated_json_withspeed)]
 
     results = []
     for future in concurrent.futures.as_completed(futures):
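
For context, as_completed yields futures in completion order, not submission order, which is why process_entry also receives the index i. A self-contained sketch of the pattern this hunk uses (stub names, not from app.py):

    # Sketch of the submit/as_completed pattern: results arrive out of
    # order, so each worker returns its index and the caller re-sorts.
    import concurrent.futures

    def process_entry_stub(entry, i):
        return i, entry.upper()

    entries = ["a", "b", "c"]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_entry_stub, e, i) for i, e in enumerate(entries)]
        results = [f.result() for f in concurrent.futures.as_completed(futures)]
    results.sort(key=lambda pair: pair[0])  # restore submission order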
@@ -715,6 +716,84 @@ def generate_voiceover_clone(full_text, tts_model, desired_speed, target_languag
         logger.error(traceback.format_exc())
         return None, err_msg, err_msg
 
+def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
+    """
+    Adds a `speed` (chars/sec) and `target_duration` (sec) field to each segment
+    using shrinkage-based estimation and language stretch ratios.
+    Optionally modulates based on tone or style tags (e.g., "dramatic", "urgent").
+    """
+    translated_json = copy.deepcopy(translated_json_raw)
+
+    # Prior average speech speeds by (category, target language)
+    priors = {
+        ("drama", "en"): 5.0,
+        ("drama", "zh"): 4.5,
+        ("tutorial", "en"): 5.2,
+        ("tutorial", "zh"): 4.8,
+        ("shortplay", "en"): 5.1,
+        ("shortplay", "zh"): 4.7,
+    }
+
+    # Adjustment ratio based on language pair (source → target)
+    lang_ratio = {
+        ("zh", "en"): 0.85,
+        ("en", "zh"): 1.15,
+        ("zh", "jp"): 1.05,
+        ("en", "ja"): 0.9,
+    }
+
+    # Optional style modulation factor
+    style_modifiers = {
+        "dramatic": 0.9, # slower
+        "urgent": 1.1, # faster
+        "neutral": 1.0
+    }
+
+    for idx, entry in enumerate(translated_json):
+        start, end = float(entry.get("start", 0)), float(entry.get("end", 0))
+        duration = max(0.1, end - start)
+
+        original_text = entry.get("original", "")
+        translated_text = entry.get("translated", "")
+        category = entry.get("category", "drama")
+        source_lang = source_language
+        target_lang = target_language
+        style = entry.get("style", "neutral") # Optional field like "dramatic"
+
+        # Observed speed from original
+        base_text = original_text or translated_text
+        obs_speed = len(base_text) / duration
+
+        # Prior speed from category + language
+        prior_speed = priors.get((category, target_lang), default_prior_speed)
+
+        # Shrinkage estimate
+        shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)
+
+        # Adjust for language-specific pacing
+        ratio = lang_ratio.get((source_lang, target_lang), 1.0)
+        adjusted_speed = shrink_speed * ratio
+
+        # Optional tone/style modulation (if available)
+        mod = style_modifiers.get(style.lower(), 1.0)
+        adjusted_speed *= mod
+
+        # Final estimated duration for synthesized segment
+        target_chars = len(translated_text)
+        target_duration = round(target_chars / adjusted_speed, 2)
+
+        # Logging for debugging
+        logger.info(
+            f"Segment {idx}: dur={duration:.2f}s, obs={obs_speed:.2f}, "
+            f"prior={prior_speed:.2f}, shrink={shrink_speed:.2f}, "
+            f"final_speed={adjusted_speed:.2f}, target_dur={target_duration:.2f}s"
+        )
+
+        entry["speed"] = round(adjusted_speed, 3)
+        entry["target_duration"] = target_duration
+
+    return translated_json
+
 def calibrated_speed(text, desired_duration):
     """
     Compute a speed factor to help TTS fit audio into desired duration,
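
The core of the new function is the shrinkage estimate, which blends each segment's observed pace with a category prior, weighted by segment duration: short segments lean on the prior, long ones on the observation. A minimal standalone sketch with invented numbers (k and the 5.0 chars/sec prior mirror the function's defaults):

    # Standalone sketch of the shrinkage step in apply_adaptive_speed.
    def shrink(duration, obs_speed, prior_speed, k=3.0):
        # Duration-weighted blend of observation and prior.
        return (duration * obs_speed + k * prior_speed) / (duration + k)

    print(shrink(1.5, 8.0, 5.0))   # 6.0 -> a noisy 1.5 s segment is pulled toward the prior
    print(shrink(12.0, 8.0, 5.0))  # 7.4 -> a 12 s segment mostly keeps its observed pace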
@@ -757,14 +836,15 @@ def upload_and_manage(file, target_language, process_mode):
 
     # Step 2: Translate the transcription
     logger.info(f"Translating transcription from {source_language} to {target_language}...")
-    translated_json = translate_text(transcription_json, source_language, target_language)
+    translated_json_raw = translate_text(transcription_json, source_language, target_language)
     logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
 
     # translated_json = post_edit_translated_segments(translated_json, file.name)
-
+    translated_json = apply_adaptive_speed(translated_json_raw, source_language, target_language)
+
     # Step 3: Add transcript to video based on timestamps
     logger.info("Adding translated transcript to video...")
-    add_transcript_voiceover(file.name, translated_json, output_video_path, process_mode, target_language)
+    add_transcript_voiceover(file.name, translated_json_speedcontrol, output_video_path, process_mode, target_language)
     logger.info(f"Transcript added to video. Output video saved at {output_video_path}")
 
     # Convert translated JSON into a format for the editable table
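
Taken together, the commit reroutes upload_and_manage through the new speed pass. A sketch of the intended call order follows; it assumes it runs alongside the apply_adaptive_speed added above, stubs stand in for app.py's real translate_text and add_transcript_voiceover (whose bodies are not shown in this diff), and all sample arguments are invented. Note that the + line above passes translated_json_speedcontrol, which is not defined in the shown code; the sketch assumes translated_json, the variable actually produced, is the intended argument.

    # Hypothetical wiring of the new speed pass, mirroring the diff's call order.
    def translate_text(segments, source_language, target_language):  # stub
        return [dict(seg, translated=seg["original"]) for seg in segments]

    def add_transcript_voiceover(video_path, segments, output_path, process_mode, target_language):  # stub
        print(f"{video_path} -> {output_path}: {len(segments)} segments, lang={target_language}")

    transcription_json = [
        {"start": 0.0, "end": 2.4, "original": "Hello and welcome.", "speaker": "default"},
    ]
    translated_json_raw = translate_text(transcription_json, "en", "zh")
    translated_json = apply_adaptive_speed(translated_json_raw, "en", "zh")  # adds "speed"/"target_duration"
    add_transcript_voiceover("input.mp4", translated_json, "output.mp4", "transcript_voiceover", "zh")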