Update app.py
app.py CHANGED
@@ -409,11 +409,11 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_height
         logger.error(f"❌ Failed to create subtitle clip: {e}")
         return None

-
 def solve_optimal_alignment(original_segments, generated_durations, total_duration):
     """
     Aligns speech segments using quadratic programming. If optimization fails,
     applies greedy fallback: center shorter segments, stretch longer ones.
+    Logs alignment results for traceability.
     """
     N = len(original_segments)
     d = np.array(generated_durations)
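The docstring names the technique without showing its shape. As a rough sketch of the kind of quadratic program this implies (assuming cvxpy, and inferring a midpoint-matching objective from the m[i] values logged in the next hunk; align_sketch and its arguments are illustrative, not the Space's actual code):

import numpy as np
import cvxpy as cp

def align_sketch(midpoints, durations, total_duration):
    # Keep each segment's midpoint close to its original position,
    # subject to segment ordering and the overall timeline bounds.
    N = len(durations)
    d = np.array(durations)
    m = np.array(midpoints)
    s = cp.Variable(N)  # optimized start times
    objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))
    constraints = [s[0] >= 0, s[N - 1] + d[N - 1] <= total_duration]
    constraints += [s[i + 1] >= s[i] + d[i] for i in range(N - 1)]
    cp.Problem(objective, constraints).solve()
    return s.value

# Example: three 2-second segments placed on a 10-second timeline.
print(align_sketch([1.0, 4.0, 8.5], [2.0, 2.0, 2.0], 10.0))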
@@ -437,9 +437,13 @@ def solve_optimal_alignment(original_segments, generated_durations, total_duration):
         for i in range(N):
             original_segments[i]['start'] = round(s.value[i], 3)
             original_segments[i]['end'] = round(s.value[i] + d[i], 3)
+            logger.info(
+                f"[OPT] Segment {i}: duration={d[i]:.2f}s | start={original_segments[i]['start']:.2f}s | "
+                f"end={original_segments[i]['end']:.2f}s | mid={m[i]:.2f}s"
+            )

     except Exception as e:
-
+        logger.warning(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")

         for i in range(N):
             orig_start = original_segments[i]['start']
@@ -456,12 +460,10 @@ def solve_optimal_alignment(original_segments, generated_durations, total_duration):
             new_start = orig_start - extra
             new_end = orig_end + extra

-            # Prevent overlap with previous
             if i > 0:
                 prev_end = original_segments[i - 1]['end']
                 new_start = max(new_start, prev_end + 0.01)

-            # Prevent overlap with next
             if i < N - 1:
                 next_start = original_segments[i + 1]['start']
                 new_end = min(new_end, next_start - 0.01)
@@ -473,6 +475,11 @@ def solve_optimal_alignment(original_segments, generated_durations, total_duration):
             original_segments[i]['start'] = round(new_start, 3)
             original_segments[i]['end'] = round(new_end, 3)

+            logger.info(
+                f"[FALLBACK] Segment {i}: duration={gen_duration:.2f}s | start={new_start:.2f}s | "
+                f"end={new_end:.2f}s | original_mid={orig_mid:.2f}s"
+            )
+
     return original_segments

 def get_frame_image_bytes(video, t):
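Taken together, the fallback path in the last three hunks is a short greedy pass: center each generated duration on its original segment, then clamp against the neighbors with a 10 ms gap. A self-contained sketch (the function name and example data are made up; the arithmetic mirrors the diff):

def greedy_fallback_sketch(segments, gen_durations):
    for i, gen in enumerate(gen_durations):
        orig_start, orig_end = segments[i]['start'], segments[i]['end']
        extra = (gen - (orig_end - orig_start)) / 2  # center the new duration
        new_start, new_end = orig_start - extra, orig_end + extra
        if i > 0:  # keep a 10 ms gap after the previous segment
            new_start = max(new_start, segments[i - 1]['end'] + 0.01)
        if i < len(segments) - 1:  # and before the next one
            new_end = min(new_end, segments[i + 1]['start'] - 0.01)
        segments[i]['start'], segments[i]['end'] = round(new_start, 3), round(new_end, 3)
    return segments

print(greedy_fallback_sketch(
    [{'start': 0.0, 'end': 2.0}, {'start': 2.5, 'end': 4.0}], [2.4, 1.2]))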
@@ -706,7 +713,10 @@ def generate_voiceover_clone(full_text, tts_model, desired_speed, target_language
             speed=desired_speed,
             split_sentences=True
         )
-        msg =
+        msg = (
+            f"✅ Voice cloning completed successfully. "
+            f"[Speaker Wav: {speaker_wav_path}] [Speed: {desired_speed}]"
+        )
         logger.info(msg)
         return output_audio_path, msg, None

@@ -718,9 +728,9 @@ def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):

 def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
     """
-    Adds
-    using shrinkage-based estimation
-
+    Adds `speed` (relative, 1.0 = normal speed) and `target_duration` (sec) to each segment
+    using shrinkage-based estimation, language stretch ratios, and optional style modifiers.
+    Speeds are clamped to [0.85, 1.7] to avoid unnatural TTS behavior.
     """
     translated_json = copy.deepcopy(translated_json_raw)

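The shrinkage estimate described here is a precision-weighted blend: a segment of duration d with observed speed obs and prior speed p gets (d * obs + k * p) / (d + k), so short segments lean on the prior while long segments trust their own measurement. A worked example with invented numbers and the default k=3.0:

def shrink(duration, obs_speed, prior_speed, k=3.0):
    return (duration * obs_speed + k * prior_speed) / (duration + k)

print(shrink(1.0, 8.0, 5.0))   # 1 s clip:  (8 + 15) / 4  = 5.75, pulled toward the prior
print(shrink(12.0, 8.0, 5.0))  # 12 s clip: (96 + 15) / 15 = 7.4, close to the observation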
@@ -744,8 +754,8 @@ def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):

     # Optional style modulation factor
     style_modifiers = {
-        "dramatic": 0.9,
-        "urgent": 1.1,
+        "dramatic": 0.9,
+        "urgent": 1.1,
         "neutral": 1.0
     }

@@ -758,38 +768,45 @@ def apply_adaptive_speed(translated_json_raw, source_language, target_language, k=3.0, default_prior_speed=5.0):
         category = entry.get("category", "drama")
         source_lang = source_language
         target_lang = target_language
-        style = entry.get("style", "neutral")
+        style = entry.get("style", "neutral").lower()

         # Observed speed from original
         base_text = original_text or translated_text
         obs_speed = len(base_text) / duration

-        # Prior speed
+        # Prior speed
         prior_speed = priors.get((category, target_lang), default_prior_speed)

-        # Shrinkage
+        # Shrinkage
         shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)

-        #
+        # Language pacing adjustment
         ratio = lang_ratio.get((source_lang, target_lang), 1.0)
         adjusted_speed = shrink_speed * ratio

-        #
-        mod = style_modifiers.get(style
+        # Style modulation
+        mod = style_modifiers.get(style, 1.0)
         adjusted_speed *= mod

-        # Final
+        # Final relative speed (normalized to prior)
+        relative_speed = adjusted_speed / prior_speed
+
+        # Clamp relative speed to [0.85, 1.7]
+        relative_speed = max(0.85, min(1.7, relative_speed))
+
+        # Compute target duration for synthesis
         target_chars = len(translated_text)
-        target_duration = round(target_chars /
+        target_duration = round(target_chars / (prior_speed * relative_speed), 2)

-        # Logging
+        # Logging
         logger.info(
-            f"Segment {idx}
-            f"
-            f"
+            f"[Segment {idx}] dur={duration:.2f}s | obs_speed={obs_speed:.2f} | prior={prior_speed:.2f} | "
+            f"shrinked={shrink_speed:.2f} | lang_ratio={ratio} | style_mod={mod} | "
+            f"adj_speed={adjusted_speed:.2f} | rel_speed={relative_speed:.2f} | "
+            f"target_dur={target_duration:.2f}s"
         )

-        entry["speed"] = round(
+        entry["speed"] = round(relative_speed, 3)
         entry["target_duration"] = target_duration

     return translated_json
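Read end to end, this hunk turns the shrunk speed into a relative TTS speed: scale by a language stretch ratio and a style modifier, normalize by the prior, clamp to [0.85, 1.7], and derive a target duration from the translated character count. A self-contained sketch of that arithmetic (the prior, ratio, and modifier values are invented; only the formulas mirror the diff):

def segment_speed_sketch(text, duration, prior_speed=5.0, lang_ratio=1.1,
                         style_mod=0.9, k=3.0):
    obs_speed = len(text) / duration
    shrink_speed = (duration * obs_speed + k * prior_speed) / (duration + k)
    adjusted = shrink_speed * lang_ratio * style_mod
    relative = max(0.85, min(1.7, adjusted / prior_speed))  # clamp to [0.85, 1.7]
    target_duration = round(len(text) / (prior_speed * relative), 2)
    return round(relative, 3), target_duration

print(segment_speed_sketch("Hola, ¿cómo estás hoy?", 2.0))  # -> (1.465, 3.0)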