Spaces:

Emmiq
/

EmmiSpace

Build error

SWivid committed on Jan 28

Commit

488d746

1 Parent(s): 572d786

0.4.5: fix the extremely short case where the length of text_seq exceeds that of audio_seq, causing a wrong cond_mask

Files changed (2) hide show

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "f5-tts"
-version = "0.4.4"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}

 [project]
 name = "f5-tts"
+version = "0.4.5"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}

src/f5_tts/model/cfm.py CHANGED Viewed

@@ -120,10 +120,6 @@ class CFM(nn.Module):
                 text = list_str_to_tensor(text).to(device)
             assert text.shape[0] == batch
-        if exists(text):
-            text_lens = (text != -1).sum(dim=-1)
-            lens = torch.maximum(text_lens, lens)  # make sure lengths are at least those of the text characters
         # duration
         cond_mask = lens_to_mask(lens)
@@ -133,7 +129,9 @@ class CFM(nn.Module):
         if isinstance(duration, int):
             duration = torch.full((batch,), duration, device=device, dtype=torch.long)
-        duration = torch.maximum(lens + 1, duration)  # just add one token so something is generated
         duration = duration.clamp(max=max_duration)
         max_duration = duration.amax()

                 text = list_str_to_tensor(text).to(device)
             assert text.shape[0] == batch
         # duration
         cond_mask = lens_to_mask(lens)
         if isinstance(duration, int):
             duration = torch.full((batch,), duration, device=device, dtype=torch.long)
+        duration = torch.maximum(
+            torch.maximum((text != -1).sum(dim=-1), lens) + 1, duration
+        )  # duration at least text/audio prompt length plus one token, so something is generated
         duration = duration.clamp(max=max_duration)
         max_duration = duration.amax()