SWivid committed
Commit 61ff2a6 · 1 Parent(s): 1085b73

formatting #363, credit to @JarodMica, also dur_pred check fork repo

src/f5_tts/infer/README.md CHANGED
@@ -122,7 +122,8 @@ To communicate with socket server you need to run
 python src/f5_tts/socket.py
 ```
 
-then create client to communicate
+<details>
+<summary>Then create client to communicate</summary>
 
 ``` python
 import socket
@@ -184,3 +185,5 @@ async def main():
 asyncio.run(main())
 ```
 
+</details>
+
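The collapsed block wraps the README's existing socket-client example. For orientation, a minimal client along those lines might look like the sketch below; the port number, the request framing, and the raw float32 stream format are assumptions inferred from the server's `struct.pack(f"{len(chunk)}f", *chunk)` calls in `src/f5_tts/socket.py`, not the README's verbatim example.

```python
# Hypothetical minimal client for the streaming socket server.
# Assumptions: server on localhost:9998, text sent as UTF-8, audio
# returned as a raw stream of float32 samples (4 bytes each).
import socket
import struct


def request_tts(text: str, host: str = "localhost", port: int = 9998) -> list[float]:
    samples: list[float] = []
    buf = b""
    with socket.create_connection((host, port)) as sock:
        sock.sendall(text.encode("utf-8"))
        sock.shutdown(socket.SHUT_WR)  # signal that the request is complete
        while True:
            data = sock.recv(4096)
            if not data:  # server closed the connection
                break
            buf += data
            n = len(buf) // 4  # only unpack whole float32 samples
            samples.extend(struct.unpack(f"{n}f", buf[: n * 4]))
            buf = buf[n * 4 :]
    return samples


if __name__ == "__main__":
    audio = request_tts("Hello from the F5-TTS socket client.")
    print(f"received {len(audio)} samples")
```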
src/f5_tts/socket.py CHANGED
@@ -22,7 +22,7 @@ class TTSStreamingProcessor:
             DiT,
             dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
             ckpt_file,
-            vocab_file
+            vocab_file,
         ).to(self.device, dtype=dtype)
 
         # Load the vocoder
@@ -59,14 +59,19 @@ class TTSStreamingProcessor:
 
         # Run inference for the input text
         audio_chunk, final_sample_rate, _ = infer_batch_process(
-            (audio, sr), ref_text, [text], self.model, self.vocoder, device=self.device # Pass vocoder here
+            (audio, sr),
+            ref_text,
+            [text],
+            self.model,
+            self.vocoder,
+            device=self.device,  # Pass vocoder here
         )
 
         # Break the generated audio into chunks and send them
         chunk_size = int(final_sample_rate * play_steps_in_s)
-
+
         for i in range(0, len(audio_chunk), chunk_size):
-            chunk = audio_chunk[i:i + chunk_size]
+            chunk = audio_chunk[i : i + chunk_size]
 
             # Check if it's the final chunk
             if i + chunk_size >= len(audio_chunk):
@@ -77,13 +82,13 @@ class TTSStreamingProcessor:
                 break
 
             # Pack and send the audio chunk
-            packed_audio = struct.pack(f'{len(chunk)}f', *chunk)
+            packed_audio = struct.pack(f"{len(chunk)}f", *chunk)
             yield packed_audio
 
         # Ensure that no final word is repeated by not resending partial chunks
         if len(audio_chunk) % chunk_size != 0:
-            remaining_chunk = audio_chunk[-(len(audio_chunk) % chunk_size):]
-            packed_audio = struct.pack(f'{len(remaining_chunk)}f', *remaining_chunk)
+            remaining_chunk = audio_chunk[-(len(audio_chunk) % chunk_size) :]
+            packed_audio = struct.pack(f"{len(remaining_chunk)}f", *remaining_chunk)
             yield packed_audio
 
 
@@ -134,9 +139,9 @@ def start_server(host, port, processor):
 if __name__ == "__main__":
     try:
         # Load the model and vocoder using the provided files
-        ckpt_file = "" # pointing your checkpoint "ckpts/model/model_1096.pt"
+        ckpt_file = ""  # pointing your checkpoint "ckpts/model/model_1096.pt"
         vocab_file = ""  # Add vocab file path if needed
-        ref_audio ="" # add ref audio"./tests/ref_audio/reference.wav"
+        ref_audio = ""  # add ref audio"./tests/ref_audio/reference.wav"
         ref_text = ""
 
         # Initialize the processor with the model and vocoder
@@ -145,7 +150,7 @@ if __name__ == "__main__":
             vocab_file=vocab_file,
             ref_audio=ref_audio,
             ref_text=ref_text,
-            dtype=torch.float32
+            dtype=torch.float32,
         )
 
         # Start the server
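Aside from trailing commas and quote style, the black-style reformatting leaves the streaming logic intact: the generated waveform is cut into chunks of `final_sample_rate * play_steps_in_s` samples, and each chunk is serialized as packed float32 values. A self-contained sketch of that chunk-and-pack round trip, with made-up sample-rate and step values rather than the project's defaults:

```python
# Illustrative round trip for the chunking scheme in TTSStreamingProcessor.
# The numbers below are example values, not the project's defaults.
import struct

final_sample_rate = 24000  # Hz
play_steps_in_s = 0.5  # seconds of audio per streamed chunk
chunk_size = int(final_sample_rate * play_steps_in_s)  # 12000 samples

audio_chunk = [0.0] * 30000  # stand-in for a generated waveform

packed = []
for i in range(0, len(audio_chunk), chunk_size):
    chunk = audio_chunk[i : i + chunk_size]
    # Serialize each sample as a 32-bit float, as the server does.
    packed.append(struct.pack(f"{len(chunk)}f", *chunk))

# A receiver inverts this with struct.unpack on each blob.
decoded = [s for blob in packed for s in struct.unpack(f"{len(blob) // 4}f", blob)]
assert decoded == audio_chunk
print(f"{len(packed)} chunks, {len(decoded)} samples round-tripped")
```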
src/f5_tts/train/finetune_gradio.py CHANGED
@@ -1372,7 +1372,7 @@ def get_audio_select(file_sample):
 with gr.Blocks() as app:
     gr.Markdown(
         """
-# E2/F5 TTS AUTOMATIC FINETUNE
+# E2/F5 TTS Automatic Finetune
 
 This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:
 
@@ -1381,35 +1381,35 @@ This is a local web UI for F5 TTS with advanced batch processing support. This a
 
 The checkpoints support English and Chinese.
 
-for tutorial and updates check here (https://github.com/SWivid/F5-TTS/discussions/143)
+For tutorial and updates check here (https://github.com/SWivid/F5-TTS/discussions/143)
 """
     )
 
     with gr.Row():
         projects, projects_selelect = get_list_projects()
-        tokenizer_type = gr.Radio(label="Tokenizer Type", choices=["pinyin", "char"], value="pinyin")
-        project_name = gr.Textbox(label="project name", value="my_speak")
-        bt_create = gr.Button("create new project")
+        tokenizer_type = gr.Radio(label="Tokenizer Type", choices=["pinyin", "char", "custom"], value="pinyin")
+        project_name = gr.Textbox(label="Project Name", value="my_speak")
+        bt_create = gr.Button("Create a New Project")
 
     with gr.Row():
         cm_project = gr.Dropdown(
             choices=projects, value=projects_selelect, label="Project", allow_custom_value=True, scale=6
         )
-        ch_refresh_project = gr.Button("refresh", scale=1)
+        ch_refresh_project = gr.Button("Refresh", scale=1)
 
     bt_create.click(fn=create_data_project, inputs=[project_name, tokenizer_type], outputs=[cm_project])
 
     with gr.Tabs():
-        with gr.TabItem("transcribe Data"):
+        with gr.TabItem("Transcribe Data"):
             gr.Markdown("""```plaintext
 Skip this step if you have your dataset, metadata.csv, and a folder wavs with all the audio files.
 ```""")
 
-            ch_manual = gr.Checkbox(label="audio from path", value=False)
+            ch_manual = gr.Checkbox(label="Audio from Path", value=False)
 
             mark_info_transcribe = gr.Markdown(
                 """```plaintext
-Place your 'wavs' folder and 'metadata.csv' file in the {your_project_name}' directory.
+Place your 'wavs' folder and 'metadata.csv' file in the '{your_project_name}' directory.
 
 my_speak/
 │
@@ -1421,10 +1421,10 @@ Skip this step if you have your dataset, metadata.csv, and a folder wavs with al
                 visible=False,
             )
 
-            audio_speaker = gr.File(label="voice", type="filepath", file_count="multiple")
-            txt_lang = gr.Text(label="Language", value="english")
-            bt_transcribe = bt_create = gr.Button("transcribe")
-            txt_info_transcribe = gr.Text(label="info", value="")
+            audio_speaker = gr.File(label="Voice", type="filepath", file_count="multiple")
+            txt_lang = gr.Text(label="Language", value="English")
+            bt_transcribe = bt_create = gr.Button("Transcribe")
+            txt_info_transcribe = gr.Text(label="Info", value="")
             bt_transcribe.click(
                 fn=transcribe_all,
                 inputs=[cm_project, audio_speaker, txt_lang, ch_manual],
@@ -1432,7 +1432,7 @@ Skip this step if you have your dataset, metadata.csv, and a folder wavs with al
             )
             ch_manual.change(fn=check_user, inputs=[ch_manual], outputs=[audio_speaker, mark_info_transcribe])
 
-            random_sample_transcribe = gr.Button("random sample")
+            random_sample_transcribe = gr.Button("Random Sample")
 
             with gr.Row():
                 random_text_transcribe = gr.Text(label="Text")
@@ -1444,16 +1444,16 @@ Skip this step if you have your dataset, metadata.csv, and a folder wavs with al
                 outputs=[random_text_transcribe, random_audio_transcribe],
             )
 
-        with gr.TabItem("vocab check"):
+        with gr.TabItem("Vocab Check"):
             gr.Markdown("""```plaintext
-check the vocabulary for fine-tuning Emilia_ZH_EN to ensure all symbols are included. for finetune new language
+Check the vocabulary for fine-tuning Emilia_ZH_EN to ensure all symbols are included. For fine-tuning a new language.
 ```""")
 
-            check_button = gr.Button("check vocab")
-            txt_info_check = gr.Text(label="info", value="")
+            check_button = gr.Button("Check Vocab")
+            txt_info_check = gr.Text(label="Info", value="")
 
             gr.Markdown("""```plaintext
-Using the extended model, you can fine-tune to a new language that is missing symbols in the vocab , this create a new model with a new vocabulary size and save it in your ckpts/project folder.
+Using the extended model, you can finetune to a new language that is missing symbols in the vocab. This creates a new model with a new vocabulary size and saves it in your ckpts/project folder.
 ```""")
 
             exp_name_extend = gr.Radio(label="Model", choices=["F5-TTS", "E2-TTS"], value="F5-TTS")
@@ -1465,10 +1465,10 @@ Using the extended model, you can fine-tune to a new language that is missing sy
                 placeholder="To add new symbols, make sure to use ',' for each symbol",
                 scale=6,
             )
-            txt_count_symbol = gr.Textbox(label="new size vocab", value="", scale=1)
+            txt_count_symbol = gr.Textbox(label="New Vocab Size", value="", scale=1)
 
-            extend_button = gr.Button("Extended")
-            txt_info_extend = gr.Text(label="info", value="")
+            extend_button = gr.Button("Extend")
+            txt_info_extend = gr.Text(label="Info", value="")
 
             txt_extend.change(vocab_count, inputs=[txt_extend], outputs=[txt_count_symbol])
             check_button.click(fn=vocab_check, inputs=[cm_project], outputs=[txt_info_check, txt_extend])
@@ -1476,18 +1476,18 @@ Using the extended model, you can fine-tune to a new language that is missing sy
                 fn=vocab_extend, inputs=[cm_project, txt_extend, exp_name_extend], outputs=[txt_info_extend]
             )
 
-        with gr.TabItem("prepare Data"):
+        with gr.TabItem("Prepare Data"):
             gr.Markdown("""```plaintext
-Skip this step if you have your dataset, raw.arrow , duraction.json and vocab.txt
+Skip this step if you have your dataset, raw.arrow, duration.json, and vocab.txt
 ```""")
 
             gr.Markdown(
                 """```plaintext
-place all your wavs folder and your metadata.csv file in {your name project}
+Place all your "wavs" folder and your "metadata.csv" file in your project name directory.
 
-suport format for audio "wav", "mp3", "aac", "flac", "m4a", "alac", "ogg", "aiff", "wma", "amr"
+Supported audio formats: "wav", "mp3", "aac", "flac", "m4a", "alac", "ogg", "aiff", "wma", "amr"
 
-example wav format
+Example wav format:
 my_speak/
 │
 ├── wavs/
@@ -1497,24 +1497,24 @@ Skip this step if you have your dataset, raw.arrow , duraction.json and vocab.tx
 │
 └── metadata.csv
 
-file format metadata.csv
+File format metadata.csv:
 
 audio1|text1 or audio1.wav|text1 or your_path/audio1.wav|text1
-audio2|text1 or audio2.wav|text1 or your_path/audio1.wav|text1
+audio2|text1 or audio2.wav|text1 or your_path/audio2.wav|text1
 ...
 
 ```"""
             )
-            ch_tokenizern = gr.Checkbox(label="create vocabulary", value=False, visible=False)
-            bt_prepare = bt_create = gr.Button("prepare")
-            txt_info_prepare = gr.Text(label="info", value="")
-            txt_vocab_prepare = gr.Text(label="vocab", value="")
+            ch_tokenizern = gr.Checkbox(label="Create Vocabulary", value=False, visible=False)
+            bt_prepare = bt_create = gr.Button("Prepare")
+            txt_info_prepare = gr.Text(label="Info", value="")
+            txt_vocab_prepare = gr.Text(label="Vocab", value="")
 
             bt_prepare.click(
                 fn=create_metadata, inputs=[cm_project, ch_tokenizern], outputs=[txt_info_prepare, txt_vocab_prepare]
             )
 
-            random_sample_prepare = gr.Button("random sample")
+            random_sample_prepare = gr.Button("Random Sample")
 
             with gr.Row():
                 random_text_prepare = gr.Text(label="Tokenizer")
@@ -1524,20 +1524,20 @@ Skip this step if you have your dataset, raw.arrow , duraction.json and vocab.tx
                 fn=get_random_sample_prepare, inputs=[cm_project], outputs=[random_text_prepare, random_audio_prepare]
             )
 
-        with gr.TabItem("train Data"):
+        with gr.TabItem("Train Data"):
             gr.Markdown("""```plaintext
-The auto-setting is still experimental. Please make sure that the epochs , save per updates , and last per steps are set correctly, or change them manually as needed.
+The auto-setting is still experimental. Please make sure that the epochs, save per updates, and last per steps are set correctly, or change them manually as needed.
 If you encounter a memory error, try reducing the batch size per GPU to a smaller number.
 ```""")
             with gr.Row():
                 bt_calculate = bt_create = gr.Button("Auto Settings")
-                lb_samples = gr.Label(label="samples")
+                lb_samples = gr.Label(label="Samples")
                 batch_size_type = gr.Radio(label="Batch Size Type", choices=["frame", "sample"], value="frame")
 
             with gr.Row():
-                ch_finetune = bt_create = gr.Checkbox(label="finetune", value=True)
+                ch_finetune = bt_create = gr.Checkbox(label="Finetune", value=True)
                 tokenizer_file = gr.Textbox(label="Tokenizer File", value="")
-                file_checkpoint_train = gr.Textbox(label="Path to the preetrain checkpoint ", value="")
+                file_checkpoint_train = gr.Textbox(label="Path to the Pretrained Checkpoint", value="")
 
             with gr.Row():
                 exp_name = gr.Radio(label="Model", choices=["F5TTS_Base", "E2TTS_Base"], value="F5TTS_Base")
@@ -1603,8 +1603,8 @@ If you encounter a memory error, try reducing the batch size per GPU to a smalle
                 mixed_precision.value = mixed_precisionv
                 cd_logger.value = cd_loggerv
 
-            ch_stream = gr.Checkbox(label="stream output experiment.", value=True)
-            txt_info_train = gr.Text(label="info", value="")
+            ch_stream = gr.Checkbox(label="Stream Output Experiment", value=True)
+            txt_info_train = gr.Text(label="Info", value="")
 
             list_audios, select_audio = get_audio_project(projects_selelect, False)
 
@@ -1619,18 +1619,18 @@ If you encounter a memory error, try reducing the batch size per GPU to a smalle
                 ch_list_audio = gr.Dropdown(
                     choices=list_audios,
                     value=select_audio,
-                    label="audios",
+                    label="Audios",
                     allow_custom_value=True,
                     scale=6,
                     interactive=True,
                 )
-                bt_stream_audio = gr.Button("refresh", scale=1)
+                bt_stream_audio = gr.Button("Refresh", scale=1)
             bt_stream_audio.click(fn=get_audio_project, inputs=[cm_project], outputs=[ch_list_audio])
             cm_project.change(fn=get_audio_project, inputs=[cm_project], outputs=[ch_list_audio])
 
             with gr.Row():
-                audio_ref_stream = gr.Audio(label="original", type="filepath", value=select_audio_ref)
-                audio_gen_stream = gr.Audio(label="generate", type="filepath", value=select_audio_gen)
+                audio_ref_stream = gr.Audio(label="Original", type="filepath", value=select_audio_ref)
+                audio_gen_stream = gr.Audio(label="Generate", type="filepath", value=select_audio_gen)
 
             ch_list_audio.change(
                 fn=get_audio_select,
@@ -1730,36 +1730,36 @@ If you encounter a memory error, try reducing the batch size per GPU to a smalle
                 outputs=outputs,
             )
 
-        with gr.TabItem("test model"):
+        with gr.TabItem("Test Model"):
             gr.Markdown("""```plaintext
-SOS : check the use_ema setting (True or False) for your model to see what works best for you.
+SOS: Check the use_ema setting (True or False) for your model to see what works best for you.
 ```""")
             exp_name = gr.Radio(label="Model", choices=["F5-TTS", "E2-TTS"], value="F5-TTS")
             list_checkpoints, checkpoint_select = get_checkpoints_project(projects_selelect, False)
 
-            nfe_step = gr.Number(label="n_step", value=32)
-            ch_use_ema = gr.Checkbox(label="use ema", value=True)
+            nfe_step = gr.Number(label="NFE Step", value=32)
+            ch_use_ema = gr.Checkbox(label="Use EMA", value=True)
             with gr.Row():
                 cm_checkpoint = gr.Dropdown(
-                    choices=list_checkpoints, value=checkpoint_select, label="checkpoints", allow_custom_value=True
+                    choices=list_checkpoints, value=checkpoint_select, label="Checkpoints", allow_custom_value=True
                 )
-                bt_checkpoint_refresh = gr.Button("refresh")
+                bt_checkpoint_refresh = gr.Button("Refresh")
 
-            random_sample_infer = gr.Button("random sample")
+            random_sample_infer = gr.Button("Random Sample")
 
-            ref_text = gr.Textbox(label="ref text")
-            ref_audio = gr.Audio(label="audio ref", type="filepath")
-            gen_text = gr.Textbox(label="gen text")
+            ref_text = gr.Textbox(label="Ref Text")
+            ref_audio = gr.Audio(label="Audio Ref", type="filepath")
+            gen_text = gr.Textbox(label="Gen Text")
 
             random_sample_infer.click(
                 fn=get_random_sample_infer, inputs=[cm_project], outputs=[ref_text, gen_text, ref_audio]
            )
 
             with gr.Row():
-                txt_info_gpu = gr.Textbox("", label="device")
-                check_button_infer = gr.Button("infer")
+                txt_info_gpu = gr.Textbox("", label="Device")
+                check_button_infer = gr.Button("Infer")
 
-            gen_audio = gr.Audio(label="audio gen", type="filepath")
+            gen_audio = gr.Audio(label="Audio Gen", type="filepath")
 
             check_button_infer.click(
                 fn=infer,
@@ -1770,22 +1770,22 @@ SOS : check the use_ema setting (True or False) for your model to see what works
             bt_checkpoint_refresh.click(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
             cm_project.change(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
 
-        with gr.TabItem("reduse checkpoint"):
+        with gr.TabItem("Reduce Checkpoint"):
             gr.Markdown("""```plaintext
-Reduce the model size from 5GB to 1.3GB. The new checkpoint can be used for inference or fine-tuning afterward, but it cannot be used to continue training..
+Reduce the model size from 5GB to 1.3GB. The new checkpoint can be used for inference or fine-tuning afterward, but it cannot be used to continue training.
 ```""")
-            txt_path_checkpoint = gr.Text(label="path checkpoint :")
-            txt_path_checkpoint_small = gr.Text(label="path output :")
-            ch_safetensors = gr.Checkbox(label="safetensors", value="")
-            txt_info_reduse = gr.Text(label="info", value="")
-            reduse_button = gr.Button("reduse")
+            txt_path_checkpoint = gr.Text(label="Path to Checkpoint:")
+            txt_path_checkpoint_small = gr.Text(label="Path to Output:")
+            ch_safetensors = gr.Checkbox(label="Safetensors", value="")
+            txt_info_reduse = gr.Text(label="Info", value="")
+            reduse_button = gr.Button("Reduce")
             reduse_button.click(
                 fn=extract_and_save_ema_model,
                 inputs=[txt_path_checkpoint, txt_path_checkpoint_small, ch_safetensors],
                 outputs=[txt_info_reduse],
             )
 
-        with gr.TabItem("system info"):
+        with gr.TabItem("System Info"):
             output_box = gr.Textbox(label="GPU and CPU Information", lines=20)
 
             def update_stats():
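The Prepare Data tab text above documents the `metadata.csv` convention: one `audio|text` pair per line, where the audio field may be a bare name, a filename, or a path. As a hedged illustration of that convention (the helper below and its `.wav`-defaulting rule are assumptions for this sketch, not the app's actual `create_metadata` logic):

```python
# Illustrative parser for the pipe-separated metadata.csv convention
# documented in the Prepare Data tab; not the app's create_metadata code.
from pathlib import Path


def parse_metadata(csv_path: str, wavs_dir: str = "wavs") -> list[tuple[Path, str]]:
    pairs = []
    for line_no, line in enumerate(Path(csv_path).read_text(encoding="utf-8").splitlines(), 1):
        if not line.strip():
            continue  # skip blank lines
        try:
            audio, text = line.split("|", 1)
        except ValueError:
            raise ValueError(f"line {line_no}: expected 'audio|text', got {line!r}")
        audio = audio.strip()
        if "/" not in audio and "\\" not in audio:  # bare name or filename
            if not audio.endswith(".wav"):
                audio += ".wav"  # assumption: bare names default to .wav
            audio = str(Path(wavs_dir) / audio)
        pairs.append((Path(audio), text.strip()))
    return pairs


# Example: parse_metadata("my_speak/metadata.csv", wavs_dir="my_speak/wavs")
```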