SWivid committed
Commit 23a1101 · Parent: a435039

make a structure first
src/f5_tts/api.py CHANGED
@@ -1,15 +1,14 @@
+import random
+import sys
+import tqdm
+
 import soundfile as sf
 import torch
-import tqdm
 from cached_path import cached_path
 
 from f5_tts.model import DiT, UNetT
-from f5_tts.model.utils import save_spectrogram
-
+from f5_tts.model.utils import seed_everything, save_spectrogram
 from f5_tts.model.utils_infer import load_vocoder, load_model, infer_process, remove_silence_for_generated_wav
-from f5_tts.model.utils import seed_everything
-import random
-import sys
 
 
 class F5TTS:
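
Taken together, the regrouped imports sketch how the F5TTS wrapper is meant to be driven: seed_everything pins the RNGs, load_model/load_vocoder pull weights (cached_path resolves Hub checkpoints), infer_process synthesizes, and soundfile writes the result. A rough usage sketch; the constructor defaults and the infer(...) signature below are assumptions for illustration, not the API as committed:

    import soundfile as sf

    from f5_tts.api import F5TTS
    from f5_tts.model.utils import seed_everything

    seed_everything(0)  # pin random/numpy/torch RNGs for a reproducible sample (assumed scope)

    f5tts = F5TTS()  # assumed: loads the default DiT checkpoint plus vocoder
    wav, sr, spect = f5tts.infer(  # assumed signature, for illustration only
        ref_file="tests/ref_audio/test_en_1_ref_short.wav",
        ref_text="Some call me nature, others call me mother nature.",
        gen_text="A quick smoke test of the restructured package.",
    )
    sf.write("tests/api_out.wav", wav, sr)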
src/f5_tts/data/Emilia_ZH_EN_pinyin/vocab.txt DELETED
@@ -1,2545 +0,0 @@
[2,545 deleted lines elided: the full tokenizer vocabulary, one token per line, beginning with what appears to be a bare space token, then ASCII punctuation and digits, Latin letters, pinyin syllables with tone numbers (a1 ... zuo4), extended Latin, IPA symbols, Greek, Cyrillic, Hebrew and Arabic characters with diacritics, and a long run of CJK characters (mis-encoded in this capture) ending with 𠮶.]
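
This vocab file is what get_tokenizer() elsewhere in the repo turns into a character-to-index map. A minimal sketch of the assumed format (one token per line, index equal to line number), not the repo's actual loader:

    def load_vocab(path: str) -> dict[str, int]:
        """Map each token (one per line) to its line index."""
        with open(path, encoding="utf-8") as f:
            # rstrip only the newline so a bare-space token survives
            return {line.rstrip("\n"): i for i, line in enumerate(f)}

    vocab_char_map = load_vocab("vocab.txt")
    vocab_size = len(vocab_char_map)  # 2545 entries for this file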
src/f5_tts/data/inference-cli.toml DELETED
@@ -1,10 +0,0 @@
-# F5-TTS | E2-TTS
-model = "F5-TTS"
-ref_audio = "tests/ref_audio/test_en_1_ref_short.wav"
-# If an empty "", transcribes the reference audio automatically.
-ref_text = "Some call me nature, others call me mother nature."
-gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."
-# File with text to generate. Ignores the text above.
-gen_file = ""
-remove_silence = false
-output_dir = "tests"
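
The deleted file is plain TOML, so any CLI can reload an equivalent config in a few lines. A sketch of the assumed consumption pattern (the gen_file-overrides-gen_text merge is a guess, not infer_cli.py's actual logic):

    import tomllib  # Python 3.11+; use the tomli package on older interpreters

    with open("inference-cli.toml", "rb") as f:
        cfg = tomllib.load(f)

    gen_text = cfg["gen_text"]
    if cfg.get("gen_file"):  # assumed: a non-empty gen_file overrides the inline text
        with open(cfg["gen_file"], encoding="utf-8") as f:
            gen_text = f.read()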
src/f5_tts/{scripts → eval}/eval_infer_batch.py RENAMED
File without changes
src/f5_tts/{scripts → eval}/eval_infer_batch.sh RENAMED
File without changes
src/f5_tts/{scripts → eval}/eval_librispeech_test_clean.py RENAMED
File without changes
src/f5_tts/{scripts → eval}/eval_seedtts_testset.py RENAMED
File without changes
src/f5_tts/{data → eval/eval_testset}/librispeech_pc_test_clean_cross_sentence.lst RENAMED
File without changes
src/f5_tts/{inference_cli.py → infer/infer_cli.py} RENAMED
@@ -1,7 +1,7 @@
 import argparse
 import codecs
-import re
 import os
+import re
 from pathlib import Path
 from importlib.resources import files
 
src/f5_tts/{gradio_app.py → infer/infer_gradio.py} RENAMED
File without changes
src/f5_tts/{speech_edit.py → infer/speech_edit.py} RENAMED
File without changes
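
In sum, the renames give the package the layout the commit message promises: evaluation scripts and test lists move under eval/, the inference CLI, Gradio demo, and speech editing move under infer/, and (below) the finetuning entry points move under train/, leaving scripts/ for standalone utilities.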
src/f5_tts/scripts/count_params_gflops.py CHANGED
@@ -3,7 +3,7 @@ import os
 
 sys.path.append(os.getcwd())
 
-from f5_tts.model import M2_TTS, DiT
+from f5_tts.model import CFM, DiT
 
 import torch
 import thop
@@ -24,7 +24,7 @@ import thop
 transformer = DiT(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
 
 
-model = M2_TTS(transformer=transformer)
+model = CFM(transformer=transformer)
 target_sample_rate = 24000
 n_mel_channels = 100
 hop_length = 256
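
This hunk is purely a rename: the wrapper once exported as M2_TTS is now CFM (conditional flow matching). For context, thop profiles a forward pass to count multiply-accumulates and parameters; a minimal sketch on a stand-in module, since wiring up CFM's real mel/text inputs is beyond this diff:

    import torch
    import thop

    net = torch.nn.Linear(100, 100)   # stand-in; the real script profiles the CFM-wrapped DiT
    x = torch.randn(1, 100)

    macs, params = thop.profile(net, inputs=(x,))
    print(f"GMACs: {macs / 1e9:.4f}  params (M): {params / 1e6:.4f}")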
src/f5_tts/{finetune_cli.py → train/finetune_cli.py} RENAMED
@@ -1,128 +1,128 @@
[the viewer renders this rename as full removal plus identical re-addition; the file appears once below, with indentation restored]

import argparse
import os
import shutil

from cached_path import cached_path
from f5_tts.model import CFM, UNetT, DiT, Trainer
from f5_tts.model.utils import get_tokenizer
from f5_tts.model.dataset import load_dataset

# -------------------------- Dataset Settings --------------------------- #
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256


# -------------------------- Argument Parsing --------------------------- #
def parse_args():
    parser = argparse.ArgumentParser(description="Train CFM Model")

    parser.add_argument(
        "--exp_name", type=str, default="F5TTS_Base", choices=["F5TTS_Base", "E2TTS_Base"], help="Experiment name"
    )
    parser.add_argument("--dataset_name", type=str, default="Emilia_ZH_EN", help="Name of the dataset to use")
    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate for training")
    parser.add_argument("--batch_size_per_gpu", type=int, default=256, help="Batch size per GPU")
    parser.add_argument(
        "--batch_size_type", type=str, default="frame", choices=["frame", "sample"], help="Batch size type"
    )
    parser.add_argument("--max_samples", type=int, default=16, help="Max sequences per batch")
    parser.add_argument("--grad_accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
    parser.add_argument("--max_grad_norm", type=float, default=1.0, help="Max gradient norm for clipping")
    parser.add_argument("--epochs", type=int, default=10, help="Number of training epochs")
    parser.add_argument("--num_warmup_updates", type=int, default=5, help="Warmup steps")
    parser.add_argument("--save_per_updates", type=int, default=10, help="Save checkpoint every X steps")
    parser.add_argument("--last_per_steps", type=int, default=10, help="Save last checkpoint every X steps")
    parser.add_argument("--finetune", type=bool, default=True, help="Use Finetune")

    parser.add_argument(
        "--tokenizer", type=str, default="pinyin", choices=["pinyin", "char", "custom"], help="Tokenizer type"
    )
    parser.add_argument(
        "--tokenizer_path",
        type=str,
        default=None,
        help="Path to custom tokenizer vocab file (only used if tokenizer = 'custom')",
    )

    return parser.parse_args()


# -------------------------- Training Settings -------------------------- #


def main():
    args = parse_args()

    # Model parameters based on experiment name
    if args.exp_name == "F5TTS_Base":
        wandb_resume_id = None
        model_cls = DiT
        model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
        if args.finetune:
            ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.pt"))
    elif args.exp_name == "E2TTS_Base":
        wandb_resume_id = None
        model_cls = UNetT
        model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
        if args.finetune:
            ckpt_path = str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.pt"))

    if args.finetune:
        path_ckpt = os.path.join("ckpts", args.dataset_name)
        if not os.path.isdir(path_ckpt):
            os.makedirs(path_ckpt, exist_ok=True)
            shutil.copy2(ckpt_path, os.path.join(path_ckpt, os.path.basename(ckpt_path)))

    checkpoint_path = os.path.join("ckpts", args.dataset_name)

    # Use the tokenizer and tokenizer_path provided in the command line arguments
    tokenizer = args.tokenizer
    if tokenizer == "custom":
        if not args.tokenizer_path:
            raise ValueError("Custom tokenizer selected, but no tokenizer_path provided.")
        tokenizer_path = args.tokenizer_path
    else:
        tokenizer_path = args.dataset_name

    vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)

    mel_spec_kwargs = dict(
        target_sample_rate=target_sample_rate,
        n_mel_channels=n_mel_channels,
        hop_length=hop_length,
    )

    e2tts = CFM(
        transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
        mel_spec_kwargs=mel_spec_kwargs,
        vocab_char_map=vocab_char_map,
    )

    trainer = Trainer(
        e2tts,
        args.epochs,
        args.learning_rate,
        num_warmup_updates=args.num_warmup_updates,
        save_per_updates=args.save_per_updates,
        checkpoint_path=checkpoint_path,
        batch_size=args.batch_size_per_gpu,
        batch_size_type=args.batch_size_type,
        max_samples=args.max_samples,
        grad_accumulation_steps=args.grad_accumulation_steps,
        max_grad_norm=args.max_grad_norm,
        wandb_project="CFM-TTS",
        wandb_run_name=args.exp_name,
        wandb_resume_id=wandb_resume_id,
        last_per_steps=args.last_per_steps,
    )

    train_dataset = load_dataset(args.dataset_name, tokenizer, mel_spec_kwargs=mel_spec_kwargs)
    trainer.train(
        train_dataset,
        resumable_with_seed=666,  # seed for shuffling dataset
    )


if __name__ == "__main__":
    main()
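
For context, this is the script that start_training() in finetune_gradio.py (below) launches through accelerate. A typical invocation, assuming the post-refactor path; the flags simply echo the argparse defaults above:

    accelerate launch src/f5_tts/train/finetune_cli.py \
        --exp_name F5TTS_Base \
        --dataset_name Emilia_ZH_EN \
        --learning_rate 1e-4 \
        --batch_size_per_gpu 256 \
        --finetune True

One caveat worth knowing: argparse's type=bool converts any non-empty string to True, so --finetune False would still enable finetuning; omit the flag or rely on the default instead.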
src/f5_tts/{finetune_gradio.py → train/finetune_gradio.py} RENAMED
@@ -1,944 +1,944 @@
[the viewer shows this rename as removal plus re-addition of the identical 944-line file; the content appears once below with indentation restored, truncated where the capture ends]

import os
import sys

import tempfile
import random
from transformers import pipeline
import gradio as gr
import torch
import gc
import click
import torchaudio
from glob import glob
import librosa
import numpy as np
from scipy.io import wavfile
import shutil
import time

import json
from f5_tts.model.utils import convert_char_to_pinyin
import signal
import psutil
import platform
import subprocess
from datasets.arrow_writer import ArrowWriter
from datasets import Dataset as Dataset_
from f5_tts.api import F5TTS


training_process = None
system = platform.system()
python_executable = sys.executable or "python"
tts_api = None
last_checkpoint = ""
last_device = ""

path_data = "data"

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

pipe = None


# Load metadata
def get_audio_duration(audio_path):
    """Calculate the duration of an audio file."""
    audio, sample_rate = torchaudio.load(audio_path)
    num_channels = audio.shape[0]
    return audio.shape[1] / (sample_rate * num_channels)


def clear_text(text):
    """Clean and prepare text by lowering the case and stripping whitespace."""
    return text.lower().strip()


def get_rms(
    y,
    frame_length=2048,
    hop_length=512,
    pad_mode="constant",
):  # https://github.com/RVC-Boss/GPT-SoVITS/blob/main/tools/slicer2.py
    padding = (int(frame_length // 2), int(frame_length // 2))
    y = np.pad(y, padding, mode=pad_mode)

    axis = -1
    # put our new within-frame axis at the end for now
    out_strides = y.strides + tuple([y.strides[axis]])
    # Reduce the shape on the framing axis
    x_shape_trimmed = list(y.shape)
    x_shape_trimmed[axis] -= frame_length - 1
    out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
    xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
    if axis < 0:
        target_axis = axis - 1
    else:
        target_axis = axis + 1
    xw = np.moveaxis(xw, -1, target_axis)
    # Downsample along the target axis
    slices = [slice(None)] * xw.ndim
    slices[axis] = slice(0, None, hop_length)
    x = xw[tuple(slices)]

    # Calculate power
    power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)

    return np.sqrt(power)


class Slicer:  # https://github.com/RVC-Boss/GPT-SoVITS/blob/main/tools/slicer2.py
    def __init__(
        self,
        sr: int,
        threshold: float = -40.0,
        min_length: int = 2000,
        min_interval: int = 300,
        hop_size: int = 20,
        max_sil_kept: int = 2000,
    ):
        if not min_length >= min_interval >= hop_size:
            raise ValueError("The following condition must be satisfied: min_length >= min_interval >= hop_size")
        if not max_sil_kept >= hop_size:
            raise ValueError("The following condition must be satisfied: max_sil_kept >= hop_size")
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        if len(waveform.shape) > 1:
            return waveform[:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)]
        else:
            return waveform[begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)]

    # @timeit
    def slice(self, waveform):
        if len(waveform.shape) > 1:
            samples = waveform.mean(axis=0)
        else:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            return [waveform]
        rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start : i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if silence_start is not None and total_frames - silence_start >= self.min_interval:
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        #### audio + start time + end time
        if len(sil_tags) == 0:
            return [[waveform, 0, int(total_frames * self.hop_size)]]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)])
            for i in range(len(sil_tags) - 1):
                chunks.append(
                    [
                        self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),
                        int(sil_tags[i][1] * self.hop_size),
                        int(sil_tags[i + 1][0] * self.hop_size),
                    ]
                )
            if sil_tags[-1][1] < total_frames:
                chunks.append(
                    [
                        self._apply_slice(waveform, sil_tags[-1][1], total_frames),
                        int(sil_tags[-1][1] * self.hop_size),
                        int(total_frames * self.hop_size),
                    ]
                )
            return chunks

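The Slicer above is adapted from GPT-SoVITS's slicer2.py and is what transcribe_all() below uses to cut long recordings at silences. A minimal standalone sketch of the call pattern, assuming an input long enough to be split (shorter inputs return the bare waveform rather than [chunk, start, end] triples):

    import librosa

    # Load at the project's 24 kHz mono convention.
    audio, _ = librosa.load("long_recording.wav", sr=24000, mono=True)

    slicer = Slicer(sr=24000, threshold=-40.0, min_length=2000)  # min_length in ms
    for chunk, start, end in slicer.slice(audio):
        print(f"kept {len(chunk)} samples spanning [{start}, {end}]")
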
207
- # terminal
208
- def terminate_process_tree(pid, including_parent=True):
209
- try:
210
- parent = psutil.Process(pid)
211
- except psutil.NoSuchProcess:
212
- # Process already terminated
213
- return
214
-
215
- children = parent.children(recursive=True)
216
- for child in children:
217
- try:
218
- os.kill(child.pid, signal.SIGTERM) # or signal.SIGKILL
219
- except OSError:
220
- pass
221
- if including_parent:
222
- try:
223
- os.kill(parent.pid, signal.SIGTERM) # or signal.SIGKILL
224
- except OSError:
225
- pass
226
-
227
-
228
- def terminate_process(pid):
229
- if system == "Windows":
230
- cmd = f"taskkill /t /f /pid {pid}"
231
- os.system(cmd)
232
- else:
233
- terminate_process_tree(pid)
234
-
235
-
236
- def start_training(
237
- dataset_name="",
238
- exp_name="F5TTS_Base",
239
- learning_rate=1e-4,
240
- batch_size_per_gpu=400,
241
- batch_size_type="frame",
242
- max_samples=64,
243
- grad_accumulation_steps=1,
244
- max_grad_norm=1.0,
245
- epochs=11,
246
- num_warmup_updates=200,
247
- save_per_updates=400,
248
- last_per_steps=800,
249
- finetune=True,
250
- ):
251
- global training_process, tts_api
252
-
253
- if tts_api is not None:
254
- del tts_api
255
- gc.collect()
256
- torch.cuda.empty_cache()
257
- tts_api = None
258
-
259
- path_project = os.path.join(path_data, dataset_name + "_pinyin")
260
-
261
- if not os.path.isdir(path_project):
262
- yield (
263
- f"There is not project with name {dataset_name}",
264
- gr.update(interactive=True),
265
- gr.update(interactive=False),
266
- )
267
- return
268
-
269
- file_raw = os.path.join(path_project, "raw.arrow")
270
- if not os.path.isfile(file_raw):
271
- yield f"There is no file {file_raw}", gr.update(interactive=True), gr.update(interactive=False)
272
- return
273
-
274
- # Check if a training process is already running
275
- if training_process is not None:
276
- return "Train run already!", gr.update(interactive=False), gr.update(interactive=True)
277
-
278
- yield "start train", gr.update(interactive=False), gr.update(interactive=False)
279
-
280
- # Command to run the training script with the specified arguments
281
- cmd = (
282
- f"accelerate launch finetune-cli.py --exp_name {exp_name} "
283
- f"--learning_rate {learning_rate} "
284
- f"--batch_size_per_gpu {batch_size_per_gpu} "
285
- f"--batch_size_type {batch_size_type} "
286
- f"--max_samples {max_samples} "
287
- f"--grad_accumulation_steps {grad_accumulation_steps} "
288
- f"--max_grad_norm {max_grad_norm} "
289
- f"--epochs {epochs} "
290
- f"--num_warmup_updates {num_warmup_updates} "
291
- f"--save_per_updates {save_per_updates} "
292
- f"--last_per_steps {last_per_steps} "
293
- f"--dataset_name {dataset_name}"
294
- )
295
- if finetune:
296
- cmd += f" --finetune {finetune}"
297
-
298
- print(cmd)
299
-
300
- try:
301
- # Start the training process
302
- training_process = subprocess.Popen(cmd, shell=True)
303
-
304
- time.sleep(5)
305
- yield "train start", gr.update(interactive=False), gr.update(interactive=True)
306
-
307
- # Wait for the training process to finish
308
- training_process.wait()
309
- time.sleep(1)
310
-
311
- if training_process is None:
312
- text_info = "train stop"
313
- else:
314
- text_info = "train complete !"
315
-
316
- except Exception as e: # Catch all exceptions
317
- # Ensure that we reset the training process variable in case of an error
318
- text_info = f"An error occurred: {str(e)}"
319
-
320
- training_process = None
321
-
322
- yield text_info, gr.update(interactive=True), gr.update(interactive=False)
323
-
324
-
325
- def stop_training():
326
- global training_process
327
- if training_process is None:
328
- return "Train not run !", gr.update(interactive=True), gr.update(interactive=False)
329
- terminate_process_tree(training_process.pid)
330
- training_process = None
331
- return "train stop", gr.update(interactive=True), gr.update(interactive=False)
332
-
333
-
334
- def create_data_project(name):
335
- name += "_pinyin"
336
- os.makedirs(os.path.join(path_data, name), exist_ok=True)
337
- os.makedirs(os.path.join(path_data, name, "dataset"), exist_ok=True)
338
-
339
-
340
- def transcribe(file_audio, language="english"):
341
- global pipe
342
-
343
- if pipe is None:
344
- pipe = pipeline(
345
- "automatic-speech-recognition",
346
- model="openai/whisper-large-v3-turbo",
347
- torch_dtype=torch.float16,
348
- device=device,
349
- )
350
-
351
- text_transcribe = pipe(
352
- file_audio,
353
- chunk_length_s=30,
354
- batch_size=128,
355
- generate_kwargs={"task": "transcribe", "language": language},
356
- return_timestamps=False,
357
- )["text"].strip()
358
- return text_transcribe
359
-
360
-
361
- def transcribe_all(name_project, audio_files, language, user=False, progress=gr.Progress()):
362
- name_project += "_pinyin"
363
- path_project = os.path.join(path_data, name_project)
364
- path_dataset = os.path.join(path_project, "dataset")
365
- path_project_wavs = os.path.join(path_project, "wavs")
366
- file_metadata = os.path.join(path_project, "metadata.csv")
367
-
368
- if audio_files is None:
369
- return "You need to load an audio file."
370
-
371
- if os.path.isdir(path_project_wavs):
372
- shutil.rmtree(path_project_wavs)
373
-
374
- if os.path.isfile(file_metadata):
375
- os.remove(file_metadata)
376
-
377
- os.makedirs(path_project_wavs, exist_ok=True)
378
-
379
- if user:
380
- file_audios = [
381
- file
382
- for format in ("*.wav", "*.ogg", "*.opus", "*.mp3", "*.flac")
383
- for file in glob(os.path.join(path_dataset, format))
384
- ]
385
- if file_audios == []:
386
- return "No audio file was found in the dataset."
387
- else:
388
- file_audios = audio_files
389
-
390
- alpha = 0.5
391
- _max = 1.0
392
- slicer = Slicer(24000)
393
-
394
- num = 0
395
- error_num = 0
396
- data = ""
397
- for file_audio in progress.tqdm(file_audios, desc="transcribe files", total=len((file_audios))):
398
- audio, _ = librosa.load(file_audio, sr=24000, mono=True)
399
-
400
- list_slicer = slicer.slice(audio)
401
- for chunk, start, end in progress.tqdm(list_slicer, total=len(list_slicer), desc="slicer files"):
402
- name_segment = f"segment_{num}"
403
- file_segment = os.path.join(path_project_wavs, f"{name_segment}.wav")
404
-
405
- tmp_max = np.abs(chunk).max()
406
- if tmp_max > 1:
407
- chunk /= tmp_max
408
- chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
409
- wavfile.write(file_segment, 24000, (chunk * 32767).astype(np.int16))
410
-
411
- try:
412
- text = transcribe(file_segment, language)
413
- text = text.lower().strip().replace('"', "")
414
-
415
- data += f"{name_segment}|{text}\n"
416
-
417
- num += 1
418
- except: # noqa: E722
419
- error_num += 1
420
-
421
- with open(file_metadata, "w", encoding="utf-8") as f:
422
- f.write(data)
423
-
424
- if error_num != 0:
425
- error_text = f"\nerror files : {error_num}"
426
- else:
427
- error_text = ""
428
-
429
- return f"transcribe complete samples : {num}\npath : {path_project_wavs}{error_text}"
430
-
431
-
432
- def format_seconds_to_hms(seconds):
433
- hours = int(seconds / 3600)
434
- minutes = int((seconds % 3600) / 60)
435
- seconds = seconds % 60
436
- return "{:02d}:{:02d}:{:02d}".format(hours, minutes, int(seconds))
437
-
438
-
439
- def create_metadata(name_project, progress=gr.Progress()):
440
- name_project += "_pinyin"
441
- path_project = os.path.join(path_data, name_project)
442
- path_project_wavs = os.path.join(path_project, "wavs")
443
- file_metadata = os.path.join(path_project, "metadata.csv")
444
- file_raw = os.path.join(path_project, "raw.arrow")
445
- file_duration = os.path.join(path_project, "duration.json")
446
- file_vocab = os.path.join(path_project, "vocab.txt")
447
-
448
- if not os.path.isfile(file_metadata):
449
- return "The file was not found in " + file_metadata
450
-
451
- with open(file_metadata, "r", encoding="utf-8") as f:
452
- data = f.read()
453
-
454
- audio_path_list = []
455
- text_list = []
456
- duration_list = []
457
-
458
- count = len(data.split("\n"))
459
- length = 0
460
- result = []
461
- error_files = []
462
- for line in progress.tqdm(data.split("\n"), total=count):
463
- sp_line = line.split("|")
464
- if len(sp_line) != 2:
465
- continue
466
- name_audio, text = sp_line[:2]
467
-
468
- file_audio = os.path.join(path_project_wavs, name_audio + ".wav")
469
-
470
- if not os.path.isfile(file_audio):
471
- error_files.append(file_audio)
472
- continue
473
-
474
- duration = get_audio_duration(file_audio)
475
- if duraction < 2 and duraction > 15:
476
- continue
477
- if len(text) < 4:
478
- continue
479
-
480
- text = clear_text(text)
481
- text = convert_char_to_pinyin([text], polyphone=True)[0]
482
-
483
- audio_path_list.append(file_audio)
484
- duration_list.append(duration)
485
- text_list.append(text)
486
-
487
- result.append({"audio_path": file_audio, "text": text, "duration": duraction})
488
-
489
- length += duration
490
-
491
- if duration_list == []:
492
- error_files_text = "\n".join(error_files)
493
- return f"Error: No audio files found in the specified path : \n{error_files_text}"
494
-
495
- min_second = round(min(duration_list), 2)
496
- max_second = round(max(duration_list), 2)
497
-
498
- with ArrowWriter(path=file_raw, writer_batch_size=1) as writer:
499
- for line in progress.tqdm(result, total=len(result), desc="prepare data"):
500
- writer.write(line)
501
-
502
- with open(file_duration, "w", encoding="utf-8") as f:
503
- json.dump({"duration": duration_list}, f, ensure_ascii=False)
504
-
505
- file_vocab_finetune = "data/Emilia_ZH_EN_pinyin/vocab.txt"
506
- if not os.path.isfile(file_vocab_finetune):
507
- return "Error: Vocabulary file 'Emilia_ZH_EN_pinyin' not found!"
508
- shutil.copy2(file_vocab_finetune, file_vocab)
509
-
510
- if error_files != []:
511
- error_text = "error files\n" + "\n".join(error_files)
512
- else:
513
- error_text = ""
514
-
515
- return f"prepare complete \nsamples : {len(text_list)}\ntime data : {format_seconds_to_hms(lenght)}\nmin sec : {min_second}\nmax sec : {max_second}\nfile_arrow : {file_raw}\n{error_text}"
516
-
517
-
518
- def check_user(value):
519
- return gr.update(visible=not value), gr.update(visible=value)
520
-
521
-
522
- def calculate_train(
523
- name_project,
524
- batch_size_type,
525
- max_samples,
526
- learning_rate,
527
- num_warmup_updates,
528
- save_per_updates,
529
- last_per_steps,
530
- finetune,
531
- ):
532
- name_project += "_pinyin"
533
- path_project = os.path.join(path_data, name_project)
534
- file_duration = os.path.join(path_project, "duration.json")
535
-
536
- if not os.path.isfile(file_duration):
537
- return (
538
- 1000,
539
- max_samples,
540
- num_warmup_updates,
541
- save_per_updates,
542
- last_per_steps,
543
- "project not found !",
544
- learning_rate,
545
- )
546
-
547
- with open(file_duration, "r") as file:
548
- data = json.load(file)
549
-
550
- duration_list = data["duration"]
551
-
552
- samples = len(duration_list)
553
-
554
- if torch.cuda.is_available():
555
- gpu_properties = torch.cuda.get_device_properties(0)
556
- total_memory = gpu_properties.total_memory / (1024**3)
557
- elif torch.backends.mps.is_available():
558
- total_memory = psutil.virtual_memory().available / (1024**3)
- else:
- # CPU-only fallback so total_memory is always defined below
- total_memory = psutil.virtual_memory().available / (1024**3)
559
-
560
- if batch_size_type == "frame":
561
- batch = int(total_memory * 0.5)
562
- batch = (lambda num: num + 1 if num % 2 != 0 else num)(batch)
563
- batch_size_per_gpu = int(38400 / batch)
564
- else:
565
- batch_size_per_gpu = int(total_memory / 8)
566
- batch_size_per_gpu = (lambda num: num + 1 if num % 2 != 0 else num)(batch_size_per_gpu)
567
- batch = batch_size_per_gpu
568
-
569
- if batch_size_per_gpu <= 0:
570
- batch_size_per_gpu = 1
571
-
572
- if samples < 64:
573
- max_samples = int(samples * 0.25)
574
- else:
575
- max_samples = 64
576
-
577
- num_warmup_updates = int(samples * 0.05)
578
- save_per_updates = int(samples * 0.10)
579
- last_per_steps = int(save_per_updates * 5)
580
-
581
- max_samples = (lambda num: num + 1 if num % 2 != 0 else num)(max_samples)
582
- num_warmup_updates = (lambda num: num + 1 if num % 2 != 0 else num)(num_warmup_updates)
583
- save_per_updates = (lambda num: num + 1 if num % 2 != 0 else num)(save_per_updates)
584
- last_per_steps = (lambda num: num + 1 if num % 2 != 0 else num)(last_per_steps)
585
-
586
- if finetune:
587
- learning_rate = 1e-5
588
- else:
589
- learning_rate = 7.5e-5
590
-
591
- return batch_size_per_gpu, max_samples, num_warmup_updates, save_per_updates, last_per_steps, samples, learning_rate
592
-
593
-
594
- def extract_and_save_ema_model(checkpoint_path: str, new_checkpoint_path: str) -> str:
595
- try:
596
- checkpoint = torch.load(checkpoint_path)
597
- print("Original Checkpoint Keys:", checkpoint.keys())
598
-
599
- ema_model_state_dict = checkpoint.get("ema_model_state_dict", None)
600
-
601
- if ema_model_state_dict is not None:
602
- new_checkpoint = {"ema_model_state_dict": ema_model_state_dict}
603
- torch.save(new_checkpoint, new_checkpoint_path)
604
- return f"New checkpoint saved at: {new_checkpoint_path}"
605
- else:
606
- return "No 'ema_model_state_dict' found in the checkpoint."
607
-
608
- except Exception as e:
609
- return f"An error occurred: {e}"
610
-
611
-
612
- def vocab_check(project_name):
613
- name_project = project_name + "_pinyin"
614
- path_project = os.path.join(path_data, name_project)
615
-
616
- file_metadata = os.path.join(path_project, "metadata.csv")
617
-
618
- file_vocab = "data/Emilia_ZH_EN_pinyin/vocab.txt"
619
- if not os.path.isfile(file_vocab):
620
- return f"the file {file_vocab} not found !"
621
-
622
- with open(file_vocab, "r", encoding="utf-8") as f:
623
- data = f.read()
624
-
625
- vocab = data.split("\n")
626
-
627
- if not os.path.isfile(file_metadata):
628
- return f"the file {file_metadata} not found !"
629
-
630
- with open(file_metadata, "r", encoding="utf-8") as f:
631
- data = f.read()
632
-
633
- miss_symbols = []
634
- miss_symbols_keep = {}
635
- for item in data.split("\n"):
636
- sp = item.split("|")
637
- if len(sp) != 2:
638
- continue
639
-
640
- text = sp[1].lower().strip()
641
-
642
- for t in text:
643
- if t not in vocab and t not in miss_symbols_keep:
644
- miss_symbols.append(t)
645
- miss_symbols_keep[t] = t
646
- if miss_symbols == []:
647
- info = "You can train using your language !"
648
- else:
649
- info = f"The following symbols are missing in your language : {len(miss_symbols)}\n\n" + "\n".join(miss_symbols)
650
-
651
- return info
652
-
653
-
654
- def get_random_sample_prepare(project_name):
655
- name_project = project_name + "_pinyin"
656
- path_project = os.path.join(path_data, name_project)
657
- file_arrow = os.path.join(path_project, "raw.arrow")
658
- if not os.path.isfile(file_arrow):
659
- return "", None
660
- dataset = Dataset_.from_file(file_arrow)
661
- random_sample = dataset.shuffle(seed=random.randint(0, 1000)).select([0])
662
- text = "[" + " , ".join(["' " + t + " '" for t in random_sample["text"][0]]) + "]"
663
- audio_path = random_sample["audio_path"][0]
664
- return text, audio_path
665
-
666
-
667
- def get_random_sample_transcribe(project_name):
668
- name_project = project_name + "_pinyin"
669
- path_project = os.path.join(path_data, name_project)
670
- file_metadata = os.path.join(path_project, "metadata.csv")
671
- if not os.path.isfile(file_metadata):
672
- return "", None
673
-
674
- data = ""
675
- with open(file_metadata, "r", encoding="utf-8") as f:
676
- data = f.read()
677
-
678
- list_data = []
679
- for item in data.split("\n"):
680
- sp = item.split("|")
681
- if len(sp) != 2:
682
- continue
683
- list_data.append([os.path.join(path_project, "wavs", sp[0] + ".wav"), sp[1]])
684
-
685
- if list_data == []:
686
- return "", None
687
-
688
- random_item = random.choice(list_data)
689
-
690
- return random_item[1], random_item[0]
691
-
692
-
693
- def get_random_sample_infer(project_name):
694
- text, audio = get_random_sample_transcribe(project_name)
695
- return (
696
- text,
697
- text,
698
- audio,
699
- )
700
-
701
-
702
- def infer(file_checkpoint, exp_name, ref_text, ref_audio, gen_text, nfe_step):
703
- global last_checkpoint, last_device, tts_api
704
-
705
- if not os.path.isfile(file_checkpoint):
706
- return None
707
-
708
- if training_process is not None:
709
- device_test = "cpu"
710
- else:
711
- device_test = None
712
-
713
- if last_checkpoint != file_checkpoint or last_device != device_test:
714
- if last_checkpoint != file_checkpoint:
715
- last_checkpoint = file_checkpoint
716
- if last_device != device_test:
717
- last_device = device_test
718
-
719
- tts_api = F5TTS(model_type=exp_name, ckpt_file=file_checkpoint, device=device_test)
720
-
721
- print("update", device_test, file_checkpoint)
722
-
723
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
724
- tts_api.infer(gen_text=gen_text, ref_text=ref_text, ref_file=ref_audio, nfe_step=nfe_step, file_wave=f.name)
725
- return f.name
726
-
727
-
728
- with gr.Blocks() as app:
729
- with gr.Row():
730
- project_name = gr.Textbox(label="project name", value="my_speak")
731
- bt_create = gr.Button("create new project")
732
-
733
- bt_create.click(fn=create_data_project, inputs=[project_name])
734
-
735
- with gr.Tabs():
736
- with gr.TabItem("transcribe Data"):
737
- ch_manual = gr.Checkbox(label="user", value=False)
738
-
739
- mark_info_transcribe = gr.Markdown(
740
- """```plaintext
741
- Place your audio files in the '{your_project_name}/dataset' directory.
742
-
743
- my_speak/
744
-
745
- └── dataset/
746
- ├── audio1.wav
747
- └── audio2.wav
748
- ...
749
- ```""",
750
- visible=False,
751
- )
752
-
753
- audio_speaker = gr.File(label="voice", type="filepath", file_count="multiple")
754
- txt_lang = gr.Text(label="Language", value="english")
755
- bt_transcribe = gr.Button("transcribe")
756
- txt_info_transcribe = gr.Text(label="info", value="")
757
- bt_transcribe.click(
758
- fn=transcribe_all,
759
- inputs=[project_name, audio_speaker, txt_lang, ch_manual],
760
- outputs=[txt_info_transcribe],
761
- )
762
- ch_manual.change(fn=check_user, inputs=[ch_manual], outputs=[audio_speaker, mark_info_transcribe])
763
-
764
- random_sample_transcribe = gr.Button("random sample")
765
-
766
- with gr.Row():
767
- random_text_transcribe = gr.Text(label="Text")
768
- random_audio_transcribe = gr.Audio(label="Audio", type="filepath")
769
-
770
- random_sample_transcribe.click(
771
- fn=get_random_sample_transcribe,
772
- inputs=[project_name],
773
- outputs=[random_text_transcribe, random_audio_transcribe],
774
- )
775
-
776
- with gr.TabItem("prepare Data"):
777
- gr.Markdown(
778
- """```plaintext
779
- Place your 'wavs' folder and your 'metadata.csv' file in the '{your_project_name}' directory.
780
- my_speak/
781
-
782
- ├── wavs/
783
- │ ├── audio1.wav
784
- │ └── audio2.wav
785
- | ...
786
-
787
- └── metadata.csv
788
-
789
- metadata.csv format:
790
-
791
- audio1|text1
792
- audio2|text2
793
- ...
794
-
795
- ```"""
796
- )
797
-
798
- bt_prepare = gr.Button("prepare")
799
- txt_info_prepare = gr.Text(label="info", value="")
800
- bt_prepare.click(fn=create_metadata, inputs=[project_name], outputs=[txt_info_prepare])
801
-
802
- random_sample_prepare = gr.Button("random sample")
803
-
804
- with gr.Row():
805
- random_text_prepare = gr.Text(label="Pinyin")
806
- random_audio_prepare = gr.Audio(label="Audio", type="filepath")
807
-
808
- random_sample_prepare.click(
809
- fn=get_random_sample_prepare, inputs=[project_name], outputs=[random_text_prepare, random_audio_prepare]
810
- )
811
-
812
- with gr.TabItem("train Data"):
813
- with gr.Row():
814
- bt_calculate = gr.Button("Auto Settings")
815
- ch_finetune = gr.Checkbox(label="finetune", value=True)
816
- lb_samples = gr.Label(label="samples")
817
- batch_size_type = gr.Radio(label="Batch Size Type", choices=["frame", "sample"], value="frame")
818
-
819
- with gr.Row():
820
- exp_name = gr.Radio(label="Model", choices=["F5TTS_Base", "E2TTS_Base"], value="F5TTS_Base")
821
- learning_rate = gr.Number(label="Learning Rate", value=1e-5, step=1e-5)
822
-
823
- with gr.Row():
824
- batch_size_per_gpu = gr.Number(label="Batch Size per GPU", value=1000)
825
- max_samples = gr.Number(label="Max Samples", value=64)
826
-
827
- with gr.Row():
828
- grad_accumulation_steps = gr.Number(label="Gradient Accumulation Steps", value=1)
829
- max_grad_norm = gr.Number(label="Max Gradient Norm", value=1.0)
830
-
831
- with gr.Row():
832
- epochs = gr.Number(label="Epochs", value=10)
833
- num_warmup_updates = gr.Number(label="Warmup Updates", value=5)
834
-
835
- with gr.Row():
836
- save_per_updates = gr.Number(label="Save per Updates", value=10)
837
- last_per_steps = gr.Number(label="Last per Steps", value=50)
838
-
839
- with gr.Row():
840
- start_button = gr.Button("Start Training")
841
- stop_button = gr.Button("Stop Training", interactive=False)
842
-
843
- txt_info_train = gr.Text(label="info", value="")
844
- start_button.click(
845
- fn=start_training,
846
- inputs=[
847
- project_name,
848
- exp_name,
849
- learning_rate,
850
- batch_size_per_gpu,
851
- batch_size_type,
852
- max_samples,
853
- grad_accumulation_steps,
854
- max_grad_norm,
855
- epochs,
856
- num_warmup_updates,
857
- save_per_updates,
858
- last_per_steps,
859
- ch_finetune,
860
- ],
861
- outputs=[txt_info_train, start_button, stop_button],
862
- )
863
- stop_button.click(fn=stop_training, outputs=[txt_info_train, start_button, stop_button])
864
- bt_calculate.click(
865
- fn=calculate_train,
866
- inputs=[
867
- project_name,
868
- batch_size_type,
869
- max_samples,
870
- learning_rate,
871
- num_warmup_updates,
872
- save_per_updates,
873
- last_per_steps,
874
- ch_finetune,
875
- ],
876
- outputs=[
877
- batch_size_per_gpu,
878
- max_samples,
879
- num_warmup_updates,
880
- save_per_updates,
881
- last_per_steps,
882
- lb_samples,
883
- learning_rate,
884
- ],
885
- )
886
-
887
- with gr.TabItem("reduse checkpoint"):
888
- txt_path_checkpoint = gr.Text(label="path checkpoint :")
889
- txt_path_checkpoint_small = gr.Text(label="path output :")
890
- txt_info_reduce = gr.Text(label="info", value="")
891
- reduce_button = gr.Button("reduce")
892
- reduce_button.click(
893
- fn=extract_and_save_ema_model,
894
- inputs=[txt_path_checkpoint, txt_path_checkpoint_small],
895
- outputs=[txt_info_reduce],
896
- )
897
-
898
- with gr.TabItem("vocab check experiment"):
899
- check_button = gr.Button("check vocab")
900
- txt_info_check = gr.Text(label="info", value="")
901
- check_button.click(fn=vocab_check, inputs=[project_name], outputs=[txt_info_check])
902
-
903
- with gr.TabItem("test model"):
904
- exp_name = gr.Radio(label="Model", choices=["F5-TTS", "E2-TTS"], value="F5-TTS")
905
- nfe_step = gr.Number(label="n_step", value=32)
906
- file_checkpoint_pt = gr.Textbox(label="Checkpoint", value="")
907
-
908
- random_sample_infer = gr.Button("random sample")
909
-
910
- ref_text = gr.Textbox(label="ref text")
911
- ref_audio = gr.Audio(label="audio ref", type="filepath")
912
- gen_text = gr.Textbox(label="gen text")
913
- random_sample_infer.click(
914
- fn=get_random_sample_infer, inputs=[project_name], outputs=[ref_text, gen_text, ref_audio]
915
- )
916
- check_button_infer = gr.Button("infer")
917
- gen_audio = gr.Audio(label="audio gen", type="filepath")
918
-
919
- check_button_infer.click(
920
- fn=infer,
921
- inputs=[file_checkpoint_pt, exp_name, ref_text, ref_audio, gen_text, nfe_step],
922
- outputs=[gen_audio],
923
- )
924
-
925
-
926
- @click.command()
927
- @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
928
- @click.option("--host", "-H", default=None, help="Host to run the app on")
929
- @click.option(
930
- "--share",
931
- "-s",
932
- default=False,
933
- is_flag=True,
934
- help="Share the app via Gradio share link",
935
- )
936
- @click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
937
- def main(port, host, share, api):
938
- global app
939
- print("Starting app...")
940
- app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
941
-
942
-
943
- if __name__ == "__main__":
944
- main()
 
1
+ import gc
2
+ import json
3
+ import os
4
+ import platform
5
+ import psutil
6
+ import random
7
+ import signal
8
+ import shutil
9
+ import subprocess
10
+ import sys
11
+ import tempfile
12
+ import time
13
+ from glob import glob
14
+
15
+ import click
16
+ import gradio as gr
17
+ import librosa
18
+ import numpy as np
19
+ import torch
20
+ import torchaudio
21
+ from datasets import Dataset as Dataset_
22
+ from datasets.arrow_writer import ArrowWriter
23
+ from scipy.io import wavfile
24
+ from transformers import pipeline
25
+
26
+ from f5_tts.api import F5TTS
27
+ from f5_tts.model.utils import convert_char_to_pinyin
28
+
29
+
30
+ training_process = None
31
+ system = platform.system()
32
+ python_executable = sys.executable or "python"
33
+ tts_api = None
34
+ last_checkpoint = ""
35
+ last_device = ""
36
+
37
+ path_data = "data"
38
+
39
+ device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
40
+
41
+ pipe = None
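+ # The Whisper ASR pipeline is created lazily on first use (see transcribe below)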
42
+
43
+
44
+ # Dataset helpers
45
+ def get_audio_duration(audio_path):
46
+ """Calculate the duration of an audio file."""
47
+ audio, sample_rate = torchaudio.load(audio_path)
48
+ # torchaudio.load returns a (channels, frames) tensor, so duration is frames / sample_rate
49
+ return audio.shape[1] / sample_rate
50
+
51
+
52
+ def clear_text(text):
53
+ """Clean and prepare text by lowering the case and stripping whitespace."""
54
+ return text.lower().strip()
55
+
56
+
57
+ def get_rms(
58
+ y,
59
+ frame_length=2048,
60
+ hop_length=512,
61
+ pad_mode="constant",
62
+ ): # https://github.com/RVC-Boss/GPT-SoVITS/blob/main/tools/slicer2.py
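+ # Frame the padded signal with numpy stride tricks and return the per-frame RMS,
+ # roughly mirroring librosa's RMS feature without importing it here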
63
+ padding = (int(frame_length // 2), int(frame_length // 2))
64
+ y = np.pad(y, padding, mode=pad_mode)
65
+
66
+ axis = -1
67
+ # put our new within-frame axis at the end for now
68
+ out_strides = y.strides + tuple([y.strides[axis]])
69
+ # Reduce the shape on the framing axis
70
+ x_shape_trimmed = list(y.shape)
71
+ x_shape_trimmed[axis] -= frame_length - 1
72
+ out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
73
+ xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
74
+ if axis < 0:
75
+ target_axis = axis - 1
76
+ else:
77
+ target_axis = axis + 1
78
+ xw = np.moveaxis(xw, -1, target_axis)
79
+ # Downsample along the target axis
80
+ slices = [slice(None)] * xw.ndim
81
+ slices[axis] = slice(0, None, hop_length)
82
+ x = xw[tuple(slices)]
83
+
84
+ # Calculate power
85
+ power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
86
+
87
+ return np.sqrt(power)
88
+
89
+
90
+ class Slicer: # https://github.com/RVC-Boss/GPT-SoVITS/blob/main/tools/slicer2.py
91
+ def __init__(
92
+ self,
93
+ sr: int,
94
+ threshold: float = -40.0,
95
+ min_length: int = 2000,
96
+ min_interval: int = 300,
97
+ hop_size: int = 20,
98
+ max_sil_kept: int = 2000,
99
+ ):
100
+ if not min_length >= min_interval >= hop_size:
101
+ raise ValueError("The following condition must be satisfied: min_length >= min_interval >= hop_size")
102
+ if not max_sil_kept >= hop_size:
103
+ raise ValueError("The following condition must be satisfied: max_sil_kept >= hop_size")
104
+ min_interval = sr * min_interval / 1000
105
+ self.threshold = 10 ** (threshold / 20.0)
106
+ self.hop_size = round(sr * hop_size / 1000)
107
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
108
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
109
+ self.min_interval = round(min_interval / self.hop_size)
110
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
111
+
112
+ def _apply_slice(self, waveform, begin, end):
113
+ if len(waveform.shape) > 1:
114
+ return waveform[:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)]
115
+ else:
116
+ return waveform[begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)]
117
+
118
+ # @timeit
119
+ def slice(self, waveform):
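+ # Scan the per-frame RMS for silent stretches, record them as (start, end) tags,
+ # then cut the waveform between tags into [chunk, start, end] triples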
120
+ if len(waveform.shape) > 1:
121
+ samples = waveform.mean(axis=0)
122
+ else:
123
+ samples = waveform
124
+ if samples.shape[0] <= self.min_length:
125
+ return [waveform]
126
+ rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
127
+ sil_tags = []
128
+ silence_start = None
129
+ clip_start = 0
130
+ for i, rms in enumerate(rms_list):
131
+ # Keep looping while frame is silent.
132
+ if rms < self.threshold:
133
+ # Record start of silent frames.
134
+ if silence_start is None:
135
+ silence_start = i
136
+ continue
137
+ # Keep looping while frame is not silent and silence start has not been recorded.
138
+ if silence_start is None:
139
+ continue
140
+ # Clear recorded silence start if interval is not enough or clip is too short
141
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
142
+ need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
143
+ if not is_leading_silence and not need_slice_middle:
144
+ silence_start = None
145
+ continue
146
+ # Need slicing. Record the range of silent frames to be removed.
147
+ if i - silence_start <= self.max_sil_kept:
148
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
149
+ if silence_start == 0:
150
+ sil_tags.append((0, pos))
151
+ else:
152
+ sil_tags.append((pos, pos))
153
+ clip_start = pos
154
+ elif i - silence_start <= self.max_sil_kept * 2:
155
+ pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin()
156
+ pos += i - self.max_sil_kept
157
+ pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start
158
+ pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept
159
+ if silence_start == 0:
160
+ sil_tags.append((0, pos_r))
161
+ clip_start = pos_r
162
+ else:
163
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
164
+ clip_start = max(pos_r, pos)
165
+ else:
166
+ pos_l = rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start
167
+ pos_r = rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept
168
+ if silence_start == 0:
169
+ sil_tags.append((0, pos_r))
170
+ else:
171
+ sil_tags.append((pos_l, pos_r))
172
+ clip_start = pos_r
173
+ silence_start = None
174
+ # Deal with trailing silence.
175
+ total_frames = rms_list.shape[0]
176
+ if silence_start is not None and total_frames - silence_start >= self.min_interval:
177
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
178
+ pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
179
+ sil_tags.append((pos, total_frames + 1))
180
+ # Apply and return slices.
181
+ # Each returned chunk is [audio, start sample, end sample]
182
+ if len(sil_tags) == 0:
183
+ return [[waveform, 0, int(total_frames * self.hop_size)]]
184
+ else:
185
+ chunks = []
186
+ if sil_tags[0][0] > 0:
187
+ chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]), 0, int(sil_tags[0][0] * self.hop_size)])
188
+ for i in range(len(sil_tags) - 1):
189
+ chunks.append(
190
+ [
191
+ self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),
192
+ int(sil_tags[i][1] * self.hop_size),
193
+ int(sil_tags[i + 1][0] * self.hop_size),
194
+ ]
195
+ )
196
+ if sil_tags[-1][1] < total_frames:
197
+ chunks.append(
198
+ [
199
+ self._apply_slice(waveform, sil_tags[-1][1], total_frames),
200
+ int(sil_tags[-1][1] * self.hop_size),
201
+ int(total_frames * self.hop_size),
202
+ ]
203
+ )
204
+ return chunks
205
+
206
+
207
+ # Process termination helpers
208
+ def terminate_process_tree(pid, including_parent=True):
209
+ try:
210
+ parent = psutil.Process(pid)
211
+ except psutil.NoSuchProcess:
212
+ # Process already terminated
213
+ return
214
+
215
+ children = parent.children(recursive=True)
216
+ for child in children:
217
+ try:
218
+ os.kill(child.pid, signal.SIGTERM) # or signal.SIGKILL
219
+ except OSError:
220
+ pass
221
+ if including_parent:
222
+ try:
223
+ os.kill(parent.pid, signal.SIGTERM) # or signal.SIGKILL
224
+ except OSError:
225
+ pass
226
+
227
+
228
+ def terminate_process(pid):
229
+ if system == "Windows":
230
+ cmd = f"taskkill /t /f /pid {pid}"
231
+ os.system(cmd)
232
+ else:
233
+ terminate_process_tree(pid)
234
+
235
+
236
+ def start_training(
237
+ dataset_name="",
238
+ exp_name="F5TTS_Base",
239
+ learning_rate=1e-4,
240
+ batch_size_per_gpu=400,
241
+ batch_size_type="frame",
242
+ max_samples=64,
243
+ grad_accumulation_steps=1,
244
+ max_grad_norm=1.0,
245
+ epochs=11,
246
+ num_warmup_updates=200,
247
+ save_per_updates=400,
248
+ last_per_steps=800,
249
+ finetune=True,
250
+ ):
251
+ global training_process, tts_api
252
+
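+ # Unload any loaded inference model first so its GPU memory is free for training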
253
+ if tts_api is not None:
254
+ del tts_api
255
+ gc.collect()
256
+ torch.cuda.empty_cache()
257
+ tts_api = None
258
+
259
+ path_project = os.path.join(path_data, dataset_name + "_pinyin")
260
+
261
+ if not os.path.isdir(path_project):
262
+ yield (
263
+ f"There is not project with name {dataset_name}",
264
+ gr.update(interactive=True),
265
+ gr.update(interactive=False),
266
+ )
267
+ return
268
+
269
+ file_raw = os.path.join(path_project, "raw.arrow")
270
+ if not os.path.isfile(file_raw):
271
+ yield f"There is no file {file_raw}", gr.update(interactive=True), gr.update(interactive=False)
272
+ return
273
+
274
+ # Check if a training process is already running
275
+ if training_process is not None:
276
+ return "Train run already!", gr.update(interactive=False), gr.update(interactive=True)
277
+
278
+ yield "start train", gr.update(interactive=False), gr.update(interactive=False)
279
+
280
+ # Command to run the training script with the specified arguments
281
+ cmd = (
282
+ f"accelerate launch finetune-cli.py --exp_name {exp_name} "
283
+ f"--learning_rate {learning_rate} "
284
+ f"--batch_size_per_gpu {batch_size_per_gpu} "
285
+ f"--batch_size_type {batch_size_type} "
286
+ f"--max_samples {max_samples} "
287
+ f"--grad_accumulation_steps {grad_accumulation_steps} "
288
+ f"--max_grad_norm {max_grad_norm} "
289
+ f"--epochs {epochs} "
290
+ f"--num_warmup_updates {num_warmup_updates} "
291
+ f"--save_per_updates {save_per_updates} "
292
+ f"--last_per_steps {last_per_steps} "
293
+ f"--dataset_name {dataset_name}"
294
+ )
295
+ if finetune:
296
+ cmd += f" --finetune {finetune}"
297
+
298
+ print(cmd)
299
+
300
+ try:
301
+ # Start the training process
302
+ training_process = subprocess.Popen(cmd, shell=True)
303
+
304
+ time.sleep(5)
305
+ yield "train start", gr.update(interactive=False), gr.update(interactive=True)
306
+
307
+ # Wait for the training process to finish
308
+ training_process.wait()
309
+ time.sleep(1)
310
+
311
+ if training_process is None:
312
+ text_info = "train stop"
313
+ else:
314
+ text_info = "train complete !"
315
+
316
+ except Exception as e: # Catch all exceptions
317
+ # Ensure that we reset the training process variable in case of an error
318
+ text_info = f"An error occurred: {str(e)}"
319
+
320
+ training_process = None
321
+
322
+ yield text_info, gr.update(interactive=True), gr.update(interactive=False)
323
+
324
+
325
+ def stop_training():
326
+ global training_process
327
+ if training_process is None:
328
+ return "Train not run !", gr.update(interactive=True), gr.update(interactive=False)
329
+ terminate_process_tree(training_process.pid)
330
+ training_process = None
331
+ return "train stop", gr.update(interactive=True), gr.update(interactive=False)
332
+
333
+
334
+ def create_data_project(name):
335
+ name += "_pinyin"
336
+ os.makedirs(os.path.join(path_data, name), exist_ok=True)
337
+ os.makedirs(os.path.join(path_data, name, "dataset"), exist_ok=True)
338
+
339
+
340
+ def transcribe(file_audio, language="english"):
341
+ global pipe
342
+
343
+ if pipe is None:
344
+ pipe = pipeline(
345
+ "automatic-speech-recognition",
346
+ model="openai/whisper-large-v3-turbo",
347
+ torch_dtype=torch.float16,
348
+ device=device,
349
+ )
350
+
351
+ text_transcribe = pipe(
352
+ file_audio,
353
+ chunk_length_s=30,
354
+ batch_size=128,
355
+ generate_kwargs={"task": "transcribe", "language": language},
356
+ return_timestamps=False,
357
+ )["text"].strip()
358
+ return text_transcribe
359
+
360
+
361
+ def transcribe_all(name_project, audio_files, language, user=False, progress=gr.Progress()):
362
+ name_project += "_pinyin"
363
+ path_project = os.path.join(path_data, name_project)
364
+ path_dataset = os.path.join(path_project, "dataset")
365
+ path_project_wavs = os.path.join(path_project, "wavs")
366
+ file_metadata = os.path.join(path_project, "metadata.csv")
367
+
368
+ if audio_files is None:
369
+ return "You need to load an audio file."
370
+
371
+ if os.path.isdir(path_project_wavs):
372
+ shutil.rmtree(path_project_wavs)
373
+
374
+ if os.path.isfile(file_metadata):
375
+ os.remove(file_metadata)
376
+
377
+ os.makedirs(path_project_wavs, exist_ok=True)
378
+
379
+ if user:
380
+ file_audios = [
381
+ file
382
+ for format in ("*.wav", "*.ogg", "*.opus", "*.mp3", "*.flac")
383
+ for file in glob(os.path.join(path_dataset, format))
384
+ ]
385
+ if file_audios == []:
386
+ return "No audio file was found in the dataset."
387
+ else:
388
+ file_audios = audio_files
389
+
390
+ alpha = 0.5
391
+ _max = 1.0
392
+ slicer = Slicer(24000)
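+ # Audio is loaded and saved at 24 kHz, the sample rate used throughout this pipeline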
393
+
394
+ num = 0
395
+ error_num = 0
396
+ data = ""
397
+ for file_audio in progress.tqdm(file_audios, desc="transcribe files", total=len(file_audios)):
398
+ audio, _ = librosa.load(file_audio, sr=24000, mono=True)
399
+
400
+ list_slicer = slicer.slice(audio)
401
+ for chunk, start, end in progress.tqdm(list_slicer, total=len(list_slicer), desc="slicer files"):
402
+ name_segment = f"segment_{num}"
403
+ file_segment = os.path.join(path_project_wavs, f"{name_segment}.wav")
404
+
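+ # Blend a peak-normalized copy of the chunk with the original (alpha mix) to tame loud segments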
405
+ tmp_max = np.abs(chunk).max()
406
+ if tmp_max > 1:
407
+ chunk /= tmp_max
408
+ chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
409
+ wavfile.write(file_segment, 24000, (chunk * 32767).astype(np.int16))
410
+
411
+ try:
412
+ text = transcribe(file_segment, language)
413
+ text = text.lower().strip().replace('"', "")
414
+
415
+ data += f"{name_segment}|{text}\n"
416
+
417
+ num += 1
418
+ except: # noqa: E722
419
+ error_num += 1
420
+
421
+ with open(file_metadata, "w", encoding="utf-8") as f:
422
+ f.write(data)
423
+
424
+ if error_num != 0:
425
+ error_text = f"\nerror files : {error_num}"
426
+ else:
427
+ error_text = ""
428
+
429
+ return f"transcribe complete samples : {num}\npath : {path_project_wavs}{error_text}"
430
+
431
+
432
+ def format_seconds_to_hms(seconds):
433
+ hours = int(seconds / 3600)
434
+ minutes = int((seconds % 3600) / 60)
435
+ seconds = seconds % 60
436
+ return "{:02d}:{:02d}:{:02d}".format(hours, minutes, int(seconds))
437
+
438
+
439
+ def create_metadata(name_project, progress=gr.Progress()):
440
+ name_project += "_pinyin"
441
+ path_project = os.path.join(path_data, name_project)
442
+ path_project_wavs = os.path.join(path_project, "wavs")
443
+ file_metadata = os.path.join(path_project, "metadata.csv")
444
+ file_raw = os.path.join(path_project, "raw.arrow")
445
+ file_duration = os.path.join(path_project, "duration.json")
446
+ file_vocab = os.path.join(path_project, "vocab.txt")
447
+
448
+ if not os.path.isfile(file_metadata):
449
+ return "The file was not found in " + file_metadata
450
+
451
+ with open(file_metadata, "r", encoding="utf-8") as f:
452
+ data = f.read()
453
+
454
+ audio_path_list = []
455
+ text_list = []
456
+ duration_list = []
457
+
458
+ count = len(data.split("\n"))
459
+ length = 0
460
+ result = []
461
+ error_files = []
462
+ for line in progress.tqdm(data.split("\n"), total=count):
463
+ sp_line = line.split("|")
464
+ if len(sp_line) != 2:
465
+ continue
466
+ name_audio, text = sp_line[:2]
467
+
468
+ file_audio = os.path.join(path_project_wavs, name_audio + ".wav")
469
+
470
+ if not os.path.isfile(file_audio):
471
+ error_files.append(file_audio)
472
+ continue
473
+
474
+ duration = get_audio_duration(file_audio)
475
+ if duraction < 2 and duraction > 15:
476
+ continue
477
+ if len(text) < 4:
478
+ continue
479
+
480
+ text = clear_text(text)
481
+ text = convert_char_to_pinyin([text], polyphone=True)[0]
482
+
483
+ audio_path_list.append(file_audio)
484
+ duration_list.append(duration)
485
+ text_list.append(text)
486
+
487
+ result.append({"audio_path": file_audio, "text": text, "duration": duraction})
488
+
489
+ length += duration
490
+
491
+ if duration_list == []:
492
+ error_files_text = "\n".join(error_files)
493
+ return f"Error: No audio files found in the specified path : \n{error_files_text}"
494
+
495
+ min_second = round(min(duration_list), 2)
496
+ max_second = round(max(duration_list), 2)
497
+
498
+ with ArrowWriter(path=file_raw, writer_batch_size=1) as writer:
499
+ for line in progress.tqdm(result, total=len(result), desc="prepare data"):
500
+ writer.write(line)
501
+
502
+ with open(file_duration, "w", encoding="utf-8") as f:
503
+ json.dump({"duration": duration_list}, f, ensure_ascii=False)
504
+
505
+ file_vocab_finetune = "data/Emilia_ZH_EN_pinyin/vocab.txt"
506
+ if not os.path.isfile(file_vocab_finetune):
507
+ return "Error: Vocabulary file 'Emilia_ZH_EN_pinyin' not found!"
508
+ shutil.copy2(file_vocab_finetune, file_vocab)
509
+
510
+ if error_files != []:
511
+ error_text = "error files\n" + "\n".join(error_files)
512
+ else:
513
+ error_text = ""
514
+
515
+ return f"prepare complete \nsamples : {len(text_list)}\ntime data : {format_seconds_to_hms(lenght)}\nmin sec : {min_second}\nmax sec : {max_second}\nfile_arrow : {file_raw}\n{error_text}"
516
+
517
+
518
+ def check_user(value):
519
+ return gr.update(visible=not value), gr.update(visible=value)
520
+
521
+
522
+ def calculate_train(
523
+ name_project,
524
+ batch_size_type,
525
+ max_samples,
526
+ learning_rate,
527
+ num_warmup_updates,
528
+ save_per_updates,
529
+ last_per_steps,
530
+ finetune,
531
+ ):
532
+ name_project += "_pinyin"
533
+ path_project = os.path.join(path_data, name_project)
534
+ file_duration = os.path.join(path_project, "duration.json")
535
+
536
+ if not os.path.isfile(file_duration):
537
+ return (
538
+ 1000,
539
+ max_samples,
540
+ num_warmup_updates,
541
+ save_per_updates,
542
+ last_per_steps,
543
+ "project not found !",
544
+ learning_rate,
545
+ )
546
+
547
+ with open(file_duration, "r") as file:
548
+ data = json.load(file)
549
+
550
+ duration_list = data["duration"]
551
+
552
+ samples = len(duration_list)
553
+
554
+ if torch.cuda.is_available():
555
+ gpu_properties = torch.cuda.get_device_properties(0)
556
+ total_memory = gpu_properties.total_memory / (1024**3)
557
+ elif torch.backends.mps.is_available():
558
+ total_memory = psutil.virtual_memory().available / (1024**3)
+ else:
+ # CPU-only fallback so total_memory is always defined below
+ total_memory = psutil.virtual_memory().available / (1024**3)
559
+
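+ # Rough auto-tuning heuristics: derive the batch size from available memory
+ # and round the derived values up to even numbers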
560
+ if batch_size_type == "frame":
561
+ batch = int(total_memory * 0.5)
562
+ batch = (lambda num: num + 1 if num % 2 != 0 else num)(batch)
563
+ batch_size_per_gpu = int(38400 / batch)
564
+ else:
565
+ batch_size_per_gpu = int(total_memory / 8)
566
+ batch_size_per_gpu = (lambda num: num + 1 if num % 2 != 0 else num)(batch_size_per_gpu)
567
+ batch = batch_size_per_gpu
568
+
569
+ if batch_size_per_gpu <= 0:
570
+ batch_size_per_gpu = 1
571
+
572
+ if samples < 64:
573
+ max_samples = int(samples * 0.25)
574
+ else:
575
+ max_samples = 64
576
+
577
+ num_warmup_updates = int(samples * 0.05)
578
+ save_per_updates = int(samples * 0.10)
579
+ last_per_steps = int(save_per_updates * 5)
580
+
581
+ max_samples = (lambda num: num + 1 if num % 2 != 0 else num)(max_samples)
582
+ num_warmup_updates = (lambda num: num + 1 if num % 2 != 0 else num)(num_warmup_updates)
583
+ save_per_updates = (lambda num: num + 1 if num % 2 != 0 else num)(save_per_updates)
584
+ last_per_steps = (lambda num: num + 1 if num % 2 != 0 else num)(last_per_steps)
585
+
586
+ if finetune:
587
+ learning_rate = 1e-5
588
+ else:
589
+ learning_rate = 7.5e-5
590
+
591
+ return batch_size_per_gpu, max_samples, num_warmup_updates, save_per_updates, last_per_steps, samples, learning_rate
592
+
593
+
594
+ def extract_and_save_ema_model(checkpoint_path: str, new_checkpoint_path: str) -> str:
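+ # Keep only the EMA weights (all that inference needs) to shrink the checkpoint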
595
+ try:
596
+ checkpoint = torch.load(checkpoint_path)
597
+ print("Original Checkpoint Keys:", checkpoint.keys())
598
+
599
+ ema_model_state_dict = checkpoint.get("ema_model_state_dict", None)
600
+
601
+ if ema_model_state_dict is not None:
602
+ new_checkpoint = {"ema_model_state_dict": ema_model_state_dict}
603
+ torch.save(new_checkpoint, new_checkpoint_path)
604
+ return f"New checkpoint saved at: {new_checkpoint_path}"
605
+ else:
606
+ return "No 'ema_model_state_dict' found in the checkpoint."
607
+
608
+ except Exception as e:
609
+ return f"An error occurred: {e}"
610
+
611
+
612
+ def vocab_check(project_name):
613
+ name_project = project_name + "_pinyin"
614
+ path_project = os.path.join(path_data, name_project)
615
+
616
+ file_metadata = os.path.join(path_project, "metadata.csv")
617
+
618
+ file_vocab = "data/Emilia_ZH_EN_pinyin/vocab.txt"
619
+ if not os.path.isfile(file_vocab):
620
+ return f"the file {file_vocab} not found !"
621
+
622
+ with open(file_vocab, "r", encoding="utf-8") as f:
623
+ data = f.read()
624
+
625
+ vocab = data.split("\n")
626
+
627
+ if not os.path.isfile(file_metadata):
628
+ return f"the file {file_metadata} not found !"
629
+
630
+ with open(file_metadata, "r", encoding="utf-8") as f:
631
+ data = f.read()
632
+
633
+ miss_symbols = []
634
+ miss_symbols_keep = {}
635
+ for item in data.split("\n"):
636
+ sp = item.split("|")
637
+ if len(sp) != 2:
638
+ continue
639
+
640
+ text = sp[1].lower().strip()
641
+
642
+ for t in text:
643
+ if t not in vocab and t not in miss_symbols_keep:
644
+ miss_symbols.append(t)
645
+ miss_symbols_keep[t] = t
646
+ if miss_symbols == []:
647
+ info = "You can train using your language !"
648
+ else:
649
+ info = f"The following symbols are missing in your language : {len(miss_symbols)}\n\n" + "\n".join(miss_symbols)
650
+
651
+ return info
652
+
653
+
654
+ def get_random_sample_prepare(project_name):
655
+ name_project = project_name + "_pinyin"
656
+ path_project = os.path.join(path_data, name_project)
657
+ file_arrow = os.path.join(path_project, "raw.arrow")
658
+ if not os.path.isfile(file_arrow):
659
+ return "", None
660
+ dataset = Dataset_.from_file(file_arrow)
661
+ random_sample = dataset.shuffle(seed=random.randint(0, 1000)).select([0])
662
+ text = "[" + " , ".join(["' " + t + " '" for t in random_sample["text"][0]]) + "]"
663
+ audio_path = random_sample["audio_path"][0]
664
+ return text, audio_path
665
+
666
+
667
+ def get_random_sample_transcribe(project_name):
668
+ name_project = project_name + "_pinyin"
669
+ path_project = os.path.join(path_data, name_project)
670
+ file_metadata = os.path.join(path_project, "metadata.csv")
671
+ if not os.path.isfile(file_metadata):
672
+ return "", None
673
+
674
+ data = ""
675
+ with open(file_metadata, "r", encoding="utf-8") as f:
676
+ data = f.read()
677
+
678
+ list_data = []
679
+ for item in data.split("\n"):
680
+ sp = item.split("|")
681
+ if len(sp) != 2:
682
+ continue
683
+ list_data.append([os.path.join(path_project, "wavs", sp[0] + ".wav"), sp[1]])
684
+
685
+ if list_data == []:
686
+ return "", None
687
+
688
+ random_item = random.choice(list_data)
689
+
690
+ return random_item[1], random_item[0]
691
+
692
+
693
+ def get_random_sample_infer(project_name):
694
+ text, audio = get_random_sample_transcribe(project_name)
695
+ return (
696
+ text,
697
+ text,
698
+ audio,
699
+ )
700
+
701
+
702
+ def infer(file_checkpoint, exp_name, ref_text, ref_audio, gen_text, nfe_step):
703
+ global last_checkpoint, last_device, tts_api
704
+
705
+ if not os.path.isfile(file_checkpoint):
706
+ return None
707
+
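+ # While training is running, run inference on the CPU so the two jobs do not compete for GPU memory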
708
+ if training_process is not None:
709
+ device_test = "cpu"
710
+ else:
711
+ device_test = None
712
+
713
+ if last_checkpoint != file_checkpoint or last_device != device_test:
714
+ if last_checkpoint != file_checkpoint:
715
+ last_checkpoint = file_checkpoint
716
+ if last_device != device_test:
717
+ last_device = device_test
718
+
719
+ tts_api = F5TTS(model_type=exp_name, ckpt_file=file_checkpoint, device=device_test)
720
+
721
+ print("update", device_test, file_checkpoint)
722
+
723
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
724
+ tts_api.infer(gen_text=gen_text, ref_text=ref_text, ref_file=ref_audio, nfe_step=nfe_step, file_wave=f.name)
725
+ return f.name
726
+
727
+
728
+ with gr.Blocks() as app:
729
+ with gr.Row():
730
+ project_name = gr.Textbox(label="project name", value="my_speak")
731
+ bt_create = gr.Button("create new project")
732
+
733
+ bt_create.click(fn=create_data_project, inputs=[project_name])
734
+
735
+ with gr.Tabs():
736
+ with gr.TabItem("transcribe Data"):
737
+ ch_manual = gr.Checkbox(label="user", value=False)
738
+
739
+ mark_info_transcribe = gr.Markdown(
740
+ """```plaintext
741
+ Place your audio files in the '{your_project_name}/dataset' directory.
742
+
743
+ my_speak/
744
+
745
+ └── dataset/
746
+ ├── audio1.wav
747
+ └── audio2.wav
748
+ ...
749
+ ```""",
750
+ visible=False,
751
+ )
752
+
753
+ audio_speaker = gr.File(label="voice", type="filepath", file_count="multiple")
754
+ txt_lang = gr.Text(label="Language", value="english")
755
+ bt_transcribe = gr.Button("transcribe")
756
+ txt_info_transcribe = gr.Text(label="info", value="")
757
+ bt_transcribe.click(
758
+ fn=transcribe_all,
759
+ inputs=[project_name, audio_speaker, txt_lang, ch_manual],
760
+ outputs=[txt_info_transcribe],
761
+ )
762
+ ch_manual.change(fn=check_user, inputs=[ch_manual], outputs=[audio_speaker, mark_info_transcribe])
763
+
764
+ random_sample_transcribe = gr.Button("random sample")
765
+
766
+ with gr.Row():
767
+ random_text_transcribe = gr.Text(label="Text")
768
+ random_audio_transcribe = gr.Audio(label="Audio", type="filepath")
769
+
770
+ random_sample_transcribe.click(
771
+ fn=get_random_sample_transcribe,
772
+ inputs=[project_name],
773
+ outputs=[random_text_transcribe, random_audio_transcribe],
774
+ )
775
+
776
+ with gr.TabItem("prepare Data"):
777
+ gr.Markdown(
778
+ """```plaintext
779
+ Place your 'wavs' folder and your 'metadata.csv' file in the '{your_project_name}' directory.
780
+ my_speak/
781
+
782
+ ├── wavs/
783
+ │ ├── audio1.wav
784
+ │ └── audio2.wav
785
+ | ...
786
+
787
+ └── metadata.csv
788
+
789
+ metadata.csv format:
790
+
791
+ audio1|text1
792
+ audio2|text2
793
+ ...
794
+
795
+ ```"""
796
+ )
797
+
798
+ bt_prepare = gr.Button("prepare")
799
+ txt_info_prepare = gr.Text(label="info", value="")
800
+ bt_prepare.click(fn=create_metadata, inputs=[project_name], outputs=[txt_info_prepare])
801
+
802
+ random_sample_prepare = gr.Button("random sample")
803
+
804
+ with gr.Row():
805
+ random_text_prepare = gr.Text(label="Pinyin")
806
+ random_audio_prepare = gr.Audio(label="Audio", type="filepath")
807
+
808
+ random_sample_prepare.click(
809
+ fn=get_random_sample_prepare, inputs=[project_name], outputs=[random_text_prepare, random_audio_prepare]
810
+ )
811
+
812
+ with gr.TabItem("train Data"):
813
+ with gr.Row():
814
+ bt_calculate = gr.Button("Auto Settings")
815
+ ch_finetune = gr.Checkbox(label="finetune", value=True)
816
+ lb_samples = gr.Label(label="samples")
817
+ batch_size_type = gr.Radio(label="Batch Size Type", choices=["frame", "sample"], value="frame")
818
+
819
+ with gr.Row():
820
+ exp_name = gr.Radio(label="Model", choices=["F5TTS_Base", "E2TTS_Base"], value="F5TTS_Base")
821
+ learning_rate = gr.Number(label="Learning Rate", value=1e-5, step=1e-5)
822
+
823
+ with gr.Row():
824
+ batch_size_per_gpu = gr.Number(label="Batch Size per GPU", value=1000)
825
+ max_samples = gr.Number(label="Max Samples", value=64)
826
+
827
+ with gr.Row():
828
+ grad_accumulation_steps = gr.Number(label="Gradient Accumulation Steps", value=1)
829
+ max_grad_norm = gr.Number(label="Max Gradient Norm", value=1.0)
830
+
831
+ with gr.Row():
832
+ epochs = gr.Number(label="Epochs", value=10)
833
+ num_warmup_updates = gr.Number(label="Warmup Updates", value=5)
834
+
835
+ with gr.Row():
836
+ save_per_updates = gr.Number(label="Save per Updates", value=10)
837
+ last_per_steps = gr.Number(label="Last per Steps", value=50)
838
+
839
+ with gr.Row():
840
+ start_button = gr.Button("Start Training")
841
+ stop_button = gr.Button("Stop Training", interactive=False)
842
+
843
+ txt_info_train = gr.Text(label="info", value="")
844
+ start_button.click(
845
+ fn=start_training,
846
+ inputs=[
847
+ project_name,
848
+ exp_name,
849
+ learning_rate,
850
+ batch_size_per_gpu,
851
+ batch_size_type,
852
+ max_samples,
853
+ grad_accumulation_steps,
854
+ max_grad_norm,
855
+ epochs,
856
+ num_warmup_updates,
857
+ save_per_updates,
858
+ last_per_steps,
859
+ ch_finetune,
860
+ ],
861
+ outputs=[txt_info_train, start_button, stop_button],
862
+ )
863
+ stop_button.click(fn=stop_training, outputs=[txt_info_train, start_button, stop_button])
864
+ bt_calculate.click(
865
+ fn=calculate_train,
866
+ inputs=[
867
+ project_name,
868
+ batch_size_type,
869
+ max_samples,
870
+ learning_rate,
871
+ num_warmup_updates,
872
+ save_per_updates,
873
+ last_per_steps,
874
+ ch_finetune,
875
+ ],
876
+ outputs=[
877
+ batch_size_per_gpu,
878
+ max_samples,
879
+ num_warmup_updates,
880
+ save_per_updates,
881
+ last_per_steps,
882
+ lb_samples,
883
+ learning_rate,
884
+ ],
885
+ )
886
+
887
+ with gr.TabItem("reduse checkpoint"):
888
+ txt_path_checkpoint = gr.Text(label="path checkpoint :")
889
+ txt_path_checkpoint_small = gr.Text(label="path output :")
890
+ txt_info_reduce = gr.Text(label="info", value="")
891
+ reduce_button = gr.Button("reduce")
892
+ reduce_button.click(
893
+ fn=extract_and_save_ema_model,
894
+ inputs=[txt_path_checkpoint, txt_path_checkpoint_small],
895
+ outputs=[txt_info_reduce],
896
+ )
897
+
898
+ with gr.TabItem("vocab check experiment"):
899
+ check_button = gr.Button("check vocab")
900
+ txt_info_check = gr.Text(label="info", value="")
901
+ check_button.click(fn=vocab_check, inputs=[project_name], outputs=[txt_info_check])
902
+
903
+ with gr.TabItem("test model"):
904
+ exp_name = gr.Radio(label="Model", choices=["F5-TTS", "E2-TTS"], value="F5-TTS")
905
+ nfe_step = gr.Number(label="n_step", value=32)
906
+ file_checkpoint_pt = gr.Textbox(label="Checkpoint", value="")
907
+
908
+ random_sample_infer = gr.Button("random sample")
909
+
910
+ ref_text = gr.Textbox(label="ref text")
911
+ ref_audio = gr.Audio(label="audio ref", type="filepath")
912
+ gen_text = gr.Textbox(label="gen text")
913
+ random_sample_infer.click(
914
+ fn=get_random_sample_infer, inputs=[project_name], outputs=[ref_text, gen_text, ref_audio]
915
+ )
916
+ check_button_infer = gr.Button("infer")
917
+ gen_audio = gr.Audio(label="audio gen", type="filepath")
918
+
919
+ check_button_infer.click(
920
+ fn=infer,
921
+ inputs=[file_checkpoint_pt, exp_name, ref_text, ref_audio, gen_text, nfe_step],
922
+ outputs=[gen_audio],
923
+ )
924
+
925
+
926
+ @click.command()
927
+ @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
928
+ @click.option("--host", "-H", default=None, help="Host to run the app on")
929
+ @click.option(
930
+ "--share",
931
+ "-s",
932
+ default=False,
933
+ is_flag=True,
934
+ help="Share the app via Gradio share link",
935
+ )
936
+ @click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
937
+ def main(port, host, share, api):
938
+ global app
939
+ print("Starting app...")
940
+ app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
941
+
942
+
943
+ if __name__ == "__main__":
944
+ main()
src/f5_tts/{train.py → train/train.py} RENAMED
File without changes