Update README.md
Browse files
README.md
CHANGED
@@ -122,6 +122,75 @@ VocalNet-8B was evaluated on [OpenAudioBench](https://huggingface.co/datasets/ba
|
|
122 |
<th style="padding: 10px; border: 1px solid #ddd;">Web Questions</th>
|
123 |
</tr>
|
124 |
</thead>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
<tbody>
|
126 |
<tr>
|
127 |
<td colspan="7" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Base Models</td>
|
@@ -298,6 +367,61 @@ VocalNet-8B was evaluated on [OpenAudioBench](https://huggingface.co/datasets/ba
|
|
298 |
<td style="padding: 10px; border: 1px solid #ddd;">WER</td>
|
299 |
<td style="padding: 10px; border: 1px solid #ddd;">UTMOS</td>
|
300 |
</tr>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
<tr>
|
302 |
<td colspan="11" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Base Models</td>
|
303 |
</tr>
|
|
|
122 |
<th style="padding: 10px; border: 1px solid #ddd;">Web Questions</th>
|
123 |
</tr>
|
124 |
</thead>
|
125 |
+
<tbody>
|
126 |
+
<tr>
|
127 |
+
<td colspan="7" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Tiny Models</td>
|
128 |
+
</tr>
|
129 |
+
<tr>
|
130 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">Mini-Omni</td>
|
131 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">0.5B</td>
|
132 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→t</td>
|
133 |
+
<td style="padding: 10px; border: 1px solid #ddd;">1.84</td>
|
134 |
+
<td style="padding: 10px; border: 1px solid #ddd;">2.7</td>
|
135 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.12</td>
|
136 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.22</td>
|
137 |
+
</tr>
|
138 |
+
<tr>
|
139 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→s</td>
|
140 |
+
<td style="padding: 10px; border: 1px solid #ddd;">1.80</td>
|
141 |
+
<td style="padding: 10px; border: 1px solid #ddd;">2.7</td>
|
142 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.08</td>
|
143 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.20</td>
|
144 |
+
</tr>
|
145 |
+
<tr>
|
146 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">SLAM-Omni</td>
|
147 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">0.5B</td>
|
148 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→t</td>
|
149 |
+
<td style="padding: 10px; border: 1px solid #ddd;">3.50</td>
|
150 |
+
<td style="padding: 10px; border: 1px solid #ddd;">29.4</td>
|
151 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.39</td>
|
152 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.84</td>
|
153 |
+
</tr>
|
154 |
+
<tr>
|
155 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→s</td>
|
156 |
+
<td style="padding: 10px; border: 1px solid #ddd;">3.01</td>
|
157 |
+
<td style="padding: 10px; border: 1px solid #ddd;">26.7</td>
|
158 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.34</td>
|
159 |
+
<td style="padding: 10px; border: 1px solid #ddd;">0.69</td>
|
160 |
+
</tr>
|
161 |
+
<tr>
|
162 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">VocalNet-1B (VA)</td>
|
163 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">1B</td>
|
164 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→t</td>
|
165 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.38</td>
|
166 |
+
<td style="padding: 10px; border: 1px solid #ddd;">70.3</td>
|
167 |
+
<td style="padding: 10px; border: 1px solid #ddd;">3.38</td>
|
168 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.93</td>
|
169 |
+
</tr>
|
170 |
+
<tr>
|
171 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→s</td>
|
172 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.83</td>
|
173 |
+
<td style="padding: 10px; border: 1px solid #ddd;">61.0</td>
|
174 |
+
<td style="padding: 10px; border: 1px solid #ddd;">2.78</td>
|
175 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.47</td>
|
176 |
+
</tr>
|
177 |
+
<tr>
|
178 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">VocalNet-1B</td>
|
179 |
+
<td rowspan="2" style="padding: 10px; border: 1px solid #ddd;">1B</td>
|
180 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→t</td>
|
181 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.79</b></td>
|
182 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>71.7</b></td>
|
183 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.60</b></td>
|
184 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.16</b></td>
|
185 |
+
</tr>
|
186 |
+
<tr>
|
187 |
+
<td style="padding: 10px; border: 1px solid #ddd;">s→s</td>
|
188 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.03</b></td>
|
189 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>63.7</b></td>
|
190 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.06</b></td>
|
191 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.68</b></td>
|
192 |
+
</tr>
|
193 |
+
</tbody>
|
194 |
<tbody>
|
195 |
<tr>
|
196 |
<td colspan="7" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Base Models</td>
|
|
|
367 |
<td style="padding: 10px; border: 1px solid #ddd;">WER</td>
|
368 |
<td style="padding: 10px; border: 1px solid #ddd;">UTMOS</td>
|
369 |
</tr>
|
370 |
+
<tr>
|
371 |
+
<td colspan="11" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Tiny Models</td>
|
372 |
+
</tr>
|
373 |
+
<tr>
|
374 |
+
<td style="padding: 10px; border: 1px solid #ddd;">Mini-Omni</td>
|
375 |
+
<td style="padding: 10px; border: 1px solid #ddd;">20.78</td>
|
376 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.429</td>
|
377 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.20</td>
|
378 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.428</td>
|
379 |
+
<td style="padding: 10px; border: 1px solid #ddd;">7.43</td>
|
380 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.428</td>
|
381 |
+
<td style="padding: 10px; border: 1px solid #ddd;">8.51</td>
|
382 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.433</td>
|
383 |
+
<td style="padding: 10px; border: 1px solid #ddd;">8.66</td>
|
384 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.430</td>
|
385 |
+
</tr>
|
386 |
+
<tr>
|
387 |
+
<td style="padding: 10px; border: 1px solid #ddd;">SLAM-Omni</td>
|
388 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.52</td>
|
389 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.439</td>
|
390 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.55</td>
|
391 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.467</td>
|
392 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.16</td>
|
393 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.470</td>
|
394 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.50</td>
|
395 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.461</td>
|
396 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.17</td>
|
397 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.464</td>
|
398 |
+
</tr>
|
399 |
+
<tr>
|
400 |
+
<td style="padding: 10px; border: 1px solid #ddd;">VocalNet-1B (VA)</td>
|
401 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.43</b></td>
|
402 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.495</b></td>
|
403 |
+
<td style="padding: 10px; border: 1px solid #ddd;">3.65</td>
|
404 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.498</b></td>
|
405 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.97</b></td>
|
406 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.499</b></td>
|
407 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.40</td>
|
408 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.489</td>
|
409 |
+
<td style="padding: 10px; border: 1px solid #ddd;">5.66</td>
|
410 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.495</b></td>
|
411 |
+
</tr>
|
412 |
+
<tr>
|
413 |
+
<td style="padding: 10px; border: 1px solid #ddd;">VocalNet-1B</td>
|
414 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.43</b></td>
|
415 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.491</td>
|
416 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>3.27</b></td>
|
417 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.497</td>
|
418 |
+
<td style="padding: 10px; border: 1px solid #ddd;">6.73</td>
|
419 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.486</td>
|
420 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.88</b></td>
|
421 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>4.493</b></td>
|
422 |
+
<td style="padding: 10px; border: 1px solid #ddd;"><b>5.31</b></td>
|
423 |
+
<td style="padding: 10px; border: 1px solid #ddd;">4.491</td>
|
424 |
+
</tr>
|
425 |
<tr>
|
426 |
<td colspan="11" style="padding: 10px; border: 1px solid #ddd; font-weight: bold; background-color: #e6f3ff;">Base Models</td>
|
427 |
</tr>
|