diff --git a/examples/aishell_asr_zh_test/data-00000-of-00001.arrow b/examples/aishell_asr_zh_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..6b9d3e3cafc4535c8512b5170ac6d5007b3bab49 --- /dev/null +++ b/examples/aishell_asr_zh_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d8ed27441c9a5f6ac8b38e27b52ce555fe03322530ff16c16f2796c9f6e7f45 +size 44296 diff --git a/examples/aishell_asr_zh_test/dataset_info.json b/examples/aishell_asr_zh_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc9d0681d7a6ea1b2a0510dc7f1347a41e35802 --- /dev/null +++ b/examples/aishell_asr_zh_test/dataset_info.json @@ -0,0 +1,58 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "aishell_asr_zh_test_v1", + "dataset_size": 1115469955, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/aishell_asr_zh_test_v1@e73dc7f096ecea5c8e5c722446c8d3b4014fdd18/data/test-00000-of-00003.parquet": { + "num_bytes": 364047728, + "checksum": null + }, + "hf://datasets/AudioLLMs/aishell_asr_zh_test_v1@e73dc7f096ecea5c8e5c722446c8d3b4014fdd18/data/test-00001-of-00003.parquet": { + "num_bytes": 354834886, + "checksum": null + }, + "hf://datasets/AudioLLMs/aishell_asr_zh_test_v1@e73dc7f096ecea5c8e5c722446c8d3b4014fdd18/data/test-00002-of-00003.parquet": { + "num_bytes": 377041689, + "checksum": null + } + }, + "download_size": 1095924303, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2211394258, + "splits": { + "test": { + "name": "test", + "num_bytes": 1115469955, + "num_examples": 6920, + "shard_lengths": [ + 3207, + 3107, + 606 + ], + "dataset_name": "aishell_asr_zh_test_v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/aishell_asr_zh_test/state.json b/examples/aishell_asr_zh_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..bedb417ce44c90dc6ea8a96d7468416c15887a41 --- /dev/null +++ b/examples/aishell_asr_zh_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "5a1f0f8b9b3ca365", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/alpaca_audio_test/data-00000-of-00001.arrow b/examples/alpaca_audio_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3fb0f9526bdf6ecad27d35efca63fdd4a9a392c1 --- /dev/null +++ b/examples/alpaca_audio_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acbe0fbf69f6732a358e6c6575354464b9c5e39b87be73cc9f1ae7c284e9e575 +size 44600 diff --git a/examples/alpaca_audio_test/dataset_info.json b/examples/alpaca_audio_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..6123bb22e8961db8a3a1d7592958f3ce9a6a69a9 --- /dev/null +++ b/examples/alpaca_audio_test/dataset_info.json @@ -0,0 +1,49 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "alpaca_audio_test", + "dataset_size": 13865321, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/alpaca_audio_test@7eb2ab279975033690d67bee66e45eb612a430ee/data/test-00000-of-00001.parquet": { + "num_bytes": 12245608, + "checksum": null + } + }, + "download_size": 12245608, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "speech_instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 26110929, + "splits": { + "test": { + "name": "test", + "num_bytes": 13865321, + "num_examples": 100, + "dataset_name": "alpaca_audio_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/alpaca_audio_test/state.json b/examples/alpaca_audio_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d1f85ff7a2293112cf45606ad9b020084d6950f7 --- /dev/null +++ b/examples/alpaca_audio_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a7169220e58f4523", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/audiocaps_qa_test/data-00000-of-00001.arrow b/examples/audiocaps_qa_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c2ed5a8375e66214dbd11e9a7e8408faf1588494 --- /dev/null +++ b/examples/audiocaps_qa_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc4745b0e32ece9d9f32f5a5a689eb5e24ca6755f114cd2b1f1f445819fa1375 +size 103208 diff --git a/examples/audiocaps_qa_test/dataset_info.json b/examples/audiocaps_qa_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd6ebeff5b67ea8c38568dc9932457674d1fdfe --- /dev/null +++ b/examples/audiocaps_qa_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "audiocaps_qa_test_v3", + "dataset_size": 98835422, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/audiocaps_qa_test_v3@6cf6853c52f55482126f226a8859eb71f2021e0e/data/test-00000-of-00001.parquet": { + "num_bytes": 92241687, + "checksum": null + } + }, + "download_size": 92241687, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 191077109, + "splits": { + "test": { + "name": "test", + "num_bytes": 98835422, + "num_examples": 313, + "dataset_name": "audiocaps_qa_test_v3" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/audiocaps_qa_test/state.json b/examples/audiocaps_qa_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..646b9bae328d4977b6d2777bfedbdf9aaa33c3c2 --- /dev/null +++ b/examples/audiocaps_qa_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "748c6693112f77f2", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/audiocaps_test/data-00000-of-00001.arrow b/examples/audiocaps_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7da1ca00336a28ed03b776aa1eb6c01a3485f5b1 --- /dev/null +++ b/examples/audiocaps_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cfff9093667c39e6750f23693d4d817193a0fbb829e3c6155d4a4c44462a184 +size 107320 diff --git a/examples/audiocaps_test/dataset_info.json b/examples/audiocaps_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c604ad5a5f8bb358b514e045b1eb68df42a787b8 --- /dev/null +++ b/examples/audiocaps_test/dataset_info.json @@ -0,0 +1,58 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "audiocaps_test", + "dataset_size": 1389113784, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/audiocaps_test@fb42aac15212cbddd723fbbf04b6071b60a9f8fe/data/test-00000-of-00003.parquet": { + "num_bytes": 432299885, + "checksum": null + }, + "hf://datasets/AudioLLMs/audiocaps_test@fb42aac15212cbddd723fbbf04b6071b60a9f8fe/data/test-00001-of-00003.parquet": { + "num_bytes": 435786346, + "checksum": null + }, + "hf://datasets/AudioLLMs/audiocaps_test@fb42aac15212cbddd723fbbf04b6071b60a9f8fe/data/test-00002-of-00003.parquet": { + "num_bytes": 435930887, + "checksum": null + } + }, + "download_size": 1304017118, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2693130902, + "splits": { + "test": { + "name": "test", + "num_bytes": 1389113784, + "num_examples": 4400, + "shard_lengths": [ + 1667, + 1667, + 1066 + ], + "dataset_name": "audiocaps_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/audiocaps_test/state.json b/examples/audiocaps_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..c3b92b89c6ab1acbf3e5271c9d06ac7165f2ec25 --- /dev/null +++ b/examples/audiocaps_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1872292586a4b460", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/clotho_aqa_test/data-00000-of-00001.arrow b/examples/clotho_aqa_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..04b52a277bb9c99e14c951a8b28d685146ce8a28 --- /dev/null +++ b/examples/clotho_aqa_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ad7fcb1e24f43a32efd32f08888b84f2ff5ebd275e43ca26e6230c7d01922cf +size 194400 diff --git a/examples/clotho_aqa_test/dataset_info.json b/examples/clotho_aqa_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa2c8eb873c097b2b6134c012af78e573f36ea8 --- /dev/null +++ b/examples/clotho_aqa_test/dataset_info.json @@ -0,0 +1,58 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "clotho_asqa_test_v2", + "dataset_size": 1485321871, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/clotho_asqa_test_v2@a7f4a7983d490e87a79eb041fede83e418c0ea4f/data/test-00000-of-00003.parquet": { + "num_bytes": 118559858, + "checksum": null + }, + "hf://datasets/AudioLLMs/clotho_asqa_test_v2@a7f4a7983d490e87a79eb041fede83e418c0ea4f/data/test-00001-of-00003.parquet": { + "num_bytes": 116519615, + "checksum": null + }, + "hf://datasets/AudioLLMs/clotho_asqa_test_v2@a7f4a7983d490e87a79eb041fede83e418c0ea4f/data/test-00002-of-00003.parquet": { + "num_bytes": 113032047, + "checksum": null + } + }, + "download_size": 348111520, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1833433391, + "splits": { + "test": { + "name": "test", + "num_bytes": 1485321871, + "num_examples": 2057, + "shard_lengths": [ + 786, + 786, + 485 + ], + "dataset_name": "clotho_asqa_test_v2" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/clotho_aqa_test/state.json b/examples/clotho_aqa_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..2330ef4cbb792dcfb634b3be9d8d02719648f801 --- /dev/null +++ b/examples/clotho_aqa_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "35d1627b19106626", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/cn_college_listen_mcq_test/data-00000-of-00001.arrow b/examples/cn_college_listen_mcq_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..41e44dbc8f68e1732d420bc5477bd34082e4e67f --- /dev/null +++ b/examples/cn_college_listen_mcq_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3131dbbd9306a9622d446f8f151f1db0dd7ba3a14bd82f711126d159a813d5d5 +size 152776 diff --git a/examples/cn_college_listen_mcq_test/dataset_info.json b/examples/cn_college_listen_mcq_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..ebf8d4881cd2c92cefee0df2dc4da96908f91b9e --- /dev/null +++ b/examples/cn_college_listen_mcq_test/dataset_info.json @@ -0,0 +1,66 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "cn_college_listen_mcq_test", + "dataset_size": 1532841284, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/cn_college_listen_mcq_test@708d8ef278b5eadae714c314889f5d81ded18a25/data/test-00000-of-00004.parquet": { + "num_bytes": 283046288, + "checksum": null + }, + "hf://datasets/AudioLLMs/cn_college_listen_mcq_test@708d8ef278b5eadae714c314889f5d81ded18a25/data/test-00001-of-00004.parquet": { + "num_bytes": 241939389, + "checksum": null + }, + "hf://datasets/AudioLLMs/cn_college_listen_mcq_test@708d8ef278b5eadae714c314889f5d81ded18a25/data/test-00002-of-00004.parquet": { + "num_bytes": 248376444, + "checksum": null + }, + "hf://datasets/AudioLLMs/cn_college_listen_mcq_test@708d8ef278b5eadae714c314889f5d81ded18a25/data/test-00003-of-00004.parquet": { + "num_bytes": 256294919, + "checksum": null + } + }, + "download_size": 1029657040, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2562498324, + "splits": { + "test": { + "name": "test", + "num_bytes": 1532841284, + "num_examples": 2271, + "shard_lengths": [ + 500, + 1036, + 735 + ], + "dataset_name": "cn_college_listen_mcq_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/cn_college_listen_mcq_test/state.json b/examples/cn_college_listen_mcq_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..331b758ad811f6e1afc7451ecbc1d80a12ecea89 --- /dev/null +++ b/examples/cn_college_listen_mcq_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d42e39c8837074fc", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/cna_test/data-00000-of-00001.arrow b/examples/cna_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..681a5007f409e00844209ed5c291cf76d73d4bad --- /dev/null +++ b/examples/cna_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d37e6858c1291ebeb2bcb6bfbe8f5291da48f4199a46ae88a60eb57fa135b61f +size 34600 diff --git a/examples/cna_test/dataset_info.json b/examples/cna_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/cna_test/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/cna_test/state.json b/examples/cna_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f95ef646029167bd311eb12fef75fc7f6fedd4 --- /dev/null +++ b/examples/cna_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "22ae0abf541af796", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/common_voice_15_en_test/data-00000-of-00001.arrow b/examples/common_voice_15_en_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..bc84f0fec91284dc0b0c48a1ed1e9878045c0d0c --- /dev/null +++ b/examples/common_voice_15_en_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd5f49efa47828a4a7b9a6fc584b19562acae2f90451219939b0c11b4d41621f +size 54992 diff --git a/examples/common_voice_15_en_test/dataset_info.json b/examples/common_voice_15_en_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..ad6be7e8a51413141e904ee813472f94021f2970 --- /dev/null +++ b/examples/common_voice_15_en_test/dataset_info.json @@ -0,0 +1,28 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/common_voice_15_en_test/state.json b/examples/common_voice_15_en_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d56d400cd1e74dc2c0cc6bbc575fbc84016e7058 --- /dev/null +++ b/examples/common_voice_15_en_test/state.json @@ -0,0 +1,19 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "62012827f888a0f5", + "_format_columns": [ + "answer", + "audio_length", + "context", + "instruction", + "language" + ], + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/commonvoice_17_id_asr/data-00000-of-00001.arrow b/examples/commonvoice_17_id_asr/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b6c91f01c95cacdd72151346d68d387b3f058a58 --- /dev/null +++ b/examples/commonvoice_17_id_asr/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae05a4b823184603908f2b4149190b4fab9965be5468f6b4fa34e0e214c97760 +size 43784 diff --git a/examples/commonvoice_17_id_asr/dataset_info.json b/examples/commonvoice_17_id_asr/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..8e8ef99c6c6f245fb07d08be870c52f1667f79e0 --- /dev/null +++ b/examples/commonvoice_17_id_asr/dataset_info.json @@ -0,0 +1,132 @@ +{ + "builder_name": "common_voice_17_0", + "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", + "config_name": "id", + "dataset_name": "common_voice_17_0", + "dataset_size": 42157018, + "description": "Common Voice is Mozilla's initiative to help teach machines how real people speak. The dataset currently consists of 20408 validated hours of speech in 124 languages, but more voices and languages are always added.", + "download_checksums": { + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/n_shards.json": { + "num_bytes": 17491, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/id/train/id_train_0.tar": { + "num_bytes": 170035200, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/id/dev/id_dev_0.tar": { + "num_bytes": 102236160, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/id/test/id_test_0.tar": { + "num_bytes": 110315520, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/id/other/id_other_0.tar": { + "num_bytes": 686858240, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/id/invalidated/id_invalidated_0.tar": { + "num_bytes": 68034560, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/id/validated/id_validated_0.tar": { + "num_bytes": 805611520, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/id/train.tsv": { + "num_bytes": 1571021, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/id/dev.tsv": { + "num_bytes": 1016964, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/id/test.tsv": { + "num_bytes": 1052525, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/id/other.tsv": { + "num_bytes": 8607181, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/id/invalidated.tsv": { + "num_bytes": 784798, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/id/validated.tsv": { + "num_bytes": 7772548, + "checksum": null + } + }, + "download_size": 1963913728, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "https://commonvoice.mozilla.org/en/datasets", + "license": "https://creativecommons.org/publicdomain/zero/1.0/", + "size_in_bytes": 2006070746, + "splits": { + "train": { + "name": "train", + "num_bytes": 3067499, + "num_examples": 4970, + "dataset_name": "common_voice_17_0" + }, + "validation": { + "name": "validation", + "num_bytes": 2011909, + "num_examples": 3349, + "dataset_name": "common_voice_17_0" + }, + "test": { + "name": "test", + "num_bytes": 2141524, + "num_examples": 3641, + "dataset_name": "common_voice_17_0" + }, + "other": { + "name": "other", + "num_bytes": 17492664, + "num_examples": 29508, + "dataset_name": "common_voice_17_0" + }, + "invalidated": { + "name": "invalidated", + "num_bytes": 1600375, + "num_examples": 2605, + "dataset_name": "common_voice_17_0" + }, + "validated": { + "name": "validated", + "num_bytes": 15843047, + "num_examples": 26108, + "dataset_name": "common_voice_17_0" + } + }, + "version": { + "version_str": "17.0.0", + "major": 17, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/commonvoice_17_id_asr/state.json b/examples/commonvoice_17_id_asr/state.json new file mode 100644 index 0000000000000000000000000000000000000000..a12adf1a3eae3cf52ae922e943ed7cd2ba49569c --- /dev/null +++ b/examples/commonvoice_17_id_asr/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "75931a254b089787", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/commonvoice_17_ta_asr/data-00000-of-00001.arrow b/examples/commonvoice_17_ta_asr/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..17a2f643d4425d26ad926344c7dd8d76afdc5fcc --- /dev/null +++ b/examples/commonvoice_17_ta_asr/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98470c84b50bb5c1b10476e4f6ea518ddd9127b64b0c680302f774af7b538b71 +size 90616 diff --git a/examples/commonvoice_17_ta_asr/dataset_info.json b/examples/commonvoice_17_ta_asr/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..71b5cc14978acc1cf062ee78170080dc9221206c --- /dev/null +++ b/examples/commonvoice_17_ta_asr/dataset_info.json @@ -0,0 +1,156 @@ +{ + "builder_name": "common_voice_17_0", + "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", + "config_name": "ta", + "dataset_name": "common_voice_17_0", + "dataset_size": 221361139, + "description": "Common Voice is Mozilla's initiative to help teach machines how real people speak. The dataset currently consists of 20408 validated hours of speech in 124 languages, but more voices and languages are always added.", + "download_checksums": { + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/n_shards.json": { + "num_bytes": 17491, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/train/ta_train_0.tar": { + "num_bytes": 1598955520, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/train/ta_train_1.tar": { + "num_bytes": 224542720, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/dev/ta_dev_0.tar": { + "num_bytes": 434257920, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/test/ta_test_0.tar": { + "num_bytes": 454778880, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/other/ta_other_0.tar": { + "num_bytes": 1560514560, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/other/ta_other_1.tar": { + "num_bytes": 1515827200, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/other/ta_other_2.tar": { + "num_bytes": 495831040, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/invalidated/ta_invalidated_0.tar": { + "num_bytes": 231424000, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/validated/ta_validated_0.tar": { + "num_bytes": 1447434240, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/validated/ta_validated_1.tar": { + "num_bytes": 1530644480, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/validated/ta_validated_2.tar": { + "num_bytes": 1654978560, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/validated/ta_validated_3.tar": { + "num_bytes": 652861440, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/train.tsv": { + "num_bytes": 19608830, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/dev.tsv": { + "num_bytes": 5203704, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/test.tsv": { + "num_bytes": 4944646, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/other.tsv": { + "num_bytes": 39470943, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/invalidated.tsv": { + "num_bytes": 2499761, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/validated.tsv": { + "num_bytes": 56763398, + "checksum": null + } + }, + "download_size": 11930559333, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "https://commonvoice.mozilla.org/en/datasets", + "license": "https://creativecommons.org/publicdomain/zero/1.0/", + "size_in_bytes": 12151920472, + "splits": { + "train": { + "name": "train", + "num_bytes": 33336098, + "num_examples": 45587, + "dataset_name": "common_voice_17_0" + }, + "validation": { + "name": "validation", + "num_bytes": 8797317, + "num_examples": 12095, + "dataset_name": "common_voice_17_0" + }, + "test": { + "name": "test", + "num_bytes": 8556167, + "num_examples": 12074, + "dataset_name": "common_voice_17_0" + }, + "other": { + "name": "other", + "num_bytes": 67773267, + "num_examples": 93989, + "dataset_name": "common_voice_17_0" + }, + "invalidated": { + "name": "invalidated", + "num_bytes": 4282268, + "num_examples": 5693, + "dataset_name": "common_voice_17_0" + }, + "validated": { + "name": "validated", + "num_bytes": 98616022, + "num_examples": 135391, + "dataset_name": "common_voice_17_0" + } + }, + "version": { + "version_str": "17.0.0", + "major": 17, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/commonvoice_17_ta_asr/state.json b/examples/commonvoice_17_ta_asr/state.json new file mode 100644 index 0000000000000000000000000000000000000000..22b1c94720f37491552186be55b3a40823241f4f --- /dev/null +++ b/examples/commonvoice_17_ta_asr/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "380f8a99e8b3657c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/commonvoice_17_th_asr/data-00000-of-00001.arrow b/examples/commonvoice_17_th_asr/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..079c9d0197a627caa2efee17e867b3e183d0a21f --- /dev/null +++ b/examples/commonvoice_17_th_asr/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfd84989250d4b79f4cb4b0ae5028d032e221858d6dab77494a04a8a2336aab4 +size 44792 diff --git a/examples/commonvoice_17_th_asr/dataset_info.json b/examples/commonvoice_17_th_asr/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..234a77192e71a58fbdbbe05e22c801c727032943 --- /dev/null +++ b/examples/commonvoice_17_th_asr/dataset_info.json @@ -0,0 +1,164 @@ +{ + "builder_name": "common_voice_17_0", + "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", + "config_name": "th", + "dataset_name": "common_voice_17_0", + "dataset_size": 271837409, + "description": "Common Voice is Mozilla's initiative to help teach machines how real people speak. The dataset currently consists of 20408 validated hours of speech in 124 languages, but more voices and languages are always added.", + "download_checksums": { + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/n_shards.json": { + "num_bytes": 17491, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/train/th_train_0.tar": { + "num_bytes": 838082560, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/dev/th_dev_0.tar": { + "num_bytes": 323399680, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/test/th_test_0.tar": { + "num_bytes": 335851520, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/other/th_other_0.tar": { + "num_bytes": 1147064320, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/other/th_other_1.tar": { + "num_bytes": 1057300480, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/other/th_other_2.tar": { + "num_bytes": 943237120, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/other/th_other_3.tar": { + "num_bytes": 837079040, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/other/th_other_4.tar": { + "num_bytes": 1055385600, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/other/th_other_5.tar": { + "num_bytes": 176015360, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/invalidated/th_invalidated_0.tar": { + "num_bytes": 287416320, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/validated/th_validated_0.tar": { + "num_bytes": 1191843840, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/validated/th_validated_1.tar": { + "num_bytes": 1079930880, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/validated/th_validated_2.tar": { + "num_bytes": 1014691840, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/th/validated/th_validated_3.tar": { + "num_bytes": 744366080, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/th/train.tsv": { + "num_bytes": 11337047, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/th/dev.tsv": { + "num_bytes": 3803758, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/th/test.tsv": { + "num_bytes": 3702050, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/th/other.tsv": { + "num_bytes": 71870764, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/th/invalidated.tsv": { + "num_bytes": 3301372, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/th/validated.tsv": { + "num_bytes": 50649618, + "checksum": null + } + }, + "download_size": 11176346740, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "https://commonvoice.mozilla.org/en/datasets", + "license": "https://creativecommons.org/publicdomain/zero/1.0/", + "size_in_bytes": 11448184149, + "splits": { + "train": { + "name": "train", + "num_bytes": 21220759, + "num_examples": 32823, + "dataset_name": "common_voice_17_0" + }, + "validation": { + "name": "validation", + "num_bytes": 7084499, + "num_examples": 11042, + "dataset_name": "common_voice_17_0" + }, + "test": { + "name": "test", + "num_bytes": 7004874, + "num_examples": 11042, + "dataset_name": "common_voice_17_0" + }, + "other": { + "name": "other", + "num_bytes": 134183952, + "num_examples": 206935, + "dataset_name": "common_voice_17_0" + }, + "invalidated": { + "name": "invalidated", + "num_bytes": 6202988, + "num_examples": 9267, + "dataset_name": "common_voice_17_0" + }, + "validated": { + "name": "validated", + "num_bytes": 96140337, + "num_examples": 147160, + "dataset_name": "common_voice_17_0" + } + }, + "version": { + "version_str": "17.0.0", + "major": 17, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/commonvoice_17_th_asr/state.json b/examples/commonvoice_17_th_asr/state.json new file mode 100644 index 0000000000000000000000000000000000000000..294149774181a5d248ca3b3ec7b89a43f7bfdc0d --- /dev/null +++ b/examples/commonvoice_17_th_asr/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "4b0c4a31664c7d67", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/commonvoice_17_vi_asr/data-00000-of-00001.arrow b/examples/commonvoice_17_vi_asr/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..82f69edd3adc62cc321d80c46987cd7812734ceb --- /dev/null +++ b/examples/commonvoice_17_vi_asr/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4249665ed76ad59af6281a9c0bacc6862d8cd750218637cd8570bc9d89002751 +size 39104 diff --git a/examples/commonvoice_17_vi_asr/dataset_info.json b/examples/commonvoice_17_vi_asr/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..949b5cec9b691853ecea4f5a5975cc54de726d39 --- /dev/null +++ b/examples/commonvoice_17_vi_asr/dataset_info.json @@ -0,0 +1,132 @@ +{ + "builder_name": "common_voice_17_0", + "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", + "config_name": "vi", + "dataset_name": "common_voice_17_0", + "dataset_size": 12705254, + "description": "Common Voice is Mozilla's initiative to help teach machines how real people speak. The dataset currently consists of 20408 validated hours of speech in 124 languages, but more voices and languages are always added.", + "download_checksums": { + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/n_shards.json": { + "num_bytes": 17491, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/vi/train/vi_train_0.tar": { + "num_bytes": 69468160, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/vi/dev/vi_dev_0.tar": { + "num_bytes": 14479360, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/vi/test/vi_test_0.tar": { + "num_bytes": 34488320, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/vi/other/vi_other_0.tar": { + "num_bytes": 276172800, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/vi/invalidated/vi_invalidated_0.tar": { + "num_bytes": 11345920, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/vi/validated/vi_validated_0.tar": { + "num_bytes": 144384000, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/vi/train.tsv": { + "num_bytes": 688189, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/vi/dev.tsv": { + "num_bytes": 184767, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/vi/test.tsv": { + "num_bytes": 373370, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/vi/other.tsv": { + "num_bytes": 3384906, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/vi/invalidated.tsv": { + "num_bytes": 111099, + "checksum": null + }, + "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/vi/validated.tsv": { + "num_bytes": 1521796, + "checksum": null + } + }, + "download_size": 556620178, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "https://commonvoice.mozilla.org/en/datasets", + "license": "https://creativecommons.org/publicdomain/zero/1.0/", + "size_in_bytes": 569325432, + "splits": { + "train": { + "name": "train", + "num_bytes": 1380061, + "num_examples": 2298, + "dataset_name": "common_voice_17_0" + }, + "validation": { + "name": "validation", + "num_bytes": 375111, + "num_examples": 641, + "dataset_name": "common_voice_17_0" + }, + "test": { + "name": "test", + "num_bytes": 754342, + "num_examples": 1274, + "dataset_name": "common_voice_17_0" + }, + "other": { + "name": "other", + "num_bytes": 6857667, + "num_examples": 11533, + "dataset_name": "common_voice_17_0" + }, + "invalidated": { + "name": "invalidated", + "num_bytes": 229034, + "num_examples": 377, + "dataset_name": "common_voice_17_0" + }, + "validated": { + "name": "validated", + "num_bytes": 3109039, + "num_examples": 5135, + "dataset_name": "common_voice_17_0" + } + }, + "version": { + "version_str": "17.0.0", + "major": 17, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/commonvoice_17_vi_asr/state.json b/examples/commonvoice_17_vi_asr/state.json new file mode 100644 index 0000000000000000000000000000000000000000..677cbad3d7798a518ff06bbf9c38c907e9be7bee --- /dev/null +++ b/examples/commonvoice_17_vi_asr/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "6ce07a66a0e6d7b0", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/commonvoice_zh_asr/data-00000-of-00001.arrow b/examples/commonvoice_zh_asr/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b09045ff468b0821aa84ee1473d9323b515e29aa --- /dev/null +++ b/examples/commonvoice_zh_asr/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:951bb47527709c895c17e6dac4a4a43b2a141b16babd39e56d9199deaec72eb5 +size 68224 diff --git a/examples/commonvoice_zh_asr/dataset_info.json b/examples/commonvoice_zh_asr/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..c4a1c3fcc0b65943fb3feac26b16f3d9cf60a191 --- /dev/null +++ b/examples/commonvoice_zh_asr/dataset_info.json @@ -0,0 +1,182 @@ +{ + "builder_name": "common_voice_16_0", + "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", + "config_name": "zh-CN", + "dataset_name": "common_voice_16_0", + "dataset_size": 423983727, + "description": "Common Voice is Mozilla's initiative to help teach machines how real people speak. The dataset currently consists of 19159 validated hours of speech in 119 languages, but more voices and languages are always added.", + "download_checksums": { + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/n_shards.json": { + "num_bytes": 17487, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/train/zh-CN_train_0.tar": { + "num_bytes": 1157220864, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/dev/zh-CN_dev_0.tar": { + "num_bytes": 436442624, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/test/zh-CN_test_0.tar": { + "num_bytes": 506296320, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_0.tar": { + "num_bytes": 1252570624, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_1.tar": { + "num_bytes": 1216365056, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_2.tar": { + "num_bytes": 1057693696, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_3.tar": { + "num_bytes": 1037878784, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_4.tar": { + "num_bytes": 1006488064, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_5.tar": { + "num_bytes": 951297024, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_6.tar": { + "num_bytes": 1054305280, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_7.tar": { + "num_bytes": 1079122944, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_8.tar": { + "num_bytes": 1057605632, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_9.tar": { + "num_bytes": 1054744064, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_10.tar": { + "num_bytes": 1037184512, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_11.tar": { + "num_bytes": 1081821184, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_12.tar": { + "num_bytes": 1144596992, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_13.tar": { + "num_bytes": 1211527680, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_14.tar": { + "num_bytes": 1190928384, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/other/zh-CN_other_15.tar": { + "num_bytes": 299709952, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/invalidated/zh-CN_invalidated_0.tar": { + "num_bytes": 1000983552, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/audio/zh-CN/invalidated/zh-CN_invalidated_1.tar": { + "num_bytes": 416471552, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/transcript/zh-CN/train.tsv": { + "num_bytes": 7373507, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/transcript/zh-CN/dev.tsv": { + "num_bytes": 2567399, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/transcript/zh-CN/test.tsv": { + "num_bytes": 2457920, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/transcript/zh-CN/other.tsv": { + "num_bytes": 137605043, + "checksum": null + }, + "https://huggingface.co/datasets/fsicoli/common_voice_16_0/resolve/main/transcript/zh-CN/invalidated.tsv": { + "num_bytes": 13622503, + "checksum": null + } + }, + "download_size": 20414898643, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "https://commonvoice.mozilla.org/en/datasets", + "license": "https://creativecommons.org/publicdomain/zero/1.0/", + "size_in_bytes": 20838882370, + "splits": { + "train": { + "name": "train", + "num_bytes": 17963235, + "num_examples": 29406, + "dataset_name": "common_voice_16_0" + }, + "validation": { + "name": "validation", + "num_bytes": 6351483, + "num_examples": 10626, + "dataset_name": "common_voice_16_0" + }, + "test": { + "name": "test", + "num_bytes": 6263265, + "num_examples": 10626, + "dataset_name": "common_voice_16_0" + }, + "other": { + "name": "other", + "num_bytes": 358056452, + "num_examples": 610981, + "dataset_name": "common_voice_16_0" + }, + "invalidated": { + "name": "invalidated", + "num_bytes": 35349292, + "num_examples": 58386, + "dataset_name": "common_voice_16_0" + } + }, + "version": { + "version_str": "15.0.0", + "major": 15, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/commonvoice_zh_asr/state.json b/examples/commonvoice_zh_asr/state.json new file mode 100644 index 0000000000000000000000000000000000000000..fca20c69387d026c6cb1114f52ef239bb265d7de --- /dev/null +++ b/examples/commonvoice_zh_asr/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "41452271d4051e81", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/covost2_en_id_test/data-00000-of-00001.arrow b/examples/covost2_en_id_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2b56d3984a085bfdb287a25a699111fa999a7e30 --- /dev/null +++ b/examples/covost2_en_id_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d4bfb1098a59bf75aab6b9b824141808bb48160787b182b03fabf4a56527a9e +size 56496 diff --git a/examples/covost2_en_id_test/dataset_info.json b/examples/covost2_en_id_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..690160faf281db7503a998a45d77851c381bad0a --- /dev/null +++ b/examples/covost2_en_id_test/dataset_info.json @@ -0,0 +1,73 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "covost2_en_id_test_v1", + "dataset_size": 2842304040, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/covost2_en_id_test_v1@51a4b1517f07e1dcf9180aa270f264961165cab3/data/test-00000-of-00006.parquet": { + "num_bytes": 421383822, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_id_test_v1@51a4b1517f07e1dcf9180aa270f264961165cab3/data/test-00001-of-00006.parquet": { + "num_bytes": 430366828, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_id_test_v1@51a4b1517f07e1dcf9180aa270f264961165cab3/data/test-00002-of-00006.parquet": { + "num_bytes": 423203231, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_id_test_v1@51a4b1517f07e1dcf9180aa270f264961165cab3/data/test-00003-of-00006.parquet": { + "num_bytes": 420946330, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_id_test_v1@51a4b1517f07e1dcf9180aa270f264961165cab3/data/test-00004-of-00006.parquet": { + "num_bytes": 419734090, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_id_test_v1@51a4b1517f07e1dcf9180aa270f264961165cab3/data/test-00005-of-00006.parquet": { + "num_bytes": 419963769, + "checksum": null + } + }, + "download_size": 2535598070, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 5377902110, + "splits": { + "test": { + "name": "test", + "num_bytes": 2842304040, + "num_examples": 15531, + "shard_lengths": [ + 2789, + 2789, + 2789, + 2788, + 2788, + 1588 + ], + "dataset_name": "covost2_en_id_test_v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/covost2_en_id_test/state.json b/examples/covost2_en_id_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..fc25cd3a35429d91c8f4398de1e684511f95249f --- /dev/null +++ b/examples/covost2_en_id_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f755c8fc2cb9504a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/covost2_en_ta_test/data-00000-of-00001.arrow b/examples/covost2_en_ta_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..6a1376dac2cc2785d826d36c7385d8a15dbaa51e --- /dev/null +++ b/examples/covost2_en_ta_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89aa8cee7fa6754a1491ca48e31162126a808e07bca8731ecf2949bc183dd20f +size 58088 diff --git a/examples/covost2_en_ta_test/dataset_info.json b/examples/covost2_en_ta_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..f81d5c63837419297168c72269b28db10a152815 --- /dev/null +++ b/examples/covost2_en_ta_test/dataset_info.json @@ -0,0 +1,73 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "covost2_en_ta_test_v1", + "dataset_size": 2844002846, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/covost2_en_ta_test_v1@23c23a71d698faf6c940e18753ade03acbcef86b/data/test-00000-of-00006.parquet": { + "num_bytes": 421472102, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_ta_test_v1@23c23a71d698faf6c940e18753ade03acbcef86b/data/test-00001-of-00006.parquet": { + "num_bytes": 430453377, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_ta_test_v1@23c23a71d698faf6c940e18753ade03acbcef86b/data/test-00002-of-00006.parquet": { + "num_bytes": 423289725, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_ta_test_v1@23c23a71d698faf6c940e18753ade03acbcef86b/data/test-00003-of-00006.parquet": { + "num_bytes": 421032929, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_ta_test_v1@23c23a71d698faf6c940e18753ade03acbcef86b/data/test-00004-of-00006.parquet": { + "num_bytes": 419820665, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_ta_test_v1@23c23a71d698faf6c940e18753ade03acbcef86b/data/test-00005-of-00006.parquet": { + "num_bytes": 420052504, + "checksum": null + } + }, + "download_size": 2536121302, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 5380124148, + "splits": { + "test": { + "name": "test", + "num_bytes": 2844002846, + "num_examples": 15531, + "shard_lengths": [ + 2789, + 2789, + 2789, + 2788, + 2788, + 1588 + ], + "dataset_name": "covost2_en_ta_test_v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/covost2_en_ta_test/state.json b/examples/covost2_en_ta_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..9bba4c6c9970ea03ecb3eb25cb405aedea8344b9 --- /dev/null +++ b/examples/covost2_en_ta_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "fe170901fd58bb52", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/covost2_en_zh_test/data-00000-of-00001.arrow b/examples/covost2_en_zh_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7f5310ea8dda7a4da4af8c717d7057c2ee8e3785 --- /dev/null +++ b/examples/covost2_en_zh_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aca2fa905173efcf428fd779b4e74ff132b8bab979f925e75ad7ff8b9b19aaa8 +size 40016 diff --git a/examples/covost2_en_zh_test/dataset_info.json b/examples/covost2_en_zh_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..62de6214154ed60bea6a9e7bfa3d5dce9fa56635 --- /dev/null +++ b/examples/covost2_en_zh_test/dataset_info.json @@ -0,0 +1,73 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "covost2_en_zh_test_v1", + "dataset_size": 2842110316, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/covost2_en_zh_test_v1@6fe92d7fabaee215b078b022abbe8959280f3187/data/test-00000-of-00006.parquet": { + "num_bytes": 421376507, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_zh_test_v1@6fe92d7fabaee215b078b022abbe8959280f3187/data/test-00001-of-00006.parquet": { + "num_bytes": 430362096, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_zh_test_v1@6fe92d7fabaee215b078b022abbe8959280f3187/data/test-00002-of-00006.parquet": { + "num_bytes": 423196814, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_zh_test_v1@6fe92d7fabaee215b078b022abbe8959280f3187/data/test-00003-of-00006.parquet": { + "num_bytes": 420938958, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_zh_test_v1@6fe92d7fabaee215b078b022abbe8959280f3187/data/test-00004-of-00006.parquet": { + "num_bytes": 419730286, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_en_zh_test_v1@6fe92d7fabaee215b078b022abbe8959280f3187/data/test-00005-of-00006.parquet": { + "num_bytes": 419958145, + "checksum": null + } + }, + "download_size": 2535562806, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 5377673122, + "splits": { + "test": { + "name": "test", + "num_bytes": 2842110316, + "num_examples": 15531, + "shard_lengths": [ + 2789, + 2789, + 2789, + 2788, + 2788, + 1588 + ], + "dataset_name": "covost2_en_zh_test_v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/covost2_en_zh_test/state.json b/examples/covost2_en_zh_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..f6dda7fbf15a74d0df30b4c4f8061869f6e2d9ce --- /dev/null +++ b/examples/covost2_en_zh_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7445fef56d00d00b", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/covost2_id_en_test/data-00000-of-00001.arrow b/examples/covost2_id_en_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..650163aeeda9d9b07dba6d48a1c02be66cc68201 --- /dev/null +++ b/examples/covost2_id_en_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:573589410f93a7667f83d37bd30b472069e7446d2d5e05aa98fc4138b0fd316b +size 50736 diff --git a/examples/covost2_id_en_test/dataset_info.json b/examples/covost2_id_en_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..68bc038742514d36ec58185e93d90d67b12f0bd7 --- /dev/null +++ b/examples/covost2_id_en_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "covost2_id_en_test_v1", + "dataset_size": 105217105, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/covost2_id_en_test_v1@3998e50bec6eefdae5b38278238c359acb583f4c/data/test-00000-of-00001.parquet": { + "num_bytes": 100225392, + "checksum": null + } + }, + "download_size": 100225392, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 205442497, + "splits": { + "test": { + "name": "test", + "num_bytes": 105217105, + "num_examples": 844, + "dataset_name": "covost2_id_en_test_v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/covost2_id_en_test/state.json b/examples/covost2_id_en_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..748b84428625fe8d89e5309c2070c802e53978b6 --- /dev/null +++ b/examples/covost2_id_en_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "bf71abc414faf78c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/covost2_ta_en_test/data-00000-of-00001.arrow b/examples/covost2_ta_en_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c2b385c3425631dfd8ac5f875108ed38afe46cc5 --- /dev/null +++ b/examples/covost2_ta_en_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2ed44f89aae9aa384355903a3e8c57bf17375c283501899da606d2fe2651b34 +size 33800 diff --git a/examples/covost2_ta_en_test/dataset_info.json b/examples/covost2_ta_en_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..bfca3e41ec7343e4f34cbe750e40b605b197d965 --- /dev/null +++ b/examples/covost2_ta_en_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "covost2_ta_en_test_v2", + "dataset_size": 113835155, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/covost2_ta_en_test_v2@76a89636191da366b0d0d51f4c23dbe93257d5ba/data/test-00000-of-00001.parquet": { + "num_bytes": 100841253, + "checksum": null + } + }, + "download_size": 100841253, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 214676408, + "splits": { + "test": { + "name": "test", + "num_bytes": 113835155, + "num_examples": 754, + "dataset_name": "covost2_ta_en_test_v2" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/covost2_ta_en_test/state.json b/examples/covost2_ta_en_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..1fa8fcabcdad464233d89600b64dfe95e2cd6e53 --- /dev/null +++ b/examples/covost2_ta_en_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "0c4f4bf15549877a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/covost2_zh_en_test/data-00000-of-00001.arrow b/examples/covost2_zh_en_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..5d3a4e7398a5cece53a9deaf12bcb10d4eba95ad --- /dev/null +++ b/examples/covost2_zh_en_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a75d772da5e183ce2b2426810ecd13206f584fde411d6fb3aef20e4569ea44c +size 84464 diff --git a/examples/covost2_zh_en_test/dataset_info.json b/examples/covost2_zh_en_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..ee9cc5d33e1dd1b7dc390b5e4e0d6bb20c0153d2 --- /dev/null +++ b/examples/covost2_zh_en_test/dataset_info.json @@ -0,0 +1,53 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "covost2_zh_en_test_v1", + "dataset_size": 950584881, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/covost2_zh_en_test_v1@f9aec51472baf5ecad82ee9440e5a99e64fb0fef/data/test-00000-of-00002.parquet": { + "num_bytes": 446947415, + "checksum": null + }, + "hf://datasets/AudioLLMs/covost2_zh_en_test_v1@f9aec51472baf5ecad82ee9440e5a99e64fb0fef/data/test-00001-of-00002.parquet": { + "num_bytes": 446471998, + "checksum": null + } + }, + "download_size": 893419413, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1844004294, + "splits": { + "test": { + "name": "test", + "num_bytes": 950584881, + "num_examples": 4898, + "shard_lengths": [ + 2649, + 2249 + ], + "dataset_name": "covost2_zh_en_test_v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/covost2_zh_en_test/state.json b/examples/covost2_zh_en_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..0dc89e5fb882ed7ccbd045354b57539207aaf473 --- /dev/null +++ b/examples/covost2_zh_en_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "663c862c20c80194", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/dream_tts_mcq_test/data-00000-of-00001.arrow b/examples/dream_tts_mcq_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..1e381771de3ab8cc6494b084bc462c1746f3eca2 --- /dev/null +++ b/examples/dream_tts_mcq_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3592dc10d2470a8b9b7c725755708016aee89bd3c4b833bbaea9e1692b5bcf8b +size 166200 diff --git a/examples/dream_tts_mcq_test/dataset_info.json b/examples/dream_tts_mcq_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..40dd55d5d96c513568dac52251e610a5fba7baa7 --- /dev/null +++ b/examples/dream_tts_mcq_test/dataset_info.json @@ -0,0 +1,71 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "dream_tts_mcq_test", + "dataset_size": 2090471063, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/dream_tts_mcq_test@a495d7c2a1facf4eb27af6dfc4c3878de8828cf8/data/test-00000-of-00005.parquet": { + "num_bytes": 134764087, + "checksum": null + }, + "hf://datasets/AudioLLMs/dream_tts_mcq_test@a495d7c2a1facf4eb27af6dfc4c3878de8828cf8/data/test-00001-of-00005.parquet": { + "num_bytes": 146296486, + "checksum": null + }, + "hf://datasets/AudioLLMs/dream_tts_mcq_test@a495d7c2a1facf4eb27af6dfc4c3878de8828cf8/data/test-00002-of-00005.parquet": { + "num_bytes": 140599909, + "checksum": null + }, + "hf://datasets/AudioLLMs/dream_tts_mcq_test@a495d7c2a1facf4eb27af6dfc4c3878de8828cf8/data/test-00003-of-00005.parquet": { + "num_bytes": 143323621, + "checksum": null + }, + "hf://datasets/AudioLLMs/dream_tts_mcq_test@a495d7c2a1facf4eb27af6dfc4c3878de8828cf8/data/test-00004-of-00005.parquet": { + "num_bytes": 142380417, + "checksum": null + } + }, + "download_size": 707364520, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2797835583, + "splits": { + "test": { + "name": "test", + "num_bytes": 2090471063, + "num_examples": 1913, + "shard_lengths": [ + 583, + 383, + 565, + 382 + ], + "dataset_name": "dream_tts_mcq_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/dream_tts_mcq_test/state.json b/examples/dream_tts_mcq_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ca1dd6baab32074f7414dc846a48d53d590a45bf --- /dev/null +++ b/examples/dream_tts_mcq_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "61127fb562a1cd6a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/dream_tts_test/data-00000-of-00001.arrow b/examples/dream_tts_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..f7e5276fe994622b53f8cb3dbd8b69be09b8a812 --- /dev/null +++ b/examples/dream_tts_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55093552ade3a7de41b9bb2194f251b75d23364e2ba6bcae3dfc47acbdd66051 +size 429624 diff --git a/examples/dream_tts_test/dataset_info.json b/examples/dream_tts_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..3ce785721b8e8016bc2e4b3e2a81895cce638c98 --- /dev/null +++ b/examples/dream_tts_test/dataset_info.json @@ -0,0 +1,67 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "dream_tts_test", + "dataset_size": 2090304304, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/dream_tts_test@bf9354b7e6bc34872869eb00ba31fd57307f8cfe/data/test-00000-of-00005.parquet": { + "num_bytes": 134740696, + "checksum": null + }, + "hf://datasets/AudioLLMs/dream_tts_test@bf9354b7e6bc34872869eb00ba31fd57307f8cfe/data/test-00001-of-00005.parquet": { + "num_bytes": 146273135, + "checksum": null + }, + "hf://datasets/AudioLLMs/dream_tts_test@bf9354b7e6bc34872869eb00ba31fd57307f8cfe/data/test-00002-of-00005.parquet": { + "num_bytes": 140576363, + "checksum": null + }, + "hf://datasets/AudioLLMs/dream_tts_test@bf9354b7e6bc34872869eb00ba31fd57307f8cfe/data/test-00003-of-00005.parquet": { + "num_bytes": 143301149, + "checksum": null + }, + "hf://datasets/AudioLLMs/dream_tts_test@bf9354b7e6bc34872869eb00ba31fd57307f8cfe/data/test-00004-of-00005.parquet": { + "num_bytes": 142356857, + "checksum": null + } + }, + "download_size": 707248200, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2797552504, + "splits": { + "test": { + "name": "test", + "num_bytes": 2090304304, + "num_examples": 1913, + "shard_lengths": [ + 583, + 383, + 565, + 382 + ], + "dataset_name": "dream_tts_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/dream_tts_test/state.json b/examples/dream_tts_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..90c21fae0895e1869405ca110daf9e0b4b2e2392 --- /dev/null +++ b/examples/dream_tts_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b49df75fceef7ccf", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/earnings21_test/data-00000-of-00001.arrow b/examples/earnings21_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7249e56d8b14bc9ffe24e7a78ad956ab177cb608 --- /dev/null +++ b/examples/earnings21_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adf10b59f5f92124d7d2f40c13c3311935d111466c04b1547a323e74503c4c03 +size 34140928 diff --git a/examples/earnings21_test/dataset_info.json b/examples/earnings21_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..63804623e938e120339bf1705f1bbf7a0bb5a608 --- /dev/null +++ b/examples/earnings21_test/dataset_info.json @@ -0,0 +1,89 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "earnings21_test", + "dataset_size": 4525095376, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00000-of-00010.parquet": { + "num_bytes": 514663653, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00001-of-00010.parquet": { + "num_bytes": 458988588, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00002-of-00010.parquet": { + "num_bytes": 548797041, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00003-of-00010.parquet": { + "num_bytes": 371866372, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00004-of-00010.parquet": { + "num_bytes": 443181276, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00005-of-00010.parquet": { + "num_bytes": 416171809, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00006-of-00010.parquet": { + "num_bytes": 359420266, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00007-of-00010.parquet": { + "num_bytes": 366885124, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00008-of-00010.parquet": { + "num_bytes": 370786683, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings21_test@1edcc0d56dc6cdb6d21420f3155936e48485928d/data/test-00009-of-00010.parquet": { + "num_bytes": 527136223, + "checksum": null + } + }, + "download_size": 4377897035, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 8902992411, + "splits": { + "test": { + "name": "test", + "num_bytes": 4525095376, + "num_examples": 44, + "shard_lengths": [ + 5, + 10, + 9, + 8, + 8, + 4 + ], + "dataset_name": "earnings21_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/earnings21_test/state.json b/examples/earnings21_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..f051ade7832630a262a391766938cfb5598f101d --- /dev/null +++ b/examples/earnings21_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d8f18cbadb79de1a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/earnings22_test/data-00000-of-00001.arrow b/examples/earnings22_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..61f089c7bfc30cd10a59b1d5381162f220aa6ab0 --- /dev/null +++ b/examples/earnings22_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aefd16dfe7e7bbab0055c1325440f41ee9890008c56c202f6ac2db4d70ed1ad8 +size 36179792 diff --git a/examples/earnings22_test/dataset_info.json b/examples/earnings22_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..3b611be9aad430871804d201f7b5d24ae8776134 --- /dev/null +++ b/examples/earnings22_test/dataset_info.json @@ -0,0 +1,174 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "earnings22_test", + "dataset_size": 13816378227, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00000-of-00028.parquet": { + "num_bytes": 590324247, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00001-of-00028.parquet": { + "num_bytes": 491720223, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00002-of-00028.parquet": { + "num_bytes": 513910178, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00003-of-00028.parquet": { + "num_bytes": 336743013, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00004-of-00028.parquet": { + "num_bytes": 538718032, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00005-of-00028.parquet": { + "num_bytes": 526846244, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00006-of-00028.parquet": { + "num_bytes": 528814647, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00007-of-00028.parquet": { + "num_bytes": 712229979, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00008-of-00028.parquet": { + "num_bytes": 613493484, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00009-of-00028.parquet": { + "num_bytes": 569172005, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00010-of-00028.parquet": { + "num_bytes": 572265039, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00011-of-00028.parquet": { + "num_bytes": 408970476, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00012-of-00028.parquet": { + "num_bytes": 439789191, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00013-of-00028.parquet": { + "num_bytes": 335142082, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00014-of-00028.parquet": { + "num_bytes": 475706708, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00015-of-00028.parquet": { + "num_bytes": 455591887, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00016-of-00028.parquet": { + "num_bytes": 582088920, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00017-of-00028.parquet": { + "num_bytes": 408202425, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00018-of-00028.parquet": { + "num_bytes": 427936536, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00019-of-00028.parquet": { + "num_bytes": 332794707, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00020-of-00028.parquet": { + "num_bytes": 483364300, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00021-of-00028.parquet": { + "num_bytes": 479904434, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00022-of-00028.parquet": { + "num_bytes": 283170007, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00023-of-00028.parquet": { + "num_bytes": 483919964, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00024-of-00028.parquet": { + "num_bytes": 469257162, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00025-of-00028.parquet": { + "num_bytes": 423570689, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00026-of-00028.parquet": { + "num_bytes": 361762611, + "checksum": null + }, + "hf://datasets/AudioLLMs/earnings22_test@19da542f94ac74d603b9cb5eed0e13792517ccf4/data/test-00027-of-00028.parquet": { + "num_bytes": 456697781, + "checksum": null + } + }, + "download_size": 13302106971, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 27118485198, + "splits": { + "test": { + "name": "test", + "num_bytes": 13816378227, + "num_examples": 125, + "shard_lengths": [ + 5, + 5, + 5, + 10, + 5, + 5, + 5, + 5, + 5, + 5, + 10, + 8, + 8, + 8, + 8, + 8, + 4, + 8, + 8 + ], + "dataset_name": "earnings22_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/earnings22_test/state.json b/examples/earnings22_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..7ca7ba8eff8402a4f5ededf83d91ae741c229945 --- /dev/null +++ b/examples/earnings22_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d960d1556f3baad6", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/fleurs_tamil_ta_30_asr/data-00000-of-00001.arrow b/examples/fleurs_tamil_ta_30_asr/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..aa02952125226a13b406dddced9cb39dd6626e01 --- /dev/null +++ b/examples/fleurs_tamil_ta_30_asr/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59d5d917373bafaf910af1684c82942b1212a99da156f9b5c2f8337069e786bd +size 112496 diff --git a/examples/fleurs_tamil_ta_30_asr/dataset_info.json b/examples/fleurs_tamil_ta_30_asr/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..2e09cf5a777552405012a5e1b7b504140b7ab3ad --- /dev/null +++ b/examples/fleurs_tamil_ta_30_asr/dataset_info.json @@ -0,0 +1,96 @@ +{ + "builder_name": "fleurs", + "citation": "\n", + "config_name": "ta_in", + "dataset_name": "fleurs", + "dataset_size": 2783252148, + "description": "FLEURS is the speech version of the FLORES machine translation benchmark, covering 2000 n-way parallel sentences in n=102 languages.\nFLEURS is the speech version of the FLORES machine translation benchmark, covering 2000 n-way parallel sentences in n=102 languages.", + "download_checksums": { + "data/ta_in/audio/train.tar.gz": { + "num_bytes": 1638765342, + "checksum": null + }, + "data/ta_in/audio/dev.tar.gz": { + "num_bytes": 238893728, + "checksum": null + }, + "data/ta_in/audio/test.tar.gz": { + "num_bytes": 406616353, + "checksum": null + }, + "data/ta_in/train.tsv": { + "num_bytes": 3245761, + "checksum": null + }, + "data/ta_in/dev.tsv": { + "num_bytes": 507783, + "checksum": null + }, + "data/ta_in/test.tsv": { + "num_bytes": 846022, + "checksum": null + } + }, + "download_size": 2288874989, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 5072127137, + "splits": { + "train": { + "name": "train", + "num_bytes": 2002058848, + "num_examples": 2367, + "shard_lengths": [ + 1000, + 1000, + 367 + ], + "dataset_name": "fleurs" + }, + "validation": { + "name": "validation", + "num_bytes": 289505738, + "num_examples": 377, + "dataset_name": "fleurs" + }, + "test": { + "name": "test", + "num_bytes": 491687562, + "num_examples": 591, + "dataset_name": "fleurs" + } + }, + "supervised_keys": { + "input": "audio", + "output": "transcription" + }, + "version": { + "version_str": "2.0.0", + "description": "", + "major": 2, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/fleurs_tamil_ta_30_asr/state.json b/examples/fleurs_tamil_ta_30_asr/state.json new file mode 100644 index 0000000000000000000000000000000000000000..be787ce7fa05a866a5b1e8fd50f291e98d5c1426 --- /dev/null +++ b/examples/fleurs_tamil_ta_30_asr/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "2bc82ed8905684f2", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/gigaspeech2_id_test/data-00000-of-00001.arrow b/examples/gigaspeech2_id_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..70d3f3e19cfed2aaa7565941c7eb290e46072ca4 --- /dev/null +++ b/examples/gigaspeech2_id_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb691e4cca42c16284e55e7720aa047a7eae3bb074855fd908b47e100ef7b077 +size 55080 diff --git a/examples/gigaspeech2_id_test/dataset_info.json b/examples/gigaspeech2_id_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..423e10e5684874c3b9520af64995d6eb25531701 --- /dev/null +++ b/examples/gigaspeech2_id_test/dataset_info.json @@ -0,0 +1,49 @@ +{ + "builder_name": "generator", + "citation": "", + "config_name": "default", + "dataset_name": "generator", + "dataset_size": 3198110, + "description": "", + "download_checksums": {}, + "download_size": 0, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 3198110, + "splits": { + "train": { + "name": "train", + "num_bytes": 3198110, + "num_examples": 17273, + "dataset_name": "generator" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/gigaspeech2_id_test/state.json b/examples/gigaspeech2_id_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..1bbfdf1b8d797420f28a35c1fb1675db796c5bfd --- /dev/null +++ b/examples/gigaspeech2_id_test/state.json @@ -0,0 +1,19 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "641a0ae6ccbb4a1f", + "_format_columns": [ + "answer", + "audio_length", + "context", + "instruction", + "language" + ], + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/gigaspeech2_th_test/data-00000-of-00001.arrow b/examples/gigaspeech2_th_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..8fbcb9dcc3ab616cd3550ff6cbff6029b96f6bdb --- /dev/null +++ b/examples/gigaspeech2_th_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eab05de3607cded114219aef63d9cfa227ad8814a41803739ef06b0d6119540 +size 41680 diff --git a/examples/gigaspeech2_th_test/dataset_info.json b/examples/gigaspeech2_th_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..991c830143f99c9a978c86b97e0963ca7b5886b4 --- /dev/null +++ b/examples/gigaspeech2_th_test/dataset_info.json @@ -0,0 +1,49 @@ +{ + "builder_name": "generator", + "citation": "", + "config_name": "default", + "dataset_name": "generator", + "dataset_size": 2433630, + "description": "", + "download_checksums": {}, + "download_size": 0, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2433630, + "splits": { + "train": { + "name": "train", + "num_bytes": 2433630, + "num_examples": 8823, + "dataset_name": "generator" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/gigaspeech2_th_test/state.json b/examples/gigaspeech2_th_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..7e42c059caa4925da9d4404d33ccb425e9a4ae63 --- /dev/null +++ b/examples/gigaspeech2_th_test/state.json @@ -0,0 +1,19 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a8da6ebd39067b29", + "_format_columns": [ + "answer", + "audio_length", + "context", + "instruction", + "language" + ], + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/gigaspeech2_vi_test/data-00000-of-00001.arrow b/examples/gigaspeech2_vi_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..18d0bfc2b5020416e3b7f34b25aeb7127d21a7e6 --- /dev/null +++ b/examples/gigaspeech2_vi_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a73e2a25c09e5ea725bc08e54a9bd9e301c7eddc694563d8d95d2f74f6b1a648 +size 96232 diff --git a/examples/gigaspeech2_vi_test/dataset_info.json b/examples/gigaspeech2_vi_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..6c29de56befeec677461005671f4cd04ceaae3a1 --- /dev/null +++ b/examples/gigaspeech2_vi_test/dataset_info.json @@ -0,0 +1,49 @@ +{ + "builder_name": "generator", + "citation": "", + "config_name": "default", + "dataset_name": "generator", + "dataset_size": 1377360, + "description": "", + "download_checksums": {}, + "download_size": 0, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1377360, + "splits": { + "train": { + "name": "train", + "num_bytes": 1377360, + "num_examples": 4422, + "dataset_name": "generator" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/gigaspeech2_vi_test/state.json b/examples/gigaspeech2_vi_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..076880c9378d838074f478e27ab51e7fc91a14a7 --- /dev/null +++ b/examples/gigaspeech2_vi_test/state.json @@ -0,0 +1,19 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e3c0849db56a05c4", + "_format_columns": [ + "answer", + "audio_length", + "context", + "instruction", + "language" + ], + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/gigaspeech_test/data-00000-of-00001.arrow b/examples/gigaspeech_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..bce5fb20b4b36f228dbfe845d9b6089586353039 --- /dev/null +++ b/examples/gigaspeech_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1882f942c0234f1a452c190bc319afda07de022ade450000686c81d46a0b5b09 +size 53016 diff --git a/examples/gigaspeech_test/dataset_info.json b/examples/gigaspeech_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..cb3df47cf467937968340ec050b7a90971634218 --- /dev/null +++ b/examples/gigaspeech_test/dataset_info.json @@ -0,0 +1,87 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "gigaspeech_test_v2", + "dataset_size": 4047504861, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/gigaspeech_test_v2@77582c9e37dc7cc98ab2dd21702a94ddcf95f976/data/test-00000-of-00009.parquet": { + "num_bytes": 448654989, + "checksum": null + }, + "hf://datasets/AudioLLMs/gigaspeech_test_v2@77582c9e37dc7cc98ab2dd21702a94ddcf95f976/data/test-00001-of-00009.parquet": { + "num_bytes": 440444418, + "checksum": null + }, + "hf://datasets/AudioLLMs/gigaspeech_test_v2@77582c9e37dc7cc98ab2dd21702a94ddcf95f976/data/test-00002-of-00009.parquet": { + "num_bytes": 444375020, + "checksum": null + }, + "hf://datasets/AudioLLMs/gigaspeech_test_v2@77582c9e37dc7cc98ab2dd21702a94ddcf95f976/data/test-00003-of-00009.parquet": { + "num_bytes": 450135947, + "checksum": null + }, + "hf://datasets/AudioLLMs/gigaspeech_test_v2@77582c9e37dc7cc98ab2dd21702a94ddcf95f976/data/test-00004-of-00009.parquet": { + "num_bytes": 438808842, + "checksum": null + }, + "hf://datasets/AudioLLMs/gigaspeech_test_v2@77582c9e37dc7cc98ab2dd21702a94ddcf95f976/data/test-00005-of-00009.parquet": { + "num_bytes": 445627298, + "checksum": null + }, + "hf://datasets/AudioLLMs/gigaspeech_test_v2@77582c9e37dc7cc98ab2dd21702a94ddcf95f976/data/test-00006-of-00009.parquet": { + "num_bytes": 443039584, + "checksum": null + }, + "hf://datasets/AudioLLMs/gigaspeech_test_v2@77582c9e37dc7cc98ab2dd21702a94ddcf95f976/data/test-00007-of-00009.parquet": { + "num_bytes": 445525392, + "checksum": null + }, + "hf://datasets/AudioLLMs/gigaspeech_test_v2@77582c9e37dc7cc98ab2dd21702a94ddcf95f976/data/test-00008-of-00009.parquet": { + "num_bytes": 447090326, + "checksum": null + } + }, + "download_size": 4003701816, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 8051206677, + "splits": { + "test": { + "name": "test", + "num_bytes": 4047504861, + "num_examples": 18650, + "shard_lengths": [ + 2373, + 2373, + 2272, + 2372, + 2372, + 2372, + 2372, + 2144 + ], + "dataset_name": "gigaspeech_test_v2" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/gigaspeech_test/state.json b/examples/gigaspeech_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..50bb8c6c6bd738f9cf18fe5a20c24ce6f5da683d --- /dev/null +++ b/examples/gigaspeech_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "6e74ad9373231f9c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/idpc_short_test/data-00000-of-00001.arrow b/examples/idpc_short_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..01e9435c6ae574504f726fe734155595ceb3fc8c --- /dev/null +++ b/examples/idpc_short_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11d5c1be1761532dd53e8c891800ad4fc8ff46f2a30ff4a73a5171416d87b365 +size 215184 diff --git a/examples/idpc_short_test/dataset_info.json b/examples/idpc_short_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..ad6be7e8a51413141e904ee813472f94021f2970 --- /dev/null +++ b/examples/idpc_short_test/dataset_info.json @@ -0,0 +1,28 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/idpc_short_test/state.json b/examples/idpc_short_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d59b1602de19653371e7c1dd36f02aa113c0916a --- /dev/null +++ b/examples/idpc_short_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "6bc22097b8204924", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/idpc_test/data-00000-of-00001.arrow b/examples/idpc_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b686b54b2b5633057f49aeea088e9805d9ec5acd --- /dev/null +++ b/examples/idpc_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42e4936fdede94114208daed891c3f34e16a5634b3f2892b10b310c2d5551f0 +size 5578144 diff --git a/examples/idpc_test/dataset_info.json b/examples/idpc_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/idpc_test/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/idpc_test/state.json b/examples/idpc_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0c13a5eef842353db85ec7aa158b66bb8ad80d --- /dev/null +++ b/examples/idpc_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "ce208927a7b02197", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/iemocap_emotion_test/data-00000-of-00001.arrow b/examples/iemocap_emotion_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c8f7b534f7064e3c83b2d34dbdf64dcd497d3020 --- /dev/null +++ b/examples/iemocap_emotion_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:810cefab256dc943d742acc2cd628cb7f7cc6c209bd615afb155801e8ab2174f +size 42608 diff --git a/examples/iemocap_emotion_test/dataset_info.json b/examples/iemocap_emotion_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..3c6e6dbe7f2156df942122d7001ce29b619616c0 --- /dev/null +++ b/examples/iemocap_emotion_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "iemocap_emotion_recognition", + "dataset_size": 142161933, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/iemocap_emotion_recognition@72ca1bd6e70a9522e91c9ddfe4cf3e1d4407cd80/data/test-00000-of-00001.parquet": { + "num_bytes": 139557753, + "checksum": null + } + }, + "download_size": 139557753, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 281719686, + "splits": { + "test": { + "name": "test", + "num_bytes": 142161933, + "num_examples": 1004, + "dataset_name": "iemocap_emotion_recognition" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/iemocap_emotion_test/state.json b/examples/iemocap_emotion_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..a3df0f9b446567477e06b188e06970016fa146b9 --- /dev/null +++ b/examples/iemocap_emotion_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7609195403c36066", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/iemocap_gender_test/data-00000-of-00001.arrow b/examples/iemocap_gender_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2b222a9da66839230ae5390cf1c34c7fcef3d699 --- /dev/null +++ b/examples/iemocap_gender_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e2eee931894afe8e6dd4004e765be78e38793c5814774e652e3db97e950787 +size 21368 diff --git a/examples/iemocap_gender_test/dataset_info.json b/examples/iemocap_gender_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..b69a861698d19ccdaa377125a1410fcb7e2a8d7f --- /dev/null +++ b/examples/iemocap_gender_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "iemocap_gender_recognition", + "dataset_size": 142162142, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/iemocap_gender_recognition@4e44e63a84eb1a33035b973193b50fccf3452738/data/test-00000-of-00001.parquet": { + "num_bytes": 139554152, + "checksum": null + } + }, + "download_size": 139554152, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 281716294, + "splits": { + "test": { + "name": "test", + "num_bytes": 142162142, + "num_examples": 1004, + "dataset_name": "iemocap_gender_recognition" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/iemocap_gender_test/state.json b/examples/iemocap_gender_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..eea197b4952d5e871c61993b92fa7cd791e8c8c7 --- /dev/null +++ b/examples/iemocap_gender_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "acd5e0555618be31", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_ar_dialogue/data-00000-of-00001.arrow b/examples/imda_ar_dialogue/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b9750806700d82dcf7fec106b1e6a4a1015ee3c1 --- /dev/null +++ b/examples/imda_ar_dialogue/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6ad60c44f72330edbc7139a8b700e00fddd4e0daabf7c396aaaa5766c30f1ca +size 299208 diff --git a/examples/imda_ar_dialogue/dataset_info.json b/examples/imda_ar_dialogue/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..99fcb1acfad87e422e09fa1dbf6721ae004d5fe2 --- /dev/null +++ b/examples/imda_ar_dialogue/dataset_info.json @@ -0,0 +1,73 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 2524225492, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-AR-Dialogue-Test/train-00000-of-00006.parquet": { + "num_bytes": 435219849, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-AR-Dialogue-Test/train-00001-of-00006.parquet": { + "num_bytes": 434380165, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-AR-Dialogue-Test/train-00002-of-00006.parquet": { + "num_bytes": 417054073, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-AR-Dialogue-Test/train-00003-of-00006.parquet": { + "num_bytes": 422747822, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-AR-Dialogue-Test/train-00004-of-00006.parquet": { + "num_bytes": 396603525, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-AR-Dialogue-Test/train-00005-of-00006.parquet": { + "num_bytes": 396875996, + "checksum": null + } + }, + "download_size": 2502881430, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 5027106922, + "splits": { + "train": { + "name": "train", + "num_bytes": 2524225492, + "num_examples": 3000, + "shard_lengths": [ + 600, + 600, + 600, + 700, + 500 + ], + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_ar_dialogue/state.json b/examples/imda_ar_dialogue/state.json new file mode 100644 index 0000000000000000000000000000000000000000..6b2f0111c975a6ec5366f680be95b7db9ac15455 --- /dev/null +++ b/examples/imda_ar_dialogue/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "5df769e9aa0ca07d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_ar_sentence/data-00000-of-00001.arrow b/examples/imda_ar_sentence/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3a7c0e5492d6fdddde623afe33be14fd449e4d9b --- /dev/null +++ b/examples/imda_ar_sentence/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6768c1fd288e9f3500843295c5b386cd773f08ab54b0a7ce5c98f9cfe644d43 +size 54296 diff --git a/examples/imda_ar_sentence/dataset_info.json b/examples/imda_ar_sentence/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..b794e91c224ba579a503fbbf0910e23d2b06af8d --- /dev/null +++ b/examples/imda_ar_sentence/dataset_info.json @@ -0,0 +1,59 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 1037472970, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-AR-Sentence-Test/train-00000-of-00003.parquet": { + "num_bytes": 365980074, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-AR-Sentence-Test/train-00001-of-00003.parquet": { + "num_bytes": 340231254, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-AR-Sentence-Test/train-00002-of-00003.parquet": { + "num_bytes": 307364651, + "checksum": null + } + }, + "download_size": 1013575979, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2051048949, + "splits": { + "train": { + "name": "train", + "num_bytes": 1037472970, + "num_examples": 6000, + "shard_lengths": [ + 2700, + 3200, + 100 + ], + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_ar_sentence/state.json b/examples/imda_ar_sentence/state.json new file mode 100644 index 0000000000000000000000000000000000000000..244f484171b8b81ca327050928191f9c113160d7 --- /dev/null +++ b/examples/imda_ar_sentence/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "89618659645140cc", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_gr_dialogue/data-00000-of-00001.arrow b/examples/imda_gr_dialogue/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ed8ce0c525e4d7e8140866476cb859eff8274331 --- /dev/null +++ b/examples/imda_gr_dialogue/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac89cb1c3b6da7c1d0f6f2de4991f43c0e330e6bacb765e3798366c1ce98c875 +size 249928 diff --git a/examples/imda_gr_dialogue/dataset_info.json b/examples/imda_gr_dialogue/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..dec49e21028681ba46a7dcc5bc20237753643cc3 --- /dev/null +++ b/examples/imda_gr_dialogue/dataset_info.json @@ -0,0 +1,73 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 2524082708, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-GR-Dialogue-Test/train-00000-of-00006.parquet": { + "num_bytes": 435214911, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-GR-Dialogue-Test/train-00001-of-00006.parquet": { + "num_bytes": 434376439, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-GR-Dialogue-Test/train-00002-of-00006.parquet": { + "num_bytes": 417048099, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-GR-Dialogue-Test/train-00003-of-00006.parquet": { + "num_bytes": 422741965, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-GR-Dialogue-Test/train-00004-of-00006.parquet": { + "num_bytes": 396597982, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-GR-Dialogue-Test/train-00005-of-00006.parquet": { + "num_bytes": 396869883, + "checksum": null + } + }, + "download_size": 2502849279, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 5026931987, + "splits": { + "train": { + "name": "train", + "num_bytes": 2524082708, + "num_examples": 3000, + "shard_lengths": [ + 600, + 600, + 600, + 700, + 500 + ], + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_gr_dialogue/state.json b/examples/imda_gr_dialogue/state.json new file mode 100644 index 0000000000000000000000000000000000000000..83c71b8adac74c33683422851902a285795aead4 --- /dev/null +++ b/examples/imda_gr_dialogue/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1f358da0de77d0ff", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_gr_sentence/data-00000-of-00001.arrow b/examples/imda_gr_sentence/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..f251b617b1826598d8f4acbf04323ea8eb623b9b --- /dev/null +++ b/examples/imda_gr_sentence/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e66d4845af51a75bde56e6c6981c4b7117cd0eec242e354df09b077115485c99 +size 48544 diff --git a/examples/imda_gr_sentence/dataset_info.json b/examples/imda_gr_sentence/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..70dfa6a27ca04d266678969d746cbb3e1f7d5d1b --- /dev/null +++ b/examples/imda_gr_sentence/dataset_info.json @@ -0,0 +1,59 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 1037335155, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-GR-Sentence-Test/train-00000-of-00003.parquet": { + "num_bytes": 365977246, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-GR-Sentence-Test/train-00001-of-00003.parquet": { + "num_bytes": 340228326, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/PQA-GR-Sentence-Test/train-00002-of-00003.parquet": { + "num_bytes": 307361805, + "checksum": null + } + }, + "download_size": 1013567377, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2050902532, + "splits": { + "train": { + "name": "train", + "num_bytes": 1037335155, + "num_examples": 6000, + "shard_lengths": [ + 2700, + 3200, + 100 + ], + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_gr_sentence/state.json b/examples/imda_gr_sentence/state.json new file mode 100644 index 0000000000000000000000000000000000000000..7c8738a20db4836a618a0bce0b644b97cb7e3317 --- /dev/null +++ b/examples/imda_gr_sentence/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "630249d2adf5ef8e", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part1_asr_test/data-00000-of-00001.arrow b/examples/imda_part1_asr_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..1a6e492a98060bd5befbdef3897acc7dbad67518 --- /dev/null +++ b/examples/imda_part1_asr_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:565477e1720116cd5e628b6e9f63100dfae2f0679ffecb1e69dbd0081cca8872 +size 46912 diff --git a/examples/imda_part1_asr_test/dataset_info.json b/examples/imda_part1_asr_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..f87d0c20e8e3c71eef384e95d6f79aa46632b10c --- /dev/null +++ b/examples/imda_part1_asr_test/dataset_info.json @@ -0,0 +1,54 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 571224336, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART1-Test/train-00000-of-00002.parquet": { + "num_bytes": 269417015, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART1-Test/train-00001-of-00002.parquet": { + "num_bytes": 290433823, + "checksum": null + } + }, + "download_size": 559850838, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1131075174, + "splits": { + "train": { + "name": "train", + "num_bytes": 571224336, + "num_examples": 3000, + "shard_lengths": [ + 2700, + 300 + ], + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part1_asr_test/state.json b/examples/imda_part1_asr_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ca9381f98c99017820d633ddc9acde8351330c60 --- /dev/null +++ b/examples/imda_part1_asr_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "f89e31a48247e0c3", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part2_asr_test/data-00000-of-00001.arrow b/examples/imda_part2_asr_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..0ea0b48a95983875f0f956c79255158d8f64911c --- /dev/null +++ b/examples/imda_part2_asr_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:017f6b5d4278a950a9b7b217e82f28eae1c96ecc59cc4106143e7d9a8bb2f0d7 +size 51456 diff --git a/examples/imda_part2_asr_test/dataset_info.json b/examples/imda_part2_asr_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..2297b731882fc876b4484096548f8038fc83f7f8 --- /dev/null +++ b/examples/imda_part2_asr_test/dataset_info.json @@ -0,0 +1,46 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 465924178, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART2-Test/train-00000-of-00001.parquet": { + "num_bytes": 453955477, + "checksum": null + } + }, + "download_size": 453955477, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 919879655, + "splits": { + "train": { + "name": "train", + "num_bytes": 465924178, + "num_examples": 3000, + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part2_asr_test/state.json b/examples/imda_part2_asr_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..4a93488d5a59b2cdb765a9d2fcb87ec181ec879a --- /dev/null +++ b/examples/imda_part2_asr_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "0a2fb33368797998", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part3_30s_asr_test/data-00000-of-00001.arrow b/examples/imda_part3_30s_asr_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..53f9f5bec8e0b79724f50de5df22d071fbaf2560 --- /dev/null +++ b/examples/imda_part3_30s_asr_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88aff0830f5cbbee4e5d8df4ff5a5a29c7cde5124efc154a67136960c44a8e00 +size 296904 diff --git a/examples/imda_part3_30s_asr_test/dataset_info.json b/examples/imda_part3_30s_asr_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..20c2dca65463c8000dc87b70f955fe8075a81287 --- /dev/null +++ b/examples/imda_part3_30s_asr_test/dataset_info.json @@ -0,0 +1,54 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 888132281, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART3-Test/train-00000-of-00002.parquet": { + "num_bytes": 435340367, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART3-Test/train-00001-of-00002.parquet": { + "num_bytes": 434499267, + "checksum": null + } + }, + "download_size": 869839634, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1757971915, + "splits": { + "train": { + "name": "train", + "num_bytes": 888132281, + "num_examples": 1000, + "shard_lengths": [ + 600, + 400 + ], + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part3_30s_asr_test/state.json b/examples/imda_part3_30s_asr_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..432c4b4fa37413e92792b7eae4c5b848f4679092 --- /dev/null +++ b/examples/imda_part3_30s_asr_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7bcfb7745730c4ee", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part3_30s_ds_human_test/data-00000-of-00001.arrow b/examples/imda_part3_30s_ds_human_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..18c29dd49feaf5b8377532448e632a9d74d7f8ee --- /dev/null +++ b/examples/imda_part3_30s_ds_human_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbb2ba608816fb7b07332149870279faeb4271428e4686d8d018ba19a2edb09c +size 308640 diff --git a/examples/imda_part3_30s_ds_human_test/dataset_info.json b/examples/imda_part3_30s_ds_human_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..07f260d1a1abce1beb24976f9248fd09eab93e28 --- /dev/null +++ b/examples/imda_part3_30s_ds_human_test/dataset_info.json @@ -0,0 +1,46 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 90554712, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/SDS-PART3-Test/train-00000-of-00001.parquet": { + "num_bytes": 89690527, + "checksum": null + } + }, + "download_size": 89690527, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 180245239, + "splits": { + "train": { + "name": "train", + "num_bytes": 90554712, + "num_examples": 100, + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part3_30s_ds_human_test/state.json b/examples/imda_part3_30s_ds_human_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..3f113269d1797a9f4d0084b2757b3f07f56e5ad7 --- /dev/null +++ b/examples/imda_part3_30s_ds_human_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "4f9cd020537823f9", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part3_30s_sqa_human_test/data-00000-of-00001.arrow b/examples/imda_part3_30s_sqa_human_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..64365ff86e2effbf1c560702c58d2e8507a43e86 --- /dev/null +++ b/examples/imda_part3_30s_sqa_human_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1237cc617a9c1708b980599030e6be99eac54e83e6df935528e303433949f378 +size 311288 diff --git a/examples/imda_part3_30s_sqa_human_test/dataset_info.json b/examples/imda_part3_30s_sqa_human_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..56116943f3b1b9691812103314c76107090a1ab1 --- /dev/null +++ b/examples/imda_part3_30s_sqa_human_test/dataset_info.json @@ -0,0 +1,46 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 90552987, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/SQA-PART3-Test/train-00000-of-00001.parquet": { + "num_bytes": 89693755, + "checksum": null + } + }, + "download_size": 89693755, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 180246742, + "splits": { + "train": { + "name": "train", + "num_bytes": 90552987, + "num_examples": 100, + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part3_30s_sqa_human_test/state.json b/examples/imda_part3_30s_sqa_human_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f87d37c78470c58e350f382750741048ea86c8 --- /dev/null +++ b/examples/imda_part3_30s_sqa_human_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "947e169be25a888c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part4_30s_asr_test/data-00000-of-00001.arrow b/examples/imda_part4_30s_asr_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2dc623925e9d4131842d309e8c896121ddc613fd --- /dev/null +++ b/examples/imda_part4_30s_asr_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a406523f8dd449ffdd54b8d1959a6397501613fa6a5ef8cf82f3cd580cf1761 +size 227520 diff --git a/examples/imda_part4_30s_asr_test/dataset_info.json b/examples/imda_part4_30s_asr_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..9b39c7fcad1435134befc3b9875d3932b99fd21e --- /dev/null +++ b/examples/imda_part4_30s_asr_test/dataset_info.json @@ -0,0 +1,54 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 840673945, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART4-Test/train-00000-of-00002.parquet": { + "num_bytes": 417189868, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART4-Test/train-00001-of-00002.parquet": { + "num_bytes": 422883495, + "checksum": null + } + }, + "download_size": 840073363, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1680747308, + "splits": { + "train": { + "name": "train", + "num_bytes": 840673945, + "num_examples": 1000, + "shard_lengths": [ + 600, + 400 + ], + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part4_30s_asr_test/state.json b/examples/imda_part4_30s_asr_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..155e8b6f684262a2e9871697918c84bd75b5ce52 --- /dev/null +++ b/examples/imda_part4_30s_asr_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "25078607f3794f85", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part4_30s_ds_human_test/data-00000-of-00001.arrow b/examples/imda_part4_30s_ds_human_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2da3bba82807d2d50c443ed352a02cc8bae5b74b --- /dev/null +++ b/examples/imda_part4_30s_ds_human_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfbc82bfe1273396b899e9dc402774fa2a92712dd59dbd6978138055b7759611 +size 302128 diff --git a/examples/imda_part4_30s_ds_human_test/dataset_info.json b/examples/imda_part4_30s_ds_human_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8fcdaf7ab469c2c1dbf37cfaf063d4008195cd --- /dev/null +++ b/examples/imda_part4_30s_ds_human_test/dataset_info.json @@ -0,0 +1,46 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 88940763, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/SDS-PART4-Test/train-00000-of-00001.parquet": { + "num_bytes": 88911005, + "checksum": null + } + }, + "download_size": 88911005, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 177851768, + "splits": { + "train": { + "name": "train", + "num_bytes": 88940763, + "num_examples": 100, + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part4_30s_ds_human_test/state.json b/examples/imda_part4_30s_ds_human_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..aec8ee7976dd63187ae0cbce93f8e0e53e8d36dc --- /dev/null +++ b/examples/imda_part4_30s_ds_human_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "bdf000c21285804c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part4_30s_sqa_human_test/data-00000-of-00001.arrow b/examples/imda_part4_30s_sqa_human_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..57abfd5667d4632401f6a2b0b8f49a3d73ffdd8c --- /dev/null +++ b/examples/imda_part4_30s_sqa_human_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:755da8a8e6be108aa35ba7435f24c02bf3e20e5fff7cd0f4a8a54a83c8e80fa1 +size 299608 diff --git a/examples/imda_part4_30s_sqa_human_test/dataset_info.json b/examples/imda_part4_30s_sqa_human_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..3eccde2792f549dcc3a561f364fde308c27a5075 --- /dev/null +++ b/examples/imda_part4_30s_sqa_human_test/dataset_info.json @@ -0,0 +1,46 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 88935737, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/SQA-PART4-Test/train-00000-of-00001.parquet": { + "num_bytes": 88913332, + "checksum": null + } + }, + "download_size": 88913332, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 177849069, + "splits": { + "train": { + "name": "train", + "num_bytes": 88935737, + "num_examples": 100, + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part4_30s_sqa_human_test/state.json b/examples/imda_part4_30s_sqa_human_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..01cca961739a5bd6b491aabe7b2d5d24f9303ee6 --- /dev/null +++ b/examples/imda_part4_30s_sqa_human_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "3240d6062e9154b5", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part5_30s_asr_test/data-00000-of-00001.arrow b/examples/imda_part5_30s_asr_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..929d415652ea307f2de1182d51d8e6d5a989ffbc --- /dev/null +++ b/examples/imda_part5_30s_asr_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:410e12a8800e0c3714132f43b581e6d61633c94672a25d45d90575639c398145 +size 291560 diff --git a/examples/imda_part5_30s_asr_test/dataset_info.json b/examples/imda_part5_30s_asr_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..3e2b0fe698dcb1a9c56819dba5a4a94b06e30860 --- /dev/null +++ b/examples/imda_part5_30s_asr_test/dataset_info.json @@ -0,0 +1,54 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 796433593, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART5-Test/train-00000-of-00002.parquet": { + "num_bytes": 396715148, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART5-Test/train-00001-of-00002.parquet": { + "num_bytes": 396982674, + "checksum": null + } + }, + "download_size": 793697822, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1590131415, + "splits": { + "train": { + "name": "train", + "num_bytes": 796433593, + "num_examples": 1000, + "shard_lengths": [ + 700, + 300 + ], + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part5_30s_asr_test/state.json b/examples/imda_part5_30s_asr_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..3cb3a65c56bde7a4bcd7a1637fef3b2e4f106f48 --- /dev/null +++ b/examples/imda_part5_30s_asr_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "80a1f91fcf308860", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part5_30s_ds_human_test/data-00000-of-00001.arrow b/examples/imda_part5_30s_ds_human_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..f1f429eccaaa963bed04d4595d4db18dc995d259 --- /dev/null +++ b/examples/imda_part5_30s_ds_human_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63dcfacb380e7c08baa935f4adade715b3ee04997b6862ffa4305846e4ac2a1e +size 297264 diff --git a/examples/imda_part5_30s_ds_human_test/dataset_info.json b/examples/imda_part5_30s_ds_human_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..fbcecadbf1a3d434a58e8d655ca4e9fa26e4d9ff --- /dev/null +++ b/examples/imda_part5_30s_ds_human_test/dataset_info.json @@ -0,0 +1,46 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 88124619, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/SDS-PART5-Test/train-00000-of-00001.parquet": { + "num_bytes": 87803731, + "checksum": null + } + }, + "download_size": 87803731, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 175928350, + "splits": { + "train": { + "name": "train", + "num_bytes": 88124619, + "num_examples": 100, + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part5_30s_ds_human_test/state.json b/examples/imda_part5_30s_ds_human_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d459118c346aceb1278ee3c80f43d7a0b18f8c75 --- /dev/null +++ b/examples/imda_part5_30s_ds_human_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "9714c95836e726a5", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part5_30s_sqa_human_test/data-00000-of-00001.arrow b/examples/imda_part5_30s_sqa_human_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b89af51a77ce723f125ee2bfb8ade18e6a37ac7e --- /dev/null +++ b/examples/imda_part5_30s_sqa_human_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87c3c5e408e736a62dd4fd80950c96b9c4b2c336431cd2c0bdf069ac082316d3 +size 299832 diff --git a/examples/imda_part5_30s_sqa_human_test/dataset_info.json b/examples/imda_part5_30s_sqa_human_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..e6a6c5ade69db981178661ad3e23ff96ed03baf5 --- /dev/null +++ b/examples/imda_part5_30s_sqa_human_test/dataset_info.json @@ -0,0 +1,46 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 88115996, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/SQA-PART5-Test/train-00000-of-00001.parquet": { + "num_bytes": 87803336, + "checksum": null + } + }, + "download_size": 87803336, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 175919332, + "splits": { + "train": { + "name": "train", + "num_bytes": 88115996, + "num_examples": 100, + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part5_30s_sqa_human_test/state.json b/examples/imda_part5_30s_sqa_human_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..b7cb6b444602ef7243086879b43678704af8e4a9 --- /dev/null +++ b/examples/imda_part5_30s_sqa_human_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "60783da3f1bcf570", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part6_30s_asr_test/data-00000-of-00001.arrow b/examples/imda_part6_30s_asr_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..5afaf65572838ba8f5aac4a13f73f579d5e83a38 --- /dev/null +++ b/examples/imda_part6_30s_asr_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5f7525b4b753135bd5406a6c7c417202b7af85a0fce173ec02893aa0d059cee +size 202592 diff --git a/examples/imda_part6_30s_asr_test/dataset_info.json b/examples/imda_part6_30s_asr_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaefc75ec1a0825cad04afc0c6be69fbb192befa --- /dev/null +++ b/examples/imda_part6_30s_asr_test/dataset_info.json @@ -0,0 +1,54 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 778923073, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART6-Test/train-00000-of-00002.parquet": { + "num_bytes": 382733746, + "checksum": null + }, + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/ASR-PART6-Test/train-00001-of-00002.parquet": { + "num_bytes": 393386758, + "checksum": null + } + }, + "download_size": 776120504, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1555043577, + "splits": { + "train": { + "name": "train", + "num_bytes": 778923073, + "num_examples": 1000, + "shard_lengths": [ + 700, + 300 + ], + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part6_30s_asr_test/state.json b/examples/imda_part6_30s_asr_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..50549f9ac9950ffba441b9b710fd0487c3418fa9 --- /dev/null +++ b/examples/imda_part6_30s_asr_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "0689e66129e3a756", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part6_30s_ds_human_test/data-00000-of-00001.arrow b/examples/imda_part6_30s_ds_human_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d4fdbf73f00f3851b29de6548526e157500d7bf3 --- /dev/null +++ b/examples/imda_part6_30s_ds_human_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35c475f36fc5803f0eaa3fefe0727e529bc9ab73939eb70b0bbd52a630a346ff +size 274368 diff --git a/examples/imda_part6_30s_ds_human_test/dataset_info.json b/examples/imda_part6_30s_ds_human_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..30c7fb5a042bab8cebcc6184ae2b6281c4e4574c --- /dev/null +++ b/examples/imda_part6_30s_ds_human_test/dataset_info.json @@ -0,0 +1,46 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 87732805, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/SDS-PART6-Test/train-00000-of-00001.parquet": { + "num_bytes": 87551938, + "checksum": null + } + }, + "download_size": 87551938, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 175284743, + "splits": { + "train": { + "name": "train", + "num_bytes": 87732805, + "num_examples": 100, + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part6_30s_ds_human_test/state.json b/examples/imda_part6_30s_ds_human_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea777dc78462de0834cc451bd7cd4654b7a368a5 --- /dev/null +++ b/examples/imda_part6_30s_ds_human_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "ac6467866ca75a14", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/imda_part6_30s_sqa_human_test/data-00000-of-00001.arrow b/examples/imda_part6_30s_sqa_human_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..a3389c234cdd6d6a39585ae6452220cb5b58f8c0 --- /dev/null +++ b/examples/imda_part6_30s_sqa_human_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4f85408e5bbfdb2818f0363c8833ae15e91e363a0631213bbcb12203c9bec5c +size 289416 diff --git a/examples/imda_part6_30s_sqa_human_test/dataset_info.json b/examples/imda_part6_30s_sqa_human_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eef07568db0c95f9290454867cb345ec0c262443 --- /dev/null +++ b/examples/imda_part6_30s_sqa_human_test/dataset_info.json @@ -0,0 +1,46 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "multitask-national-speech-corpus-v1", + "dataset_size": 87727544, + "description": "", + "download_checksums": { + "hf://datasets/MERaLiON/Multitask-National-Speech-Corpus-v1@d169debbce4a2dc07f7293360d3eed33c06793b9/SQA-PART6-Test/train-00000-of-00001.parquet": { + "num_bytes": 87554027, + "checksum": null + } + }, + "download_size": 87554027, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 175281571, + "splits": { + "train": { + "name": "train", + "num_bytes": 87727544, + "num_examples": 100, + "dataset_name": "multitask-national-speech-corpus-v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/imda_part6_30s_sqa_human_test/state.json b/examples/imda_part6_30s_sqa_human_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..9518e966b0eefa080e82e8feb3235c21599723ce --- /dev/null +++ b/examples/imda_part6_30s_sqa_human_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d00fe70e0b3a219d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/librispeech_test_clean/data-00000-of-00001.arrow b/examples/librispeech_test_clean/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..6f14e7c97489ca76717b1e51f2b15d4fba10f351 --- /dev/null +++ b/examples/librispeech_test_clean/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54bb0104a039ef5ab93c6168db755ebb72e7caa8b21f5032a82fb732f62ce63a +size 120056 diff --git a/examples/librispeech_test_clean/dataset_info.json b/examples/librispeech_test_clean/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..f2eb49b684b501be36a38bc09ff0ed550c606218 --- /dev/null +++ b/examples/librispeech_test_clean/dataset_info.json @@ -0,0 +1,53 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "librispeech_test_clean_v2", + "dataset_size": 622901741, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/librispeech_test_clean_v2@80ac8624e83d495d4bb9313dbb311b57ea13b4da/data/test-00000-of-00002.parquet": { + "num_bytes": 304689956, + "checksum": null + }, + "hf://datasets/AudioLLMs/librispeech_test_clean_v2@80ac8624e83d495d4bb9313dbb311b57ea13b4da/data/test-00001-of-00002.parquet": { + "num_bytes": 309185244, + "checksum": null + } + }, + "download_size": 613875200, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1236776941, + "splits": { + "test": { + "name": "test", + "num_bytes": 622901741, + "num_examples": 2617, + "shard_lengths": [ + 2209, + 408 + ], + "dataset_name": "librispeech_test_clean_v2" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/librispeech_test_clean/state.json b/examples/librispeech_test_clean/state.json new file mode 100644 index 0000000000000000000000000000000000000000..84e069642eb7eda6e2d904c3e8ab74789bde1b8f --- /dev/null +++ b/examples/librispeech_test_clean/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "4b1a709df12e485a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/librispeech_test_other/data-00000-of-00001.arrow b/examples/librispeech_test_other/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..a4b1dc7d6e2f76a5e23bfa50c6210d037118f8e5 --- /dev/null +++ b/examples/librispeech_test_other/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdaea5551d51340dcec0cc16688fd5409f558122e1b22ea000f841396383d687 +size 43984 diff --git a/examples/librispeech_test_other/dataset_info.json b/examples/librispeech_test_other/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..a154a68b4ea4336b27341b825d3af1249bdf599b --- /dev/null +++ b/examples/librispeech_test_other/dataset_info.json @@ -0,0 +1,53 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "librispeech_test_other_v2", + "dataset_size": 615751627, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/librispeech_test_other_v2@4417cb8c3515a71afeadf36df1d30eb42952d671/data/test-00000-of-00002.parquet": { + "num_bytes": 301589972, + "checksum": null + }, + "hf://datasets/AudioLLMs/librispeech_test_other_v2@4417cb8c3515a71afeadf36df1d30eb42952d671/data/test-00001-of-00002.parquet": { + "num_bytes": 297990516, + "checksum": null + } + }, + "download_size": 599580488, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1215332115, + "splits": { + "test": { + "name": "test", + "num_bytes": 615751627, + "num_examples": 2935, + "shard_lengths": [ + 2468, + 467 + ], + "dataset_name": "librispeech_test_other_v2" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/librispeech_test_other/state.json b/examples/librispeech_test_other/state.json new file mode 100644 index 0000000000000000000000000000000000000000..f4f4953a1537437c75fea12e565532d201cfcf37 --- /dev/null +++ b/examples/librispeech_test_other/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7930e73a3e6507f6", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/lotus_thai_th_30_asr/data-00000-of-00001.arrow b/examples/lotus_thai_th_30_asr/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..3e2aa8f5c8ca850714b55864051b4dce742fb994 --- /dev/null +++ b/examples/lotus_thai_th_30_asr/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f45920f67edae32586e36a5bd3fcf1f41383df3ea471ecc2bc033697bc6facb5 +size 55856 diff --git a/examples/lotus_thai_th_30_asr/dataset_info.json b/examples/lotus_thai_th_30_asr/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..fc59a37c9d0dcc5e1c8b88421e3ecbdf38db2ba1 --- /dev/null +++ b/examples/lotus_thai_th_30_asr/dataset_info.json @@ -0,0 +1,28 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "audio_length": { + "dtype": "float64", + "_type": "Value" + }, + "language": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/lotus_thai_th_30_asr/state.json b/examples/lotus_thai_th_30_asr/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad01b34fa9c5a3afaeb4d2836fd3fbb7b62f530b --- /dev/null +++ b/examples/lotus_thai_th_30_asr/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "47c0c22dd85ae590", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/mediacorp_short_test/data-00000-of-00001.arrow b/examples/mediacorp_short_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..29d676ef73f61b791ba6c39900488b04da3fd326 --- /dev/null +++ b/examples/mediacorp_short_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed3f12b98df5fbd3c00472a5178eda9550db9482729c5c584e51282dd8473bf +size 191704 diff --git a/examples/mediacorp_short_test/dataset_info.json b/examples/mediacorp_short_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/mediacorp_short_test/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/mediacorp_short_test/state.json b/examples/mediacorp_short_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..90042f74a20cc3135699c7c6f59d3bbcd2e5dac7 --- /dev/null +++ b/examples/mediacorp_short_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "08dda474446b50fb", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/mediacorp_test/data-00000-of-00001.arrow b/examples/mediacorp_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..19745331cf5b1a458b60c4c02b5c333fcbb346a6 --- /dev/null +++ b/examples/mediacorp_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f4185a97b29b86d7410cd9d3fb9cefd5fad7798cca19147a03fd94bc9feae33 +size 485312 diff --git a/examples/mediacorp_test/dataset_info.json b/examples/mediacorp_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/mediacorp_test/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/mediacorp_test/state.json b/examples/mediacorp_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..2aa0a8f7e8fc2a085893be83382cb2279c2cdce5 --- /dev/null +++ b/examples/mediacorp_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "424bed9dc12b325f", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/meld_emotion_test/data-00000-of-00001.arrow b/examples/meld_emotion_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ad85981e72f1fab37aecedf314b8ba52521eed86 --- /dev/null +++ b/examples/meld_emotion_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e7ece7cf709b53bd2ac8a50fd19b623622965e2c925a63b309d51784c464dd1 +size 22160 diff --git a/examples/meld_emotion_test/dataset_info.json b/examples/meld_emotion_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..599293c7e4db5f3116056267794aae08ca52a4f0 --- /dev/null +++ b/examples/meld_emotion_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "meld_emotion_test", + "dataset_size": 280597621, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/meld_emotion_test@005648394595a101e7c4ebeddd70043e7ba4a7a7/data/test-00000-of-00001.parquet": { + "num_bytes": 278685107, + "checksum": null + } + }, + "download_size": 278685107, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 559282728, + "splits": { + "test": { + "name": "test", + "num_bytes": 280597621, + "num_examples": 2610, + "dataset_name": "meld_emotion_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/meld_emotion_test/state.json b/examples/meld_emotion_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..c0fa8fc7fc88388358fe6b10ff9b2ebc46bd57ac --- /dev/null +++ b/examples/meld_emotion_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "dccaa2ccaaf712bb", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/meld_sentiment_test/data-00000-of-00001.arrow b/examples/meld_sentiment_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..55db700397eab98a25a35eddc7864584bf72822b --- /dev/null +++ b/examples/meld_sentiment_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46b6a5ecc57458a6647b06181c9a83275bfad23ac004d11abea6a1f10b04fcad +size 26056 diff --git a/examples/meld_sentiment_test/dataset_info.json b/examples/meld_sentiment_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..8fbc0313b07ed62e6d1761d529637b0d6b997de9 --- /dev/null +++ b/examples/meld_sentiment_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "meld_sentiment_test", + "dataset_size": 280598087, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/meld_sentiment_test@912e2aea9d2b2b8f79678fe4d5e89c98539548e7/data/test-00000-of-00001.parquet": { + "num_bytes": 278679268, + "checksum": null + } + }, + "download_size": 278679268, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 559277355, + "splits": { + "test": { + "name": "test", + "num_bytes": 280598087, + "num_examples": 2610, + "dataset_name": "meld_sentiment_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/meld_sentiment_test/state.json b/examples/meld_sentiment_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad340c8cfac295c08ea1a88a68c0ccf9d160eb05 --- /dev/null +++ b/examples/meld_sentiment_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "781aa67efd07d11c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/mmau_mini/data-00000-of-00001.arrow b/examples/mmau_mini/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9877fcea32fa4cadda9834fc18c773b86579063d --- /dev/null +++ b/examples/mmau_mini/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f378086cad8c5f92e69d5d52e2aa7df44e5d163cc0c9c28e9a3be75b085e695 +size 157576 diff --git a/examples/mmau_mini/dataset_info.json b/examples/mmau_mini/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..350b531b4f47509c8b5fb53c2b228021d77ce11f --- /dev/null +++ b/examples/mmau_mini/dataset_info.json @@ -0,0 +1,83 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "MMAU-mini", + "dataset_name": "mmau-mini", + "dataset_size": 437620992, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/MMAU-mini@6da87f361509287d0e45247bee9fc79b1b940f2a/MMAU-mini/train-00000-of-00001.parquet": { + "num_bytes": 398538423, + "checksum": null + } + }, + "download_size": 398538423, + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "feature": { + "dtype": "string", + "_type": "Value" + }, + "_type": "Sequence" + }, + "answer": { + "dtype": "string", + "_type": "Value" + }, + "other_attributes": { + "id": { + "dtype": "string", + "_type": "Value" + }, + "dataset": { + "dtype": "string", + "_type": "Value" + }, + "task": { + "dtype": "string", + "_type": "Value" + }, + "split": { + "dtype": "string", + "_type": "Value" + }, + "category": { + "dtype": "string", + "_type": "Value" + }, + "sub-category": { + "dtype": "string", + "_type": "Value" + }, + "difficulty": { + "dtype": "string", + "_type": "Value" + } + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 836159415, + "splits": { + "test": { + "name": "test", + "num_bytes": 437620992, + "num_examples": 1000, + "dataset_name": "mmau-mini" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/mmau_mini/state.json b/examples/mmau_mini/state.json new file mode 100644 index 0000000000000000000000000000000000000000..dbbc7793156c0653e92aadf869c0c62a80641aec --- /dev/null +++ b/examples/mmau_mini/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d2dfac069381ab81", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/muchomusic_test/data-00000-of-00001.arrow b/examples/muchomusic_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7a37eb3d23a34285a6f37269c90318f9de74f384 --- /dev/null +++ b/examples/muchomusic_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:878c8ef051e16b20e02827881cd6c37b7921b67842d677c2352044ca80c1947e +size 113512 diff --git a/examples/muchomusic_test/dataset_info.json b/examples/muchomusic_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..db23a770ff65443288a78d145b99c8ed76485acf --- /dev/null +++ b/examples/muchomusic_test/dataset_info.json @@ -0,0 +1,66 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "mu_chomusic_test", + "dataset_size": 1533069124, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/mu_chomusic_test@47531859d1be6f6389c5dfd6a1f8c1354f1ba375/data/test-00000-of-00004.parquet": { + "num_bytes": 649980100, + "checksum": null + }, + "hf://datasets/AudioLLMs/mu_chomusic_test@47531859d1be6f6389c5dfd6a1f8c1354f1ba375/data/test-00001-of-00004.parquet": { + "num_bytes": 110264321, + "checksum": null + }, + "hf://datasets/AudioLLMs/mu_chomusic_test@47531859d1be6f6389c5dfd6a1f8c1354f1ba375/data/test-00002-of-00004.parquet": { + "num_bytes": 52766135, + "checksum": null + }, + "hf://datasets/AudioLLMs/mu_chomusic_test@47531859d1be6f6389c5dfd6a1f8c1354f1ba375/data/test-00003-of-00004.parquet": { + "num_bytes": 49024006, + "checksum": null + } + }, + "download_size": 862034562, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "choices": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2395103686, + "splits": { + "test": { + "name": "test", + "num_bytes": 1533069124, + "num_examples": 1187, + "shard_lengths": [ + 200, + 197, + 790 + ], + "dataset_name": "mu_chomusic_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/muchomusic_test/state.json b/examples/muchomusic_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..f5b4c6d5765dd39c2399e710c296f3742ac7a232 --- /dev/null +++ b/examples/muchomusic_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d734774230d75b2b", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/openhermes_audio_test/data-00000-of-00001.arrow b/examples/openhermes_audio_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..a618749504c8889b3482b8467e5335e1f6363398 --- /dev/null +++ b/examples/openhermes_audio_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5ba6e97e0259a56244495daef7be591ef59d445cdc689e2d338aab7899f8f0 +size 42856 diff --git a/examples/openhermes_audio_test/dataset_info.json b/examples/openhermes_audio_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..a20a4c8eee9b4065b34f9bbe7eb5de4f2ec45f3a --- /dev/null +++ b/examples/openhermes_audio_test/dataset_info.json @@ -0,0 +1,49 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "openhermes_instruction_test", + "dataset_size": 19093328, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/openhermes_instruction_test@574f252dd1847b08bbb4331c9d7100bd872f0cfe/data/test-00000-of-00001.parquet": { + "num_bytes": 17280529, + "checksum": null + } + }, + "download_size": 17280529, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "speech_instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 36373857, + "splits": { + "test": { + "name": "test", + "num_bytes": 19093328, + "num_examples": 100, + "dataset_name": "openhermes_instruction_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/openhermes_audio_test/state.json b/examples/openhermes_audio_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..454d3808c1d4abc4fee7c53cb92ecd0488bed847 --- /dev/null +++ b/examples/openhermes_audio_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b199099538e5cd2b", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/parliament_short_test/data-00000-of-00001.arrow b/examples/parliament_short_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2c6809ad80e9f19610e47c25f35d6120bde48796 --- /dev/null +++ b/examples/parliament_short_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a61377a89079ce7d4406523d2fa13f509f52e18a888a32e680a9d1cd03fe6e9a +size 199904 diff --git a/examples/parliament_short_test/dataset_info.json b/examples/parliament_short_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/parliament_short_test/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/parliament_short_test/state.json b/examples/parliament_short_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..81487e7c030b28b975ca474a0f13226c99a5d45a --- /dev/null +++ b/examples/parliament_short_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d356db89fec9fc60", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/parliament_test/data-00000-of-00001.arrow b/examples/parliament_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..b7a90a396e73080bdca89ca9c20b0c60f2fa993d --- /dev/null +++ b/examples/parliament_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2450713fbf17eb743f81b3e9ee64092a12256f5f46a673588b92a294e8aef25 +size 65820512 diff --git a/examples/parliament_test/dataset_info.json b/examples/parliament_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/parliament_test/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/parliament_test/state.json b/examples/parliament_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..6897dfee089259fc950240f81739b8d052d917b7 --- /dev/null +++ b/examples/parliament_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a73f4b7f9dd8a25c", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/peoples_speech_test/data-00000-of-00001.arrow b/examples/peoples_speech_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..1583609268b1c7ef70bac9ad8421fdf38123a2fe --- /dev/null +++ b/examples/peoples_speech_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c596dc6cff20ea0a74df9d6cd89d559dff4ac90b0739bdfa4f6efa566c4aa785 +size 31112 diff --git a/examples/peoples_speech_test/dataset_info.json b/examples/peoples_speech_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..7f3139c0f8188f4b10689f70db9456cb405ef043 --- /dev/null +++ b/examples/peoples_speech_test/dataset_info.json @@ -0,0 +1,113 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "peoples_speech_test_v2", + "dataset_size": 6826772300, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00000-of-00014.parquet": { + "num_bytes": 446512579, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00001-of-00014.parquet": { + "num_bytes": 427581119, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00002-of-00014.parquet": { + "num_bytes": 459708772, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00003-of-00014.parquet": { + "num_bytes": 511685917, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00004-of-00014.parquet": { + "num_bytes": 545044792, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00005-of-00014.parquet": { + "num_bytes": 519130762, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00006-of-00014.parquet": { + "num_bytes": 507914126, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00007-of-00014.parquet": { + "num_bytes": 485830727, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00008-of-00014.parquet": { + "num_bytes": 503937198, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00009-of-00014.parquet": { + "num_bytes": 536385936, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00010-of-00014.parquet": { + "num_bytes": 447877610, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00011-of-00014.parquet": { + "num_bytes": 505056912, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00012-of-00014.parquet": { + "num_bytes": 427348723, + "checksum": null + }, + "hf://datasets/AudioLLMs/peoples_speech_test_v2@a9982cc64c0b586336472a101ca52d8f82dec4bb/data/test-00013-of-00014.parquet": { + "num_bytes": 482995107, + "checksum": null + } + }, + "download_size": 6807010280, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 13633782580, + "splits": { + "test": { + "name": "test", + "num_bytes": 6826772300, + "num_examples": 32603, + "shard_lengths": [ + 2629, + 2829, + 2329, + 2529, + 2029, + 2229, + 2429, + 2529, + 2329, + 2129, + 2629, + 2428, + 2628, + 928 + ], + "dataset_name": "peoples_speech_test_v2" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/peoples_speech_test/state.json b/examples/peoples_speech_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..79fcb46390270cba9cab28d7306eeca7b43018f7 --- /dev/null +++ b/examples/peoples_speech_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "73f984e17851fe6a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/public_sg_speech_qa_test/data-00000-of-00001.arrow b/examples/public_sg_speech_qa_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..231946bca03c71bc69ce1aed3a64a02115030a77 --- /dev/null +++ b/examples/public_sg_speech_qa_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727e33a340cb1b02d260a62c80181992dd1fc71730178bd8d5b43f23ff9dc6f6 +size 354712 diff --git a/examples/public_sg_speech_qa_test/dataset_info.json b/examples/public_sg_speech_qa_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..84a5225fd0dd6f836a27d24c6c32d93adbcb6388 --- /dev/null +++ b/examples/public_sg_speech_qa_test/dataset_info.json @@ -0,0 +1,53 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "public_sg_speech_qa_test", + "dataset_size": 877974941, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/public_sg_speech_qa_test@ec4bcaf2615eaba0ca30b86f1f53ce4acb620f85/data/test-00000-of-00002.parquet": { + "num_bytes": 157735631, + "checksum": null + }, + "hf://datasets/AudioLLMs/public_sg_speech_qa_test@ec4bcaf2615eaba0ca30b86f1f53ce4acb620f85/data/test-00001-of-00002.parquet": { + "num_bytes": 164640575, + "checksum": null + } + }, + "download_size": 322376206, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1200351147, + "splits": { + "test": { + "name": "test", + "num_bytes": 877974941, + "num_examples": 688, + "shard_lengths": [ + 444, + 244 + ], + "dataset_name": "public_sg_speech_qa_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/public_sg_speech_qa_test/state.json b/examples/public_sg_speech_qa_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..4a57b0a5f30ec6a09e6e5c56cf2982078eaa1a1e --- /dev/null +++ b/examples/public_sg_speech_qa_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "ec4cfed44f745103", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/seame_dev_man/data-00000-of-00001.arrow b/examples/seame_dev_man/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c005b3b13a68d1ed5a8b0a9767e19c4ced20aff6 --- /dev/null +++ b/examples/seame_dev_man/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a601a87e62e0fd07013ea9b121f76f18b5b22c33950f5e974a1882833e7af75 +size 40528 diff --git a/examples/seame_dev_man/dataset_info.json b/examples/seame_dev_man/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/seame_dev_man/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/seame_dev_man/state.json b/examples/seame_dev_man/state.json new file mode 100644 index 0000000000000000000000000000000000000000..4d48dad7bff78ab940e424cea5408bc9116b868b --- /dev/null +++ b/examples/seame_dev_man/state.json @@ -0,0 +1,17 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "376f7bb4c3aad318", + "_format_columns": [ + "answer", + "context", + "instruction" + ], + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/seame_dev_sge/data-00000-of-00001.arrow b/examples/seame_dev_sge/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..064451bce6ffd5dc3d27a67616f110bbd20c8017 --- /dev/null +++ b/examples/seame_dev_sge/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ae7a3015a3fece838f2f016014c7e1d72bc73cd75179d69d692f98b0d24315b +size 31968 diff --git a/examples/seame_dev_sge/dataset_info.json b/examples/seame_dev_sge/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/seame_dev_sge/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/seame_dev_sge/state.json b/examples/seame_dev_sge/state.json new file mode 100644 index 0000000000000000000000000000000000000000..7dcab0fce65f56f6eae5205befd2fdd5ddf14d19 --- /dev/null +++ b/examples/seame_dev_sge/state.json @@ -0,0 +1,17 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "b28127d2713aa323", + "_format_columns": [ + "answer", + "context", + "instruction" + ], + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/slue_p2_sqa5_test/data-00000-of-00001.arrow b/examples/slue_p2_sqa5_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..bf2f3b26bd61cb01c05ae604f7ade385d5266c74 --- /dev/null +++ b/examples/slue_p2_sqa5_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47372484d1cbbc94a8d9e4596d018d0d42a244b895a812a4085577e14ff92254 +size 406832 diff --git a/examples/slue_p2_sqa5_test/dataset_info.json b/examples/slue_p2_sqa5_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..209813052343e4993c75f87fb39ac9074ee965cd --- /dev/null +++ b/examples/slue_p2_sqa5_test/dataset_info.json @@ -0,0 +1,53 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "slue_p2_sqa5_test", + "dataset_size": 520438061, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/slue_p2_sqa5_test@24809a6ef2e1243543d97a993c9b12765f0a0bff/data/test-00000-of-00002.parquet": { + "num_bytes": 192101109, + "checksum": null + }, + "hf://datasets/AudioLLMs/slue_p2_sqa5_test@24809a6ef2e1243543d97a993c9b12765f0a0bff/data/test-00001-of-00002.parquet": { + "num_bytes": 240700847, + "checksum": null + } + }, + "download_size": 432801956, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 953240017, + "splits": { + "test": { + "name": "test", + "num_bytes": 520438061, + "num_examples": 408, + "shard_lengths": [ + 404, + 4 + ], + "dataset_name": "slue_p2_sqa5_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/slue_p2_sqa5_test/state.json b/examples/slue_p2_sqa5_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..69b44db22b05f7cd56a7c45df94d5edca730643c --- /dev/null +++ b/examples/slue_p2_sqa5_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a06c999e9f8b4572", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/spoken_squad_test/data-00000-of-00001.arrow b/examples/spoken_squad_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9217a72732193e4b3553a1ed3b87701add32a25e --- /dev/null +++ b/examples/spoken_squad_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acd942b781d55cf469c585c39dbf85fcd50d030fe02e085ba986c0522a9434c8 +size 593136 diff --git a/examples/spoken_squad_test/dataset_info.json b/examples/spoken_squad_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..91e3e0716f6fb6d5cc045903dceca3252aa41c84 --- /dev/null +++ b/examples/spoken_squad_test/dataset_info.json @@ -0,0 +1,146 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "spoken_squad_test_v1", + "dataset_size": 10453862717, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00000-of-00021.parquet": { + "num_bytes": 145611984, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00001-of-00021.parquet": { + "num_bytes": 173640214, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00002-of-00021.parquet": { + "num_bytes": 164368935, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00003-of-00021.parquet": { + "num_bytes": 157849433, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00004-of-00021.parquet": { + "num_bytes": 182331320, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00005-of-00021.parquet": { + "num_bytes": 182313865, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00006-of-00021.parquet": { + "num_bytes": 157618414, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00007-of-00021.parquet": { + "num_bytes": 176230713, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00008-of-00021.parquet": { + "num_bytes": 162560736, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00009-of-00021.parquet": { + "num_bytes": 168135653, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00010-of-00021.parquet": { + "num_bytes": 163013837, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00011-of-00021.parquet": { + "num_bytes": 167050371, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00012-of-00021.parquet": { + "num_bytes": 154941646, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00013-of-00021.parquet": { + "num_bytes": 175502166, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00014-of-00021.parquet": { + "num_bytes": 169909873, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00015-of-00021.parquet": { + "num_bytes": 137990873, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00016-of-00021.parquet": { + "num_bytes": 142416561, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00017-of-00021.parquet": { + "num_bytes": 152122039, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00018-of-00021.parquet": { + "num_bytes": 170697262, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00019-of-00021.parquet": { + "num_bytes": 150092854, + "checksum": null + }, + "hf://datasets/AudioLLMs/spoken_squad_test_v1@b55aab98726d0eab95eeef1ee9992a0532b3226e/data/test-00020-of-00021.parquet": { + "num_bytes": 149210870, + "checksum": null + } + }, + "download_size": 3403609619, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 13857472336, + "splits": { + "test": { + "name": "test", + "num_bytes": 10453862717, + "num_examples": 5351, + "shard_lengths": [ + 355, + 255, + 255, + 255, + 255, + 355, + 255, + 255, + 310, + 255, + 255, + 255, + 355, + 255, + 355, + 309, + 354, + 354, + 54 + ], + "dataset_name": "spoken_squad_test_v1" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/spoken_squad_test/state.json b/examples/spoken_squad_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..70689a0dc9deb298e8e214d8400423618599c355 --- /dev/null +++ b/examples/spoken_squad_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "1308ae85fe4d7a20", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/ste_test3/data-00000-of-00001.arrow b/examples/ste_test3/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..5fe6733318d864da3c5dc18390f1628ffad7284b --- /dev/null +++ b/examples/ste_test3/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11483198938489ac5a0bfa1dac455b0cf91b96eae065ed7cab07dc5f518b4b6e +size 2998784 diff --git a/examples/ste_test3/dataset_info.json b/examples/ste_test3/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/ste_test3/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/ste_test3/state.json b/examples/ste_test3/state.json new file mode 100644 index 0000000000000000000000000000000000000000..2fc4edc400da143a4d817b55eccb26a40a6d4a0a --- /dev/null +++ b/examples/ste_test3/state.json @@ -0,0 +1,17 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "abff8d785d59685e", + "_format_columns": [ + "answer", + "context", + "instruction" + ], + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/tedlium3_long_form_test/data-00000-of-00001.arrow b/examples/tedlium3_long_form_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..2e5558b9f01bf1ac0756fb21e7007edf03955fe1 --- /dev/null +++ b/examples/tedlium3_long_form_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74333ea28b36d285183d8e01f9d6dfbdde438aa426a1e01c5ad7e09168abf58e +size 10878952 diff --git a/examples/tedlium3_long_form_test/dataset_info.json b/examples/tedlium3_long_form_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..d3e6327ec36b1e94b41b5c3070f1f8ae95e31e14 --- /dev/null +++ b/examples/tedlium3_long_form_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "tedlium3_long_form_test_v2", + "dataset_size": 301640781, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/tedlium3_long_form_test_v2@097511c11b7b8550e17f938bd2bcb127de0ded13/data/test-00000-of-00001.parquet": { + "num_bytes": 301457168, + "checksum": null + } + }, + "download_size": 301457168, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 603097949, + "splits": { + "test": { + "name": "test", + "num_bytes": 301640781, + "num_examples": 11, + "dataset_name": "tedlium3_long_form_test_v2" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/tedlium3_long_form_test/state.json b/examples/tedlium3_long_form_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..71a2afb30912de104b8b8e3e25e2f963a29b894f --- /dev/null +++ b/examples/tedlium3_long_form_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "70079e1e42b9947f", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/tedlium3_test/data-00000-of-00001.arrow b/examples/tedlium3_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..ce082e1bcc48c20bf5faa88d0ec24e0ed07fc117 --- /dev/null +++ b/examples/tedlium3_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed3f9228b6d8c33f60e6378c31496ae81f4fd2d1e855758354df64e91484d7a1 +size 82768 diff --git a/examples/tedlium3_test/dataset_info.json b/examples/tedlium3_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..f051a5708266ce96f078eb13144ae6c07c69a730 --- /dev/null +++ b/examples/tedlium3_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "tedlium3_test_v2", + "dataset_size": 301421712, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/tedlium3_test_v2@aef8b18c0ee54a0123de5e671b2b889dbdd0b5c6/data/test-00000-of-00001.parquet": { + "num_bytes": 301257849, + "checksum": null + } + }, + "download_size": 301257849, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 602679561, + "splits": { + "test": { + "name": "test", + "num_bytes": 301421712, + "num_examples": 1142, + "dataset_name": "tedlium3_test_v2" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/tedlium3_test/state.json b/examples/tedlium3_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba16e862cb888f877714dd2cab29717551688113 --- /dev/null +++ b/examples/tedlium3_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8fe2d2b83bdccca4", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/ukusnews_short_test/data-00000-of-00001.arrow b/examples/ukusnews_short_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..0978a20330a05939dba8288ea3bab955682e5f76 --- /dev/null +++ b/examples/ukusnews_short_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a93438b1e9ea0a75e1c814f3ec01d35ccc6c66e734c877889566878d488ebcb +size 204272 diff --git a/examples/ukusnews_short_test/dataset_info.json b/examples/ukusnews_short_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/ukusnews_short_test/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/ukusnews_short_test/state.json b/examples/ukusnews_short_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..7aa57b6375095ed5df561c53e9baa4e001354789 --- /dev/null +++ b/examples/ukusnews_short_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e75ae6e9fe444ef3", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/ukusnews_test/data-00000-of-00001.arrow b/examples/ukusnews_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..172ff3521d9ccb560e93fbe049ee6df458da5b07 --- /dev/null +++ b/examples/ukusnews_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8753151458f62d967ee190d9d3cd35fa10688b331633e20628b9ce3b2aa232a8 +size 8098600 diff --git a/examples/ukusnews_test/dataset_info.json b/examples/ukusnews_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 --- /dev/null +++ b/examples/ukusnews_test/dataset_info.json @@ -0,0 +1,20 @@ +{ + "citation": "", + "description": "", + "features": { + "context": { + "sampling_rate": 16000, + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/examples/ukusnews_test/state.json b/examples/ukusnews_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..ade384adbda455d716d43767b2550dc5f7979e5e --- /dev/null +++ b/examples/ukusnews_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "382fcc5959facd14", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/voxceleb_accent_test/data-00000-of-00001.arrow b/examples/voxceleb_accent_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..67ec0c65842f8eeb3d8f3b213aac5387199713ee --- /dev/null +++ b/examples/voxceleb_accent_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2c74d8a3682d8ed9c03231ed85aff41e270af83702fc95eafe9c253477add30 +size 79032 diff --git a/examples/voxceleb_accent_test/dataset_info.json b/examples/voxceleb_accent_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..2852ec3f0550f803219aab3cb3648c518f9a914c --- /dev/null +++ b/examples/voxceleb_accent_test/dataset_info.json @@ -0,0 +1,58 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "voxceleb_accent_test", + "dataset_size": 1291489263, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/voxceleb_accent_test@d89c114a8337920f51879e1cf7b93a9c01697263/data/test-00000-of-00003.parquet": { + "num_bytes": 441572573, + "checksum": null + }, + "hf://datasets/AudioLLMs/voxceleb_accent_test@d89c114a8337920f51879e1cf7b93a9c01697263/data/test-00001-of-00003.parquet": { + "num_bytes": 409329531, + "checksum": null + }, + "hf://datasets/AudioLLMs/voxceleb_accent_test@d89c114a8337920f51879e1cf7b93a9c01697263/data/test-00002-of-00003.parquet": { + "num_bytes": 436070931, + "checksum": null + } + }, + "download_size": 1286973035, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2578462298, + "splits": { + "test": { + "name": "test", + "num_bytes": 1291489263, + "num_examples": 4874, + "shard_lengths": [ + 1925, + 2025, + 924 + ], + "dataset_name": "voxceleb_accent_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/voxceleb_accent_test/state.json b/examples/voxceleb_accent_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..dbb25b98ee8a23098cc9ac9cfb0a5087477e6363 --- /dev/null +++ b/examples/voxceleb_accent_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "a59871b9650aef63", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/voxceleb_gender_test/data-00000-of-00001.arrow b/examples/voxceleb_gender_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d0c5b3ab5e103e65cd81b96ff34160d9e5f68017 --- /dev/null +++ b/examples/voxceleb_gender_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc807934a1df34e896cd8b9808e0275879f268659003fcda277f1f8fab370d8 +size 131688 diff --git a/examples/voxceleb_gender_test/dataset_info.json b/examples/voxceleb_gender_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..6595a9ce4190740214037d55501cb756302ab5d9 --- /dev/null +++ b/examples/voxceleb_gender_test/dataset_info.json @@ -0,0 +1,58 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "voxceleb_gender_test", + "dataset_size": 1291371447, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/voxceleb_gender_test@0844e44484101caf53beaf8b243ec574de016359/data/test-00000-of-00003.parquet": { + "num_bytes": 441577193, + "checksum": null + }, + "hf://datasets/AudioLLMs/voxceleb_gender_test@0844e44484101caf53beaf8b243ec574de016359/data/test-00001-of-00003.parquet": { + "num_bytes": 409333996, + "checksum": null + }, + "hf://datasets/AudioLLMs/voxceleb_gender_test@0844e44484101caf53beaf8b243ec574de016359/data/test-00002-of-00003.parquet": { + "num_bytes": 436075552, + "checksum": null + } + }, + "download_size": 1286986741, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 2578358188, + "splits": { + "test": { + "name": "test", + "num_bytes": 1291371447, + "num_examples": 4874, + "shard_lengths": [ + 1925, + 2025, + 924 + ], + "dataset_name": "voxceleb_gender_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/voxceleb_gender_test/state.json b/examples/voxceleb_gender_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..49c7468c2df5568015f93463864984e38e848ded --- /dev/null +++ b/examples/voxceleb_gender_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "bcb2aa81fc7ecdbd", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/wavcaps_qa_test/data-00000-of-00001.arrow b/examples/wavcaps_qa_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c10d7dcb4487e9e861f8f241377b449ba5896372 --- /dev/null +++ b/examples/wavcaps_qa_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cafc73a5d78348629db3df191540d29547b5d154d7abeb0caa4513d255d3fcf +size 84800 diff --git a/examples/wavcaps_qa_test/dataset_info.json b/examples/wavcaps_qa_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0d4d41f4549a82d2f91c97d51817982e540b03 --- /dev/null +++ b/examples/wavcaps_qa_test/dataset_info.json @@ -0,0 +1,45 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "wavcaps_qa_test_v3", + "dataset_size": 100124750, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/wavcaps_qa_test_v3@9be38b7c3c8b971ec42cee607a7670a3ec2d116a/data/test-00000-of-00001.parquet": { + "num_bytes": 94442458, + "checksum": null + } + }, + "download_size": 94442458, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 194567208, + "splits": { + "test": { + "name": "test", + "num_bytes": 100124750, + "num_examples": 304, + "dataset_name": "wavcaps_qa_test_v3" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/wavcaps_qa_test/state.json b/examples/wavcaps_qa_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d4a1ed20549f890d4a7f6dcf8267812a293804b8 --- /dev/null +++ b/examples/wavcaps_qa_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "d2d32569314db05d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/wavcaps_test/data-00000-of-00001.arrow b/examples/wavcaps_test/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..68ed82aac30d1c0dba6d146c443500a97aad0684 --- /dev/null +++ b/examples/wavcaps_test/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b18a55158a5c91d174ea2d4871db716d03b038c4c8430527c229a9d6bc8b4d0 +size 94528 diff --git a/examples/wavcaps_test/dataset_info.json b/examples/wavcaps_test/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..609815d254f5f803f13197c9e41187605823e3fe --- /dev/null +++ b/examples/wavcaps_test/dataset_info.json @@ -0,0 +1,53 @@ +{ + "builder_name": "parquet", + "citation": "", + "config_name": "default", + "dataset_name": "wavcaps_test", + "dataset_size": 566587531, + "description": "", + "download_checksums": { + "hf://datasets/AudioLLMs/wavcaps_test@3f87f655ebfb769579890d952504fd32fb853649/data/test-00000-of-00002.parquet": { + "num_bytes": 269177087, + "checksum": null + }, + "hf://datasets/AudioLLMs/wavcaps_test@3f87f655ebfb769579890d952504fd32fb853649/data/test-00001-of-00002.parquet": { + "num_bytes": 263361128, + "checksum": null + } + }, + "download_size": 532538215, + "features": { + "context": { + "_type": "Audio" + }, + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "answer": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "", + "size_in_bytes": 1099125746, + "splits": { + "test": { + "name": "test", + "num_bytes": 566587531, + "num_examples": 1730, + "shard_lengths": [ + 1565, + 165 + ], + "dataset_name": "wavcaps_test" + } + }, + "version": { + "version_str": "0.0.0", + "major": 0, + "minor": 0, + "patch": 0 + } +} \ No newline at end of file diff --git a/examples/wavcaps_test/state.json b/examples/wavcaps_test/state.json new file mode 100644 index 0000000000000000000000000000000000000000..509c08df9bf8be441f7785512a1a1db9348f78bb --- /dev/null +++ b/examples/wavcaps_test/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "c49898876fcdbb73", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/examples/ytb_asr_batch1/data-00000-of-00001.arrow b/examples/ytb_asr_batch1/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e2954d93e9866405341f17d06ecfaad679d4e44d --- /dev/null +++ b/examples/ytb_asr_batch1/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:155fbe0aee98f112df256dfe0562019e766de916bd7a2dc1b39ac50f0725476f +size 289792 diff --git a/examples/ytb_asr_batch1/dataset_info.json b/examples/ytb_asr_batch1/dataset_info.json index b5754bf0c92c28c7d87e0ad60d095ae4212cecf1..ad6be7e8a51413141e904ee813472f94021f2970 100644 --- a/examples/ytb_asr_batch1/dataset_info.json +++ b/examples/ytb_asr_batch1/dataset_info.json @@ -3,75 +3,16 @@ "description": "", "features": { "context": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "sampling_rate": 16000, + "_type": "Audio" }, "instruction": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "dtype": "string", + "_type": "Value" }, "answer": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } - }, - "other_attributes": { - "YouTube": { - "feature": { - "dtype": "string", - "_type": "Value" - }, - "_type": "Sequence" - }, - "YouTube End Time": { - "dtype": "float64", - "_type": "Value" - }, - "YouTube Start Time": { - "dtype": "float64", - "_type": "Value" - }, - "audio_path": { - "dtype": "string", - "_type": "Value" - }, - "index": { - "dtype": "string", - "_type": "Value" - }, - "segments": [ - { - "end_time": { - "dtype": "float64", - "_type": "Value" - }, - "start_time": { - "dtype": "float64", - "_type": "Value" - }, - "text": { - "dtype": "string", - "_type": "Value" - } - } - ] + "dtype": "string", + "_type": "Value" }, "language": { "dtype": "string", diff --git a/examples/ytb_asr_batch1/state.json b/examples/ytb_asr_batch1/state.json index cf216768b894da2450a328e78701000bf539fd96..1959c1b369cf376ab26cf91870bb3accf5e6ce8d 100644 --- a/examples/ytb_asr_batch1/state.json +++ b/examples/ytb_asr_batch1/state.json @@ -4,14 +4,13 @@ "filename": "data-00000-of-00001.arrow" } ], - "_fingerprint": "7360fb5a1a8ab82b", + "_fingerprint": "24ec52b8feb521b9", "_format_columns": [ "answer", "audio_length", "context", "instruction", - "language", - "other_attributes" + "language" ], "_format_kwargs": {}, "_format_type": null, diff --git a/examples/ytb_asr_batch2/data-00000-of-00001.arrow b/examples/ytb_asr_batch2/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..01bc5ba19760be6d8649f85c637e2c0ddc46d705 --- /dev/null +++ b/examples/ytb_asr_batch2/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:943d6e9d6f25d1b61804ec317c90e7de9240b7a54a4c6d2dbf9c0a8b9b8366ac +size 322112 diff --git a/examples/ytb_asr_batch2/dataset_info.json b/examples/ytb_asr_batch2/dataset_info.json index 421fca37d477b638bd13327f3f5d8edcf74ffd7e..ad6be7e8a51413141e904ee813472f94021f2970 100644 --- a/examples/ytb_asr_batch2/dataset_info.json +++ b/examples/ytb_asr_batch2/dataset_info.json @@ -3,48 +3,16 @@ "description": "", "features": { "context": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "sampling_rate": 16000, + "_type": "Audio" }, "instruction": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "dtype": "string", + "_type": "Value" }, "answer": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } - }, - "other_attributes": { - "emotion": { - "dtype": "string", - "_type": "Value" - }, - "file": { - "dtype": "string", - "_type": "Value" - }, - "index": { - "dtype": "string", - "_type": "Value" - } + "dtype": "string", + "_type": "Value" }, "language": { "dtype": "string", diff --git a/examples/ytb_asr_batch2/state.json b/examples/ytb_asr_batch2/state.json index 235fa4bab5d6e618e6f16a9d8883949f0d1985ad..9c873550bb816000106349e5b20afc7fdd6436ba 100644 --- a/examples/ytb_asr_batch2/state.json +++ b/examples/ytb_asr_batch2/state.json @@ -4,14 +4,13 @@ "filename": "data-00000-of-00001.arrow" } ], - "_fingerprint": "6c4b1fab13123a29", + "_fingerprint": "315db5a61f76ecac", "_format_columns": [ "answer", "audio_length", "context", "instruction", - "language", - "other_attributes" + "language" ], "_format_kwargs": {}, "_format_type": null, diff --git a/examples/ytb_asr_batch3_chinese/data-00000-of-00001.arrow b/examples/ytb_asr_batch3_chinese/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..347731cd772c0056f44000007429af3bd8d3b1b6 --- /dev/null +++ b/examples/ytb_asr_batch3_chinese/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d592acea0074c58c3c68a6c6c7083619f8047d6bd5993f5e0895bcf28cf6342 +size 299416 diff --git a/examples/ytb_asr_batch3_chinese/dataset_info.json b/examples/ytb_asr_batch3_chinese/dataset_info.json index a4e6ff45a2dcc5864e80dbee8e6f39b43dc610d3..ad6be7e8a51413141e904ee813472f94021f2970 100644 --- a/examples/ytb_asr_batch3_chinese/dataset_info.json +++ b/examples/ytb_asr_batch3_chinese/dataset_info.json @@ -3,34 +3,16 @@ "description": "", "features": { "context": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "sampling_rate": 16000, + "_type": "Audio" }, "instruction": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "dtype": "string", + "_type": "Value" }, "answer": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "dtype": "string", + "_type": "Value" }, "language": { "dtype": "string", @@ -39,28 +21,6 @@ "audio_length": { "dtype": "float64", "_type": "Value" - }, - "other_attributes": { - "youtube_name": { - "dtype": "string", - "_type": "Value" - }, - "start_time_in_sec": { - "dtype": "float32", - "_type": "Value" - }, - "end_time_in_sec": { - "dtype": "float32", - "_type": "Value" - }, - "link": { - "dtype": "string", - "_type": "Value" - }, - "percentage_of_chinese": { - "dtype": "string", - "_type": "Value" - } } }, "homepage": "", diff --git a/examples/ytb_asr_batch3_chinese/state.json b/examples/ytb_asr_batch3_chinese/state.json index 111dff0ff62f3f6bfc8fc266ef7a596ee88f15a9..9c7d0a8fbe1ddbbef8fd7d4d1cbc56780a699e9c 100644 --- a/examples/ytb_asr_batch3_chinese/state.json +++ b/examples/ytb_asr_batch3_chinese/state.json @@ -4,7 +4,7 @@ "filename": "data-00000-of-00001.arrow" } ], - "_fingerprint": "ee34dce79289288d", + "_fingerprint": "b1848f66b8960f55", "_format_columns": null, "_format_kwargs": {}, "_format_type": null, diff --git a/examples/ytb_asr_batch3_malay/data-00000-of-00001.arrow b/examples/ytb_asr_batch3_malay/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..005df151c188390d282d02b015c08dedc0beaf3a --- /dev/null +++ b/examples/ytb_asr_batch3_malay/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f913e69efda63a46837d8ba100f82f502669f9dfd647b1503642bceebb76a5a +size 505128 diff --git a/examples/ytb_asr_batch3_malay/dataset_info.json b/examples/ytb_asr_batch3_malay/dataset_info.json index 4b7d454867df85702d23932ef32523f887f14341..ad6be7e8a51413141e904ee813472f94021f2970 100644 --- a/examples/ytb_asr_batch3_malay/dataset_info.json +++ b/examples/ytb_asr_batch3_malay/dataset_info.json @@ -3,34 +3,16 @@ "description": "", "features": { "context": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "sampling_rate": 16000, + "_type": "Audio" }, "instruction": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "dtype": "string", + "_type": "Value" }, "answer": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "dtype": "string", + "_type": "Value" }, "language": { "dtype": "string", @@ -39,28 +21,6 @@ "audio_length": { "dtype": "float64", "_type": "Value" - }, - "other_attributes": { - "youtube_name": { - "dtype": "string", - "_type": "Value" - }, - "start_time_in_sec": { - "dtype": "float32", - "_type": "Value" - }, - "end_time_in_sec": { - "dtype": "float32", - "_type": "Value" - }, - "link": { - "dtype": "string", - "_type": "Value" - }, - "percentage_of_malay": { - "dtype": "string", - "_type": "Value" - } } }, "homepage": "", diff --git a/examples/ytb_asr_batch3_malay/state.json b/examples/ytb_asr_batch3_malay/state.json index 70087d8572efd7979edfef7855a5d54883f3d78d..7eb8f9eab1e5fdd6374f4bb560c407e9f72b9896 100644 --- a/examples/ytb_asr_batch3_malay/state.json +++ b/examples/ytb_asr_batch3_malay/state.json @@ -4,7 +4,7 @@ "filename": "data-00000-of-00001.arrow" } ], - "_fingerprint": "23fffb8632becbbd", + "_fingerprint": "b64db39453848f5b", "_format_columns": null, "_format_kwargs": {}, "_format_type": null, diff --git a/examples/ytb_asr_batch3_tamil/data-00000-of-00001.arrow b/examples/ytb_asr_batch3_tamil/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c61e97dee92879702b11ce11eb0edf1bef85d212 --- /dev/null +++ b/examples/ytb_asr_batch3_tamil/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13bedcf85067727cc84be070703d589c19021085dbf0bf69caa11d95e5af50f8 +size 804648 diff --git a/examples/ytb_asr_batch3_tamil/dataset_info.json b/examples/ytb_asr_batch3_tamil/dataset_info.json index f384bb20b474d082fdf5e571cdd314aa80887153..ad6be7e8a51413141e904ee813472f94021f2970 100644 --- a/examples/ytb_asr_batch3_tamil/dataset_info.json +++ b/examples/ytb_asr_batch3_tamil/dataset_info.json @@ -3,34 +3,16 @@ "description": "", "features": { "context": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "sampling_rate": 16000, + "_type": "Audio" }, "instruction": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "dtype": "string", + "_type": "Value" }, "answer": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "dtype": "string", + "_type": "Value" }, "language": { "dtype": "string", @@ -39,28 +21,6 @@ "audio_length": { "dtype": "float64", "_type": "Value" - }, - "other_attributes": { - "youtube_name": { - "dtype": "string", - "_type": "Value" - }, - "start_time_in_sec": { - "dtype": "float32", - "_type": "Value" - }, - "end_time_in_sec": { - "dtype": "float32", - "_type": "Value" - }, - "link": { - "dtype": "string", - "_type": "Value" - }, - "percentage_of_tamil": { - "dtype": "string", - "_type": "Value" - } } }, "homepage": "", diff --git a/examples/ytb_asr_batch3_tamil/state.json b/examples/ytb_asr_batch3_tamil/state.json index 070a1ae7bec5971e2de5445fb2353cea5403380c..60e0acc112e7f1dcf3ccf7c78b60bf535f2ddf19 100644 --- a/examples/ytb_asr_batch3_tamil/state.json +++ b/examples/ytb_asr_batch3_tamil/state.json @@ -4,7 +4,7 @@ "filename": "data-00000-of-00001.arrow" } ], - "_fingerprint": "eb504eb1889a4aac", + "_fingerprint": "d135ebb3b2708b61", "_format_columns": null, "_format_kwargs": {}, "_format_type": null, diff --git a/examples/ytb_pqa_batch1/data-00000-of-00001.arrow b/examples/ytb_pqa_batch1/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..6f72e946e94e11799a7b987162fc5af5caaba92a --- /dev/null +++ b/examples/ytb_pqa_batch1/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf6642b0badc26bf89a5cab4839e309df59e582676881b0214fde04ec25876f +size 786024 diff --git a/examples/ytb_pqa_batch1/dataset_info.json b/examples/ytb_pqa_batch1/dataset_info.json index b822bb17c82071de97955f83556db39c3c821560..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 100644 --- a/examples/ytb_pqa_batch1/dataset_info.json +++ b/examples/ytb_pqa_batch1/dataset_info.json @@ -3,63 +3,15 @@ "description": "", "features": { "context": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "sampling_rate": 16000, + "_type": "Audio" }, "instruction": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } - }, - "answer": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } - }, - "other_attributes": { - "End_Time": { - "dtype": "null", - "_type": "Value" - }, - "Link": { - "dtype": "null", - "_type": "Value" - }, - "Question_Type": { - "dtype": "null", - "_type": "Value" - }, - "Start_Time": { - "dtype": "null", - "_type": "Value" - }, - "Title": { - "dtype": "null", - "_type": "Value" - } - }, - "language": { "dtype": "string", "_type": "Value" }, - "audio_length": { - "dtype": "float64", + "answer": { + "dtype": "string", "_type": "Value" } }, diff --git a/examples/ytb_pqa_batch1/state.json b/examples/ytb_pqa_batch1/state.json index 77dca61c3439f0c2b1db097de7bf69a20716d843..7166e60c28dbeb0c693b05ae9f491fa3df0d26d6 100644 --- a/examples/ytb_pqa_batch1/state.json +++ b/examples/ytb_pqa_batch1/state.json @@ -4,14 +4,11 @@ "filename": "data-00000-of-00001.arrow" } ], - "_fingerprint": "51c95b1c01487908", + "_fingerprint": "64f4eb524b485792", "_format_columns": [ "answer", - "audio_length", "context", - "instruction", - "language", - "other_attributes" + "instruction" ], "_format_kwargs": {}, "_format_type": null, diff --git a/examples/ytb_sds_batch1/data-00000-of-00001.arrow b/examples/ytb_sds_batch1/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..38b7fc14de8c36195f6100a2c2479073a8c9a6e2 --- /dev/null +++ b/examples/ytb_sds_batch1/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ffe1613c2e0cb969f22a36132c6549d9a73dabed04afbba0650143f335c61a5 +size 286208 diff --git a/examples/ytb_sds_batch1/dataset_info.json b/examples/ytb_sds_batch1/dataset_info.json index 11f921d18bc7b52b370ee167ecbfadfcb8618eea..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 100644 --- a/examples/ytb_sds_batch1/dataset_info.json +++ b/examples/ytb_sds_batch1/dataset_info.json @@ -3,63 +3,15 @@ "description": "", "features": { "context": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "sampling_rate": 16000, + "_type": "Audio" }, "instruction": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } - }, - "answer": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } - }, - "other_attributes": { - "End_Time": { - "dtype": "float64", - "_type": "Value" - }, - "Link": { - "dtype": "string", - "_type": "Value" - }, - "Question_Type": { - "dtype": "int64", - "_type": "Value" - }, - "Start_Time": { - "dtype": "float64", - "_type": "Value" - }, - "Title": { - "dtype": "string", - "_type": "Value" - } - }, - "language": { "dtype": "string", "_type": "Value" }, - "audio_length": { - "dtype": "float64", + "answer": { + "dtype": "string", "_type": "Value" } }, diff --git a/examples/ytb_sds_batch1/state.json b/examples/ytb_sds_batch1/state.json index a67d6fa844f4fe7ce1544dddcfcee7454cb865b2..7e303953941192cab4c6feb35530e8da1860c841 100644 --- a/examples/ytb_sds_batch1/state.json +++ b/examples/ytb_sds_batch1/state.json @@ -4,14 +4,11 @@ "filename": "data-00000-of-00001.arrow" } ], - "_fingerprint": "001adf7ce82937e0", + "_fingerprint": "4391324da580d9d5", "_format_columns": [ "answer", - "audio_length", "context", - "instruction", - "language", - "other_attributes" + "instruction" ], "_format_kwargs": {}, "_format_type": null, diff --git a/examples/ytb_sqa_batch1/data-00000-of-00001.arrow b/examples/ytb_sqa_batch1/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9b35d8bfd4bc134f0b5263fb83fb866a142c04c9 --- /dev/null +++ b/examples/ytb_sqa_batch1/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfdff5eddac0891188842532f26f48fcaf87bde18d7a89dd4fbbed669ecbb2fe +size 277624 diff --git a/examples/ytb_sqa_batch1/dataset_info.json b/examples/ytb_sqa_batch1/dataset_info.json index 11f921d18bc7b52b370ee167ecbfadfcb8618eea..eaa26d3d5fe4410f72426a40f20ffa26002f6b05 100644 --- a/examples/ytb_sqa_batch1/dataset_info.json +++ b/examples/ytb_sqa_batch1/dataset_info.json @@ -3,63 +3,15 @@ "description": "", "features": { "context": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } + "sampling_rate": 16000, + "_type": "Audio" }, "instruction": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } - }, - "answer": { - "text": { - "dtype": "string", - "_type": "Value" - }, - "audio": { - "sampling_rate": 16000, - "_type": "Audio" - } - }, - "other_attributes": { - "End_Time": { - "dtype": "float64", - "_type": "Value" - }, - "Link": { - "dtype": "string", - "_type": "Value" - }, - "Question_Type": { - "dtype": "int64", - "_type": "Value" - }, - "Start_Time": { - "dtype": "float64", - "_type": "Value" - }, - "Title": { - "dtype": "string", - "_type": "Value" - } - }, - "language": { "dtype": "string", "_type": "Value" }, - "audio_length": { - "dtype": "float64", + "answer": { + "dtype": "string", "_type": "Value" } }, diff --git a/examples/ytb_sqa_batch1/state.json b/examples/ytb_sqa_batch1/state.json index e149f6e0e2bb6297f7a1df77fbc08d719db0f3cb..756e71a60f1f69a179e527418f3f7455a5047c97 100644 --- a/examples/ytb_sqa_batch1/state.json +++ b/examples/ytb_sqa_batch1/state.json @@ -4,14 +4,11 @@ "filename": "data-00000-of-00001.arrow" } ], - "_fingerprint": "f02f9612426fa5b1", + "_fingerprint": "f21943d877cf4be2", "_format_columns": [ "answer", - "audio_length", "context", - "instruction", - "language", - "other_attributes" + "instruction" ], "_format_kwargs": {}, "_format_type": null,