from datasets import load_dataset, concatenate_datasets
import json
import os

# 1. load the dataset and merge all splits
ds = load_dataset("espnet/ace-kising-segments", cache_dir="cache")
combined = concatenate_datasets([ds["train"], ds["validation"], ds["test"]])
# 2. filter rows by singer: barber
combined = combined.filter(lambda x: x["singer"] == "barber")
# 3. create new columns: note_midi_length counts the nonzero MIDI notes in
# note_midi; lyric_word_length counts the actual words in note_lyrics
# (or characters for, e.g., Chinese/Japanese), excluding <AP>/<SP>/"-" tokens
combined = combined.map(
    lambda x: {
        "note_midi_length": len([n for n in x["note_midi"] if n != 0]),
        "lyric_word_length": len(
            [word for word in x["note_lyrics"] if word not in ["<AP>", "<SP>", "-"]]
        ),
    }
)
# 4. sort by segment_id
combined = combined.sort("segment_id")
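# sorting groups each song's segments consecutively; the loop in step 5 relies on this ordering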
# 5. iterate over rows
prev_songid = None
prev_song_segment_id = None
song2note_lengths = {}
song2word_lengths = {}
for row in combined:
    # segment_id: kising_barber_{songid}_{song_segment_id}
    _, _, songid, song_segment_id = row["segment_id"].split("_")
    if prev_songid != songid:
        # a new song (other than the first) must start at segment "001"
        if prev_songid is not None:
            assert (
                song_segment_id == "001"
            ), f"prev_songid: {prev_songid}, songid: {songid}, song_segment_id: {song_segment_id}"
        song2note_lengths[f"kising_{songid}"] = [row["note_midi_length"]]
        song2word_lengths[f"kising_{songid}"] = [row["lyric_word_length"]]
    else:
        # within a song, segment numbers must be strictly increasing
        assert (
            int(song_segment_id) >= int(prev_song_segment_id) + 1
        ), f"prev_song_segment_id: {prev_song_segment_id}, song_segment_id: {song_segment_id}"
        song2note_lengths[f"kising_{songid}"].append(row["note_midi_length"])
        song2word_lengths[f"kising_{songid}"].append(row["lyric_word_length"])
    prev_songid = songid
    prev_song_segment_id = song_segment_id
# 6. write to json
os.makedirs("data", exist_ok=True)  # ensure the output directory exists
with open("data/song2note_lengths.json", "w") as f:
    json.dump(song2note_lengths, f, indent=4)
with open("data/song2word_lengths.json", "w") as f:
    json.dump(song2word_lengths, f, indent=4)
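# resulting shape: {"kising_<songid>": [length of segment 001, length of segment 002, ...], ...}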
# 7. push score segments to hub
# remove audio and singer columns
combined = combined.remove_columns(["audio", "singer"])
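# audio and singer are dropped so that only the score information is published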
# replace kising_barber_ with kising_
combined = combined.map(
    lambda x: {"segment_id": x["segment_id"].replace("kising_barber_", "kising_")}
)
# upload to hub
combined.push_to_hub("jhansss/kising_score_segments")
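# note: push_to_hub requires Hugging Face authentication,
# e.g. via `huggingface-cli login` or the HF_TOKEN environment variable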