File size: 4,026 Bytes
0dc360b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
Copyright 2025 Balacoon

Fetches samples from `balacoon/speech_gen_baselines` and
`balacoon/speech_gen_eval_testsets` datasets.
"""

import re
import logging
import requests

import pandas as pd

from huggingface_hub import hf_hub_url


def get_samples_data(system_type: str, models: list[str], dataset: str) -> tuple[pd.DataFrame, list[str]]:
    """
    Builds a table of audio demo samples for the given dataset.

    Fetches `demo` and `id_mapping` from `balacoon/speech_gen_eval_testsets` for the given dataset.
    Then composes reference-audio URLs according to `id_mapping` from `balacoon/speech_gen_eval_testsets`.
    Finally composes synthetic-sample URLs for different models from `balacoon/speech_gen_baselines`
    according to `demo`.

    Args:
        system_type: system category (e.g. "vocoder"); for "vocoder" each id maps to itself,
            otherwise `id_mapping` is fetched to resolve reference ids.
        models: model names to fetch synthetic samples for.
        dataset: dataset subdirectory name inside both repos.

    Returns:
        A tuple of (dataframe, datatypes) where the dataframe has columns
        `id`, `text`, `reference` and one column per model (audio as HTML
        `<audio>` tags), and datatypes is the matching list of column types.
        On failure to fetch demo/mapping, returns (empty DataFrame, []).

    Raises:
        ValueError: if some demo ids are missing from the id mapping.
    """
    testsets_repo = "balacoon/speech_gen_eval_testsets"
    # 1. get demo and id_mapping
    demo_path = f"{dataset}/demo"
    id_mapping_path = f"{dataset}/id_mapping"
    try:
        # read demo ids; each line is "<id> <text>" (text may be absent)
        url = hf_hub_url(
            repo_id=testsets_repo,
            filename=demo_path,
            repo_type="dataset"
        )
        # timeout so a stuck Hub request cannot hang the page;
        # raise_for_status so an HTTP error is not parsed as data
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        demo = []
        for line in response.text.splitlines():
            parts = re.split(r"\s+", line.strip(), maxsplit=1)
            if not parts or not parts[0]:
                continue  # skip blank lines
            # pad missing transcription with empty text instead of crashing on unpack
            demo.append((parts[0], parts[1] if len(parts) > 1 else ""))

        if system_type == "vocoder":
            # no need for mapping, mapping is to itself
            mapping = {name: name for name, _ in demo}
        else:
            # read id mapping; each line is "<demo_id> <reference_id>"
            url = hf_hub_url(
                repo_id=testsets_repo,
                filename=id_mapping_path,
                repo_type="dataset"
            )
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            pairs = [x.split() for x in response.text.splitlines()]
            # ignore malformed/blank lines rather than crashing on unpack
            mapping = {p[0]: p[1] for p in pairs if len(p) == 2}
    except Exception as e:
        logging.error(f"Failed to read demo / mapping for {dataset}: {e}")
        # return the promised (DataFrame, datatypes) tuple so callers that
        # unpack the result do not crash on the error path
        return pd.DataFrame(), []

    # 2. get reference files
    if not all(x in mapping for x, _ in demo):
        raise ValueError(f"Failed to fetch demo or mapping for {dataset}, refresh the page.")
    ref_ids = list(set(mapping[x] for x, _ in demo))
    reference_samples = {}
    for ref_id in ref_ids:
        try:
            url = hf_hub_url(
                repo_id=testsets_repo,
                filename=f"{dataset}/wav/{ref_id}.wav",
                repo_type="dataset"
            )
            reference_samples[ref_id] = f"<audio src='{url}' controls></audio>"
        except Exception as e:
            logging.error(f"Failed to read reference {ref_id} for {dataset}: {e}")
            continue

    # 3. get synthetic samples
    systems_samples = {model: {} for model in models}
    baselines_repo = "balacoon/speech_gen_baselines"
    for model in models:
        for demo_id, _ in demo:
            try:
                filename = f"{system_type}/{model}/{dataset}/wav/{demo_id}.wav"
                url = hf_hub_url(
                    repo_id=baselines_repo,
                    filename=filename,
                    repo_type="dataset"
                )
                systems_samples[model][demo_id] = f"<audio src='{url}' controls></audio>"
            except Exception as e:
                # include the model name in the log instead of a placeholder
                logging.error(f"Failed to read sample {demo_id} from {model} in {dataset}: {e}")
                continue

    # filter out demo ids, checking if all samples are present
    filtered_demo = []
    for demo_id, txt in demo:
        if demo_id not in mapping:
            continue
        ref_id = mapping[demo_id]
        if ref_id not in reference_samples:
            continue
        if all(demo_id in systems_samples[model] for model in models):
            filtered_demo.append((demo_id, txt))

    # finally create a dataframe
    rows = []
    for demo_id, txt in filtered_demo:
        row = {
            "id": demo_id,
            "text": txt,
            "reference": reference_samples[mapping[demo_id]],
        }
        for model in models:
            row[model] = systems_samples[model][demo_id]
        rows.append(row)
    # id, text are plain text; reference and each model column hold audio-tag markdown
    datatypes = ["text", "text", "markdown"] + ["markdown"] * len(models)
    return pd.DataFrame(rows), datatypes