Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -13,8 +13,6 @@ from transformers import pipeline
|
|
13 |
import numpy as np
|
14 |
import soundfile as sf # librosa depends on this; good for wav/flac/ogg
|
15 |
import librosa # fallback / resampling
|
16 |
-
import pandas as pd
|
17 |
-
from huggingface_hub import HfApi
|
18 |
|
19 |
# Optional: modest thread hints for CPU Spaces
|
20 |
try:
|
@@ -27,24 +25,52 @@ except Exception:
|
|
27 |
# Basic logging so we can verify which model is loaded per inference
|
28 |
logging.basicConfig(level=logging.INFO)
|
29 |
|
|
|
|
|
|
|
|
|
30 |
# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
|
31 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
|
32 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
33 |
PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def _push_row_to_hf_dataset(row, audio_file_path):
|
36 |
"""
|
37 |
-
Append a single example to the HF dataset repo
|
38 |
-
|
|
|
39 |
"""
|
40 |
if not PUSH_TO_HF:
|
41 |
return "HF push disabled (missing HF_TOKEN or repo)."
|
42 |
|
43 |
example = dict(row)
|
44 |
|
45 |
-
#
|
46 |
-
example["
|
47 |
-
|
48 |
# Normalize types
|
49 |
def _to_int(v):
|
50 |
try:
|
@@ -62,62 +88,78 @@ def _push_row_to_hf_dataset(row, audio_file_path):
|
|
62 |
for k in ["rtf", "audio_duration_s"]:
|
63 |
example[k] = _to_float(example.get(k))
|
64 |
|
65 |
-
# Create a
|
66 |
-
|
67 |
-
|
68 |
-
parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
|
69 |
-
|
70 |
-
# Convert to DataFrame and save as Parquet
|
71 |
-
df = pd.DataFrame([example])
|
72 |
-
|
73 |
-
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
|
74 |
-
df.to_parquet(tmp_file.name, engine='pyarrow')
|
75 |
-
tmp_path = tmp_file.name
|
76 |
-
|
77 |
try:
|
78 |
-
#
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
token=HF_TOKEN,
|
105 |
-
commit_message=
|
106 |
)
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
except Exception as e:
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
|
122 |
# --- Map display names to your HF Hub model IDs ---
|
123 |
language_models = {
|
|
|
13 |
import numpy as np
|
14 |
import soundfile as sf # librosa depends on this; good for wav/flac/ogg
|
15 |
import librosa # fallback / resampling
|
|
|
|
|
16 |
|
17 |
# Optional: modest thread hints for CPU Spaces
|
18 |
try:
|
|
|
25 |
# Basic logging so we can verify which model is loaded per inference
|
26 |
logging.basicConfig(level=logging.INFO)
|
27 |
|
28 |
+
# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
|
29 |
+
from datasets import Dataset, Features, Value, Audio, load_dataset
|
30 |
+
from huggingface_hub import HfApi
|
31 |
+
|
32 |
# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
|
33 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
|
34 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
35 |
PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
|
36 |
|
37 |
+
HF_FEATURES = Features({
|
38 |
+
"timestamp": Value("string"),
|
39 |
+
"session_id": Value("string"),
|
40 |
+
"language_display": Value("string"),
|
41 |
+
"model_id": Value("string"),
|
42 |
+
"model_revision": Value("string"),
|
43 |
+
|
44 |
+
"audio": Audio(sampling_rate=None), # uploaded only if user consents
|
45 |
+
"audio_duration_s": Value("float32"),
|
46 |
+
"sample_rate": Value("int32"),
|
47 |
+
"source": Value("string"),
|
48 |
+
"decode_params": Value("string"),
|
49 |
+
|
50 |
+
"transcript_hyp": Value("string"),
|
51 |
+
"corrected_text": Value("string"),
|
52 |
+
|
53 |
+
"latency_ms": Value("int32"),
|
54 |
+
"rtf": Value("float32"),
|
55 |
+
|
56 |
+
"score_out_of_10": Value("int32"),
|
57 |
+
"share_publicly": Value("bool"),
|
58 |
+
})
|
59 |
+
|
60 |
def _push_row_to_hf_dataset(row, audio_file_path):
|
61 |
"""
|
62 |
+
Append a single example to the HF dataset repo (train split).
|
63 |
+
If no audio path is given or the file does not exist, the 'audio' field is None (consent is presumably enforced by the caller).
|
64 |
+
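Loads the existing train split, adds the row with add_item, and pushes the result back to the Hub; on failure it falls back to creating the dataset, opening a PR, or writing a uniquely named split.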
|
65 |
"""
|
66 |
if not PUSH_TO_HF:
|
67 |
return "HF push disabled (missing HF_TOKEN or repo)."
|
68 |
|
69 |
example = dict(row)
|
70 |
|
71 |
+
# Audio: include only when a path was passed and the file exists (consent is presumably enforced by the caller - verify)
|
72 |
+
example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None
|
73 |
+
|
74 |
# Normalize types
|
75 |
def _to_int(v):
|
76 |
try:
|
|
|
88 |
for k in ["rtf", "audio_duration_s"]:
|
89 |
example[k] = _to_float(example.get(k))
|
90 |
|
91 |
+
# Create a dataset with single row
|
92 |
+
ds_new = Dataset.from_list([example], features=HF_FEATURES)
|
93 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
try:
|
95 |
+
# Try to load existing dataset and append
|
96 |
+
try:
|
97 |
+
# Load existing dataset
|
98 |
+
ds_existing = load_dataset(
|
99 |
+
HF_DATASET_REPO,
|
100 |
+
split="train",
|
101 |
+
token=HF_TOKEN,
|
102 |
+
download_mode="force_redownload" # Ensure we get the latest version
|
103 |
+
)
|
104 |
+
|
105 |
+
# Append the new example to the loaded dataset as one extra row
|
106 |
+
ds_combined = ds_existing.add_item(example)
|
107 |
+
|
108 |
+
# Push the combined dataset
|
109 |
+
ds_combined.push_to_hub(
|
110 |
+
HF_DATASET_REPO,
|
111 |
+
split="train",
|
112 |
+
private=True,
|
113 |
+
token=HF_TOKEN,
|
114 |
+
commit_message=f"Append feedback row at {example['timestamp']}"
|
115 |
+
)
|
116 |
+
|
117 |
+
return "Successfully appended to existing HF Dataset."
|
118 |
+
|
119 |
+
except Exception as e:
|
120 |
+
# Create the dataset only when the error indicates it is missing or empty; any other error takes the PR fallback below
|
121 |
+
if "404" in str(e) or "doesn't exist" in str(e) or "EmptyDatasetError" in str(e):
|
122 |
+
# Dataset doesn't exist, create it
|
123 |
+
ds_new.push_to_hub(
|
124 |
+
HF_DATASET_REPO,
|
125 |
+
split="train",
|
126 |
+
private=True,
|
127 |
token=HF_TOKEN,
|
128 |
+
commit_message="Initialize dataset with first feedback row"
|
129 |
)
|
130 |
+
return "Created new HF Dataset with first row."
|
131 |
+
else:
|
132 |
+
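# Alternative approach: push with create_pr=True to avoid conflicts. NOTE(review): ds_new holds only this one row - confirm merging the PR would not replace the existing train split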
|
133 |
+
ds_new.push_to_hub(
|
134 |
+
HF_DATASET_REPO,
|
135 |
+
split="train",
|
136 |
+
private=True,
|
137 |
+
token=HF_TOKEN,
|
138 |
+
commit_message=f"Append feedback row at {example['timestamp']}",
|
139 |
+
create_pr=True # Create a PR to avoid conflicts
|
140 |
+
)
|
141 |
+
return "Pushed to HF Dataset via PR (will auto-merge)."
|
142 |
+
|
143 |
except Exception as e:
|
144 |
+
logging.error(f"Failed to push to HF Dataset: {e}")
|
145 |
+
|
146 |
+
# Final fallback: try using HfApi to check if repo exists
|
147 |
+
try:
|
148 |
+
api = HfApi()
|
149 |
+
api.dataset_info(HF_DATASET_REPO, token=HF_TOKEN)
|
150 |
+
|
151 |
+
# Repo exists: retry once, writing the row to a uniquely named split (train_<epoch seconds>) instead of train
|
152 |
+
ds_new.push_to_hub(
|
153 |
+
HF_DATASET_REPO,
|
154 |
+
split=f"train_{int(time.time())}", # Use unique split name as last resort
|
155 |
+
private=True,
|
156 |
+
token=HF_TOKEN,
|
157 |
+
commit_message=f"Append feedback row at {example['timestamp']}"
|
158 |
+
)
|
159 |
+
return f"Pushed to HF Dataset with unique split."
|
160 |
+
|
161 |
+
except Exception as final_error:
|
162 |
+
return f"Failed to push to HF Dataset: {final_error}"
|
163 |
|
164 |
# --- Map display names to your HF Hub model IDs ---
|
165 |
language_models = {
|