Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -13,6 +13,8 @@ from transformers import pipeline
|
|
13 |
import numpy as np
|
14 |
import soundfile as sf # librosa depends on this; good for wav/flac/ogg
|
15 |
import librosa # fallback / resampling
|
|
|
|
|
16 |
|
17 |
# Optional: modest thread hints for CPU Spaces
|
18 |
try:
|
@@ -25,50 +27,24 @@ except Exception:
|
|
25 |
# Basic logging so we can verify which model is loaded per inference
|
26 |
logging.basicConfig(level=logging.INFO)
|
27 |
|
28 |
-
# --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
|
29 |
-
from datasets import Dataset, Features, Value, Audio, load_dataset
|
30 |
-
|
31 |
# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
|
32 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
|
33 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
34 |
PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
|
35 |
|
36 |
-
HF_FEATURES = Features({
|
37 |
-
"timestamp": Value("string"),
|
38 |
-
"session_id": Value("string"),
|
39 |
-
"language_display": Value("string"),
|
40 |
-
"model_id": Value("string"),
|
41 |
-
"model_revision": Value("string"),
|
42 |
-
|
43 |
-
"audio": Audio(sampling_rate=None), # uploaded only if user consents
|
44 |
-
"audio_duration_s": Value("float32"),
|
45 |
-
"sample_rate": Value("int32"),
|
46 |
-
"source": Value("string"),
|
47 |
-
"decode_params": Value("string"),
|
48 |
-
|
49 |
-
"transcript_hyp": Value("string"),
|
50 |
-
"corrected_text": Value("string"),
|
51 |
-
|
52 |
-
"latency_ms": Value("int32"),
|
53 |
-
"rtf": Value("float32"),
|
54 |
-
|
55 |
-
"score_out_of_10": Value("int32"),
|
56 |
-
"share_publicly": Value("bool"),
|
57 |
-
})
|
58 |
-
|
59 |
def _push_row_to_hf_dataset(row, audio_file_path):
|
60 |
"""
|
61 |
-
Append a single example to the HF dataset repo
|
62 |
-
|
63 |
"""
|
64 |
if not PUSH_TO_HF:
|
65 |
return "HF push disabled (missing HF_TOKEN or repo)."
|
66 |
|
67 |
example = dict(row)
|
68 |
|
69 |
-
#
|
70 |
-
example["
|
71 |
-
|
72 |
# Normalize types
|
73 |
def _to_int(v):
|
74 |
try:
|
@@ -86,23 +62,62 @@ def _push_row_to_hf_dataset(row, audio_file_path):
|
|
86 |
for k in ["rtf", "audio_duration_s"]:
|
87 |
example[k] = _to_float(example.get(k))
|
88 |
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
try:
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
# --- Map display names to your HF Hub model IDs ---
|
108 |
language_models = {
|
@@ -137,7 +152,7 @@ language_models = {
|
|
137 |
"Pidgin": "FarmerlineML/pidgin_nigerian",
|
138 |
"Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
|
139 |
"Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
|
140 |
-
"Krio":
|
141 |
}
|
142 |
|
143 |
# -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
|
@@ -310,6 +325,7 @@ def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, au
|
|
310 |
status = f"Feedback saved. {hf_status}"
|
311 |
except Exception as e:
|
312 |
status = f"Failed to push to HF Dataset: {e}"
|
|
|
313 |
|
314 |
return {
|
315 |
"status": status,
|
|
|
13 |
import numpy as np
|
14 |
import soundfile as sf # librosa depends on this; good for wav/flac/ogg
|
15 |
import librosa # fallback / resampling
|
16 |
+
import pandas as pd
|
17 |
+
from huggingface_hub import HfApi
|
18 |
|
19 |
# Optional: modest thread hints for CPU Spaces
|
20 |
try:
|
|
|
27 |
# Basic logging so we can verify which model is loaded per inference
|
28 |
logging.basicConfig(level=logging.INFO)
|
29 |
|
|
|
|
|
|
|
30 |
# -------- CONFIG: Hub dataset target (no persistent storage needed) --------
|
31 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
|
32 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
33 |
PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def _push_row_to_hf_dataset(row, audio_file_path):
|
36 |
"""
|
37 |
+
Append a single example to the HF dataset repo using Parquet files.
|
38 |
+
This approach is more robust for incremental updates.
|
39 |
"""
|
40 |
if not PUSH_TO_HF:
|
41 |
return "HF push disabled (missing HF_TOKEN or repo)."
|
42 |
|
43 |
example = dict(row)
|
44 |
|
45 |
+
# Record whether a local audio file exists for this submission (boolean flag only; the path itself is not stored)
|
46 |
+
example["audio_stored"] = bool(audio_file_path and os.path.isfile(audio_file_path))
|
47 |
+
|
48 |
# Normalize types
|
49 |
def _to_int(v):
|
50 |
try:
|
|
|
62 |
for k in ["rtf", "audio_duration_s"]:
|
63 |
example[k] = _to_float(example.get(k))
|
64 |
|
65 |
+
# Create a unique filename for this submission
|
66 |
+
timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
|
67 |
+
unique_id = str(uuid.uuid4())[:8]
|
68 |
+
parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
|
69 |
+
|
70 |
+
# Convert to DataFrame and save as Parquet
|
71 |
+
df = pd.DataFrame([example])
|
72 |
+
|
73 |
+
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
|
74 |
+
df.to_parquet(tmp_file.name, engine='pyarrow')
|
75 |
+
tmp_path = tmp_file.name
|
76 |
+
|
77 |
try:
|
78 |
+
# Upload the Parquet file to the dataset repo
|
79 |
+
api = HfApi()
|
80 |
+
|
81 |
+
# Upload to a data/ directory in the repo
|
82 |
+
api.upload_file(
|
83 |
+
path_or_fileobj=tmp_path,
|
84 |
+
path_in_repo=f"data/{parquet_filename}",
|
85 |
+
repo_id=HF_DATASET_REPO,
|
86 |
+
repo_type="dataset",
|
87 |
+
token=HF_TOKEN,
|
88 |
+
commit_message=f"Add feedback row {timestamp}"
|
89 |
+
)
|
90 |
+
|
91 |
+
# Clean up temp file
|
92 |
+
os.remove(tmp_path)
|
93 |
+
|
94 |
+
# Upload the audio only when the file exists on disk and the row's share_publicly value is truthy
|
95 |
+
if audio_file_path and os.path.isfile(audio_file_path) and example.get("share_publicly"):
|
96 |
+
try:
|
97 |
+
audio_ext = os.path.splitext(audio_file_path)[1] or ".wav"
|
98 |
+
audio_filename = f"audio_{timestamp}_{unique_id}{audio_ext}"
|
99 |
+
api.upload_file(
|
100 |
+
path_or_fileobj=audio_file_path,
|
101 |
+
path_in_repo=f"audio/{audio_filename}",
|
102 |
+
repo_id=HF_DATASET_REPO,
|
103 |
+
repo_type="dataset",
|
104 |
+
token=HF_TOKEN,
|
105 |
+
commit_message=f"Add audio for feedback {timestamp}"
|
106 |
+
)
|
107 |
+
example["audio_filename"] = audio_filename
|
108 |
+
except Exception as audio_error:
|
109 |
+
logging.warning(f"Failed to upload audio: {audio_error}")
|
110 |
+
|
111 |
+
return f"Pushed to HF Dataset as {parquet_filename}"
|
112 |
+
|
113 |
+
except Exception as e:
|
114 |
+
# Clean up temp file on error
|
115 |
+
if os.path.exists(tmp_path):
|
116 |
+
try:
|
117 |
+
os.remove(tmp_path)
|
118 |
+
except:
|
119 |
+
pass
|
120 |
+
return f"Failed to push to HF Dataset: {e}"
|
121 |
|
122 |
# --- Map display names to your HF Hub model IDs ---
|
123 |
language_models = {
|
|
|
152 |
"Pidgin": "FarmerlineML/pidgin_nigerian",
|
153 |
"Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
|
154 |
"Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
|
155 |
+
"Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
|
156 |
}
|
157 |
|
158 |
# -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
|
|
|
325 |
status = f"Feedback saved. {hf_status}"
|
326 |
except Exception as e:
|
327 |
status = f"Failed to push to HF Dataset: {e}"
|
328 |
+
logging.error(f"Push error: {e}")
|
329 |
|
330 |
return {
|
331 |
"status": status,
|