FarmerlineML committed on
Commit
fbcc780
·
verified ·
1 Parent(s): 9f891d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -48
app.py CHANGED
@@ -13,6 +13,8 @@ from transformers import pipeline
13
  import numpy as np
14
  import soundfile as sf # librosa depends on this; good for wav/flac/ogg
15
  import librosa # fallback / resampling
 
 
16
 
17
  # Optional: modest thread hints for CPU Spaces
18
  try:
@@ -25,50 +27,24 @@ except Exception:
25
  # Basic logging so we can verify which model is loaded per inference
26
  logging.basicConfig(level=logging.INFO)
27
 
28
- # --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
29
- from datasets import Dataset, Features, Value, Audio, load_dataset
30
-
31
  # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
32
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
33
  HF_TOKEN = os.environ.get("HF_TOKEN")
34
  PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
35
 
36
- HF_FEATURES = Features({
37
- "timestamp": Value("string"),
38
- "session_id": Value("string"),
39
- "language_display": Value("string"),
40
- "model_id": Value("string"),
41
- "model_revision": Value("string"),
42
-
43
- "audio": Audio(sampling_rate=None), # uploaded only if user consents
44
- "audio_duration_s": Value("float32"),
45
- "sample_rate": Value("int32"),
46
- "source": Value("string"),
47
- "decode_params": Value("string"),
48
-
49
- "transcript_hyp": Value("string"),
50
- "corrected_text": Value("string"),
51
-
52
- "latency_ms": Value("int32"),
53
- "rtf": Value("float32"),
54
-
55
- "score_out_of_10": Value("int32"),
56
- "share_publicly": Value("bool"),
57
- })
58
-
59
  def _push_row_to_hf_dataset(row, audio_file_path):
60
  """
61
- Append a single example to the HF dataset repo (train split).
62
- If user didn't consent or no audio path, 'audio' field is None.
63
  """
64
  if not PUSH_TO_HF:
65
  return "HF push disabled (missing HF_TOKEN or repo)."
66
 
67
  example = dict(row)
68
 
69
- # Audio: only include if user consented and file exists
70
- example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None
71
-
72
  # Normalize types
73
  def _to_int(v):
74
  try:
@@ -86,23 +62,62 @@ def _push_row_to_hf_dataset(row, audio_file_path):
86
  for k in ["rtf", "audio_duration_s"]:
87
  example[k] = _to_float(example.get(k))
88
 
89
- ds = Dataset.from_list([example], features=HF_FEATURES)
90
-
91
- # Load existing split if present, then append
 
 
 
 
 
 
 
 
 
92
  try:
93
- existing = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
94
- merged = existing.concatenate(ds)
95
- except Exception:
96
- merged = ds
97
-
98
- merged.push_to_hub(
99
- HF_DATASET_REPO,
100
- split="train",
101
- private=True,
102
- token=HF_TOKEN,
103
- commit_message="append feedback row"
104
- )
105
- return "Pushed to HF Dataset."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  # --- Map display names to your HF Hub model IDs ---
108
  language_models = {
@@ -137,7 +152,7 @@ language_models = {
137
  "Pidgin": "FarmerlineML/pidgin_nigerian",
138
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
139
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
140
- "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
141
  }
142
 
143
  # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
@@ -310,6 +325,7 @@ def submit_feedback(meta, corrected_text, score, store_audio, share_publicly, au
310
  status = f"Feedback saved. {hf_status}"
311
  except Exception as e:
312
  status = f"Failed to push to HF Dataset: {e}"
 
313
 
314
  return {
315
  "status": status,
 
13
  import numpy as np
14
  import soundfile as sf # librosa depends on this; good for wav/flac/ogg
15
  import librosa # fallback / resampling
16
+ import pandas as pd
17
+ from huggingface_hub import HfApi
18
 
19
  # Optional: modest thread hints for CPU Spaces
20
  try:
 
27
  # Basic logging so we can verify which model is loaded per inference
28
  logging.basicConfig(level=logging.INFO)
29
 
 
 
 
30
  # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
31
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
32
  HF_TOKEN = os.environ.get("HF_TOKEN")
33
  PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def _push_row_to_hf_dataset(row, audio_file_path):
36
  """
37
+ Append a single example to the HF dataset repo using Parquet files.
38
+ This approach is more robust for incremental updates.
39
  """
40
  if not PUSH_TO_HF:
41
  return "HF push disabled (missing HF_TOKEN or repo)."
42
 
43
  example = dict(row)
44
 
45
+ # Store audio path reference if audio should be saved
46
+ example["audio_stored"] = bool(audio_file_path and os.path.isfile(audio_file_path))
47
+
48
  # Normalize types
49
  def _to_int(v):
50
  try:
 
62
  for k in ["rtf", "audio_duration_s"]:
63
  example[k] = _to_float(example.get(k))
64
 
65
+ # Create a unique filename for this submission
66
+ timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
67
+ unique_id = str(uuid.uuid4())[:8]
68
+ parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
69
+
70
+ # Convert to DataFrame and save as Parquet
71
+ df = pd.DataFrame([example])
72
+
73
+ with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
74
+ df.to_parquet(tmp_file.name, engine='pyarrow')
75
+ tmp_path = tmp_file.name
76
+
77
  try:
78
+ # Upload the Parquet file to the dataset repo
79
+ api = HfApi()
80
+
81
+ # Upload to a data/ directory in the repo
82
+ api.upload_file(
83
+ path_or_fileobj=tmp_path,
84
+ path_in_repo=f"data/{parquet_filename}",
85
+ repo_id=HF_DATASET_REPO,
86
+ repo_type="dataset",
87
+ token=HF_TOKEN,
88
+ commit_message=f"Add feedback row {timestamp}"
89
+ )
90
+
91
+ # Clean up temp file
92
+ os.remove(tmp_path)
93
+
94
+ # If audio file should be stored
95
+ if audio_file_path and os.path.isfile(audio_file_path) and example.get("share_publicly"):
96
+ try:
97
+ audio_ext = os.path.splitext(audio_file_path)[1] or ".wav"
98
+ audio_filename = f"audio_{timestamp}_{unique_id}{audio_ext}"
99
+ api.upload_file(
100
+ path_or_fileobj=audio_file_path,
101
+ path_in_repo=f"audio/{audio_filename}",
102
+ repo_id=HF_DATASET_REPO,
103
+ repo_type="dataset",
104
+ token=HF_TOKEN,
105
+ commit_message=f"Add audio for feedback {timestamp}"
106
+ )
107
+ example["audio_filename"] = audio_filename
108
+ except Exception as audio_error:
109
+ logging.warning(f"Failed to upload audio: {audio_error}")
110
+
111
+ return f"Pushed to HF Dataset as {parquet_filename}"
112
+
113
+ except Exception as e:
114
+ # Clean up temp file on error
115
+ if os.path.exists(tmp_path):
116
+ try:
117
+ os.remove(tmp_path)
118
+ except:
119
+ pass
120
+ return f"Failed to push to HF Dataset: {e}"
121
 
122
  # --- Map display names to your HF Hub model IDs ---
123
  language_models = {
 
152
  "Pidgin": "FarmerlineML/pidgin_nigerian",
153
  "Kikuyu": "FarmerlineML/w2v-bert-2.0_kikuyu",
154
  "Igbo": "FarmerlineML/w2v-bert-2.0_igbo_v1",
155
+ "Krio": "FarmerlineML/w2v-bert-2.0_krio_v3",
156
  }
157
 
158
  # -------- Robust audio loader (handles MP3/M4A via ffmpeg; wav/flac via soundfile) --------
 
325
  status = f"Feedback saved. {hf_status}"
326
  except Exception as e:
327
  status = f"Failed to push to HF Dataset: {e}"
328
+ logging.error(f"Push error: {e}")
329
 
330
  return {
331
  "status": status,