FarmerlineML committed on
Commit
52ae594
·
verified ·
1 Parent(s): fbcc780

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -59
app.py CHANGED
@@ -13,8 +13,6 @@ from transformers import pipeline
13
  import numpy as np
14
  import soundfile as sf # librosa depends on this; good for wav/flac/ogg
15
  import librosa # fallback / resampling
16
- import pandas as pd
17
- from huggingface_hub import HfApi
18
 
19
  # Optional: modest thread hints for CPU Spaces
20
  try:
@@ -27,24 +25,52 @@ except Exception:
27
  # Basic logging so we can verify which model is loaded per inference
28
  logging.basicConfig(level=logging.INFO)
29
 
 
 
 
 
30
  # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
31
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
32
  HF_TOKEN = os.environ.get("HF_TOKEN")
33
  PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def _push_row_to_hf_dataset(row, audio_file_path):
36
  """
37
- Append a single example to the HF dataset repo using Parquet files.
38
- This approach is more robust for incremental updates.
 
39
  """
40
  if not PUSH_TO_HF:
41
  return "HF push disabled (missing HF_TOKEN or repo)."
42
 
43
  example = dict(row)
44
 
45
- # Store audio path reference if audio should be saved
46
- example["audio_stored"] = bool(audio_file_path and os.path.isfile(audio_file_path))
47
-
48
  # Normalize types
49
  def _to_int(v):
50
  try:
@@ -62,62 +88,78 @@ def _push_row_to_hf_dataset(row, audio_file_path):
62
  for k in ["rtf", "audio_duration_s"]:
63
  example[k] = _to_float(example.get(k))
64
 
65
- # Create a unique filename for this submission
66
- timestamp = time.strftime("%Y%m%d_%H%M%S", time.gmtime())
67
- unique_id = str(uuid.uuid4())[:8]
68
- parquet_filename = f"feedback_{timestamp}_{unique_id}.parquet"
69
-
70
- # Convert to DataFrame and save as Parquet
71
- df = pd.DataFrame([example])
72
-
73
- with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
74
- df.to_parquet(tmp_file.name, engine='pyarrow')
75
- tmp_path = tmp_file.name
76
-
77
  try:
78
- # Upload the Parquet file to the dataset repo
79
- api = HfApi()
80
-
81
- # Upload to a data/ directory in the repo
82
- api.upload_file(
83
- path_or_fileobj=tmp_path,
84
- path_in_repo=f"data/{parquet_filename}",
85
- repo_id=HF_DATASET_REPO,
86
- repo_type="dataset",
87
- token=HF_TOKEN,
88
- commit_message=f"Add feedback row {timestamp}"
89
- )
90
-
91
- # Clean up temp file
92
- os.remove(tmp_path)
93
-
94
- # If audio file should be stored
95
- if audio_file_path and os.path.isfile(audio_file_path) and example.get("share_publicly"):
96
- try:
97
- audio_ext = os.path.splitext(audio_file_path)[1] or ".wav"
98
- audio_filename = f"audio_{timestamp}_{unique_id}{audio_ext}"
99
- api.upload_file(
100
- path_or_fileobj=audio_file_path,
101
- path_in_repo=f"audio/{audio_filename}",
102
- repo_id=HF_DATASET_REPO,
103
- repo_type="dataset",
 
 
 
 
 
 
104
  token=HF_TOKEN,
105
- commit_message=f"Add audio for feedback {timestamp}"
106
  )
107
- example["audio_filename"] = audio_filename
108
- except Exception as audio_error:
109
- logging.warning(f"Failed to upload audio: {audio_error}")
110
-
111
- return f"Pushed to HF Dataset as {parquet_filename}"
112
-
 
 
 
 
 
 
 
113
  except Exception as e:
114
- # Clean up temp file on error
115
- if os.path.exists(tmp_path):
116
- try:
117
- os.remove(tmp_path)
118
- except:
119
- pass
120
- return f"Failed to push to HF Dataset: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  # --- Map display names to your HF Hub model IDs ---
123
  language_models = {
 
13
  import numpy as np
14
  import soundfile as sf # librosa depends on this; good for wav/flac/ogg
15
  import librosa # fallback / resampling
 
 
16
 
17
  # Optional: modest thread hints for CPU Spaces
18
  try:
 
25
  # Basic logging so we can verify which model is loaded per inference
26
  logging.basicConfig(level=logging.INFO)
27
 
28
+ # --- External logging: push to a HF Dataset repo on each submit (no local storage) ---
29
+ from datasets import Dataset, Features, Value, Audio, load_dataset
30
+ from huggingface_hub import HfApi
31
+
32
  # -------- CONFIG: Hub dataset target (no persistent storage needed) --------
33
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "DarliAI/asr-feedback-logs")
34
  HF_TOKEN = os.environ.get("HF_TOKEN")
35
  PUSH_TO_HF = bool(HF_TOKEN and HF_DATASET_REPO)
36
 
37
+ HF_FEATURES = Features({
38
+ "timestamp": Value("string"),
39
+ "session_id": Value("string"),
40
+ "language_display": Value("string"),
41
+ "model_id": Value("string"),
42
+ "model_revision": Value("string"),
43
+
44
+ "audio": Audio(sampling_rate=None), # uploaded only if user consents
45
+ "audio_duration_s": Value("float32"),
46
+ "sample_rate": Value("int32"),
47
+ "source": Value("string"),
48
+ "decode_params": Value("string"),
49
+
50
+ "transcript_hyp": Value("string"),
51
+ "corrected_text": Value("string"),
52
+
53
+ "latency_ms": Value("int32"),
54
+ "rtf": Value("float32"),
55
+
56
+ "score_out_of_10": Value("int32"),
57
+ "share_publicly": Value("bool"),
58
+ })
59
+
60
  def _push_row_to_hf_dataset(row, audio_file_path):
61
  """
62
+ Append a single example to the HF dataset repo (train split).
63
+ If user didn't consent or no audio path, 'audio' field is None.
64
+ Uses the modern datasets library approach with proper appending.
65
  """
66
  if not PUSH_TO_HF:
67
  return "HF push disabled (missing HF_TOKEN or repo)."
68
 
69
  example = dict(row)
70
 
71
+ # Audio: only include if user consented and file exists
72
+ example["audio"] = audio_file_path if (audio_file_path and os.path.isfile(audio_file_path)) else None
73
+
74
  # Normalize types
75
  def _to_int(v):
76
  try:
 
88
  for k in ["rtf", "audio_duration_s"]:
89
  example[k] = _to_float(example.get(k))
90
 
91
+ # Create a dataset with single row
92
+ ds_new = Dataset.from_list([example], features=HF_FEATURES)
93
+
 
 
 
 
 
 
 
 
 
94
  try:
95
+ # Try to load existing dataset and append
96
+ try:
97
+ # Load existing dataset
98
+ ds_existing = load_dataset(
99
+ HF_DATASET_REPO,
100
+ split="train",
101
+ token=HF_TOKEN,
102
+ download_mode="force_redownload" # Ensure we get the latest version
103
+ )
104
+
105
+ # Concatenate with new data
106
+ ds_combined = ds_existing.add_item(example)
107
+
108
+ # Push the combined dataset
109
+ ds_combined.push_to_hub(
110
+ HF_DATASET_REPO,
111
+ split="train",
112
+ private=True,
113
+ token=HF_TOKEN,
114
+ commit_message=f"Append feedback row at {example['timestamp']}"
115
+ )
116
+
117
+ return "Successfully appended to existing HF Dataset."
118
+
119
+ except Exception as e:
120
+ # If dataset doesn't exist or error loading, create new
121
+ if "404" in str(e) or "doesn't exist" in str(e) or "EmptyDatasetError" in str(e):
122
+ # Dataset doesn't exist, create it
123
+ ds_new.push_to_hub(
124
+ HF_DATASET_REPO,
125
+ split="train",
126
+ private=True,
127
  token=HF_TOKEN,
128
+ commit_message="Initialize dataset with first feedback row"
129
  )
130
+ return "Created new HF Dataset with first row."
131
+ else:
132
+ # Try alternative approach: push with create_pr=True to avoid conflicts
133
+ ds_new.push_to_hub(
134
+ HF_DATASET_REPO,
135
+ split="train",
136
+ private=True,
137
+ token=HF_TOKEN,
138
+ commit_message=f"Append feedback row at {example['timestamp']}",
139
+ create_pr=True # Create a PR to avoid conflicts
140
+ )
141
+ return "Pushed to HF Dataset via PR (will auto-merge)."
142
+
143
  except Exception as e:
144
+ logging.error(f"Failed to push to HF Dataset: {e}")
145
+
146
+ # Final fallback: try using HfApi to check if repo exists
147
+ try:
148
+ api = HfApi()
149
+ api.dataset_info(HF_DATASET_REPO, token=HF_TOKEN)
150
+
151
+ # Repo exists, try one more time with force push
152
+ ds_new.push_to_hub(
153
+ HF_DATASET_REPO,
154
+ split=f"train_{int(time.time())}", # Use unique split name as last resort
155
+ private=True,
156
+ token=HF_TOKEN,
157
+ commit_message=f"Append feedback row at {example['timestamp']}"
158
+ )
159
+ return f"Pushed to HF Dataset with unique split."
160
+
161
+ except Exception as final_error:
162
+ return f"Failed to push to HF Dataset: {final_error}"
163
 
164
  # --- Map display names to your HF Hub model IDs ---
165
  language_models = {