wuhp committed
Commit e5a54b4 · verified · Parent: d386f34

Update app.py

Files changed (1): app.py +52 -11
app.py CHANGED
@@ -1,5 +1,6 @@
 # app.py
 import os
+import re
 import json
 import tempfile
 from typing import List, Optional
@@ -8,6 +9,7 @@ import pandas as pd
 import gradio as gr
 from pydantic import BaseModel, Field, ValidationError
 
+
 # -----------------------------
 # Lazy import so UI always loads
 # -----------------------------
@@ -29,6 +31,7 @@ def _import_genai():
         f"Original import error: {e}"
     )
 
+
 # =========================
 # Structured Output Types
 # =========================
@@ -46,6 +49,7 @@ class PageMetrics(BaseModel):
     # Include all metrics found (mAP variants, Precision, Recall, etc.)
     metrics: List[MetricKV]
 
+
 # =========================
 # System Instructions
 # =========================
@@ -67,12 +71,14 @@ For each page, extract:
 * For each metric:
   - label: the exact on-page label string (no normalization).
   - value: the numeric percentage as a float (0..100). Do NOT include '%' in the number.
+
 STRICT RULES:
 - Do NOT guess; only extract what is clearly present on the page.
 - Do NOT filter by any threshold; include all found metrics.
 - Preserve the order of pages as provided by the caller.
 - If a field (dataset_name, model_name, task) is absent, omit it or leave it null.
 - The output MUST be a JSON array of PageMetrics objects in the same order as input URLs.
+- Return ONLY raw JSON (no markdown fences, no prose).
 """
 
 def build_prompt(urls: List[str]) -> str:
@@ -89,9 +95,34 @@ For each URL:
 - Return ALL metrics you can find with labels + numeric % values.
 - Do not filter or apply thresholds; this is a pure scraper.
 
-Return JSON only, strictly matching the schema.
+Return raw JSON only, strictly matching the schema. Do NOT wrap output in code fences.
 """
 
+
+# =========================
+# Helpers
+# =========================
+FENCE_RE = re.compile(r"^```(?:json)?\s*|\s*```$", re.IGNORECASE | re.DOTALL)
+
+def _unfence_json(text: str) -> str:
+    """
+    Remove surrounding ``` or ```json fences if present.
+    Also tries to extract the first top-level JSON array if the model added prose.
+    """
+    s = text.strip()
+    # Remove a single fenced block if the whole thing is fenced
+    if s.startswith("```"):
+        s = FENCE_RE.sub("", s).strip()
+
+    # If still contains prose, try to slice from first '[' to last ']'
+    if not (s.startswith("[") and s.endswith("]")):
+        start = s.find("[")
+        end = s.rfind("]")
+        if start != -1 and end != -1 and end > start:
+            s = s[start:end + 1].strip()
+    return s
+
+
 # =========================
 # Core LLM Scraper
 # =========================
@@ -140,20 +171,30 @@ def run_llm_scraper(
     except Exception as e:
         raise gr.Error(f"Gemini API error: {e}")
 
-    # Prefer parsed output from SDK; fallback to JSON text.
-    raw_text = response.text or "[]"
+    # Prefer parsed output from SDK; fallback to JSON text (defensively unfenced)
    pages: List[PageMetrics] = []
-    try:
-        parsed = getattr(response, "parsed", None)
-        if parsed:
+    parsed = getattr(response, "parsed", None)
+    if parsed:
+        try:
             pages = [PageMetrics.model_validate(p) if not isinstance(p, PageMetrics) else p for p in parsed]
-        else:
-            data = json.loads(raw_text)
+        except ValidationError as e:
+            raise gr.Error(f"Schema validation failed on parsed output:\n{e}")
+    else:
+        raw_text = response.text or "[]"
+        candidate = _unfence_json(raw_text)
+        try:
+            data = json.loads(candidate)
+        except json.JSONDecodeError as e:
+            # Surface a concise debug view
+            snippet = raw_text[:1200]
+            raise gr.Error(f"Could not parse structured output:\n{e}\n\nRaw (truncated):\n{snippet}")
+        try:
             if isinstance(data, list):
                 pages = [PageMetrics.model_validate(x) for x in data]
-    except (ValidationError, json.JSONDecodeError) as e:
-        # If parsing fails, show the raw text to help debug
-        raise gr.Error(f"Could not parse structured output:\n{e}\n\nRaw:\n{raw_text[:1200]}")
+            else:
+                raise gr.Error("Model did not return a top-level JSON array.")
+        except ValidationError as e:
+            raise gr.Error(f"Schema validation failed on JSON output:\n{e}")
 
     # Flatten to a row per metric for the table/CSV
     rows = []
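As a quick, illustrative check of the new _unfence_json helper (the sample strings below are made up; they are not part of the commit), both failure modes it targets, fenced output and JSON wrapped in prose, reduce to the same raw array:

# Illustrative check of _unfence_json; the inputs are hypothetical model replies.
fenced = "```json\n[{\"metrics\": []}]\n```"
prose = "Sure, here are the results:\n[{\"metrics\": []}]\nLet me know!"

assert _unfence_json(fenced) == '[{"metrics": []}]'
assert _unfence_json(prose) == '[{"metrics": []}]'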
 
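And a minimal sketch of the reworked fallback branch, assuming a stubbed response whose parsed attribute is empty. FakeResponse and the metric payload are hypothetical; the real object comes from the Gemini SDK, and the snippet relies on app.py's own json import and _unfence_json:

# Hypothetical stub tracing the raw-text fallback path added in this commit.
class FakeResponse:
    parsed = None  # empty, so the code falls back to response.text
    text = "```json\n[{\"metrics\": [{\"label\": \"mAP@50\", \"value\": 61.3}]}]\n```"

resp = FakeResponse()
data = json.loads(_unfence_json(resp.text or "[]"))
assert isinstance(data, list)  # each item is then validated as PageMetrics
assert data[0]["metrics"][0]["value"] == 61.3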