Update app.py
app.py CHANGED
@@ -1,5 +1,6 @@
 # app.py
 import os
+import re
 import json
 import tempfile
 from typing import List, Optional
@@ -8,6 +9,7 @@ import pandas as pd
 import gradio as gr
 from pydantic import BaseModel, Field, ValidationError

+
 # -----------------------------
 # Lazy import so UI always loads
 # -----------------------------
@@ -29,6 +31,7 @@ def _import_genai():
             f"Original import error: {e}"
         )

+
 # =========================
 # Structured Output Types
 # =========================
@@ -46,6 +49,7 @@ class PageMetrics(BaseModel):
     # Include all metrics found (mAP variants, Precision, Recall, etc.)
     metrics: List[MetricKV]

+
 # =========================
 # System Instructions
 # =========================
@@ -67,12 +71,14 @@ For each page, extract:
 * For each metric:
   - label: the exact on-page label string (no normalization).
   - value: the numeric percentage as a float (0..100). Do NOT include '%' in the number.
+
 STRICT RULES:
 - Do NOT guess; only extract what is clearly present on the page.
 - Do NOT filter by any threshold; include all found metrics.
 - Preserve the order of pages as provided by the caller.
 - If a field (dataset_name, model_name, task) is absent, omit it or leave it null.
 - The output MUST be a JSON array of PageMetrics objects in the same order as input URLs.
+- Return ONLY raw JSON (no markdown fences, no prose).
 """

 def build_prompt(urls: List[str]) -> str:
@@ -89,9 +95,34 @@ For each URL:
 - Return ALL metrics you can find with labels + numeric % values.
 - Do not filter or apply thresholds; this is a pure scraper.

-Return JSON only, strictly matching the schema.
+Return raw JSON only, strictly matching the schema. Do NOT wrap output in code fences.
 """

+
+# =========================
+# Helpers
+# =========================
+FENCE_RE = re.compile(r"^```(?:json)?\s*|\s*```$", re.IGNORECASE | re.DOTALL)
+
+def _unfence_json(text: str) -> str:
+    """
+    Remove surrounding ``` or ```json fences if present.
+    Also tries to extract the first top-level JSON array if the model added prose.
+    """
+    s = text.strip()
+    # Remove a single fenced block if the whole thing is fenced
+    if s.startswith("```"):
+        s = FENCE_RE.sub("", s).strip()
+
+    # If still contains prose, try to slice from first '[' to last ']'
+    if not (s.startswith("[") and s.endswith("]")):
+        start = s.find("[")
+        end = s.rfind("]")
+        if start != -1 and end != -1 and end > start:
+            s = s[start:end + 1].strip()
+    return s
+
+
 # =========================
 # Core LLM Scraper
 # =========================
@@ -140,20 +171,30 @@ def run_llm_scraper(
     except Exception as e:
         raise gr.Error(f"Gemini API error: {e}")

-    # Prefer parsed output from SDK; fallback to JSON text
-    raw_text = response.text or "[]"
+    # Prefer parsed output from SDK; fallback to JSON text (defensively unfenced)
     pages: List[PageMetrics] = []
-
-
-
+    parsed = getattr(response, "parsed", None)
+    if parsed:
+        try:
             pages = [PageMetrics.model_validate(p) if not isinstance(p, PageMetrics) else p for p in parsed]
-
-
+        except ValidationError as e:
+            raise gr.Error(f"Schema validation failed on parsed output:\n{e}")
+    else:
+        raw_text = response.text or "[]"
+        candidate = _unfence_json(raw_text)
+        try:
+            data = json.loads(candidate)
+        except json.JSONDecodeError as e:
+            # Surface a concise debug view
+            snippet = raw_text[:1200]
+            raise gr.Error(f"Could not parse structured output:\n{e}\n\nRaw (truncated):\n{snippet}")
+        try:
             if isinstance(data, list):
                 pages = [PageMetrics.model_validate(x) for x in data]
-
-
-
+            else:
+                raise gr.Error("Model did not return a top-level JSON array.")
+        except ValidationError as e:
+            raise gr.Error(f"Schema validation failed on JSON output:\n{e}")

     # Flatten to a row per metric for the table/CSV
     rows = []
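
Note: the diff shows only the tail of the Pydantic models, so for reference here is a minimal sketch of plausible definitions. Field names follow the system prompt; the defaults, the protected_namespaces tweak, and the sample at the end are assumptions rather than code from this repo.

from typing import List, Optional
from pydantic import BaseModel

class MetricKV(BaseModel):
    label: str    # exact on-page label, e.g. "mAP@50" (not normalized)
    value: float  # percentage as a plain float in 0..100, no '%' sign

class PageMetrics(BaseModel):
    # Assumption: silences Pydantic v2's protected-namespace warning on "model_name"
    model_config = {"protected_namespaces": ()}

    dataset_name: Optional[str] = None  # omitted/null when absent, per the prompt
    model_name: Optional[str] = None
    task: Optional[str] = None
    metrics: List[MetricKV]  # all metrics found on the page

# A page validates straight from the raw JSON the model returns (sample data):
page = PageMetrics.model_validate(
    {"dataset_name": "COCO", "metrics": [{"label": "mAP@50", "value": 67.3}]}
)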
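
The new _unfence_json helper is small enough to sanity-check in isolation. The body below is copied from the diff; the sample strings are made up:

import re

FENCE_RE = re.compile(r"^```(?:json)?\s*|\s*```$", re.IGNORECASE | re.DOTALL)

def _unfence_json(text: str) -> str:
    s = text.strip()
    if s.startswith("```"):
        s = FENCE_RE.sub("", s).strip()
    if not (s.startswith("[") and s.endswith("]")):
        start = s.find("[")
        end = s.rfind("]")
        if start != -1 and end != -1 and end > start:
            s = s[start:end + 1].strip()
    return s

# Fenced output: the regex strips the opening ```json and the closing ```
assert _unfence_json('```json\n[{"label": "Recall", "value": 91.2}]\n```') == '[{"label": "Recall", "value": 91.2}]'

# Prose around the array: slicing from the first '[' to the last ']' recovers it
assert _unfence_json('Sure! Here is the JSON:\n[{"label": "Recall", "value": 91.2}]\nDone.') == '[{"label": "Recall", "value": 91.2}]'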
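
The generate call itself sits outside the hunks shown, so the response.parsed branch deserves a note: with the google-genai SDK, parsed is populated only when the request supplies a response_schema, and it can still be None when the SDK cannot parse the model text, which is exactly when the _unfence_json fallback matters. A sketch of what the call site presumably looks like; the model name and config are assumptions:

import json
from google import genai
from google.genai import types

from app import PageMetrics, build_prompt, _unfence_json  # assumes app.py is importable

client = genai.Client()  # reads the API key from the environment

urls = ["https://example.com/benchmarks"]  # hypothetical input
response = client.models.generate_content(
    model="gemini-2.0-flash",  # assumed model name; not visible in the diff
    contents=build_prompt(urls),
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=list[PageMetrics],  # this is what populates response.parsed
    ),
)

# parsed is a list of PageMetrics when SDK-side parsing succeeds, else None;
# app.py then falls back to json.loads(_unfence_json(response.text or "[]")).
pages = response.parsed or [
    PageMetrics.model_validate(x)
    for x in json.loads(_unfence_json(response.text or "[]"))
]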