wuhp committed
Commit e5a54b4 · verified · Parent: d386f34

Update app.py

Files changed (1): app.py +52 -11
app.py CHANGED
@@ -1,5 +1,6 @@
 # app.py
 import os
+import re
 import json
 import tempfile
 from typing import List, Optional
@@ -8,6 +9,7 @@ import pandas as pd
 import gradio as gr
 from pydantic import BaseModel, Field, ValidationError
 
+
 # -----------------------------
 # Lazy import so UI always loads
 # -----------------------------
@@ -29,6 +31,7 @@ def _import_genai():
         f"Original import error: {e}"
     )
 
+
 # =========================
 # Structured Output Types
 # =========================
@@ -46,6 +49,7 @@ class PageMetrics(BaseModel):
     # Include all metrics found (mAP variants, Precision, Recall, etc.)
     metrics: List[MetricKV]
 
+
 # =========================
 # System Instructions
 # =========================
@@ -67,12 +71,14 @@ For each page, extract:
 * For each metric:
   - label: the exact on-page label string (no normalization).
   - value: the numeric percentage as a float (0..100). Do NOT include '%' in the number.
+
 STRICT RULES:
 - Do NOT guess; only extract what is clearly present on the page.
 - Do NOT filter by any threshold; include all found metrics.
 - Preserve the order of pages as provided by the caller.
 - If a field (dataset_name, model_name, task) is absent, omit it or leave it null.
 - The output MUST be a JSON array of PageMetrics objects in the same order as input URLs.
+- Return ONLY raw JSON (no markdown fences, no prose).
 """
 
 def build_prompt(urls: List[str]) -> str:
@@ -89,9 +95,34 @@ For each URL:
 - Return ALL metrics you can find with labels + numeric % values.
 - Do not filter or apply thresholds; this is a pure scraper.
 
-Return JSON only, strictly matching the schema.
+Return raw JSON only, strictly matching the schema. Do NOT wrap output in code fences.
 """
 
+
+# =========================
+# Helpers
+# =========================
+FENCE_RE = re.compile(r"^```(?:json)?\s*|\s*```$", re.IGNORECASE | re.DOTALL)
+
+def _unfence_json(text: str) -> str:
+    """
+    Remove surrounding ``` or ```json fences if present.
+    Also tries to extract the first top-level JSON array if the model added prose.
+    """
+    s = text.strip()
+    # Remove a single fenced block if the whole thing is fenced
+    if s.startswith("```"):
+        s = FENCE_RE.sub("", s).strip()
+
+    # If still contains prose, try to slice from first '[' to last ']'
+    if not (s.startswith("[") and s.endswith("]")):
+        start = s.find("[")
+        end = s.rfind("]")
+        if start != -1 and end != -1 and end > start:
+            s = s[start:end + 1].strip()
+    return s
+
+
 # =========================
 # Core LLM Scraper
 # =========================
@@ -140,20 +171,30 @@ def run_llm_scraper(
     except Exception as e:
         raise gr.Error(f"Gemini API error: {e}")
 
-    # Prefer parsed output from SDK; fallback to JSON text.
-    raw_text = response.text or "[]"
+    # Prefer parsed output from SDK; fallback to JSON text (defensively unfenced)
    pages: List[PageMetrics] = []
-    try:
-        parsed = getattr(response, "parsed", None)
-        if parsed:
+    parsed = getattr(response, "parsed", None)
+    if parsed:
+        try:
             pages = [PageMetrics.model_validate(p) if not isinstance(p, PageMetrics) else p for p in parsed]
-        else:
-            data = json.loads(raw_text)
+        except ValidationError as e:
+            raise gr.Error(f"Schema validation failed on parsed output:\n{e}")
+    else:
+        raw_text = response.text or "[]"
+        candidate = _unfence_json(raw_text)
+        try:
+            data = json.loads(candidate)
+        except json.JSONDecodeError as e:
+            # Surface a concise debug view
+            snippet = raw_text[:1200]
+            raise gr.Error(f"Could not parse structured output:\n{e}\n\nRaw (truncated):\n{snippet}")
+        try:
             if isinstance(data, list):
                 pages = [PageMetrics.model_validate(x) for x in data]
-    except (ValidationError, json.JSONDecodeError) as e:
-        # If parsing fails, show the raw text to help debug
-        raise gr.Error(f"Could not parse structured output:\n{e}\n\nRaw:\n{raw_text[:1200]}")
+            else:
+                raise gr.Error("Model did not return a top-level JSON array.")
+        except ValidationError as e:
+            raise gr.Error(f"Schema validation failed on JSON output:\n{e}")
 
     # Flatten to a row per metric for the table/CSV
     rows = []
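As a quick, illustrative check of the new _unfence_json helper (the sample strings below are made up; they are not part of the commit), both failure modes it targets, fenced output and JSON wrapped in prose, reduce to the same raw array:

# Illustrative check of _unfence_json; the inputs are hypothetical model replies.
fenced = "```json\n[{\"metrics\": []}]\n```"
prose = "Sure, here are the results:\n[{\"metrics\": []}]\nLet me know!"

assert _unfence_json(fenced) == '[{"metrics": []}]'
assert _unfence_json(prose) == '[{"metrics": []}]'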
 
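And a minimal sketch of the reworked fallback branch, assuming a stubbed response whose parsed attribute is empty. FakeResponse and the metric payload are hypothetical; the real object comes from the Gemini SDK, and the snippet relies on app.py's own json import and _unfence_json:

# Hypothetical stub tracing the raw-text fallback path added in this commit.
class FakeResponse:
    parsed = None  # empty, so the code falls back to response.text
    text = "```json\n[{\"metrics\": [{\"label\": \"mAP@50\", \"value\": 61.3}]}]\n```"

resp = FakeResponse()
data = json.loads(_unfence_json(resp.text or "[]"))
assert isinstance(data, list)  # each item is then validated as PageMetrics
assert data[0]["metrics"][0]["value"] == 61.3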