Yacine Jernite committed · Commit 36de078 · 1 Parent(s): d6d8868
added TLDR functionality
Files changed:
- analysis_utils.py +684 -0
- app.py +548 -349
- llm_interface.py +1 -0
- utils.py +86 -34
analysis_utils.py ADDED
@@ -0,0 +1,684 @@
```python
import json  # Added for TLDR JSON parsing
import logging
import os
import tempfile

from huggingface_hub import HfApi
from huggingface_hub.inference._generated.types import \
    ChatCompletionOutput  # Added for type hinting

# Imports from other project modules
from llm_interface import (ERROR_503_DICT, parse_qwen_response,
                           query_qwen_endpoint)
from prompts import format_privacy_prompt, format_summary_highlights_prompt
from utils import (PRIVACY_FILENAME,  # Import constants for filenames
                   SUMMARY_FILENAME, TLDR_FILENAME, check_report_exists,
                   download_cached_reports, get_space_code_files)

# Configure logging (can inherit from app.py if called from there, but good practice)
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Load environment variables - redundant if always called by app.py which already loads them
# load_dotenv()

# Constants needed by helper functions (can be passed as args too)
# Consider passing these from app.py if they might change or for clarity
CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""

# --- Constants for TLDR Generation ---
TLDR_SYSTEM_PROMPT = (
    "You are an AI assistant specialized in summarizing privacy analysis reports for Hugging Face Spaces. "
    "You will receive two reports: a detailed privacy analysis and a summary/highlights report. "
    "Based **only** on the content of these two reports, generate a concise JSON object containing a structured TLDR (Too Long; Didn't Read). "
    "Do not use any information not present in the provided reports. "
    "The JSON object must have the following keys:\n"
    '- "app_description": A 1-2 sentence summary of what the application does from a user\'s perspective.\n'
    '- "privacy_tldr": A 2-3 sentence high-level overview of privacy. Mention if the analysis was conclusive based on available code, if data processing is local, or if/what data goes to external services.\n'
    '- "data_types": A list of JSON objects, where each object has two keys: \'name\' (a short, unique identifier string for the data type, e.g., "User Text") and \'description\' (a brief string explaining the data type in context, max 6-8 words, e.g., "Text prompt entered by the user").\n'
    "- \"user_input_data\": A list of strings, where each string is the 'name' of a data type defined in 'data_types' that is provided by the user to the app.\n"
    "- \"local_processing\": A list of strings describing data processed locally. Each string should start with the 'name' of a data type defined in 'data_types', followed by details (like the processing model) in parentheses if mentioned in the reports. Example: \"User Text (Local Model XYZ)\".\n"
    "- \"remote_processing\": A list of strings describing data sent to remote services. Each string should start with the 'name' of a data type defined in 'data_types', followed by the service/model name in parentheses if mentioned in the reports. Example: \"User Text (HF Inference API)\".\n"
    "- \"external_logging\": A list of strings describing data logged or saved externally. Each string should start with the 'name' of a data type defined in 'data_types', followed by the location/service in parentheses if mentioned. Example: \"User Text (External DB)\".\n"
    "Ensure the output is **only** a valid JSON object, starting with `{` and ending with `}`. Ensure all listed data types in the processing/logging lists exactly match a 'name' defined in the 'data_types' list."
)

# --- Analysis Pipeline Helper Functions ---
```
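For reference, an object satisfying this schema would look like the following; this is an invented illustration (only the key structure is prescribed by TLDR_SYSTEM_PROMPT, the values are hypothetical):

```python
# Invented example of a TLDR object matching the schema above. Every entry
# in the processing/logging lists starts with a 'name' defined in
# "data_types", as the prompt requires.
EXAMPLE_TLDR = {
    "app_description": "Lets users chat with a code-generation model.",
    "privacy_tldr": (
        "Analysis was conclusive from the available code. User prompts are "
        "sent to a remote inference endpoint; nothing is logged externally."
    ),
    "data_types": [
        {"name": "User Text", "description": "Text prompt entered by the user"},
    ],
    "user_input_data": ["User Text"],
    "local_processing": [],
    "remote_processing": ["User Text (HF Inference API)"],
    "external_logging": [],
}
```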
```python
def check_cache_and_download(space_id: str, dataset_id: str, hf_token: str | None):
    """Checks cache and downloads if reports exist."""
    logging.info(f"Checking cache for '{space_id}'...")
    found_in_cache = False
    if hf_token:
        try:
            found_in_cache = check_report_exists(space_id, dataset_id, hf_token)
        except Exception as e:
            logging.warning(f"Cache check failed for {space_id}: {e}. Proceeding.")
            # Return cache_miss even if check failed, proceed to live analysis
            return {"status": "cache_miss", "error_message": f"Cache check failed: {e}"}

    if found_in_cache:
        logging.info(f"Cache hit for {space_id}. Downloading.")
        try:
            cached_reports = download_cached_reports(space_id, dataset_id, hf_token)
            summary_report = (
                cached_reports.get("summary", "Error: Cached summary not found.")
                + CACHE_INFO_MSG
            )
            privacy_report = (
                cached_reports.get("privacy", "Error: Cached privacy report not found.")
                + CACHE_INFO_MSG
            )
            logging.info(f"Successfully downloaded cached reports for {space_id}.")
            return {
                "status": "cache_hit",
                "summary": summary_report,
                "privacy": privacy_report,
                "tldr_json_str": cached_reports.get("tldr_json_str"),
            }
        except Exception as e:
            error_msg = f"Cache download failed for {space_id}: {e}"
            logging.warning(f"{error_msg}. Proceeding with live analysis.")
            # Return error, but let caller decide if live analysis proceeds
            return {"status": "cache_error", "ui_message": error_msg}
    else:
        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
        return {"status": "cache_miss"}
```
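The pipeline helpers communicate through small status dictionaries rather than exceptions, so a caller branches on the "status" key. A minimal sketch of that contract, assuming `space_id`, `dataset_id`, and `hf_token` are already in scope (app.py does the same thing with Gradio updates interleaved):

```python
result = check_cache_and_download(space_id, dataset_id, hf_token)
if result["status"] == "cache_hit":
    # Cached Markdown is ready to display as-is (CACHE_INFO_MSG appended).
    summary_md, privacy_md = result["summary"], result["privacy"]
elif result["status"] == "cache_error":
    # Download failed after a positive existence check; the caller may
    # still fall through to live analysis.
    logging.warning(result["ui_message"])
else:
    # "cache_miss": proceed to the live pipeline.
    pass
```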
```python
def check_endpoint_status(
    endpoint_name: str, hf_token: str | None, error_503_user_message: str
):
    """Checks the status of the inference endpoint."""
    logging.info(f"Checking endpoint status for '{endpoint_name}'...")
    if not hf_token:
        # Allow proceeding if token missing, maybe endpoint is public
        logging.warning("HF_TOKEN not set, cannot check endpoint status definitively.")
        return {"status": "ready", "warning": "HF_TOKEN not set"}

    try:
        api = HfApi(token=hf_token)
        endpoint = api.get_inference_endpoint(name=endpoint_name)
        status = endpoint.status
        logging.info(f"Endpoint '{endpoint_name}' status: {status}")

        if status == "running":
            return {"status": "ready"}
        else:
            logging.warning(
                f"Endpoint '{endpoint_name}' is not ready (Status: {status})."
            )
            if status == "scaledToZero":
                logging.info(
                    f"Endpoint '{endpoint_name}' is scaled to zero. Attempting to resume..."
                )
                try:
                    endpoint.resume()
                    # Still return an error message suggesting retry, as resume takes time
                    # Keep this message concise as the action is specific (wait)
                    msg = f"**Endpoint Resuming:** The analysis endpoint ('{endpoint_name}') was scaled to zero and is now restarting.\n\n{error_503_user_message}"
                    return {"status": "error", "ui_message": msg}
                except Exception as resume_error:
                    # Resume failed, provide detailed message
                    logging.error(
                        f"Failed to resume endpoint {endpoint_name}: {resume_error}"
                    )
                    # Construct detailed message including full explanation
                    msg = f"**Endpoint Issue:** The analysis endpoint ('{endpoint_name}') is currently {status} and an attempt to resume it failed ({resume_error}).\n\n{error_503_user_message}"
                    return {"status": "error", "ui_message": msg}
            else:  # Paused, failed, pending etc.
                # Construct detailed message including full explanation
                msg = f"**Endpoint Issue:** The analysis endpoint ('{endpoint_name}') status is currently <span style='color:red'>**{status}**</span>.\n\n{error_503_user_message}"
                return {"status": "error", "ui_message": msg}

    except Exception as e:
        error_msg = f"Error checking analysis endpoint status for {endpoint_name}: {e}"
        logging.error(error_msg)
        # Let analysis stop if endpoint check fails critically
        return {"status": "error", "ui_message": f"Error checking endpoint status: {e}"}
```
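Callers are expected to gate the rest of the pipeline on the returned status. A minimal sketch, assuming ENDPOINT_NAME, HF_TOKEN, and ERROR_503_USER_MESSAGE are defined as in app.py; the raise here is illustrative, the app surfaces the ui_message in the UI instead:

```python
endpoint_state = check_endpoint_status(ENDPOINT_NAME, HF_TOKEN, ERROR_503_USER_MESSAGE)
if endpoint_state["status"] != "ready":
    # A scaled-to-zero endpoint has already been asked to resume;
    # the ui_message tells the user to wait and retry.
    raise RuntimeError(endpoint_state["ui_message"])
```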
```python
def fetch_and_validate_code(space_id: str):
    """Fetches and validates code files for the space."""
    logging.info(f"Fetching code files for {space_id}...")
    code_files = get_space_code_files(space_id)
    if not code_files:
        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
        logging.warning(error_msg)
        return {
            "status": "error",
            "ui_message": f"**Error:**\n{error_msg}\nAnalysis Canceled.",
        }
    logging.info(f"Successfully fetched {len(code_files)} files for {space_id}.")
    return {"status": "success", "code_files": code_files}


def generate_detailed_report(
    space_id: str, code_files: dict, error_503_user_message: str
):
    """Generates the detailed privacy report using the LLM."""
    logging.info("Generating detailed privacy analysis report...")
    privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
        space_id, code_files
    )

    privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)

    if privacy_api_response == ERROR_503_DICT:
        logging.warning("LLM Call 1 (Privacy) failed with 503.")
        return {"status": "error", "ui_message": error_503_user_message}

    detailed_privacy_report = parse_qwen_response(privacy_api_response)

    if "Error:" in detailed_privacy_report:
        error_msg = (
            f"Failed to generate detailed privacy report: {detailed_privacy_report}"
        )
        logging.error(error_msg)
        return {
            "status": "error",
            "ui_message": f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}\nAnalysis Halted.",
        }

    if privacy_truncated:
        detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report

    logging.info("Successfully generated detailed privacy report.")
    return {
        "status": "success",
        "report": detailed_privacy_report,
        "truncated": privacy_truncated,
    }


def generate_summary_report(
    space_id: str,
    code_files: dict,
    detailed_privacy_report: str,
    error_503_user_message: str,
):
    """Generates the summary & highlights report using the LLM."""
    logging.info("Generating summary and highlights report...")
    # Remove potential truncation warning from detailed report before sending to next LLM
    clean_detailed_report = detailed_privacy_report.replace(TRUNCATION_WARNING, "")

    summary_highlights_prompt_messages, summary_truncated = (
        format_summary_highlights_prompt(space_id, code_files, clean_detailed_report)
    )

    summary_highlights_api_response = query_qwen_endpoint(
        summary_highlights_prompt_messages, max_tokens=2048
    )

    if summary_highlights_api_response == ERROR_503_DICT:
        logging.warning("LLM Call 2 (Summary) failed with 503.")
        # Return specific status to indicate partial success
        return {"status": "error_503_summary", "ui_message": error_503_user_message}

    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)

    if "Error:" in summary_highlights_report:
        error_msg = (
            f"Failed to generate summary/highlights report: {summary_highlights_report}"
        )
        logging.error(error_msg)
        # Return specific status to indicate partial success
        return {
            "status": "error_summary",
            "ui_message": f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
        }

    if summary_truncated:
        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report

    logging.info("Successfully generated summary & highlights report.")
    return {
        "status": "success",
        "report": summary_highlights_report,
        "truncated": summary_truncated,
    }
```
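The two LLM calls chain: the detailed report is both displayed and fed into the summary prompt, and the distinct "error_503_summary" / "error_summary" statuses let a caller treat a failed second call as partial success. A sketch under the same assumptions as above:

```python
detailed = generate_detailed_report(space_id, code_files, ERROR_503_USER_MESSAGE)
if detailed["status"] == "success":
    summary = generate_summary_report(
        space_id, code_files, detailed["report"], ERROR_503_USER_MESSAGE
    )
    if summary["status"] in ("error_503_summary", "error_summary"):
        # Partial success: the detailed report exists and can be shown alone.
        logging.warning(summary["ui_message"])
```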
```python
def upload_results(
    space_id: str,
    summary_report: str,
    detailed_report: str,
    dataset_id: str,
    hf_token: str | None,
    tldr_json_data: dict | None = None,
):
    """Uploads the generated reports (Markdown and optional JSON TLDR) to the specified dataset repository."""
    if not hf_token:
        logging.warning("HF Token not provided, skipping dataset report upload.")
        return {"status": "skipped", "reason": "HF_TOKEN not set"}
    if "Error:" in detailed_report or "Error:" in summary_report:
        msg = "Skipping cache upload due to errors in generated reports."
        logging.warning(msg)
        return {"status": "skipped", "reason": msg}

    safe_space_id = space_id.replace("..", "")

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Define local paths
            summary_path_local = os.path.join(tmpdir, SUMMARY_FILENAME)
            privacy_path_local = os.path.join(tmpdir, PRIVACY_FILENAME)
            tldr_json_path_local = os.path.join(tmpdir, TLDR_FILENAME)

            # Write Markdown reports
            with open(summary_path_local, "w", encoding="utf-8") as f:
                f.write(summary_report)
            with open(privacy_path_local, "w", encoding="utf-8") as f:
                f.write(detailed_report)

            # Prepare commit message
            commit_message = f"Add analysis reports for Space: {safe_space_id}"
            if tldr_json_data:
                commit_message += " (including TLDR JSON)"
                # Write JSON TLDR data if available
                try:
                    with open(tldr_json_path_local, "w", encoding="utf-8") as f:
                        json.dump(tldr_json_data, f, indent=2, ensure_ascii=False)
                    logging.info(
                        f"Successfully wrote TLDR JSON locally for {safe_space_id}."
                    )
                except Exception as json_err:
                    logging.error(
                        f"Failed to write TLDR JSON locally for {safe_space_id}: {json_err}"
                    )
                    tldr_json_data = None  # Prevent upload attempt if writing failed

            # Ensure repo exists
            api = HfApi(token=hf_token)
            repo_url = api.create_repo(
                repo_id=dataset_id,
                repo_type="dataset",
                exist_ok=True,
            )
            logging.info(f"Ensured dataset repo {repo_url} exists.")

            # Upload summary report
            api.upload_file(
                path_or_fileobj=summary_path_local,
                path_in_repo=f"{safe_space_id}/{SUMMARY_FILENAME}",
                repo_id=dataset_id,
                repo_type="dataset",
                commit_message=commit_message,
            )
            logging.info(f"Successfully uploaded summary report for {safe_space_id}.")

            # Upload privacy report
            api.upload_file(
                path_or_fileobj=privacy_path_local,
                path_in_repo=f"{safe_space_id}/{PRIVACY_FILENAME}",
                repo_id=dataset_id,
                repo_type="dataset",
                commit_message=commit_message,
            )
            logging.info(
                f"Successfully uploaded detailed privacy report for {safe_space_id}."
            )
            # print(f"Successfully uploaded detailed privacy report for {safe_space_id}.") # Keep if needed for debug

            # Upload JSON TLDR if it was successfully written locally
            if tldr_json_data and os.path.exists(tldr_json_path_local):
                api.upload_file(
                    path_or_fileobj=tldr_json_path_local,
                    path_in_repo=f"{safe_space_id}/{TLDR_FILENAME}",
                    repo_id=dataset_id,
                    repo_type="dataset",
                    commit_message=commit_message,  # Can reuse commit message or make specific
                )
                logging.info(f"Successfully uploaded TLDR JSON for {safe_space_id}.")
                print(f"Successfully uploaded TLDR JSON for {safe_space_id}.")

        # Return success if all uploads finished without error
        return {"status": "success"}

    except Exception as e:
        error_msg = f"Non-critical error during report upload for {safe_space_id}: {e}"
        logging.error(error_msg)
        print(error_msg)
        return {"status": "error", "message": error_msg}
```
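A hypothetical invocation; the Space ID is a placeholder, and the uploaded files land under `{safe_space_id}/` in the dataset repo using the SUMMARY_FILENAME, PRIVACY_FILENAME, and TLDR_FILENAME constants imported from utils:

```python
upload_status = upload_results(
    space_id="some-user/some-space",  # placeholder Space ID
    summary_report=summary["report"],
    detailed_report=detailed["report"],
    dataset_id="yjernite/spaces-privacy-reports",
    hf_token=os.getenv("HF_TOKEN"),
    tldr_json_data=tldr_data,  # optional; the Markdown reports upload regardless
)
if upload_status["status"] != "success":
    logging.warning(f"Report upload skipped or failed: {upload_status}")
```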
```python
# --- New TLDR Generation Functions ---


def format_tldr_prompt(
    detailed_report: str, summary_report: str
) -> list[dict[str, str]]:
    """Formats the prompt for the TLDR generation task."""
    # Clean potential cache/truncation markers from input reports for the LLM
    cleaned_detailed = detailed_report.replace(CACHE_INFO_MSG, "").replace(
        TRUNCATION_WARNING, ""
    )
    cleaned_summary = summary_report.replace(CACHE_INFO_MSG, "").replace(
        TRUNCATION_WARNING, ""
    )

    user_content = (
        "Please generate a structured JSON TLDR based on the following reports:\n\n"
        "--- DETAILED PRIVACY ANALYSIS REPORT START ---\n"
        f"{cleaned_detailed}\n"
        "--- DETAILED PRIVACY ANALYSIS REPORT END ---\n\n"
        "--- SUMMARY & HIGHLIGHTS REPORT START ---\n"
        f"{cleaned_summary}\n"
        "--- SUMMARY & HIGHLIGHTS REPORT END ---"
    )

    # Note: We are not handling truncation here, assuming the input reports
    # are already reasonably sized from the previous steps.
    # If reports could be extremely long, add truncation logic similar to other format_* functions.

    messages = [
        {"role": "system", "content": TLDR_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    return messages


def parse_tldr_json_response(
    response: ChatCompletionOutput | dict | None,
) -> dict | None:
    """Parses the LLM response, expecting JSON content for the TLDR."""
    if response is None:
        logging.error("TLDR Generation: Failed to get response from LLM.")
        return None

    # Check for 503 error dict first
    if isinstance(response, dict) and response.get("error_type") == "503":
        logging.error(f"TLDR Generation: Received 503 error: {response.get('message')}")
        return None  # Treat 503 as failure for this specific task

    # --- Direct Content Extraction (Replaces call to parse_qwen_response) ---
    raw_content = ""
    try:
        # Check if it's likely the expected ChatCompletionOutput structure
        if not hasattr(response, "choices"):
            logging.error(
                f"TLDR Generation: Unexpected response type received: {type(response)}. Content: {response}"
            )
            return None  # Return None if not the expected structure

        # Access the generated content according to the ChatCompletionOutput structure
        if response.choices and len(response.choices) > 0:
            content = response.choices[0].message.content
            if content:
                raw_content = content.strip()
                logging.info(
                    "TLDR Generation: Successfully extracted raw content from response."
                )
            else:
                logging.warning(
                    "TLDR Generation: Response received, but content is empty."
                )
                return None
        else:
            logging.warning("TLDR Generation: Response received, but no choices found.")
            return None
    except AttributeError as e:
        # This might catch cases where response looks like the object but lacks expected attributes
        logging.error(
            f"TLDR Generation: Attribute error parsing response object: {e}. Response structure might be unexpected. Response: {response}"
        )
        return None
    except Exception as e:
        logging.error(
            f"TLDR Generation: Unexpected error extracting content from response object: {e}"
        )
        return None
    # --- End Direct Content Extraction ---

    # --- JSON Parsing Logic ---
    if not raw_content:  # Should be caught by checks above, but belts and suspenders
        logging.error("TLDR Generation: Raw content is empty after extraction attempt.")
        return None

    try:
        # Clean potential markdown code block formatting
        if raw_content.strip().startswith("```json"):
            raw_content = raw_content.strip()[7:-3].strip()
        elif raw_content.strip().startswith("```"):
            raw_content = raw_content.strip()[3:-3].strip()

        tldr_data = json.loads(raw_content)

        # Validate structure: Check if it's a dict and has all required keys
        required_keys = [
            "app_description",
            "privacy_tldr",
            "data_types",
            "user_input_data",
            "local_processing",
            "remote_processing",
            "external_logging",
        ]
        if not isinstance(tldr_data, dict):
            logging.error(
                f"TLDR Generation: Parsed content is not a dictionary. Content: {raw_content[:500]}..."
            )
            return None
        if not all(key in tldr_data for key in required_keys):
            missing_keys = [key for key in required_keys if key not in tldr_data]
            logging.error(
                f"TLDR Generation: Parsed JSON is missing required keys: {missing_keys}. Content: {raw_content[:500]}..."
            )
            return None

        # --- Add validation for the new data_types structure ---
        data_types_list = tldr_data.get("data_types")
        if not isinstance(data_types_list, list):
            logging.error(
                f"TLDR Generation: 'data_types' is not a list. Content: {data_types_list}"
            )
            return None
        for item in data_types_list:
            if (
                not isinstance(item, dict)
                or "name" not in item
                or "description" not in item
            ):
                logging.error(
                    f"TLDR Generation: Invalid item found in 'data_types' list: {item}. Must be dict with 'name' and 'description'."
                )
                return None
            if not isinstance(item["name"], str) or not isinstance(
                item["description"], str
            ):
                logging.error(
                    f"TLDR Generation: Invalid types for name/description in 'data_types' item: {item}. Must be strings."
                )
                return None
        # --- End validation for data_types ---

        # Basic validation for other lists (should contain strings)
        validation_passed = True
        for key in [
            "user_input_data",
            "local_processing",
            "remote_processing",
            "external_logging",
        ]:
            data_list = tldr_data.get(key)
            # Add more detailed check and logging
            if not isinstance(data_list, list):
                logging.error(
                    f"TLDR Generation Validation Error: Key '{key}' is not a list. Found type: {type(data_list)}, Value: {data_list}"
                )
                validation_passed = False
                # Allow continuing validation for other keys, but mark as failed
            elif not all(isinstance(x, str) for x in data_list):
                # This check might be too strict if LLM includes non-strings, but keep for now
                logging.warning(
                    f"TLDR Generation Validation Warning: Not all items in list '{key}' are strings. Content: {data_list}"
                )
                # Decide if this should cause failure - currently it doesn't, just warns

        if not validation_passed:
            logging.error(
                "TLDR Generation: Validation failed due to incorrect list types."
            )
            return None  # Ensure failure if any key wasn't a list

        logging.info("Successfully parsed and validated TLDR JSON response.")
        return tldr_data

    except json.JSONDecodeError as e:
        logging.error(
            f"TLDR Generation: Failed to decode JSON response: {e}. Content: {raw_content[:500]}..."
        )
        return None
    except Exception as e:
        logging.error(f"TLDR Generation: Unexpected error parsing JSON response: {e}")
        return None
```
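One quick way to sanity-check the fence-stripping and validation path is to feed the parser a minimal stand-in object; SimpleNamespace serves here as a hypothetical substitute for a real ChatCompletionOutput:

```python
from types import SimpleNamespace

# Minimal object exposing .choices[0].message.content, as the parser expects.
fake_response = SimpleNamespace(
    choices=[
        SimpleNamespace(
            message=SimpleNamespace(
                content='```json\n{"app_description": "x", "privacy_tldr": "x",'
                ' "data_types": [], "user_input_data": [], "local_processing": [],'
                ' "remote_processing": [], "external_logging": []}\n```'
            )
        )
    ]
)
assert parse_tldr_json_response(fake_response) is not None
```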
```python
def render_tldr_markdown(tldr_data: dict | None, space_id: str | None = None) -> str:
    """Renders the top-level TLDR (description, privacy) data into a Markdown string.

    (Does not include the data lists)
    """
    if not tldr_data:
        # Return a more specific message for this part
        return "*TLDR Summary could not be generated.*\n"

    output = []

    # Add Space link if space_id is provided
    if space_id:
        output.append(
            f"**Source Space:** [`{space_id}`](https://huggingface.co/spaces/{space_id})\n"
        )

    output.append(f"**App Description:** {tldr_data.get('app_description', 'N/A')}\n")
    privacy_summary = tldr_data.get("privacy_tldr", "N/A")
    output.append(f"**Privacy TLDR:** {privacy_summary}")  # Removed extra newline

    # Removed data list rendering from this function

    return "\n".join(output)


def render_data_details_markdown(tldr_data: dict | None) -> str:
    """Renders the data lists (types, input, processing, logging) from TLDR data."""
    if not tldr_data:
        return "*Data details could not be generated.*\n"

    output = []
    # Get defined names for formatting
    defined_names = sorted(
        [
            dt.get("name", "")
            for dt in tldr_data.get("data_types", [])
            if dt.get("name")
        ],
        key=len,
        reverse=True,
    )

    output.append("**Data Types Defined:**")  # Renamed slightly for clarity
    data_types = tldr_data.get("data_types")
    if data_types and isinstance(data_types, list):
        if not data_types:
            output.append("- None identified.")
        else:
            for item in data_types:
                name = item.get("name", "Unnamed")
                desc = item.get("description", "No description")
                output.append(f"- `{name}`: {desc}")
    else:
        output.append("- (Error loading data types)")
    output.append("")  # Add newline for spacing

    # Reusable helper for rendering lists
    def render_list(title, key):
        output.append(f"**{title}:**")
        data_list = tldr_data.get(key)
        if isinstance(data_list, list):
            if not data_list:
                output.append("- None identified.")
            else:
                for item_str in data_list:
                    formatted_item = item_str  # Default
                    found_match = False
                    for name in defined_names:
                        if item_str == name:
                            formatted_item = f"`{name}`"
                            found_match = True
                            break
                        elif item_str.startswith(name + " "):
                            formatted_item = f"`{name}`{item_str[len(name):]}"
                            found_match = True
                            break
                    if (
                        not found_match
                        and " " not in item_str
                        and not item_str.startswith("`")
                    ):
                        formatted_item = f"`{item_str}`"
                    output.append(f"- {formatted_item}")
        else:
            output.append("- (Error loading list)")
        output.append("")

    render_list("Data Sent by User to App", "user_input_data")
    render_list("Data Processed Locally within App", "local_processing")
    render_list("Data Processed Remotely", "remote_processing")
    render_list("Data Logged/Saved Externally", "external_logging")

    # Remove the last empty line
    if output and output[-1] == "":
        output.pop()

    return "\n".join(output)


# --- Combined TLDR Generation Function ---


def generate_and_parse_tldr(detailed_report: str, summary_report: str) -> dict | None:
    """Formats prompt, queries LLM, and parses JSON response for TLDR.

    Args:
        detailed_report: The detailed privacy report content.
        summary_report: The summary & highlights report content.

    Returns:
        A dictionary with the parsed TLDR data, or None if any step fails.
    """
    logging.info("Starting TLDR generation and parsing...")
    try:
        # Format
        tldr_prompt_messages = format_tldr_prompt(detailed_report, summary_report)
        if not tldr_prompt_messages:
            logging.error("TLDR Generation: Failed to format prompt.")
            return None

        # Query (using existing import within analysis_utils)
        # Use slightly smaller max_tokens
        llm_response = query_qwen_endpoint(tldr_prompt_messages, max_tokens=1024)
        if llm_response is None:  # Check if query itself failed critically
            logging.error("TLDR Generation: LLM query returned None.")
            return None
        # 503 handled within parse function below

        # Parse
        parsed_data = parse_tldr_json_response(llm_response)
        if parsed_data:
            logging.info("Successfully generated and parsed TLDR.")
            return parsed_data
        else:
            logging.error("TLDR Generation: Failed to parse JSON response.")
            return None

    except Exception as e:
        logging.error(
            f"TLDR Generation: Unexpected error in generate_and_parse_tldr: {e}",
            exc_info=True,
        )
        return None
```
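Taken together, the helpers compose into a linear pipeline. A minimal sketch of that composition (the parameters stand in for the module-level constants app.py passes; the real caller, `_run_live_analysis`, is a generator that yields Gradio updates between these steps):

```python
def analyze_space(space_id: str, dataset_id: str, endpoint_name: str,
                  hf_token: str | None, err_503_msg: str) -> dict | None:
    """Chain the helpers end to end; returns a report dict or None on failure."""
    cached = check_cache_and_download(space_id, dataset_id, hf_token)
    if cached["status"] == "cache_hit":
        return cached

    if check_endpoint_status(endpoint_name, hf_token, err_503_msg)["status"] != "ready":
        return None
    fetched = fetch_and_validate_code(space_id)
    if fetched["status"] != "success":
        return None

    detailed = generate_detailed_report(space_id, fetched["code_files"], err_503_msg)
    if detailed["status"] != "success":
        return None
    summary = generate_summary_report(
        space_id, fetched["code_files"], detailed["report"], err_503_msg
    )
    if summary["status"] != "success":
        return None

    tldr = generate_and_parse_tldr(detailed["report"], summary["report"])  # may be None
    upload_results(space_id, summary["report"], detailed["report"],
                   dataset_id, hf_token, tldr_json_data=tldr)
    return {"summary": summary["report"], "privacy": detailed["report"], "tldr": tldr}
```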
app.py CHANGED
|
@@ -1,25 +1,37 @@
|
|
|
|
|
| 1 |
import logging
|
| 2 |
import os
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
from dotenv import load_dotenv
|
| 6 |
-
|
| 7 |
from huggingface_hub import HfApi
|
| 8 |
|
| 9 |
-
|
| 10 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
# Updated prompt imports for new order
|
| 13 |
-
from prompts import format_privacy_prompt, format_summary_highlights_prompt
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
from utils import (
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
)
|
| 23 |
|
| 24 |
# Configure logging
|
| 25 |
logging.basicConfig(
|
|
@@ -34,10 +46,13 @@ load_dotenv()
|
|
| 34 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 35 |
ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
|
| 36 |
DATASET_ID = "yjernite/spaces-privacy-reports"
|
| 37 |
-
CACHE_INFO_MSG =
|
|
|
|
|
|
|
| 38 |
DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
|
| 39 |
|
| 40 |
-
TRUNCATION_WARNING
|
|
|
|
| 41 |
|
| 42 |
ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
|
| 43 |
|
|
@@ -49,419 +64,582 @@ You have a few options:
|
|
| 49 |
"""
|
| 50 |
|
| 51 |
|
| 52 |
-
def
|
| 53 |
-
selected_cached_space: str | None,
|
| 54 |
-
new_space_id: str | None,
|
| 55 |
-
progress=gr.Progress(track_tqdm=True),
|
| 56 |
-
):
|
| 57 |
"""
|
| 58 |
-
|
| 59 |
-
Handles the logic based on Dropdown and Textbox inputs.
|
| 60 |
Yields tuples of Gradio updates.
|
| 61 |
"""
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
|
| 67 |
-
target_space_id = new_space_id.strip()
|
| 68 |
-
if target_space_id == selected_cached_space:
|
| 69 |
-
source = "dropdown_match" # User typed ID that exists in dropdown
|
| 70 |
-
else:
|
| 71 |
-
source = "new"
|
| 72 |
-
elif selected_cached_space:
|
| 73 |
-
target_space_id = selected_cached_space
|
| 74 |
-
source = "dropdown"
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
gr.update(
|
| 92 |
-
value=f"
|
| 93 |
visible=True,
|
| 94 |
),
|
| 95 |
-
gr.update(value="", visible=False),
|
| 96 |
-
gr.update(visible=True, open=True),
|
| 97 |
-
gr.update(visible=False),
|
| 98 |
-
)
|
| 99 |
-
|
| 100 |
-
logging.info(f"Request received for: '{target_space_id}' (Source: {source})")
|
| 101 |
-
|
| 102 |
-
# --- Cache Handling ---
|
| 103 |
-
# If the user explicitly selected from the dropdown, try to fetch it directly.
|
| 104 |
-
if source == "dropdown":
|
| 105 |
-
progress(
|
| 106 |
-
0.1, desc="Fetching cached report..."
|
| 107 |
-
) # Simple progress for cache fetch
|
| 108 |
-
yield (
|
| 109 |
-
gr.update(value="Fetching selected cached report...", visible=True),
|
| 110 |
gr.update(value="", visible=True),
|
| 111 |
-
gr.update(visible=True, open=
|
|
|
|
| 112 |
gr.update(visible=True, open=False),
|
| 113 |
)
|
| 114 |
-
|
| 115 |
-
cached_reports = download_cached_reports(
|
| 116 |
-
target_space_id, DATASET_ID, HF_TOKEN
|
| 117 |
-
)
|
| 118 |
-
summary_report = (
|
| 119 |
-
cached_reports.get("summary", "Error: Cached summary not found.")
|
| 120 |
-
+ CACHE_INFO_MSG
|
| 121 |
-
)
|
| 122 |
-
privacy_report = (
|
| 123 |
-
cached_reports.get("privacy", "Error: Cached privacy report not found.")
|
| 124 |
-
+ CACHE_INFO_MSG
|
| 125 |
-
)
|
| 126 |
-
logging.info(
|
| 127 |
-
f"Successfully displayed cached reports for selected '{target_space_id}'."
|
| 128 |
-
)
|
| 129 |
-
progress(1.0, desc="Complete (from cache)")
|
| 130 |
-
yield (
|
| 131 |
-
gr.update(value=summary_report, visible=True),
|
| 132 |
-
gr.update(value=privacy_report, visible=True),
|
| 133 |
-
gr.update(visible=True, open=True),
|
| 134 |
-
gr.update(visible=True, open=True),
|
| 135 |
-
)
|
| 136 |
-
except Exception as e:
|
| 137 |
-
error_msg = f"Failed to download cached report for selected '{target_space_id}': {e}"
|
| 138 |
-
logging.error(error_msg)
|
| 139 |
-
progress(1.0, desc="Error")
|
| 140 |
-
yield (
|
| 141 |
-
gr.update(value=error_msg, visible=True),
|
| 142 |
-
gr.update(value="", visible=False),
|
| 143 |
-
gr.update(visible=True, open=True),
|
| 144 |
-
gr.update(visible=False),
|
| 145 |
-
)
|
| 146 |
-
|
| 147 |
-
# --- Live Analysis or Check Cache for New Input ---
|
| 148 |
-
# If it came from the textbox OR was a dropdown match, we first check cache, then run live.
|
| 149 |
-
else: # source == "new" or source == "dropdown_match"
|
| 150 |
-
# This generator now performs the full analysis if needed
|
| 151 |
-
# Yield intermediate updates from the generator
|
| 152 |
-
# Important: Need to use a loop to consume the generator
|
| 153 |
-
final_update = None
|
| 154 |
-
for update_tuple in _run_live_analysis(target_space_id, progress):
|
| 155 |
-
yield update_tuple
|
| 156 |
-
final_update = update_tuple # Keep track of the last update
|
| 157 |
-
yield final_update # Return the very last state
|
| 158 |
-
|
| 159 |
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
"""
|
| 166 |
-
steps = 8 # Steps for the full pipeline
|
| 167 |
-
privacy_truncated = False
|
| 168 |
-
summary_truncated = False
|
| 169 |
-
|
| 170 |
-
# --- Step 1: Check Cache --- (Check again for new/matched input)
|
| 171 |
-
progress(1 / steps, desc="Step 1/8: Checking cache...")
|
| 172 |
-
logging.info(f"Step 1/8: Checking cache for '{space_id}'...")
|
| 173 |
yield (
|
| 174 |
-
gr.update(value=
|
|
|
|
|
|
|
| 175 |
gr.update(value="", visible=True),
|
| 176 |
-
gr.update(visible=True, open=
|
|
|
|
| 177 |
gr.update(visible=True, open=False),
|
| 178 |
)
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
found_in_cache = check_report_exists(space_id, DATASET_ID, HF_TOKEN)
|
| 183 |
-
except Exception as e:
|
| 184 |
-
logging.warning(f"Cache check failed: {e}. Proceeding.")
|
| 185 |
-
yield (
|
| 186 |
-
gr.update(
|
| 187 |
-
value="Cache check failed, proceeding with live analysis...",
|
| 188 |
-
visible=True,
|
| 189 |
-
),
|
| 190 |
-
gr.update(value="", visible=True),
|
| 191 |
-
gr.update(visible=True, open=True),
|
| 192 |
-
gr.update(visible=True, open=False),
|
| 193 |
-
)
|
| 194 |
|
| 195 |
-
if
|
| 196 |
-
|
| 197 |
-
|
|
|
|
| 198 |
yield (
|
| 199 |
-
gr.update(value=
|
| 200 |
-
gr.update(value="", visible=
|
| 201 |
-
gr.update(
|
| 202 |
-
gr.update(
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
summary_report = (
|
| 207 |
-
cached_reports.get("summary", "Error: Cached summary not found.")
|
| 208 |
-
+ CACHE_INFO_MSG
|
| 209 |
-
)
|
| 210 |
-
privacy_report = (
|
| 211 |
-
cached_reports.get("privacy", "Error: Cached privacy report not found.")
|
| 212 |
-
+ CACHE_INFO_MSG
|
| 213 |
-
)
|
| 214 |
-
logging.info(f"Successfully displayed cached reports for {space_id}.")
|
| 215 |
-
progress(8 / steps, desc="Complete (from cache)")
|
| 216 |
-
yield (
|
| 217 |
-
gr.update(value=summary_report, visible=True),
|
| 218 |
-
gr.update(value=privacy_report, visible=True),
|
| 219 |
-
gr.update(visible=True, open=True),
|
| 220 |
-
gr.update(visible=True, open=True),
|
| 221 |
-
)
|
| 222 |
-
return # End generation here if cache successful
|
| 223 |
-
except Exception as e:
|
| 224 |
-
logging.warning(f"Cache download failed for {space_id}: {e}. Proceeding.")
|
| 225 |
-
yield (
|
| 226 |
-
gr.update(
|
| 227 |
-
value="Cache download failed, proceeding with live analysis...",
|
| 228 |
-
visible=True,
|
| 229 |
-
),
|
| 230 |
-
gr.update(value="", visible=True),
|
| 231 |
-
gr.update(visible=True, open=True),
|
| 232 |
-
gr.update(visible=True, open=False),
|
| 233 |
-
)
|
| 234 |
-
else:
|
| 235 |
-
logging.info(f"Cache miss for {space_id}. Performing live analysis.")
|
| 236 |
-
yield (
|
| 237 |
-
gr.update(value="Cache miss. Fetching code...", visible=True),
|
| 238 |
-
gr.update(value="", visible=True),
|
| 239 |
-
gr.update(visible=True, open=True),
|
| 240 |
-
gr.update(visible=True, open=False),
|
| 241 |
)
|
|
|
|
| 242 |
|
| 243 |
-
# --- Step
|
| 244 |
-
|
| 245 |
-
|
|
|
|
|
|
|
| 246 |
yield (
|
| 247 |
-
gr.update(value=
|
|
|
|
|
|
|
| 248 |
gr.update(value="", visible=True),
|
| 249 |
-
gr.update(visible=True, open=
|
|
|
|
| 250 |
gr.update(visible=True, open=False),
|
| 251 |
)
|
|
|
|
| 252 |
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
logging.info(f"Endpoint '{ENDPOINT_NAME}' status: {status}")
|
| 261 |
-
|
| 262 |
-
if status == 'running':
|
| 263 |
-
endpoint_ready = True
|
| 264 |
-
else:
|
| 265 |
-
logging.warning(f"Endpoint '{ENDPOINT_NAME}' is not ready (Status: {status}).")
|
| 266 |
-
if status == 'scaledToZero':
|
| 267 |
-
logging.info(f"Endpoint '{ENDPOINT_NAME}' is scaled to zero. Attempting to resume...")
|
| 268 |
-
endpoint.resume()
|
| 269 |
-
msg_503 = f"**Full Service Temporarily Unavailable**: but you can **browse existing reports** or **check back later!**\n\n The status of the Qwen2.5-Coder-32B-Instruct endpoint powering the analysis is currently: <span style='color:red'>**{status}**</span>\n\n" + ERROR_503_USER_MESSAGE
|
| 270 |
-
yield (
|
| 271 |
-
gr.update(value=msg_503, visible=True),
|
| 272 |
-
gr.update(value="", visible=False),
|
| 273 |
-
gr.update(visible=True, open=True),
|
| 274 |
-
gr.update(visible=False)
|
| 275 |
-
)
|
| 276 |
-
return # Stop analysis, user needs to retry
|
| 277 |
-
except Exception as e:
|
| 278 |
-
logging.error(f"Error checking endpoint status for {ENDPOINT_NAME}: {e}")
|
| 279 |
-
yield (
|
| 280 |
-
gr.update(value=f"Error checking analysis endpoint status: {e}", visible=True),
|
| 281 |
-
gr.update(value="", visible=False),
|
| 282 |
-
gr.update(visible=True, open=True),
|
| 283 |
-
gr.update(visible=False)
|
| 284 |
-
)
|
| 285 |
-
return # Stop analysis
|
| 286 |
-
|
| 287 |
-
# --- Step 3: Fetch Code Files (if not cached) ---
|
| 288 |
-
progress(3 / steps, desc="Step 3/8: Fetching code files...")
|
| 289 |
-
logging.info("Step 3/8: Fetching code files...")
|
| 290 |
-
code_files = get_space_code_files(space_id)
|
| 291 |
-
if not code_files:
|
| 292 |
-
error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
|
| 293 |
-
logging.warning(error_msg)
|
| 294 |
yield (
|
| 295 |
-
gr.update(value=
|
|
|
|
|
|
|
| 296 |
gr.update(value="Analysis Canceled", visible=True),
|
| 297 |
-
gr.update(visible=
|
|
|
|
| 298 |
gr.update(visible=True, open=False),
|
| 299 |
)
|
| 300 |
-
return
|
|
|
|
| 301 |
|
| 302 |
# --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
|
| 303 |
-
|
| 304 |
-
|
|
|
|
| 305 |
)
|
| 306 |
-
|
|
|
|
| 307 |
yield (
|
| 308 |
-
gr.update(value=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
gr.update(value="Generating detailed privacy report via AI...", visible=True),
|
| 310 |
-
gr.update(visible=True, open=
|
|
|
|
| 311 |
gr.update(visible=True, open=True),
|
| 312 |
)
|
| 313 |
-
|
| 314 |
-
space_id, code_files
|
| 315 |
)
|
| 316 |
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
yield (
|
| 322 |
-
gr.update(
|
| 323 |
-
|
| 324 |
-
),
|
| 325 |
-
gr.update(value="", visible=False),
|
| 326 |
-
gr.update(visible=
|
| 327 |
-
gr.update(visible=False),
|
|
|
|
| 328 |
)
|
| 329 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
yield (
|
| 338 |
-
gr.update(value=
|
|
|
|
| 339 |
gr.update(
|
| 340 |
-
value=f"
|
| 341 |
visible=True,
|
| 342 |
),
|
| 343 |
-
gr.update(
|
|
|
|
|
|
|
| 344 |
gr.update(visible=True, open=True),
|
| 345 |
)
|
| 346 |
-
return
|
| 347 |
-
|
| 348 |
-
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
yield (
|
| 351 |
-
gr.update(value=
|
| 352 |
-
gr.update(value=
|
| 353 |
-
gr.update(
|
|
|
|
|
|
|
|
|
|
| 354 |
gr.update(visible=True, open=True),
|
| 355 |
)
|
| 356 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
|
| 358 |
-
#
|
| 359 |
-
progress(5 / steps, desc="Step 5/8: Fetching model descriptions...")
|
| 360 |
-
logging.info("Step 5/8: Fetching model descriptions...")
|
| 361 |
yield (
|
| 362 |
-
gr.update(value=
|
|
|
|
| 363 |
gr.update(),
|
| 364 |
gr.update(),
|
|
|
|
|
|
|
| 365 |
gr.update(),
|
| 366 |
)
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
| 370 |
yield (
|
| 371 |
-
gr.update(value=
|
|
|
|
|
|
|
|
|
|
| 372 |
gr.update(),
|
| 373 |
gr.update(),
|
| 374 |
gr.update(),
|
| 375 |
)
|
| 376 |
-
|
| 377 |
-
|
|
|
|
| 378 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
|
| 380 |
-
#
|
| 381 |
-
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
)
|
| 384 |
-
|
| 385 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
yield (
|
|
|
|
|
|
|
| 387 |
gr.update(
|
| 388 |
-
value=
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
),
|
| 393 |
-
gr.update(visible=True, open=
|
| 394 |
-
gr.update(visible=
|
| 395 |
)
|
| 396 |
-
return
|
| 397 |
-
|
| 398 |
-
summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
|
| 399 |
|
| 400 |
-
if "
|
| 401 |
-
|
| 402 |
-
f"Failed to generate summary/highlights report: {summary_highlights_report}"
|
| 403 |
-
)
|
| 404 |
yield (
|
|
|
|
|
|
|
| 405 |
gr.update(
|
| 406 |
-
value=f"
|
| 407 |
visible=True,
|
| 408 |
),
|
| 409 |
-
gr.update(value=
|
| 410 |
-
gr.update(visible=True, open=
|
| 411 |
-
gr.update(visible=True, open=
|
|
|
|
| 412 |
)
|
| 413 |
-
return
|
| 414 |
-
if summary_truncated:
|
| 415 |
-
summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
|
| 416 |
|
| 417 |
-
|
| 418 |
-
yield (
|
| 419 |
-
gr.update(value=summary_highlights_report, visible=True),
|
| 420 |
-
gr.update(value=detailed_privacy_report, visible=True),
|
| 421 |
-
gr.update(visible=True, open=True),
|
| 422 |
-
gr.update(visible=True, open=True),
|
| 423 |
-
)
|
| 424 |
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
upload_reports_to_dataset(
|
| 442 |
-
space_id=space_id,
|
| 443 |
-
summary_report=summary_to_save,
|
| 444 |
-
detailed_report=privacy_to_save,
|
| 445 |
-
dataset_id=DATASET_ID,
|
| 446 |
-
hf_token=HF_TOKEN,
|
| 447 |
)
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
-
|
| 456 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
-
# ---
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
|
| 466 |
|
| 467 |
# --- Load Initial Data Function (for demo.load) ---
|
|
@@ -511,7 +689,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 511 |
with gr.Row():
|
| 512 |
with gr.Column(scale=1): # Left column for inputs
|
| 513 |
description_accordion = gr.Accordion(
|
| 514 |
-
"What Privacy Questions do 🤗 Spaces Raise? Click here for Demo Description 👇",
|
|
|
|
|
|
|
| 515 |
)
|
| 516 |
with description_accordion:
|
| 517 |
gr.Markdown(DESCRIPTION)
|
|
@@ -532,12 +712,28 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 532 |
analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)
|
| 533 |
|
| 534 |
with gr.Column(scale=1): # Right column for outputs
|
| 535 |
-
# Define
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
summary_accordion = gr.Accordion(
|
| 537 |
-
"Summary & Privacy Highlights",
|
|
|
|
|
|
|
| 538 |
)
|
| 539 |
privacy_accordion = gr.Accordion(
|
| 540 |
-
"Detailed Privacy Analysis Report",
|
|
|
|
|
|
|
| 541 |
)
|
| 542 |
with summary_accordion:
|
| 543 |
summary_markdown = gr.Markdown(
|
|
@@ -559,8 +755,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 559 |
fn=get_space_report_wrapper,
|
| 560 |
inputs=[cached_spaces_dropdown, space_id_input],
|
| 561 |
outputs=[
|
|
|
|
|
|
|
| 562 |
summary_markdown,
|
| 563 |
privacy_markdown,
|
|
|
|
| 564 |
summary_accordion,
|
| 565 |
privacy_accordion,
|
| 566 |
],
|
|
|
|
| 1 |
+
import json
|
| 2 |
import logging
|
| 3 |
import os
|
| 4 |
|
| 5 |
import gradio as gr
|
| 6 |
from dotenv import load_dotenv
|
|
|
|
| 7 |
from huggingface_hub import HfApi
|
| 8 |
|
| 9 |
+
# Import analysis pipeline helpers
|
| 10 |
+
from analysis_utils import (check_cache_and_download, check_endpoint_status,
|
| 11 |
+
fetch_and_validate_code, format_tldr_prompt,
|
| 12 |
+
generate_and_parse_tldr, generate_detailed_report,
|
| 13 |
+
generate_summary_report, parse_tldr_json_response,
|
| 14 |
+
render_data_details_markdown, render_tldr_markdown,
|
| 15 |
+
upload_results)
|
| 16 |
+
# Import general utils
|
| 17 |
+
from utils import list_cached_spaces # Added import
|
| 18 |
+
|
| 19 |
+
# Removed LLM interface imports, handled by analysis_utils
|
| 20 |
+
# from llm_interface import ERROR_503_DICT
|
| 21 |
+
# from llm_interface import parse_qwen_response, query_qwen_endpoint
|
| 22 |
+
|
| 23 |
+
# Removed prompts import, handled by analysis_utils
|
| 24 |
+
# from prompts import format_privacy_prompt, format_summary_highlights_prompt
|
| 25 |
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
|
| 28 |
+
# Removed specific utils imports now handled via analysis_utils
|
| 29 |
+
# from utils import (
|
| 30 |
+
# check_report_exists,
|
| 31 |
+
# download_cached_reports,
|
| 32 |
+
# get_space_code_files,
|
| 33 |
+
# upload_reports_to_dataset,
|
| 34 |
+
# )
|
| 35 |
|
| 36 |
# Configure logging
|
| 37 |
logging.basicConfig(
|
|
|
|
| 46 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 47 |
ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
|
| 48 |
DATASET_ID = "yjernite/spaces-privacy-reports"
|
| 49 |
+
CACHE_INFO_MSG = (
|
| 50 |
+
"\n\n*(Report retrieved from cache)*" # Still needed for dropdown cache hit message
|
| 51 |
+
)
|
| 52 |
DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
|
| 53 |
|
| 54 |
+
# TRUNCATION_WARNING now defined and used within analysis_utils
|
| 55 |
+
# TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
|
| 56 |
|
| 57 |
ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
|
| 58 |
|
|
|
|
| 64 |
"""
|
| 65 |
|
| 66 |
|
+def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
     """
+    Performs the full analysis pipeline using helper functions from analysis_utils.
     Yields tuples of Gradio updates.
     """
+    total_steps = 9  # Increased step count for TLDR generation
+    current_step = 0
+    summary_report = ""
+    privacy_report = ""
+    tldr_data = None
+    tldr_markdown_content = "*TLDR loading...*"
+    data_details_content = (
+        "*Data details loading...*"  # Default message for new component
+    )

+    # Initial message before first step
+    tldr_status_message = "*Starting analysis...*"
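The pipeline below leans entirely on Gradio's generator-handler pattern: a click handler may yield a tuple of gr.update(...) objects, one per component listed in outputs=, so the UI advances stage by stage. A minimal, self-contained sketch of the pattern (the component names here are illustrative, not taken from this app):

import gradio as gr

def staged_handler(space_id, progress=gr.Progress()):
    # Each yielded tuple is applied positionally to the components in `outputs=`.
    progress(0.5, desc="Working...")
    yield gr.update(value=f"*Analyzing {space_id}...*"), gr.update(visible=False)
    progress(1.0, desc="Done")
    yield gr.update(value=f"Report ready for {space_id}."), gr.update(visible=True)

with gr.Blocks() as sketch_demo:
    space_box = gr.Textbox(label="Space ID")
    status_md = gr.Markdown()
    report_md = gr.Markdown(visible=False)
    gr.Button("Run").click(staged_handler, inputs=space_box, outputs=[status_md, report_md])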

+    # --- Step 1: Check Cache ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Checking cache..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(value="*Checking cache...*", visible=True),
+        gr.update(value="Checking cache for existing reports...", visible=True),
+        gr.update(value="", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+    )
+    cache_result = check_cache_and_download(space_id, DATASET_ID, HF_TOKEN)
+
+    if cache_result["status"] == "cache_hit":
+        progress(total_steps / total_steps, desc="Complete (from cache)")
+        # Try to parse and render TLDR from cache
+        tldr_json_str = cache_result.get("tldr_json_str")
+        rendered_tldr = "*TLDR not found in cache.*"
+        rendered_data_details = "*Data details not found in cache.*"  # default so the yield below never sees an unbound name
+        if tldr_json_str:
+            try:
+                cached_tldr_data = json.loads(tldr_json_str)
+                # Render both parts
+                rendered_tldr = render_tldr_markdown(cached_tldr_data, space_id)
+                rendered_data_details = render_data_details_markdown(cached_tldr_data)
+            except Exception as parse_err:
+                logging.warning(
+                    f"Failed to parse cached TLDR JSON for {space_id}: {parse_err}"
+                )
+                rendered_tldr = "*Error parsing cached TLDR.*"
+                rendered_data_details = (
+                    "*Could not load data details due to parsing error.*"
+                )

+        yield (
+            gr.update(value=rendered_tldr, visible=True),
+            gr.update(value=rendered_data_details, visible=True),
+            gr.update(value=cache_result["summary"], visible=True),
+            gr.update(value=cache_result["privacy"], visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+        )
+        return  # End generation successfully from cache
+    elif cache_result["status"] == "cache_error":
+        # Display final error in TLDR field
+        tldr_status_message = (
+            f"*Cache download failed. {cache_result.get('ui_message', '')}*"
+        )
+        data_details_content = "*Data details unavailable due to cache error.*"
+        yield (
+            gr.update(value=tldr_status_message, visible=True),
+            gr.update(value=data_details_content, visible=True),
+            gr.update(value=cache_result["ui_message"], visible=True),
+            gr.update(value="", visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+        )
+        # Still continue to live analysis if cache download fails
+    elif cache_result["status"] == "cache_miss":
+        tldr_status_message = f"*{progress_desc} - Cache miss.*"  # Update status
+        data_details_content = "*Generating report...*"
+        yield (
+            gr.update(value=tldr_status_message, visible=True),
+            gr.update(value=data_details_content, visible=True),
+            gr.update(value="Cache miss. Starting live analysis...", visible=True),
+            gr.update(value="", visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+        )
+    elif "error_message" in cache_result:
+        # Display final error in TLDR field
+        tldr_status_message = (
+            f"*Cache check failed. {cache_result.get('error_message', '')}*"
+        )
+        data_details_content = "*Data details unavailable due to cache error.*"
+        yield (
+            gr.update(value=tldr_status_message, visible=True),
+            gr.update(value=data_details_content, visible=True),
             gr.update(
+                value=f"Cache check failed: {cache_result.get('error_message', 'Unknown error')}. Proceeding with live analysis...",
                 visible=True,
             ),
             gr.update(value="", visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
             gr.update(visible=True, open=False),
         )
+        # Still continue if cache check fails
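The branches above imply a small status-dict contract for check_cache_and_download. A hypothetical sketch of that contract, inferred only from how app.py consumes the result (the real implementation lives in analysis_utils.py and may differ):

def check_cache_and_download_sketch(space_id, dataset_id, hf_token):
    # "status" drives the UI branching above; the other keys are read conditionally.
    try:
        reports = download_cached_reports(space_id, dataset_id, hf_token)  # utils helper
        return {
            "status": "cache_hit",
            "summary": reports["summary"],
            "privacy": reports["privacy"],
            "tldr_json_str": reports.get("tldr_json_str"),
        }
    except FileNotFoundError:
        return {"status": "cache_miss"}
    except IOError as e:
        return {"status": "cache_error", "ui_message": f"Cache download failed: {e}"}
    except Exception as e:
        return {"status": "check_failed", "error_message": str(e)}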

+    # --- Step 2: Check Endpoint Status ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Checking endpoint..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Checking analysis model endpoint status...", visible=True),
         gr.update(value="", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=False),
     )
+    endpoint_result = check_endpoint_status(
+        ENDPOINT_NAME, HF_TOKEN, ERROR_503_USER_MESSAGE
+    )

+    if endpoint_result["status"] == "error":
+        progress(total_steps / total_steps, desc="Endpoint Error")
+        # Display final error in TLDR field
+        tldr_markdown_content = endpoint_result["ui_message"]
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
         )
+        return

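For context, a status probe like check_endpoint_status can be built on HfApi.get_inference_endpoint. This is a hedged sketch under that assumption, not the code shipped in analysis_utils.py:

from huggingface_hub import HfApi

def check_endpoint_status_sketch(endpoint_name, hf_token, error_503_message):
    try:
        endpoint = HfApi(token=hf_token).get_inference_endpoint(endpoint_name)
        if endpoint.status != "running":  # e.g. "scaledToZero" or "initializing"
            return {"status": "error", "ui_message": error_503_message}
        return {"status": "ok"}
    except Exception as e:
        return {"status": "error", "ui_message": f"Could not check endpoint: {e}"}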
+    # --- Step 3: Fetch Code Files ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Fetching code..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Fetching code files from the Space...", visible=True),
         gr.update(value="", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=False),
     )
+    code_result = fetch_and_validate_code(space_id)

+    if code_result["status"] == "error":
+        progress(total_steps / total_steps, desc="Code Fetch Error")
+        # Display final error in TLDR field
+        tldr_markdown_content = (
+            f"**Error:** {code_result.get('ui_message', 'Failed to fetch code.')}"
+        )
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
             gr.update(value="Analysis Canceled", visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False),
             gr.update(visible=True, open=False),
         )
+        return
+    code_files = code_result["code_files"]

     # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
+    current_step += 1
+    progress_desc = (
+        f"Step {current_step}/{total_steps}: Generating privacy report (AI Call 1)..."
     )
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(
+            value="Generating detailed privacy report (AI Call 1)...", visible=True
+        ),
         gr.update(value="Generating detailed privacy report via AI...", visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=True),
     )
+    privacy_result = generate_detailed_report(
+        space_id, code_files, ERROR_503_USER_MESSAGE
     )

+    if privacy_result["status"] == "error":
+        progress(total_steps / total_steps, desc="Privacy Report Error")
+        # Display final error in TLDR field
+        tldr_markdown_content = f"**Error:** {privacy_result.get('ui_message', 'Failed during detailed report generation.')}"
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(value="", visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
+            gr.update(visible=False),
         )
+        return
+    privacy_report = privacy_result["report"]
+
+    # Update UI with successful detailed report
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # Still show progress
+        gr.update(),
+        gr.update(
+            value="Detailed privacy report generated. Proceeding...", visible=True
+        ),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=True),
+    )

+    # --- Step 5: Fetch Model Descriptions (Placeholder/Optional) ---
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Extracting model info..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    logging.info(progress_desc + " (Placeholder)")
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Extracting model info...", visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    # model_ids = extract_hf_model_ids(code_files)  # utils function not imported
+    # model_descriptions = get_model_descriptions(model_ids)  # utils function not imported
+    # Add model_descriptions to context if needed for summary prompt later

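The commented-out helpers above are not imported, so this step is currently a no-op. If it were wired up, a plausible (hypothetical) extraction could scan the fetched files for "owner/name" ids passed to from_pretrained calls:

import re

def extract_hf_model_ids_sketch(code_files: dict[str, str]) -> set[str]:
    # Collect "owner/name" repo ids passed to from_pretrained(...) calls.
    pattern = re.compile(r"from_pretrained\(\s*[\"']([\w.-]+/[\w.-]+)[\"']")
    model_ids = set()
    for content in code_files.values():
        model_ids.update(pattern.findall(content))
    return model_ids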
+    # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
+    current_step += 1
+    progress_desc = (
+        f"Step {current_step}/{total_steps}: Generating summary (AI Call 2)..."
+    )
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    yield (
+        gr.update(value=tldr_status_message, visible=True),  # TLDR shows progress
+        gr.update(),
+        gr.update(value="Generating summary & highlights (AI Call 2)...", visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    summary_result = generate_summary_report(
+        space_id, code_files, privacy_report, ERROR_503_USER_MESSAGE
+    )
+
+    if (
+        summary_result["status"] == "error_503_summary"
+        or summary_result["status"] == "error_summary"
+    ):
+        progress(total_steps / total_steps, desc="Summary Report Error")
+        # Display error in TLDR, show partial results below
+        tldr_markdown_content = f"**Error:** {summary_result.get('ui_message', 'Failed during summary generation.')}"
+        data_details_content = "*Data details may be incomplete.*"
+        yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value=data_details_content, visible=True),
+            gr.update(value=summary_result["ui_message"], visible=True),
+            gr.update(value=privacy_report, visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=True),
         )
+        return
+    elif summary_result["status"] != "success":
+        progress(total_steps / total_steps, desc="Summary Report Error")
+        # Display error in TLDR, show partial results below
+        tldr_markdown_content = f"**Error:** Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}"
+        data_details_content = "*Data details unavailable.*"
         yield (
+            gr.update(value=tldr_markdown_content, visible=True),
+            gr.update(value=data_details_content, visible=True),
             gr.update(
+                value=f"Unexpected error generating summary: {summary_result.get('ui_message', 'Unknown')}",
                 visible=True,
             ),
+            gr.update(value=privacy_report, visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
             gr.update(visible=True, open=True),
         )
+        return
+
+    summary_report = summary_result["report"]

+    # Update UI with successful summary report before TLDR generation
+    tldr_status_message = (
+        f"*{progress_desc} - Success. Generating TLDR...*"  # Update status
+    )
+    data_details_content = "*Generating data details...*"
     yield (
+        gr.update(value=tldr_status_message, visible=True),
+        gr.update(value=data_details_content, visible=True),
+        gr.update(value=summary_report, visible=True),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
         gr.update(visible=True, open=True),
     )

+    # --- Step 7: Generate TLDR --- (New Step)
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Generating TLDR summary..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"
+    yield (
+        gr.update(value=tldr_status_message, visible=True),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+        gr.update(),
+    )
+    tldr_data = None  # Reset tldr_data before attempt
+    try:
+        # Call the combined helper function from analysis_utils
+        tldr_data = generate_and_parse_tldr(privacy_report, summary_report)
+
+        if tldr_data:
+            logging.info(f"Successfully generated and parsed TLDR for {space_id}.")
+            tldr_markdown_content = render_tldr_markdown(tldr_data, space_id)
+            data_details_content = render_data_details_markdown(tldr_data)
+        else:
+            logging.warning(
+                f"Failed to generate or parse TLDR for {space_id}. Proceeding without it."
+            )
+            tldr_markdown_content = "*TLDR generation failed.*"
+            data_details_content = "*Data details generation failed.*"
+    except Exception as tldr_err:
+        # This catch block might be redundant now if generate_and_parse_tldr handles its errors
+        logging.error(
+            f"Unexpected error during TLDR generation step call for {space_id}: {tldr_err}"
+        )
+        tldr_markdown_content = "*Error during TLDR generation step.*"
+        data_details_content = "*Error generating data details.*"
+        tldr_data = None  # Ensure it's None on error

+    # Update UI including the generated (or failed) TLDR before upload
     yield (
+        gr.update(value=tldr_markdown_content, visible=True),
+        gr.update(value=data_details_content, visible=True),
         gr.update(),
         gr.update(),
+        gr.update(visible=True, open=False),
+        gr.update(),
         gr.update(),
     )
+
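Given the names imported at the top of app.py (format_tldr_prompt, parse_tldr_json_response), generate_and_parse_tldr is plausibly a prompt-then-parse pipeline over the two reports. A hypothetical composition, not the actual analysis_utils code:

def generate_and_parse_tldr_sketch(privacy_report, summary_report):
    # Assumed flow: build the TLDR prompt, query the endpoint, parse the JSON reply.
    messages = format_tldr_prompt(privacy_report, summary_report)
    response = query_qwen_endpoint(messages)  # llm_interface helper; assumed to return None on failure
    if response is None:
        return None
    return parse_tldr_json_response(response)  # dict on success, None if the JSON is bad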
+    # --- Step 8: Upload to Cache --- (Old Step 7)
+    current_step += 1
+    progress_desc = f"Step {current_step}/{total_steps}: Uploading to cache..."
+    progress(current_step / total_steps, desc=progress_desc)
+    tldr_status_message = f"*{progress_desc}*"  # Display final action in TLDR field
     yield (
+        gr.update(value=tldr_status_message, visible=True),
+        gr.update(),
+        gr.update(value="Uploading results to cache...", visible=True),
+        gr.update(),
         gr.update(),
         gr.update(),
         gr.update(),
     )
+    upload_needed = (
+        cache_result["status"] != "cache_hit"
+        and cache_result["status"] != "cache_error"
     )
+    if upload_needed:
+        # Call imported function, now passing tldr_data
+        upload_result = upload_results(
+            space_id,
+            summary_report,
+            privacy_report,
+            DATASET_ID,
+            HF_TOKEN,
+            tldr_json_data=tldr_data,
+        )
+        if upload_result["status"] == "error":
+            # Ensure logging uses f-string if adding step count here
+            logging.error(
+                f"Cache upload failed: {upload_result.get('message', 'Unknown error')}"
+            )
+            # Non-critical, don't stop the UI, just log
+        elif upload_result["status"] == "skipped":
+            logging.info(f"Cache upload skipped: {upload_result.get('reason', '')}")
+    else:
+        logging.info(
+            "Skipping cache upload as results were loaded from cache or cache check failed."
+        )

+    # Show the final reports (and the generated or failed TLDR) after the upload attempt
+    # Yield 7 updates
+    yield (
+        gr.update(value=tldr_markdown_content, visible=True),
+        gr.update(value=data_details_content, visible=True),
+        gr.update(value=summary_report, visible=True),
+        gr.update(value=privacy_report, visible=True),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+        gr.update(visible=True, open=False),
+    )
+
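One plausible shape for upload_results, mirroring the filenames utils.py defines (summary_highlights.md, privacy_report.md, tldr_summary.json); the actual helper lives in analysis_utils.py and may batch these into a single commit:

import json
from huggingface_hub import HfApi

def upload_results_sketch(space_id, summary, privacy, dataset_id, hf_token, tldr_json_data=None):
    api = HfApi(token=hf_token)
    files = {
        f"{space_id}/summary_highlights.md": summary,
        f"{space_id}/privacy_report.md": privacy,
    }
    if tldr_json_data is not None:
        files[f"{space_id}/tldr_summary.json"] = json.dumps(tldr_json_data, indent=2)
    for path_in_repo, content in files.items():
        api.upload_file(
            path_or_fileobj=content.encode("utf-8"),
            path_in_repo=path_in_repo,
            repo_id=dataset_id,
            repo_type="dataset",
        )
    return {"status": "success"}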
+
# --- Step 9: Final Update --- (Old Step 8)
|
| 493 |
+
current_step += 1
|
| 494 |
+
progress_desc = f"Step {current_step}/{total_steps}: Analysis Complete!"
|
| 495 |
+
progress(current_step / total_steps, desc=progress_desc)
|
| 496 |
+
logging.info(progress_desc + f" Analysis complete for {space_id}.")
|
| 497 |
+
# Yield final state again to ensure UI is correct after potential upload messages
|
| 498 |
+
# Display final generated TLDR and Data Details
|
| 499 |
+
yield (
|
| 500 |
+
gr.update(value=tldr_markdown_content, visible=True),
|
| 501 |
+
gr.update(value=data_details_content, visible=True),
|
| 502 |
+
gr.update(value=summary_report, visible=True),
|
| 503 |
+
gr.update(value=privacy_report, visible=True),
|
| 504 |
+
gr.update(visible=True, open=False),
|
| 505 |
+
gr.update(visible=True, open=False),
|
| 506 |
+
gr.update(visible=True, open=False),
|
| 507 |
)
|
| 508 |
+
|
| 509 |
+
|
+# --- Original Input Handling Wrapper (updated yields for initial errors) ---
+def get_space_report_wrapper(
+    selected_cached_space: str | None,
+    new_space_id: str | None,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """
+    Wrapper function to decide whether to fetch cache or run live analysis.
+    Handles the logic based on Dropdown and Textbox inputs.
+    Yields tuples of Gradio updates.
+    """
+    target_space_id = None
+    source = "new"  # Assume new input unless dropdown is chosen
+
+    # Prioritize new_space_id if provided
+    if new_space_id and new_space_id.strip():
+        target_space_id = new_space_id.strip()
+        if target_space_id == selected_cached_space:
+            source = "dropdown_match"  # User typed an ID that exists in the dropdown
+        else:
+            source = "new"
+    elif selected_cached_space:
+        target_space_id = selected_cached_space
+        source = "dropdown"
+
+    if not target_space_id:
+        # Yield 7 updates
         yield (
+            gr.update(value="*Please provide a Space ID.*", visible=True),
+            gr.update(value="", visible=False),
             gr.update(
+                value="Please select an existing report or enter a new Space ID.",
+                visible=True,
+            ),
+            gr.update(value="", visible=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=False),
         )
+        return


+    if "/" not in target_space_id:
+        # Yield 7 updates
         yield (
+            gr.update(value="*Invalid Space ID format.*", visible=True),
+            gr.update(value="", visible=False),
             gr.update(
+                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
                 visible=True,
             ),
+            gr.update(value="", visible=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=False),
         )
+        return

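The wrapper only sanity-checks the "owner/name" shape. For stricter validation one could lean on huggingface_hub's own repo-id validator; this is an optional alternative, not what the app does:

from huggingface_hub.utils import validate_repo_id

def looks_like_space_id(candidate: str) -> bool:
    try:
        validate_repo_id(candidate)  # raises HFValidationError on malformed ids
        return "/" in candidate      # Spaces additionally need an explicit owner prefix
    except Exception:
        return False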

+    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")

+    if source == "dropdown":
+        progress(0.1, desc="Fetching selected cached report...")
+        # Yield 7 updates (initial placeholder)
+        yield (
+            gr.update(value="*Loading TLDR...*", visible=True),
+            gr.update(value="*Loading data details...*", visible=True),
+            gr.update(value="Fetching selected cached report...", visible=True),
+            gr.update(value="", visible=True),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+            gr.update(visible=True, open=False),
+        )
+        cache_result = check_cache_and_download(target_space_id, DATASET_ID, HF_TOKEN)
+        if cache_result["status"] == "cache_hit":
+            logging.info(
+                f"Successfully displayed cached reports for selected '{target_space_id}'."
             )
+            progress(1.0, desc="Complete (from cache)")
+            # Use the cached report text directly here; appending the cache message is done within the helper now.
+            # Parse and render TLDR if available
+            tldr_json_str = cache_result.get("tldr_json_str")
+            rendered_tldr = "*TLDR not found in cache.*"
+            rendered_data_details = "*Data details not found in cache.*"  # default so the yield below never sees an unbound name
+            if tldr_json_str:
+                try:
+                    cached_tldr_data = json.loads(tldr_json_str)
+                    rendered_tldr = render_tldr_markdown(
+                        cached_tldr_data, target_space_id
+                    )
+                    rendered_data_details = render_data_details_markdown(
+                        cached_tldr_data
+                    )
+                except Exception as parse_err:
+                    logging.warning(
+                        f"Failed to parse cached TLDR JSON for {target_space_id}: {parse_err}"
+                    )
+                    rendered_tldr = "*Error parsing cached TLDR.*"
+                    rendered_data_details = (
+                        "*Could not load data details due to parsing error.*"
+                    )

+            yield (
+                gr.update(value=rendered_tldr, visible=True),
+                gr.update(value=rendered_data_details, visible=True),
+                gr.update(value=cache_result["summary"], visible=True),
+                gr.update(value=cache_result["privacy"], visible=True),
+                gr.update(visible=True, open=False),
+                gr.update(visible=True, open=False),
+                gr.update(visible=True, open=False),
+            )
+        else:  # Cache miss or error for a dropdown selection is an error state
+            error_msg = cache_result.get(
+                "ui_message",
+                f"Failed to find or download cached report for selected '{target_space_id}'.",
+            )
+            logging.error(error_msg)
+            progress(1.0, desc="Error")
+            yield (
+                gr.update(value="*TLDR load failed.*", visible=True),
+                gr.update(value="*Data details load failed.*", visible=True),
+                gr.update(value=error_msg, visible=True),
+                gr.update(value="", visible=False),
+                gr.update(visible=True, open=False),
+                gr.update(visible=True, open=False),
+                gr.update(visible=False),
+            )
+        return  # Stop after handling dropdown source


+    # --- Live Analysis or Check Cache for New Input ---
+    # If it came from the textbox OR was a dropdown match, run the full live analysis pipeline,
+    # which includes its own cache check at the beginning.
+    else:  # source == "new" or source == "dropdown_match"
+        # Yield intermediate updates from the generator by iterating through it
+        for update_tuple in _run_live_analysis(target_space_id, progress):
+            yield update_tuple


 # --- Load Initial Data Function (for demo.load) ---
 ...
     with gr.Row():
         with gr.Column(scale=1):  # Left column for inputs
             description_accordion = gr.Accordion(
+                "What Privacy Questions do 🤗 Spaces Raise? Click here for Demo Description 👇",
+                open=False,
+                visible=True,
             )
             with description_accordion:
                 gr.Markdown(DESCRIPTION)
 ...
             analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)

         with gr.Column(scale=1):  # Right column for outputs
+            # Define TLDR Markdown component first, always visible
+            gr.Markdown("### Privacy TLDR 🕵️\n", visible=True)
+            tldr_markdown = gr.Markdown(
+                "*Select or enter a Space ID to get started.*", visible=True
+            )
+
+            # Define Accordions next, closed by default, visible
+            data_types_accordion = gr.Accordion(
+                "Data Types at Play", open=False, visible=True
+            )
+            with data_types_accordion:
+                data_details_markdown = gr.Markdown("*Data details will appear here.*")
+
             summary_accordion = gr.Accordion(
+                "Summary & Privacy Highlights",
+                open=False,  # Changed to open=False
+                visible=True,
             )
             privacy_accordion = gr.Accordion(
+                "Detailed Privacy Analysis Report",
+                open=False,  # Changed to open=False
+                visible=True,
             )
             with summary_accordion:
                 summary_markdown = gr.Markdown(
         fn=get_space_report_wrapper,
         inputs=[cached_spaces_dropdown, space_id_input],
         outputs=[
+            tldr_markdown,
+            data_details_markdown,  # Added data details output
             summary_markdown,
             privacy_markdown,
+            data_types_accordion,  # Added data details accordion output
             summary_accordion,
             privacy_accordion,
         ],
llm_interface.py
CHANGED
@@ -79,6 +79,7 @@ def query_qwen_endpoint(
         return None  # Return None for other HTTP errors
     except Exception as e:
         logging.error(f"An unexpected error occurred querying Inference Endpoint: {e}")
+        print(f"An unexpected error occurred querying Inference Endpoint: {e}")
         return None

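The added print duplicates the logging.error line so the message also reaches the Space's stdout. An alternative that avoids the duplication is to route log records to stdout once at startup:

import logging
import sys

logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,  # send log records where Spaces capture container output
    format="%(asctime)s %(levelname)s %(message)s",
)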
utils.py
CHANGED
@@ -80,6 +80,7 @@ MAX_MODEL_DESC_LENGTH = 1500

 SUMMARY_FILENAME = "summary_highlights.md"
 PRIVACY_FILENAME = "privacy_report.md"
+TLDR_FILENAME = "tldr_summary.json"


 def _is_relevant_file(filename):

@@ -367,7 +368,13 @@ def check_report_exists(space_id: str, dataset_id: str, hf_token: str | None) ->
 def download_cached_reports(
     space_id: str, dataset_id: str, hf_token: str | None
 ) -> dict[str, str]:
-    """Downloads cached reports from the dataset repo."""
+    """Downloads cached reports (summary, privacy, tldr json) from the dataset repo.
+
+    Returns:
+        Dict containing report contents keyed by 'summary', 'privacy', 'tldr_json_str'.
+        Keys will be missing if a specific file is not found.
+        Raises error on critical download failures (repo not found, etc.).
+    """
     if not hf_token:
         raise ValueError("HF Token required to download cached reports.")

@@ -378,50 +385,95 @@
     # Define paths relative to dataset root for hf_hub_download
     summary_repo_path = f"{space_id}/{SUMMARY_FILENAME}"
     privacy_repo_path = f"{space_id}/{PRIVACY_FILENAME}"
+    tldr_repo_path = f"{space_id}/{TLDR_FILENAME}"  # Path for TLDR JSON
+
     try:
         # Download summary
-        ...
+        try:
+            summary_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=summary_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(summary_path_local, "r", encoding="utf-8") as f:
+                reports["summary"] = f.read()
+            logging.info(f"Successfully downloaded cached summary for {space_id}.")
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached summary file {summary_repo_path} not found for {space_id}."
+            )
+        except Exception as e_summary:
+            logging.error(
+                f"Error downloading cached summary for {space_id}: {e_summary}"
+            )
+            # Decide if this is critical - for now, we warn and continue

         # Download privacy report
-        ...
+        try:
+            privacy_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=privacy_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(privacy_path_local, "r", encoding="utf-8") as f:
+                reports["privacy"] = f.read()
+            logging.info(
+                f"Successfully downloaded cached privacy report for {space_id}."
+            )
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached privacy file {privacy_repo_path} not found for {space_id}."
+            )
+        except Exception as e_privacy:
+            logging.error(
+                f"Error downloading cached privacy report for {space_id}: {e_privacy}"
+            )
+            # Decide if this is critical - for now, we warn and continue
+
+        # Download TLDR JSON
+        try:
+            tldr_path_local = hf_hub_download(
+                repo_id=dataset_id,
+                filename=tldr_repo_path,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            with open(tldr_path_local, "r", encoding="utf-8") as f:
+                reports["tldr_json_str"] = f.read()  # Store raw string content
+            logging.info(f"Successfully downloaded cached TLDR JSON for {space_id}.")
+        except EntryNotFoundError:
+            logging.warning(
+                f"Cached TLDR file {tldr_repo_path} not found for {space_id}."
+            )
+            # Don't treat TLDR absence as an error, just won't be in the dict
+        except Exception as e_tldr:
+            logging.error(
+                f"Error downloading cached TLDR JSON for {space_id}: {e_tldr}"
+            )
+            # Don't treat TLDR download error as critical, just won't be included
+
+        # Check if at least one report was downloaded successfully
+        if not reports.get("summary") and not reports.get("privacy"):
+            raise FileNotFoundError(
+                f"Failed to download *any* primary cache files (summary/privacy) for {space_id}"
+            )

         return reports

-    except EntryNotFoundError as e:
-        # More specific error based on which file failed
-        missing_file = (
-            summary_repo_path if summary_repo_path in str(e) else privacy_repo_path
-        )
-        logging.error(
-            f"Cache download error: ...
-        )
-        raise FileNotFoundError(
-            ...
-        ) from e
-    except RepositoryNotFoundError as e:
-        logging.error(f"Cache download error: Dataset repo {dataset_id} not found. {e}")
-        raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e
-    except Exception as e:
-        logging.error(
-            f"Unexpected error downloading cached reports for {space_id} from {dataset_id}: ...
-        )
-        raise IOError(
-            ...
-        ) from e
+    except RepositoryNotFoundError as e_repo:
+        logging.error(
+            f"Cache download error: Dataset repo {dataset_id} not found. {e_repo}"
+        )
+        raise FileNotFoundError(f"Dataset repo {dataset_id} not found") from e_repo
+    except Exception as e_critical:  # Catch other potential critical errors
+        logging.error(
+            f"Unexpected critical error downloading cached reports for {space_id} from {dataset_id}: {e_critical}"
+        )
+        raise IOError(
+            f"Failed critically during cached report download for {space_id}"
+        ) from e_critical
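A usage sketch for the reworked helper (the ids are the defaults from app.py, and the token is a placeholder); since absence of the TLDR file is tolerated, callers should treat tldr_json_str as optional:

import json
import os

reports = download_cached_reports(
    space_id="HuggingFaceTB/SmolVLM2",
    dataset_id="yjernite/spaces-privacy-reports",
    hf_token=os.environ["HF_TOKEN"],
)
summary_md = reports.get("summary", "*No cached summary.*")
tldr = json.loads(reports["tldr_json_str"]) if "tldr_json_str" in reports else None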


 def upload_reports_to_dataset(