Spaces:
Running
Running
File size: 17,492 Bytes
af5e0d4 2b6be77 af5e0d4 2b6be77 af5e0d4 2b6be77 af5e0d4 2b6be77 af5e0d4 2b6be77 af5e0d4 2b6be77 af5e0d4 2b6be77 af5e0d4 2b6be77 af5e0d4 2b6be77 af5e0d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 |
\
import os
import random
import uuid
import pandas as pd
from datetime import datetime
from huggingface_hub import HfApi, hf_hub_download, login
from PIL import Image
import shutil
import config
# --- Hugging Face Hub Functions ---
def login_hugging_face():
    """Authenticate with the Hugging Face Hub when a token is available.

    The token is taken from ``config.HF_TOKEN`` first and, if that is falsy,
    from the ``HF_HUB_TOKEN`` environment variable. When neither yields a
    token, a notice is printed and no login is attempted.
    """
    hub_token = config.HF_TOKEN or os.getenv("HF_HUB_TOKEN")
    if not hub_token:
        print("HF_TOKEN not set in config and HF_HUB_TOKEN not in environment. Proceeding without login. Uploads to private repos will fail.")
        return
    login(token=hub_token)
    print("Successfully logged into Hugging Face Hub.")
def _reset_preferences_csv(local_file_path):
    """Overwrite `local_file_path` with a header-only CSV and return the matching empty DataFrame."""
    parent_dir = os.path.dirname(local_file_path)
    if parent_dir:
        # DataFrame.to_csv does not create missing directories.
        os.makedirs(parent_dir, exist_ok=True)
    empty_df = pd.DataFrame(columns=config.CSV_HEADERS)
    empty_df.to_csv(local_file_path, index=False)
    return empty_df


def _read_preferences_csv_or_reset(local_file_path, described_as):
    """Read the CSV at `local_file_path`; reset it to header-only if it is empty, unreadable or mis-headed.

    `described_as` is the human-readable name of the file used in log messages,
    e.g. ``"Downloaded file prefs.csv"``.
    """
    try:
        df = pd.read_csv(local_file_path)
    except pd.errors.EmptyDataError:
        print(f"{described_as} is empty. Initializing DataFrame with headers.")
        return _reset_preferences_csv(local_file_path)
    except Exception as e:
        # Deliberate best-effort: any unreadable file is replaced by a clean one.
        lowered = described_as[0].lower() + described_as[1:]
        print(f"Error reading {lowered}: {e}. Returning empty DataFrame with headers.")
        return _reset_preferences_csv(local_file_path)

    if list(df.columns) != config.CSV_HEADERS:
        print(f"Warning: {described_as} has incorrect headers. Re-initializing as empty with correct headers.")
        return _reset_preferences_csv(local_file_path)
    if df.empty:
        # Headers on disk are already correct here; hand back a fresh frame
        # carrying exactly the expected columns.
        return pd.DataFrame(columns=config.CSV_HEADERS)
    return df


def load_preferences_from_hf_hub(repo_id, filename):
    """Download the preferences CSV from a Hub dataset repo and load it.

    `filename` is used both as the path inside the repo and as the local
    target path, which is overwritten by a successful download.

    Args:
        repo_id: Hub dataset repository to download from.
        filename: Path of the CSV in the repo and on local disk.

    Returns:
        A pandas DataFrame; this function never returns None.
        * Download succeeded: the downloaded content, or an empty DataFrame
          with ``config.CSV_HEADERS`` columns if the file is empty,
          unreadable or has unexpected headers (the local file is then
          rewritten as header-only).
        * The error text indicates the file is missing on the Hub ("404" /
          "does not exist"): the local file is rewritten as header-only and
          an empty DataFrame is returned.
        * Any other download error: the existing local file is loaded with
          the same validation as above; if there is no local file, a
          header-only one is created.
    """
    local_file_path = filename  # The target local file path
    target_dir = os.path.dirname(local_file_path)
    download_successful = False
    hub_file_exists = True
    try:
        print(f"Attempting to download {filename} from {repo_id} to {local_file_path}")
        # NOTE(review): `local_dir_use_symlinks` is deprecated in recent
        # huggingface_hub releases - confirm against the pinned version.
        downloaded_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,  # This is path_in_repo
            repo_type="dataset",
            local_dir=target_dir or ".",
            local_dir_use_symlinks=False,
        )
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        # The download may already sit exactly at the target path; only move
        # it when it landed somewhere else (e.g. a nested directory).
        already_in_place = os.path.exists(local_file_path) and os.path.samefile(
            downloaded_path, local_file_path
        )
        if not already_in_place:
            shutil.move(downloaded_path, local_file_path)
        print(f"Successfully downloaded and moved {filename} from {repo_id} to {local_file_path}")
        download_successful = True
    except Exception as e:
        # hf_hub_download failures are classified by message text only.
        if "404" in str(e) or "does not exist" in str(e).lower():
            print(f"File {filename} not found on Hugging Face Hub repository {repo_id}.")
            hub_file_exists = False
            if os.path.exists(local_file_path):
                # The file is not removed here; it is rewritten as a
                # header-only CSV further down.
                print(f"Hub file {filename} not found. Deleting existing local file {local_file_path} to ensure clean state.")
        else:
            print(f"Could not download {filename} from {repo_id}. Error: {e}")

    if download_successful:
        return _read_preferences_csv_or_reset(
            local_file_path, f"Downloaded file {local_file_path}"
        )

    if not hub_file_exists:
        print(f"Hub file {filename} does not exist. Ensuring local file {local_file_path} is empty with correct headers.")
        return _reset_preferences_csv(local_file_path)

    # Other download error: the Hub file may exist, so fall back to local data.
    print(f"Download of {filename} failed. Attempting to load from local file {local_file_path}.")
    if os.path.exists(local_file_path):
        return _read_preferences_csv_or_reset(
            local_file_path, f"Local file {local_file_path} (fallback)"
        )

    print(f"Download of {filename} failed, and local file {local_file_path} not found. Initializing empty DataFrame with headers.")
    return _reset_preferences_csv(local_file_path)
def save_preferences_to_hf_hub(df, repo_id, filename, commit_message="Update preferences"):
    """Upload the local preferences CSV at `filename` to a Hub dataset repo.

    The file on disk is the source of truth; `df` is never written out and is
    only inspected to decide whether there is anything to upload. If `df` is
    None or empty, the upload still proceeds when the local file exists and
    is non-empty. Upload errors are printed, not raised.
    """
    nothing_in_memory = df is None or df.empty
    if nothing_in_memory:
        print("Preferences DataFrame (passed for checking) is empty. Nothing to upload based on this check.")
        # Fall back to what is actually on disk before giving up.
        if not os.path.exists(filename) or os.path.getsize(filename) <= 0:
            print(f"Local file {filename} is also non-existent or empty. Nothing to upload.")
            return
        print(f"Passed DataFrame was empty, but local file {filename} exists and has content. Proceeding with upload of the file.")

    try:
        print(f"Attempting to upload existing file: {filename} to {repo_id}")
        HfApi().upload_file(
            path_or_fileobj=filename,
            path_in_repo=filename,
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
        )
        print(f"Successfully uploaded {filename} to {repo_id}")
    except Exception as e:
        print(f"Error saving or uploading {filename} to Hugging Face Hub: {e}")
        print("Changes are saved locally. Will attempt upload on next scheduled push.")
# --- Data Loading and Sampling ---
def scan_data_directory(data_folder):
    """Scan the data directory for domains and their valid samples.

    Every sub-directory of `data_folder` is a domain. A sample is a
    sub-directory of a domain that contains both ``config.PROMPT_FILE_NAME``
    and ``config.BACKGROUND_IMAGE_NAME``.

    Args:
        data_folder: Root directory holding one sub-directory per domain.

    Returns:
        ``{"domain_name": ["sample_id1", "sample_id2", ...]}``. Domains and
        sample IDs are in sorted order. Domains without valid samples map to
        an empty list. An empty dict is returned if `data_folder` is not a
        directory.
    """
    all_samples_by_domain = {}
    if not os.path.isdir(data_folder):
        print(f"Error: Data folder '{data_folder}' not found.")
        return all_samples_by_domain

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for domain_name in sorted(os.listdir(data_folder)):
        domain_path = os.path.join(data_folder, domain_name)
        if not os.path.isdir(domain_path):
            continue

        valid_samples = []
        for sample_id in sorted(os.listdir(domain_path)):
            sample_path = os.path.join(domain_path, sample_id)
            if not os.path.isdir(sample_path):
                continue
            prompt_file = os.path.join(sample_path, config.PROMPT_FILE_NAME)
            bg_image = os.path.join(sample_path, config.BACKGROUND_IMAGE_NAME)
            if os.path.exists(prompt_file) and os.path.exists(bg_image):
                valid_samples.append(sample_id)

        all_samples_by_domain[domain_name] = valid_samples
        if not valid_samples:
            print(f"Warning: No valid samples found in domain '{domain_name}'.")

    # A dict of empty lists is truthy, so check the lists themselves to cover
    # the "no valid samples in any domain" half of the message.
    if not any(all_samples_by_domain.values()):
        print(f"Warning: No domains found or no valid samples in any domain in '{data_folder}'.")
    return all_samples_by_domain
def prepare_session_samples(all_samples_by_domain, samples_per_domain):
    """Build the shuffled list of (domain, sample_id) tuples for one session.

    Args:
        all_samples_by_domain: Mapping of domain name to a list of sample IDs.
        samples_per_domain: Maximum number of samples to draw from each
            domain. Domains with fewer samples contribute all of them; a
            zero or negative value yields an empty queue.

    Returns:
        A list of ``(domain, sample_id)`` tuples in random order.
    """
    session_queue = []
    for domain, samples in all_samples_by_domain.items():
        if not samples:  # skip domains without samples
            continue
        # random.sample raises ValueError for a negative count or one larger
        # than the population, so clamp to [0, len(samples)].
        take = max(0, min(len(samples), samples_per_domain))
        if take:
            session_queue.extend(
                (domain, sample_id) for sample_id in random.sample(samples, take)
            )
    random.shuffle(session_queue)
    return session_queue
# --- Session and Data Handling ---
def generate_session_id():
    """Return a new session ID.

    The ID is the first ``config.SESSION_ID_LENGTH`` characters of a random
    UUID4 in hexadecimal form.
    """
    full_hex = uuid.uuid4().hex
    id_length = config.SESSION_ID_LENGTH
    return full_hex[:id_length]
def load_sample_data(domain, sample_id):
    """Collect the prompt text and image paths for one sample.

    The sample lives in ``config.DATA_FOLDER/<domain>/<sample_id>``.

    Returns:
        A dict with keys ``"prompt"``, ``"background_img_path"``,
        ``"foreground_img_path"`` and ``"output_image_paths"`` (a
        ``{model_key: path}`` dict holding only the outputs found on disk).
        None is returned when the prompt, background or foreground file is
        missing, the prompt cannot be read, or no model output exists.
    """
    sample_dir = os.path.join(config.DATA_FOLDER, domain, sample_id)
    prompt_path = os.path.join(sample_dir, config.PROMPT_FILE_NAME)
    bg_image_path = os.path.join(sample_dir, config.BACKGROUND_IMAGE_NAME)
    fg_image_path = os.path.join(sample_dir, config.FOREGROUND_IMAGE_NAME)

    # All three core files are required.
    for required_path in (prompt_path, bg_image_path, fg_image_path):
        if not os.path.exists(required_path):
            print(f"Error: Missing core files for sample {domain}/{sample_id}")
            return None

    try:
        with open(prompt_path, 'r', encoding='utf-8') as prompt_file:
            prompt_text = prompt_file.read().strip()
    except Exception as e:
        print(f"Error reading prompt for {domain}/{sample_id}: {e}")
        return None

    # Missing outputs are tolerated: the sample simply offers fewer options.
    found_outputs = {}  # {model_key: path_to_image}
    for model_key, img_name in config.MODEL_OUTPUT_IMAGE_NAMES.items():
        candidate = os.path.join(sample_dir, img_name)
        if not os.path.exists(candidate):
            print(f"Warning: Missing output image {img_name} for model {model_key} in sample {domain}/{sample_id}")
            continue
        found_outputs[model_key] = candidate

    expected_count = len(config.MODEL_OUTPUT_IMAGE_NAMES)
    if len(found_outputs) < expected_count:
        print(f"Warning: Sample {domain}/{sample_id} is missing one or more model outputs. It will have fewer than 4 options.")
        if not found_outputs:  # No outputs at all
            return None

    return {
        "prompt": prompt_text,
        "background_img_path": bg_image_path,
        "foreground_img_path": fg_image_path,
        "output_image_paths": found_outputs,
    }
def record_preference(df, session_id, domain, sample_id, prompt, bg_path, fg_path, displayed_models_info, preferred_model_key):
    """Return `df` extended by one preference record.

    Args:
        df: Existing preferences DataFrame, or None to start a new one.
        session_id, domain, sample_id, prompt: Stored as given.
        bg_path, fg_path: Input image paths; only the base filename is stored.
        displayed_models_info: List of (model_key, image_path) in display
            order. The first four model keys fill the
            ``displayed_order_model_1..4`` columns; unused slots are None.
        preferred_model_key: Key of the model the user chose (e.g. "model_a").

    Returns:
        A DataFrame with the new row appended. The row is built with
        ``config.CSV_HEADERS`` as its columns.
    """
    recorded_at = datetime.now().isoformat()

    # Model keys in display order, padded with None up to the four slots.
    shown_keys = [entry[0] for entry in displayed_models_info[:4]]
    shown_keys += [None] * (4 - len(shown_keys))

    record = {
        "session_id": session_id,
        "timestamp": recorded_at,
        "domain": domain,
        "sample_id": sample_id,
        "prompt": prompt,
        "input_background": os.path.basename(bg_path),
        "input_foreground": os.path.basename(fg_path),
        "preferred_model_key": preferred_model_key,
        "preferred_model_filename": config.MODEL_OUTPUT_IMAGE_NAMES.get(preferred_model_key, "N/A"),
    }
    for slot, model_key in enumerate(shown_keys, start=1):
        record[f"displayed_order_model_{slot}"] = model_key

    row_frame = pd.DataFrame([record], columns=config.CSV_HEADERS)
    if df is None:
        return row_frame
    return pd.concat([df, row_frame], ignore_index=True)
|