Spaces:

matsant01
/

dit-editor-user-study

Running

App Files Files Community

matsant01 committed on 24 days ago

Commit

c61e1ad

1 Parent(s): fc4cd8f

Pushing collected preferences to hf dataset

Browse files

Files changed (2) hide show

app.py +160 -67
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -3,17 +3,67 @@ import os
 import random
 import csv
 from pathlib import Path
-from datetime import datetime
 DATA_DIR = Path("data")
-RESULTS_DIR = Path("results")
-RESULTS_FILE = RESULTS_DIR / "preferences.csv"
 IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".webp"]
 # --- Data Loading ---
 def find_image(folder_path: Path, base_name: str) -> Path | None:
-    """Finds an image file starting with base_name in a folder."""
     for ext in IMAGE_EXTENSIONS:
         file_path = folder_path / f"{base_name}{ext}"
         if file_path.exists():
@@ -21,12 +71,10 @@ def find_image(folder_path: Path, base_name: str) -> Path | None:
     return None
 def get_sample_ids() -> list[str]:
-    """Scans the data directory for valid sample IDs."""
     sample_ids = []
     if DATA_DIR.is_dir():
         for item in DATA_DIR.iterdir():
             if item.is_dir():
-                # Check if required files exist
                 prompt_file = item / "prompt.txt"
                 input_bg = find_image(item, "input_bg")
                 input_fg = find_image(item, "input_fg")
@@ -37,7 +85,6 @@ def get_sample_ids() -> list[str]:
     return sample_ids
 def load_sample_data(sample_id: str) -> dict | None:
-    """Loads data for a specific sample ID."""
     sample_path = DATA_DIR / sample_id
     if not sample_path.is_dir():
         return None
@@ -72,7 +119,6 @@ def load_sample_data(sample_id: str) -> dict | None:
 INITIAL_SAMPLE_IDS = get_sample_ids()
 def get_next_sample(available_ids: list[str]) -> tuple[dict | None, list[str]]:
-    """Selects a random sample ID from the available list."""
     if not available_ids:
         return None, []
     chosen_id = random.choice(available_ids)
@@ -80,9 +126,7 @@ def get_next_sample(available_ids: list[str]) -> tuple[dict | None, list[str]]:
     sample_data = load_sample_data(chosen_id)
     return sample_data, remaining_ids
 def display_new_sample(state: dict, available_ids: list[str]):
-    """Loads and prepares a new sample for display."""
     sample_data, remaining_ids = get_next_sample(available_ids)
     if not sample_data:
@@ -129,16 +173,15 @@ def display_new_sample(state: dict, available_ids: list[str]):
     }
 def record_preference(choice: str, state: dict, request: gr.Request):
-    """Records the user's preference and prepares for the next sample."""
-    if not request: # Add a check if request is None
         print("Error: Request object is None. Cannot get session ID.")
-        session_id = "unknown_session" # Fallback session ID
     else:
         try:
-            session_id = request.client.host # Use IP address as a basic session identifier
         except AttributeError:
-             print("Error: request.client is None or has no 'host' attribute.")
-             session_id = "unknown_client" # Fallback if client object is weird
     if not state or "current_sample_id" not in state:
         print("Warning: State missing, cannot record preference.")
@@ -147,67 +190,97 @@ def record_preference(choice: str, state: dict, request: gr.Request):
             choice_button_b: gr.update(interactive=False),
             next_button: gr.update(visible=True, interactive=True),
             status_display: gr.update(value="Error: Session state lost. Click Next Sample."),
-            app_state: state # Return unchanged state
         }
     chosen_model_name = state["output_a_model_name"] if choice == "A" else state["output_b_model_name"]
-    # Ensure results directory exists
-    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
-    # Append result to CSV
-    file_exists = RESULTS_FILE.exists()
     try:
-        with open(RESULTS_FILE, 'a', newline='', encoding='utf-8') as f:
-            writer = csv.writer(f)
-            if not file_exists:
-                writer.writerow([
-                    "timestamp", "session_id", "sample_id",
-                    "baseline_displayed_as", "tficon_displayed_as",
-                    "chosen_display", "chosen_model_name"
-                ]) # Header
-            baseline_display = "A" if state["output_a_model_name"] == "baseline" else "B"
-            tficon_display = "B" if state["output_a_model_name"] == "baseline" else "A"
-            writer.writerow([
-                datetime.now().isoformat(),
-                session_id,
-                state["current_sample_id"],
-                baseline_display,
-                tficon_display,
-                choice, # A or B
-                chosen_model_name # baseline or tf-icon
-            ])
     except Exception as e:
-        print(f"Error writing results: {e}")
         return {
             choice_button_a: gr.update(interactive=False),
             choice_button_b: gr.update(interactive=False),
-            next_button: gr.update(visible=True, interactive=True), # Allow user to continue
-            status_display: gr.update(value=f"Error saving preference: {e}. Click Next Sample."),
             app_state: state
         }
-    # Update UI: disable choice buttons, show next button
     return {
         choice_button_a: gr.update(interactive=False),
         choice_button_b: gr.update(interactive=False),
         next_button: gr.update(visible=True, interactive=True),
         status_display: gr.update(value=f"Preference recorded (Chose {choice}). Click Next Sample."),
-        app_state: state # Return unchanged state
     }
-# --- New Handler Functions ---
 def handle_choice_a(state: dict, request: gr.Request):
     return record_preference("A", state, request)
 def handle_choice_b(state: dict, request: gr.Request):
     return record_preference("B", state, request)
-# --- Gradio Interface ---
 with gr.Blocks(title="Image Composition User Study") as demo:
     gr.Markdown("# Image Composition User Study")
     gr.Markdown(
@@ -215,12 +288,9 @@ with gr.Blocks(title="Image Composition User Study") as demo:
         "Then, compare the two output images (Output A and Output B) and click the button below the one you prefer."
     )
-    # State variables
-    app_state = gr.State({}) # Stores current sample info (id, output mapping)
-    # Keep track of samples available *for this session*
     available_samples_state = gr.State(INITIAL_SAMPLE_IDS)
-    # Displays
     prompt_display = gr.Textbox(label="Prompt", interactive=False)
     status_display = gr.Textbox(label="Status", value="Loading first sample...", interactive=False)
@@ -241,9 +311,6 @@ with gr.Blocks(title="Image Composition User Study") as demo:
     next_button = gr.Button("Next Sample", visible=False)
-    # --- Event Handlers ---
-    # Load first sample on page load
     demo.load(
         fn=display_new_sample,
         inputs=[app_state, available_samples_state],
@@ -255,23 +322,20 @@ with gr.Blocks(title="Image Composition User Study") as demo:
         ]
     )
-    # Handle choice A click - Use the new handler function
     choice_button_a.click(
-        fn=handle_choice_a, # Use the dedicated handler
-        inputs=[app_state], # Input is still just the state component
         outputs=[choice_button_a, choice_button_b, next_button, status_display, app_state],
         api_name=False,
     )
-    # Handle choice B click - Use the new handler function
     choice_button_b.click(
-        fn=handle_choice_b, # Use the dedicated handler
-        inputs=[app_state], # Input is still just the state component
         outputs=[choice_button_a, choice_button_b, next_button, status_display, app_state],
         api_name=False,
     )
-    # Handle next sample click
     next_button.click(
         fn=display_new_sample,
         inputs=[app_state, available_samples_state],
@@ -282,16 +346,45 @@ with gr.Blocks(title="Image Composition User Study") as demo:
             app_state, available_samples_state
         ],
         api_name=False,
-        # queue=True
     )
 if __name__ == "__main__":
     if not INITIAL_SAMPLE_IDS:
         print("Error: No valid samples found in the 'data' directory.")
         print("Please ensure the 'data' directory exists and contains subdirectories")
         print("named like 'sample_id', each with 'prompt.txt', 'input_bg.*',")
         print("'input_fg.*', 'baseline.*', and 'tf-icon.*' files.")
     else:
         print(f"Found {len(INITIAL_SAMPLE_IDS)} samples.")
         print("Starting Gradio app...")
         demo.launch(server_name="0.0.0.0")

 import random
 import csv
 from pathlib import Path
+from datetime import datetime, timedelta
+import tempfile
+from huggingface_hub import HfApi, hf_hub_download, login
+from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError
+from apscheduler.schedulers.background import BackgroundScheduler
+import atexit
+import threading
+import time
+import shutil
+# --- Configuration ---
+DATASET_REPO_ID = os.getenv("DATASET_REPO_ID", "matsant01/user-study-collected-preferences")
+HF_TOKEN = os.getenv("HF_TOKEN")
+RESULTS_FILENAME_IN_REPO = "preferences.csv"
+TEMP_DIR = tempfile.mkdtemp()
+LOCAL_RESULTS_FILE = Path(TEMP_DIR) / RESULTS_FILENAME_IN_REPO
+UPLOAD_INTERVAL_HOURS = 0.1
 DATA_DIR = Path("data")
 IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".webp"]
+# --- Global State for Upload Logic ---
+hf_api = None
+scheduler = BackgroundScheduler(daemon=True)
+upload_lock = threading.Lock()
+new_preferences_recorded_since_last_upload = threading.Event()
+# --- Hugging Face Hub Login & Initialization ---
+def initialize_hub_and_results():
+    global hf_api
+    if HF_TOKEN:
+        print("Logging into Hugging Face Hub...")
+        try:
+            login(token=HF_TOKEN)
+            hf_api = HfApi()
+            print(f"Attempting initial download of {RESULTS_FILENAME_IN_REPO} from {DATASET_REPO_ID}")
+            hf_hub_download(
+                repo_id=DATASET_REPO_ID,
+                filename=RESULTS_FILENAME_IN_REPO,
+                repo_type="dataset",
+                token=HF_TOKEN,
+                local_dir=TEMP_DIR,
+                local_dir_use_symlinks=False
+            )
+            print(f"Successfully downloaded existing {RESULTS_FILENAME_IN_REPO} to {LOCAL_RESULTS_FILE}")
+        except EntryNotFoundError:
+            print(f"{RESULTS_FILENAME_IN_REPO} not found in repo. Will create locally.")
+        except RepositoryNotFoundError:
+            print(f"Error: Dataset repository {DATASET_REPO_ID} not found or token lacks permissions.")
+            print("Results saving will be disabled.")
+            hf_api = None
+        except Exception as e:
+            print(f"Error during initial download/login: {e}")
+            print("Proceeding without initial download. File will be created locally.")
+    else:
+        print("Warning: HF_TOKEN secret not found. Results will not be saved to the Hub.")
+        hf_api = None
 # --- Data Loading ---
 def find_image(folder_path: Path, base_name: str) -> Path | None:
     for ext in IMAGE_EXTENSIONS:
         file_path = folder_path / f"{base_name}{ext}"
         if file_path.exists():
     return None
 def get_sample_ids() -> list[str]:
     sample_ids = []
     if DATA_DIR.is_dir():
         for item in DATA_DIR.iterdir():
             if item.is_dir():
                 prompt_file = item / "prompt.txt"
                 input_bg = find_image(item, "input_bg")
                 input_fg = find_image(item, "input_fg")
     return sample_ids
 def load_sample_data(sample_id: str) -> dict | None:
     sample_path = DATA_DIR / sample_id
     if not sample_path.is_dir():
         return None
 INITIAL_SAMPLE_IDS = get_sample_ids()
 def get_next_sample(available_ids: list[str]) -> tuple[dict | None, list[str]]:
     if not available_ids:
         return None, []
     chosen_id = random.choice(available_ids)
     sample_data = load_sample_data(chosen_id)
     return sample_data, remaining_ids
 def display_new_sample(state: dict, available_ids: list[str]):
     sample_data, remaining_ids = get_next_sample(available_ids)
     if not sample_data:
     }
 def record_preference(choice: str, state: dict, request: gr.Request):
+    if not request:
         print("Error: Request object is None. Cannot get session ID.")
+        session_id = "unknown_session"
     else:
         try:
+            session_id = request.client.host
         except AttributeError:
+            print("Error: request.client is None or has no 'host' attribute.")
+            session_id = "unknown_client"
     if not state or "current_sample_id" not in state:
         print("Warning: State missing, cannot record preference.")
             choice_button_b: gr.update(interactive=False),
             next_button: gr.update(visible=True, interactive=True),
             status_display: gr.update(value="Error: Session state lost. Click Next Sample."),
+            app_state: state
         }
     chosen_model_name = state["output_a_model_name"] if choice == "A" else state["output_b_model_name"]
+    baseline_display = "A" if state["output_a_model_name"] == "baseline" else "B"
+    tficon_display = "B" if state["output_a_model_name"] == "baseline" else "A"
+    new_row = {
+        "timestamp": datetime.now().isoformat(),
+        "session_id": session_id,
+        "sample_id": state["current_sample_id"],
+        "baseline_displayed_as": baseline_display,
+        "tficon_displayed_as": tficon_display,
+        "chosen_display": choice,
+        "chosen_model_name": chosen_model_name
+    }
+    header = list(new_row.keys())
     try:
+        with upload_lock:
+            file_exists = LOCAL_RESULTS_FILE.exists()
+            mode = 'a' if file_exists else 'w'
+            with open(LOCAL_RESULTS_FILE, mode, newline='', encoding='utf-8') as f:
+                writer = csv.DictWriter(f, fieldnames=header)
+                if not file_exists or os.path.getsize(LOCAL_RESULTS_FILE) == 0:
+                    writer.writeheader()
+                    print(f"Created or wrote header to {LOCAL_RESULTS_FILE}")
+                writer.writerow(new_row)
+            print(f"Appended preference for {state['current_sample_id']} to local file.")
+            new_preferences_recorded_since_last_upload.set()
     except Exception as e:
+        print(f"Error writing local results file {LOCAL_RESULTS_FILE}: {e}")
         return {
             choice_button_a: gr.update(interactive=False),
             choice_button_b: gr.update(interactive=False),
+            next_button: gr.update(visible=True, interactive=True),
+            status_display: gr.update(value=f"Error saving preference locally: {e}. Click Next."),
             app_state: state
         }
     return {
         choice_button_a: gr.update(interactive=False),
         choice_button_b: gr.update(interactive=False),
         next_button: gr.update(visible=True, interactive=True),
         status_display: gr.update(value=f"Preference recorded (Chose {choice}). Click Next Sample."),
+        app_state: state
     }
+def upload_preferences_to_hub():
+    print("Periodic upload check triggered.")
+    if not hf_api:
+        print("Upload check skipped: Hugging Face API not available.")
+        return
+    if not new_preferences_recorded_since_last_upload.is_set():
+        print("Upload check skipped: No new preferences recorded since last upload.")
+        return
+    with upload_lock:
+        if not new_preferences_recorded_since_last_upload.is_set():
+            print("Upload check skipped (race condition avoided): No new preferences.")
+            return
+        if not LOCAL_RESULTS_FILE.exists() or os.path.getsize(LOCAL_RESULTS_FILE) == 0:
+            print("Upload check skipped: Local results file is missing or empty.")
+            new_preferences_recorded_since_last_upload.clear()
+            return
+        try:
+            print(f"Attempting to upload {LOCAL_RESULTS_FILE} to {DATASET_REPO_ID}/{RESULTS_FILENAME_IN_REPO}")
+            start_time = time.time()
+            hf_api.upload_file(
+                path_or_fileobj=str(LOCAL_RESULTS_FILE),
+                path_in_repo=RESULTS_FILENAME_IN_REPO,
+                repo_id=DATASET_REPO_ID,
+                repo_type="dataset",
+                commit_message=f"Periodic upload of preferences - {datetime.now().isoformat()}"
+            )
+            end_time = time.time()
+            print(f"Successfully uploaded preferences. Took {end_time - start_time:.2f} seconds.")
+            new_preferences_recorded_since_last_upload.clear()
+        except Exception as e:
+            print(f"Error uploading results file: {e}")
 def handle_choice_a(state: dict, request: gr.Request):
     return record_preference("A", state, request)
 def handle_choice_b(state: dict, request: gr.Request):
     return record_preference("B", state, request)
 with gr.Blocks(title="Image Composition User Study") as demo:
     gr.Markdown("# Image Composition User Study")
     gr.Markdown(
         "Then, compare the two output images (Output A and Output B) and click the button below the one you prefer."
     )
+    app_state = gr.State({})
     available_samples_state = gr.State(INITIAL_SAMPLE_IDS)
     prompt_display = gr.Textbox(label="Prompt", interactive=False)
     status_display = gr.Textbox(label="Status", value="Loading first sample...", interactive=False)
     next_button = gr.Button("Next Sample", visible=False)
     demo.load(
         fn=display_new_sample,
         inputs=[app_state, available_samples_state],
         ]
     )
     choice_button_a.click(
+        fn=handle_choice_a,
+        inputs=[app_state],
         outputs=[choice_button_a, choice_button_b, next_button, status_display, app_state],
         api_name=False,
     )
     choice_button_b.click(
+        fn=handle_choice_b,
+        inputs=[app_state],
         outputs=[choice_button_a, choice_button_b, next_button, status_display, app_state],
         api_name=False,
     )
     next_button.click(
         fn=display_new_sample,
         inputs=[app_state, available_samples_state],
             app_state, available_samples_state
         ],
         api_name=False,
     )
+def cleanup_temp_dir():
+    if Path(TEMP_DIR).exists():
+        print(f"Cleaning up temporary directory: {TEMP_DIR}")
+        shutil.rmtree(TEMP_DIR, ignore_errors=True)
+def shutdown_hook():
+    print("Application shutting down. Performing final upload check...")
+    upload_preferences_to_hub()
+    if scheduler.running:
+        print("Shutting down scheduler...")
+        scheduler.shutdown(wait=False)
+    cleanup_temp_dir()
+    print("Shutdown complete.")
+atexit.register(shutdown_hook)
 if __name__ == "__main__":
+    initialize_hub_and_results()
     if not INITIAL_SAMPLE_IDS:
         print("Error: No valid samples found in the 'data' directory.")
         print("Please ensure the 'data' directory exists and contains subdirectories")
         print("named like 'sample_id', each with 'prompt.txt', 'input_bg.*',")
         print("'input_fg.*', 'baseline.*', and 'tf-icon.*' files.")
+    elif not DATASET_REPO_ID:
+        print("Error: DATASET_REPO_ID environment variable is not set or is set to the default placeholder.")
+        print("Please set the DATASET_REPO_ID environment variable or update the script.")
+    elif hf_api:
+        print(f"Starting periodic upload scheduler (every {UPLOAD_INTERVAL_HOURS} hours)...")
+        scheduler.add_job(upload_preferences_to_hub, 'interval', hours=UPLOAD_INTERVAL_HOURS)
+        scheduler.start()
+        print(f"Found {len(INITIAL_SAMPLE_IDS)} samples.")
+        print(f"Configured to save results periodically to Hugging Face Dataset: {DATASET_REPO_ID}")
+        print("Starting Gradio app...")
+        demo.launch(server_name="0.0.0.0")
     else:
+        print("Warning: Running without Hugging Face Hub integration (HF_TOKEN or DATASET_REPO_ID missing/invalid).")
         print(f"Found {len(INITIAL_SAMPLE_IDS)} samples.")
         print("Starting Gradio app...")
         demo.launch(server_name="0.0.0.0")

requirements.txt CHANGED Viewed

	@@ -1 +1,3 @@
1	- gradio

+gradio
+huggingface_hub
+apscheduler # Added for periodic tasks