File size: 9,299 Bytes
e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 3ae931d e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 64e8120 0af43d5 e866dbe 0af43d5 e866dbe 64e8120 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 64e8120 0af43d5 e866dbe 0af43d5 e866dbe 64e8120 e866dbe 569798e e866dbe 569798e e866dbe 0af43d5 64e8120 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 64e8120 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 0af43d5 e866dbe 64e8120 e866dbe c968df1 e866dbe c968df1 e866dbe c968df1 64e8120 c968df1 64e8120 c968df1 e866dbe c968df1 e866dbe c968df1 e866dbe c968df1 f511d18 c968df1 e866dbe 6aca797 1223737 f8c1ea5 6aca797 e866dbe c968df1 e866dbe c968df1 e866dbe 64e8120 e866dbe 307da1d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 |
import json
import os
import shutil
from datetime import datetime

import gradio as gr
import pandas as pd
from datasets import Dataset
from huggingface_hub import HfApi, create_repo, hf_hub_download, upload_file
# Local files used by the app
INPUT_CSV = "summaries.csv"  # CSV with the texts and candidate summaries; point this at your input file
OUTPUT_CSV = "results.csv"  # local backup of every recorded choice
TEMP_JSON = "temp_results.jsonl"  # JSONL staging file that is uploaded to the Hub
# HuggingFace Hub settings (credentials are read from environment variables)
HF_DATASET_REPO = "boe-preference-summaries-results"  # dataset repository name; change as needed
HF_USERNAME = os.getenv("HF_USERNAME", "")  # account that owns the dataset repository
HF_TOKEN = os.getenv("HF_TOKEN", "")  # access token; empty string when unset
def load_data(path=None):
    """Load the comparison items from a CSV file.

    Args:
        path: CSV file to read. When omitted, ``INPUT_CSV`` is used.

    Returns:
        The DataFrame parsed from the file. If the file does not exist or is
        completely empty, an empty DataFrame with the columns ``id``,
        ``text``, ``summary_a`` and ``summary_b`` is returned instead.
    """
    csv_path = INPUT_CSV if path is None else path
    if os.path.exists(csv_path):
        try:
            return pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; treat it like a
            # missing file instead of crashing at start-up.
            pass
    # Fall back to an empty frame that still carries the expected columns
    return pd.DataFrame(columns=["id", "text", "summary_a", "summary_b"])
def initialize_hf_dataset():
    """Ensure the results dataset repository and the local staging file exist.

    Returns:
        tuple[bool, str]: ``(True, repo_id)`` on success, otherwise
        ``(False, reason)``. Errors are reported through the return value
        rather than raised.
    """
    if not HF_TOKEN or not HF_USERNAME:
        return False, "HuggingFace credentials not found. Please set HF_TOKEN and HF_USERNAME environment variables."
    try:
        api = HfApi(token=HF_TOKEN)
        repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
        # Check if the repo exists, if not create it
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            print(f"Repository {repo_id} already exists")
        except Exception:
            print(f"Creating repository {repo_id}")
            # The lookup can fail for reasons other than "not found";
            # exist_ok=True keeps creation from failing when the repo is
            # in fact already there.
            create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN, exist_ok=True)
        if not os.path.exists(TEMP_JSON):
            # Every push uploads the whole local file as data.jsonl, so an
            # empty local file would replace results already stored in the
            # repo. Start from the remote copy when there is one.
            remote_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
            if "data.jsonl" in remote_files:
                cached_copy = hf_hub_download(
                    repo_id=repo_id,
                    filename="data.jsonl",
                    repo_type="dataset",
                    token=HF_TOKEN,
                )
                shutil.copyfile(cached_copy, TEMP_JSON)
            else:
                # Nothing stored remotely yet: create an empty staging file
                with open(TEMP_JSON, "w", encoding="utf-8"):
                    pass
        return True, f"{repo_id}"
    except Exception as e:
        return False, f"Error initializing HuggingFace dataset: {str(e)}"
def _json_default(value):
    """Convert scalars that expose ``.item()`` (e.g. NumPy ints) for json.dumps."""
    to_python = getattr(value, "item", None)
    if callable(to_python):
        return to_python()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def push_to_hf_dataset(data_row):
    """Append one result row to the local JSONL file and upload it to the Hub.

    Args:
        data_row: Mapping describing one recorded choice. Values taken from
            a DataFrame row (such as a NumPy integer id) are converted to
            plain Python scalars before serialization.

    Returns:
        tuple[bool, str]: ``(True, message)`` when the upload succeeded,
        otherwise ``(False, reason)``. Errors are reported through the
        return value rather than raised.
    """
    if not HF_TOKEN or not HF_USERNAME:
        return False, "HuggingFace credentials not found"
    try:
        # Serialize first so a failure never leaves a partial line on disk
        line = json.dumps(data_row, default=_json_default)
        # Append the new data to the JSONL file
        with open(TEMP_JSON, "a", encoding="utf-8") as f:
            f.write(line + "\n")
        repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
        # Upload the whole JSONL file; it replaces data.jsonl in the repo
        upload_file(
            path_or_fileobj=TEMP_JSON,
            path_in_repo="data.jsonl",
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        return True, f"Data pushed to {repo_id}"
    except Exception as e:
        return False, f"Error pushing to HuggingFace: {str(e)}"
def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
    """Record the user's choice locally and push it to the HuggingFace dataset.

    Args:
        text_id: Identifier of the text being judged.
        original_text: The source text shown to the user.
        summary_a: First candidate summary.
        summary_b: Second candidate summary.
        choice: ``"Summary A"`` or ``"Summary B"``. Any other value (for
            example ``None`` when nothing was selected) is rejected.
        notes: Optional free-text notes from the user.
        request_id: Optional request identifier stored with the row.

    Returns:
        str: A status message. When ``choice`` is invalid nothing is written
        and the message says so.
    """
    labels = {"Summary A": "A", "Summary B": "B"}
    if choice not in labels:
        # Previously a missing selection was silently recorded as "B"
        return f"No summary selected for text ID: {text_id}. Nothing was saved."
    chosen_summary = labels[choice]
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Create a new row with the data
    new_row = {
        "timestamp": timestamp,
        "text_id": text_id,
        "original_text": original_text,
        "summary_a": summary_a,
        "summary_b": summary_b,
        "chosen_summary": chosen_summary,
        "notes": notes,
        "request_id": request_id,
    }
    # Save locally: append to the existing backup, or start a new one
    if os.path.exists(OUTPUT_CSV):
        results_df = pd.read_csv(OUTPUT_CSV)
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
    else:
        results_df = pd.DataFrame([new_row])
    results_df.to_csv(OUTPUT_CSV, index=False)
    # Push to HuggingFace; a failed push is reported but does not undo the local save
    success, message = push_to_hf_dataset(new_row)
    request_id_msg = f" (Request ID: {request_id})" if request_id else ""
    if success:
        return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {choice}. Pushed to HuggingFace."
    return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"
class SummaryChooser:
    """Cursor over the comparison items plus the handlers bound to the UI.

    NOTE(review): the file creates a single module-level instance and wires
    its methods to the Gradio events, so ``current_index`` and
    ``request_id`` are shared by every browser session using the app.
    """

    # The only values the radio input is expected to deliver
    _VALID_CHOICES = ("Summary A", "Summary B")

    def __init__(self):
        self.df = load_data()
        print(self.df)
        self.current_index = 0
        self.total_items = len(self.df)
        print("Total items: ", self.total_items)
        # (success, message) tuple describing the Hub connection
        self.hf_status = initialize_hf_dataset()
        self.request_id = ""  # Initialize empty request ID

    def set_request_id(self, request: gr.Request):
        """Store the ``id`` URL query parameter and return a label for it."""
        try:
            query_params = request.query_params
            self.request_id = query_params.get("id", "")
        except Exception:
            # Best effort: a missing or unusual request object must not
            # break page load.
            self.request_id = ""
            return "Failed to get Request ID"
        return f"Request ID: {self.request_id}" if self.request_id else "No Request ID provided"

    def get_current_item(self):
        """Return ``(id, text, summary_a, summary_b, progress)`` for the current row."""
        if self.total_items == 0:
            return "", "", "", "", f"No data found in {INPUT_CSV}. Please check the file path."
        row = self.df.iloc[self.current_index]
        progress = f"Item {self.current_index + 1} of {self.total_items}"
        return row["id"], row["text"], row["summary_a"], row["summary_b"], progress

    def next_item(self, choice, notes):
        """Save the current choice and advance to the next item.

        Returns a 6-tuple ``(id, text, summary_a, summary_b, progress,
        result_message)``. When no valid choice was made, nothing is saved
        and the current item is returned again with a prompt to select one.
        """
        if self.total_items == 0:
            return "", "", "", "", "No data available", ""
        # Get current values
        text_id, text, summary_a, summary_b, progress = self.get_current_item()
        if choice not in self._VALID_CHOICES:
            # Stay on the same item instead of recording a bogus choice
            return (
                text_id,
                text,
                summary_a,
                summary_b,
                progress,
                "Please select Summary A or Summary B before submitting.",
            )
        # Save the choice with the request ID
        result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)
        # Move to next item or wrap around
        self.current_index = (self.current_index + 1) % self.total_items
        # Get next item
        text_id, text, summary_a, summary_b, progress = self.get_current_item()
        return text_id, text, summary_a, summary_b, progress, result_message

    def prev_item(self):
        """Step back one item (wrapping around) without saving anything."""
        if self.total_items == 0:
            return "", "", "", "", "No data available", ""
        # Move to previous item or wrap around
        self.current_index = (self.current_index - 1) % self.total_items
        # Get the item
        text_id, text, summary_a, summary_b, progress = self.get_current_item()
        return text_id, text, summary_a, summary_b, progress, ""

    def get_hf_status(self):
        """Return a human-readable summary of the HuggingFace integration status."""
        success, message = self.hf_status
        return f"{'Connected' if success else 'Not Connected'} - {message}"
# Create the application
app = SummaryChooser()
# Fetch the first item up front so the components can be constructed with
# their initial values (constructor values go through each component's own
# value handling, unlike assigning ``.value`` afterwards).
text_id, text, sum_a, sum_b, prog = app.get_current_item()
# Define the Gradio interface
with gr.Blocks(title="Summary Chooser") as interface:
    gr.Markdown("# Summary Comparison Tool")
    gr.Markdown("Choose the better summary for each text")
    with gr.Row():
        with gr.Column():
            progress_label = gr.Label(label="Progress", value=prog)
        with gr.Column():
            hf_status = gr.Label(label="HuggingFace Status", value=app.get_hf_status())
        with gr.Column():
            request_id_label = gr.Label(label="Request ID")
        # NOTE(review): assumed to be a fourth column of the status row — confirm the intended layout
        with gr.Column():
            text_id_box = gr.Textbox(label="Text ID", interactive=False, value=str(text_id))
    with gr.Row():
        text_box = gr.TextArea(label="Original Text", lines=8, value=text)
    with gr.Row():
        with gr.Column():
            summary_a = gr.TextArea(label="Summary A", lines=5, value=sum_a)
        with gr.Column():
            summary_b = gr.TextArea(label="Summary B", lines=5, value=sum_b)
    with gr.Row():
        choice_radio = gr.Radio(
            choices=["Summary A", "Summary B"],
            label="Select the better summary",
        )
    with gr.Row():
        notes_box = gr.TextArea(label="Notes (optional)", lines=2)
    with gr.Row():
        prev_button = gr.Button("Previous")
        submit_button = gr.Button("Submit and Next", variant="primary")
    with gr.Row():
        result_box = gr.Textbox(label="Result")
    # Set up event handlers; both navigation handlers refresh the same six outputs
    submit_button.click(
        fn=app.next_item,
        inputs=[choice_radio, notes_box],
        outputs=[text_id_box, text_box, summary_a, summary_b, progress_label, result_box],
    )
    prev_button.click(
        fn=app.prev_item,
        inputs=[],
        outputs=[text_id_box, text_box, summary_a, summary_b, progress_label, result_box],
    )
    # Load the request ID from the URL when the page loads
    interface.load(
        fn=app.set_request_id,
        inputs=[],
        outputs=[request_id_label],
    )
# Launch the application
if __name__ == "__main__":
    interface.launch(ssr_mode=False, share=True)