File size: 9,299 Bytes
e866dbe
 
 
 
 
 
 
 
 
0af43d5
e866dbe
 
 
 
 
 
 
 
 
0af43d5
e866dbe
 
 
0af43d5
3ae931d
e866dbe
 
0af43d5
e866dbe
0af43d5
 
e866dbe
 
 
0af43d5
 
e866dbe
 
0af43d5
e866dbe
0af43d5
e866dbe
0af43d5
 
e866dbe
 
 
0af43d5
 
e866dbe
0af43d5
e866dbe
 
0af43d5
e866dbe
 
0af43d5
e866dbe
0af43d5
e866dbe
 
0af43d5
 
e866dbe
 
0af43d5
 
e866dbe
 
 
 
 
 
 
0af43d5
e866dbe
 
0af43d5
e866dbe
64e8120
0af43d5
e866dbe
 
0af43d5
 
e866dbe
 
 
 
 
 
 
64e8120
 
e866dbe
0af43d5
 
e866dbe
 
 
 
 
0af43d5
e866dbe
0af43d5
 
e866dbe
0af43d5
64e8120
0af43d5
e866dbe
0af43d5
e866dbe
64e8120
e866dbe
 
 
 
569798e
e866dbe
 
569798e
e866dbe
0af43d5
 
64e8120
0af43d5
 
 
 
 
 
 
 
 
e866dbe
0af43d5
e866dbe
0af43d5
 
e866dbe
 
0af43d5
 
e866dbe
0af43d5
e866dbe
 
0af43d5
 
e866dbe
0af43d5
 
64e8120
0af43d5
 
e866dbe
0af43d5
 
 
 
 
e866dbe
0af43d5
e866dbe
 
0af43d5
 
e866dbe
0af43d5
 
 
 
 
e866dbe
0af43d5
e866dbe
64e8120
e866dbe
 
 
 
 
c968df1
e866dbe
c968df1
e866dbe
c968df1
 
64e8120
 
c968df1
 
64e8120
 
 
 
c968df1
 
e866dbe
 
 
 
c968df1
e866dbe
c968df1
e866dbe
c968df1
f511d18
c968df1
 
 
 
 
e866dbe
 
 
 
 
 
 
 
 
 
 
 
6aca797
1223737
f8c1ea5
6aca797
 
e866dbe
 
 
 
 
c968df1
 
e866dbe
 
 
 
 
c968df1
e866dbe
64e8120
 
 
 
 
 
 
e866dbe
 
 
307da1d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
import gradio as gr
import pandas as pd
import os
from datetime import datetime
import json
from huggingface_hub import HfApi, create_repo, upload_file
from datasets import Dataset

# Local files used by the tool
INPUT_CSV = "summaries.csv"  # CSV with the texts and candidate summaries; edit to point at your data
OUTPUT_CSV = "results.csv"  # local CSV backup of the recorded choices
TEMP_JSON = "temp_results.jsonl"  # JSONL staging file that is uploaded to HuggingFace

# HuggingFace settings; the credentials are taken from environment variables
HF_TOKEN = os.getenv("HF_TOKEN", "")  # access token, empty string when unset
HF_DATASET_REPO = "boe-preference-summaries-results"  # dataset repository name; edit as needed
HF_USERNAME = os.getenv("HF_USERNAME", "")  # username used as the repo namespace, empty when unset

def load_data():
    """Return the contents of INPUT_CSV as a DataFrame.

    If the file does not exist, an empty DataFrame carrying the expected
    columns (id, text, summary_a, summary_b) is returned instead.
    """
    if not os.path.exists(INPUT_CSV):
        # No input file: fall back to an empty frame with the required columns
        required_columns = ["id", "text", "summary_a", "summary_b"]
        return pd.DataFrame(columns=required_columns)
    return pd.read_csv(INPUT_CSV)

def initialize_hf_dataset():
    """Ensure the HuggingFace dataset repository and the local staging file exist.

    Returns:
        A ``(success, message)`` tuple. On success ``message`` is the
        ``"<username>/<repo>"`` repository id; on failure it is a
        human-readable description of what went wrong.
    """
    if not HF_TOKEN or not HF_USERNAME:
        return False, "HuggingFace credentials not found. Please set HF_TOKEN and HF_USERNAME environment variables."

    try:
        api = HfApi(token=HF_TOKEN)
        repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"

        # Check if the repo exists, if not create it
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            print(f"Repository {repo_id} already exists")
        except Exception:
            # repo_info can fail for reasons other than "not found" (e.g. a
            # transient network error), and another process may create the
            # repo in the meantime. exist_ok=True keeps the creation call
            # idempotent so an existing repo is not reported as an error.
            print(f"Creating repository {repo_id}")
            create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN, exist_ok=True)

        # Create an empty staging file if it doesn't exist yet
        if not os.path.exists(TEMP_JSON):
            with open(TEMP_JSON, "w", encoding="utf-8"):
                pass

        return True, f"{repo_id}"
    except Exception as e:
        # Startup must not crash on Hub problems; report them to the caller
        return False, f"Error initializing HuggingFace dataset: {str(e)}"

def _json_default(value):
    """Convert NumPy-style scalars to plain Python values for ``json.dumps``."""
    # Values read from a DataFrame row (e.g. an integer id) are NumPy scalars,
    # which the json module cannot serialize; ``.item()`` yields the native value.
    if hasattr(value, "item"):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def push_to_hf_dataset(data_row):
    """Append one result row to the local JSONL file and upload that file.

    Args:
        data_row: Mapping of column name to value describing a single choice.

    Returns:
        A ``(success, message)`` tuple; ``message`` explains the outcome.

    NOTE(review): the whole staging file is uploaded as ``data.jsonl`` on each
    call. If the local file is ever lost or reset, the next upload presumably
    replaces the remote file with only the newer rows -- confirm this is
    acceptable for the deployment.
    """
    if not HF_TOKEN or not HF_USERNAME:
        return False, "HuggingFace credentials not found"

    try:
        # Append the new data to the JSONL file
        with open(TEMP_JSON, "a", encoding="utf-8") as f:
            f.write(json.dumps(data_row, default=_json_default) + "\n")

        repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"

        # Upload the JSONL file
        upload_file(
            path_or_fileobj=TEMP_JSON,
            path_in_repo="data.jsonl",
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN
        )

        return True, f"Data pushed to {repo_id}"
    except Exception as e:
        # The local backup has already been written by the caller; report the
        # failure instead of raising so the UI can keep going.
        return False, f"Error pushing to HuggingFace: {str(e)}"

def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
    """Save the user's choice locally and to the HuggingFace dataset.

    Args:
        text_id: Identifier of the text being judged.
        original_text: The source text.
        summary_a: First candidate summary.
        summary_b: Second candidate summary.
        choice: ``"Summary A"`` or ``"Summary B"``; anything else (including
            ``None`` when nothing was selected) is rejected and nothing is saved.
        notes: Optional free-text notes.
        request_id: Optional request identifier stored with the row.

    Returns:
        A status message suitable for display to the user.
    """
    # Validate first: previously any value other than "Summary A" -- including
    # no selection at all -- was silently recorded as a vote for B.
    if choice == "Summary A":
        chosen_summary = "A"
    elif choice == "Summary B":
        chosen_summary = "B"
    else:
        return f"No selection made for text ID: {text_id}. Please choose Summary A or Summary B."

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Create a new row with the data
    new_row = {
        "timestamp": timestamp,
        "text_id": text_id,
        "original_text": original_text,
        "summary_a": summary_a,
        "summary_b": summary_b,
        "chosen_summary": chosen_summary,
        "notes": notes,
        "request_id": request_id
    }

    # Save locally: extend the existing results file, or start a new one
    if os.path.exists(OUTPUT_CSV):
        results_df = pd.read_csv(OUTPUT_CSV)
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
    else:
        results_df = pd.DataFrame([new_row])

    results_df.to_csv(OUTPUT_CSV, index=False)

    # Push to HuggingFace
    success, message = push_to_hf_dataset(new_row)

    request_id_msg = f" (Request ID: {request_id})" if request_id else ""

    if success:
        return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {choice}. Pushed to HuggingFace."
    return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"

class SummaryChooser:
    """Holds the loaded summaries and the navigation state used by the UI.

    NOTE(review): the file creates one module-level instance, so
    ``current_index`` and ``request_id`` appear to be shared by every browser
    session -- confirm whether per-session state is needed.
    """

    # The only radio values that count as a real selection
    _VALID_CHOICES = ("Summary A", "Summary B")

    def __init__(self):
        """Load the input data and initialize the HuggingFace integration."""
        self.df = load_data()
        print(self.df)
        self.current_index = 0
        self.total_items = len(self.df)
        print("Total items: ", self.total_items)
        self.hf_status = initialize_hf_dataset()  # (success, message) tuple
        self.request_id = ""  # Initialize empty request ID

    def set_request_id(self, request: gr.Request):
        """Store the ``id`` URL query parameter and return a status string."""
        try:
            query_params = request.query_params
            self.request_id = query_params.get("id", "")
        except Exception:
            # Best effort: a missing or malformed request must not break page
            # load. Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            self.request_id = ""
            return "Failed to get Request ID"
        return f"Request ID: {self.request_id}" if self.request_id else "No Request ID provided"

    def get_current_item(self):
        """Return ``(id, text, summary_a, summary_b, progress)`` for the current row.

        When no data is loaded, the first four values are empty strings and
        the fifth is an explanatory message.
        """
        if self.total_items == 0:
            return "", "", "", "", f"No data found in {INPUT_CSV}. Please check the file path."

        row = self.df.iloc[self.current_index]
        progress = f"Item {self.current_index + 1} of {self.total_items}"
        return row["id"], row["text"], row["summary_a"], row["summary_b"], progress

    def next_item(self, choice, notes):
        """Save the current choice and move to the next item.

        Returns ``(id, text, summary_a, summary_b, progress, result_message)``.
        If ``choice`` is not a valid selection, nothing is saved and the
        current item is returned again with a prompt to choose.
        """
        if self.total_items == 0:
            return "", "", "", "", "No data available", ""

        # Get current values
        text_id, text, summary_a, summary_b, progress = self.get_current_item()

        # Without this guard an empty selection would be saved (as a vote for
        # B) and the item skipped.
        if choice not in self._VALID_CHOICES:
            return (
                text_id, text, summary_a, summary_b, progress,
                "Please select Summary A or Summary B before submitting.",
            )

        # Save the choice with the request ID
        result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)

        # Move to next item or wrap around
        self.current_index = (self.current_index + 1) % self.total_items

        # Get next item
        text_id, text, summary_a, summary_b, progress = self.get_current_item()
        return text_id, text, summary_a, summary_b, progress, result_message

    def prev_item(self):
        """Move to the previous item (wrapping around) without saving anything."""
        if self.total_items == 0:
            return "", "", "", "", "No data available", ""

        # Move to previous item or wrap around
        self.current_index = (self.current_index - 1) % self.total_items

        # Get the item
        text_id, text, summary_a, summary_b, progress = self.get_current_item()
        return text_id, text, summary_a, summary_b, progress, ""

    def get_hf_status(self):
        """Return a display string describing the HuggingFace integration status."""
        success, message = self.hf_status
        return f"{'Connected' if success else 'Not Connected'} - {message}"

# Create the application
# One module-level instance: constructing it loads the input CSV and
# initializes the HuggingFace dataset, and its bound methods serve as the
# Gradio event handlers.
app = SummaryChooser()

# Define the Gradio interface
with gr.Blocks(title="Summary Chooser") as interface:
    gr.Markdown("# Summary Comparison Tool")
    gr.Markdown("Choose the better summary for each text")

    # Status row: progress, HuggingFace connection, request ID and text ID
    with gr.Row():
        with gr.Column():
            progress_label = gr.Label(label="Progress")

        with gr.Column():
            hf_status = gr.Label(label="HuggingFace Status", value=app.get_hf_status())

        with gr.Column():
            request_id_label = gr.Label(label="Request ID")

        with gr.Column():
            text_id_box = gr.Textbox(label="Text ID", interactive=False)

    with gr.Row():
        text_box = gr.TextArea(label="Original Text", lines=8)

    # The two candidate summaries side by side
    with gr.Row():
        with gr.Column():
            summary_a = gr.TextArea(label="Summary A", lines=5)
        with gr.Column():
            summary_b = gr.TextArea(label="Summary B", lines=5)

    with gr.Row():
        choice_radio = gr.Radio(
            choices=["Summary A", "Summary B"],
            label="Select the better summary"
        )

    with gr.Row():
        notes_box = gr.TextArea(label="Notes (optional)", lines=2)

    with gr.Row():
        prev_button = gr.Button("Previous")
        submit_button = gr.Button("Submit and Next", variant="primary")

    with gr.Row():
        result_box = gr.Textbox(label="Result")

    # Set up event handlers
    submit_button.click(
        fn=app.next_item,
        inputs=[choice_radio, notes_box],
        outputs=[text_id_box, text_box, summary_a, summary_b, progress_label, result_box]
    )

    prev_button.click(
        fn=app.prev_item,
        inputs=[],
        outputs=[text_id_box, text_box, summary_a, summary_b, progress_label, result_box]
    )

    # Populate the item widgets on every page load from the app's current
    # position. Baking the first item into the components at build time meant
    # a reloaded page showed item 1 while the app had already moved on, so the
    # next submit saved a different item than the one displayed.
    interface.load(
        fn=app.get_current_item,
        inputs=[],
        outputs=[text_id_box, text_box, summary_a, summary_b, progress_label]
    )

    # Load the request ID from the URL when the page loads
    interface.load(
        fn=app.set_request_id,
        inputs=[],
        outputs=[request_id_label]
    )

# Start the Gradio server only when this file is run as a script
if __name__ == "__main__":
    launch_options = {"ssr_mode": False, "share": True}
    interface.launch(**launch_options)