Update app.py
Browse files
app.py
CHANGED
@@ -17,54 +17,43 @@ HF_DATASET_REPO = "boe-preference-summaries-results" # Change this to your desi
|
|
17 |
HF_USERNAME = os.environ.get("HF_USERNAME", "") # Set your HuggingFace username as an environment variable
|
18 |
|
19 |
def load_data():
|
20 |
-
"""Load data from CSV file"""
|
21 |
if os.path.exists(INPUT_CSV):
|
22 |
return pd.read_csv(INPUT_CSV)
|
23 |
else:
|
24 |
-
|
25 |
-
return pd.DataFrame(columns=["text_id", "text", "summary_a", "summary_b"])
|
26 |
|
27 |
def initialize_hf_dataset():
|
28 |
-
"""Initialize a HuggingFace dataset repository if it doesn't exist"""
|
29 |
if not HF_TOKEN or not HF_USERNAME:
|
30 |
-
return False, "HuggingFace credentials not found.
|
31 |
-
|
32 |
try:
|
33 |
api = HfApi(token=HF_TOKEN)
|
34 |
repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
|
35 |
-
|
36 |
-
# Check if the repo exists, if not create it
|
37 |
try:
|
38 |
api.repo_info(repo_id=repo_id, repo_type="dataset")
|
39 |
-
print(f"Repository {repo_id} already exists")
|
40 |
except Exception:
|
41 |
-
print(f"Creating repository {repo_id}")
|
42 |
create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
|
43 |
-
|
44 |
-
# Create an empty dataset file if it doesn't exist
|
45 |
if not os.path.exists(TEMP_JSON):
|
46 |
with open(TEMP_JSON, "w") as f:
|
47 |
pass
|
48 |
-
|
49 |
-
return True,
|
50 |
except Exception as e:
|
51 |
-
return False,
|
52 |
|
53 |
def push_to_hf_dataset(data_row):
|
54 |
-
"""Push a new data row to the HuggingFace dataset"""
|
55 |
if not HF_TOKEN or not HF_USERNAME:
|
56 |
return False, "HuggingFace credentials not found"
|
57 |
-
|
58 |
try:
|
59 |
-
# Append the new data to the JSONL file
|
60 |
with open(TEMP_JSON, "a") as f:
|
61 |
f.write(json.dumps(data_row) + "\n")
|
62 |
-
|
63 |
-
# Upload the file to HuggingFace
|
64 |
api = HfApi(token=HF_TOKEN)
|
65 |
repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
|
66 |
-
|
67 |
-
# Upload the JSONL file
|
68 |
upload_file(
|
69 |
path_or_fileobj=TEMP_JSON,
|
70 |
path_in_repo="data.jsonl",
|
@@ -72,17 +61,15 @@ def push_to_hf_dataset(data_row):
|
|
72 |
repo_type="dataset",
|
73 |
token=HF_TOKEN
|
74 |
)
|
75 |
-
|
76 |
return True, f"Data pushed to {repo_id}"
|
77 |
except Exception as e:
|
78 |
-
return False,
|
79 |
|
80 |
def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
|
81 |
-
"""Save the user's choice locally and to HuggingFace dataset"""
|
82 |
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
83 |
chosen_summary = "A" if choice == "Summary A" else "B"
|
84 |
-
|
85 |
-
# Create a new row with the data
|
86 |
new_row = {
|
87 |
"timestamp": timestamp,
|
88 |
"text_id": text_id,
|
@@ -93,85 +80,65 @@ def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="",
|
|
93 |
"notes": notes,
|
94 |
"request_id": request_id
|
95 |
}
|
96 |
-
|
97 |
-
# Save locally
|
98 |
if os.path.exists(OUTPUT_CSV):
|
99 |
results_df = pd.read_csv(OUTPUT_CSV)
|
100 |
results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
|
101 |
else:
|
102 |
results_df = pd.DataFrame([new_row])
|
103 |
-
|
104 |
results_df.to_csv(OUTPUT_CSV, index=False)
|
105 |
-
|
106 |
-
# Push to HuggingFace
|
107 |
success, message = push_to_hf_dataset(new_row)
|
108 |
-
|
109 |
request_id_msg = f" (Request ID: {request_id})" if request_id else ""
|
110 |
-
|
111 |
if success:
|
112 |
-
return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {
|
113 |
else:
|
114 |
return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"
|
115 |
|
|
|
116 |
class SummaryChooser:
|
117 |
def __init__(self):
|
118 |
self.df = load_data()
|
119 |
self.current_index = 0
|
120 |
self.total_items = len(self.df)
|
121 |
self.hf_status = initialize_hf_dataset()
|
122 |
-
self.request_id = ""
|
123 |
-
|
124 |
def set_request_id(self, request: gr.Request):
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
return f"Request ID: {self.request_id}" if self.request_id else "No Request ID provided"
|
130 |
-
except:
|
131 |
-
self.request_id = ""
|
132 |
-
return "Failed to get Request ID"
|
133 |
-
|
134 |
def get_current_item(self):
|
135 |
-
"""Get the current item from the dataframe"""
|
136 |
if self.total_items == 0:
|
137 |
-
return "", "", "", "", f"No data found in {INPUT_CSV}.
|
138 |
-
|
139 |
row = self.df.iloc[self.current_index]
|
140 |
progress = f"Item {self.current_index + 1} of {self.total_items}"
|
141 |
-
return row["id"], row["
|
142 |
-
|
143 |
def next_item(self, choice, notes):
|
144 |
-
"""Save current choice and move to next item"""
|
145 |
if self.total_items == 0:
|
146 |
return "", "", "", "", "No data available", ""
|
147 |
-
|
148 |
-
# Get current values
|
149 |
text_id, text, summary_a, summary_b, _ = self.get_current_item()
|
150 |
-
|
151 |
-
# Save the choice with the request ID
|
152 |
result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)
|
153 |
-
|
154 |
-
# Move to next item or wrap around
|
155 |
self.current_index = (self.current_index + 1) % self.total_items
|
156 |
-
|
157 |
-
|
158 |
-
text_id, text, summary_a, summary_b, progress = self.get_current_item()
|
159 |
-
return text_id, text, summary_a, summary_b, progress, result_message
|
160 |
-
|
161 |
def prev_item(self):
|
162 |
-
"""Move to previous item"""
|
163 |
if self.total_items == 0:
|
164 |
return "", "", "", "", "No data available", ""
|
165 |
-
|
166 |
-
# Move to previous item or wrap around
|
167 |
self.current_index = (self.current_index - 1) % self.total_items
|
168 |
-
|
169 |
-
|
170 |
-
text_id, text, summary_a, summary_b, progress = self.get_current_item()
|
171 |
-
return text_id, text, summary_a, summary_b, progress, ""
|
172 |
-
|
173 |
def get_hf_status(self):
|
174 |
-
"""Get the status of HuggingFace integration"""
|
175 |
success, message = self.hf_status
|
176 |
return f"{'Connected' if success else 'Not Connected'} - {message}"
|
177 |
|
|
|
17 |
HF_USERNAME = os.environ.get("HF_USERNAME", "") # Set your HuggingFace username as an environment variable
|
18 |
|
19 |
def load_data():
    """Load the comparison items from ``INPUT_CSV``.

    Returns:
        The DataFrame parsed from ``INPUT_CSV``. When the file does not
        exist, or exists but contains nothing to parse, an empty DataFrame
        with the columns ``id``, ``boe_text_cleaned``,
        ``tweet_text_cleaned`` and ``tweet_original`` is returned instead.
    """
    fallback_columns = ["id", "boe_text_cleaned", "tweet_text_cleaned", "tweet_original"]

    if not os.path.exists(INPUT_CSV):
        return pd.DataFrame(columns=fallback_columns)

    try:
        return pd.read_csv(INPUT_CSV)
    except pd.errors.EmptyDataError:
        # read_csv raises on a file with no header/data at all; treat that
        # the same as a missing file so callers get an empty, well-formed frame.
        return pd.DataFrame(columns=fallback_columns)
|
|
|
24 |
|
25 |
def initialize_hf_dataset():
    """Ensure the HuggingFace dataset repo and the local JSONL file exist.

    Returns:
        A ``(success, message)`` tuple. On success ``message`` is the
        repository id built as ``f"{HF_USERNAME}/{HF_DATASET_REPO}"``. On
        failure it is either the missing-credentials notice or the text of
        the exception that was raised.
    """
    if not HF_TOKEN or not HF_USERNAME:
        return False, "HuggingFace credentials not found."

    try:
        repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"

        # exist_ok=True makes creation idempotent, so there is no need to
        # probe with repo_info() and interpret *any* exception (network,
        # auth, ...) as "the repository does not exist yet".
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
            exist_ok=True,
        )

        # Create an empty local file only when none is present, so rows
        # already written to it are never truncated.
        if not os.path.exists(TEMP_JSON):
            with open(TEMP_JSON, "w"):
                pass

        return True, repo_id
    except Exception as e:
        # Deliberate catch-all: the caller expects a status tuple rather
        # than an exception, and reports the message to the user.
        return False, str(e)
|
45 |
|
46 |
def push_to_hf_dataset(data_row):
|
|
|
47 |
if not HF_TOKEN or not HF_USERNAME:
|
48 |
return False, "HuggingFace credentials not found"
|
49 |
+
|
50 |
try:
|
|
|
51 |
with open(TEMP_JSON, "a") as f:
|
52 |
f.write(json.dumps(data_row) + "\n")
|
53 |
+
|
|
|
54 |
api = HfApi(token=HF_TOKEN)
|
55 |
repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
|
56 |
+
|
|
|
57 |
upload_file(
|
58 |
path_or_fileobj=TEMP_JSON,
|
59 |
path_in_repo="data.jsonl",
|
|
|
61 |
repo_type="dataset",
|
62 |
token=HF_TOKEN
|
63 |
)
|
64 |
+
|
65 |
return True, f"Data pushed to {repo_id}"
|
66 |
except Exception as e:
|
67 |
+
return False, str(e)
|
68 |
|
69 |
def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
|
|
|
70 |
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
71 |
chosen_summary = "A" if choice == "Summary A" else "B"
|
72 |
+
|
|
|
73 |
new_row = {
|
74 |
"timestamp": timestamp,
|
75 |
"text_id": text_id,
|
|
|
80 |
"notes": notes,
|
81 |
"request_id": request_id
|
82 |
}
|
83 |
+
|
|
|
84 |
if os.path.exists(OUTPUT_CSV):
|
85 |
results_df = pd.read_csv(OUTPUT_CSV)
|
86 |
results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
|
87 |
else:
|
88 |
results_df = pd.DataFrame([new_row])
|
89 |
+
|
90 |
results_df.to_csv(OUTPUT_CSV, index=False)
|
91 |
+
|
|
|
92 |
success, message = push_to_hf_dataset(new_row)
|
93 |
+
|
94 |
request_id_msg = f" (Request ID: {request_id})" if request_id else ""
|
95 |
+
|
96 |
if success:
|
97 |
+
return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {choice}."
|
98 |
else:
|
99 |
return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"
|
100 |
|
101 |
+
|
102 |
class SummaryChooser:
    """Cursor over the loaded items that also records the user's choices.

    Attributes are set in ``__init__``: the DataFrame from ``load_data()``,
    the index of the item currently shown, the number of items, the
    ``(success, message)`` result of ``initialize_hf_dataset()``, and the
    request ID read from the page's query string.
    """

    def __init__(self):
        """Load the input data and run the HuggingFace dataset setup."""
        self.df = load_data()
        self.current_index = 0
        self.total_items = len(self.df)
        # (success, message) pair, unpacked later by get_hf_status().
        self.hf_status = initialize_hf_dataset()
        # NOTE(review): the request ID lives on the instance; if one instance
        # serves several sessions they would overwrite each other's ID —
        # confirm how instances are created.
        self.request_id = ""

    def set_request_id(self, request: gr.Request):
        """Store the ``id`` query parameter of *request* as the request ID.

        Args:
            request: The Gradio request for the current call. May be
                ``None`` (e.g. when the method is invoked directly rather
                than through the UI); in that case the ID is cleared.

        Returns:
            ``"Request ID: <id>"`` when a non-empty ID was found, otherwise
            ``"No Request ID provided"``.
        """
        if request is None:
            # Without a request there are no query parameters to read;
            # report it the same way as a missing ``id`` parameter.
            self.request_id = ""
            return "No Request ID provided"

        query_params = request.query_params
        self.request_id = query_params.get("id", "")
        return f"Request ID: {self.request_id}" if self.request_id else "No Request ID provided"

    def get_current_item(self):
        """Return the fields of the item at ``current_index`` plus progress.

        Returns:
            A 5-tuple ``(id, boe_text_cleaned, tweet_text_cleaned,
            tweet_original, progress)``. When there are no items the first
            four entries are empty strings and the last is a
            "No data found" message naming ``INPUT_CSV``.
        """
        if self.total_items == 0:
            return "", "", "", "", f"No data found in {INPUT_CSV}."

        row = self.df.iloc[self.current_index]
        progress = f"Item {self.current_index + 1} of {self.total_items}"
        # NOTE(review): values read from the row may be NumPy scalars rather
        # than plain Python types — confirm they survive the json.dumps call
        # made when the choice is pushed.
        return row["id"], row["boe_text_cleaned"], row["tweet_text_cleaned"], row["tweet_original"], progress

    def next_item(self, choice, notes):
        """Save the choice for the current item, then advance to the next.

        Args:
            choice: The selected option, passed through to ``save_choice``.
            notes: Free-text notes, passed through to ``save_choice``.

        Returns:
            A 6-tuple: the five values of ``get_current_item()`` for the
            item now shown, followed by the message from ``save_choice``.
            When there are no items, four empty strings,
            ``"No data available"`` and an empty message.
        """
        if self.total_items == 0:
            return "", "", "", "", "No data available", ""

        text_id, text, summary_a, summary_b, _ = self.get_current_item()

        result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)

        # Wrap around to the first item after the last one.
        self.current_index = (self.current_index + 1) % self.total_items
        return (*self.get_current_item(), result_message)

    def prev_item(self):
        """Step back to the previous item without saving anything.

        Returns:
            A 6-tuple: the five values of ``get_current_item()`` for the
            item now shown, followed by an empty message. When there are no
            items, four empty strings, ``"No data available"`` and an empty
            message.
        """
        if self.total_items == 0:
            return "", "", "", "", "No data available", ""

        # Python's modulo keeps the result non-negative, so stepping back
        # from index 0 lands on the last item.
        self.current_index = (self.current_index - 1) % self.total_items
        return (*self.get_current_item(), "")

    def get_hf_status(self):
        """Format the stored HuggingFace setup result for display.

        Returns:
            ``"Connected - <message>"`` or ``"Not Connected - <message>"``,
            depending on the success flag in ``hf_status``.
        """
        success, message = self.hf_status
        return f"{'Connected' if success else 'Not Connected'} - {message}"
|
144 |
|