Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ from huggingface_hub import HfApi, create_repo, upload_file
|
|
7 |
from datasets import Dataset
|
8 |
|
9 |
# File paths
|
10 |
-
INPUT_CSV = "
|
11 |
OUTPUT_CSV = "results.csv" # Local backup file path
|
12 |
TEMP_JSON = "temp_results.jsonl" # Temporary file for storing results as JSONL
|
13 |
|
@@ -17,43 +17,54 @@ HF_DATASET_REPO = "boe-preference-summaries-results" # Change this to your desi
|
|
17 |
HF_USERNAME = os.environ.get("HF_USERNAME", "") # Set your HuggingFace username as an environment variable
|
18 |
|
19 |
def load_data():
|
|
|
20 |
if os.path.exists(INPUT_CSV):
|
21 |
return pd.read_csv(INPUT_CSV)
|
22 |
else:
|
23 |
-
|
|
|
24 |
|
25 |
def initialize_hf_dataset():
|
|
|
26 |
if not HF_TOKEN or not HF_USERNAME:
|
27 |
-
return False, "HuggingFace credentials not found."
|
28 |
-
|
29 |
try:
|
30 |
api = HfApi(token=HF_TOKEN)
|
31 |
repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
|
32 |
-
|
|
|
33 |
try:
|
34 |
api.repo_info(repo_id=repo_id, repo_type="dataset")
|
|
|
35 |
except Exception:
|
|
|
36 |
create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
|
37 |
-
|
|
|
38 |
if not os.path.exists(TEMP_JSON):
|
39 |
with open(TEMP_JSON, "w") as f:
|
40 |
pass
|
41 |
-
|
42 |
-
return True, repo_id
|
43 |
except Exception as e:
|
44 |
-
return False, str(e)
|
45 |
|
46 |
def push_to_hf_dataset(data_row):
|
|
|
47 |
if not HF_TOKEN or not HF_USERNAME:
|
48 |
return False, "HuggingFace credentials not found"
|
49 |
-
|
50 |
try:
|
|
|
51 |
with open(TEMP_JSON, "a") as f:
|
52 |
f.write(json.dumps(data_row) + "\n")
|
53 |
-
|
|
|
54 |
api = HfApi(token=HF_TOKEN)
|
55 |
repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
|
56 |
-
|
|
|
57 |
upload_file(
|
58 |
path_or_fileobj=TEMP_JSON,
|
59 |
path_in_repo="data.jsonl",
|
@@ -61,15 +72,17 @@ def push_to_hf_dataset(data_row):
|
|
61 |
repo_type="dataset",
|
62 |
token=HF_TOKEN
|
63 |
)
|
64 |
-
|
65 |
return True, f"Data pushed to {repo_id}"
|
66 |
except Exception as e:
|
67 |
-
return False, str(e)
|
68 |
|
69 |
def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
|
|
|
70 |
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
71 |
chosen_summary = "A" if choice == "Summary A" else "B"
|
72 |
-
|
|
|
73 |
new_row = {
|
74 |
"timestamp": timestamp,
|
75 |
"text_id": text_id,
|
@@ -80,65 +93,85 @@ def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="",
|
|
80 |
"notes": notes,
|
81 |
"request_id": request_id
|
82 |
}
|
83 |
-
|
|
|
84 |
if os.path.exists(OUTPUT_CSV):
|
85 |
results_df = pd.read_csv(OUTPUT_CSV)
|
86 |
results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
|
87 |
else:
|
88 |
results_df = pd.DataFrame([new_row])
|
89 |
-
|
90 |
results_df.to_csv(OUTPUT_CSV, index=False)
|
91 |
-
|
|
|
92 |
success, message = push_to_hf_dataset(new_row)
|
93 |
-
|
94 |
request_id_msg = f" (Request ID: {request_id})" if request_id else ""
|
95 |
-
|
96 |
if success:
|
97 |
-
return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {choice}."
|
98 |
else:
|
99 |
return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"
|
100 |
|
101 |
-
|
102 |
class SummaryChooser:
|
103 |
def __init__(self):
|
104 |
self.df = load_data()
|
105 |
self.current_index = 0
|
106 |
self.total_items = len(self.df)
|
107 |
self.hf_status = initialize_hf_dataset()
|
108 |
-
self.request_id = ""
|
109 |
-
|
110 |
def set_request_id(self, request: gr.Request):
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
115 |
def get_current_item(self):
|
|
|
116 |
if self.total_items == 0:
|
117 |
-
return "", "", "", "", f"No data found in {INPUT_CSV}."
|
118 |
-
|
119 |
row = self.df.iloc[self.current_index]
|
120 |
progress = f"Item {self.current_index + 1} of {self.total_items}"
|
121 |
-
return row["id"], row["
|
122 |
-
|
123 |
def next_item(self, choice, notes):
|
|
|
124 |
if self.total_items == 0:
|
125 |
return "", "", "", "", "No data available", ""
|
126 |
-
|
|
|
127 |
text_id, text, summary_a, summary_b, _ = self.get_current_item()
|
128 |
-
|
|
|
129 |
result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)
|
130 |
-
|
|
|
131 |
self.current_index = (self.current_index + 1) % self.total_items
|
132 |
-
|
133 |
-
|
|
|
|
|
|
|
134 |
def prev_item(self):
|
|
|
135 |
if self.total_items == 0:
|
136 |
return "", "", "", "", "No data available", ""
|
137 |
-
|
|
|
138 |
self.current_index = (self.current_index - 1) % self.total_items
|
139 |
-
|
140 |
-
|
|
|
|
|
|
|
141 |
def get_hf_status(self):
|
|
|
142 |
success, message = self.hf_status
|
143 |
return f"{'Connected' if success else 'Not Connected'} - {message}"
|
144 |
|
|
|
7 |
from datasets import Dataset
|
8 |
|
9 |
# File paths
|
10 |
+
INPUT_CSV = "summaries.csv" # Change this to your input CSV path
|
11 |
OUTPUT_CSV = "results.csv" # Local backup file path
|
12 |
TEMP_JSON = "temp_results.jsonl" # Temporary file for storing results as JSONL
|
13 |
|
|
|
17 |
HF_USERNAME = os.environ.get("HF_USERNAME", "") # Set your HuggingFace username as an environment variable
|
18 |
|
19 |
def load_data():
    """Load the summaries to be compared from ``INPUT_CSV``.

    Returns:
        pandas.DataFrame: The parsed CSV. When the file is missing or has no
        content at all, an empty frame is returned instead, carrying the
        columns that ``SummaryChooser.get_current_item`` reads
        (``id``, ``text``, ``summary_a``, ``summary_b``).
    """
    try:
        # EAFP: read directly rather than testing os.path.exists first, which
        # leaves a window where the file can disappear before it is opened.
        return pd.read_csv(INPUT_CSV)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Missing or zero-byte input: hand back an empty, correctly shaped
        # frame so the caller can report "no data" instead of crashing.
        return pd.DataFrame(columns=["id", "text", "summary_a", "summary_b"])
|
26 |
|
27 |
def initialize_hf_dataset():
    """Ensure the HuggingFace dataset repository and the local JSONL buffer exist.

    Returns:
        tuple[bool, str]: ``(True, repo_id)`` on success; otherwise
        ``(False, message)`` where the message names the missing credentials
        or describes the error that occurred.
    """
    if not HF_TOKEN or not HF_USERNAME:
        return False, "HuggingFace credentials not found. Please set HF_TOKEN and HF_USERNAME environment variables."

    try:
        # Imported here (inside the try) so the file's import block is left
        # untouched and an import problem is reported like any other failure.
        from huggingface_hub.utils import RepositoryNotFoundError

        api = HfApi(token=HF_TOKEN)
        repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"

        # Only a "not found" answer means the repository must be created.
        # Authentication or network errors fall through to the outer handler
        # and are reported as-is, instead of triggering a doomed create call.
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            print(f"Repository {repo_id} already exists")
        except RepositoryNotFoundError:
            print(f"Creating repository {repo_id}")
            # exist_ok tolerates another process creating the repo in between.
            create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN, exist_ok=True)

        # Make sure the local buffer exists so later appends/uploads find it.
        # NOTE(review): this buffer starts empty; push_to_hf_dataset uploads it
        # as "data.jsonl", so rows already on the Hub from an earlier run may
        # be replaced - confirm whether they should be downloaded first.
        if not os.path.exists(TEMP_JSON):
            with open(TEMP_JSON, "w"):
                pass

        return True, repo_id
    except Exception as e:
        return False, f"Error initializing HuggingFace dataset: {e}"
|
52 |
|
53 |
def push_to_hf_dataset(data_row):
|
54 |
+
"""Push a new data row to the HuggingFace dataset"""
|
55 |
if not HF_TOKEN or not HF_USERNAME:
|
56 |
return False, "HuggingFace credentials not found"
|
57 |
+
|
58 |
try:
|
59 |
+
# Append the new data to the JSONL file
|
60 |
with open(TEMP_JSON, "a") as f:
|
61 |
f.write(json.dumps(data_row) + "\n")
|
62 |
+
|
63 |
+
# Upload the file to HuggingFace
|
64 |
api = HfApi(token=HF_TOKEN)
|
65 |
repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
|
66 |
+
|
67 |
+
# Upload the JSONL file
|
68 |
upload_file(
|
69 |
path_or_fileobj=TEMP_JSON,
|
70 |
path_in_repo="data.jsonl",
|
|
|
72 |
repo_type="dataset",
|
73 |
token=HF_TOKEN
|
74 |
)
|
75 |
+
|
76 |
return True, f"Data pushed to {repo_id}"
|
77 |
except Exception as e:
|
78 |
+
return False, f"Error pushing to HuggingFace: {str(e)}"
|
79 |
|
80 |
def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
|
81 |
+
"""Save the user's choice locally and to HuggingFace dataset"""
|
82 |
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
83 |
chosen_summary = "A" if choice == "Summary A" else "B"
|
84 |
+
|
85 |
+
# Create a new row with the data
|
86 |
new_row = {
|
87 |
"timestamp": timestamp,
|
88 |
"text_id": text_id,
|
|
|
93 |
"notes": notes,
|
94 |
"request_id": request_id
|
95 |
}
|
96 |
+
|
97 |
+
# Save locally
|
98 |
if os.path.exists(OUTPUT_CSV):
|
99 |
results_df = pd.read_csv(OUTPUT_CSV)
|
100 |
results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
|
101 |
else:
|
102 |
results_df = pd.DataFrame([new_row])
|
103 |
+
|
104 |
results_df.to_csv(OUTPUT_CSV, index=False)
|
105 |
+
|
106 |
+
# Push to HuggingFace
|
107 |
success, message = push_to_hf_dataset(new_row)
|
108 |
+
|
109 |
request_id_msg = f" (Request ID: {request_id})" if request_id else ""
|
110 |
+
|
111 |
if success:
|
112 |
+
return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {'Summary A' if choice == 'Summary A' else 'Summary B'}. Pushed to HuggingFace."
|
113 |
else:
|
114 |
return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"
|
115 |
|
|
|
116 |
class SummaryChooser:
    """Stateful cursor over the summaries frame that backs the comparison UI.

    Keeps the loaded data, the index of the item currently shown, the result
    of the HuggingFace initialisation, and the request ID attached to saved
    choices.
    """

    def __init__(self):
        """Load the data, start at the first item and initialise HuggingFace."""
        self.df = load_data()
        self.current_index = 0
        self.total_items = len(self.df)
        self.hf_status = initialize_hf_dataset()
        self.request_id = ""  # Filled in later by set_request_id

    @staticmethod
    def _to_native(value):
        """Convert a NumPy scalar to the equivalent built-in Python value."""
        # Cells read from a DataFrame row can be numpy.int64/float64, which
        # json.dumps rejects; plain str/int values have no .item() and pass
        # through unchanged.
        return value.item() if hasattr(value, "item") else value

    def set_request_id(self, request: gr.Request):
        """Store the ``id`` URL query parameter as the request ID.

        Args:
            request: The incoming request; its ``query_params`` mapping is read.

        Returns:
            str: A status line describing the outcome.
        """
        # NOTE(review): the ID is stored on the instance, so if one
        # SummaryChooser serves several browser sessions they overwrite each
        # other's value - confirm how the instance is shared.
        try:
            self.request_id = request.query_params.get("id", "")
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate.
            self.request_id = ""
            return "Failed to get Request ID"

        if self.request_id:
            return f"Request ID: {self.request_id}"
        return "No Request ID provided"

    def get_current_item(self):
        """Return the item at the current index.

        Returns:
            tuple: ``(id, text, summary_a, summary_b, progress)``. The four
            data fields are empty strings and ``progress`` carries an
            explanatory message when no data is loaded.
        """
        if self.total_items == 0:
            return "", "", "", "", f"No data found in {INPUT_CSV}. Please check the file path."

        row = self.df.iloc[self.current_index]
        progress = f"Item {self.current_index + 1} of {self.total_items}"
        # Convert to built-in types so the values can be JSON-serialised when
        # the choice is later saved.
        return (
            self._to_native(row["id"]),
            self._to_native(row["text"]),
            self._to_native(row["summary_a"]),
            self._to_native(row["summary_b"]),
            progress,
        )

    def next_item(self, choice, notes):
        """Save the choice for the current item and advance to the next one.

        Args:
            choice: The selected summary label, passed through to ``save_choice``.
            notes: Free-text notes, passed through to ``save_choice``.

        Returns:
            tuple: ``(id, text, summary_a, summary_b, progress, result_message)``
            for the item shown after advancing.
        """
        if self.total_items == 0:
            return "", "", "", "", "No data available", ""

        # Persist the decision for the item currently on screen.
        text_id, text, summary_a, summary_b, _ = self.get_current_item()
        result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)

        # Advance, wrapping from the last item back to the first.
        self.current_index = (self.current_index + 1) % self.total_items

        text_id, text, summary_a, summary_b, progress = self.get_current_item()
        return text_id, text, summary_a, summary_b, progress, result_message

    def prev_item(self):
        """Step back to the previous item without saving anything.

        Returns:
            tuple: ``(id, text, summary_a, summary_b, progress, "")``.
        """
        if self.total_items == 0:
            return "", "", "", "", "No data available", ""

        # Python's modulo keeps the index non-negative, so stepping back from
        # the first item wraps to the last one.
        self.current_index = (self.current_index - 1) % self.total_items

        text_id, text, summary_a, summary_b, progress = self.get_current_item()
        return text_id, text, summary_a, summary_b, progress, ""

    def get_hf_status(self):
        """Return a one-line description of the HuggingFace connection state."""
        success, message = self.hf_status
        return f"{'Connected' if success else 'Not Connected'} - {message}"
|
177 |
|