nandezgarcia committed on
Commit
0af43d5
·
verified ·
1 Parent(s): 5b3940b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -40
app.py CHANGED
@@ -7,7 +7,7 @@ from huggingface_hub import HfApi, create_repo, upload_file
7
  from datasets import Dataset
8
 
9
  # File paths
10
- INPUT_CSV = "summaries_boe.csv" # Change this to your input CSV path
11
  OUTPUT_CSV = "results.csv" # Local backup file path
12
  TEMP_JSON = "temp_results.jsonl" # Temporary file for storing results as JSONL
13
 
@@ -17,43 +17,54 @@ HF_DATASET_REPO = "boe-preference-summaries-results" # Change this to your desi
17
  HF_USERNAME = os.environ.get("HF_USERNAME", "") # Set your HuggingFace username as an environment variable
18
 
19
  def load_data():
 
20
  if os.path.exists(INPUT_CSV):
21
  return pd.read_csv(INPUT_CSV)
22
  else:
23
- return pd.DataFrame(columns=["id", "boe_text_cleaned", "tweet_text_cleaned", "tweet_original"])
 
24
 
25
  def initialize_hf_dataset():
 
26
  if not HF_TOKEN or not HF_USERNAME:
27
- return False, "HuggingFace credentials not found."
28
-
29
  try:
30
  api = HfApi(token=HF_TOKEN)
31
  repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
32
-
 
33
  try:
34
  api.repo_info(repo_id=repo_id, repo_type="dataset")
 
35
  except Exception:
 
36
  create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
37
-
 
38
  if not os.path.exists(TEMP_JSON):
39
  with open(TEMP_JSON, "w") as f:
40
  pass
41
-
42
- return True, repo_id
43
  except Exception as e:
44
- return False, str(e)
45
 
46
  def push_to_hf_dataset(data_row):
 
47
  if not HF_TOKEN or not HF_USERNAME:
48
  return False, "HuggingFace credentials not found"
49
-
50
  try:
 
51
  with open(TEMP_JSON, "a") as f:
52
  f.write(json.dumps(data_row) + "\n")
53
-
 
54
  api = HfApi(token=HF_TOKEN)
55
  repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
56
-
 
57
  upload_file(
58
  path_or_fileobj=TEMP_JSON,
59
  path_in_repo="data.jsonl",
@@ -61,15 +72,17 @@ def push_to_hf_dataset(data_row):
61
  repo_type="dataset",
62
  token=HF_TOKEN
63
  )
64
-
65
  return True, f"Data pushed to {repo_id}"
66
  except Exception as e:
67
- return False, str(e)
68
 
69
  def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
 
70
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
71
  chosen_summary = "A" if choice == "Summary A" else "B"
72
-
 
73
  new_row = {
74
  "timestamp": timestamp,
75
  "text_id": text_id,
@@ -80,65 +93,85 @@ def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="",
80
  "notes": notes,
81
  "request_id": request_id
82
  }
83
-
 
84
  if os.path.exists(OUTPUT_CSV):
85
  results_df = pd.read_csv(OUTPUT_CSV)
86
  results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
87
  else:
88
  results_df = pd.DataFrame([new_row])
89
-
90
  results_df.to_csv(OUTPUT_CSV, index=False)
91
-
 
92
  success, message = push_to_hf_dataset(new_row)
93
-
94
  request_id_msg = f" (Request ID: {request_id})" if request_id else ""
95
-
96
  if success:
97
- return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {choice}."
98
  else:
99
  return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"
100
 
101
-
102
  class SummaryChooser:
103
  def __init__(self):
104
  self.df = load_data()
105
  self.current_index = 0
106
  self.total_items = len(self.df)
107
  self.hf_status = initialize_hf_dataset()
108
- self.request_id = ""
109
-
110
  def set_request_id(self, request: gr.Request):
111
- query_params = request.query_params
112
- self.request_id = query_params.get("id", "")
113
- return f"Request ID: {self.request_id}" if self.request_id else "No Request ID provided"
114
-
 
 
 
 
 
115
  def get_current_item(self):
 
116
  if self.total_items == 0:
117
- return "", "", "", "", f"No data found in {INPUT_CSV}."
118
-
119
  row = self.df.iloc[self.current_index]
120
  progress = f"Item {self.current_index + 1} of {self.total_items}"
121
- return row["id"], row["boe_text_cleaned"], row["tweet_text_cleaned"], row["tweet_original"], progress
122
-
123
  def next_item(self, choice, notes):
 
124
  if self.total_items == 0:
125
  return "", "", "", "", "No data available", ""
126
-
 
127
  text_id, text, summary_a, summary_b, _ = self.get_current_item()
128
-
 
129
  result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)
130
-
 
131
  self.current_index = (self.current_index + 1) % self.total_items
132
- return (*self.get_current_item(), result_message)
133
-
 
 
 
134
  def prev_item(self):
 
135
  if self.total_items == 0:
136
  return "", "", "", "", "No data available", ""
137
-
 
138
  self.current_index = (self.current_index - 1) % self.total_items
139
- return (*self.get_current_item(), "")
140
-
 
 
 
141
  def get_hf_status(self):
 
142
  success, message = self.hf_status
143
  return f"{'Connected' if success else 'Not Connected'} - {message}"
144
 
 
7
  from datasets import Dataset
8
 
9
  # File paths
10
+ INPUT_CSV = "summaries.csv" # Change this to your input CSV path
11
  OUTPUT_CSV = "results.csv" # Local backup file path
12
  TEMP_JSON = "temp_results.jsonl" # Temporary file for storing results as JSONL
13
 
 
17
  HF_USERNAME = os.environ.get("HF_USERNAME", "") # Set your HuggingFace username as an environment variable
18
 
19
  def load_data():
20
+ """Load data from CSV file"""
21
  if os.path.exists(INPUT_CSV):
22
  return pd.read_csv(INPUT_CSV)
23
  else:
24
+ # Create empty dataframe with required columns if file doesn't exist
25
+ return pd.DataFrame(columns=["text_id", "text", "summary_a", "summary_b"])
26
 
27
  def initialize_hf_dataset():
28
+ """Initialize a HuggingFace dataset repository if it doesn't exist"""
29
  if not HF_TOKEN or not HF_USERNAME:
30
+ return False, "HuggingFace credentials not found. Please set HF_TOKEN and HF_USERNAME environment variables."
31
+
32
  try:
33
  api = HfApi(token=HF_TOKEN)
34
  repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
35
+
36
+ # Check if the repo exists, if not create it
37
  try:
38
  api.repo_info(repo_id=repo_id, repo_type="dataset")
39
+ print(f"Repository {repo_id} already exists")
40
  except Exception:
41
+ print(f"Creating repository {repo_id}")
42
  create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
43
+
44
+ # Create an empty dataset file if it doesn't exist
45
  if not os.path.exists(TEMP_JSON):
46
  with open(TEMP_JSON, "w") as f:
47
  pass
48
+
49
+ return True, f"{repo_id}"
50
  except Exception as e:
51
+ return False, f"Error initializing HuggingFace dataset: {str(e)}"
52
 
53
  def push_to_hf_dataset(data_row):
54
+ """Push a new data row to the HuggingFace dataset"""
55
  if not HF_TOKEN or not HF_USERNAME:
56
  return False, "HuggingFace credentials not found"
57
+
58
  try:
59
+ # Append the new data to the JSONL file
60
  with open(TEMP_JSON, "a") as f:
61
  f.write(json.dumps(data_row) + "\n")
62
+
63
+ # Upload the file to HuggingFace
64
  api = HfApi(token=HF_TOKEN)
65
  repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
66
+
67
+ # Upload the JSONL file
68
  upload_file(
69
  path_or_fileobj=TEMP_JSON,
70
  path_in_repo="data.jsonl",
 
72
  repo_type="dataset",
73
  token=HF_TOKEN
74
  )
75
+
76
  return True, f"Data pushed to {repo_id}"
77
  except Exception as e:
78
+ return False, f"Error pushing to HuggingFace: {str(e)}"
79
 
80
  def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
81
+ """Save the user's choice locally and to HuggingFace dataset"""
82
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
83
  chosen_summary = "A" if choice == "Summary A" else "B"
84
+
85
+ # Create a new row with the data
86
  new_row = {
87
  "timestamp": timestamp,
88
  "text_id": text_id,
 
93
  "notes": notes,
94
  "request_id": request_id
95
  }
96
+
97
+ # Save locally
98
  if os.path.exists(OUTPUT_CSV):
99
  results_df = pd.read_csv(OUTPUT_CSV)
100
  results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
101
  else:
102
  results_df = pd.DataFrame([new_row])
103
+
104
  results_df.to_csv(OUTPUT_CSV, index=False)
105
+
106
+ # Push to HuggingFace
107
  success, message = push_to_hf_dataset(new_row)
108
+
109
  request_id_msg = f" (Request ID: {request_id})" if request_id else ""
110
+
111
  if success:
112
+ return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {'Summary A' if choice == 'Summary A' else 'Summary B'}. Pushed to HuggingFace."
113
  else:
114
  return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"
115
 
 
116
  class SummaryChooser:
117
  def __init__(self):
118
  self.df = load_data()
119
  self.current_index = 0
120
  self.total_items = len(self.df)
121
  self.hf_status = initialize_hf_dataset()
122
+ self.request_id = "" # Initialize empty request ID
123
+
124
  def set_request_id(self, request: gr.Request):
125
+ """Set the request ID from the URL query parameters"""
126
+ try:
127
+ query_params = request.query_params
128
+ self.request_id = query_params.get("id", "")
129
+ return f"Request ID: {self.request_id}" if self.request_id else "No Request ID provided"
130
+ except:
131
+ self.request_id = ""
132
+ return "Failed to get Request ID"
133
+
134
  def get_current_item(self):
135
+ """Get the current item from the dataframe"""
136
  if self.total_items == 0:
137
+ return "", "", "", "", f"No data found in {INPUT_CSV}. Please check the file path."
138
+
139
  row = self.df.iloc[self.current_index]
140
  progress = f"Item {self.current_index + 1} of {self.total_items}"
141
+ return row["id"], row["text"], row["summary_a"], row["summary_b"], progress
142
+
143
  def next_item(self, choice, notes):
144
+ """Save current choice and move to next item"""
145
  if self.total_items == 0:
146
  return "", "", "", "", "No data available", ""
147
+
148
+ # Get current values
149
  text_id, text, summary_a, summary_b, _ = self.get_current_item()
150
+
151
+ # Save the choice with the request ID
152
  result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)
153
+
154
+ # Move to next item or wrap around
155
  self.current_index = (self.current_index + 1) % self.total_items
156
+
157
+ # Get next item
158
+ text_id, text, summary_a, summary_b, progress = self.get_current_item()
159
+ return text_id, text, summary_a, summary_b, progress, result_message
160
+
161
  def prev_item(self):
162
+ """Move to previous item"""
163
  if self.total_items == 0:
164
  return "", "", "", "", "No data available", ""
165
+
166
+ # Move to previous item or wrap around
167
  self.current_index = (self.current_index - 1) % self.total_items
168
+
169
+ # Get the item
170
+ text_id, text, summary_a, summary_b, progress = self.get_current_item()
171
+ return text_id, text, summary_a, summary_b, progress, ""
172
+
173
  def get_hf_status(self):
174
+ """Get the status of HuggingFace integration"""
175
  success, message = self.hf_status
176
  return f"{'Connected' if success else 'Not Connected'} - {message}"
177