Spaces:

VanoInvestigations
/

boe-preference-summaries

Sleeping

App Files Files Community

nandezgarcia committed on Mar 24

Commit

0af43d5

verified ·

1 Parent(s): 5b3940b

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -40

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from huggingface_hub import HfApi, create_repo, upload_file
 from datasets import Dataset
 # File paths
-INPUT_CSV = "summaries_boe.csv"  # Change this to your input CSV path
 OUTPUT_CSV = "results.csv"   # Local backup file path
 TEMP_JSON = "temp_results.jsonl"  # Temporary file for storing results as JSONL
@@ -17,43 +17,54 @@ HF_DATASET_REPO = "boe-preference-summaries-results"  # Change this to your desi
 HF_USERNAME = os.environ.get("HF_USERNAME", "")  # Set your HuggingFace username as an environment variable
 def load_data():
     if os.path.exists(INPUT_CSV):
         return pd.read_csv(INPUT_CSV)
     else:
-        return pd.DataFrame(columns=["id", "boe_text_cleaned", "tweet_text_cleaned", "tweet_original"])
 def initialize_hf_dataset():
     if not HF_TOKEN or not HF_USERNAME:
-        return False, "HuggingFace credentials not found."
     try:
         api = HfApi(token=HF_TOKEN)
         repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
         try:
             api.repo_info(repo_id=repo_id, repo_type="dataset")
         except Exception:
             create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
         if not os.path.exists(TEMP_JSON):
             with open(TEMP_JSON, "w") as f:
                 pass
-        return True, repo_id
     except Exception as e:
-        return False, str(e)
 def push_to_hf_dataset(data_row):
     if not HF_TOKEN or not HF_USERNAME:
         return False, "HuggingFace credentials not found"
     try:
         with open(TEMP_JSON, "a") as f:
             f.write(json.dumps(data_row) + "\n")
         api = HfApi(token=HF_TOKEN)
         repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
         upload_file(
             path_or_fileobj=TEMP_JSON,
             path_in_repo="data.jsonl",
@@ -61,15 +72,17 @@ def push_to_hf_dataset(data_row):
             repo_type="dataset",
             token=HF_TOKEN
         )
         return True, f"Data pushed to {repo_id}"
     except Exception as e:
-        return False, str(e)
 def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     chosen_summary = "A" if choice == "Summary A" else "B"
     new_row = {
         "timestamp": timestamp,
         "text_id": text_id,
@@ -80,65 +93,85 @@ def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="",
         "notes": notes,
         "request_id": request_id
     }
     if os.path.exists(OUTPUT_CSV):
         results_df = pd.read_csv(OUTPUT_CSV)
         results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
     else:
         results_df = pd.DataFrame([new_row])
     results_df.to_csv(OUTPUT_CSV, index=False)
     success, message = push_to_hf_dataset(new_row)
     request_id_msg = f" (Request ID: {request_id})" if request_id else ""
     if success:
-        return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {choice}."
     else:
         return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"
 class SummaryChooser:
     def __init__(self):
         self.df = load_data()
         self.current_index = 0
         self.total_items = len(self.df)
         self.hf_status = initialize_hf_dataset()
-        self.request_id = ""
     def set_request_id(self, request: gr.Request):
-        query_params = request.query_params
-        self.request_id = query_params.get("id", "")
-        return f"Request ID: {self.request_id}" if self.request_id else "No Request ID provided"
     def get_current_item(self):
         if self.total_items == 0:
-            return "", "", "", "", f"No data found in {INPUT_CSV}."
         row = self.df.iloc[self.current_index]
         progress = f"Item {self.current_index + 1} of {self.total_items}"
-        return row["id"], row["boe_text_cleaned"], row["tweet_text_cleaned"], row["tweet_original"], progress
     def next_item(self, choice, notes):
         if self.total_items == 0:
             return "", "", "", "", "No data available", ""
         text_id, text, summary_a, summary_b, _ = self.get_current_item()
         result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)
         self.current_index = (self.current_index + 1) % self.total_items
-        return (*self.get_current_item(), result_message)
     def prev_item(self):
         if self.total_items == 0:
             return "", "", "", "", "No data available", ""
         self.current_index = (self.current_index - 1) % self.total_items
-        return (*self.get_current_item(), "")
     def get_hf_status(self):
         success, message = self.hf_status
         return f"{'Connected' if success else 'Not Connected'} - {message}"

 from datasets import Dataset
 # File paths
+INPUT_CSV = "summaries.csv"  # Change this to your input CSV path
 OUTPUT_CSV = "results.csv"   # Local backup file path
 TEMP_JSON = "temp_results.jsonl"  # Temporary file for storing results as JSONL
 HF_USERNAME = os.environ.get("HF_USERNAME", "")  # Set your HuggingFace username as an environment variable
 def load_data():
     """Read INPUT_CSV into a DataFrame, or return an empty frame if it is absent."""
     if not os.path.exists(INPUT_CSV):
         # No input file on disk: return a zero-row frame that still carries
         # the four expected column names.
         fallback_columns = ["text_id", "text", "summary_a", "summary_b"]
         return pd.DataFrame(columns=fallback_columns)
     return pd.read_csv(INPUT_CSV)
 def initialize_hf_dataset():
     """Make sure the results dataset repo and the local JSONL buffer exist.

     Returns:
         A ``(success, message)`` tuple. On success ``message`` is the
         ``"<HF_USERNAME>/<HF_DATASET_REPO>"`` repo id; on failure it is a
         human-readable explanation of what went wrong.
     """
     if not HF_TOKEN or not HF_USERNAME:
         return False, "HuggingFace credentials not found. Please set HF_TOKEN and HF_USERNAME environment variables."

     try:
         api = HfApi(token=HF_TOKEN)
         repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"

         try:
             api.repo_info(repo_id=repo_id, repo_type="dataset")
             print(f"Repository {repo_id} already exists")
         except Exception:
             # The probe can fail for reasons other than "repo is missing"
             # (e.g. a transient network error), so creation is made
             # idempotent: exist_ok=True keeps an already-present repo from
             # raising a conflict.
             print(f"Creating repository {repo_id}")
             create_repo(
                 repo_id=repo_id,
                 repo_type="dataset",
                 token=HF_TOKEN,
                 exist_ok=True,
             )

         # Append mode creates the buffer when it is missing and never
         # truncates rows that are already in it.
         with open(TEMP_JSON, "a"):
             pass

         return True, repo_id
     except Exception as e:
         return False, f"Error initializing HuggingFace dataset: {e}"
 def push_to_hf_dataset(data_row):
+    """Push a new data row to the HuggingFace dataset"""
     if not HF_TOKEN or not HF_USERNAME:
         return False, "HuggingFace credentials not found"
     try:
+        # Append the new data to the JSONL file
         with open(TEMP_JSON, "a") as f:
             f.write(json.dumps(data_row) + "\n")
+        # Upload the file to HuggingFace
         api = HfApi(token=HF_TOKEN)
         repo_id = f"{HF_USERNAME}/{HF_DATASET_REPO}"
+        # Upload the JSONL file
         upload_file(
             path_or_fileobj=TEMP_JSON,
             path_in_repo="data.jsonl",
             repo_type="dataset",
             token=HF_TOKEN
         )
         return True, f"Data pushed to {repo_id}"
     except Exception as e:
+        return False, f"Error pushing to HuggingFace: {str(e)}"
 def save_choice(text_id, original_text, summary_a, summary_b, choice, notes="", request_id=""):
+    """Save the user's choice locally and to HuggingFace dataset"""
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     chosen_summary = "A" if choice == "Summary A" else "B"
+    # Create a new row with the data
     new_row = {
         "timestamp": timestamp,
         "text_id": text_id,
         "notes": notes,
         "request_id": request_id
     }
+    # Save locally
     if os.path.exists(OUTPUT_CSV):
         results_df = pd.read_csv(OUTPUT_CSV)
         results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
     else:
         results_df = pd.DataFrame([new_row])
     results_df.to_csv(OUTPUT_CSV, index=False)
+    # Push to HuggingFace
     success, message = push_to_hf_dataset(new_row)
     request_id_msg = f" (Request ID: {request_id})" if request_id else ""
     if success:
+        return f"Selection saved for text ID: {text_id}{request_id_msg}! You chose {'Summary A' if choice == 'Summary A' else 'Summary B'}. Pushed to HuggingFace."
     else:
         return f"Selection saved locally for text ID: {text_id}{request_id_msg}. HuggingFace push failed: {message}"
 class SummaryChooser:
     """Step through the input rows one at a time and record the preferred summary.

     Attributes:
         df: Table returned by ``load_data()``.
         current_index: Zero-based position of the row being shown.
         total_items: Number of rows in ``df``.
         hf_status: ``(success, message)`` tuple from ``initialize_hf_dataset()``.
         request_id: Value of the ``id`` URL query parameter, or ``""``.
     """

     def __init__(self):
         self.df = load_data()
         self.current_index = 0
         self.total_items = len(self.df)
         self.hf_status = initialize_hf_dataset()
         # NOTE(review): kept on the instance, so if one SummaryChooser serves
         # several browser sessions the most recent caller's id wins - confirm.
         self.request_id = ""

     def set_request_id(self, request: gr.Request):
         """Store the ``id`` URL query parameter and return a status string."""
         try:
             self.request_id = request.query_params.get("id", "")
         except Exception:
             # ``Exception`` rather than a bare ``except`` so that
             # KeyboardInterrupt and SystemExit still propagate.
             self.request_id = ""
             return "Failed to get Request ID"
         return f"Request ID: {self.request_id}" if self.request_id else "No Request ID provided"

     def get_current_item(self):
         """Return ``(text_id, text, summary_a, summary_b, progress)`` for the current row.

         With no rows loaded, the four fields are empty strings and
         ``progress`` carries an explanatory message instead.
         """
         if self.total_items == 0:
             return "", "", "", "", f"No data found in {INPUT_CSV}. Please check the file path."

         row = self.df.iloc[self.current_index]
         progress = f"Item {self.current_index + 1} of {self.total_items}"

         # load_data() names the identifier column "text_id", while this
         # method historically read "id"; accept either spelling.
         id_column = "text_id" if "text_id" in row.index else "id"
         text_id = row[id_column]
         # pandas may hand back a NumPy integer scalar, which json.dumps()
         # cannot serialise; .item() converts it to the built-in equivalent.
         if hasattr(text_id, "item"):
             text_id = text_id.item()

         return text_id, row["text"], row["summary_a"], row["summary_b"], progress

     def next_item(self, choice, notes):
         """Record ``choice`` for the current row, then advance (wrapping at the end).

         Args:
             choice: The selected option label; a missing selection is rejected.
             notes: Free-text notes stored alongside the choice.

         Returns:
             ``(text_id, text, summary_a, summary_b, progress, message)``.
         """
         if self.total_items == 0:
             return "", "", "", "", "No data available", ""

         current = self.get_current_item()
         if not choice:
             # save_choice() maps anything other than "Summary A" to "B", so a
             # missing selection must not reach it; stay on the same row.
             return (*current, "Please select a summary before continuing.")

         text_id, text, summary_a, summary_b, _ = current
         result_message = save_choice(text_id, text, summary_a, summary_b, choice, notes, self.request_id)

         self.current_index = (self.current_index + 1) % self.total_items
         return (*self.get_current_item(), result_message)

     def prev_item(self):
         """Step back one row (wrapping at the start) without saving anything."""
         if self.total_items == 0:
             return "", "", "", "", "No data available", ""

         self.current_index = (self.current_index - 1) % self.total_items
         return (*self.get_current_item(), "")

     def get_hf_status(self):
         """Return a one-line description of the HuggingFace connection state."""
         success, message = self.hf_status
         return f"{'Connected' if success else 'Not Connected'} - {message}"