nbroad committed on
Commit
5dd9ac2
·
verified ·
1 Parent(s): b386119

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +13 -1
app.py CHANGED
@@ -136,8 +136,20 @@ async def collect_and_store_data():
136
  logger.info(f"Creating new dataset (existing not found): {e}")
137
  combined_df = new_df
138
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  # Convert back to dataset and push
140
- new_dataset = Dataset.from_pandas(combined_df)
141
  new_dataset.push_to_hub(DATASET_REPO_NAME, token=HF_TOKEN, private=False)
142
 
143
  logger.info(f"Successfully stored data for {len(results)} providers")
 
136
  logger.info(f"Creating new dataset (existing not found): {e}")
137
  combined_df = new_df
138
 
139
+ # De-duplicate by (provider, monthly_requests_int), keeping the earliest timestamp for each pair
140
+ combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp'])
141
+ combined_df = combined_df.sort_values('timestamp')
142
+
143
+ # Group by provider and monthly_requests_int, keep first (earliest) occurrence
144
+ deduplicated_df = combined_df.groupby(['provider', 'monthly_requests_int']).first().reset_index()
145
+
146
+ # Convert timestamp back to string format
147
+ deduplicated_df['timestamp'] = deduplicated_df['timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S.%f%z')
148
+
149
+ logger.info(f"De-duplicated dataset: {len(combined_df)} -> {len(deduplicated_df)} records")
150
+
151
  # Convert back to dataset and push
152
+ new_dataset = Dataset.from_pandas(deduplicated_df)
153
  new_dataset.push_to_hub(DATASET_REPO_NAME, token=HF_TOKEN, private=False)
154
 
155
  logger.info(f"Successfully stored data for {len(results)} providers")