Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
@@ -136,8 +136,20 @@ async def collect_and_store_data():
|
|
136 |
logger.info(f"Creating new dataset (existing not found): {e}")
|
137 |
combined_df = new_df
|
138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
# Convert back to dataset and push
|
140 |
-
new_dataset = Dataset.from_pandas(
|
141 |
new_dataset.push_to_hub(DATASET_REPO_NAME, token=HF_TOKEN, private=False)
|
142 |
|
143 |
logger.info(f"Successfully stored data for {len(results)} providers")
|
|
|
136 |
logger.info(f"Creating new dataset (existing not found): {e}")
|
137 |
combined_df = new_df
|
138 |
|
139 |
+
# De-duplicate by (provider, monthly_requests_int), keeping the earliest timestamp for each pair
|
140 |
+
combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp'])
|
141 |
+
combined_df = combined_df.sort_values('timestamp')
|
142 |
+
|
143 |
+
# Group by provider and monthly_requests_int, keep first (earliest) occurrence
|
144 |
+
deduplicated_df = combined_df.groupby(['provider', 'monthly_requests_int']).first().reset_index()
|
145 |
+
|
146 |
+
# Convert timestamp back to string format
|
147 |
+
deduplicated_df['timestamp'] = deduplicated_df['timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S.%f%z')
|
148 |
+
|
149 |
+
logger.info(f"De-duplicated dataset: {len(combined_df)} -> {len(deduplicated_df)} records")
|
150 |
+
|
151 |
# Convert back to dataset and push
|
152 |
+
new_dataset = Dataset.from_pandas(deduplicated_df)
|
153 |
new_dataset.push_to_hub(DATASET_REPO_NAME, token=HF_TOKEN, private=False)
|
154 |
|
155 |
logger.info(f"Successfully stored data for {len(results)} providers")
|