pagezyhf HF Staff committed on
Commit
a834908
·
1 Parent(s): be7d5ac
Files changed (3) hide show
  1. README.md +40 -0
  2. app.py +214 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -12,3 +12,43 @@ short_description: Meta analysis about trends on trending repos
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+
16
+ # SQL Dataset Visualizer on Hugging Face Spaces
17
+
18
+ This application runs a SQL (DuckDB) retention query over the `reach-vb/trending-repos` Hugging Face dataset and visualizes the day-over-day model retention rate using Plotly.
19
+
20
+ ## Setup
21
+ 1. Install dependencies:
22
+ ```bash
23
+ pip install -r requirements.txt
24
+ ```
25
+
26
+ 2. Set up authentication:
27
+ - Get your Hugging Face token from https://huggingface.co/settings/tokens
28
+ - For local development, set the environment variable:
29
+ ```bash
30
+ export HF_TOKEN=your_token_here
31
+ ```
32
+ - For Hugging Face Spaces, add the token in the Space settings:
33
+ - Go to your Space settings
34
+ - Add a new secret with key `HF_TOKEN` and your token as the value
35
+
36
+ ## Run Locally
37
+ ```bash
38
+ python app.py
39
+ ```
40
+
41
+ ## Deploy to Hugging Face Spaces
42
+ 1. Push these files to a new Gradio Space on Hugging Face
43
+ 2. Add your HF_TOKEN in the Space settings
44
+ 3. The Space will automatically deploy with the token securely stored
45
+
46
+ ## Usage
47
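+ 1. Enter a start date and an end date (YYYY-MM-DD) in the two text boxes
48
+ 2. The retention results for that date range will be visualized as a bar chart
49
+ 3. If there's an error in your input or in the query, the error message will be displayed as text in place of the chart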
50
+
51
+ ## Security Note
52
+ - Never commit your HF_TOKEN to version control
53
+ - Always use environment variables or Space secrets for authentication
54
+ - The token is used to access private datasets and authenticate API requests
app.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import pandas as pd
3
+ import duckdb
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns # Import Seaborn
6
+ import plotly.express as px # Added for Plotly
7
+ import plotly.graph_objects as go # Added for Plotly error figure
8
+ import gradio as gr
9
+ import os
10
+ from huggingface_hub import login
11
+ from datetime import datetime, timedelta
12
+ import sys # Added for error logging
13
+
14
# --- Authentication ---------------------------------------------------------
# The Hugging Face token is read from the environment only; startup aborts
# with a ValueError when it is missing or empty.
HF_TOKEN = os.environ.get('HF_TOKEN')
if not HF_TOKEN:
    raise ValueError("Please set the HF_TOKEN environment variable")

login(token=HF_TOKEN)

# --- Plot styling -----------------------------------------------------------
# Seaborn theme and context are applied once, globally.
sns.set_theme(style="whitegrid")
sns.set_context("notebook")

# --- Data loading -----------------------------------------------------------
# The dataset is loaded a single time at startup, converted to pandas, and
# registered with DuckDB under the table name 'models' so SQL can use
# `FROM models`. Any failure is reported and then re-raised.
try:
    dataset = load_dataset("reach-vb/trending-repos", split="models")
    df = dataset.to_pandas()
    duckdb.register('models', df)
except Exception as load_error:
    print(f"Error loading dataset: {load_error}")
    raise
36
+
37
def get_retention_data(start_date: str, end_date: str) -> pd.DataFrame:
    """Compute day-over-day retention of models between two dates.

    For every collection day in the inclusive range, the query counts the
    distinct models present that day and how many of them were also present
    on the previous calendar day. It reads from the DuckDB table ``models``.

    Args:
        start_date: First day to include, as a ``YYYY-MM-DD`` string.
        end_date: Last day to include (inclusive), as a ``YYYY-MM-DD`` string.

    Returns:
        A DataFrame ordered by day with the columns ``collection_day``,
        ``total_models_today``, ``carried_over_models`` and
        ``percent_retained`` (a percentage rounded to 2 decimals).
        On any failure the exception is logged to stderr and, instead of
        raising, a one-row DataFrame with a single ``"Error"`` column holding
        the message is returned.
    """
    try:
        # model_presence is de-duplicated to one row per (model, day).
        # Without DISTINCT, a model collected several times on the same day
        # would inflate total_models_today and multiply rows in the self-join
        # below, letting percent_retained exceed 100.
        # NOTE(review): assumes several snapshots per day are possible;
        # with exactly one snapshot per day the result is unchanged.
        query = """
            WITH model_presence AS (
                SELECT DISTINCT
                    id AS model_id,
                    collected_at::DATE AS collection_day
                FROM models
            ),
            daily_model_counts AS (
                SELECT
                    collection_day,
                    COUNT(*) AS total_models_today
                FROM model_presence
                GROUP BY collection_day
            ),
            retained_models AS (
                SELECT
                    a.collection_day,
                    COUNT(*) AS previously_existed_count
                FROM model_presence a
                JOIN model_presence b
                    ON a.model_id = b.model_id
                    AND a.collection_day = b.collection_day + INTERVAL '1 day'
                GROUP BY a.collection_day
            )
            SELECT
                d.collection_day,
                d.total_models_today,
                COALESCE(r.previously_existed_count, 0) AS carried_over_models,
                CASE
                    WHEN d.total_models_today = 0 THEN NULL
                    ELSE ROUND(COALESCE(r.previously_existed_count, 0) * 100.0 / d.total_models_today, 2)
                END AS percent_retained
            FROM daily_model_counts d
            LEFT JOIN retained_models r ON d.collection_day = r.collection_day
            WHERE d.collection_day BETWEEN ? AND ?
            ORDER BY d.collection_day
        """
        # The date strings are bound to the two `?` placeholders rather than
        # interpolated into the SQL text.
        result = duckdb.query(query, params=[start_date, end_date]).to_df()
        print("SQL Query Result:")
        print(result)
        return result
    except Exception as e:
        # Best-effort boundary for the UI: log, then signal the failure
        # through an "Error" frame.
        print(f"Error in get_retention_data: {e}", file=sys.stderr)
        return pd.DataFrame({"Error": [str(e)]})
89
+
90
def _message_figure(message: str) -> "go.Figure":
    """Return an otherwise empty Plotly figure with *message* centred on it."""
    fig = go.Figure()
    fig.add_annotation(
        text=message,
        xref="paper", yref="paper",
        x=0.5, y=0.5, showarrow=False,
        font=dict(size=16),
    )
    return fig


def plot_retention_data(dataframe: pd.DataFrame) -> "go.Figure":
    """Build a Plotly bar chart of the retention percentage per collection day.

    Args:
        dataframe: Either a frame with ``collection_day`` and
            ``percent_retained`` columns, or a frame with an ``"Error"``
            column whose first value is an error message.
            The input frame is never modified.

    Returns:
        A Plotly figure: the bar chart with a dashed line at the average
        retention, or a figure containing only a text annotation when the
        input is an error frame, when no rows remain after cleaning, or when
        plotting fails. Plotting failures are logged to stderr, not raised.
    """
    print("DataFrame received by plot_retention_data (first 5 rows):")
    print(dataframe.head())
    print("\nData types in plot_retention_data before any conversion:")
    print(dataframe.dtypes)

    # A non-empty frame with an "Error" column signals an upstream failure.
    if "Error" in dataframe.columns and not dataframe.empty:
        error_message = dataframe['Error'].iloc[0]
        print(f"Error DataFrame received: {error_message}", file=sys.stderr)
        return _message_figure(f"Error from data generation: {error_message}")

    try:
        for required in ('percent_retained', 'collection_day'):
            if required not in dataframe.columns:
                raise ValueError(f"'{required}' column is missing from the DataFrame.")

        # Work on a copy so the caller's frame is left untouched.
        frame = dataframe.copy()
        frame['percent_retained'] = pd.to_numeric(frame['percent_retained'], errors='coerce')
        frame['collection_day'] = pd.to_datetime(frame['collection_day'])

        # Drop rows whose percentage failed numeric conversion (NaN) or
        # whose day is missing (NaT).
        frame = frame.dropna(subset=['percent_retained', 'collection_day'])

        print("\n'percent_retained' column after pd.to_numeric (first 5 values):")
        print(frame['percent_retained'].head())
        print("'percent_retained' dtype after pd.to_numeric:", frame['percent_retained'].dtype)
        print("\n'collection_day' column after pd.to_datetime (first 5 values):")
        print(frame['collection_day'].head())
        print("'collection_day' dtype after pd.to_datetime:", frame['collection_day'].dtype)

        if frame.empty:
            return _message_figure("No data available to plot after processing.")

        fig = px.bar(
            frame,
            x='collection_day',
            y='percent_retained',
            title='Previous Day Top 200 Trending Model Retention %',
            labels={'collection_day': 'Date', 'percent_retained': 'Retention Rate (%)'},
            text='percent_retained',
        )

        # Value labels inside the bars, plus a custom hover box.
        fig.update_traces(
            texttemplate='%{text:.2f}%',
            textposition='inside',
            insidetextanchor='middle',
            textfont_color='white',
            textfont_size=10,
            hovertemplate='<b>Date</b>: %{x|%Y-%m-%d}<br>' +
                          '<b>Retention</b>: %{y:.2f}%<extra></extra>',
        )

        # The frame is known to be non-empty here, so the mean is defined.
        average_retention = frame['percent_retained'].mean()
        fig.add_hline(
            y=average_retention,
            line_dash="dash",
            line_color="red",
            annotation_text=f"Average: {average_retention:.2f}%",
            annotation_position="bottom right",
        )

        fig.update_xaxes(tickangle=45)
        fig.update_layout(
            title_x=0.5,
            xaxis_title="Date",
            yaxis_title="Retention Rate (%)",
            plot_bgcolor='white',
            bargap=0.2,
        )
        return fig
    except Exception as e:
        # UI boundary: show the failure in the chart area instead of crashing.
        print(f"Error during plot_retention_data: {e}", file=sys.stderr)
        return _message_figure(f"Plotting Error: {str(e)}")
194
+
195
def interface_fn(start_date, end_date):
    """Gradio callback: query retention for the date range and return its figure."""
    return plot_retention_data(get_retention_data(start_date, end_date))
198
+
199
# Earliest and latest collection dates in the dataset; they pre-fill the two
# date inputs below.
# NOTE(review): datetime.fromisoformat() only accepts str, so this assumes
# 'collected_at' holds ISO-8601 strings — confirm against the dataset schema.
min_date, max_date = (
    datetime.fromisoformat(bound).date()
    for bound in (df['collected_at'].min(), df['collected_at'].max())
)

# Two free-text date inputs feed interface_fn; its Plotly figure is the output.
iface = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.Textbox(value=min_date.strftime("%Y-%m-%d"), label="Start Date (YYYY-MM-DD)"),
        gr.Textbox(value=max_date.strftime("%Y-%m-%d"), label="End Date (YYYY-MM-DD)"),
    ],
    outputs=gr.Plotly(label="Model Retention Visualization"),
    title="Model Retention Analysis",
    description="Visualize model retention rates over time. Enter dates in YYYY-MM-DD format.",
)

iface.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ datasets
2
+ duckdb
3
+ pandas
4
+ seaborn
5
+ matplotlib
6
+ gradio