Spaces:

pagezyhf
/

trending-repos-analysis

Running

App Files Files Community

pagezyhf HF Staff commited on May 21

Commit

a834908

1 Parent(s): be7d5ac

v0

Browse files

Files changed (3) hide show

README.md +40 -0
app.py +214 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -12,3 +12,43 @@ short_description: Meta analysis about trends on trending repos
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# SQL Dataset Visualizer on Hugging Face Spaces
+This application allows you to query Hugging Face datasets using SQL and visualize the results using Plotly.
+## Setup
+1. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+2. Set up authentication:
+   - Get your Hugging Face token from https://huggingface.co/settings/tokens
+   - For local development, set the environment variable:
+     ```bash
+     export HF_TOKEN=your_token_here
+     ```
+   - For Hugging Face Spaces, add the token in the Space settings:
+     - Go to your Space settings
+     - Add a new secret with key `HF_TOKEN` and your token as the value
+## Run Locally
+```bash
+python app.py
+```
+## Deploy to Hugging Face Spaces
+1. Push these files to a new Python Space on HF
+2. Add your HF_TOKEN in the Space settings
+3. The Space will automatically deploy with the token securely stored
+## Usage
+1. Enter your SQL query in the text box
+2. The results will be automatically visualized as a bar chart
+3. If there's an error in your query, it will be displayed as text
+## Security Note
+- Never commit your HF_TOKEN to version control
+- Always use environment variables or Space secrets for authentication
+- The token is used to access private datasets and authenticate API requests

app.py ADDED Viewed

	@@ -0,0 +1,214 @@

+from datasets import load_dataset
+import pandas as pd
+import duckdb
+import matplotlib.pyplot as plt
+import seaborn as sns # Import Seaborn
+import plotly.express as px # Added for Plotly
+import plotly.graph_objects as go # Added for Plotly error figure
+import gradio as gr
+import os
+from huggingface_hub import login
+from datetime import datetime, timedelta
+import sys # Added for error logging
+# Get token from environment variable
+HF_TOKEN = os.getenv('HF_TOKEN')
+if not HF_TOKEN:
+    raise ValueError("Please set the HF_TOKEN environment variable")
+# Login to Hugging Face
+login(token=HF_TOKEN)
+# Apply Seaborn theme and context globally
+sns.set_theme(style="whitegrid")
+sns.set_context("notebook")
+# Load dataset once at startup
+try:
+    dataset = load_dataset("reach-vb/trending-repos", split="models")
+    df = dataset.to_pandas()
+    # Register the pandas DataFrame as a DuckDB table named 'models'
+    # This allows the SQL query to use 'FROM models'
+    duckdb.register('models', df)
+except Exception as e:
+    print(f"Error loading dataset: {e}")
+    raise
+def get_retention_data(start_date: str, end_date: str) -> pd.DataFrame:
+    try:
+        # The input start_date and end_date are already strings in YYYY-MM-DD format.
+        # We can pass them directly to DuckDB if the SQL column is DATE.
+        query = """
+        WITH model_presence AS (
+          SELECT
+            id AS model_id,
+            collected_at::DATE AS collection_day
+          FROM models
+        ),
+        daily_model_counts AS (
+          SELECT
+            collection_day,
+            COUNT(*) AS total_models_today
+          FROM model_presence
+          GROUP BY collection_day
+        ),
+        retained_models AS (
+          SELECT
+            a.collection_day,
+            COUNT(*) AS previously_existed_count
+          FROM model_presence a
+          JOIN model_presence b
+            ON a.model_id = b.model_id
+            AND a.collection_day = b.collection_day + INTERVAL '1 day'
+          GROUP BY a.collection_day
+        )
+        SELECT
+          d.collection_day,
+          d.total_models_today,
+          COALESCE(r.previously_existed_count, 0) AS carried_over_models,
+          CASE
+            WHEN d.total_models_today = 0 THEN NULL
+            ELSE ROUND(COALESCE(r.previously_existed_count, 0) * 100.0 / d.total_models_today, 2)
+          END AS percent_retained
+        FROM daily_model_counts d
+        LEFT JOIN retained_models r ON d.collection_day = r.collection_day
+        WHERE d.collection_day BETWEEN ? AND ?
+        ORDER BY d.collection_day
+        """
+        # Pass the string dates directly to the query, using the 'params' keyword argument.
+        result = duckdb.query(query, params=[start_date, end_date]).to_df()
+        print("SQL Query Result:") # Log the result
+        print(result) # Log the result
+        return result
+    except Exception as e:
+        # Log the error to standard error
+        print(f"Error in get_retention_data: {e}", file=sys.stderr)
+        # Return empty DataFrame with error message
+        return pd.DataFrame({"Error": [str(e)]})
+def plot_retention_data(dataframe: pd.DataFrame):
+    print("DataFrame received by plot_retention_data (first 5 rows):")
+    print(dataframe.head())
+    print("\nData types in plot_retention_data before any conversion:")
+    print(dataframe.dtypes)
+    # Check if the DataFrame itself is an error signal from the previous function
+    if "Error" in dataframe.columns and not dataframe.empty:
+        error_message = dataframe['Error'].iloc[0]
+        print(f"Error DataFrame received: {error_message}", file=sys.stderr)
+        fig = go.Figure()
+        fig.add_annotation(
+            text=f"Error from data generation: {error_message}",
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
+            font=dict(size=16)
+        )
+        return fig
+    try:
+        # Ensure 'percent_retained' column exists
+        if 'percent_retained' not in dataframe.columns:
+            raise ValueError("'percent_retained' column is missing from the DataFrame.")
+        if 'collection_day' not in dataframe.columns:
+            raise ValueError("'collection_day' column is missing from the DataFrame.")
+        # Explicitly convert 'percent_retained' to numeric.
+        # Ensure 'percent_retained' is numeric and 'collection_day' is datetime for Plotly
+        dataframe['percent_retained'] = pd.to_numeric(dataframe['percent_retained'], errors='coerce')
+        dataframe['collection_day'] = pd.to_datetime(dataframe['collection_day'])
+        # Drop rows where 'percent_retained' could not be converted (became NaT)
+        dataframe.dropna(subset=['percent_retained', 'collection_day'], inplace=True)
+        print("\n'percent_retained' column after pd.to_numeric (first 5 values):")
+        print(dataframe['percent_retained'].head())
+        print("'percent_retained' dtype after pd.to_numeric:", dataframe['percent_retained'].dtype)
+        print("\n'collection_day' column after pd.to_datetime (first 5 values):")
+        print(dataframe['collection_day'].head())
+        print("'collection_day' dtype after pd.to_datetime:", dataframe['collection_day'].dtype)
+        if dataframe.empty:
+            fig = go.Figure()
+            fig.add_annotation(
+                text="No data available to plot after processing.",
+                xref="paper", yref="paper",
+                x=0.5, y=0.5, showarrow=False,
+                font=dict(size=16)
+            )
+            return fig
+        # Create Plotly bar chart
+        fig = px.bar(
+            dataframe,
+            x='collection_day',
+            y='percent_retained',
+            title='Previous Day Top 200 Trending Model Retention %',
+            labels={'collection_day': 'Date', 'percent_retained': 'Retention Rate (%)'},
+            text='percent_retained'  # Use the column directly for hover/text
+        )
+        # Format the text on bars
+        fig.update_traces(
+            texttemplate='%{text:.2f}%',
+            textposition='inside',
+            insidetextanchor='middle', # Anchor text to the middle of the bar
+            textfont_color='white',
+            textfont_size=10, # Adjusted size for better fit
+            hovertemplate='<b>Date</b>: %{x|%Y-%m-%d}<br>' +
+                          '<b>Retention</b>: %{y:.2f}%<extra></extra>' # Custom hover
+        )
+        # Calculate and plot the average retention line
+        if not dataframe['percent_retained'].empty:
+            average_retention = dataframe['percent_retained'].mean()
+            fig.add_hline(
+                y=average_retention,
+                line_dash="dash",
+                line_color="red",
+                annotation_text=f"Average: {average_retention:.2f}%",
+                annotation_position="bottom right"
+            )
+        fig.update_xaxes(tickangle=45)
+        fig.update_layout(
+            title_x=0.5, # Center title
+            xaxis_title="Date",
+            yaxis_title="Retention Rate (%)",
+            plot_bgcolor='white', # Set plot background to white like seaborn whitegrid
+            bargap=0.2 # Gap between bars of different categories
+        )
+        return fig
+    except Exception as e:
+        print(f"Error during plot_retention_data: {e}", file=sys.stderr)
+        fig = go.Figure()
+        fig.add_annotation(
+            text=f"Plotting Error: {str(e)}",
+            xref="paper", yref="paper",
+            x=0.5, y=0.5, showarrow=False,
+            font=dict(size=16)
+        )
+        return fig
+def interface_fn(start_date, end_date):
+    result = get_retention_data(start_date, end_date)
+    return plot_retention_data(result)
+# Get min and max dates from the dataset
+min_date = datetime.fromisoformat(df['collected_at'].min()).date()
+max_date = datetime.fromisoformat(df['collected_at'].max()).date()
+iface = gr.Interface(
+    fn=interface_fn,
+    inputs=[
+        gr.Textbox(label="Start Date (YYYY-MM-DD)", value=min_date.strftime("%Y-%m-%d")),
+        gr.Textbox(label="End Date (YYYY-MM-DD)", value=max_date.strftime("%Y-%m-%d"))
+    ],
+    outputs=gr.Plotly(label="Model Retention Visualization"),
+    title="Model Retention Analysis",
+    description="Visualize model retention rates over time. Enter dates in YYYY-MM-DD format."
+)
+iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+datasets
+duckdb
+pandas
+seaborn
+matplotlib
+gradio