Spaces:

kolaslab
/

Huggingface-Contributors

Sleeping

App Files Files Community

Huggingface-Contributors / app.py

kolaslab

Update app.py

b3176eb verified 9 months ago

raw

history blame

25.3 kB

	import threading
	import time
	from collections import Counter
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from datetime import datetime
	from functools import lru_cache

	import matplotlib.pyplot as plt
	import pandas as pd
	import requests
	import seaborn as sns
	import streamlit as st
	from huggingface_hub import HfApi

	# Configure the page (title, wide layout, sidebar expanded) before any other
	# Streamlit call in this script.
	st.set_page_config(page_title="HF Contributions", layout="wide", initial_sidebar_state="expanded")

	# Force the sidebar to 40% of the viewport width: both min-width and
	# max-width are overridden on the element tagged data-testid="stSidebar".
	st.markdown("""
	<style>
	[data-testid="stSidebar"] {
	min-width: 40vw !important;
	max-width: 40vw !important;
	}
	</style>
	""", unsafe_allow_html=True)
	# Single Hub client shared by the cached lookup helpers defined in this file.
	api = HfApi()


	# Memoized wrappers around the Hub client
	@lru_cache(maxsize=1000)
	def cached_repo_info(repo_id, repo_type):
	    """Return the Hub metadata object for one repository.

	    The lookup goes through the shared ``api`` client and is memoized by
	    ``lru_cache`` (up to 1000 entries), so asking again for the same
	    repository does not issue another request.
	    """
	    info = api.repo_info(repo_id=repo_id, repo_type=repo_type)
	    return info


	@lru_cache(maxsize=1000)
	def cached_list_commits(repo_id, repo_type):
	    """Return the commits of one repository as a list.

	    The result of ``api.list_repo_commits`` is materialized into a list
	    before being memoized (up to 1000 entries), so the cached value can be
	    iterated any number of times.
	    """
	    history = api.list_repo_commits(repo_id=repo_id, repo_type=repo_type)
	    return [*history]


	@lru_cache(maxsize=100)
	def cached_list_items(username, kind):
	    """Return every model, dataset or space authored by ``username``.

	    ``kind`` selects the listing: "model", "dataset" or "space". Any other
	    value yields an empty list. Results are memoized (up to 100 entries).
	    """
	    # Name of the client method that lists each supported kind.
	    method_by_kind = {
	        "model": "list_models",
	        "dataset": "list_datasets",
	        "space": "list_spaces",
	    }
	    method_name = method_by_kind.get(kind)
	    if method_name is None:
	        return []
	    lister = getattr(api, method_name)
	    return list(lister(author=username))


	# Helpers for get_trending_accounts: owner extraction and per-owner tallies
	def _listing_owner(entry):
	    """Return the owner name for one API listing entry, or '' if none is given.

	    The owner is the part of ``id`` before the first '/'. Entries whose id
	    has no '/' fall back to their ``owner`` field.
	    """
	    repo_id = entry.get('id') or ''
	    if '/' in repo_id:
	        return repo_id.split('/', 1)[0]
	    return entry.get('owner') or ''


	def _top_owners(entries, limit):
	    """Return up to ``limit`` ``(owner, count)`` pairs, most entries first.

	    Owners with the same count keep the order in which they were first
	    seen in ``entries``.
	    """
	    counts = Counter()
	    for entry in entries:
	        owner = _listing_owner(entry)
	        # Entries without a usable owner (missing, empty, or the literal
	        # string 'None') are not attributed to anyone.
	        if owner and owner != 'None':
	            counts[owner] += 1
	    return counts.most_common()[:limit]


	# Function to fetch trending accounts and create stats
	@lru_cache(maxsize=1)
	def get_trending_accounts(limit=100):
	    """Rank accounts by how many listed spaces and models they own.

	    Two listings are requested from the Hub API (spaces and models, each
	    with ``limit=10000`` and a 30 second timeout). Owners are counted per
	    listing, and the two top-``limit`` rankings are merged by giving each
	    owner ``limit - position`` points per ranking it appears in.

	    Args:
	        limit: Maximum number of owners kept in each ranking.

	    Returns:
	        A tuple ``(trending_authors, top_spaces, top_models)`` where
	        ``trending_authors`` is a list of owner names ordered by combined
	        score and the other two are lists of ``(owner, count)`` pairs.
	        A listing whose response status is not 200 contributes an empty
	        ranking. If anything raises, an error is shown in the app and a
	        fixed fallback list of accounts (all with count 0) is returned.

	    NOTE(review): because of ``lru_cache``, the fallback result is memoized
	    as well, so a failed fetch is not retried for the same arguments.
	    """
	    try:
	        # Get spaces for stats calculation
	        spaces_response = requests.get("https://huggingface.co/api/spaces",
	                                       params={"limit": 10000},
	                                       timeout=30)

	        # Get models for stats calculation
	        models_response = requests.get("https://huggingface.co/api/models",
	                                       params={"limit": 10000},
	                                       timeout=30)

	        top_owners_spaces = []
	        if spaces_response.status_code == 200:
	            top_owners_spaces = _top_owners(spaces_response.json(), limit)

	        top_owners_models = []
	        if models_response.status_code == 200:
	            top_owners_models = _top_owners(models_response.json(), limit)

	        # Combine both rankings: a higher position earns more points, and an
	        # owner present in both rankings accumulates both scores.
	        combined_score = Counter()
	        for ranking in (top_owners_spaces, top_owners_models):
	            for position, (owner, _) in enumerate(ranking):
	                combined_score[owner] += limit - position

	        trending_authors = [owner for owner, _ in combined_score.most_common()[:limit]]
	        return trending_authors, top_owners_spaces, top_owners_models
	    except Exception as e:
	        # Best effort: keep the app usable with a static list when the
	        # rankings cannot be computed.
	        st.error(f"Error fetching trending accounts: {str(e)}")
	        fallback_authors = ["ritvik77", "facebook", "google", "stabilityai", "Salesforce", "tiiuae", "bigscience"]
	        return fallback_authors, [(author, 0) for author in fallback_authors], [(author, 0) for author in fallback_authors]


	# Rate limiting
	class RateLimiter:
	    """Space out calls so that at most ``calls_per_second`` happen per second.

	    ``wait`` may be called from several threads at once (the commit fetchers
	    in this file run in a thread pool). A lock serializes the callers so
	    each one observes the previous caller's timestamp and the minimum
	    interval is actually enforced between consecutive calls.
	    """

	    def __init__(self, calls_per_second=10):
	        """Create a limiter.

	        Args:
	            calls_per_second: Maximum call rate; must be greater than zero.

	        Raises:
	            ValueError: If ``calls_per_second`` is zero or negative.
	        """
	        if calls_per_second <= 0:
	            raise ValueError("calls_per_second must be positive")
	        self.calls_per_second = calls_per_second
	        # Monotonic-clock timestamp of the most recent call (0 = never).
	        self.last_call = 0
	        self._lock = threading.Lock()

	    def wait(self):
	        """Block until at least ``1 / calls_per_second`` seconds have passed
	        since the previous call, then record this call."""
	        min_interval = 1.0 / self.calls_per_second
	        # The sleep happens while holding the lock on purpose: concurrent
	        # callers queue up and are released one interval apart.
	        with self._lock:
	            # The monotonic clock is immune to wall-clock adjustments.
	            elapsed = time.monotonic() - self.last_call
	            if elapsed < min_interval:
	                time.sleep(min_interval - elapsed)
	            self.last_call = time.monotonic()


	rate_limiter = RateLimiter()


	# Function to fetch commits for a repository (optimized)
	def fetch_commits_for_repo(repo_id, repo_type, username, selected_year):
	    """Collect the dates of one repository's activity in ``selected_year``.

	    Args:
	        repo_id: Identifier of the repository to inspect.
	        repo_type: Repository kind passed through to the cached lookups.
	        username: Not used by this function; kept so existing callers that
	            pass it keep working.
	        selected_year: Calendar year (int) to keep.

	    Returns:
	        A tuple ``(commit_dates, commit_count)``: a list of ``datetime.date``
	        objects and its length. Private or gated repositories, and any
	        failure while fetching or parsing, yield ``([], 0)``.

	    NOTE(review): the repository creation date is counted as an event in
	    addition to the listed commits. If the commit listing already contains
	    the initial commit, that day is counted twice — confirm this is wanted.
	    """
	    def _to_date(timestamp):
	        # Drop timezone information, then keep only the calendar date.
	        return pd.to_datetime(timestamp).tz_localize(None).date()

	    try:
	        rate_limiter.wait()
	        # Skip private/gated repos upfront
	        repo_info = cached_repo_info(repo_id, repo_type)
	        if repo_info.private or getattr(repo_info, 'gated', False):
	            # The second element is a count everywhere else, so return 0
	            # here too rather than an empty list.
	            return [], 0

	        # Creation date first, then every listed commit.
	        candidate_dates = [_to_date(repo_info.created_at)]
	        candidate_dates.extend(
	            _to_date(commit.created_at)
	            for commit in cached_list_commits(repo_id, repo_type)
	        )

	        commit_dates = [day for day in candidate_dates if day.year == selected_year]
	        return commit_dates, len(commit_dates)
	    except Exception:
	        # Deliberate best effort: one failing repository must not abort the
	        # whole scan, so it simply contributes nothing.
	        return [], 0


	# Function to get commit events for a user (optimized)
	def get_commit_events(username, kind=None, selected_year=None):
	    """Gather the distinct commit dates of a user's repositories.

	    Args:
	        username: Account whose repositories are scanned.
	        kind: "model", "dataset" or "space" to scan one kind only; a falsy
	            value scans all three.
	        selected_year: Year forwarded to ``fetch_commits_for_repo``.

	    Returns:
	        ``(df, items_with_type)`` where ``df`` has a single ``date`` column
	        with duplicate dates removed, and ``items_with_type`` is a list of
	        ``(item, kind)`` pairs for every repository that was listed.
	        A failure while processing one kind is reported with a Streamlit
	        warning and the remaining kinds are still processed.
	    """
	    requested_kinds = ["model", "dataset", "space"] if not kind else [kind]
	    collected_dates = []
	    typed_items = []
	    batch_size = 5  # repositories handled per thread-pool round

	    for k in requested_kinds:
	        try:
	            found = cached_list_items(username, k)
	            typed_items.extend((entry, k) for entry in found)
	            ids = [entry.id for entry in found]

	            # Work through the repositories one batch at a time; each batch
	            # gets its own short-lived pool.
	            offset = 0
	            while offset < len(ids):
	                batch = ids[offset:offset + batch_size]
	                with ThreadPoolExecutor(max_workers=min(5, len(batch))) as pool:
	                    pending = [
	                        pool.submit(fetch_commits_for_repo, rid, k, username, selected_year)
	                        for rid in batch
	                    ]
	                    for finished in as_completed(pending):
	                        dates, _ = finished.result()
	                        if dates:
	                            collected_dates.extend(dates)
	                offset += batch_size
	        except Exception as e:
	            st.warning(f"Error fetching {k}s for {username}: {str(e)}")

	    # One row per collected date, then collapse repeated dates.
	    df = pd.DataFrame(collected_dates, columns=["date"])
	    if df.empty:
	        return df, typed_items
	    return df.drop_duplicates(), typed_items


	# Calendar heatmap function (optimized)
	def make_calendar_heatmap(df, title, year):
	    """Draw a one-year contribution calendar in the Streamlit app.

	    Args:
	        df: DataFrame with a ``date`` column holding one row per event.
	            The frame is only read, never modified.
	        title: Text shown above the chart (also used, lower-cased, in the
	            "nothing found" message).
	        year: Calendar year to display.

	    Returns:
	        None. When ``df`` is empty an info message is shown instead of a
	        chart.
	    """
	    if df.empty:
	        st.info(f"No {title.lower()} found for {year}.")
	        return

	    from matplotlib.colors import ListedColormap, BoundaryNorm

	    start = pd.Timestamp(f"{year}-01-01")
	    end = pd.Timestamp(f"{year}-12-31")
	    all_days = pd.date_range(start=start, end=end)

	    # Count events per day on a converted copy so the caller's frame keeps
	    # its columns; days of the year without events get 0 and dates outside
	    # the year are dropped by the reindex.
	    events_per_day = pd.to_datetime(df["date"]).dt.normalize().value_counts()
	    daily_counts = events_per_day.reindex(all_days, fill_value=0)

	    # Shift by the weekday of Jan 1 so that every column is a real
	    # Monday-to-Sunday week; counting weeks from Jan 1 alone would mix days
	    # of two different calendar weeks in one column.
	    first_dow = start.dayofweek
	    grid = pd.DataFrame({
	        "dow": all_days.dayofweek.to_numpy(),
	        "week": (((all_days - start).days + first_dow) // 7).to_numpy(),
	        "count": daily_counts.to_numpy(),
	    })
	    pivot = grid.pivot(index="dow", columns="week", values="count")

	    # Cells before Jan 1 / after Dec 31 in the first and last week have no
	    # data; they are masked so they are not drawn as zero-activity days.
	    outside_year = pivot.isna()
	    pivot_int = pivot.fillna(0).astype(int)

	    # Month labels sit at the week column containing the 1st of each month.
	    month_starts = pd.date_range(start, end, freq="MS")
	    month_labels = month_starts.strftime("%b")
	    month_positions = ((month_starts - start).days + first_dow) // 7

	    # Discrete GitHub-style palette: 0, 1-2, 3-10, 11-30, 31+ events.
	    colors = ['#ebedf0', '#9be9a8', '#40c463', '#30a14e', '#216e39']
	    bounds = [0, 1, 3, 11, 31, float('inf')]
	    cmap = ListedColormap(colors)
	    norm = BoundaryNorm(bounds, cmap.N)

	    fig, ax = plt.subplots(figsize=(12, 1.2))
	    try:
	        sns.heatmap(pivot_int, mask=outside_year, ax=ax, cmap=cmap, norm=norm,
	                    linewidths=0.5, linecolor="white", square=True, cbar=False,
	                    yticklabels=["M", "T", "W", "T", "F", "S", "S"])

	        ax.set_title(f"{title}", fontsize=12, pad=10)
	        ax.set_xlabel("")
	        ax.set_ylabel("")
	        ax.set_xticks(list(month_positions))
	        ax.set_xticklabels(month_labels, fontsize=8)
	        ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=8)
	        st.pyplot(fig)
	    finally:
	        # Release the figure; otherwise every rerun leaves one more open
	        # figure registered with pyplot.
	        plt.close(fig)


	# Fetch trending accounts with a loading spinner (do this once at the beginning).
	# get_trending_accounts returns three values: the combined ranking of account
	# names, and the (owner, count) rankings for spaces and for models. The call is
	# memoized by the lru_cache decorator on that function.
	with st.spinner("Loading trending accounts..."):
	trending_accounts, top_owners_spaces, top_owners_models = get_trending_accounts(limit=100)

	# Sidebar: ranking tables, account selection, year selection and display options.
	# The values it produces for the main content are `username`, `selected_year`,
	# `show_models`, `show_datasets` and `show_spaces`.
	with st.sidebar:
	st.title("👤 Contributor")

	# Create tabs for Spaces and Models rankings - ONLY SHOWING FIRST TWO TABS
	tab1, tab2 = st.tabs([
	"Top 100 Overall Contributors",
	"Top 100 by Spaces & Models"
	])

	with tab1:
	# Show combined trending accounts list
	st.subheader("🔥 Top 100 Overall Contributors")

	# Display the top 100 accounts list
	st.markdown("### Combined Contributors Ranking")

	# Create a data frame for the table
	if trending_accounts:
	# Map each owner to its 1-based position in the Spaces and Models rankings
	spaces_rank = {owner: idx+1 for idx, (owner, _) in enumerate(top_owners_spaces)}
	models_rank = {owner: idx+1 for idx, (owner, _) in enumerate(top_owners_models)}

	# Create the overall ranking dataframe (`idx` is not used inside the loop)
	overall_data = []
	for idx, username in enumerate(trending_accounts[:100]):
	# Use strings for all rankings to avoid type conversion issues;
	# "-" marks an account that is absent from that ranking
	spaces_position = str(spaces_rank.get(username, "-"))
	models_position = str(models_rank.get(username, "-"))
	overall_data.append([username, spaces_position, models_position])

	ranking_data_overall = pd.DataFrame(
	overall_data,
	columns=["Contributor", "Spaces Rank", "Models Rank"]
	)
	ranking_data_overall.index = ranking_data_overall.index + 1 # Start index from 1 for ranking

	st.dataframe(
	ranking_data_overall,
	column_config={
	"Contributor": st.column_config.TextColumn("Contributor"),
	"Spaces Rank": st.column_config.TextColumn("Spaces Rank (top 100)"),
	"Models Rank": st.column_config.TextColumn("Models Rank (top 100)")
	},
	use_container_width=True,
	hide_index=False
	)

	with tab2:
	# Show trending accounts list by Spaces
	st.subheader("🚀 Top 100 by Spaces & Models")

	# Display the top 100 accounts list
	st.markdown("### Spaces Contributors Ranking")

	# Create a data frame for the table
	if top_owners_spaces:
	ranking_data_spaces = pd.DataFrame(top_owners_spaces[:100], columns=["Contributor", "Spaces Count"])
	ranking_data_spaces.index = ranking_data_spaces.index + 1 # Start index from 1 for ranking

	# NOTE(review): the column label says "top 500 spaces" while
	# get_trending_accounts requests the listing with limit=10000 - confirm
	# which number is right.
	st.dataframe(
	ranking_data_spaces,
	column_config={
	"Contributor": st.column_config.TextColumn("Contributor"),
	"Spaces Count": st.column_config.NumberColumn("Spaces Count (based on top 500 spaces)", format="%d")
	},
	use_container_width=True,
	hide_index=False
	)

	# Add stats expander with visualization
	with st.expander("View Top 30 Spaces Contributors Chart"):
	# Create a bar chart for top 30 contributors
	if top_owners_spaces:
	chart_data = pd.DataFrame(top_owners_spaces[:30], columns=["Owner", "Spaces Count"])

	fig, ax = plt.subplots(figsize=(10, 8))
	bars = ax.barh(chart_data["Owner"], chart_data["Spaces Count"])

	# Add color gradient to bars (viridis colormap sampled by bar position)
	for i, bar in enumerate(bars):
	bar.set_color(plt.cm.viridis(i/len(bars)))

	ax.set_title("Top 30 Contributors by Number of Spaces")
	ax.set_xlabel("Number of Spaces")
	plt.tight_layout()
	st.pyplot(fig)

	# Display the top 100 Models accounts list (ADDED SECTION)
	st.markdown("### Models Contributors Ranking")

	# Create a data frame for the Models table
	if top_owners_models:
	ranking_data_models = pd.DataFrame(top_owners_models[:100], columns=["Contributor", "Models Count"])
	ranking_data_models.index = ranking_data_models.index + 1 # Start index from 1 for ranking

	# NOTE(review): same "top 500" label versus limit=10000 mismatch as in
	# the Spaces table.
	st.dataframe(
	ranking_data_models,
	column_config={
	"Contributor": st.column_config.TextColumn("Contributor"),
	"Models Count": st.column_config.NumberColumn("Models Count (based on top 500 models)", format="%d")
	},
	use_container_width=True,
	hide_index=False
	)

	# Add stats expander with visualization for Models (ADDED SECTION)
	with st.expander("View Top 30 Models Contributors Chart"):
	# Create a bar chart for top 30 models contributors
	if top_owners_models:
	chart_data = pd.DataFrame(top_owners_models[:30], columns=["Owner", "Models Count"])

	fig, ax = plt.subplots(figsize=(10, 8))
	bars = ax.barh(chart_data["Owner"], chart_data["Models Count"])

	# Add color gradient to bars
	for i, bar in enumerate(bars):
	bar.set_color(plt.cm.plasma(i/len(bars))) # Using a different colormap for distinction

	ax.set_title("Top 30 Contributors by Number of Models")
	ax.set_xlabel("Number of Models")
	plt.tight_layout()
	st.pyplot(fig)

	# Display trending accounts selection dropdown
	st.subheader("Select Contributor")
	selected_trending = st.selectbox(
	"Select trending account",
	options=trending_accounts[:100], # Limit to top 100
	index=0 if trending_accounts else None,
	key="trending_selectbox"
	)

	# Custom account input option
	st.markdown("<div style='text-align: center; margin: 10px 0;'>OR</div>", unsafe_allow_html=True)
	custom = st.text_input("Enter username/org", label_visibility="collapsed")

	# Set username based on selection or custom input.
	# Precedence: non-blank free-text entry, then the dropdown selection,
	# then the hard-coded default.
	if custom.strip():
	username = custom.strip()
	elif selected_trending:
	username = selected_trending
	else:
	username = "facebook" # Default fallback

	# Year selection: current year first, counting down to 2018
	st.subheader("🗓️ Time Period")
	year_options = list(range(datetime.now().year, 2017, -1))
	selected_year = st.selectbox("Select Year", options=year_options)

	# Additional options for customization (each checkbox enables one repo kind)
	st.subheader("⚙️ Display Options")
	show_models = st.checkbox("Show Models", value=True)
	show_datasets = st.checkbox("Show Datasets", value=True)
	show_spaces = st.checkbox("Show Spaces", value=True)

	# Main Content: rankings for the selected account, commit collection per repo
	# kind, profile summary, yearly heatmaps and the follower chart.
	st.title("🤗 Hugging Face Contributions")
	if username:
	with st.spinner(f"Fetching commit data for {username}..."):
	# Display contributor rank if in the top 100 of the combined ranking
	if username in trending_accounts[:100]:
	rank = trending_accounts.index(username) + 1
	st.success(f"🏆 {username} is ranked #{rank} in the top trending contributors!")

	# Find user in spaces ranking (exact, case-sensitive match on the owner name)
	spaces_rank = None
	for i, (owner, count) in enumerate(top_owners_spaces):
	if owner == username:
	spaces_rank = i+1
	st.info(f"🚀 Spaces Ranking: #{spaces_rank} with {count} spaces")
	break

	# Find user in models ranking (exact, case-sensitive match on the owner name)
	models_rank = None
	for i, (owner, count) in enumerate(top_owners_models):
	if owner == username:
	models_rank = i+1
	st.info(f"🧠 Models Ranking: #{models_rank} with {count} models")
	break

	# Display combined ranking info
	combined_info = []
	if spaces_rank and spaces_rank <= 100:
	combined_info.append(f"Spaces: #{spaces_rank}")
	if models_rank and models_rank <= 100:
	combined_info.append(f"Models: #{models_rank}")

	if combined_info:
	st.success(f"Combined Rankings (Top 100): {', '.join(combined_info)}")

	# Per-kind results: list of commit dates and number of commits
	commits_by_type = {}
	commit_counts_by_type = {}

	# Determine which types to fetch based on checkboxes
	types_to_fetch = []
	if show_models:
	types_to_fetch.append("model")
	if show_datasets:
	types_to_fetch.append("dataset")
	if show_spaces:
	types_to_fetch.append("space")

	if not types_to_fetch:
	st.warning("Please select at least one content type to display (Models, Datasets, or Spaces)")
	st.stop()

	# Fetch commits for each selected type
	for kind in types_to_fetch:
	try:
	items = cached_list_items(username, kind)
	repo_ids = [item.id for item in items]

	st.info(f"Found {len(repo_ids)} {kind}s for {username}")

	# Process repos in chunks of five, each chunk in its own thread pool
	chunk_size = 5
	total_commits = 0
	all_commit_dates = []

	progress_bar = st.progress(0)
	for i in range(0, len(repo_ids), chunk_size):
	chunk = repo_ids[i:i + chunk_size]
	with ThreadPoolExecutor(max_workers=min(5, len(chunk))) as executor:
	future_to_repo = {
	executor.submit(fetch_commits_for_repo, repo_id, kind, username, selected_year): repo_id
	for repo_id in chunk
	}
	for future in as_completed(future_to_repo):
	repo_commits, repo_count = future.result()
	# Only repos that returned at least one date contribute to the totals
	if repo_commits:
	all_commit_dates.extend(repo_commits)
	total_commits += repo_count

	# Update progress (fraction of repos handled so far, capped at 1.0)
	progress = min(1.0, (i + len(chunk)) / max(1, len(repo_ids)))
	progress_bar.progress(progress)

	# Complete progress
	progress_bar.progress(1.0)

	commits_by_type[kind] = all_commit_dates
	commit_counts_by_type[kind] = total_commits

	except Exception as e:
	# A failing kind is reported and recorded as empty so later code can still run
	st.warning(f"Error fetching {kind}s for {username}: {str(e)}")
	commits_by_type[kind] = []
	commit_counts_by_type[kind] = 0

	# Calculate total commits across all types
	total_commits = sum(commit_counts_by_type.values())

	st.subheader(f"{username}'s Activity in {selected_year}")

	# Profile information
	profile_col1, profile_col2 = st.columns([1, 3])
	with profile_col1:
	# Try to get avatar
	# NOTE(review): bare `except` also catches KeyboardInterrupt/SystemExit;
	# consider `except Exception`. Whether st.image raises for a missing
	# remote image is not visible here - verify.
	try:
	avatar_url = f"https://huggingface.co/avatars/{username}"
	st.image(avatar_url, width=150)
	except:
	st.info("No profile image available")

	with profile_col2:
	st.metric("Total Commits", total_commits)

	# Show the spaces count if the account appears in the spaces ranking.
	# NOTE(review): this match is case-insensitive, while the ranking
	# lookups earlier in this section compare names case-sensitively.
	for owner, count in top_owners_spaces:
	if owner.lower() == username.lower():
	st.metric("Spaces Count", count)
	break

	st.markdown(f"[View Profile on Hugging Face](https://huggingface.co/{username})")

	# Create DataFrame for all commits
	all_commits = []
	for commits in commits_by_type.values():
	all_commits.extend(commits)
	all_df = pd.DataFrame(all_commits, columns=["date"])
	# NOTE(review): dropping duplicate dates leaves at most one row per day, so
	# the heatmap, which counts rows per date, can never show more than one
	# event on a day - confirm this is intended.
	if not all_df.empty:
	all_df = all_df.drop_duplicates() # Remove any duplicate dates

	make_calendar_heatmap(all_df, "All Commits", selected_year)

	# Add followers chart section
	st.subheader(f"👥 Follower Evolution for {username}")
	followers_container = st.container()
	with followers_container:
	# Create iframe to embed the external follower visualization.
	# NOTE(review): `username` may come from the free-text input in the sidebar
	# and is inserted into this HTML without escaping, then rendered with
	# unsafe_allow_html=True; it should be URL-quoted/escaped first.
	iframe_html = f"""
	<iframe
	src="/index.html?username={username}"
	width="100%"
	height="500px"
	style="border:none;box-shadow:0px 0px 10px rgba(0,0,0,0.1);border-radius:10px;"
	allowfullscreen>
	</iframe>
	"""
	st.markdown(iframe_html, unsafe_allow_html=True)
	st.caption("Follower evolution data from Hugging Face. The chart displays how followers have changed over time.")

	# Metrics and heatmaps for each selected type, one column per selected kind.
	# The `else st.columns(1)` branch looks unreachable because an empty
	# selection calls st.stop() earlier - presumably defensive; verify.
	cols = st.columns(len(types_to_fetch)) if types_to_fetch else st.columns(1)

	for i, (kind, emoji, label) in enumerate([
	("model", "🧠", "Models"),
	("dataset", "📦", "Datasets"),
	("space", "🚀", "Spaces")
	]):
	if kind in types_to_fetch:
	# Column position follows the order of the selected kinds, not `i`
	with cols[types_to_fetch.index(kind)]:
	try:
	total = len(cached_list_items(username, kind))
	commits = commits_by_type.get(kind, [])
	commit_count = commit_counts_by_type.get(kind, 0)
	df_kind = pd.DataFrame(commits, columns=["date"])
	if not df_kind.empty:
	df_kind = df_kind.drop_duplicates() # Remove any duplicate dates
	st.metric(f"{emoji} {label}", total)
	st.metric(f"Commits in {selected_year}", commit_count)
	make_calendar_heatmap(df_kind, f"{label} Commits", selected_year)
	except Exception as e:
	# Fall back to zeroed metrics and an empty heatmap for this kind
	st.warning(f"Error processing {label}: {str(e)}")
	st.metric(f"{emoji} {label}", 0)
	st.metric(f"Commits in {selected_year}", 0)
	make_calendar_heatmap(pd.DataFrame(), f"{label} Commits", selected_year)
	else:
	st.info("Please select an account from the sidebar to view contributions.")