# HF_RepoSense / hf_utils.py
# naman1102's picture
# deployment
# 083f41a
from huggingface_hub import snapshot_download
import os
import shutil
def download_filtered_space_files(space_id: str, local_dir: str = "repo_files", file_extensions: list = None):
    """
    Downloads only files with specified extensions from a Hugging Face Space repository.

    Any existing ``local_dir`` is deleted before the download starts, so after
    the call it contains only what this call fetched.

    Args:
        space_id (str): The ID of the Hugging Face Space (e.g., "naman1102/Final_Assignment_Template").
        local_dir (str): Local directory to store the downloaded files.
            Removed first if it already exists.
        file_extensions (list): List of file extensions to include (e.g., ['.py', '.md']).
            Required. A single extension given as a plain string (e.g. '.py')
            is treated as a one-element list.

    Raises:
        ValueError: If ``file_extensions`` is None or empty.
    """
    if not file_extensions:
        raise ValueError("You must specify a list of file extensions to filter by.")

    # A bare string would otherwise be iterated character by character and
    # produce nonsense patterns such as '*.', '*p', '*y'.
    if isinstance(file_extensions, str):
        extensions = [file_extensions]
    else:
        extensions = list(file_extensions)

    print(f"Downloading Space '{space_id}' and filtering for: {', '.join(extensions)}")

    # Clear out local_dir if it already exists
    if os.path.exists(local_dir):
        shutil.rmtree(local_dir)

    # Convert file extensions to allow_patterns format (e.g., ['.py', '.md'] -> ['*.py', '*.md'])
    allow_patterns = [f"*{ext}" for ext in extensions]

    # Download directly to local_dir with filtering during download
    snapshot_download(
        repo_id=space_id,
        repo_type="space",
        local_dir=local_dir,
        allow_patterns=allow_patterns
    )

    # Count downloaded files for feedback; str.endswith accepts a tuple of suffixes.
    suffixes = tuple(extensions)
    copied_files = sum(
        1
        for _, _, filenames in os.walk(local_dir)
        for filename in filenames
        if filename.endswith(suffixes)
    )
    print(f"Downloaded {copied_files} filtered file(s) to: {local_dir}")
# Example usage
# download_filtered_space_files("finegrain/finegrain-image-enhancer", file_extensions=['.py', '.md', '.txt']) # Downloads only .py, .md, and .txt files
from huggingface_hub import list_spaces
def search_top_spaces(query: str, limit: int = 5):
    """
    Search and return top Hugging Face Space repo IDs based on a keyword.

    Results are requested sorted by likes, in descending order.

    Args:
        query (str): The keyword to search for (e.g., "image", "chatbot").
        limit (int): Maximum number of results to return. A value of zero or
            less yields an empty list without querying the Hub.

    Returns:
        List of repo IDs.
    """
    if limit is not None and limit <= 0:
        return []

    # Pass the limit through so the Hub client stops after `limit` items
    # instead of fetching every matching Space before slicing.
    spaces = list_spaces(search=query, sort="likes", direction=-1, limit=limit)
    return [space.id for space in spaces]
# Example usage
# top_image_spaces = search_top_spaces("tic tac toe", limit=10)
# print("Top games-related Spaces:")
# for space_id in top_image_spaces:
# print("-", space_id)