|
import json
import os
import pathlib
import tempfile
from typing import Dict, List

import gradio as gr
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment
# before anything below reads KAGGLE_USERNAME / KAGGLE_KEY.
load_dotenv()
|
|
|
def search_datasets(query: str, max_results: int = 20) -> List[Dict]:
    """Search public Kaggle datasets and return brief metadata for the hits.

    Args:
        query: Search term forwarded to the Kaggle dataset search.
        max_results: Maximum number of datasets to return. Non-integer
            values (e.g. a float coming from a slider or an MCP client) are
            truncated to an int; negative values are treated as 0.

    Returns:
        A list of dicts, one per dataset, with the keys ``title``, ``slug``,
        ``size_mb``, ``downloads`` and ``votes``. ``size_mb`` is ``None``
        when the API does not report a size for the dataset.
    """
    # Slicing needs a non-negative int: a float would raise TypeError and a
    # negative number would silently drop items from the end instead.
    limit = max(int(max_results), 0)

    results = api.dataset_list(search=query, max_size=None) or []

    return [
        {
            "title": ds.title,
            "slug": ds.ref,
            # Size in megabytes (bytes / 1e6); guard against a missing size.
            "size_mb": (
                round(ds.total_bytes / 1e6, 2)
                if ds.total_bytes is not None
                else None
            ),
            "downloads": ds.download_count,
            "votes": ds.vote_count,
        }
        for ds in results[:limit]
    ]
|
|
|
def list_files(dataset_slug: str) -> List[Dict]:
    """List the files of a Kaggle dataset together with their sizes.

    Args:
        dataset_slug: Dataset reference, e.g. "zynicide/wine-reviews".

    Returns:
        One dict per file holding its ``name`` and its ``size_mb``
        (bytes / 1e6, rounded to two decimals).
    """
    listing = api.dataset_list_files(dataset_slug)

    summary = []
    for entry in listing.files:
        summary.append(
            {"name": entry.name, "size_mb": round(entry.total_bytes / 1e6, 2)}
        )
    return summary
|
|
|
def download_file(dataset_slug: str, file_name: str):
    """Download one file of a Kaggle dataset into a fresh temporary directory.

    Args:
        dataset_slug: Dataset reference, e.g. "zynicide/wine-reviews".
        file_name: Name of the file inside the dataset.

    Returns:
        The path (as a string) of the downloaded file. The ``.zip`` variant
        is returned when the plain file name is not present after download.

    Raises:
        FileNotFoundError: If no matching file is found inside the download
            directory once the download call has returned.
    """
    tmp_dir = pathlib.Path(tempfile.mkdtemp())
    api.dataset_download_file(
        dataset_slug, file_name, path=str(tmp_dir), quiet=False
    )

    # Resolved root used to make sure a candidate never points outside the
    # download directory (absolute names or ".." segments in `file_name`).
    root = tmp_dir.resolve()

    # Also try the bare file name in case `file_name` carries a directory
    # component that is not reproduced in the download directory.
    base_name = pathlib.PurePosixPath(file_name).name

    for name in (file_name, f"{file_name}.zip", base_name, f"{base_name}.zip"):
        candidate = tmp_dir / name
        if root in candidate.resolve().parents and candidate.is_file():
            return str(candidate)

    # Fail loudly instead of handing back a path that does not exist.
    raise FileNotFoundError(
        f"Download of {file_name!r} from {dataset_slug!r} produced no "
        f"matching file in {tmp_dir}"
    )
|
|
|
# "Search" tab: free-text dataset search with a cap on the number of hits.
search_iface = gr.Interface(
    fn=search_datasets,
    inputs=[
        gr.Textbox(label="Search term", placeholder="e.g. titanic"),
        gr.Slider(1, 50, step=1, value=20, label="Max results"),
    ],
    outputs=gr.JSON(label="Datasets"),
    # User-facing text: fixed the "Resturns" typo and capitalised "Kaggle".
    title="Search Kaggle Datasets",
    description="Returns a JSON array of dataset metadata.",
)
|
|
|
# "Files" tab: takes a dataset slug and shows that dataset's file listing.
list_files_iface = gr.Interface(
    title="List Dataset Files",
    description="Given a dataset slug, returns its file list.",
    fn=list_files,
    inputs=gr.Textbox(
        placeholder="zynicide/wine-reviews",
        label="Dataset slug",
    ),
    outputs=gr.JSON(label="Files"),
)
|
|
|
# "Download" tab: takes a dataset slug plus a file name and offers the file.
download_file_iface = gr.Interface(
    title="Download a File",
    description="Downloads one file from the dataset and returns it.",
    fn=download_file,
    inputs=[
        gr.Textbox(
            placeholder="zynicide/wine-reviews",
            label="Dataset slug",
        ),
        gr.Textbox(
            placeholder="winemag-data_first150k.csv",
            label="File name",
        ),
    ],
    outputs=gr.File(label="Download file"),
)
|
|
|
# Combine the three single-purpose interfaces into one tabbed application;
# tab names are matched to the interfaces by position.
demo = gr.TabbedInterface(
    interface_list=[search_iface, list_files_iface, download_file_iface],
    tab_names=["Search", "Files", "Download"],
)
|
|
|
def _bootstrap_kaggle_credentials():
    """Make sure ``~/.kaggle/kaggle.json`` exists, creating it from env vars.

    Reads ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` from the environment. When
    the credentials file is missing it is written with owner-only
    permissions; an existing file is left untouched.

    Raises:
        RuntimeError: If either environment variable is unset or empty.
    """
    user = os.getenv("KAGGLE_USERNAME")
    key = os.getenv("KAGGLE_KEY")
    if not (user and key):
        # The trailing space keeps the two sentences apart after the
        # implicit string concatenation.
        raise RuntimeError(
            "Kaggle credentials not found. "
            "Set KAGGLE_USERNAME and KAGGLE_KEY as env vars or in .env"
        )

    cred_path = pathlib.Path.home() / ".kaggle" / "kaggle.json"
    if cred_path.exists():
        return

    # parents=True: the home directory itself may not exist yet.
    cred_path.parent.mkdir(parents=True, exist_ok=True)

    # Create the file with mode 0o600 from the start so the key is never
    # readable by other users, not even between write and chmod.
    fd = os.open(cred_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as handle:
        handle.write(json.dumps({"username": user, "key": key}))

    # The mode passed to os.open is subject to the umask; enforce it.
    cred_path.chmod(0o600)
|
|
|
# Prepare the credentials file first. The Kaggle import is deliberately kept
# below this call. NOTE(review): presumably the kaggle package looks for
# credentials as soon as it is imported -- confirm before hoisting the import.
_bootstrap_kaggle_credentials()

from kaggle.api.kaggle_api_extended import KaggleApi  # noqa: E402

# Module-level authenticated client used by the handler functions above.
api = KaggleApi()
api.authenticate()
|
|
|
if __name__ == "__main__":
    # Bind to every network interface on port 7860 and launch with the
    # MCP server option enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "mcp_server": True,
    }
    demo.launch(**launch_options)