Implement Kaggle API integration with dataset search, file listing, and download functionality in app.py
Browse files- .gitignore +2 -0
- app.py +55 -4
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
.mcp-venv/
|
app.py
CHANGED
@@ -1,7 +1,58 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
5 |
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
import os, json, pathlib
|
3 |
+
import pandas as pd
|
4 |
+
import tempfile, pathlib
|
5 |
+
from typing import List, Dict
|
6 |
+
from kaggle.api.kaggle_api_extended import KaggleApi
|
7 |
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
load_dotenv()
|
10 |
|
11 |
+
def search_datasets(query: str, max_results: int = 20) -> List[Dict]:
    """Search Kaggle datasets and summarise every hit as a plain dict.

    Args:
        query: Search text, forwarded to the Kaggle client as ``search``.
        max_results: Forwarded to the Kaggle client as ``page_size``.

    Returns:
        One dict per dataset returned by the client, with the keys
        ``title``, ``slug``, ``size_mb`` (``size`` divided by 1e6 and rounded
        to two decimals), ``downloads`` and ``votes``.
    """
    datasets = api.dataset_list(search=query, max_size=None, page_size=max_results)
    return [
        {
            "title": dataset.title,
            "slug": dataset.ref,
            "size_mb": round(dataset.size/1e6, 2),
            "downloads": dataset.downloadCount,
            "votes": dataset.totalVotes,
        }
        for dataset in datasets
    ]
|
27 |
+
|
28 |
+
def list_files(dataset_slug: str) -> List[Dict]:
    """List the files of a dataset together with their sizes.

    Args:
        dataset_slug: Dataset reference passed to the Kaggle client's
            ``dataset_list_files`` call.

    Returns:
        One dict per file with the keys ``name`` and ``size_mb``
        (``totalBytes`` divided by 1e6 and rounded to two decimals).
    """
    listing = api.dataset_list_files(dataset_slug)
    summaries = []
    for entry in listing.files:
        megabytes = round(entry.totalBytes / 1e6, 2)
        summaries.append({"name": entry.name, "size_mb": megabytes})
    return summaries
|
31 |
+
|
32 |
+
def downlaod_file(dataset_slug: str, file_name: str) -> str:
    """Download one file of a Kaggle dataset into a fresh temporary directory.

    The misspelled function name is kept on purpose so existing callers (and
    anything that exposes the function by its ``__name__``) keep working; a
    correctly spelled ``download_file`` alias is defined right after it.

    Args:
        dataset_slug: Dataset reference passed to the Kaggle client.
        file_name: Name of the file inside the dataset to download.

    Returns:
        Path (as a string) of the downloaded file inside the temporary
        directory. The directory is not removed by this function.

    Raises:
        FileNotFoundError: If the download did not leave an identifiable
            file in the temporary directory.
    """
    tmp_dir = pathlib.Path(tempfile.mkdtemp())
    api.dataset_download_file(dataset_slug, file_name, path=str(tmp_dir), quiet=False)

    # Expected locations, in the same order the original code tried them:
    # the plain file name first, then the same name with a ".zip" suffix.
    for candidate in (tmp_dir / f"{file_name}", tmp_dir / f"{file_name}.zip"):
        if candidate.exists():
            return str(candidate)

    # The directory was created empty just above, so a single file in it can
    # only be the download, stored under a name we did not anticipate.
    downloaded = [entry for entry in tmp_dir.iterdir() if entry.is_file()]
    if len(downloaded) == 1:
        return str(downloaded[0])

    # Fail loudly instead of handing the caller a path that does not exist.
    raise FileNotFoundError(
        f"Download of {file_name!r} from {dataset_slug!r} produced no file in {tmp_dir}"
    )


# Correctly spelled, backward-compatible alias for the function above.
download_file = downlaod_file
|
40 |
+
|
41 |
+
def _bootstrap_kaggle_credentials():
    """Make sure ``~/.kaggle/kaggle.json`` exists, creating it from env vars.

    Reads ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` from the process environment.
    An existing credentials file is left untouched; otherwise the file is
    written as JSON with the keys ``username`` and ``key`` and restricted to
    owner read/write.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
    """
    user = os.getenv("KAGGLE_USERNAME")
    key = os.getenv("KAGGLE_KEY")
    if not (user and key):
        # The trailing space matters: adjacent literals are concatenated
        # verbatim, and without it the two sentences run together.
        raise RuntimeError(
            "Kaggle credentials not found. "
            "Set KAGGLE_USERNAME and KAGGLE_KEY as env vars or in .env"
        )

    cred_path = pathlib.Path.home() / ".kaggle" / "kaggle.json"
    if cred_path.exists():
        return

    cred_path.parent.mkdir(parents=True, exist_ok=True)

    # Create the file with restrictive permissions from the start so the key
    # is never readable by other users, not even briefly before the chmod.
    fd = os.open(cred_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as handle:
        json.dump({"username": user, "key": key}, handle)

    # Enforce the exact mode even if the file appeared between the existence
    # check and the open (the creation mode only applies to new files).
    cred_path.chmod(0o600)
|
54 |
+
|
55 |
+
# Runs at import time, in this order: first make sure the Kaggle credentials
# file is in place, then build and authenticate the shared client that the
# dataset helpers in this module call through the global name `api`.
# NOTE(review): presumably `authenticate()` reads the credentials prepared by
# the bootstrap step - confirm against the Kaggle client documentation.
_bootstrap_kaggle_credentials()

api = KaggleApi()
api.authenticate()
|