Spaces:

Asura05
/

Meta-Kaggle-Dataset-Navigator

Sleeping

App Files Files Community

n0v33n committed on Jun 24

Commit

5c05919

1 Parent(s): 74058a7

Initial Space Setup

Browse files

Files changed (2) hide show

app.py +116 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import kagglehub
+import os
+import polars as pl
+import gradio as gr
+import google.generativeai as genai
+from kaggle_secrets import UserSecretsClient
+# Download Meta-Kaggle dataset in parquet format
+PARQUET_PATH = kagglehub.dataset_download("bwandowando/meta-kaggle-ported-to-parquet-format")
+print("✅ Downloaded Meta-Kaggle parquet data.")
+print("📂 PARQUET_PATH =", PARQUET_PATH)
+# === Load schema from parquet files ===
+parquet_files = sorted([f for f in os.listdir(PARQUET_PATH) if f.endswith(".parquet")])
+schema_dict = {}
+file_map = {}
+for file in parquet_files:
+    name = file.replace(".parquet", "")
+    path = os.path.join(PARQUET_PATH, file)
+    # Read only schema using polars
+    schema_dict[name] = list(pl.read_parquet(path, n_rows=0).columns)
+    file_map[name] = path
+# === Build schema as prompt context ===
+schema_description = "\n\n".join(
+    [f"### {name}\n{', '.join(cols)}" for name, cols in schema_dict.items()]
+)
+context_prompt = f"""
+You are a helpful assistant that helps users understand which parts of the Meta-Kaggle dataset they need for their analysis.
+Below is the dataset schema (parquet files):
+{schema_description}
+When the user asks a question, respond with:
+1. Which parquet file(s) are needed
+2. Which column(s) are relevant
+3. If needed, describe any join keys (e.g., "CompetitionId", "UserId")
+Do not generate or run any code. Just guide the user on what parts of the dataset are needed.
+"""
+# === Gemini setup ===
+try:
+    # In Hugging Face Spaces, use environment variable for the API key
+    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+    genai.configure(api_key=GOOGLE_API_KEY)
+    model = genai.GenerativeModel("gemini-1.5-flash")
+except Exception as e:
+    print(f"Error setting up Gemini API: {e}")
+    model = None
+# === Analysis Guide Function ===
+def guide_user(prompt):
+    if model is None:
+        return "Error: Gemini API not properly configured."
+    full_prompt = context_prompt + f"\n\nUser question: {prompt}\n\nAnswer:"
+    result = model.generate_content(full_prompt)
+    return result.text.strip()
+# === Custom CSS for Gradio UI ===
+# Stylesheet text handed to gr.Blocks(css=...) when the UI is built. It sets
+# the page background and font, a centered white card for the app container,
+# heading/subtitle colors, and the look of text inputs, buttons and the
+# output box.
+css = """
+body {
+    background-color: #f0f4f8;
+    font-family: 'Arial', sans-serif;
+}
+.gradio-container {
+    max-width: 800px;
+    margin: auto;
+    padding: 20px;
+    background-color: white;
+    border-radius: 10px;
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+}
+h1 {
+    color: #1a73e8;
+    text-align: center;
+}
+h2, p.subtitle, .subheading, .description {
+    color: #555;
+    text-align: center;
+    margin-bottom: 20px;
+}
+input[type="text"] {
+    border: 1px solid #ccc;
+    border-radius: 5px;
+    padding: 10px;
+}
+button {
+    background-color: #1a73e8;
+    color: white;
+    border-radius: 5px;
+    padding: 10px 20px;
+}
+button:hover {
+    background-color: #1557b0;
+}
+.output-text {
+    background-color: #f9f9f9;
+    border: 1px solid #ddd;
+    border-radius: 5px;
+    padding: 15px;
+}
+"""
+# === Launch Gradio UI ===
+with gr.Blocks(css=css) as demo:
+    gr.Markdown("# Meta-Kaggle Dataset Navigator")
+    gr.Markdown("Ask which parquet files and columns you need for your analysis!")
+    input_text = gr.Textbox(label="Your Question", placeholder="E.g., Which files and columns do I need to analyze competition rankings?")
+    output_text = gr.Textbox(label="Guidance", interactive=False)
+    submit_button = gr.Button("Get Guidance")
+    submit_button.click(fn=guide_user, inputs=input_text, outputs=output_text)
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+kagglehub
+polars
+gradio
+google-generativeai
+kaggle