File size: 3,792 Bytes
421b068
 
 
 
 
 
 
 
 
 
 
 
 
 
6d0709a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421b068
6d0709a
 
 
 
421b068
 
 
 
 
 
 
 
 
6d0709a
421b068
6d0709a
421b068
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d0709a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421b068
6d0709a
 
421b068
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import nbformat as nbf
from huggingface_hub import HfApi

def create_notebook_file(cell_commands, notebook_name="generated_notebook.ipynb"):
    """Write a Jupyter notebook made of one code cell per command.

    Args:
        cell_commands: Iterable of source strings; each one becomes a code
            cell, in the order given.
        notebook_name: Path of the ``.ipynb`` file to create. An existing
            file at that path is overwritten.
    """
    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]

    # Notebook files are JSON and are expected to be UTF-8. Without an
    # explicit encoding the platform default is used, which can fail (or
    # produce an unreadable file) when a cell contains non-ASCII text.
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(nb, f)

    print(f"Notebook '{notebook_name}' created successfully.")

def push_notebook(file_path, dataset_id, token):
    """Upload a notebook file to a dataset repo and return an HTML link to it.

    Args:
        file_path: Local path of the notebook file to upload.
        dataset_id: Target dataset repository id (``"owner/name"``).
        token: Hugging Face access token used to authenticate the upload.

    Returns:
        An HTML anchor (as a string) pointing at the uploaded notebook.
    """
    # Single source of truth for the file name: the original linked to
    # "dataset_analyst.ipynb" while uploading "dataset_analysis.ipynb",
    # so the returned link never resolved.
    notebook_path_in_repo = "dataset_analysis.ipynb"

    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=notebook_path_in_repo,
        repo_id=dataset_id,
        repo_type="dataset",
    )
    print("Notebook uploaded to Huggingface Hub.")
    link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_path_in_repo}"
    return f'<a target="_blank" href="{link}"  style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">See notebook</a>'

def generate_notebook(dataset_id, token=None):
    """Generate a starter analysis notebook for a dataset and upload it.

    The notebook installs pandas, loads the dataset's first train parquet
    shard into a DataFrame and displays its head. It is written locally and
    then uploaded to the dataset repository as ``dataset_analysis.ipynb``.

    Args:
        dataset_id: Dataset repository id (``"owner/name"``).
        token: Optional Hugging Face access token. When omitted or empty,
            ``HfApi`` is created without an explicit token.

    Returns:
        The local file name of the generated notebook.

    Raises:
        gr.Error: If no dataset id was provided.
    """
    if not dataset_id:
        # Without an id the notebook would be named ".ipynb" and point at a
        # non-existent parquet path; fail early with a user-visible message.
        raise gr.Error("Please select a dataset first.")

    # Normalize an empty token string to None so no blank credential is sent.
    api = HfApi(token=token or None)
    # TODO: Handle auth error
    # TODO: Get first config and split? or generate a dataframe per each split maybe?
    commands = [
        "!pip install pandas",
        "import pandas as pd",
        f"df = pd.read_parquet('hf://datasets/{dataset_id}/data/train-00000-of-00001.parquet')",
        "df.head()",
    ]
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    api.upload_file(
        path_or_fileobj=notebook_name,
        path_in_repo="dataset_analysis.ipynb",
        # Upload to the dataset the notebook was generated for, not a
        # hard-coded repository.
        repo_id=dataset_id,
        repo_type="dataset",
    )
    # TODO: Handle permission error
    print("Notebook uploaded to Huggingface Hub.")
    return notebook_name

# UI definition: dataset search + embedded viewer, notebook generation,
# token entry, and a push button that is only shown once a token is typed.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset auto analyst creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    @gr.render(inputs=dataset_name)
    def embed(name):
        """Render the dataset viewer iframe for the selected dataset id."""
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
        <iframe
          src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
          frameborder="0"
          width="100%"
          height="600px"
        ></iframe>
            """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook and push to repo", visible=True)

    download_link = gr.File(label="Download Notebook")
    with gr.Row() as auth_page:
        with gr.Column():
            auth_title = gr.Markdown(
                "Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)

    # Wired after token_box exists so the token can be passed along:
    # generate_notebook takes (dataset_id, token), and supplying only the
    # dataset id made the click handler fail with a missing-argument error.
    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name, token_box],
        outputs=[download_link],
    )

    def auth(token):
        """Show the push button only while the token box is non-empty."""
        return {
            auth_error: gr.Markdown(value="", visible=False),
            # push_btn is a Button, so update it with a Button (not a Row).
            push_btn: gr.Button(visible=bool(token)),
        }

    push_btn = gr.Button("Push notebook to repo", visible=False)
    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )
    output_lbl = gr.HTML(value="")

    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=[output_lbl],
    )
demo.launch()