File size: 2,095 Bytes
215f60a
 
724b1ea
 
 
 
 
 
215f60a
724b1ea
 
215f60a
724b1ea
 
 
 
 
 
215f60a
724b1ea
 
215f60a
724b1ea
 
 
 
215f60a
 
 
 
 
724b1ea
 
 
 
215f60a
 
 
 
 
 
 
 
 
 
 
 
724b1ea
215f60a
724b1ea
 
215f60a
724b1ea
 
215f60a
724b1ea
215f60a
 
 
 
724b1ea
215f60a
 
 
 
 
 
724b1ea
215f60a
724b1ea
 
 
 
215f60a
 
 
 
 
b421ee0
215f60a
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import json

import streamlit as st
import streamlit.components.v1 as components

# Directory where the per-dataset "<dataset>_bad_examples.jsonl" files of
# flagged examples are written.
BAD_EXAMPLES_PATH = "bad_examples"
# Directory holding the per-dataset "<dataset>_examples_with_stats.json" files
# that are loaded for review.
DATA_PATH = "data"


def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of the file to read. Every non-blank line must hold
            exactly one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. stray trailing newlines) carry no record and would
        # make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]


# First run of a session: start reviewing at the first example (index 0).
if "idx" not in st.session_state:
    st.session_state["idx"] = 0


def get_next_item():
    """Advance the session's example index by one."""
    st.session_state.idx = st.session_state.idx + 1


def save_and_get_next_item(sample, issue):
    """Record ``sample`` as a bad example and move on to the next one.

    The ``issue`` text is stored on ``sample`` under the "issue" key (the
    dict is modified in place), the sample is appended as one JSON line to
    the bad-examples file of the currently selected dataset, and the example
    index is advanced.

    Args:
        sample: The example dict being flagged.
        issue: Description of what is wrong with the example.
    """
    sample["issue"] = issue

    out_path = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"
    with open(out_path, "a") as out_file:
        out_file.write(f"{json.dumps(sample)}\n")

    get_next_item()


# Dataset names offered in the sidebar selector. The chosen name is used to
# build both the input file name under DATA_PATH and the bad-examples file
# name under BAD_EXAMPLES_PATH.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
# Dataset currently selected in the sidebar.
dataset = st.sidebar.selectbox("Dataset", datasets)
# Note: despite the ".json" suffix, the file is parsed line by line as JSON
# Lines by load_jsonl.
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

# Create the bad-examples file if it does not exist yet: "a" mode creates a
# missing file without truncating existing content, so the "r+" open below
# does not fail on a missing file.
with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
    pass

# Removing "idx" from the session state lets the initialisation check near the
# top of the script set it back to 0 the next time the script runs.
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__("idx"))

# Offer the current contents of the bad-examples file as a download.
with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "r+") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )

# Opening the file in "w" mode truncates it; the handle is closed right away.
st.sidebar.button(
    "Clear bad examples file",
    on_click=lambda: open(
        f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "w"
    ).close(),
)

# Review form: shows the current example and lets the reviewer mark it GOOD
# (just advance) or BAD (save it with the typed issue, then advance).
if st.session_state.idx >= len(data):
    # Past the last example (or the selected dataset is shorter than the one
    # reviewed before): show a notice instead of raising IndexError.
    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
else:
    with st.form(key="bad_form", clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)

        # The widget gets a key so the callback can read the *submitted* text
        # from session state.
        issue = st.text_input(
            "What's wrong with this example? (leave blank if example is fine)",
            key="issue_text",
        )

        good = st.form_submit_button(
            "GOOD",
            on_click=get_next_item,
        )
        # Do not pass ``issue`` through ``args``: the arguments are bound while
        # this script run renders the button, i.e. before the reviewer has
        # typed anything, so the callback would always receive a stale value.
        # Reading the keyed widget from session state inside the callback
        # yields the text that was actually submitted.
        bad = st.form_submit_button(
            "BAD",
            on_click=lambda shown=sample: save_and_get_next_item(
                shown, st.session_state["issue_text"]
            ),
        )