File size: 2,095 Bytes
215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a b421ee0 215f60a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import json
import os

import streamlit as st
import streamlit.components.v1 as components
# Directory prefix of the per-dataset "<dataset>_bad_examples.jsonl" files that
# flagged examples are appended to.
BAD_EXAMPLES_PATH = "bad_examples"
# Directory prefix of the per-dataset "<dataset>_examples_with_stats.json"
# input files that are loaded for review.
DATA_PATH = "data"
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of the file to read; every non-blank line must hold
            exactly one JSON document.

    Returns:
        A list with one parsed object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Skip blank / whitespace-only lines (e.g. an extra empty line at the
        # end of the file), which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]
# Start at the first example whenever the session holds no index yet.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0
def get_next_item():
    """Advance the example index stored in the Streamlit session by one."""
    st.session_state.idx = st.session_state.idx + 1
def save_and_get_next_item(sample, issue):
    """Append ``sample`` annotated with ``issue`` to the bad-examples file, then advance.

    The record is written as one JSON line to
    ``{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl``, where ``dataset`` is
    the module-level name holding the selected dataset.

    Args:
        sample: Example record (a JSON-serializable dict) to flag.
        issue: Description of the problem; stored under the ``"issue"`` key.
    """
    # Annotate a copy so the caller's record is not modified as a side effect.
    record = {**sample, "issue": issue}
    with open(
        f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a", encoding="utf-8"
    ) as f:
        f.write(json.dumps(record) + "\n")
    get_next_item()
# Names of the datasets offered in the sidebar; each one is loaded from
# "<DATA_PATH>/<name>_examples_with_stats.json".
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

# Create the bad-examples file if it does not exist yet. The directory is
# created first because opening in append mode creates a missing file but
# raises FileNotFoundError for a missing parent directory.
os.makedirs(BAD_EXAMPLES_PATH, exist_ok=True)
with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a"):
    pass
def _reset_index():
    """Remove the stored example index from the session state."""
    del st.session_state["idx"]


def _clear_bad_examples():
    """Truncate the bad-examples file of the currently selected dataset."""
    open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "w").close()


st.sidebar.button("Reset Index", on_click=_reset_index)

# Offer the current contents of the bad-examples file for download.
bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"
with open(bad_examples_file, "r+") as f:
    st.sidebar.download_button(
        "Download bad example JSON file",
        f,
        file_name=f"{dataset}_bad_examples.jsonl",
    )

st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)
# Session-state key of the issue text box, so the BAD callback can read the
# text that was submitted together with the form.
ISSUE_INPUT_KEY = "issue_text"


def _save_submitted_issue(sample):
    """Flag ``sample`` as bad using the issue text submitted with the form."""
    # Callback ``args`` are bound when the button is rendered, i.e. before the
    # user has typed anything for this example. The submitted text therefore
    # has to be looked up in session state when the callback actually runs.
    save_and_get_next_item(sample, st.session_state[ISSUE_INPUT_KEY])


if st.session_state.idx >= len(data):
    # The index runs past the end after the last example has been reviewed,
    # or when switching to a dataset with fewer examples; indexing ``data``
    # would raise IndexError here.
    st.info(
        "No more examples to review in this dataset. "
        "Use 'Reset Index' to start over."
    )
else:
    with st.form(key="bad_form", clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)
        issue = st.text_input(
            "What's wrong with this example? (leave blank if example is fine)",
            key=ISSUE_INPUT_KEY,
        )
        # GOOD only advances to the next example; nothing is written.
        good = st.form_submit_button("GOOD", on_click=get_next_item)
        # BAD appends the example plus the submitted issue text, then advances.
        bad = st.form_submit_button(
            "BAD",
            on_click=_save_submitted_issue,
            args=(sample,),
        )
|