Spaces:
Sleeping
Sleeping
File size: 5,289 Bytes
afffb58 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import os
import random
import argilla as rg
import streamlit as st
from PIL import Image
# Browser-tab title/icon and wide layout for the whole app.
st.set_page_config(
    layout="wide",
    page_icon=":sponge:",
    page_title="Argilla Dolly Cleaning Tool",
)

# Value stored under metadata["status"] of a source record by log_record().
cleaned_status = "cleaned"

# Connect to Argilla using credentials taken from the environment. Any failure
# (missing variable, rejected credentials, unreachable server, ...) is reported
# to the user and the script run is halted.
try:
    api_url, api_key = (
        os.environ[variable] for variable in ("ARGILLA_API_URL", "ARGILLA_API_KEY")
    )
    rg.init(api_url=api_url, api_key=api_key)
    rg.set_workspace("recognai")
except Exception:
    st.error(
        "Incorrect or no ARGILLA_API_URL and ARGILLA_API_KEY environment variables"
        " provided!"
    )
    st.stop()
# Page header followed by three collapsible help sections.
# NOTE(review): a stray mis-decoded character used to follow the ``:sponge:``
# shortcode in the title and in the welcome line; it has been removed. If a
# second emoji was intended there, re-add it as a proper Unicode character.
st.title("Argilla Dolly Cleaning Tool :sponge:")

# The markdown below is kept flush-left inside the string literals so that no
# leading whitespace is passed to the markdown renderer.
with st.expander("Curation Guidelines"):
    st.info(
        """
Welcome to the Argilla Dolly Cleaning Tool! :sponge:
This is a collaborative effort to improve the quality of the Dolly v2 dataset. Most of the issues in this batch are related to incorrect instructions and the combination of inputs with responses. Some examples are records where the input already contains the expected response, or the instruction corresponds to the input.
We kindly appreciate your help. The idea is following:
1. You can correct any field of the record using the fields on the right-hand side.
2. Once you changed the field values you can submit the cleaned record.
3. If you want to pass on the record you can click on the Get New Record button.
"""
    )

with st.expander("Task Categories"):
    st.info(
        """
- `closed_qa`: These are questions that can be answered using only the information contained in a passage of reference text. For instance, given a paragraph from Wikipedia on the atom, one might ask, “What is the ratio between protons and neutrons in the nucleus?”
- `information extraction`: Here an annotator would copy a paragraph from Wikipedia and extract entities or other factual information such as weights or measurements from the passage.
- `summarization`: For this, annotators provided a passage from Wikipedia and were asked to distill it to a short summary.
"""
    )

with st.expander("Example of a cleaned record"):
    # Red box: what is wrong with the example; green box: the corrected instruction.
    st.error(
        "The response is longer than the input and the response does not give an actual"
        " summarization."
    )
    st.success("Create a nice and concise summary of the input.")
    # Open the screenshot in a context manager so its file handle is released
    # once Streamlit has consumed the image.
    with Image.open("example.png") as image:
        st.image(image)
def get_new_record():
    """Pick a random record from ``dolly-dirty-2`` that has not been cleaned yet.

    Both ``dolly-clean-2`` and ``dolly-dirty-2`` are loaded with the query
    ``NOT status: Discarded``. A dirty record is treated as already cleaned
    when a record with the same ``id`` is present in ``dolly-clean-2``.

    Returns:
        A randomly chosen record that still has no cleaned counterpart.

    When no such record is left, a warning is displayed and ``st.stop()`` is
    called instead of returning a record.
    """
    cleaned_records = rg.load("dolly-clean-2", query="NOT status: Discarded")
    # Collect the cleaned ids once into a set so every membership test below
    # is a constant-time lookup instead of a scan over a freshly built list.
    cleaned_ids = {cleaned_record.id for cleaned_record in cleaned_records}

    records = [
        record
        for record in rg.load("dolly-dirty-2", query="NOT status: Discarded")
        if record.id not in cleaned_ids
    ]
    if records:
        return random.choice(records)

    st.warning(
        "No more records to clean! Feel free to send and email to Argilla to"
        " request more records."
    )
    st.stop()
def clean_new_record(record):
    """Extract the three editable text fields from a record.

    Args:
        record: Object exposing an ``inputs`` mapping, or a falsy value when
            there is no record.

    Returns:
        A ``(instruction, inputs, response)`` tuple read from the
        ``"instruction"``, ``"context"`` and ``"response"`` keys of
        ``record.inputs``. A falsy ``record`` yields three empty strings, and
        any field that is missing or ``None`` is likewise returned as ``""``
        so callers always receive text they can place into a widget.
    """
    if not record:
        return "", "", ""

    fields = record.inputs
    raw_values = (fields.get(key) for key in ("instruction", "context", "response"))
    # Normalise absent fields to the same empty-string default used above.
    instruction, inputs, response = (
        "" if value is None else value for value in raw_values
    )
    return instruction, inputs, response
def log_record(record):
    """Store the edited version of ``record`` and flag the source as cleaned.

    The edited texts are read from ``st.session_state`` (keys ``instruction``,
    ``inputs`` and ``response``). A new ``TextClassificationRecord`` is built
    from them plus every other field of ``record`` except ``inputs`` and
    ``text``, and logged to ``dolly-clean-2``. The source record then gets
    ``metadata["status"]`` set to ``cleaned_status`` and is logged back to
    ``dolly-dirty-2``.

    Args:
        record: The source record that was shown to the user.
    """
    edited_inputs = {
        field: st.session_state[state_key]
        for field, state_key in (
            ("instruction", "instruction"),
            ("context", "inputs"),
            ("response", "response"),
        )
    }
    # Snapshot of the remaining fields, taken before the status is set below.
    carried_over = record.dict(exclude={"inputs", "text"})
    record_clean = rg.TextClassificationRecord(  # noqa
        inputs=edited_inputs, **carried_over
    )

    record.metadata["status"] = cleaned_status

    rg.log(name="dolly-clean-2", records=record_clean)
    rg.log(name="dolly-dirty-2", records=record)
    st.balloons()
# Main form: read-only original record on the left, editable copy on the right,
# and two submit buttons underneath.
with st.form("my_form", clear_on_submit=False):
    record = get_new_record()
    instruction, inputs, response = clean_new_record(record)
    category = record.metadata.get("category")

    original_col, corrected_col = st.columns(2)

    # Left column: the record as stored, shown in disabled text areas.
    original_col.write(f"## Original Record for `{category}` task")
    original_col.text_area(
        "Instruction",
        value=instruction,
        key="instruction_fixed",
        disabled=True,
    )
    original_col.text_area(
        "Input (i.e. context)",
        value=inputs,
        key="inputs_fixed",
        disabled=True,
        height=200,
    )
    original_col.text_area(
        "Response",
        value=response,
        key="response_fixed",
        disabled=True,
        height=200,
    )

    # Right column: editable widgets; their keys are the session-state entries
    # that log_record() reads when the cleaned record is submitted.
    corrected_col.write(f"## Corrected Record for `{category}` task")
    updated_instruction = corrected_col.text_area(
        "Instruction",
        value=instruction,
        key="instruction",
    )
    updated_inputs = corrected_col.text_area(
        "Input (i.e. context)",
        value=inputs,
        key="inputs",
        help=(
            "Originally this was called `context` but we renamed it to `input` to make"
            " it more intuitive."
        ),
        height=200,
    )
    updated_response = corrected_col.text_area(
        "Response",
        value=response,
        key="response",
        height=200,
    )

    # Button row: the first button only submits the form; the second one also
    # runs log_record() with the record shown above.
    skip_col, submit_col = st.columns(2)
    new_record = skip_col.form_submit_button("Get new record")
    submitted = submit_col.form_submit_button(
        "Submit cleaned record",
        on_click=log_record,
        args=[record],
    )
|