# Daniel Vila
# Duplicate from dvilasuero/argilla-dolly-cleaner
# afffb58
import os
import random
import argilla as rg
import streamlit as st
from PIL import Image
# Page configuration has to be issued before any other Streamlit output.
st.set_page_config(
    layout="wide",
    page_icon=":sponge:",
    page_title="Argilla Dolly Cleaning Tool",
)

# Value stored under the "status" metadata key of a record once it is cleaned.
cleaned_status = "cleaned"

# Connect to Argilla using credentials taken from the environment. Any failure
# here (missing variable or an error raised by the client) is reported to the
# user and the script run is stopped.
try:
    api_url = os.environ["ARGILLA_API_URL"]
    api_key = os.environ["ARGILLA_API_KEY"]
    rg.init(api_key=api_key, api_url=api_url)
    rg.set_workspace("recognai")
except Exception:
    st.error(
        "Incorrect or no ARGILLA_API_URL and ARGILLA_API_KEY environment"
        " variables provided!"
    )
    st.stop()
# Markdown body of the "Curation Guidelines" expander.
_CURATION_GUIDELINES = """
Welcome to the Argilla Dolly Cleaning Tool! :sponge: 🐑
This is a collaborative effort to improve the quality of the Dolly v2 dataset. Most of the issues in this batch are related to incorrect instructions and the combination of inputs with responses. Some examples are records where the input already contains the expected response, or the instruction corresponds to the input.
We kindly appreciate your help. The idea is following:
1. You can correct any field of the record using the fields on the right-hand side.
2. Once you changed the field values you can submit the cleaned record.
3. If you want to pass on the record you can click on the Get New Record button.
"""

# Markdown body of the "Task Categories" expander.
_TASK_CATEGORIES = """
- `closed_qa`: These are questions that can be answered using only the information contained in a passage of reference text. For instance, given a paragraph from Wikipedia on the atom, one might ask, “What is the ratio between protons and neutrons in the nucleus?”
- `information extraction`: Here an annotator would copy a paragraph from Wikipedia and extract entities or other factual information such as weights or measurements from the passage.
- `summarization`: For this, annotators provided a passage from Wikipedia and were asked to distill it to a short summary.
"""

# Messages shown above the example screenshot.
_EXAMPLE_PROBLEM = (
    "The response is longer than the input and the response does not give an"
    " actual summarization."
)
_EXAMPLE_FIX = "Create a nice and concise summary of the input."

st.title("Argilla Dolly Cleaning Tool :sponge: 🐑")

with st.expander("Curation Guidelines"):
    st.info(_CURATION_GUIDELINES)

with st.expander("Task Categories"):
    st.info(_TASK_CATEGORIES)

with st.expander("Example of a cleaned record"):
    st.error(_EXAMPLE_PROBLEM)
    st.success(_EXAMPLE_FIX)
    # The screenshot is read from a file named "example.png" (relative path).
    image = Image.open("example.png")
    st.image(image)
def get_new_record():
    """Pick a random record that still needs cleaning.

    Loads the non-discarded records of the ``dolly-clean-2`` and
    ``dolly-dirty-2`` datasets and keeps the dirty records whose id does not
    appear in the clean dataset.

    Returns:
        One randomly chosen record from the remaining dirty records. When no
        record is left, a warning is shown and ``st.stop()`` is called instead.
    """
    cleaned_records = rg.load("dolly-clean-2", query="NOT status: Discarded")
    # Collect the cleaned ids into a set once so every membership test below
    # is a constant-time lookup instead of a scan over a freshly built list.
    # NOTE(review): assumes record ids are hashable (e.g. str or int).
    cleaned_ids = {cleaned_record.id for cleaned_record in cleaned_records}

    records = [
        record
        for record in rg.load("dolly-dirty-2", query="NOT status: Discarded")
        if record.id not in cleaned_ids
    ]
    if records:
        return random.choice(records)

    st.warning(
        "No more records to clean! Feel free to send and email to Argilla to"
        " request more records."
    )
    st.stop()
def clean_new_record(record):
    """Extract the three editable text fields from a record.

    Args:
        record: An object whose ``inputs`` attribute is a mapping that may hold
            the keys ``"instruction"``, ``"context"`` and ``"response"``. A
            falsy value (e.g. ``None``) is accepted.

    Returns:
        A tuple ``(instruction, inputs, response)``. A field that is missing
        from ``record.inputs`` or stored as ``None`` is returned as ``""``, the
        same value used when no record is given.
    """
    if not record:
        return "", "", ""

    fields = record.inputs
    # Fall back to "" so callers always get a value they can display, matching
    # the no-record case above.
    instruction = fields.get("instruction") or ""
    inputs = fields.get("context") or ""
    response = fields.get("response") or ""
    return instruction, inputs, response
def log_record(record):
    """Log the edited version of ``record`` and flag the source record.

    Used as the ``on_click`` callback of the "Submit cleaned record" button.
    The edited field values are read from ``st.session_state`` under the keys
    ``"instruction"``, ``"inputs"`` and ``"response"``.

    Args:
        record: The record currently shown in the form. Its ``metadata`` gets
            a ``"status"`` entry set to ``cleaned_status``.
    """
    edited_inputs = {
        "instruction": st.session_state["instruction"],
        "context": st.session_state["inputs"],
        "response": st.session_state["response"],
    }
    # Every field except "inputs" and "text" is carried over from the source
    # record; these are read before the status is written to the metadata.
    carried_over = record.dict(exclude={"inputs", "text"})
    cleaned = rg.TextClassificationRecord(inputs=edited_inputs, **carried_over)

    record.metadata["status"] = cleaned_status

    # The edited copy goes to the clean dataset, the flagged source record
    # back to the dirty dataset.
    rg.log(name="dolly-clean-2", records=cleaned)
    rg.log(name="dolly-dirty-2", records=record)
    st.balloons()
# Main form: the left column shows the record as loaded (read-only), the right
# column holds editable copies whose values live in st.session_state under the
# keys "instruction", "inputs" and "response".
with st.form("my_form", clear_on_submit=False):
    record = get_new_record()
    instruction, inputs, response = clean_new_record(record)
    category = record.metadata.get("category")

    cols = st.columns(2)
    original_col = cols[0]
    corrected_col = cols[1]

    # Left-hand side: read-only view of the record.
    original_col.write(f"## Original Record for `{category}` task")
    original_col.text_area(
        "Instruction",
        value=instruction,
        key="instruction_fixed",
        disabled=True,
    )
    original_col.text_area(
        "Input (i.e. context)",
        value=inputs,
        key="inputs_fixed",
        disabled=True,
        height=200,
    )
    original_col.text_area(
        "Response",
        value=response,
        key="response_fixed",
        disabled=True,
        height=200,
    )

    # Right-hand side: editable fields pre-filled with the loaded values.
    corrected_col.write(f"## Corrected Record for `{category}` task")
    updated_instruction = corrected_col.text_area(
        "Instruction",
        value=instruction,
        key="instruction",
        on_change=None,
    )
    updated_inputs = corrected_col.text_area(
        "Input (i.e. context)",
        value=inputs,
        key="inputs",
        on_change=None,
        help=(
            "Originally this was called `context` but we renamed it to `input`"
            " to make it more intuitive."
        ),
        height=200,
    )
    updated_response = corrected_col.text_area(
        "Response",
        value=response,
        key="response",
        on_change=None,
        height=200,
    )

    # Button row: skip the record, or submit it through log_record.
    cols = st.columns(2)
    new_record = cols[0].form_submit_button("Get new record")
    submitted = cols[1].form_submit_button(
        "Submit cleaned record",
        on_click=log_record,
        args=[record],
    )