Spaces:
Runtime error
Runtime error
File size: 4,333 Bytes
0d49845 42528b7 0d49845 42528b7 0d49845 f8f27d8 0d49845 f8f27d8 0d49845 f8f27d8 0d49845 f8f27d8 0d49845 f8f27d8 0d49845 f8f27d8 0d49845 f8f27d8 42528b7 f8f27d8 0d49845 f8f27d8 0d49845 f8f27d8 0d49845 5a6cb05 0d49845 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import streamlit as st
import pandas as pd
# ---------------------------------------------------------------------------
# Page and sidebar controls. Widgets appear in the sidebar in call order, so
# the statement order below is significant.
# ---------------------------------------------------------------------------
st.set_page_config(layout="wide")

sidebar = st.sidebar

# Collapsible background text describing the demo and the accompanying paper.
EXPLANATION_MD = """This demo allows you to explore the data inside [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398),
showing how we can use Word Position Deviation (WPD) and Lexical Deviation (LD) to find different types of paraphrases.
By using what we observe from the data, we can also correct numerous labelling errors inside MRPC, presenting the a revision of MRPC termed as MRPC-R1.
You can see the rejected and corrected paraphrases by changing the **Display Types** option below.
This demo accompanies the paper ["Towards Better Characterization of Paraphrases" (ACL 2022)](https://github.com/tlkh/paraphrase-metrics)."""
with sidebar.expander("Explanation", expanded=False):
    st.markdown(EXPLANATION_MD)

# Which CSV split to load and which sentence columns to show.
SPLIT_CHOICES = ["train", "test"]
SOURCE_CHOICES = ["All", "Only MRPC", "Only MRPC-R1"]
with sidebar.expander("Dataset Options", expanded=False):
    split = st.selectbox("Dataset Split", SPLIT_CHOICES)
    display = st.selectbox("Source", SOURCE_CHOICES)

# Which subset of sentence pairs to keep, based on original/revised labels.
PTYPE_CHOICES = [
    "All",
    "Only Paraphrases (MRPC-R1)",
    "Only Paraphrases (MRPC)",
    "Rejected Paraphrases from MRPC",
    "Corrected Paraphrases from MRPC",
]
ptype = sidebar.radio("Display Types", PTYPE_CHOICES)

# Score-based filtering: choose whose scores to filter on and the two ranges.
sidebar.markdown("**Score Filter Options**")
filter_by = sidebar.selectbox("Filter By Scores From", ["MRPC", "MRPC-R1"])

score_bounds = {"min_value": 0.0, "max_value": 1.0}
display_range_wpd = sidebar.slider(
    "Filter by WPD Scores", value=(0.1, 0.7), **score_bounds
)
display_range_ld = sidebar.slider(
    "Filter by LD Scores", value=(0.1, 0.4), **score_bounds
)

# Whether the numeric score columns stay in the rendered table.
display_scores = sidebar.checkbox("Display scores", value=False)
def load_df(split):
    """Read the score CSV for the requested dataset split.

    Args:
        split: ``"train"`` selects ``./mrpc_train_scores.csv``; any other
            value selects ``./mrpc_test_scores.csv``.

    Returns:
        The loaded ``pandas.DataFrame`` with a fresh 0..n-1 index.
    """
    csv_path = (
        "./mrpc_train_scores.csv" if split == "train"
        else "./mrpc_test_scores.csv"
    )
    # Paths are relative to the current working directory.
    return pd.read_csv(csv_path).reset_index(drop=True)
def filter_df(df, display, ptype, filter_by, display_scores,
              wpd_range=None, ld_range=None):
    """Select, sort and trim the rows/columns to show in the explorer table.

    Args:
        df: DataFrame with the columns ``og_s1``, ``og_s2``, ``new_s1``,
            ``new_s2``, ``og_label``, ``new_label``, ``og_wpd``, ``og_ld``,
            ``new_wpd``, ``new_ld`` and ``remarks``. It is not modified.
        display: Which sentence columns to keep. ``"Only MRPC"`` (or
            ``"MRPC"``) hides the ``new_*`` sentences, ``"Only MRPC-R1"``
            (or ``"MRPC-R1"``) hides the ``og_*`` sentences; anything else
            keeps both.
        ptype: Row subset selector; one of the "Display Types" labels.
            Unrecognised values keep every row.
        filter_by: ``"MRPC"`` filters/sorts on the ``og_*`` scores; any
            other value uses the ``new_*`` scores.
        display_scores: When false, the four score columns are dropped
            from the result.
        wpd_range: ``(low, high)`` bounds for the WPD score. Defaults to
            the module-level ``display_range_wpd`` slider value.
        ld_range: ``(low, high)`` bounds for the LD score. Defaults to the
            module-level ``display_range_ld`` slider value.

    Returns:
        A new DataFrame sorted by WPD (ties broken by LD), with a combined
        ``"og/new label"`` column in place of ``og_label``, ``new_label``
        and ``remarks``.
    """
    # Fall back to the sidebar slider values when no explicit range is given.
    if wpd_range is None:
        wpd_range = display_range_wpd
    if ld_range is None:
        ld_range = display_range_ld

    # Choose which sentence columns to show. The sidebar selectbox emits the
    # "Only ..." spellings; the bare names are accepted for older callers.
    if display in ("MRPC", "Only MRPC"):
        df = df.drop(["new_s1", "new_s2"], axis=1)
    elif display in ("MRPC-R1", "Only MRPC-R1"):
        df = df.drop(["og_s1", "og_s2"], axis=1)

    # Filter by paraphrase type.
    if ptype == "Only Paraphrases (MRPC)":
        df_sel = df[df["og_label"] == 1]
    elif ptype == "Only Paraphrases (MRPC-R1)":
        df_sel = df[df["new_label"] == 1]
    elif ptype == "Rejected Paraphrases from MRPC":
        df_sel = df[(df["new_label"] == 0) & (df["og_label"] == 1)]
    elif ptype == "Corrected Paraphrases from MRPC":
        df_sel = df[df["remarks"] == "corrected"]
    else:
        # "All" (or anything unrecognised): keep every row.
        df_sel = df

    # Filter by score range, using the original or the revised scores.
    prefix = "og" if filter_by == "MRPC" else "new"
    wpd_col = prefix + "_wpd"
    ld_col = prefix + "_ld"
    # NOTE: ranges are half-open [low, high), so a score exactly equal to
    # the upper bound is excluded.
    in_range = (
        (df_sel[wpd_col] >= wpd_range[0])
        & (df_sel[wpd_col] < wpd_range[1])
        & (df_sel[ld_col] >= ld_range[0])
        & (df_sel[ld_col] < ld_range[1])
    )

    # Sort by WPD with LD as the tie-breaker. A single multi-key sort is
    # used because two consecutive single-key sorts with the default
    # (non-stable) algorithm would discard the first ordering.
    df_sel = df_sel[in_range].sort_values([wpd_col, ld_col])

    if not display_scores:
        df_sel = df_sel.drop(["og_ld", "og_wpd", "new_ld", "new_wpd"], axis=1)

    # Collapse both labels into one "<original>-><revised>" column.
    label_col = (
        df_sel["og_label"].astype(str) + "->" + df_sel["new_label"].astype(str)
    )
    df_sel = df_sel.assign(**{"og/new label": label_col})
    return df_sel.drop(["remarks", "og_label", "new_label"], axis=1)
# Load the chosen split, apply the sidebar selections, then render a heading
# with the row count followed by the filtered table.
df = load_df(split)
df_sel = filter_df(df, display, ptype, filter_by, display_scores)
heading = "**MRPC Paraphrase Data Explorer** (Displaying {} items)".format(
    len(df_sel)
)
st.markdown(heading)
st.table(data=df_sel)
|