Spaces:
Runtime error
Runtime error
update app
Browse files
app.py
CHANGED
@@ -8,21 +8,25 @@ split = st.sidebar.selectbox("Dataset Split", ["train", "test"])
|
|
8 |
display = st.sidebar.selectbox("Source", ["All", "Only MRPC", "Only MRPC-R1"])
|
9 |
|
10 |
ptype = st.sidebar.radio("Paraphrase Pair Types", ["All",
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
|
16 |
st.sidebar.markdown("**Score Filter Options**")
|
17 |
filter_by = st.sidebar.selectbox("Filter By Scores From", ["MRPC", "MRPC-R1"])
|
18 |
-
display_range_wpd = st.sidebar.slider(
|
19 |
-
|
|
|
|
|
|
|
20 |
|
21 |
st.sidebar.markdown("""**Explanation**
|
22 |
This demo allows you to explore the data inside [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398), showing how we can use Word Position Deviation (WPD) and Lexical Deviation (LD) to find different types of paraphrases. By using what we observe from the data, we can also correct numerous labelling errors inside MRPC, presenting a revision of MRPC termed MRPC-R1. This demo accompanies the paper ["Towards Better Characterization of Paraphrases" (ACL 2022)](https://github.com/tlkh/paraphrase-metrics).""")
|
23 |
|
24 |
st.markdown("**MRPC Paraphrase Data Explorer**")
|
25 |
|
|
|
26 |
def load_df(split):
|
27 |
if split == "train":
|
28 |
df = pd.read_csv("./mrpc_train_scores.csv")
|
@@ -31,7 +35,8 @@ def load_df(split):
|
|
31 |
df.reset_index(drop=True, inplace=True)
|
32 |
return df
|
33 |
|
34 |
-
|
|
|
35 |
# filter data
|
36 |
if display == "MRPC":
|
37 |
df = df.drop(["new_s1", "new_s2"], axis=1)
|
@@ -56,17 +61,21 @@ def filter_df(df, display, ptype, filter_by):
|
|
56 |
# sort by scores
|
57 |
if filter_by == "MRPC":
|
58 |
# wpd
|
59 |
-
condition = (df_sel.og_wpd >= display_range_wpd[0]) & (
|
|
|
60 |
df_sel = df_sel[condition]
|
61 |
# ld
|
62 |
-
condition = (df_sel.og_ld >= display_range_ld[0]) & (
|
|
|
63 |
df_sel = df_sel[condition]
|
64 |
else:
|
65 |
# wpd
|
66 |
-
condition = (df_sel.new_wpd >= display_range_wpd[0]) & (
|
|
|
67 |
df_sel = df_sel[condition]
|
68 |
# ld
|
69 |
-
condition = (df_sel.new_ld >= display_range_ld[0]) & (
|
|
|
70 |
df_sel = df_sel[condition]
|
71 |
# filter scores
|
72 |
if filter_by == "MRPC":
|
@@ -75,22 +84,18 @@ def filter_df(df, display, ptype, filter_by):
|
|
75 |
else:
|
76 |
df_sel.sort_values("new_ld", inplace=True)
|
77 |
df_sel.sort_values("new_wpd", inplace=True)
|
|
|
|
|
|
|
|
|
|
|
78 |
return df_sel
|
79 |
-
|
80 |
|
81 |
-
df = load_df(split)
|
82 |
|
83 |
-
|
84 |
|
85 |
-
|
86 |
-
hide_table_row_index = """
|
87 |
-
<style>
|
88 |
-
tbody th {display:none}
|
89 |
-
.blank {display:none}
|
90 |
-
</style>
|
91 |
-
"""
|
92 |
|
93 |
-
|
94 |
-
st.markdown("Total "+str(len(df_sel))+" items"+hide_table_row_index, unsafe_allow_html=True)
|
95 |
|
96 |
st.table(data=df_sel)
|
|
|
8 |
display = st.sidebar.selectbox("Source", ["All", "Only MRPC", "Only MRPC-R1"])
|
9 |
|
10 |
ptype = st.sidebar.radio("Paraphrase Pair Types", ["All",
|
11 |
+
"Only Paraphrases (MRPC-R1)",
|
12 |
+
"Only Paraphrases (MRPC)",
|
13 |
+
"Rejected Paraphrases from MRPC",
|
14 |
+
"Corrected Paraphrases from MRPC"])
|
15 |
|
16 |
st.sidebar.markdown("**Score Filter Options**")
|
17 |
filter_by = st.sidebar.selectbox("Filter By Scores From", ["MRPC", "MRPC-R1"])
|
18 |
+
display_range_wpd = st.sidebar.slider(
|
19 |
+
"Filter by WPD Scores", min_value=0.0, max_value=1.0, value=(0.1, 0.7))
|
20 |
+
display_range_ld = st.sidebar.slider(
|
21 |
+
"Filter by LD Scores", min_value=0.0, max_value=1.0, value=(0.1, 0.4))
|
22 |
+
display_scores = st.sidebar.checkbox("Display scores", value=False)
|
23 |
|
24 |
st.sidebar.markdown("""**Explanation**
|
25 |
This demo allows you to explore the data inside [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398), showing how we can use Word Position Deviation (WPD) and Lexical Deviation (LD) to find different types of paraphrases. By using what we observe from the data, we can also correct numerous labelling errors inside MRPC, presenting a revision of MRPC termed MRPC-R1. This demo accompanies the paper ["Towards Better Characterization of Paraphrases" (ACL 2022)](https://github.com/tlkh/paraphrase-metrics).""")
|
26 |
|
27 |
st.markdown("**MRPC Paraphrase Data Explorer**")
|
28 |
|
29 |
+
|
30 |
def load_df(split):
|
31 |
if split == "train":
|
32 |
df = pd.read_csv("./mrpc_train_scores.csv")
|
|
|
35 |
df.reset_index(drop=True, inplace=True)
|
36 |
return df
|
37 |
|
38 |
+
|
39 |
+
def filter_df(df, display, ptype, filter_by, display_scores):
|
40 |
# filter data
|
41 |
if display == "MRPC":
|
42 |
df = df.drop(["new_s1", "new_s2"], axis=1)
|
|
|
61 |
# sort by scores
|
62 |
if filter_by == "MRPC":
|
63 |
# wpd
|
64 |
+
condition = (df_sel.og_wpd >= display_range_wpd[0]) & (
|
65 |
+
df_sel.og_wpd < display_range_wpd[1])
|
66 |
df_sel = df_sel[condition]
|
67 |
# ld
|
68 |
+
condition = (df_sel.og_ld >= display_range_ld[0]) & (
|
69 |
+
df_sel.og_ld < display_range_ld[1])
|
70 |
df_sel = df_sel[condition]
|
71 |
else:
|
72 |
# wpd
|
73 |
+
condition = (df_sel.new_wpd >= display_range_wpd[0]) & (
|
74 |
+
df_sel.new_wpd < display_range_wpd[1])
|
75 |
df_sel = df_sel[condition]
|
76 |
# ld
|
77 |
+
condition = (df_sel.new_ld >= display_range_ld[0]) & (
|
78 |
+
df_sel.new_ld < display_range_ld[1])
|
79 |
df_sel = df_sel[condition]
|
80 |
# filter scores
|
81 |
if filter_by == "MRPC":
|
|
|
84 |
else:
|
85 |
df_sel.sort_values("new_ld", inplace=True)
|
86 |
df_sel.sort_values("new_wpd", inplace=True)
|
87 |
+
if not display_scores:
|
88 |
+
df_sel.drop(["og_ld", "og_wpd", "new_ld", "new_wpd"], axis=1, inplace=True)
|
89 |
+
label_col = df_sel["og_label"].astype(str)+"->"+df_sel["new_label"].astype(str)
|
90 |
+
df_sel["og/new label"] = label_col
|
91 |
+
df_sel.drop(["remarks", "og_label", "new_label"], axis=1, inplace=True)
|
92 |
return df_sel
|
|
|
93 |
|
|
|
94 |
|
95 |
+
df = load_df(split)
|
96 |
|
97 |
+
df_sel = filter_df(df, display, ptype, filter_by, display_scores)
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
|
99 |
+
st.markdown("Displaying "+str(len(df_sel))+" items")
|
|
|
100 |
|
101 |
st.table(data=df_sel)
|