tung committed on
Commit
4bd9fc2
·
1 Parent(s): f6eba42

Initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+
4
+ import gradio as gr
5
+ import pandas as pd
6
+
7
+ # -----------------------------------------------------------------------------
8
+ # Configuration – adjust these paths to point at your data location
9
+ # -----------------------------------------------------------------------------
10
+ DATA_PATH = "human_judgement/selected_samples.json" # JSON Lines file with fields: question, response1, response2
11
+ RATINGS_PATH = (
12
+ "human_judgement/human_judgement.csv" # File where user ratings will be appended
13
+ )
14
+
15
+ # -----------------------------------------------------------------------------
16
+ # Helper functions
17
+ # -----------------------------------------------------------------------------
18
+
19
+
20
def load_data(path: str = DATA_PATH) -> pd.DataFrame:
    """Load the question/response pairs from a JSON Lines file.

    Args:
        path: Location of the data file; it is parsed with
            ``pd.read_json(..., lines=True)``, i.e. one JSON object per line.

    Returns:
        A DataFrame containing at least the columns ``question``,
        ``response1`` and ``response2``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If any of the required columns is missing.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Could not find data file at {path}.")

    df = pd.read_json(path, lines=True)

    required_cols = {"question", "response1", "response2"}
    missing = required_cols.difference(df.columns)
    if missing:
        # Sort so the message is deterministic (set iteration order is not).
        raise ValueError(
            f"Data file {path} must contain columns: "
            f"{', '.join(sorted(required_cols))} "
            f"(missing: {', '.join(sorted(missing))})"
        )
    return df
29
+
30
+
31
def load_ratings(path: str = RATINGS_PATH) -> pd.DataFrame:
    """Load the stored ratings, or return an empty table if none exist yet.

    Nothing is written to disk here; when ``path`` is absent an empty
    DataFrame with the expected columns is returned.

    Args:
        path: Location of the ratings CSV.

    Returns:
        A DataFrame with the columns ``user_id``, ``row_index``, ``choice``
        and ``timestamp``.
    """
    if os.path.exists(path):
        # Force user ids to stay strings: without this, a file that only holds
        # numeric-looking ids (e.g. "123") is parsed as integers and every
        # later comparison against the textbox value (a str) silently fails.
        # keep_default_na=False stops ids such as "NA" or "null" from being
        # turned into NaN.
        return pd.read_csv(path, dtype={"user_id": str}, keep_default_na=False)
    return pd.DataFrame(columns=["user_id", "row_index", "choice", "timestamp"])
36
+
37
+
38
def save_rating(user_id: str, row_index: int, choice: int, path: str = RATINGS_PATH):
    """Persist one rating, skipping it if the user already rated that row.

    The whole ratings table is re-read, extended by one row and written back
    to ``path``.

    Args:
        user_id: Identifier entered by the rater.
        row_index: Index of the rated item in the dataset.
        choice: 1 if the first response was preferred, 2 if the second was.
        path: Location of the ratings CSV.
    """
    ratings = load_ratings(path)

    # Compare ids as strings: depending on how the CSV was parsed the column
    # may hold integers (e.g. 123) while ``user_id`` is always text ("123").
    same_user = ratings["user_id"].astype(str) == str(user_id)
    same_row = ratings["row_index"] == row_index
    if (same_user & same_row).any():
        return  # already stored, nothing to do

    new_entry = pd.DataFrame(
        [
            {
                "user_id": user_id,
                "row_index": row_index,
                "choice": choice,
                # Naive UTC timestamp, same format as the rows already on disk.
                "timestamp": datetime.utcnow().isoformat(),
            }
        ]
    )
    # Concatenating onto an empty frame is deprecated in recent pandas, so
    # start the table from the new row when there is nothing stored yet.
    if ratings.empty:
        ratings = new_entry
    else:
        ratings = pd.concat([ratings, new_entry], ignore_index=True)
    ratings.to_csv(path, index=False)
55
+
56
+
57
def get_next_unrated(df: pd.DataFrame, ratings: pd.DataFrame, user_id: str):
    """Find the first dataset row the given user has not rated yet.

    Args:
        df: Dataset with the columns ``question``, ``response1``, ``response2``.
        ratings: Ratings table with at least ``user_id`` and ``row_index``.
        user_id: Identifier of the rater.

    Returns:
        ``(row_index, question, response1, response2)`` for the first unrated
        row in ``df`` order, or ``None`` when the user has rated every row.
    """
    # Compare ids as strings: a ratings CSV holding only numeric-looking ids
    # is parsed as integers, and ``123 == "123"`` is False, which would make
    # the user see the first item forever.
    by_user = ratings["user_id"].astype(str) == str(user_id)
    rated_indices = ratings.loc[by_user, "row_index"].tolist()

    unrated_df = df[~df.index.isin(rated_indices)]
    if unrated_df.empty:
        return None

    row = unrated_df.iloc[0]
    return row.name, row["question"], row["response1"], row["response2"]
65
+
66
+
67
+ # -----------------------------------------------------------------------------
68
+ # Gradio callbacks
69
+ # -----------------------------------------------------------------------------
70
+
71
+
72
def start_or_resume(user_id: str, state_df):
    """Begin a rating session for ``user_id`` or continue an earlier one.

    Returns an 8-tuple: three visibility updates followed by the question
    text, both responses, the current row index (as a string) and a status
    message.
    """

    def _idle(message: str):
        # First update stays visible, the other two are hidden, and every
        # text slot is blank except the status message.
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
            "",
            "",
            "",
            "",
            message,
        )

    if user_id.strip() == "":
        return _idle("Please enter a non‑empty identifier to begin.")

    next_item = get_next_unrated(state_df, load_ratings(), user_id)
    if next_item is None:
        # Nothing left for this user to rate.
        return _idle("🎉 You have evaluated every item – thank you!")

    row_idx, question, first_answer, second_answer = next_item
    # All three visibility updates are switched on when an item is available.
    visible = [gr.update(visible=True) for _ in range(3)]
    return (
        *visible,
        "**" + question + "**",
        first_answer,
        second_answer,
        str(row_idx),
        "",
    )
112
+
113
+
114
def submit_preference(user_id: str, row_idx_str: str, choice: str, state_df):
    """Store one preference and return the next item to display.

    Args:
        user_id: Identifier of the rater.
        row_idx_str: Index of the item currently shown, as a string; empty
            when no item is on screen.
        choice: ``"answer1"`` or ``"answer2"``; anything else is rejected.
        state_df: Dataset with the question/response columns.

    Returns:
        A 5-tuple ``(question, response1, response2, row_index, message)``.
        Every branch returns five values so the result always lines up with
        the five output components the handler is wired to.
    """
    if choice not in {"answer1", "answer2"}:
        # Leave the displayed item untouched (``gr.update()`` with no
        # arguments changes nothing) and only show the hint. The original
        # returned a single value here, which does not match five outputs.
        return (
            gr.update(),
            gr.update(),
            gr.update(),
            row_idx_str,
            "Please choose either Answer 1 or Answer 2 before submitting.",
        )

    # An empty index means nothing is on screen (e.g. Submit was clicked
    # again after the last item); skip saving instead of crashing in int("").
    if row_idx_str:
        row_idx = int(row_idx_str)
        save_rating(user_id, row_idx, 1 if choice == "answer1" else 2)

    ratings = load_ratings()
    record = get_next_unrated(state_df, ratings, user_id)
    if record is None:
        return "", "", "", "", "🎉 You have evaluated every item – thank you!"

    idx, q, a1, a2 = record
    return "**" + q + "**", a1, a2, str(idx), ""
131
+
132
+
133
+ # -----------------------------------------------------------------------------
134
+ # Build Gradio interface
135
+ # -----------------------------------------------------------------------------
136
+
137
+
138
def build_demo():
    """Build the Gradio Blocks app for pairwise answer rating.

    Loads the dataset via ``load_data()``, lays out the identifier input, the
    status message and the (initially hidden) evaluation column, and wires
    the two buttons to ``start_or_resume`` and ``submit_preference``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    df = load_data()

    with gr.Blocks(title="Question/Answer Preference Rater") as demo:
        gr.Markdown(
            "# Q/A Preference Rater\n"
            "Enter your identifier below to start or resume your evaluation "
            "session. For every question, select which answer you prefer. "
            "Your progress is saved automatically so you can return at any "
            "time using the **same identifier**."
        )

        state_df = gr.State(df)  # keep dataset in memory for callbacks
        state_row_idx = gr.State("")  # index of the item on screen, as text

        # User identifier section
        id_input = gr.Textbox(
            label="User Identifier", placeholder="e.g. Alice", scale=3
        )
        start_btn = gr.Button("Start / Resume", scale=1)

        # Feedback / status message
        info_md = gr.Markdown("", visible=True)

        # Evaluation section (initially hidden)
        with gr.Column(visible=False) as eval_col:
            question_md = gr.Markdown("", label="Question")
            with gr.Row():
                answer1_box = gr.Markdown(label="Answer 1")
                answer2_box = gr.Markdown(label="Answer 2")
            choice_radio = gr.Radio(
                ["answer1", "answer2"],
                label="Which answer do you prefer?",
                interactive=True,
            )
            submit_btn = gr.Button("Submit Preference", visible=False)

        # Wire callbacks
        start_btn.click(
            fn=start_or_resume,
            inputs=[id_input, state_df],
            outputs=[
                id_input,
                eval_col,
                submit_btn,
                question_md,
                answer1_box,
                answer2_box,
                state_row_idx,
                info_md,
            ],
        )

        submit_btn.click(
            fn=submit_preference,
            inputs=[id_input, state_row_idx, choice_radio, state_df],
            outputs=[question_md, answer1_box, answer2_box, state_row_idx, info_md],
        ).then(
            # Clear the selection once the submission has been handled so the
            # previous choice is not pre-selected for the next question.
            fn=lambda: None,
            inputs=None,
            outputs=choice_radio,
        )

    return demo
201
+
202
+
203
# Build and launch the app whenever this module is executed or imported;
# there is deliberately no ``if __name__ == "__main__":`` guard here.
demo = build_demo()
demo.launch()
human_judgement/human_judgement.csv ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ user_id,row_index,choice,timestamp
2
+ tung,0,2,2025-07-11T14:23:04.840194
3
+ tung,1,2,2025-07-11T14:23:11.475744
4
+ tung,2,2,2025-07-11T14:23:13.510468
5
+ tung,3,1,2025-07-11T14:35:31.929781
6
+ tung,4,2,2025-07-11T14:35:33.996639
7
+ tung,5,1,2025-07-11T14:35:36.862713
8
+ tung,6,2,2025-07-11T14:35:39.247847
9
+ t,0,1,2025-07-11T14:48:38.266875
10
+ 1,0,2,2025-07-11T14:49:47.794730
11
+ t,1,2,2025-07-11T14:54:17.996900
12
+ t,2,2,2025-07-11T14:54:20.045234
13
+ t,3,2,2025-07-11T14:54:21.926244
14
+ t,4,2,2025-07-11T14:54:23.190934
15
+ t,5,2,2025-07-11T14:54:24.518148
16
+ t,6,2,2025-07-11T14:54:26.607548
17
+ t,7,2,2025-07-11T14:54:28.998385
18
+ t,8,1,2025-07-11T14:54:30.680462
19
+ t,9,2,2025-07-11T14:54:32.624527
20
+ t,10,1,2025-07-11T14:54:36.288496
21
+ t,11,2,2025-07-11T14:54:46.931882
22
+ t,12,2,2025-07-11T14:54:48.409522
23
+ 1,1,2,2025-07-11T14:57:30.841905
24
+ 123,0,1,2025-07-11T14:58:59.498550
25
+ 123,1,2,2025-07-11T14:59:01.489360
26
+ 123,2,2,2025-07-11T14:59:03.171541
human_judgement/selected_samples.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9138e1765027de6639319c7acf865a1b1356a58074c632e88f1bdde54840d1be
3
+ size 91168861