rafmacalaba committed on
Commit
ef57de8
·
1 Parent(s): 9117aba

change file and update

Browse files
Files changed (1) hide show
  1. app.py +94 -138
app.py CHANGED
@@ -1,9 +1,8 @@
1
  import boto3
2
  import os
3
  import json
4
- import re
5
  import gradio as gr
6
- from typing import List, Dict, Tuple, Optional, Union, Any
7
 
8
  # โ”€โ”€ S3 CONFIG โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
9
  s3 = boto3.client(
@@ -13,18 +12,18 @@ s3 = boto3.client(
13
  region_name = os.getenv("AWS_DEFAULT_REGION", "ap-southeast-2"),
14
  )
15
 
16
- # ai4data/datause-annotation
17
- # S3 bucket and keys
18
  BUCKET = "doccano-processed"
19
- #INIT_KEY = "gradio/initial_data_train.json"
20
- INIT_KEY = "gradio/holdout_data_review.json"
21
- #VALID_PREFIX = "validated_records/"
22
- VALID_PREFIX = "holdout_data_review_output/"
23
 
24
  # โ”€โ”€ Helpers to load & save from S3 โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
25
  def load_initial_data() -> List[Dict]:
26
  obj = s3.get_object(Bucket=BUCKET, Key=INIT_KEY)
27
- return json.loads(obj['Body'].read())
 
 
 
 
28
 
29
  def load_all_validations() -> Dict[int, Dict]:
30
  records = {}
@@ -33,10 +32,10 @@ def load_all_validations() -> Dict[int, Dict]:
33
  )
34
  for page in pages:
35
  for obj in page.get("Contents", []):
36
- key = obj["Key"]
37
- idx = int(os.path.splitext(os.path.basename(key))[0])
38
- data = s3.get_object(Bucket=BUCKET, Key=key)["Body"].read()
39
- records[idx] = json.loads(data)
40
  return records
41
 
42
  def save_single_validation(idx: int, record: Dict):
@@ -47,14 +46,13 @@ def save_single_validation(idx: int, record: Dict):
47
  Body = json.dumps(record, indent=2).encode('utf-8'),
48
  ContentType = 'application/json'
49
  )
 
50
 
51
  class DynamicDataset:
52
  def __init__(self, data: List[Dict]):
53
  self.data = data
54
  self.len = len(data)
55
  self.current = 0
56
- for ex in self.data:
57
- ex.setdefault("validated", False)
58
 
59
  def example(self, idx: int) -> Dict:
60
  self.current = max(0, min(self.len - 1, idx))
@@ -87,161 +85,120 @@ class DynamicDataset:
87
  def validate(self):
88
  self.data[self.current]["validated"] = True
89
 
90
- def tokenize_text(text: str) -> List[str]:
91
- return re.findall(r"\w+(?:[-_]\w+)*|[^\s\w]", text)
92
-
93
  def prepare_for_highlight(data: Dict) -> List[Tuple[str, Optional[str]]]:
94
- tokens = data["tokenized_text"]
95
- ner = data["ner"]
96
- highlighted, curr_ent, ent_buf, norm_buf = [], None, [], []
97
- for idx, tok in enumerate(tokens):
98
- if curr_ent is None or idx > curr_ent[1]:
99
- if ent_buf:
100
- highlighted.append((" ".join(ent_buf), curr_ent[2]))
101
- ent_buf = []
102
- curr_ent = next((e for e in ner if e[0] == idx), None)
103
- if curr_ent and curr_ent[0] <= idx <= curr_ent[1]:
104
- if norm_buf:
105
- highlighted.append((" ".join(norm_buf), None))
106
- norm_buf = []
107
- ent_buf.append(tok)
108
- else:
109
- if ent_buf:
110
- highlighted.append((" ".join(ent_buf), curr_ent[2]))
111
- ent_buf = []
112
- norm_buf.append(tok)
113
- if ent_buf:
114
- highlighted.append((" ".join(ent_buf), curr_ent[2]))
115
- if norm_buf:
116
- highlighted.append((" ".join(norm_buf), None))
117
- return [(re.sub(r"\s(?=[,\.!?โ€ฆ:;])", "", txt), lbl) for txt, lbl in highlighted]
118
-
119
-
120
- def extract_tokens_and_labels(highlighted: List[Dict[str, Union[str, None]]]
121
- ) -> Tuple[List[str], List[Tuple[int,int,str]]]:
122
- tokens, ner = [], []
123
- token_idx = 0
124
-
125
  for entry in highlighted:
126
- text = entry['token']
127
- label = entry.get('class_or_confidence') or entry.get('class') or entry.get('label')
128
- # split into real tokens
129
- toks = tokenize_text(text)
130
- start = token_idx
131
- end = token_idx + len(toks) - 1
132
-
133
- tokens.extend(toks)
134
- if label:
135
- ner.append((start, end, label))
136
-
137
- token_idx = end + 1
138
-
139
- return tokens, ner
140
 
 
141
  def create_demo() -> gr.Blocks:
142
  data = load_initial_data()
143
  validated_store = load_all_validations()
144
-
145
- # mark any pre-validated examples
146
- for idx in validated_store:
147
- if 0 <= idx < len(data):
148
- data[idx]["validated"] = True
149
-
150
  dynamic_dataset = DynamicDataset(data)
151
 
152
- def make_info(rec):
153
  fn = rec.get("filename", "โ€”")
154
  pg = rec.get("page", "โ€”")
155
- # Markdown with line break for Gradio
156
- return f"**File:** `{fn}` \n**Page:** `{pg}`"
157
 
158
- def align_spans_to_tokens(
159
- highlighted: List[Dict[str, Union[str, None]]],
160
- tokens: List[str]
161
- ) -> List[Tuple[int, int, str]]:
162
- """
163
- Align each highlighted chunk to the next matching tokens in the list,
164
- advancing a pointer so repeated tokens map in the order you clicked them.
165
- """
166
- spans = []
167
- search_start = 0
168
-
169
- for entry in highlighted:
170
- text = entry["token"]
171
- label = entry.get("class_or_confidence") or entry.get("label") or entry.get("class")
172
- if not label:
173
- continue
174
-
175
- chunk_toks = tokenize_text(text)
176
- # scan only from the end of the last match
177
- for i in range(search_start, len(tokens) - len(chunk_toks) + 1):
178
- if tokens[i:i + len(chunk_toks)] == chunk_toks:
179
- spans.append((i, i + len(chunk_toks) - 1, label))
180
- search_start = i + len(chunk_toks)
181
- break
182
- else:
183
- print(f"โš ๏ธ Couldnโ€™t align chunk: {text!r}")
184
-
185
- return spans
186
-
187
- def load_example(idx):
188
  rec = validated_store.get(idx, dynamic_dataset.example(idx))
189
  segs = prepare_for_highlight(rec)
190
  return segs, rec.get("validated", False), idx, make_info(rec)
191
 
192
  def update_example(highlighted, idx: int):
193
- rec = dynamic_dataset.data[idx]
194
- # reโ€tokenize
195
- orig_tokens = tokenize_text(rec["text"])
196
- # realign highlights
197
- new_ner = align_spans_to_tokens(highlighted, orig_tokens)
198
- # overwrite & mark un-validated
199
- rec["tokenized_text"] = orig_tokens
200
- rec["ner"] = new_ner
201
- rec["validated"] = False
202
  return prepare_for_highlight(rec), rec["validated"], idx, make_info(rec)
203
 
204
  def do_validate(highlighted, idx: int):
205
- # in-memory mark
 
 
 
 
206
  dynamic_dataset.validate()
207
- rec = dynamic_dataset.data[idx]
208
- orig_tokens = tokenize_text(rec["text"])
209
- new_ner = align_spans_to_tokens(highlighted, orig_tokens)
210
- rec["tokenized_text"] = orig_tokens
211
- rec["ner"] = new_ner
212
- # persist to disk/store
213
- save_single_validation(idx, rec)
214
- return prepare_for_highlight(rec), True, make_info(rec)
215
 
216
  def nav(fn):
217
- rec = fn()
 
 
 
 
218
  segs = prepare_for_highlight(rec)
219
- return segs, rec.get("validated", False), dynamic_dataset.current, make_info(rec)
220
 
221
  with gr.Blocks() as demo:
222
- prog = gr.Slider(0, dynamic_dataset.len-1, value=0, step=1, label="Example #", interactive=False)
223
- inp_box = gr.HighlightedText(label="Sentence", interactive=True)
224
- info_md = gr.Markdown(label="Source") # โ† shows filename & page
225
- status = gr.Checkbox(label="Validated?", value=False, interactive=False)
226
-
227
- gr.Markdown(
228
- "[๐Ÿ“– Entity Tag Guide](https://huggingface.co/spaces/rafmacalaba/datause-annotation/blob/main/guidelines.md)"
229
  )
 
 
 
230
 
231
- with gr.Row():
232
- prev_btn = gr.Button("โ—€๏ธ Previous")
233
- apply_btn = gr.Button("๐Ÿ“ Apply Changes")
234
- next_btn = gr.Button("Next โ–ถ๏ธ")
235
 
 
 
 
 
236
  with gr.Row():
237
  skip_prev = gr.Button("โฎ๏ธ Prev Unvalidated")
238
  validate_btn = gr.Button("โœ… Validate")
239
  skip_next = gr.Button("โญ๏ธ Next Unvalidated")
240
 
241
- # initial load
 
 
 
 
 
242
  demo.load(load_example, inputs=prog, outputs=[inp_box, status, prog, info_md])
243
-
244
- # wire up actions (all now also update info_md)
245
  apply_btn.click(update_example, inputs=[inp_box, prog], outputs=[inp_box, status, prog, info_md])
246
  prev_btn.click(lambda: nav(dynamic_dataset.prev), inputs=None, outputs=[inp_box, status, prog, info_md])
247
  next_btn.click(lambda: nav(dynamic_dataset.next), inputs=None, outputs=[inp_box, status, prog, info_md])
@@ -252,5 +209,4 @@ def create_demo() -> gr.Blocks:
252
  return demo
253
 
254
  if __name__ == "__main__":
255
- demo = create_demo()
256
- demo.launch(share=True, inline=True, debug=True)
 
1
  import boto3
2
  import os
3
  import json
 
4
  import gradio as gr
5
+ from typing import List, Dict, Tuple, Optional, Any
6
 
7
  # โ”€โ”€ S3 CONFIG โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
8
  s3 = boto3.client(
 
12
  region_name = os.getenv("AWS_DEFAULT_REGION", "ap-southeast-2"),
13
  )
14
 
 
 
15
  BUCKET = "doccano-processed"
16
+ INIT_KEY = "gradio/ai4data-revalidate-data.json"
17
+ VALID_PREFIX = "ai4data-revalidate-data-output/"
 
 
18
 
19
  # โ”€โ”€ Helpers to load & save from S3 โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
20
def load_initial_data() -> List[Dict]:
    """Fetch the initial annotation records from S3.

    Reads the JSON object stored under ``INIT_KEY`` in ``BUCKET`` and makes
    sure every record carries a ``"validated"`` flag, defaulting it to
    ``False`` when the record does not already have one.

    Returns:
        The decoded list of record dicts, updated in place with the flag.
    """
    response = s3.get_object(Bucket=BUCKET, Key=INIT_KEY)
    records = json.loads(response["Body"].read())
    # assume ner_text spans use end-index as non-inclusive
    for record in records:
        if "validated" not in record:
            record["validated"] = False
    return records
27
 
28
  def load_all_validations() -> Dict[int, Dict]:
29
  records = {}
 
32
  )
33
  for page in pages:
34
  for obj in page.get("Contents", []):
35
+ idx = int(os.path.splitext(os.path.basename(obj["Key"]))[0])
36
+ rec = json.loads(s3.get_object(Bucket=BUCKET, Key=obj["Key"])['Body'].read())
37
+ rec.setdefault("validated", True)
38
+ records[idx] = rec
39
  return records
40
 
41
  def save_single_validation(idx: int, record: Dict):
 
46
  Body = json.dumps(record, indent=2).encode('utf-8'),
47
  ContentType = 'application/json'
48
  )
49
+ ##fckxk
50
 
51
  class DynamicDataset:
52
  def __init__(self, data: List[Dict]):
53
  self.data = data
54
  self.len = len(data)
55
  self.current = 0
 
 
56
 
57
  def example(self, idx: int) -> Dict:
58
  self.current = max(0, min(self.len - 1, idx))
 
85
  def validate(self):
86
  self.data[self.current]["validated"] = True
87
 
88
+ # ── Highlight utils using raw text (half-open intervals) ───────────────────────
 
 
89
def prepare_for_highlight(data: Dict) -> List[Tuple[str, Optional[str]]]:
    """Split a record's raw text into (segment, label) pairs for highlighting.

    Spans are read from ``data["ner_annotated"]`` when that key is present
    (even if it is an empty list), otherwise from ``data["ner_text"]``. Each
    span is ``(start, end, label)`` with character offsets into
    ``data["text"]`` and a non-inclusive ``end``.

    Args:
        data: Record dict; reads the ``text``, ``ner_annotated`` and
            ``ner_text`` keys, all of which may be missing.

    Returns:
        Segments in text order. Unlabeled stretches carry ``None`` as label.
        Concatenating the segment strings always reproduces the text exactly.
    """
    text = data.get("text", "") or ""
    if "ner_annotated" in data:
        ner_spans = data["ner_annotated"]
    else:
        ner_spans = data.get("ner_text", [])

    segments: List[Tuple[str, Optional[str]]] = []
    last_idx = 0
    text_len = len(text)
    for start, end, label in sorted(ner_spans or [], key=lambda span: span[0]):
        # Clamp to the part of the text not yet emitted so overlapping or
        # out-of-range spans can never duplicate or mis-slice characters.
        start = max(start, last_idx)
        end = min(end, text_len)
        if end <= start:
            # Nothing left for this span (empty, or fully covered already).
            continue
        if start > last_idx:
            segments.append((text[last_idx:start], None))
        # slice in [start, end) since end is non-inclusive
        segments.append((text[start:end], label))
        last_idx = end
    if last_idx < text_len:
        segments.append((text[last_idx:], None))
    return segments
104
+
105
def align_spans_to_text(highlighted: List[Dict[str, Any]], text: str) -> List[Tuple[int, int, str]]:
    """Map highlighted chunks back to character offsets in ``text``.

    Chunks are located one after another with ``str.find``, each search
    starting where the previous match ended, so repeated substrings are
    matched in the order they appear in ``highlighted``.

    Args:
        highlighted: Entries with a ``"token"`` string and an optional label
            under ``"class_or_confidence"``, ``"class"`` or ``"label"``.
        text: The raw text the chunks were taken from.

    Returns:
        ``(start, end, label)`` tuples with a non-inclusive ``end``, only for
        chunks that carry a label. Unlabeled chunks are not returned but
        still advance the search position.
    """
    spans: List[Tuple[int, int, str]] = []
    search_start = 0
    for entry in highlighted:
        chunk = entry["token"]
        if not chunk:
            # An empty chunk matches anywhere and would give a zero-length span.
            continue
        label = entry.get("class_or_confidence") or entry.get("class") or entry.get("label")
        pos = text.find(chunk, search_start)
        if pos < 0:
            print(f"⚠️ Couldn’t align chunk: {chunk!r}")
            continue
        chunk_end = pos + len(chunk)
        if label:
            # Only labeled chunks are annotations; plain text between
            # entities must not be stored as a span with a None label.
            spans.append((pos, chunk_end, label))
        search_start = chunk_end
    return spans
 
 
 
 
119
 
120
+ # ── Gradio demo ────────────────────────────────────────────────────────────────
121
  def create_demo() -> gr.Blocks:
122
  data = load_initial_data()
123
  validated_store = load_all_validations()
 
 
 
 
 
 
124
  dynamic_dataset = DynamicDataset(data)
125
 
126
def make_info(rec: Dict) -> str:
    """Build the Markdown caption describing where a record came from.

    Args:
        rec: Record dict; reads the optional ``filename``, ``page`` and
            ``segment`` keys, showing an em dash for any that are missing.

    Returns:
        A three-line Markdown string with the file, page and segment.
    """
    fn = rec.get("filename", "—")
    pg = rec.get("page", "—")
    sg = rec.get("segment", "—")
    # Two trailing spaces before "\n" are a Markdown hard line break; a bare
    # newline would let the fields run together on one rendered line.
    return f"**File:** `{fn}`  \n**Page:** `{pg}`  \n**Segment:** `{sg}`"
131
 
132
+ def load_example(idx: int):
133
+ # If thereโ€™s a validated version, show that; otherwise fall back
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  rec = validated_store.get(idx, dynamic_dataset.example(idx))
135
  segs = prepare_for_highlight(rec)
136
  return segs, rec.get("validated", False), idx, make_info(rec)
137
 
138
  def update_example(highlighted, idx: int):
139
+ # Always edit the dynamic data, not the validated copy.
140
+ rec = dynamic_dataset.data[idx]
141
+ text = rec.get("text", "")
142
+ new_spans = align_spans_to_text(highlighted, text)
143
+ # store edits as half-open
144
+ rec["ner_annotated"] = new_spans
145
+ rec["validated"] = False
 
 
146
  return prepare_for_highlight(rec), rec["validated"], idx, make_info(rec)
147
 
148
  def do_validate(highlighted, idx: int):
149
+ # Edit dynamic data first
150
+ rec = dynamic_dataset.data[idx]
151
+ text = rec.get("text", "")
152
+ new_spans = align_spans_to_text(highlighted, text)
153
+ rec["ner_annotated"] = new_spans
154
  dynamic_dataset.validate()
155
+ # Now push that validated copy to S3 and to validated_store
156
+ rec_to_save = rec.copy()
157
+ rec_to_save["validated"] = True
158
+ save_single_validation(idx, rec_to_save)
159
+ validated_store[idx] = rec_to_save
160
+ return prepare_for_highlight(rec_to_save), True, make_info(rec_to_save)
 
 
161
 
162
  def nav(fn):
163
+ # Move the index/cursor in dynamic_dataset
164
+ _ = fn()
165
+ idx = dynamic_dataset.current
166
+ # If thereโ€™s a validated version, show that; else show dynamic data
167
+ rec = validated_store.get(idx, dynamic_dataset.data[idx])
168
  segs = prepare_for_highlight(rec)
169
+ return segs, rec.get("validated", False), idx, make_info(rec)
170
 
171
  with gr.Blocks() as demo:
172
+ prog = gr.Slider(
173
+ minimum=0,
174
+ maximum=dynamic_dataset.len - 1,
175
+ value=0,
176
+ step=1,
177
+ label="Example # (slide to navigate)",
178
+ interactive=True,
179
  )
180
+ inp_box = gr.HighlightedText(label="Sentence", interactive=True)
181
+ info_md = gr.Markdown(label="Source")
182
+ status = gr.Checkbox(label="Validated?", value=False, interactive=False)
183
 
184
+ gr.Markdown("[๐Ÿ“– Entity Tag Guide](https://huggingface.co/spaces/rafmacalaba/datause-annotation/blob/main/guidelines.md)")
 
 
 
185
 
186
+ with gr.Row():
187
+ prev_btn = gr.Button("โ—€๏ธ Previous")
188
+ apply_btn = gr.Button("๐Ÿ“ Apply Changes")
189
+ next_btn = gr.Button("Next โ–ถ๏ธ")
190
  with gr.Row():
191
  skip_prev = gr.Button("โฎ๏ธ Prev Unvalidated")
192
  validate_btn = gr.Button("โœ… Validate")
193
  skip_next = gr.Button("โญ๏ธ Next Unvalidated")
194
 
195
+ # โ”€โ”€โ”€โ”€โ”€ Wiring events โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
196
+ prog.release(
197
+ fn=load_example,
198
+ inputs=[prog],
199
+ outputs=[inp_box, status, prog, info_md],
200
+ )
201
  demo.load(load_example, inputs=prog, outputs=[inp_box, status, prog, info_md])
 
 
202
  apply_btn.click(update_example, inputs=[inp_box, prog], outputs=[inp_box, status, prog, info_md])
203
  prev_btn.click(lambda: nav(dynamic_dataset.prev), inputs=None, outputs=[inp_box, status, prog, info_md])
204
  next_btn.click(lambda: nav(dynamic_dataset.next), inputs=None, outputs=[inp_box, status, prog, info_md])
 
209
  return demo
210
 
211
  if __name__ == "__main__":
212
+ create_demo().launch(share=False, debug=True)