# Hugging Face Space file by rafmacalaba (commit b3d566f, "change dataset").
import boto3
import os
import json
import gradio as gr
from typing import List, Dict, Tuple, Optional, Any
# ── S3 CONFIG ─────────────────────────────────────────────────────────────────
# Where the data lives: the initial dataset is a single JSON object, and each
# validated record is written as its own "<index>.json" object under
# VALID_PREFIX.
BUCKET = "doccano-processed"
INIT_KEY = "gradio/ai4data-datause-alldata.json"
VALID_PREFIX = "ai4data-alldata-output/"

# Module-wide S3 client. Credentials are read from the environment; the region
# falls back to ap-southeast-2 when AWS_DEFAULT_REGION is not set.
s3 = boto3.client(
    "s3",
    region_name=os.getenv("AWS_DEFAULT_REGION", "ap-southeast-2"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)
# ── Helpers to load & save from S3 ────────────────────────────────────────────
def load_initial_data() -> List[Dict]:
    """Download the initial dataset from S3 and return it as a list of records.

    The object at ``INIT_KEY`` in ``BUCKET`` is parsed as JSON. Every record
    that has no ``"validated"`` key yet gets one, set to ``False``.

    Returns:
        The parsed records, each carrying a ``"validated"`` flag.
    """
    response = s3.get_object(Bucket=BUCKET, Key=INIT_KEY)
    payload = response["Body"].read()
    records = json.loads(payload)
    # NOTE: ner_text spans are assumed to be half-open, i.e. the end index is
    # non-inclusive.
    for record in records:
        record.setdefault("validated", False)
    return records
def load_all_validations() -> Dict[int, Dict]:
    """Load every previously validated record stored under ``VALID_PREFIX``.

    Objects are expected to be named ``<index>.json``; the integer index is
    taken from the file name. Keys whose base name is not an integer (for
    example a zero-byte "folder" placeholder equal to the prefix itself, or an
    unrelated file dropped into the prefix) are skipped instead of aborting
    the whole load with a ``ValueError``.

    Returns:
        Mapping from record index to the stored record. Records that lack a
        ``"validated"`` key get it set to ``True``.
    """
    records: Dict[int, Dict] = {}
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=BUCKET, Prefix=VALID_PREFIX):
        # "Contents" is absent from the page when nothing matches the prefix.
        for obj in page.get("Contents", []):
            key = obj["Key"]
            stem = os.path.splitext(os.path.basename(key))[0]
            try:
                idx = int(stem)
            except ValueError:
                # Not an "<index>.json" object; it cannot be mapped to a record.
                continue
            body = s3.get_object(Bucket=BUCKET, Key=key)["Body"].read()
            rec = json.loads(body)
            rec.setdefault("validated", True)
            records[idx] = rec
    return records
def save_single_validation(idx: int, record: Dict):
    """Write one validated record to S3 as ``<VALID_PREFIX><idx>.json``.

    Args:
        idx: Index of the record; used as the object's file name.
        record: The record to serialise (pretty-printed JSON, UTF-8 encoded).
    """
    key = f"{VALID_PREFIX}{idx}.json"
    payload = json.dumps(record, indent=2).encode("utf-8")
    s3.put_object(
        Bucket=BUCKET,
        Key=key,
        Body=payload,
        ContentType="application/json",
    )
# ── Dataset cursor ────────────────────────────────────────────────────────────
class DynamicDataset:
    """A list of annotation records together with a movable cursor.

    Attributes are plain and public: ``data`` is the list passed in (not
    copied), ``len`` is its length at construction time, and ``current`` is
    the index of the record the cursor points at.
    """

    def __init__(self, data: List[Dict]):
        self.data = data
        self.len = len(data)
        self.current = 0

    def _seek_unvalidated(self, candidates) -> Dict:
        # Move the cursor to the first candidate index whose record is not
        # validated; leave it untouched when there is none.
        hit = next(
            (pos for pos in candidates if not self.data[pos]["validated"]),
            None,
        )
        if hit is not None:
            self.current = hit
        return self.data[self.current]

    def example(self, idx: int) -> Dict:
        """Point the cursor at ``idx`` (clamped into range) and return that record."""
        upper = self.len - 1
        self.current = max(0, min(upper, idx))
        return self.data[self.current]

    def next(self) -> Dict:
        """Advance one record unless already at the last one; return the record."""
        has_following = self.current < self.len - 1
        if has_following:
            self.current = self.current + 1
        return self.data[self.current]

    def prev(self) -> Dict:
        """Step back one record unless already at the first one; return the record."""
        has_preceding = self.current > 0
        if has_preceding:
            self.current = self.current - 1
        return self.data[self.current]

    def jump_next_unvalidated(self) -> Dict:
        """Jump forward to the nearest unvalidated record, if there is one."""
        return self._seek_unvalidated(range(self.current + 1, self.len))

    def jump_prev_unvalidated(self) -> Dict:
        """Jump backward to the nearest unvalidated record, if there is one."""
        return self._seek_unvalidated(range(self.current - 1, -1, -1))

    def validate(self):
        """Flag the record under the cursor as validated."""
        self.data[self.current]["validated"] = True
# ── Highlight utils using raw text (half-open intervals) ──────────────────────
def prepare_for_highlight(data: Dict) -> List[Tuple[str, Optional[str]]]:
    """Split a record's text into ``(chunk, label)`` pairs for highlighting.

    Spans come from ``data["ner_annotated"]`` when that key exists, otherwise
    from ``data["ner_text"]``. Each span is ``(start, end, label)`` with a
    non-inclusive ``end``. Text not covered by any span is emitted with a
    ``None`` label.

    Args:
        data: Record holding ``"text"`` and, optionally, the span lists.

    Returns:
        Chunks in order of span start, labelled or ``None``.
    """
    raw = data.get("text", "")
    fallback = data.get("ner_text", [])
    spans = data.get("ner_annotated", fallback)
    ordered = sorted(spans, key=lambda span: span[0])

    pieces: List[Tuple[str, Optional[str]]] = []
    cursor = 0
    for begin, stop, tag in ordered:
        # Emit the unlabelled gap that precedes this span, if any.
        if cursor < begin:
            pieces.append((raw[cursor:begin], None))
        # Half-open slice: `stop` itself is not part of the span.
        pieces.append((raw[begin:stop], tag))
        cursor = stop
    # Trailing unlabelled text after the last span.
    if cursor < len(raw):
        pieces.append((raw[cursor:], None))
    return pieces
def align_spans_to_text(highlighted: List[Dict[str, Any]], text: str) -> List[Tuple[int, int, str]]:
    """Map highlighted chunks back to labelled character spans in ``text``.

    Chunks are located left to right with ``str.find``, each search starting
    where the previous match ended, so repeated tokens resolve to successive
    occurrences. Every matched chunk advances the search position, but only
    chunks that carry a label are returned: unlabelled filler text is not an
    entity and must not be stored as a span with a ``None`` label.

    Args:
        highlighted: Entries with a ``"token"`` string and a label under
            ``"class_or_confidence"``, ``"class"`` or ``"label"`` (first
            truthy one wins).
        text: The raw text the chunks were cut from.

    Returns:
        ``(start, end, label)`` tuples with a non-inclusive ``end``, in the
        order the labelled chunks appear. Chunks that cannot be found are
        reported on stdout and skipped.
    """
    spans: List[Tuple[int, int, str]] = []
    search_start = 0
    for entry in highlighted:
        chunk = entry["token"]
        label = entry.get("class_or_confidence") or entry.get("class") or entry.get("label")
        pos = text.find(chunk, search_start)
        if pos < 0:
            print(f"⚠️ Couldn’t align chunk: {chunk!r}")
            continue
        end = pos + len(chunk)
        # Always move past the chunk so later duplicates match further right.
        search_start = end
        if label:
            spans.append((pos, end, label))
    return spans
# ── Gradio demo ───────────────────────────────────────────────────────────────
def create_demo() -> gr.Blocks:
    """Build the Gradio annotation app.

    Loads the initial dataset and all stored validations from S3, then wires
    a slider, an editable highlighted-text box and navigation/validation
    buttons around a ``DynamicDataset`` cursor.

    Whenever a validated copy of a record exists in the validation store, that
    copy is what gets displayed; edits are always applied to the in-memory
    dataset record.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    data = load_initial_data()
    validated_store = load_all_validations()
    dynamic_dataset = DynamicDataset(data)

    # Carry stored validations over to the in-memory flags; the
    # "Prev/Next Unvalidated" jumps look only at those flags and would
    # otherwise stop on records that were validated in an earlier run.
    for stored_idx, stored_rec in validated_store.items():
        if 0 <= stored_idx < dynamic_dataset.len and stored_rec.get("validated", True):
            data[stored_idx]["validated"] = True

    def make_info(rec: Dict) -> str:
        fn = rec.get("filename", "—")
        pg = rec.get("page", "—")
        sg = rec.get("segment", "—")
        # Two trailing spaces before "\n" force a Markdown line break.
        return f"**File:** `{fn}`  \n**Page:** `{pg}`  \n**Segment:** `{sg}`"

    def view(idx: int):
        # If there's a validated version, show that; otherwise fall back to
        # the in-memory record.
        rec = validated_store.get(idx, dynamic_dataset.data[idx])
        return prepare_for_highlight(rec), rec.get("validated", False), idx, make_info(rec)

    def load_example(idx: int):
        # example() clamps the index and moves the cursor to it.
        dynamic_dataset.example(int(idx))
        return view(dynamic_dataset.current)

    def update_example(highlighted, idx: int):
        # Always edit the dynamic data, not the validated copy.
        idx = int(idx)
        rec = dynamic_dataset.data[idx]
        text = rec.get("text", "")
        # Store edits as half-open spans.
        rec["ner_annotated"] = align_spans_to_text(highlighted, text)
        rec["validated"] = False
        return prepare_for_highlight(rec), rec["validated"], idx, make_info(rec)

    def do_validate(highlighted, idx: int):
        # Apply the on-screen edits to the dynamic record first.
        idx = int(idx)
        rec = dynamic_dataset.data[idx]
        text = rec.get("text", "")
        rec["ner_annotated"] = align_spans_to_text(highlighted, text)
        rec_to_save = rec.copy()
        rec_to_save["validated"] = True
        # Persist before flagging anything: if the upload raises, the record
        # must not look validated in memory.
        save_single_validation(idx, rec_to_save)
        # Flag the record that was actually validated (`idx`), not whatever
        # the cursor happens to point at.
        rec["validated"] = True
        validated_store[idx] = rec_to_save
        return prepare_for_highlight(rec_to_save), True, make_info(rec_to_save)

    def nav(fn, idx: int):
        # Start from the example shown on the slider, then move the cursor.
        dynamic_dataset.example(int(idx))
        fn()
        return view(dynamic_dataset.current)

    with gr.Blocks() as demo:
        prog = gr.Slider(
            minimum=0,
            maximum=dynamic_dataset.len - 1,
            value=0,
            step=1,
            label="Example # (slide to navigate)",
            interactive=True,
        )
        inp_box = gr.HighlightedText(label="Sentence", interactive=True)
        info_md = gr.Markdown(label="Source")
        status = gr.Checkbox(label="Validated?", value=False, interactive=False)
        gr.Markdown("[📖 Entity Tag Guide](https://huggingface.co/spaces/rafmacalaba/datause-annotation/blob/main/guidelines.md)")
        with gr.Row():
            prev_btn = gr.Button("◀️ Previous")
            apply_btn = gr.Button("📝 Apply Changes")
            next_btn = gr.Button("Next ▶️")
        with gr.Row():
            skip_prev = gr.Button("⏮️ Prev Unvalidated")
            validate_btn = gr.Button("✅ Validate")
            skip_next = gr.Button("⏭️ Next Unvalidated")

        # ───── Wiring events ─────────────────────────────────────────────────
        full_view = [inp_box, status, prog, info_md]
        prog.release(fn=load_example, inputs=[prog], outputs=full_view)
        demo.load(load_example, inputs=prog, outputs=full_view)
        apply_btn.click(update_example, inputs=[inp_box, prog], outputs=full_view)
        prev_btn.click(lambda idx: nav(dynamic_dataset.prev, idx), inputs=[prog], outputs=full_view)
        next_btn.click(lambda idx: nav(dynamic_dataset.next, idx), inputs=[prog], outputs=full_view)
        skip_prev.click(lambda idx: nav(dynamic_dataset.jump_prev_unvalidated, idx), inputs=[prog], outputs=full_view)
        skip_next.click(lambda idx: nav(dynamic_dataset.jump_next_unvalidated, idx), inputs=[prog], outputs=full_view)
        validate_btn.click(do_validate, inputs=[inp_box, prog], outputs=[inp_box, status, info_md])
    return demo
if __name__ == "__main__":
    # Script entry point: build the app and serve it without a public share
    # link, with Gradio's debug mode switched on.
    app = create_demo()
    app.launch(share=False, debug=True)