nam pham committed on
Commit 64d96d3 · Parent: ad042b1

feat: download and upload file

Files changed (2):
  1. app.py (+136 −39)
  2. data/annotated_data.json (+0 −0)
app.py CHANGED
@@ -247,9 +247,10 @@ def merge_entities(entities):
         merged.append(current)
     return merged
 
-def annotate_text(model: GLiNER, text, labels: List[str], threshold: float, nested_ner: bool) -> Dict:
+def annotate_text(
+    model, text, labels: List[str], threshold: float, nested_ner: bool
+) -> Dict:
     labels = [label.strip() for label in labels]
-    entities = model.predict_entities(text, labels, flat_ner=not nested_ner, threshold=threshold)
     r = {
         "text": text,
         "entities": [
@@ -260,7 +261,9 @@ def annotate_text(model: GLiNER, text, labels: List[str], threshold: float, nested_ner: bool) -> Dict:
                 "end": entity["end"],
                 "score": 0,
             }
-            for entity in entities
+            for entity in model.predict_entities(
+                text, labels, flat_ner=not nested_ner, threshold=threshold
+            )
         ],
     }
     r["entities"] = merge_entities(r["entities"])
@@ -311,25 +314,23 @@ class AutoAnnotator:
         self.stat["current"] = -1 # Reset current progress
 
         # Process texts in batches
-        batch_size = 32 # Adjust based on your GPU memory
         processed_data = []
 
-        for i in range(0, len(data), batch_size):
-            batch_texts = data[i:i + batch_size]
+        for i, text in enumerate(data):
             if isinstance(prompt, list):
                 prompt_text = random.choice(prompt)
             else:
                 prompt_text = prompt
 
-            # Add prompt to each text in batch
-            batch_texts = [f"{prompt_text}\n{text}" if prompt_text else text for text in batch_texts]
+            # Add prompt to text
+            text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
 
-            # Process batch
-            batch_results = batch_annotate_text(self.model, batch_texts, labels, threshold, nested_ner)
-            processed_data.extend(batch_results)
+            # Process single text
+            result = annotate_text(self.model, text_with_prompt, labels, threshold, nested_ner)
+            processed_data.append(result)
 
             # Update progress
-            self.stat["current"] = min(i + batch_size, len(data))
+            self.stat["current"] = i + 1
 
         self.annotated_data = processed_data
         return self.annotated_data
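Note: the loop now trades the old 32-text GPU batching for one model call per text, which makes the progress counter exact rather than clamped with min(). The same pattern in isolation (names illustrative, not the app's class):

    stat = {"total": 0, "current": -1}

    def annotate_all(data, annotate_one):
        stat["total"] = len(data)
        stat["current"] = -1  # reset, mirroring self.stat["current"] = -1 above
        results = []
        for i, text in enumerate(data):
            results.append(annotate_one(text))
            stat["current"] = i + 1  # exact count; no min(i + batch_size, len(data))
        return results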
@@ -338,22 +339,93 @@ class AutoAnnotator:
 annotator = None
 sentences = []
 
+def process_text_for_gliner(text: str, max_tokens: int = 384, overlap: int = 50) -> List[str]:
+    """
+    Process text for GLiNER by splitting long texts into overlapping chunks.
+    Preserves sentence boundaries and context when possible.
+
+    Args:
+        text: The input text to process
+        max_tokens: Maximum number of tokens per chunk
+        overlap: Number of tokens to overlap between chunks
+
+    Returns:
+        List of text chunks suitable for GLiNER
+    """
+    # First split into sentences to preserve natural boundaries
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for sentence in sentences:
+        # Tokenize the sentence
+        sentence_tokens = tokenize_text(sentence)
+        sentence_length = len(sentence_tokens)
+
+        # If a single sentence is too long, split it
+        if sentence_length > max_tokens:
+            # If we have accumulated tokens, add them as a chunk
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = []
+                current_length = 0
+
+            # Split the long sentence into smaller chunks
+            start = 0
+            while start < sentence_length:
+                end = min(start + max_tokens, sentence_length)
+                chunk_tokens = sentence_tokens[start:end]
+                chunks.append(" ".join(chunk_tokens))
+                start = end - overlap if end < sentence_length else end
+
+        # If adding this sentence would exceed max_tokens, start a new chunk
+        elif current_length + sentence_length > max_tokens:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = sentence_tokens
+            current_length = sentence_length
+        else:
+            current_chunk.extend(sentence_tokens)
+            current_length += sentence_length
+
+    # Add any remaining tokens as the final chunk
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
+    return chunks
+
 def process_uploaded_file(file_obj):
     if file_obj is None:
         return "Please upload a file first!"
 
     try:
         # Read the uploaded file
-        with open(file_obj.name, 'r', encoding='utf-8') as f:
-            global sentences
-            sentences = [line.strip() for line in f if line.strip()]
+        global sentences
+        if file_obj.name.endswith('.csv'):
+            import pandas as pd
+            df = pd.read_csv(file_obj.name)
+            sentences = df['Nội dung'].dropna().tolist()
+            # Process each sentence and flatten the list
+            processed_sentences = []
+            for sentence in sentences:
+                processed_sentences.extend(process_text_for_gliner(sentence))
+            sentences = processed_sentences
+        else:
+            # Read the file content directly from the file object
+            content = file_obj.read().decode('utf-8')
+            raw_sentences = [line.strip() for line in content.splitlines() if line.strip()]
+            # Process each sentence and flatten the list
+            processed_sentences = []
+            for sentence in raw_sentences:
+                processed_sentences.extend(process_text_for_gliner(sentence))
+            sentences = processed_sentences
         return f"Successfully loaded {len(sentences)} sentences from file!"
     except Exception as e:
         return f"Error reading file: {str(e)}"
 
 def is_valid_repo_name(repo_name):
     # Hugging Face repo names must not contain slashes or spaces
-    return bool(re.match(r'^[A-Za-z0-9_.-]+$', repo_name))
+    return bool(re.match(r'^[A-Za-z0-9_./-]+$', repo_name))
 
 def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False):
     """Create a new repository on Hugging Face Hub"""
@@ -443,7 +515,7 @@ def convert_hf_dataset_to_ner_format(dataset):
 
     return converted_data
 
-def load_from_huggingface(dataset_name: str, split: str = "train"):
+def load_from_huggingface(dataset_name: str, split: str = "all"):
     """Load dataset from Hugging Face Hub"""
     try:
         dataset = load_dataset(dataset_name, split=split)
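Note: the new split="all" default is worth flagging, since most Hub datasets expose only train/validation/test splits; the default would raise unless callers always pass an explicit split, as the UI does. One hedged way to actually support "all" (the helper name is mine, not the commit's):

    from datasets import concatenate_datasets, get_dataset_split_names, load_dataset

    def load_all_splits(dataset_name: str):
        # Approximate split="all" by concatenating every declared split
        splits = get_dataset_split_names(dataset_name)
        return concatenate_datasets([load_dataset(dataset_name, split=s) for s in splits])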
@@ -797,17 +869,21 @@ with gr.Blocks() as demo:
    )
    local_status = gr.Textbox(label="Local File Status", visible=False)
 
-   dataset_name = gr.Textbox(
-       label="Hugging Face Dataset Name",
-       placeholder="Enter dataset name (e.g., conll2003)",
-       visible=False
-   )
-   dataset_split = gr.Dropdown(
-       choices=["train", "validation", "test"],
-       value="train",
-       label="Dataset Split",
-       visible=False
-   )
+   with gr.Group(visible=False) as hf_inputs:
+       with gr.Row():
+           dataset_name = gr.Textbox(
+               label="Hugging Face Dataset Name",
+               placeholder="Enter dataset name (e.g., conll2003)",
+               scale=3
+           )
+           dataset_split = gr.Dropdown(
+               choices=["train", "validation", "test"],
+               value="train",
+               label="Dataset Split",
+               scale=2
+           )
+           load_dataset_btn = gr.Button("Load Dataset", scale=1)
+       hf_status = gr.Textbox(label="Dataset Loading Status")
 
    bar = gr.Slider(
        minimum=0,
@@ -827,7 +903,7 @@ with gr.Blocks() as demo:
    save_btn = gr.Button("Save validated dataset")
 
    # Add Hugging Face upload section
-   with gr.Group():
+   with gr.Group(visible=False) as hf_upload_group:
        gr.Markdown("### Upload to Hugging Face")
        hf_repo_name = gr.Textbox(
            label="Repository Name",
@@ -846,6 +922,29 @@ with gr.Blocks() as demo:
        upload_to_hf_btn = gr.Button("Upload to Hugging Face")
        hf_upload_status = gr.Textbox(label="Upload Status")
 
+   with gr.Row():
+       show_hf_upload_btn = gr.Button("Show Upload Options")
+       hide_hf_upload_btn = gr.Button("Hide Upload Options", visible=False)
+
+   def toggle_hf_upload(show: bool):
+       return {
+           hf_upload_group: gr.update(visible=show),
+           show_hf_upload_btn: gr.update(visible=not show),
+           hide_hf_upload_btn: gr.update(visible=show)
+       }
+
+   show_hf_upload_btn.click(
+       fn=lambda: toggle_hf_upload(True),
+       inputs=None,
+       outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
+   )
+
+   hide_hf_upload_btn.click(
+       fn=lambda: toggle_hf_upload(False),
+       inputs=None,
+       outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
+   )
+
    inp_box = gr.HighlightedText(value=None, interactive=True)
 
    def toggle_local_inputs():
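Note: the show/hide pair is the standard Blocks visibility idiom: handlers return gr.update(visible=...) keyed by component, and those same components are listed in outputs. A self-contained reduction of the pattern:

    import gradio as gr

    with gr.Blocks() as demo:
        with gr.Group(visible=False) as panel:
            gr.Markdown("### Upload to Hugging Face")
        show_btn = gr.Button("Show Upload Options")
        hide_btn = gr.Button("Hide Upload Options", visible=False)

        def toggle(show: bool):
            # Dict-style returns map each component to its update
            return {
                panel: gr.update(visible=show),
                show_btn: gr.update(visible=not show),
                hide_btn: gr.update(visible=show),
            }

        show_btn.click(fn=lambda: toggle(True), inputs=None,
                       outputs=[panel, show_btn, hide_btn])
        hide_btn.click(fn=lambda: toggle(False), inputs=None,
                       outputs=[panel, show_btn, hide_btn])

    if __name__ == "__main__":
        demo.launch()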
@@ -853,8 +952,7 @@ with gr.Blocks() as demo:
            local_file: gr.update(visible=True),
            file_format: gr.update(visible=True),
            local_status: gr.update(visible=True),
-           dataset_name: gr.update(visible=False),
-           dataset_split: gr.update(visible=False)
+           hf_inputs: gr.update(visible=False)
        }
 
    def toggle_hf_inputs():
@@ -862,20 +960,19 @@ with gr.Blocks() as demo:
            local_file: gr.update(visible=False),
            file_format: gr.update(visible=False),
            local_status: gr.update(visible=False),
-           dataset_name: gr.update(visible=True),
-           dataset_split: gr.update(visible=True)
+           hf_inputs: gr.update(visible=True)
        }
 
    load_local_btn.click(
        fn=toggle_local_inputs,
        inputs=None,
-       outputs=[local_file, file_format, local_status, dataset_name, dataset_split]
+       outputs=[local_file, file_format, local_status, hf_inputs]
    )
 
    load_hf_btn.click(
        fn=toggle_hf_inputs,
        inputs=None,
-       outputs=[local_file, file_format, local_status, dataset_name, dataset_split]
+       outputs=[local_file, file_format, local_status, hf_inputs]
    )
 
    def process_and_load_local(file_obj, format):
@@ -893,13 +990,13 @@ with gr.Blocks() as demo:
    def load_hf_dataset(name, split):
        status = load_from_huggingface(name, split)
        if "Successfully" in status:
-           return load_dataset()
-       return [status], 0, 0
+           return load_dataset(), status
+       return [status], 0, 0, status
 
-   load_hf_btn.click(
+   load_dataset_btn.click(
        fn=load_hf_dataset,
        inputs=[dataset_name, dataset_split],
-       outputs=[inp_box, bar]
+       outputs=[inp_box, bar, hf_status]
    )
 
    apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
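Note: one wiring detail to watch in this hunk: load_dataset_btn.click lists three outputs, but the failure branch returns four values and the success branch returns two (load_dataset()'s result plus status). A sketch of an arity-consistent handler, assuming load_dataset() yields an (examples, progress) pair as the success path implies:

    def load_hf_dataset(name, split):
        status = load_from_huggingface(name, split)
        if "Successfully" in status:
            examples, progress = load_dataset()  # assumed (examples, progress) pair
            return examples, progress, status
        return None, 0, status  # three values, matching [inp_box, bar, hf_status]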
 
data/annotated_data.json CHANGED
The diff for this file is too large to render.