nam pham committed on
Commit 64d96d3 · Parent: ad042b1

feat: download and upload file

Files changed (2):
  1. app.py (+136 −39)
  2. data/annotated_data.json (+0 −0)
app.py CHANGED
@@ -247,9 +247,10 @@ def merge_entities(entities):
         merged.append(current)
     return merged
 
-def annotate_text(model: GLiNER, text, labels: List[str], threshold: float, nested_ner: bool) -> Dict:
+def annotate_text(
+    model, text, labels: List[str], threshold: float, nested_ner: bool
+) -> Dict:
     labels = [label.strip() for label in labels]
-    entities = model.predict_entities(text, labels, flat_ner=not nested_ner, threshold=threshold)
     r = {
         "text": text,
         "entities": [
@@ -260,7 +261,9 @@ def annotate_text(model: GLiNER, text, labels: List[str], threshold: float, nested_ner: bool) -> Dict:
                 "end": entity["end"],
                 "score": 0,
             }
-            for entity in entities
+            for entity in model.predict_entities(
+                text, labels, flat_ner=not nested_ner, threshold=threshold
+            )
         ],
     }
     r["entities"] = merge_entities(r["entities"])
@@ -311,25 +314,23 @@ class AutoAnnotator:
         self.stat["current"] = -1 # Reset current progress
 
         # Process texts in batches
-        batch_size = 32 # Adjust based on your GPU memory
         processed_data = []
 
-        for i in range(0, len(data), batch_size):
-            batch_texts = data[i:i + batch_size]
+        for i, text in enumerate(data):
             if isinstance(prompt, list):
                 prompt_text = random.choice(prompt)
             else:
                 prompt_text = prompt
 
-            # Add prompt to each text in batch
-            batch_texts = [f"{prompt_text}\n{text}" if prompt_text else text for text in batch_texts]
+            # Add prompt to text
+            text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
 
-            # Process batch
-            batch_results = batch_annotate_text(self.model, batch_texts, labels, threshold, nested_ner)
-            processed_data.extend(batch_results)
+            # Process single text
+            result = annotate_text(self.model, text_with_prompt, labels, threshold, nested_ner)
+            processed_data.append(result)
 
             # Update progress
-            self.stat["current"] = min(i + batch_size, len(data))
+            self.stat["current"] = i + 1
 
         self.annotated_data = processed_data
         return self.annotated_data
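Note: the loop now trades the old 32-text GPU batching for one model call per text, which makes the progress counter exact rather than clamped with min(). The same pattern in isolation (names illustrative, not the app's class):

    stat = {"total": 0, "current": -1}

    def annotate_all(data, annotate_one):
        stat["total"] = len(data)
        stat["current"] = -1  # reset, mirroring self.stat["current"] = -1 above
        results = []
        for i, text in enumerate(data):
            results.append(annotate_one(text))
            stat["current"] = i + 1  # exact count; no min(i + batch_size, len(data))
        return results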
@@ -338,22 +339,93 @@ class AutoAnnotator:
 annotator = None
 sentences = []
 
+def process_text_for_gliner(text: str, max_tokens: int = 384, overlap: int = 50) -> List[str]:
+    """
+    Process text for GLiNER by splitting long texts into overlapping chunks.
+    Preserves sentence boundaries and context when possible.
+
+    Args:
+        text: The input text to process
+        max_tokens: Maximum number of tokens per chunk
+        overlap: Number of tokens to overlap between chunks
+
+    Returns:
+        List of text chunks suitable for GLiNER
+    """
+    # First split into sentences to preserve natural boundaries
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for sentence in sentences:
+        # Tokenize the sentence
+        sentence_tokens = tokenize_text(sentence)
+        sentence_length = len(sentence_tokens)
+
+        # If a single sentence is too long, split it
+        if sentence_length > max_tokens:
+            # If we have accumulated tokens, add them as a chunk
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = []
+                current_length = 0
+
+            # Split the long sentence into smaller chunks
+            start = 0
+            while start < sentence_length:
+                end = min(start + max_tokens, sentence_length)
+                chunk_tokens = sentence_tokens[start:end]
+                chunks.append(" ".join(chunk_tokens))
+                start = end - overlap if end < sentence_length else end
+
+        # If adding this sentence would exceed max_tokens, start a new chunk
+        elif current_length + sentence_length > max_tokens:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = sentence_tokens
+            current_length = sentence_length
+        else:
+            current_chunk.extend(sentence_tokens)
+            current_length += sentence_length
+
+    # Add any remaining tokens as the final chunk
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
+    return chunks
+
 def process_uploaded_file(file_obj):
     if file_obj is None:
         return "Please upload a file first!"
 
     try:
         # Read the uploaded file
-        with open(file_obj.name, 'r', encoding='utf-8') as f:
-            global sentences
-            sentences = [line.strip() for line in f if line.strip()]
+        global sentences
+        if file_obj.name.endswith('.csv'):
+            import pandas as pd
+            df = pd.read_csv(file_obj.name)
+            sentences = df['Nội dung'].dropna().tolist()
+            # Process each sentence and flatten the list
+            processed_sentences = []
+            for sentence in sentences:
+                processed_sentences.extend(process_text_for_gliner(sentence))
+            sentences = processed_sentences
+        else:
+            # Read the file content directly from the file object
+            content = file_obj.read().decode('utf-8')
+            raw_sentences = [line.strip() for line in content.splitlines() if line.strip()]
+            # Process each sentence and flatten the list
+            processed_sentences = []
+            for sentence in raw_sentences:
+                processed_sentences.extend(process_text_for_gliner(sentence))
+            sentences = processed_sentences
         return f"Successfully loaded {len(sentences)} sentences from file!"
     except Exception as e:
         return f"Error reading file: {str(e)}"
 
 def is_valid_repo_name(repo_name):
     # Hugging Face repo names must not contain slashes or spaces
-    return bool(re.match(r'^[A-Za-z0-9_.-]+$', repo_name))
+    return bool(re.match(r'^[A-Za-z0-9_./-]+$', repo_name))
 
 def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False):
     """Create a new repository on Hugging Face Hub"""
@@ -443,7 +515,7 @@ def convert_hf_dataset_to_ner_format(dataset):
 
     return converted_data
 
-def load_from_huggingface(dataset_name: str, split: str = "train"):
+def load_from_huggingface(dataset_name: str, split: str = "all"):
     """Load dataset from Hugging Face Hub"""
     try:
         dataset = load_dataset(dataset_name, split=split)
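Note: the new split="all" default is worth flagging, since most Hub datasets expose only train/validation/test splits; the default would raise unless callers always pass an explicit split, as the UI does. One hedged way to actually support "all" (the helper name is mine, not the commit's):

    from datasets import concatenate_datasets, get_dataset_split_names, load_dataset

    def load_all_splits(dataset_name: str):
        # Approximate split="all" by concatenating every declared split
        splits = get_dataset_split_names(dataset_name)
        return concatenate_datasets([load_dataset(dataset_name, split=s) for s in splits])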
@@ -797,17 +869,21 @@ with gr.Blocks() as demo:
    )
    local_status = gr.Textbox(label="Local File Status", visible=False)
 
-   dataset_name = gr.Textbox(
-       label="Hugging Face Dataset Name",
-       placeholder="Enter dataset name (e.g., conll2003)",
-       visible=False
-   )
-   dataset_split = gr.Dropdown(
-       choices=["train", "validation", "test"],
-       value="train",
-       label="Dataset Split",
-       visible=False
-   )
+   with gr.Group(visible=False) as hf_inputs:
+       with gr.Row():
+           dataset_name = gr.Textbox(
+               label="Hugging Face Dataset Name",
+               placeholder="Enter dataset name (e.g., conll2003)",
+               scale=3
+           )
+           dataset_split = gr.Dropdown(
+               choices=["train", "validation", "test"],
+               value="train",
+               label="Dataset Split",
+               scale=2
+           )
+           load_dataset_btn = gr.Button("Load Dataset", scale=1)
+       hf_status = gr.Textbox(label="Dataset Loading Status")
 
    bar = gr.Slider(
        minimum=0,
@@ -827,7 +903,7 @@ with gr.Blocks() as demo:
    save_btn = gr.Button("Save validated dataset")
 
    # Add Hugging Face upload section
-   with gr.Group():
+   with gr.Group(visible=False) as hf_upload_group:
        gr.Markdown("### Upload to Hugging Face")
        hf_repo_name = gr.Textbox(
            label="Repository Name",
@@ -846,6 +922,29 @@ with gr.Blocks() as demo:
        upload_to_hf_btn = gr.Button("Upload to Hugging Face")
        hf_upload_status = gr.Textbox(label="Upload Status")
 
+   with gr.Row():
+       show_hf_upload_btn = gr.Button("Show Upload Options")
+       hide_hf_upload_btn = gr.Button("Hide Upload Options", visible=False)
+
+   def toggle_hf_upload(show: bool):
+       return {
+           hf_upload_group: gr.update(visible=show),
+           show_hf_upload_btn: gr.update(visible=not show),
+           hide_hf_upload_btn: gr.update(visible=show)
+       }
+
+   show_hf_upload_btn.click(
+       fn=lambda: toggle_hf_upload(True),
+       inputs=None,
+       outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
+   )
+
+   hide_hf_upload_btn.click(
+       fn=lambda: toggle_hf_upload(False),
+       inputs=None,
+       outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
+   )
+
    inp_box = gr.HighlightedText(value=None, interactive=True)
 
    def toggle_local_inputs():
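Note: the show/hide pair is the standard Blocks visibility idiom: handlers return gr.update(visible=...) keyed by component, and those same components are listed in outputs. A self-contained reduction of the pattern:

    import gradio as gr

    with gr.Blocks() as demo:
        with gr.Group(visible=False) as panel:
            gr.Markdown("### Upload to Hugging Face")
        show_btn = gr.Button("Show Upload Options")
        hide_btn = gr.Button("Hide Upload Options", visible=False)

        def toggle(show: bool):
            # Dict-style returns map each component to its update
            return {
                panel: gr.update(visible=show),
                show_btn: gr.update(visible=not show),
                hide_btn: gr.update(visible=show),
            }

        show_btn.click(fn=lambda: toggle(True), inputs=None,
                       outputs=[panel, show_btn, hide_btn])
        hide_btn.click(fn=lambda: toggle(False), inputs=None,
                       outputs=[panel, show_btn, hide_btn])

    if __name__ == "__main__":
        demo.launch()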
@@ -853,8 +952,7 @@ with gr.Blocks() as demo:
            local_file: gr.update(visible=True),
            file_format: gr.update(visible=True),
            local_status: gr.update(visible=True),
-           dataset_name: gr.update(visible=False),
-           dataset_split: gr.update(visible=False)
+           hf_inputs: gr.update(visible=False)
        }
 
    def toggle_hf_inputs():
@@ -862,20 +960,19 @@ with gr.Blocks() as demo:
            local_file: gr.update(visible=False),
            file_format: gr.update(visible=False),
            local_status: gr.update(visible=False),
-           dataset_name: gr.update(visible=True),
-           dataset_split: gr.update(visible=True)
+           hf_inputs: gr.update(visible=True)
        }
 
    load_local_btn.click(
        fn=toggle_local_inputs,
        inputs=None,
-       outputs=[local_file, file_format, local_status, dataset_name, dataset_split]
+       outputs=[local_file, file_format, local_status, hf_inputs]
    )
 
    load_hf_btn.click(
        fn=toggle_hf_inputs,
        inputs=None,
-       outputs=[local_file, file_format, local_status, dataset_name, dataset_split]
+       outputs=[local_file, file_format, local_status, hf_inputs]
    )
 
    def process_and_load_local(file_obj, format):
@@ -893,13 +990,13 @@ with gr.Blocks() as demo:
    def load_hf_dataset(name, split):
        status = load_from_huggingface(name, split)
        if "Successfully" in status:
-           return load_dataset()
-       return [status], 0, 0
+           return load_dataset(), status
+       return [status], 0, 0, status
 
-   load_hf_btn.click(
+   load_dataset_btn.click(
        fn=load_hf_dataset,
        inputs=[dataset_name, dataset_split],
-       outputs=[inp_box, bar]
+       outputs=[inp_box, bar, hf_status]
    )
 
    apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
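Note: one wiring detail to watch in this hunk: load_dataset_btn.click lists three outputs, but the failure branch returns four values and the success branch returns two (load_dataset()'s result plus status). A sketch of an arity-consistent handler, assuming load_dataset() yields an (examples, progress) pair as the success path implies:

    def load_hf_dataset(name, split):
        status = load_from_huggingface(name, split)
        if "Successfully" in status:
            examples, progress = load_dataset()  # assumed (examples, progress) pair
            return examples, progress, status
        return None, 0, status  # three values, matching [inp_box, bar, hf_status]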
 
data/annotated_data.json CHANGED
The diff for this file is too large to render.