nam pham committed
Commit ffa19f8 · 1 Parent(s): 345f1ee

fix: upload to huggingface

Files changed (2)
  1. app.py +30 -15
  2. data/annotated_data.json +0 -0
app.py CHANGED
@@ -295,10 +295,16 @@ def batch_annotate_text(model: GLiNER, texts: List[str], labels: List[str], thre

 class AutoAnnotator:
     def __init__(
-        self, model: str = "knowledgator/gliner-multitask-large-v0.5",
-        device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+        self, model: str = "BookingCare/gliner-multi-healthcare",
+        # device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+        device = torch.device('cpu')
     ) -> None:

+        # Set PyTorch memory management settings
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+
         self.model = GLiNER.from_pretrained(model).to(device)
         self.annotated_data = []
         self.stat = {
@@ -315,22 +321,31 @@ class AutoAnnotator:

         # Process texts in batches
         processed_data = []
+        batch_size = 8  # Reduced batch size to prevent OOM errors

-        for i, text in enumerate(data):
-            if isinstance(prompt, list):
-                prompt_text = random.choice(prompt)
-            else:
-                prompt_text = prompt
+        for i in range(0, len(data), batch_size):
+            batch_texts = data[i:i + batch_size]
+            batch_with_prompts = []
+
+            # Add prompts to batch texts
+            for text in batch_texts:
+                if isinstance(prompt, list):
+                    prompt_text = random.choice(prompt)
+                else:
+                    prompt_text = prompt
+                text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
+                batch_with_prompts.append(text_with_prompt)

-            # Add prompt to text
-            text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
+            # Process batch
+            batch_results = batch_annotate_text(self.model, batch_with_prompts, labels, threshold, nested_ner)
+            processed_data.extend(batch_results)

-            # Process single text
-            result = annotate_text(self.model, text_with_prompt, labels, threshold, nested_ner)
-            processed_data.append(result)
+            # Clear CUDA cache after each batch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

             # Update progress
-            self.stat["current"] = i + 1
+            self.stat["current"] = min(i + batch_size, len(data))

         self.annotated_data = processed_data
         return self.annotated_data
@@ -339,7 +354,7 @@ class AutoAnnotator:
 annotator = None
 sentences = []

-def process_text_for_gliner(text: str, max_tokens: int = 384, overlap: int = 50) -> List[str]:
+def process_text_for_gliner(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
     """
     Process text for GLiNER by splitting long texts into overlapping chunks.
     Preserves sentence boundaries and context when possible.
@@ -442,7 +457,7 @@ def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = F
             exist_ok=True,
             token=HF_TOKEN
         )
-        return repo_id
+        return repo_name
     except Exception as e:
         raise Exception(f"Error creating repository: {str(e)}")

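For context, the loop added to AutoAnnotator processes the input in slices of 8 texts, clears the CUDA cache between slices, and caps the progress counter at the dataset size. The sketch below isolates that pattern; annotate_batch_stub and run_batched are illustrative names standing in for the real batch_annotate_text call, not part of app.py.

```python
# Minimal, self-contained sketch of the batched annotation loop (assumed names).
from typing import Dict, List

import torch


def annotate_batch_stub(texts: List[str]) -> List[Dict]:
    # Hypothetical stand-in for batch_annotate_text(model, texts, labels, threshold, nested_ner)
    return [{"text": t, "entities": []} for t in texts]


def run_batched(data: List[str], batch_size: int = 8) -> List[Dict]:
    processed: List[Dict] = []
    stat = {"current": 0, "total": len(data)}
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        processed.extend(annotate_batch_stub(batch))
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # release cached GPU blocks between batches
        stat["current"] = min(i + batch_size, len(data))  # progress never exceeds total
    return processed
```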
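The commit also tightens the chunking defaults of process_text_for_gliner from 384/50 to 256/32 tokens. Its body is not shown in this diff; the following is only a rough sketch of overlapping chunking with those defaults, splitting on whitespace tokens instead of respecting sentence boundaries as the real function does.

```python
# Simplified overlapping chunking with the new defaults (assumption: whitespace tokens).
from typing import List


def chunk_text(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
    tokens = text.split()
    if len(tokens) <= max_tokens:
        return [text]
    chunks: List[str] = []
    step = max_tokens - overlap  # each new chunk re-reads the last `overlap` tokens
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + max_tokens]))
        if start + max_tokens >= len(tokens):
            break
    return chunks
```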
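Finally, create_hf_repo now returns the caller-supplied repo_name instead of repo_id. A plausible shape of the fixed function is sketched below; only the signature, the exist_ok=True and token=HF_TOKEN arguments, the return value, and the except clause are visible in the diff, so the call to huggingface_hub.create_repo and the HF_TOKEN lookup are assumptions.

```python
# Hedged sketch of the fixed create_hf_repo (create_repo call and token source assumed).
import os

from huggingface_hub import create_repo

HF_TOKEN = os.environ.get("HF_TOKEN")


def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False) -> str:
    try:
        create_repo(
            repo_id=repo_name,
            repo_type=repo_type,
            private=private,
            exist_ok=True,
            token=HF_TOKEN,
        )
        return repo_name  # the commit's fix: return the known name on success
    except Exception as e:
        raise Exception(f"Error creating repository: {str(e)}")
```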
data/annotated_data.json CHANGED
The diff for this file is too large to render. See raw diff