nam pham committed
Commit ffa19f8 · 1 Parent(s): 345f1ee

fix: upload to huggingface

Files changed (2)
  1. app.py +30 -15
  2. data/annotated_data.json +0 -0
app.py CHANGED
@@ -295,10 +295,16 @@ def batch_annotate_text(model: GLiNER, texts: List[str], labels: List[str], thre

 class AutoAnnotator:
     def __init__(
-        self, model: str = "knowledgator/gliner-multitask-large-v0.5",
-        device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+        self, model: str = "BookingCare/gliner-multi-healthcare",
+        # device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+        device = torch.device('cpu')
     ) -> None:

+        # Set PyTorch memory management settings
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+
         self.model = GLiNER.from_pretrained(model).to(device)
         self.annotated_data = []
         self.stat = {
@@ -315,22 +321,31 @@ class AutoAnnotator:

         # Process texts in batches
         processed_data = []
+        batch_size = 8  # Reduced batch size to prevent OOM errors

-        for i, text in enumerate(data):
-            if isinstance(prompt, list):
-                prompt_text = random.choice(prompt)
-            else:
-                prompt_text = prompt
+        for i in range(0, len(data), batch_size):
+            batch_texts = data[i:i + batch_size]
+            batch_with_prompts = []
+
+            # Add prompts to batch texts
+            for text in batch_texts:
+                if isinstance(prompt, list):
+                    prompt_text = random.choice(prompt)
+                else:
+                    prompt_text = prompt
+                text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
+                batch_with_prompts.append(text_with_prompt)

-            # Add prompt to text
-            text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
+            # Process batch
+            batch_results = batch_annotate_text(self.model, batch_with_prompts, labels, threshold, nested_ner)
+            processed_data.extend(batch_results)

-            # Process single text
-            result = annotate_text(self.model, text_with_prompt, labels, threshold, nested_ner)
-            processed_data.append(result)
+            # Clear CUDA cache after each batch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

             # Update progress
-            self.stat["current"] = i + 1
+            self.stat["current"] = min(i + batch_size, len(data))

         self.annotated_data = processed_data
         return self.annotated_data
@@ -339,7 +354,7 @@ class AutoAnnotator:
 annotator = None
 sentences = []

-def process_text_for_gliner(text: str, max_tokens: int = 384, overlap: int = 50) -> List[str]:
+def process_text_for_gliner(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
     """
     Process text for GLiNER by splitting long texts into overlapping chunks.
     Preserves sentence boundaries and context when possible.
@@ -442,7 +457,7 @@ def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = F
             exist_ok=True,
             token=HF_TOKEN
         )
-        return repo_id
+        return repo_name
     except Exception as e:
         raise Exception(f"Error creating repository: {str(e)}")

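For context, the loop added to AutoAnnotator processes the input in slices of 8 texts, clears the CUDA cache between slices, and caps the progress counter at the dataset size. The sketch below isolates that pattern; annotate_batch_stub and run_batched are illustrative names standing in for the real batch_annotate_text call, not part of app.py.

```python
# Minimal, self-contained sketch of the batched annotation loop (assumed names).
from typing import Dict, List

import torch


def annotate_batch_stub(texts: List[str]) -> List[Dict]:
    # Hypothetical stand-in for batch_annotate_text(model, texts, labels, threshold, nested_ner)
    return [{"text": t, "entities": []} for t in texts]


def run_batched(data: List[str], batch_size: int = 8) -> List[Dict]:
    processed: List[Dict] = []
    stat = {"current": 0, "total": len(data)}
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        processed.extend(annotate_batch_stub(batch))
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # release cached GPU blocks between batches
        stat["current"] = min(i + batch_size, len(data))  # progress never exceeds total
    return processed
```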
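The commit also tightens the chunking defaults of process_text_for_gliner from 384/50 to 256/32 tokens. Its body is not shown in this diff; the following is only a rough sketch of overlapping chunking with those defaults, splitting on whitespace tokens instead of respecting sentence boundaries as the real function does.

```python
# Simplified overlapping chunking with the new defaults (assumption: whitespace tokens).
from typing import List


def chunk_text(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
    tokens = text.split()
    if len(tokens) <= max_tokens:
        return [text]
    chunks: List[str] = []
    step = max_tokens - overlap  # each new chunk re-reads the last `overlap` tokens
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + max_tokens]))
        if start + max_tokens >= len(tokens):
            break
    return chunks
```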
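Finally, create_hf_repo now returns the caller-supplied repo_name instead of repo_id. A plausible shape of the fixed function is sketched below; only the signature, the exist_ok=True and token=HF_TOKEN arguments, the return value, and the except clause are visible in the diff, so the call to huggingface_hub.create_repo and the HF_TOKEN lookup are assumptions.

```python
# Hedged sketch of the fixed create_hf_repo (create_repo call and token source assumed).
import os

from huggingface_hub import create_repo

HF_TOKEN = os.environ.get("HF_TOKEN")


def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False) -> str:
    try:
        create_repo(
            repo_id=repo_name,
            repo_type=repo_type,
            private=private,
            exist_ok=True,
            token=HF_TOKEN,
        )
        return repo_name  # the commit's fix: return the known name on success
    except Exception as e:
        raise Exception(f"Error creating repository: {str(e)}")
```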
data/annotated_data.json CHANGED
The diff for this file is too large to render. See raw diff