nam pham committed
Commit ffa19f8 · 1 Parent(s): 345f1ee

fix: upload to huggingface

- app.py +30 -15
- data/annotated_data.json +0 -0
app.py
CHANGED
@@ -295,10 +295,16 @@ def batch_annotate_text(model: GLiNER, texts: List[str], labels: List[str], thre
 
 class AutoAnnotator:
     def __init__(
-        self, model: str = "
-        device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+        self, model: str = "BookingCare/gliner-multi-healthcare",
+        # device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+        device = torch.device('cpu')
     ) -> None:
 
+        # Set PyTorch memory management settings
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+
         self.model = GLiNER.from_pretrained(model).to(device)
         self.annotated_data = []
         self.stat = {
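A caveat on the new memory settings (an editorial note, not part of the diff): `PYTORCH_CUDA_ALLOC_CONF` is read when PyTorch's caching allocator initializes, so assigning it inside `__init__` may come too late if CUDA memory was already allocated earlier in the process. A minimal sketch of the safer ordering, setting the variable before `torch` is imported:

```python
import os

# Must be set before the first CUDA allocation to take effect reliably;
# placing it ahead of the torch import guarantees that.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch
```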
@@ -315,22 +321,31 @@ class AutoAnnotator:
 
         # Process texts in batches
         processed_data = []
+        batch_size = 8 # Reduced batch size to prevent OOM errors
 
-        for i …
-        …
-        …
-        …
-        …
+        for i in range(0, len(data), batch_size):
+            batch_texts = data[i:i + batch_size]
+            batch_with_prompts = []
+
+            # Add prompts to batch texts
+            for text in batch_texts:
+                if isinstance(prompt, list):
+                    prompt_text = random.choice(prompt)
+                else:
+                    prompt_text = prompt
+                text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
+                batch_with_prompts.append(text_with_prompt)
 
-        # …
-        …
+            # Process batch
+            batch_results = batch_annotate_text(self.model, batch_with_prompts, labels, threshold, nested_ner)
+            processed_data.extend(batch_results)
 
-        # …
-        …
-        …
+            # Clear CUDA cache after each batch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
         # Update progress
-        self.stat["current"] = i + …
+        self.stat["current"] = min(i + batch_size, len(data))
 
         self.annotated_data = processed_data
        return self.annotated_data
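`batch_annotate_text` is called above with the model, the prompted texts, the labels, the threshold, and `nested_ner`, but its body lies outside this hunk. A minimal sketch of what such a helper could look like on top of gliner's `batch_predict_entities`; the returned record shape is an assumption, not taken from this repo:

```python
from typing import Any, Dict, List
from gliner import GLiNER

def batch_annotate_text_sketch(
    model: GLiNER,
    texts: List[str],
    labels: List[str],
    threshold: float = 0.5,
    nested_ner: bool = False,
) -> List[Dict[str, Any]]:
    # gliner exposes a batched variant of predict_entities;
    # flat_ner=True disables nested spans, so invert the flag.
    batch_entities = model.batch_predict_entities(
        texts, labels, flat_ner=not nested_ner, threshold=threshold
    )
    # Pair each input text with its predicted entity spans.
    return [
        {"text": text, "entities": entities}
        for text, entities in zip(texts, batch_entities)
    ]
```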
@@ -339,7 +354,7 @@ class AutoAnnotator:
 annotator = None
 sentences = []
 
-def process_text_for_gliner(text: str, max_tokens: int = …
+def process_text_for_gliner(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
     """
     Process text for GLiNER by splitting long texts into overlapping chunks.
     Preserves sentence boundaries and context when possible.
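The hunk shows only the new `process_text_for_gliner` signature and the start of its docstring. A minimal sketch of the overlapping-chunk idea with the same defaults, where whitespace tokens stand in for real tokenization and the sentence-boundary handling the docstring promises is omitted:

```python
from typing import List

def chunk_text_sketch(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
    # Approximate tokens by whitespace splitting; real tokenizers will differ.
    tokens = text.split()
    if len(tokens) <= max_tokens:
        return [text]
    chunks = []
    step = max_tokens - overlap  # advance so consecutive chunks share `overlap` tokens
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + max_tokens]))
        if start + max_tokens >= len(tokens):
            break
    return chunks
```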
@@ -442,7 +457,7 @@ def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = F
             exist_ok=True,
             token=HF_TOKEN
         )
-        return
+        return repo_name
     except Exception as e:
         raise Exception(f"Error creating repository: {str(e)}")
 
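Changing the bare `return` to `return repo_name` lets callers chain repository creation into the upload step, which fits the commit message. A hedged usage sketch with `huggingface_hub.upload_file`; the repo id and file paths are illustrative, and it assumes `repo_name` is a full `user/name` id:

```python
from huggingface_hub import upload_file

# create_hf_repo now returns the repo name, so it can feed the upload directly.
repo_name = create_hf_repo("my-user/annotated-ner-data", repo_type="dataset")
upload_file(
    path_or_fileobj="data/annotated_data.json",
    path_in_repo="annotated_data.json",
    repo_id=repo_name,
    repo_type="dataset",
    token=HF_TOKEN,  # same token the repo-creation helper uses
)
```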
data/annotated_data.json
CHANGED
The diff for this file is too large to render. See raw diff.