Spaces:

user1729
/

cancer_classify_extract-api

Sleeping

App Files Files Community

cancer_classify_extract-api / app /model.py

user1729

Fix: set cache dirs to avoid permission errors in Spaces

49bd15e about 2 months ago

raw

history blame contribute delete

4.11 kB

	import os
	# Avoid cache write permission errors in Hugging Face Spaces: every cache
	# location below is redirected under /tmp.
	# These assignments deliberately sit above the `transformers` import further
	# down. Presumably the library resolves its cache paths when it is first
	# imported, so moving them below the import may silently undo the fix
	# (TODO confirm against the pinned transformers version).
	os.environ["HF_HOME"] = "/tmp/huggingface"
	# NOTE(review): recent transformers releases appear to treat TRANSFORMERS_CACHE
	# as deprecated in favour of HF_HOME; kept here as-is, verify before removing.
	os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface/transformers"
	os.environ["HF_DATASETS_CACHE"] = "/tmp/huggingface/datasets"
	import re
	from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

	class CancerClassifier:
	    """Score a text as cancer-related or not with a sequence-classification model.

	    The model and tokenizer are loaded once in ``__init__`` and wrapped in a
	    ``text-classification`` pipeline. The pipeline runs on GPU 0 when the
	    ``USE_GPU`` environment variable equals ``"true"`` (case-insensitive) and
	    on CPU otherwise.
	    """

	    # Token limit used for truncation when ``__init__`` has not derived one
	    # from the loaded model/tokenizer.
	    max_length = 512

	    def __init__(self, model_path="user1729/BiomedBERT-cancer-bert-classifier-v1.0"):
	        """Load the classifier.

	        Args:
	            model_path: Hub id or local path of a sequence-classification
	                checkpoint (weights and tokenizer are both loaded from it).
	        """
	        model = AutoModelForSequenceClassification.from_pretrained(model_path)
	        tokenizer = AutoTokenizer.from_pretrained(model_path)

	        # Longest input the model can take: the smaller of what the tokenizer
	        # advertises and what the model's position embeddings allow.
	        position_limit = getattr(
	            model.config, "max_position_embeddings", self.max_length
	        )
	        self.max_length = min(tokenizer.model_max_length, position_limit)

	        use_gpu = os.environ.get("USE_GPU", "false").lower() == "true"
	        self.classifier = pipeline(
	            "text-classification",
	            model=model,
	            tokenizer=tokenizer,
	            # predict() relies on the nested [[{...}, {...}]] result that this
	            # flag produces, with entries in label-index order.
	            # NOTE(review): the flag looks deprecated in favour of top_k=None,
	            # which may change result shape/order -- verify before migrating.
	            return_all_scores=True,
	            device=0 if use_gpu else -1,
	        )

	    def predict(self, text: str):
	        """Return the per-class confidence scores for ``text``.

	        Args:
	            text: The text to classify. Inputs longer than ``max_length``
	                tokens are truncated instead of failing inside the model.

	        Returns:
	            A dict with ``"predicted_labels"`` (always
	            ``["Non-Cancer", "Cancer"]``) and ``"confidence_scores"`` mapping
	            each of those labels to its score. The first score returned by
	            the pipeline is reported as ``"Non-Cancer"`` and the second as
	            ``"Cancer"``.
	        """
	        results = self.classifier(
	            text, truncation=True, max_length=self.max_length
	        )
	        per_label = results[0]
	        return {
	            "predicted_labels": ["Non-Cancer", "Cancer"],
	            "confidence_scores": {
	                "Non-Cancer": per_label[0]["score"],
	                "Cancer": per_label[1]["score"],
	            },
	        }

	class CancerExtractor:
	    """Pull cancer-related disease mentions out of free text.

	    A ``ner`` pipeline tags entities; entities whose group name contains
	    "disease" are kept, cleaned to lower-case letters and spaces, and then
	    filtered down to those containing one of the keywords in ``self.cancers``.
	    The pipeline runs on GPU 0 when the ``USE_GPU`` environment variable
	    equals ``"true"`` (case-insensitive) and on CPU otherwise.
	    """

	    # Substrings that mark a disease mention as cancer-related. Singular forms
	    # are used on purpose: matching is by substring, so "sarcoma" also covers
	    # "sarcomas" and "osteosarcoma", whereas the plural would miss both.
	    DEFAULT_CANCER_TERMS = (
	        "cancer",
	        "astrocytoma",
	        "medulloblastoma",
	        "meningioma",
	        "neoplasm",
	        "carcinoma",
	        "tumor",
	        "melanoma",
	        "mesothelioma",
	        "leukemia",
	        "lymphoma",
	        "sarcoma",
	    )

	    # Everything that is not an ASCII letter is treated as a separator.
	    _NON_LETTERS = re.compile(r"[^a-zA-Z]")

	    def __init__(self, model_path="alvaroalon2/biobert_diseases_ner"):
	        """Build the NER pipeline.

	        Args:
	            model_path: Hub id or local path of a token-classification model.
	        """
	        use_gpu = os.environ.get("USE_GPU", "false").lower() == "true"
	        self.extractor = pipeline(
	            "ner",
	            model=model_path,
	            aggregation_strategy="simple",
	            device=0 if use_gpu else -1,
	        )
	        # Per-instance copy so callers may extend the list without touching
	        # the class-level defaults.
	        self.cancers = list(self.DEFAULT_CANCER_TERMS)

	    def predict(self, text: str):
	        """Return the set of cancer-related disease mentions found in ``text``."""
	        entities = self.extractor(text)
	        mentions = self.extract_diseases(entities)
	        return self.detect_cancer(self.clean_diseases(mentions))

	    @staticmethod
	    def _is_disease(entity):
	        """Tell whether an entity's group name contains "disease"."""
	        return "disease" in entity["entity_group"].lower()

	    def extract_diseases(self, entities):
	        """Return the ``word`` of every disease entity after sub-word merging.

	        Args:
	            entities: Iterable of entity dicts carrying at least ``word`` and
	                ``entity_group`` (plus ``start``/``end``/``score`` for merging).
	        """
	        return [
	            entity["word"]
	            for entity in self.merge_subwords(entities)
	            if self._is_disease(entity)
	        ]

	    def merge_subwords(self, entities):
	        """Join directly adjacent disease entities into a single entity.

	        Two consecutive entities are merged when both are disease entities and
	        the second starts exactly where the first ends. The merged entity
	        concatenates the words (dropping any ``##`` marker), takes the later
	        ``end`` and the average of the two scores (a running pairwise average
	        when more than two pieces are merged).

	        Entities without character offsets (``start``/``end`` missing or
	        ``None``) are never merged; otherwise ``None == None`` would glue
	        unrelated mentions together.

	        Args:
	            entities: Iterable of entity dicts; the input dicts are not mutated.

	        Returns:
	            A new list of (copied) entity dicts.
	        """
	        merged = []
	        for entity in entities:
	            previous = merged[-1] if merged else None
	            if (
	                previous is not None
	                and entity.get("start") is not None
	                and entity["start"] == previous.get("end")
	                and self._is_disease(entity)
	                and self._is_disease(previous)
	            ):
	                previous["word"] += entity["word"].replace("##", "")
	                previous["end"] = entity["end"]
	                previous["score"] = (previous["score"] + entity["score"]) / 2
	            else:
	                merged.append(entity.copy())
	        return merged

	    def clean_diseases(self, text_list):
	        """Normalise disease mentions and drop duplicates and noise.

	        Each mention is lower-cased, has every non-letter replaced by a space
	        and its whitespace collapsed and trimmed. Mentions that still contain
	        a ``##`` word-piece marker are dropped, as are results shorter than 3
	        or longer than 50 characters.

	        Args:
	            text_list: Iterable of raw mention strings.

	        Returns:
	            Unique cleaned mentions, in first-seen order.
	        """
	        unique = {}
	        for raw in text_list:
	            # Must be tested before the substitution below, which would turn
	            # the marker into spaces and hide the fragment.
	            if "##" in raw:
	                continue
	            letters_only = self._NON_LETTERS.sub(" ", raw).lower()
	            normalized = " ".join(letters_only.split())
	            if 3 <= len(normalized) <= 50:
	                unique.setdefault(normalized, None)
	        return list(unique)

	    def detect_cancer(self, text_list):
	        """Return the lower-cased mentions that contain a cancer keyword.

	        Args:
	            text_list: Iterable of mention strings.

	        Returns:
	            A set of the matching mentions, lower-cased.
	        """
	        keywords = [keyword.lower() for keyword in self.cancers]
	        return {
	            mention.lower()
	            for mention in text_list
	            if any(keyword in mention.lower() for keyword in keywords)
	        }