cancer-api added

- Dockerfile +22 -0
- README.md +0 -2
- app/__init__.py +0 -0
- app/main.py +169 -0
- app/model.py +105 -0
- deploy.sh +3 -0
- docker-compose.yml +16 -0
- download_data_and_models.sh +46 -0
- requirements.txt +27 -0
Dockerfile
ADDED
@@ -0,0 +1,22 @@
+# Use the official slim Python 3.11 image
+FROM python:3.11-slim
+
+# Set working directory
+WORKDIR /app
+
+# Upgrade pip
+RUN pip3 install --upgrade pip
+
+# Copy and install Python dependencies
+COPY requirements.txt .
+RUN pip3 install -r requirements.txt
+
+# Expose port
+EXPOSE 8000
+
+# Copy application code and the fine-tuned model
+COPY app ./app
+COPY models/fine_tuned ./models/fine_tuned
+
+# Run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
README.md
CHANGED
@@ -1,2 +0,0 @@
-# cancer_classify_extract-api
-To extract disease data and classify research article abstracts into cancer and non-cancer categories.
app/__init__.py
ADDED
File without changes
app/main.py
ADDED
@@ -0,0 +1,169 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List, Union, Optional, Dict
+import logging
+from langchain.chains import SequentialChain, TransformChain
+from .model import CancerClassifier, CancerExtractor
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI(
+    title="Cancer Text Processing API",
+    description="API for cancer-related text classification and information extraction",
+    version="1.0.0"
+)
+
+class TextInput(BaseModel):
+    text: Union[str, List[str]]
+
+class ProcessingResult(BaseModel):
+    text: str
+    classification: Union[str, dict]
+    extraction: Union[str, dict]
+    error: Optional[str] = None
+
+class BatchResponse(BaseModel):
+    results: List[ProcessingResult]
+
+# Initialize models
+try:
+    logger.info("Loading classification model...")
+    classification_pipeline = CancerClassifier("models/fine_tuned")
+
+    logger.info("Loading extraction model...")
+    extraction_pipeline = CancerExtractor()
+
+    logger.info("Models loaded successfully")
+except Exception as e:
+    logger.error(f"Failed to load models: {str(e)}")
+    raise RuntimeError("Could not initialize models")
+
+def batch_classification_transform(inputs: Dict) -> Dict:
+    """Process a batch of texts through the classification model."""
+    try:
+        texts = inputs["input_texts"]
+        if isinstance(texts, str):
+            texts = [texts]  # Convert a single text to a batch of one
+
+        results = []
+        for text in texts:
+            try:
+                result = classification_pipeline.predict(text)
+                results.append(str(result))
+            except Exception as e:
+                logger.warning(f"Classification failed for text: {text[:50]}... Error: {str(e)}")
+                results.append({"error": str(e)})
+
+        return {"classification_results": results}
+    except Exception as e:
+        logger.error(f"Batch classification failed: {str(e)}")
+        raise
+
+def batch_extraction_transform(inputs: Dict) -> Dict:
+    """Process a batch of texts through the extraction model."""
+    try:
+        texts = inputs["input_texts"]
+        if isinstance(texts, str):
+            texts = [texts]  # Convert a single text to a batch of one
+
+        results = []
+        for text in texts:
+            try:
+                result = extraction_pipeline.predict(text)
+                results.append(str(result))
+            except Exception as e:
+                logger.warning(f"Extraction failed for text: {text[:50]}... Error: {str(e)}")
+                results.append({"error": str(e)})
+
+        return {"extraction_results": results}
+    except Exception as e:
+        logger.error(f"Batch extraction failed: {str(e)}")
+        raise
+
+# Create processing chains
+classification_chain = TransformChain(
+    input_variables=["input_texts"],
+    output_variables=["classification_results"],
+    transform=batch_classification_transform
+)
+
+extraction_chain = TransformChain(
+    input_variables=["input_texts"],
+    output_variables=["extraction_results"],
+    transform=batch_extraction_transform
+)
+
+# Create sequential chain
+processing_chain = SequentialChain(
+    chains=[classification_chain, extraction_chain],
+    input_variables=["input_texts"],
+    output_variables=["classification_results", "extraction_results"],
+    verbose=True
+)
+
+@app.post("/process", response_model=BatchResponse)
+async def process_texts(input: TextInput):
+    """
+    Process cancer-related texts through the classification and extraction pipeline.
+
+    Args:
+        input: TextInput object containing either a single string or a list of strings
+
+    Returns:
+        BatchResponse with processing results for each input text
+    """
+    try:
+        texts = [input.text] if isinstance(input.text, str) else input.text
+
+        # Validate input
+        if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
+            raise HTTPException(status_code=400, detail="Input must be a string or a list of strings")
+
+        # Process through the LangChain pipeline
+        chain_result = processing_chain({"input_texts": texts})
+
+        # Format results
+        results = []
+        for i, text in enumerate(texts):
+            classification = chain_result["classification_results"][i]
+            extraction = chain_result["extraction_results"][i]
+
+            error = None
+            if isinstance(classification, dict) and "error" in classification:
+                error = classification["error"]
+            elif isinstance(extraction, dict) and "error" in extraction:
+                error = extraction["error"]
+
+            results.append(ProcessingResult(
+                text=text,
+                classification=classification,
+                extraction=extraction,
+                error=error
+            ))
+
+        return BatchResponse(results=results)
+
+    except HTTPException:
+        # Re-raise client errors (e.g. the 400 above) instead of converting them to 500s
+        raise
+    except Exception as e:
+        logger.error(f"Processing failed: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    try:
+        # Test with a simple cancer-related phrase
+        test_text = "breast cancer diagnosis"
+        classification_pipeline.predict(test_text)
+        extraction_pipeline.predict(test_text)
+        return {"status": "healthy", "models": ["classification", "extraction"]}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
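The /process endpoint accepts either a single string or a list of strings in the `text` field and returns one result per input. A minimal client sketch using only the standard library (the sample abstracts are made up, and the host and port assume the default uvicorn settings above):

import json
import urllib.request

# Build a JSON body matching the TextInput model
payload = json.dumps({
    "text": [
        "Gene expression profiling of triple-negative breast cancer.",
        "Seasonal influenza vaccine effectiveness in older adults.",
    ]
}).encode()
request = urllib.request.Request(
    "http://localhost:8000/process",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as response:
    print(json.load(response))  # BatchResponse: one ProcessingResult per input text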
app/model.py
ADDED
@@ -0,0 +1,105 @@
+from transformers import pipeline
+import os
+import re
+
+class CancerClassifier:
+    def __init__(self, model_path: str):
+        self.classifier = pipeline(
+            "text-classification",
+            model=model_path,
+            tokenizer="microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract",
+            return_all_scores=True,
+            device=0 if os.environ.get("USE_GPU", "false").lower() == "true" else -1,
+        )
+
+    def predict(self, text: str):
+        results = self.classifier(text)
+        return {
+            "predicted_labels": ["Non-Cancer", "Cancer"],
+            "confidence_scores": {
+                "Non-Cancer": results[0][0]["score"],
+                "Cancer": results[0][1]["score"],
+            },
+        }
+
+class CancerExtractor:
+    def __init__(self, model_path="alvaroalon2/biobert_diseases_ner"):
+        self.extractor = pipeline(
+            "ner",
+            model=model_path,
+            aggregation_strategy="simple",
+            device=0 if os.environ.get("USE_GPU", "false").lower() == "true" else -1,
+        )
+        self.cancers = [
+            "cancer",
+            "astrocytoma",
+            "medulloblastoma",
+            "meningioma",
+            "neoplasm",
+            "carcinoma",
+            "tumor",
+            "melanoma",
+            "mesothelioma",
+            "leukemia",
+            "lymphoma",
+            "sarcomas",
+        ]
+
+    def predict(self, text: str):
+        results = self.extractor(text)
+        extractions = self.extract_diseases(results)
+        extractions_cleaned = self.clean_diseases(extractions)
+        detections = self.detect_cancer(extractions_cleaned)
+        return detections
+
+    def extract_diseases(self, entities):
+        entities = self.merge_subwords(entities)
+        diseases = [
+            entity["word"]
+            for entity in entities
+            if "disease" in entity["entity_group"].lower()
+        ]
+        return diseases
+
+    def merge_subwords(self, entities):
+        merged_entities = []
+        current_entity = None
+        for entity in entities:
+            if current_entity is None:
+                current_entity = entity.copy()
+            else:
+                # Check if this entity is part of the same word as the previous one
+                if (
+                    entity["start"] == current_entity["end"]
+                    and "disease" in entity["entity_group"].lower()
+                    and "disease" in current_entity["entity_group"].lower()
+                ):
+                    # Merge with the previous entity
+                    current_entity["word"] += entity["word"].replace("##", "")
+                    current_entity["end"] = entity["end"]
+                    current_entity["score"] = (
+                        current_entity["score"] + entity["score"]
+                    ) / 2
+                else:
+                    merged_entities.append(current_entity)
+                    current_entity = entity.copy()
+
+        if current_entity is not None:
+            merged_entities.append(current_entity)
+        return merged_entities
+
+    def clean_diseases(self, text_list):
+        text_list = [re.sub(r"[^a-zA-Z]", " ", t) for t in text_list]
+        unique_text = set([t.lower() for t in text_list])
+        cleaned_text = [
+            t for t in unique_text if (3 <= len(t.strip()) <= 50 and ("##" not in t))
+        ]
+        return cleaned_text
+
+    def detect_cancer(self, text_list):
+        detected_cancers = [
+            word2.lower()
+            for word2 in text_list
+            if any(word1.lower() in word2.lower() for word1 in self.cancers)
+        ]
+        return set(detected_cancers)
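Both wrappers can also be used directly, without the API layer. A minimal sketch (assuming the fine-tuned checkpoint has been downloaded to models/fine_tuned, as the Dockerfile expects; the sample sentence is made up):

from app.model import CancerClassifier, CancerExtractor

# Load the fine-tuned classifier and the off-the-shelf NER extractor
classifier = CancerClassifier("models/fine_tuned")
extractor = CancerExtractor()

text = "We report a case of metastatic melanoma treated with checkpoint inhibitors."
print(classifier.predict(text))  # confidence scores for Non-Cancer and Cancer
print(extractor.predict(text))   # set of detected cancer terms, e.g. {"melanoma"}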
deploy.sh
ADDED
@@ -0,0 +1,3 @@
+docker compose build
+docker compose up -d
+docker compose ps
docker-compose.yml
ADDED
@@ -0,0 +1,16 @@
+services:
+  cancer-api:
+    build: .
+    ports:
+      - "8000:8000"
+    environment:
+      - PYTHONUNBUFFERED=1
+    volumes:
+      - ./models:/app/models
+      - ./app:/app/app
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
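One caveat: the healthcheck shells out to curl, which the python:3.11-slim base image does not normally include, so the check may report the container as unhealthy even when the API is up. A possible alternative (a sketch, not part of this commit) is to probe the endpoint with the Python interpreter already in the image:

test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]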
download_data_and_models.sh
ADDED
@@ -0,0 +1,46 @@
+#!/bin/bash
+# View URL: https://drive.google.com/file/d/1DVPiNx-UKO7B6HvNniGIOOadJ3yq083i/view?usp=sharing
+FILE_ID="1DVPiNx-UKO7B6HvNniGIOOadJ3yq083i"
+FILE_NAME="cancer_data.zip"
+
+echo "Downloading data from Google Drive..."
+
+curl -L -o "$FILE_NAME" "https://docs.google.com/uc?export=download&id=${FILE_ID}"
+
+# Check download success
+if [ ! -f "$FILE_NAME" ]; then
+    echo "Download failed!"
+    exit 1
+fi
+
+# Unzip
+echo "Unzipping $FILE_NAME..."
+unzip "$FILE_NAME"
+
+# Optional: clean up
+# rm "$FILE_NAME"
+
+echo "Done."
+
+# View URL: https://drive.google.com/file/d/1aR6PUjDi8fFBp0_pxe1pCv9S4EBptC5W/view?usp=sharing
+FILE_ID="1aR6PUjDi8fFBp0_pxe1pCv9S4EBptC5W"
+FILE_NAME="models-cancer-api.zip"
+
+echo "Downloading model from Google Drive..."
+
+curl -L -o "$FILE_NAME" "https://docs.google.com/uc?export=download&id=${FILE_ID}"
+
+# Check download success
+if [ ! -f "$FILE_NAME" ]; then
+    echo "Download failed!"
+    exit 1
+fi
+
+# Unzip
+echo "Unzipping $FILE_NAME..."
+unzip "$FILE_NAME"
+
+# Optional: clean up
+# rm "$FILE_NAME"
+
+echo "Done."
requirements.txt
ADDED
@@ -0,0 +1,27 @@
+fastapi==0.95.0
+uvicorn==0.21.1
+pydantic
+python-multipart==0.0.6
+chardet==5.2.0
+click==8.2.0
+datasets==3.6.0
+beautifulsoup4==4.13.4
+pandas==2.2.3
+nltk==3.9.1
+transformers==4.51.3
+huggingface-hub==0.31.2
+langchain
+langchain-community
+evaluate==0.4.3
+evaluation==0.0.2
+bioc==2.1
+hf-xet==1.1.2
+accelerate==1.7.0
+tqdm==4.67.1
+scikit-learn==1.6.1
+python-dotenv==1.1.0
+peft==0.15.2
+tokenizers==0.21.1
+torch==2.5.1
+torchvision==0.20.1
+