user1729 committed
Commit 3426410 · 1 Parent(s): d85e3b0

cancer-api added

Files changed (9)
  1. Dockerfile +22 -0
  2. README.md +0 -2
  3. app/__init__.py +0 -0
  4. app/main.py +166 -0
  5. app/model.py +105 -0
  6. deploy.sh +3 -0
  7. docker-compose.yml +17 -0
  8. download_data_and_models.sh +47 -0
  9. requirements.txt +27 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ # Use the official slim Python 3.11 image
+ FROM python:3.11-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Upgrade pip
+ RUN pip3 install --upgrade pip
+
+ # Copy Python dependencies
+ COPY requirements.txt .
+ RUN pip3 install -r requirements.txt
+
+ # Expose port
+ EXPOSE 8000
+
+ # Copy application code
+ COPY app ./app
+ COPY models/fine_tuned ./models/fine_tuned
+
+ # Run the application
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -1,2 +0,0 @@
- # cancer_classify_extract-api
- To extract disease data and classify research article abstracts into cancer and non-cancer categories.
app/__init__.py ADDED
File without changes
app/main.py ADDED
@@ -0,0 +1,166 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from typing import List, Union, Optional, Dict
+ import logging
+ from langchain.chains import SequentialChain, TransformChain
+ from .model import CancerClassifier, CancerExtractor
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI(
+     title="Cancer Text Processing API",
+     description="API for cancer-related text classification and information extraction",
+     version="1.0.0"
+ )
+
+ class TextInput(BaseModel):
+     text: Union[str, List[str]]
+
+ class ProcessingResult(BaseModel):
+     text: str
+     classification: Union[str, dict]
+     extraction: Union[str, dict]
+     error: Optional[str] = None
+
+ class BatchResponse(BaseModel):
+     results: List[ProcessingResult]
+
+ # Initialize models
+ try:
+     logger.info("Loading classification model...")
+     classification_pipeline = CancerClassifier("models/fine_tuned")
+
+     logger.info("Loading extraction model...")
+     extraction_pipeline = CancerExtractor()
+
+     logger.info("Models loaded successfully")
+ except Exception as e:
+     logger.error(f"Failed to load models: {e}")
+     raise RuntimeError("Could not initialize models") from e
+
+ def batch_classification_transform(inputs: Dict) -> Dict:
+     """Process a batch of texts through the classification model."""
+     try:
+         texts = inputs["input_texts"]
+         if isinstance(texts, str):
+             texts = [texts]  # Convert a single text to a batch of one
+
+         results = []
+         for text in texts:
+             try:
+                 result = classification_pipeline.predict(text)
+                 results.append(str(result))
+             except Exception as e:
+                 logger.warning(f"Classification failed for text: {text[:50]}... Error: {e}")
+                 results.append({"error": str(e)})
+
+         return {"classification_results": results}
+     except Exception as e:
+         logger.error(f"Batch classification failed: {e}")
+         raise
+
+ def batch_extraction_transform(inputs: Dict) -> Dict:
+     """Process a batch of texts through the extraction model."""
+     try:
+         texts = inputs["input_texts"]
+         if isinstance(texts, str):
+             texts = [texts]  # Convert a single text to a batch of one
+
+         results = []
+         for text in texts:
+             try:
+                 result = extraction_pipeline.predict(text)
+                 results.append(str(result))
+             except Exception as e:
+                 logger.warning(f"Extraction failed for text: {text[:50]}... Error: {e}")
+                 results.append({"error": str(e)})
+
+         return {"extraction_results": results}
+     except Exception as e:
+         logger.error(f"Batch extraction failed: {e}")
+         raise
+
+ # Create processing chains
+ classification_chain = TransformChain(
+     input_variables=["input_texts"],
+     output_variables=["classification_results"],
+     transform=batch_classification_transform
+ )
+
+ extraction_chain = TransformChain(
+     input_variables=["input_texts"],
+     output_variables=["extraction_results"],
+     transform=batch_extraction_transform
+ )
+
+ # Create sequential chain
+ processing_chain = SequentialChain(
+     chains=[classification_chain, extraction_chain],
+     input_variables=["input_texts"],
+     output_variables=["classification_results", "extraction_results"],
+     verbose=True
+ )
+
+ @app.post("/process", response_model=BatchResponse)
+ async def process_texts(payload: TextInput):
+     """
+     Process cancer-related texts through the classification and extraction pipeline.
+
+     Args:
+         payload: TextInput object containing either a single string or a list of strings
+
+     Returns:
+         BatchResponse with processing results for each input text
+     """
+     texts = [payload.text] if isinstance(payload.text, str) else payload.text
+
+     # Validate input before the broad try/except so a 400 is not remapped to 500
+     if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
+         raise HTTPException(status_code=400, detail="Input must be a string or a list of strings")
+
+     try:
+         # Process through the LangChain pipeline
+         chain_result = processing_chain.invoke({"input_texts": texts})
+
+         # Format results
+         results = []
+         for i, text in enumerate(texts):
+             classification = chain_result["classification_results"][i]
+             extraction = chain_result["extraction_results"][i]
+
+             error = None
+             if isinstance(classification, dict) and "error" in classification:
+                 error = classification["error"]
+             elif isinstance(extraction, dict) and "error" in extraction:
+                 error = extraction["error"]
+
+             results.append(ProcessingResult(
+                 text=text,
+                 classification=classification,
+                 extraction=extraction,
+                 error=error
+             ))
+
+         return BatchResponse(results=results)
+
+     except Exception as e:
+         logger.error(f"Processing failed: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/health")
+ async def health_check():
+     """Health check endpoint"""
+     try:
+         # Smoke-test both models with a simple cancer-related phrase
+         test_text = "breast cancer diagnosis"
+         classification_pipeline.predict(test_text)
+         extraction_pipeline.predict(test_text)
+         return {"status": "healthy", "models": ["classification", "extraction"]}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
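For reference, a minimal client sketch (not part of the commit) for exercising the two endpoints once the container is up. It assumes the service is reachable at http://localhost:8000 (the port mapped in docker-compose.yml) and that the requests package is available on the host:

import requests

BASE_URL = "http://localhost:8000"  # assumed host/port from docker-compose.yml

# /process accepts either a single string or a list of strings
payload = {"text": ["Breast cancer diagnosis in early-stage patients.", "A study of insulin dosing in diabetes."]}
response = requests.post(f"{BASE_URL}/process", json=payload, timeout=120)
response.raise_for_status()
for item in response.json()["results"]:
    print(item["classification"], item["extraction"], item.get("error"))

# /health runs both models on a short test phrase
print(requests.get(f"{BASE_URL}/health", timeout=30).json())

The first /process call can be slow while the pipelines warm up, hence the generous timeout.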
app/model.py ADDED
@@ -0,0 +1,105 @@
+ from transformers import pipeline
+ import os
+ import re
+
+ class CancerClassifier:
+     def __init__(self, model_path: str):
+         self.classifier = pipeline(
+             "text-classification",
+             model=model_path,
+             tokenizer="microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract",
+             return_all_scores=True,
+             device=0 if os.environ.get("USE_GPU", "false").lower() == "true" else -1,
+         )
+
+     def predict(self, text: str):
+         results = self.classifier(text)
+         return {
+             "predicted_labels": ["Non-Cancer", "Cancer"],
+             "confidence_scores": {
+                 "Non-Cancer": results[0][0]["score"],  # assumes the fine-tuned model scores in [Non-Cancer, Cancer] order
+                 "Cancer": results[0][1]["score"],
+             },
+         }
+
+ class CancerExtractor:
+     def __init__(self, model_path="alvaroalon2/biobert_diseases_ner"):
+         self.extractor = pipeline(
+             "ner",
+             model=model_path,
+             aggregation_strategy="simple",
+             device=0 if os.environ.get("USE_GPU", "false").lower() == "true" else -1,
+         )
+         self.cancers = [
+             "cancer",
+             "astrocytoma",
+             "medulloblastoma",
+             "meningioma",
+             "neoplasm",
+             "carcinoma",
+             "tumor",
+             "melanoma",
+             "mesothelioma",
+             "leukemia",
+             "lymphoma",
+             "sarcoma",
+         ]
+
+     def predict(self, text: str):
+         results = self.extractor(text)
+         extractions = self.extract_diseases(results)
+         extractions_cleaned = self.clean_diseases(extractions)
+         detections = self.detect_cancer(extractions_cleaned)
+         return detections
+
+     def extract_diseases(self, entities):
+         entities = self.merge_subwords(entities)
+         diseases = [
+             entity["word"]
+             for entity in entities
+             if "disease" in entity["entity_group"].lower()
+         ]
+         return diseases
+
+     def merge_subwords(self, entities):
+         merged_entities = []
+         current_entity = None
+         for entity in entities:
+             if current_entity is None:
+                 current_entity = entity.copy()
+             else:
+                 # Check if this entity continues the same word as the previous one
+                 if (
+                     entity["start"] == current_entity["end"]
+                     and "disease" in entity["entity_group"].lower()
+                     and "disease" in current_entity["entity_group"].lower()
+                 ):
+                     # Merge with the previous entity and average the scores
+                     current_entity["word"] += entity["word"].replace("##", "")
+                     current_entity["end"] = entity["end"]
+                     current_entity["score"] = (
+                         current_entity["score"] + entity["score"]
+                     ) / 2
+                 else:
+                     merged_entities.append(current_entity)
+                     current_entity = entity.copy()
+
+         if current_entity is not None:
+             merged_entities.append(current_entity)
+         return merged_entities
+
+     def clean_diseases(self, text_list):
+         text_list = [re.sub(r"[^a-zA-Z]", " ", t) for t in text_list]
+         unique_text = set(t.lower() for t in text_list)
+         cleaned_text = [
+             t for t in unique_text if (3 <= len(t.strip()) <= 50 and ("##" not in t))
+         ]
+         return cleaned_text
+
+     def detect_cancer(self, text_list):
+         detected_cancers = [
+             word2.lower()
+             for word2 in text_list
+             if any(word1.lower() in word2.lower() for word1 in self.cancers)
+         ]
+         return set(detected_cancers)
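A short usage sketch for the two classes (not part of the commit), assuming models/fine_tuned has already been fetched via download_data_and_models.sh and the pinned transformers version is installed:

from app.model import CancerClassifier, CancerExtractor

# Assumes the fine-tuned checkpoint was unpacked to models/fine_tuned
classifier = CancerClassifier("models/fine_tuned")
extractor = CancerExtractor()  # downloads alvaroalon2/biobert_diseases_ner on first use

abstract = "We evaluated temozolomide in patients with a recurrent malignant brain tumor."
print(classifier.predict(abstract))  # {'predicted_labels': [...], 'confidence_scores': {...}}
print(extractor.predict(abstract))   # set of detected cancer terms, e.g. {'malignant brain tumor'}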
deploy.sh ADDED
@@ -0,0 +1,3 @@
+ docker compose build
+ docker compose up -d
+ docker compose ps
docker-compose.yml ADDED
@@ -0,0 +1,17 @@
+ services:
+   cancer-api:
+     build: .
+     ports:
+       - "8000:8000"
+     environment:
+       - PYTHONUNBUFFERED=1
+     volumes:
+       - ./models:/app/models
+       - ./app:/app/app
+     restart: unless-stopped
+     healthcheck:
+       # python:3.11-slim ships no curl, so probe /health with the stdlib instead
+       test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
+       interval: 30s
+       timeout: 10s
+       retries: 3
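A small standard-library sketch (not part of the commit) that polls the same /health endpoint the healthcheck uses, handy for waiting on the container from the host after running deploy.sh; it assumes the 8000:8000 port mapping above:

import time
import urllib.request

# Poll /health until the API answers, or give up after 10 attempts
for attempt in range(10):
    try:
        with urllib.request.urlopen("http://localhost:8000/health", timeout=10) as resp:
            print("healthy:", resp.read().decode())
            break
    except Exception as exc:
        print(f"attempt {attempt + 1}: not ready yet ({exc})")
        time.sleep(5)
else:
    raise SystemExit("service did not become healthy in time")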
download_data_and_models.sh ADDED
@@ -0,0 +1,47 @@
+ #!/bin/bash
+ # view url: https://drive.google.com/file/d/1DVPiNx-UKO7B6HvNniGIOOadJ3yq083i/view?usp=sharing
+ FILE_ID="1DVPiNx-UKO7B6HvNniGIOOadJ3yq083i"
+ FILE_NAME="cancer_data.zip"
+
+ echo "Downloading data from Google Drive..."
+
+ curl -L -o "$FILE_NAME" "https://docs.google.com/uc?export=download&id=${FILE_ID}"
+
+ # Check download success
+ if [ ! -f "$FILE_NAME" ]; then
+     echo "Download failed!"
+     exit 1
+ fi
+
+ # Unzip
+ echo "Unzipping $FILE_NAME..."
+ unzip "$FILE_NAME"
+
+ # Optional: Clean up
+ # rm "$FILE_NAME"
+
+ echo "Done."
+
+ # Download the fine-tuned models
+ # view url: https://drive.google.com/file/d/1aR6PUjDi8fFBp0_pxe1pCv9S4EBptC5W/view?usp=sharing
+ FILE_ID="1aR6PUjDi8fFBp0_pxe1pCv9S4EBptC5W"
+ FILE_NAME="models-cancer-api.zip"
+
+ echo "Downloading model from Google Drive..."
+
+ curl -L -o "$FILE_NAME" "https://docs.google.com/uc?export=download&id=${FILE_ID}"
+
+ # Check download success
+ if [ ! -f "$FILE_NAME" ]; then
+     echo "Download failed!"
+     exit 1
+ fi
+
+ # Unzip
+ echo "Unzipping $FILE_NAME..."
+ unzip "$FILE_NAME"
+
+ # Optional: Clean up
+ # rm "$FILE_NAME"
+
+ echo "Done."
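One caveat: for large files, the plain curl URL can return Google Drive's virus-scan confirmation page instead of the archive. A Python alternative (not part of the commit), assuming the third-party gdown package is installed, handles that redirect:

import zipfile

import gdown  # assumed extra dependency: pip install gdown

# Same file IDs as in the script above
for file_id, file_name in [
    ("1DVPiNx-UKO7B6HvNniGIOOadJ3yq083i", "cancer_data.zip"),
    ("1aR6PUjDi8fFBp0_pxe1pCv9S4EBptC5W", "models-cancer-api.zip"),
]:
    gdown.download(id=file_id, output=file_name, quiet=False)
    with zipfile.ZipFile(file_name) as archive:
        archive.extractall()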
requirements.txt ADDED
@@ -0,0 +1,27 @@
+ fastapi==0.95.0
+ uvicorn==0.21.1
+ pydantic
+ python-multipart==0.0.6
+ chardet==5.2.0
+ click==8.2.0
+ datasets==3.6.0
+ beautifulsoup4==4.13.4
+ pandas==2.2.3
+ nltk==3.9.1
+ transformers==4.51.3
+ huggingface-hub==0.31.2
+ langchain
+ langchain-community
+ evaluate==0.4.3
+ evaluation==0.0.2
+ bioc==2.1
+ hf-xet==1.1.2
+ accelerate==1.7.0
+ tqdm==4.67.1
+ scikit-learn==1.6.1
+ python-dotenv==1.1.0
+ peft==0.15.2
+ tokenizers==0.21.1
+ torch==2.5.1
+ torchvision==0.20.1
+