cancer-api added

- Dockerfile +22 -0
- README.md +0 -2
- app/__init__.py +0 -0
- app/main.py +169 -0
- app/model.py +105 -0
- deploy.sh +3 -0
- docker-compose.yml +16 -0
- download_data_and_models.sh +46 -0
- requirements.txt +27 -0
Dockerfile
ADDED
@@ -0,0 +1,22 @@
+# Use the official slim Python 3.11 image
+FROM python:3.11-slim
+
+# Set working directory
+WORKDIR /app
+
+# Upgrade pip
+RUN pip3 install --upgrade pip
+
+# Copy and install Python dependencies
+COPY requirements.txt .
+RUN pip3 install -r requirements.txt
+
+# Expose port
+EXPOSE 8000
+
+# Copy application code and the fine-tuned model
+COPY app ./app
+COPY models/fine_tuned ./models/fine_tuned
+
+# Run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
README.md
CHANGED
@@ -1,2 +0,0 @@
-# cancer_classify_extract-api
-To extract disease data and classify research article abstracts into cancer and non-cancer categories.
app/__init__.py
ADDED
File without changes
app/main.py
ADDED
@@ -0,0 +1,169 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List, Union, Optional, Dict
+import logging
+from langchain.chains import SequentialChain, TransformChain
+from .model import CancerClassifier, CancerExtractor
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI(
+    title="Cancer Text Processing API",
+    description="API for cancer-related text classification and information extraction",
+    version="1.0.0"
+)
+
+class TextInput(BaseModel):
+    text: Union[str, List[str]]
+
+class ProcessingResult(BaseModel):
+    text: str
+    classification: Union[str, dict]
+    extraction: Union[str, dict]
+    error: Optional[str] = None
+
+class BatchResponse(BaseModel):
+    results: List[ProcessingResult]
+
+# Initialize models
+try:
+    logger.info("Loading classification model...")
+    classification_pipeline = CancerClassifier("models/fine_tuned")
+
+    logger.info("Loading extraction model...")
+    extraction_pipeline = CancerExtractor()
+
+    logger.info("Models loaded successfully")
+except Exception as e:
+    logger.error(f"Failed to load models: {str(e)}")
+    raise RuntimeError("Could not initialize models")
+
+def batch_classification_transform(inputs: Dict) -> Dict:
+    """Process a batch of texts through the classification model."""
+    try:
+        texts = inputs["input_texts"]
+        if isinstance(texts, str):
+            texts = [texts]  # Convert a single text to a batch of one
+
+        results = []
+        for text in texts:
+            try:
+                result = classification_pipeline.predict(text)
+                results.append(str(result))
+            except Exception as e:
+                logger.warning(f"Classification failed for text: {text[:50]}... Error: {str(e)}")
+                results.append({"error": str(e)})
+
+        return {"classification_results": results}
+    except Exception as e:
+        logger.error(f"Batch classification failed: {str(e)}")
+        raise
+
+def batch_extraction_transform(inputs: Dict) -> Dict:
+    """Process a batch of texts through the extraction model."""
+    try:
+        texts = inputs["input_texts"]
+        if isinstance(texts, str):
+            texts = [texts]  # Convert a single text to a batch of one
+
+        results = []
+        for text in texts:
+            try:
+                result = extraction_pipeline.predict(text)
+                results.append(str(result))
+            except Exception as e:
+                logger.warning(f"Extraction failed for text: {text[:50]}... Error: {str(e)}")
+                results.append({"error": str(e)})
+
+        return {"extraction_results": results}
+    except Exception as e:
+        logger.error(f"Batch extraction failed: {str(e)}")
+        raise
+
+# Create processing chains
+classification_chain = TransformChain(
+    input_variables=["input_texts"],
+    output_variables=["classification_results"],
+    transform=batch_classification_transform
+)
+
+extraction_chain = TransformChain(
+    input_variables=["input_texts"],
+    output_variables=["extraction_results"],
+    transform=batch_extraction_transform
+)
+
+# Create sequential chain
+processing_chain = SequentialChain(
+    chains=[classification_chain, extraction_chain],
+    input_variables=["input_texts"],
+    output_variables=["classification_results", "extraction_results"],
+    verbose=True
+)
+
+@app.post("/process", response_model=BatchResponse)
+async def process_texts(input: TextInput):
+    """
+    Process cancer-related texts through the classification and extraction pipeline.
+
+    Args:
+        input: TextInput object containing either a single string or a list of strings
+
+    Returns:
+        BatchResponse with processing results for each input text
+    """
+    try:
+        texts = [input.text] if isinstance(input.text, str) else input.text
+
+        # Validate input
+        if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
+            raise HTTPException(status_code=400, detail="Input must be a string or a list of strings")
+
+        # Process through the LangChain pipeline
+        chain_result = processing_chain({"input_texts": texts})
+
+        # Format results
+        results = []
+        for i, text in enumerate(texts):
+            classification = chain_result["classification_results"][i]
+            extraction = chain_result["extraction_results"][i]
+
+            error = None
+            if isinstance(classification, dict) and "error" in classification:
+                error = classification["error"]
+            elif isinstance(extraction, dict) and "error" in extraction:
+                error = extraction["error"]
+
+            results.append(ProcessingResult(
+                text=text,
+                classification=classification,
+                extraction=extraction,
+                error=error
+            ))
+
+        return BatchResponse(results=results)
+
+    except HTTPException:
+        # Re-raise client errors (e.g. the 400 above) instead of converting them to 500s
+        raise
+    except Exception as e:
+        logger.error(f"Processing failed: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    try:
+        # Test with a simple cancer-related phrase
+        test_text = "breast cancer diagnosis"
+        classification_pipeline.predict(test_text)
+        extraction_pipeline.predict(test_text)
+        return {"status": "healthy", "models": ["classification", "extraction"]}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
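The /process endpoint accepts either a single string or a list of strings in the `text` field and returns one result per input. A minimal client sketch using only the standard library (the sample abstracts are made up, and the host and port assume the default uvicorn settings above):

import json
import urllib.request

# Build a JSON body matching the TextInput model
payload = json.dumps({
    "text": [
        "Gene expression profiling of triple-negative breast cancer.",
        "Seasonal influenza vaccine effectiveness in older adults.",
    ]
}).encode()
request = urllib.request.Request(
    "http://localhost:8000/process",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as response:
    print(json.load(response))  # BatchResponse: one ProcessingResult per input text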
app/model.py
ADDED
@@ -0,0 +1,105 @@
+from transformers import pipeline
+import os
+import re
+
+class CancerClassifier:
+    def __init__(self, model_path: str):
+        self.classifier = pipeline(
+            "text-classification",
+            model=model_path,
+            tokenizer="microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract",
+            return_all_scores=True,
+            device=0 if os.environ.get("USE_GPU", "false").lower() == "true" else -1,
+        )
+
+    def predict(self, text: str):
+        results = self.classifier(text)
+        return {
+            "predicted_labels": ["Non-Cancer", "Cancer"],
+            "confidence_scores": {
+                "Non-Cancer": results[0][0]["score"],
+                "Cancer": results[0][1]["score"],
+            },
+        }
+
+class CancerExtractor:
+    def __init__(self, model_path="alvaroalon2/biobert_diseases_ner"):
+        self.extractor = pipeline(
+            "ner",
+            model=model_path,
+            aggregation_strategy="simple",
+            device=0 if os.environ.get("USE_GPU", "false").lower() == "true" else -1,
+        )
+        self.cancers = [
+            "cancer",
+            "astrocytoma",
+            "medulloblastoma",
+            "meningioma",
+            "neoplasm",
+            "carcinoma",
+            "tumor",
+            "melanoma",
+            "mesothelioma",
+            "leukemia",
+            "lymphoma",
+            "sarcomas",
+        ]
+
+    def predict(self, text: str):
+        results = self.extractor(text)
+        extractions = self.extract_diseases(results)
+        extractions_cleaned = self.clean_diseases(extractions)
+        detections = self.detect_cancer(extractions_cleaned)
+        return detections
+
+    def extract_diseases(self, entities):
+        entities = self.merge_subwords(entities)
+        diseases = [
+            entity["word"]
+            for entity in entities
+            if "disease" in entity["entity_group"].lower()
+        ]
+        return diseases
+
+    def merge_subwords(self, entities):
+        merged_entities = []
+        current_entity = None
+        for entity in entities:
+            if current_entity is None:
+                current_entity = entity.copy()
+            else:
+                # Check if this entity is part of the same word as the previous one
+                if (
+                    entity["start"] == current_entity["end"]
+                    and "disease" in entity["entity_group"].lower()
+                    and "disease" in current_entity["entity_group"].lower()
+                ):
+                    # Merge with the previous entity
+                    current_entity["word"] += entity["word"].replace("##", "")
+                    current_entity["end"] = entity["end"]
+                    current_entity["score"] = (
+                        current_entity["score"] + entity["score"]
+                    ) / 2
+                else:
+                    merged_entities.append(current_entity)
+                    current_entity = entity.copy()
+
+        if current_entity is not None:
+            merged_entities.append(current_entity)
+        return merged_entities
+
+    def clean_diseases(self, text_list):
+        text_list = [re.sub(r"[^a-zA-Z]", " ", t) for t in text_list]
+        unique_text = set([t.lower() for t in text_list])
+        cleaned_text = [
+            t for t in unique_text if (3 <= len(t.strip()) <= 50 and ("##" not in t))
+        ]
+        return cleaned_text
+
+    def detect_cancer(self, text_list):
+        detected_cancers = [
+            word2.lower()
+            for word2 in text_list
+            if any(word1.lower() in word2.lower() for word1 in self.cancers)
+        ]
+        return set(detected_cancers)
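Both wrappers can also be used directly, without the API layer. A minimal sketch (assuming the fine-tuned checkpoint has been downloaded to models/fine_tuned, as the Dockerfile expects; the sample sentence is made up):

from app.model import CancerClassifier, CancerExtractor

# Load the fine-tuned classifier and the off-the-shelf NER extractor
classifier = CancerClassifier("models/fine_tuned")
extractor = CancerExtractor()

text = "We report a case of metastatic melanoma treated with checkpoint inhibitors."
print(classifier.predict(text))  # confidence scores for Non-Cancer and Cancer
print(extractor.predict(text))   # set of detected cancer terms, e.g. {"melanoma"}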
deploy.sh
ADDED
@@ -0,0 +1,3 @@
+docker compose build
+docker compose up -d
+docker compose ps
docker-compose.yml
ADDED
@@ -0,0 +1,16 @@
+services:
+  cancer-api:
+    build: .
+    ports:
+      - "8000:8000"
+    environment:
+      - PYTHONUNBUFFERED=1
+    volumes:
+      - ./models:/app/models
+      - ./app:/app/app
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
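One caveat: the healthcheck shells out to curl, which the python:3.11-slim base image does not normally include, so the check may report the container as unhealthy even when the API is up. A possible alternative (a sketch, not part of this commit) is to probe the endpoint with the Python interpreter already in the image:

test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]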
download_data_and_models.sh
ADDED
@@ -0,0 +1,46 @@
+#!/bin/bash
+# View URL: https://drive.google.com/file/d/1DVPiNx-UKO7B6HvNniGIOOadJ3yq083i/view?usp=sharing
+FILE_ID="1DVPiNx-UKO7B6HvNniGIOOadJ3yq083i"
+FILE_NAME="cancer_data.zip"
+
+echo "Downloading data from Google Drive..."
+
+curl -L -o "$FILE_NAME" "https://docs.google.com/uc?export=download&id=${FILE_ID}"
+
+# Check download success
+if [ ! -f "$FILE_NAME" ]; then
+    echo "Download failed!"
+    exit 1
+fi
+
+# Unzip
+echo "Unzipping $FILE_NAME..."
+unzip "$FILE_NAME"
+
+# Optional: clean up
+# rm "$FILE_NAME"
+
+echo "Done."
+
+# View URL: https://drive.google.com/file/d/1aR6PUjDi8fFBp0_pxe1pCv9S4EBptC5W/view?usp=sharing
+FILE_ID="1aR6PUjDi8fFBp0_pxe1pCv9S4EBptC5W"
+FILE_NAME="models-cancer-api.zip"
+
+echo "Downloading model from Google Drive..."
+
+curl -L -o "$FILE_NAME" "https://docs.google.com/uc?export=download&id=${FILE_ID}"
+
+# Check download success
+if [ ! -f "$FILE_NAME" ]; then
+    echo "Download failed!"
+    exit 1
+fi
+
+# Unzip
+echo "Unzipping $FILE_NAME..."
+unzip "$FILE_NAME"
+
+# Optional: clean up
+# rm "$FILE_NAME"
+
+echo "Done."
requirements.txt
ADDED
@@ -0,0 +1,27 @@
+fastapi==0.95.0
+uvicorn==0.21.1
+pydantic
+python-multipart==0.0.6
+chardet==5.2.0
+click==8.2.0
+datasets==3.6.0
+beautifulsoup4==4.13.4
+pandas==2.2.3
+nltk==3.9.1
+transformers==4.51.3
+huggingface-hub==0.31.2
+langchain
+langchain-community
+evaluate==0.4.3
+evaluation==0.0.2
+bioc==2.1
+hf-xet==1.1.2
+accelerate==1.7.0
+tqdm==4.67.1
+scikit-learn==1.6.1
+python-dotenv==1.1.0
+peft==0.15.2
+tokenizers==0.21.1
+torch==2.5.1
+torchvision==0.20.1
+