moabos committed · Commit 354c6a0 · Parent(s): 909c729
chore: setup fastapi with initial routes and hook up traditional models (phase 1) with preprocessing
Files changed:
- Dockerfile +15 -0
- README.md +1 -1
- app.py +162 -0
- classifier.py +160 -0
- preprocessor.py +158 -0
- requirements.txt +6 -0
- summarizer.py +78 -0
- svm_classifier.joblib +3 -0
- tfidf_vectorizer_classifier.joblib +3 -0
- tfidf_vectorizer_text_summarization.joblib +3 -0
Dockerfile
ADDED
@@ -0,0 +1,15 @@
+FROM python:3.13-slim
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN python -m nltk.downloader stopwords
+
+
+COPY --chown=user . /app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Arabic Summarizer Classifier
-emoji:
+emoji: 📰
 colorFrom: green
 colorTo: green
 sdk: docker
app.py
ADDED
@@ -0,0 +1,162 @@
+from typing import Optional, List, Dict, Any
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+from classifier import ArabicClassifier
+from summarizer import ArabicSummarizer
+from preprocessor import ArabicPreprocessor
+
+app = FastAPI(
+    title="Arabic Text Analysis API",
+    description="API for Arabic text classification, summarization, and preprocessing",
+    version="1.0.0"
+)
+
+classifier = ArabicClassifier("svm_classifier.joblib", "tfidf_vectorizer_classifier.joblib")
+summarizer = ArabicSummarizer("tfidf_vectorizer_text_summarization.joblib")
+preprocessor = ArabicPreprocessor()
+
+
+class TextInput(BaseModel):
+    text: str
+
+
+class TextInputWithSentences(BaseModel):
+    text: str
+    num_sentences: Optional[int] = 3
+
+
+class BatchTextInput(BaseModel):
+    texts: List[str]
+
+
+class PreprocessingInput(BaseModel):
+    text: str
+    task_type: Optional[str] = "classification"
+
+
+@app.get("/")
+def read_root() -> Dict[str, Any]:
+    """API welcome message and endpoint documentation."""
+    return {
+        "message": "Welcome to the Arabic Text Analysis API!",
+        "documentation": {
+            "interactive_docs": "/docs",
+            "redoc": "/redoc",
+            "openapi_schema": "/openapi.json"
+        },
+        "endpoints": {
+            "classify": "POST /classify - Classify Arabic text",
+            "classify_batch": "POST /classify/batch - Classify multiple texts",
+            "summarize": "POST /summarize - Summarize Arabic text",
+            "analyze": "POST /analyze - Both classify and summarize",
+            "preprocess": "POST /preprocess - Preprocess text with detailed steps",
+            "text_analysis": "POST /text-analysis - Analyze text characteristics",
+            "sentence_analysis": "POST /sentence-analysis - Detailed sentence analysis",
+            "model_info": "GET /model-info - Get model information"
+        }
+    }
+
+
+@app.post("/classify")
+def classify_text(data: TextInput) -> Dict[str, Any]:
+    """Classify Arabic text with probability distribution and metadata."""
+    try:
+        result = classifier.predict(data.text)
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
+
+
+@app.post("/classify/batch")
+def classify_texts(data: BatchTextInput) -> Dict[str, Any]:
+    """Classify multiple Arabic texts in batch."""
+    try:
+        results = classifier.predict_batch(data.texts)
+        return {
+            "results": results,
+            "total_texts": len(data.texts),
+            "model_used": classifier.model_name
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Batch classification failed: {str(e)}")
+
+
+@app.post("/summarize")
+def summarize_text(data: TextInputWithSentences) -> Dict[str, Any]:
+    """Summarize Arabic text with sentence analysis."""
+    try:
+        result = summarizer.summarize(data.text, data.num_sentences)
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
+
+
+@app.post("/sentence-analysis")
+def analyze_sentences(data: TextInput) -> Dict[str, Any]:
+    """Analyze all sentences with scores and rankings."""
+    try:
+        result = summarizer.get_sentence_analysis(data.text)
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Sentence analysis failed: {str(e)}")
+
+
+@app.post("/analyze")
+def analyze_text_complete(data: TextInputWithSentences) -> Dict[str, Any]:
+    """Complete analysis: classification, summarization, and text statistics."""
+    try:
+        classification_result = classifier.predict(data.text)
+        summarization_result = summarizer.summarize(data.text, data.num_sentences)
+        text_stats = preprocessor.analyze_text(data.text)
+
+        return {
+            "original_text": data.text,
+            "text_analysis": text_stats,
+            "classification": classification_result,
+            "summarization": summarization_result
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Complete analysis failed: {str(e)}")
+
+
+@app.post("/preprocess")
+def preprocess_text(data: PreprocessingInput) -> Dict[str, Any]:
+    """Preprocess text with step-by-step breakdown."""
+    try:
+        steps = preprocessor.get_preprocessing_steps(data.text, data.task_type)
+        return {
+            "task_type": data.task_type,
+            "preprocessing_steps": steps
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
+
+
+@app.post("/text-analysis")
+def analyze_text_characteristics(data: TextInput) -> Dict[str, Any]:
+    """Analyze text characteristics and statistics."""
+    try:
+        analysis = preprocessor.analyze_text(data.text)
+        return {
+            "text": data.text,
+            "analysis": analysis
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
+
+
+@app.get("/model-info")
+def get_model_info() -> Dict[str, Any]:
+    """Get information about loaded models."""
+    try:
+        classifier_info = classifier.get_model_info()
+        return {
+            "classifier": classifier_info,
+            "summarizer": {
+                "vectorizer_loaded": hasattr(summarizer, 'vectorizer'),
+                "model_type": "TF-IDF based summarization"
+            }
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to get model info: {str(e)}")
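Once the container from the Dockerfile above is running, these routes can be exercised with any HTTP client; uvicorn listens on port 7860 per the CMD. A minimal sketch using the requests library, assuming a local instance and illustrative Arabic input strings (not part of the commit):

import requests

BASE_URL = "http://localhost:7860"  # port taken from the Dockerfile CMD; adjust if deployed elsewhere

# Classify a single text
resp = requests.post(f"{BASE_URL}/classify", json={"text": "نص عربي للتجربة"})
body = resp.json()
print(body["prediction"], body["confidence"])

# Summarize, asking for a 2-sentence summary
resp = requests.post(
    f"{BASE_URL}/summarize",
    json={"text": "الجملة الأولى. الجملة الثانية أطول قليلاً. الجملة الثالثة. جملة رابعة.", "num_sentences": 2},
)
print(resp.json()["summary"])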
classifier.py
ADDED
@@ -0,0 +1,160 @@
+import joblib
+import numpy as np
+from typing import List, Dict, Any
+from preprocessor import preprocess_for_classification
+
+
+class ArabicClassifier:
+    """Arabic text classifier with probability distributions and metadata."""
+
+    def __init__(self,
+                 classifier_path: str = "svm_classifier.joblib",
+                 vectorizer_path: str = "tfidf_vectorizer_classifier.joblib"):
+        self.model = joblib.load(classifier_path)
+        self.vectorizer = joblib.load(vectorizer_path)
+        self.model_name = classifier_path.split("/")[-1].replace(".joblib", "")
+
+    def predict(self, text: str) -> Dict[str, Any]:
+        """Predict class with full probability distribution and metadata."""
+        cleaned_text = preprocess_for_classification(text)
+
+        if self.vectorizer:
+            text_vector = self.vectorizer.transform([cleaned_text])
+        else:
+            text_vector = [cleaned_text]
+
+        prediction = self.model.predict(text_vector)[0]
+
+        classes = getattr(self.model, 'classes_', None)
+        if classes is not None:
+            prediction_index = int(np.where(classes == prediction)[0][0])
+        else:
+            prediction_index = int(prediction) if isinstance(prediction, (int, np.integer)) else 0
+
+        if hasattr(self.model, 'predict_proba'):
+            probabilities = self.model.predict_proba(text_vector)[0]
+            confidence = float(probabilities[prediction_index])
+        else:
+            if hasattr(self.model, 'decision_function'):
+                decision_scores = self.model.decision_function(text_vector)[0]
+                if len(decision_scores.shape) == 0:
+                    probabilities = np.array([1 / (1 + np.exp(decision_scores)), 1 / (1 + np.exp(-decision_scores))])
+                else:
+                    exp_scores = np.exp(decision_scores - np.max(decision_scores))
+                    probabilities = exp_scores / np.sum(exp_scores)
+                confidence = float(probabilities[prediction_index])
+            else:
+                classes = getattr(self.model, 'classes_', None)
+                num_classes = len(classes) if classes is not None else 2
+                probabilities = np.zeros(num_classes)
+                probabilities[prediction_index] = 1.0
+                confidence = 1.0
+
+        classes = getattr(self.model, 'classes_', None)
+
+        prob_distribution = {}
+        if classes is not None:
+            for i, class_label in enumerate(classes):
+                prob_distribution[str(class_label)] = float(probabilities[i])
+        else:
+            for i, prob in enumerate(probabilities):
+                prob_distribution[f"class_{i}"] = float(prob)
+
+        return {
+            "prediction": str(prediction),
+            "prediction_label": str(prediction),
+            "prediction_index": int(prediction_index),
+            "confidence": confidence,
+            "probability_distribution": prob_distribution,
+            "all_probabilities": probabilities.tolist(),
+            "cleaned_text": cleaned_text,
+            "model_used": self.model_name,
+            "prediction_metadata": {
+                "max_probability": float(np.max(probabilities)),
+                "min_probability": float(np.min(probabilities)),
+                "entropy": float(-np.sum(probabilities * np.log(probabilities + 1e-10))),
+                "num_classes": len(probabilities)
+            }
+        }
+
+    def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
+        """Predict classes for multiple texts."""
+        cleaned_texts = [preprocess_for_classification(text) for text in texts]
+
+        if self.vectorizer:
+            text_vectors = self.vectorizer.transform(cleaned_texts)
+        else:
+            text_vectors = cleaned_texts
+
+        predictions = self.model.predict(text_vectors)
+        classes = getattr(self.model, 'classes_', None)
+
+        prediction_indices = []
+        for pred in predictions:
+            if classes is not None:
+                pred_index = int(np.where(classes == pred)[0][0])
+            else:
+                pred_index = int(pred) if isinstance(pred, (int, np.integer)) else 0
+            prediction_indices.append(pred_index)
+
+        if hasattr(self.model, 'predict_proba'):
+            probabilities = self.model.predict_proba(text_vectors)
+        else:
+            if hasattr(self.model, 'decision_function'):
+                decision_scores = self.model.decision_function(text_vectors)
+                if len(decision_scores.shape) == 1:
+                    probabilities = np.column_stack([1 / (1 + np.exp(decision_scores)), 1 / (1 + np.exp(-decision_scores))])
+                else:
+                    exp_scores = np.exp(decision_scores - np.max(decision_scores, axis=1, keepdims=True))
+                    probabilities = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+            else:
+                classes = getattr(self.model, 'classes_', None)
+                num_classes = len(classes) if classes is not None else 2
+                probabilities = np.zeros((len(predictions), num_classes))
+                for i, pred_idx in enumerate(prediction_indices):
+                    probabilities[i, pred_idx] = 1.0
+
+        results = []
+
+        for i, (pred, pred_idx) in enumerate(zip(predictions, prediction_indices)):
+            confidence = float(probabilities[i][pred_idx])
+
+            prob_distribution = {}
+            if classes is not None:
+                for j, class_label in enumerate(classes):
+                    prob_distribution[str(class_label)] = float(probabilities[i][j])
+            else:
+                for j, prob in enumerate(probabilities[i]):
+                    prob_distribution[f"class_{j}"] = float(prob)
+
+            results.append({
+                "prediction": str(pred),
+                "prediction_label": str(pred),
+                "prediction_index": int(pred_idx),
+                "confidence": confidence,
+                "probability_distribution": prob_distribution,
+                "all_probabilities": probabilities[i].tolist(),
+                "cleaned_text": cleaned_texts[i],
+                "model_used": self.model_name,
+                "prediction_metadata": {
+                    "max_probability": float(np.max(probabilities[i])),
+                    "min_probability": float(np.min(probabilities[i])),
+                    "entropy": float(-np.sum(probabilities[i] * np.log(probabilities[i] + 1e-10))),
+                    "num_classes": len(probabilities[i])
+                }
+            })
+
+        return results
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get model information and capabilities."""
+        classes = getattr(self.model, 'classes_', None)
+        return {
+            "model_name": self.model_name,
+            "model_type": type(self.model).__name__,
+            "num_classes": len(classes) if classes is not None else "unknown",
+            "classes": classes.tolist() if classes is not None else None,
+            "has_predict_proba": hasattr(self.model, 'predict_proba'),
+            "has_vectorizer": self.vectorizer is not None,
+            "vectorizer_type": type(self.vectorizer).__name__ if self.vectorizer else None
+        }
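For reference, a minimal sketch of using the class directly (outside FastAPI), assuming the two .joblib artifacts committed here sit in the working directory; the Arabic inputs are illustrative:

from classifier import ArabicClassifier

clf = ArabicClassifier("svm_classifier.joblib", "tfidf_vectorizer_classifier.joblib")

# Single prediction with confidence and full probability distribution
result = clf.predict("نص عربي قصير للتصنيف")  # illustrative input
print(result["prediction"], result["confidence"])
print(result["probability_distribution"])

# Batch prediction returns one result dict per input text
batch = clf.predict_batch(["النص الأول", "النص الثاني"])
print(len(batch), batch[0]["prediction"])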
preprocessor.py
ADDED
@@ -0,0 +1,158 @@
+import re
+from nltk.corpus import stopwords
+from nltk.stem.isri import ISRIStemmer
+
+arabic_stopwords = set(stopwords.words("arabic"))
+stemmer = ISRIStemmer()
+
+char_map = str.maketrans(
+    {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
+)
+
+diacritics_pattern = re.compile(r"[\u064B-\u0652]")
+punctuation_pattern = re.compile(r"[^\w\s]")
+whitespace_pattern = re.compile(r"\s+")
+repeated_char_pattern = re.compile(r"(.)\1+")
+
+
+def normalize_arabic(text: str) -> str:
+    """Normalize Arabic characters."""
+    return text.translate(char_map)
+
+
+def remove_diacritics(text: str) -> str:
+    """Remove Arabic diacritics."""
+    return diacritics_pattern.sub("", text)
+
+
+def remove_punctuation(text: str) -> str:
+    """Remove punctuation marks."""
+    return punctuation_pattern.sub(" ", text)
+
+
+def reduce_repeated_characters(text: str) -> str:
+    """Reduce repeated characters to single occurrence."""
+    return repeated_char_pattern.sub(r"\1", text)
+
+
+def remove_stopwords(tokens: list[str]) -> list[str]:
+    """Remove Arabic stopwords from tokens."""
+    return [word for word in tokens if word not in arabic_stopwords]
+
+
+def stem_tokens(tokens: list[str]) -> list[str]:
+    """Apply ISRI stemming to tokens."""
+    return [stemmer.stem(token) for token in tokens]
+
+
+def preprocess_for_classification(text: str) -> str:
+    """Preprocess text for classification: normalize, clean, tokenize, stem."""
+    text = text.strip().lower()
+    text = normalize_arabic(text)
+    text = remove_diacritics(text)
+    text = remove_punctuation(text)
+    text = reduce_repeated_characters(text)
+    text = whitespace_pattern.sub(" ", text).strip()
+    text = re.sub(r"\d+", "", text)
+    tokens = text.split()
+    tokens = remove_stopwords(tokens)
+    tokens = stem_tokens(tokens)
+    return " ".join(tokens)
+
+
+def preprocess_for_summarization(text: str) -> str:
+    """Light preprocessing for summarization: remove diacritics and numbers."""
+    if not isinstance(text, str):
+        return ""
+    text = text.strip().lower()
+    text = remove_diacritics(text)
+    text = whitespace_pattern.sub(" ", text).strip()
+    return re.sub(r"\d+", "", text)
+
+
+class ArabicPreprocessor:
+    """Arabic text preprocessor with analysis capabilities."""
+
+    def __init__(self):
+        self.arabic_stopwords = arabic_stopwords
+        self.stemmer = stemmer
+        self.char_map = char_map
+
+    def preprocess_for_classification(self, text: str) -> str:
+        """Preprocess text for classification."""
+        return preprocess_for_classification(text)
+
+    def preprocess_for_summarization(self, text: str) -> str:
+        """Preprocess text for summarization."""
+        return preprocess_for_summarization(text)
+
+    def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
+        """Get detailed preprocessing steps for analysis."""
+        steps = {
+            "original": text,
+            "stripped_lowered": text.strip().lower(),
+        }
+
+        current = text.strip().lower()
+
+        if task_type == "classification":
+            steps["normalized"] = normalize_arabic(current)
+            current = normalize_arabic(current)
+
+            steps["diacritics_removed"] = remove_diacritics(current)
+            current = remove_diacritics(current)
+
+            steps["punctuation_removed"] = remove_punctuation(current)
+            current = remove_punctuation(current)
+
+            steps["repeated_chars_reduced"] = reduce_repeated_characters(current)
+            current = reduce_repeated_characters(current)
+
+            steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
+            current = whitespace_pattern.sub(" ", current).strip()
+
+            steps["numbers_removed"] = re.sub(r"\d+", "", current)
+            current = re.sub(r"\d+", "", current)
+
+            tokens = current.split()
+            steps["tokenized"] = tokens
+
+            tokens_no_stop = remove_stopwords(tokens)
+            steps["stopwords_removed"] = tokens_no_stop
+
+            stemmed_tokens = stem_tokens(tokens_no_stop)
+            steps["stemmed"] = stemmed_tokens
+
+            steps["final"] = " ".join(stemmed_tokens)
+
+        elif task_type == "summarization":
+            steps["diacritics_removed"] = remove_diacritics(current)
+            current = remove_diacritics(current)
+
+            steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
+            current = whitespace_pattern.sub(" ", current).strip()
+
+            steps["numbers_removed"] = re.sub(r"\d+", "", current)
+            steps["final"] = re.sub(r"\d+", "", current)
+
+        return steps
+
+    def analyze_text(self, text: str) -> dict:
+        """Analyze text characteristics and statistics."""
+        original_sentences = re.split(r"[.!؟\n]+", text)
+        original_sentences = [s.strip() for s in original_sentences if s.strip()]
+
+        tokens = text.split()
+        arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
+
+        return {
+            "character_count": len(text),
+            "word_count": len(tokens),
+            "sentence_count": len(original_sentences),
+            "arabic_character_count": arabic_chars,
+            "arabic_character_ratio": arabic_chars / len(text) if len(text) > 0 else 0,
+            "average_word_length": sum(len(word) for word in tokens) / len(tokens) if tokens else 0,
+            "average_sentence_length": len(tokens) / len(original_sentences) if original_sentences else 0,
+            "has_diacritics": bool(re.search(r'[\u064B-\u0652]', text)),
+            "punctuation_count": len(re.findall(r'[^\w\s]', text))
+        }
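A small sketch of the module-level pipeline and the ArabicPreprocessor helpers, assuming the NLTK Arabic stopwords corpus is available (the Dockerfile downloads it); input strings are illustrative:

from preprocessor import preprocess_for_classification, ArabicPreprocessor

# Full classification pipeline: normalize, strip diacritics/punctuation/numbers, drop stopwords, stem
cleaned = preprocess_for_classification("هذا نصٌ تجريبيّ، فيه تشكيلٌ وعلامات ترقيم!")
print(cleaned)

pre = ArabicPreprocessor()

# Step-by-step breakdown, as returned by the /preprocess endpoint
steps = pre.get_preprocessing_steps("نص آخر للتجربة.", task_type="classification")
print(steps["tokenized"], steps["final"])

# Text statistics, as returned by /text-analysis
stats = pre.analyze_text("جملة أولى. جملة ثانية!")
print(stats["sentence_count"], stats["word_count"])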
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+fastapi
+uvicorn
+scikit-learn
+nltk
+joblib
+numpy
summarizer.py
ADDED
@@ -0,0 +1,78 @@
+import re
+import numpy as np
+import joblib
+from typing import Dict, Any
+from preprocessor import preprocess_for_summarization
+
+
+class ArabicSummarizer:
+    """Arabic text summarizer using TF-IDF scoring."""
+
+    def __init__(self, vectorizer_path: str = "tfidf_vectorizer_text_summarization.joblib"):
+        self.vectorizer = joblib.load(vectorizer_path)
+
+    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
+        """Summarize text by selecting top-scored sentences."""
+        cleaned_text = preprocess_for_summarization(text)
+
+        sentences = re.split(r"[.!؟\n]+", cleaned_text)
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        if len(sentences) <= num_sentences:
+            return {
+                "summary": text,
+                "original_sentence_count": int(len(sentences)),
+                "summary_sentence_count": int(len(sentences)),
+                "sentences": sentences,
+                "selected_indices": list(range(len(sentences))),
+                "sentence_scores": None
+            }
+
+        tfidf_matrix = self.vectorizer.transform(sentences)
+        sentence_scores = tfidf_matrix.sum(axis=1).A1
+
+        top_indices = np.argsort(sentence_scores)[-num_sentences:][::-1]
+        top_sentences = [sentences[i] for i in sorted(top_indices)]
+
+        return {
+            "summary": " ".join(top_sentences),
+            "original_sentence_count": int(len(sentences)),
+            "summary_sentence_count": int(len(top_sentences)),
+            "sentences": sentences,
+            "selected_indices": [int(i) for i in sorted(top_indices)],
+            "sentence_scores": sentence_scores.tolist(),
+            "top_sentence_scores": [float(sentence_scores[i]) for i in sorted(top_indices)]
+        }
+
+    def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
+        """Get detailed analysis of all sentences with scores and rankings."""
+        cleaned_text = preprocess_for_summarization(text)
+
+        sentences = re.split(r"[.!؟\n]+", cleaned_text)
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        if not sentences:
+            return {"error": "No sentences found in text"}
+
+        tfidf_matrix = self.vectorizer.transform(sentences)
+        sentence_scores = tfidf_matrix.sum(axis=1).A1
+
+        sentence_analysis = []
+        for i, (sentence, score) in enumerate(zip(sentences, sentence_scores)):
+            sentence_analysis.append({
+                "index": int(i),
+                "sentence": sentence,
+                "score": float(score),
+                "rank": int(np.argsort(sentence_scores)[::-1].tolist().index(i) + 1)
+            })
+
+        return {
+            "sentences": sentence_analysis,
+            "total_sentences": int(len(sentences)),
+            "score_statistics": {
+                "mean": float(np.mean(sentence_scores)),
+                "std": float(np.std(sentence_scores)),
+                "min": float(np.min(sentence_scores)),
+                "max": float(np.max(sentence_scores))
+            }
+        }
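And a sketch of driving the summarizer directly, assuming the summarization vectorizer committed here is present locally; the sample text is illustrative:

from summarizer import ArabicSummarizer

summ = ArabicSummarizer("tfidf_vectorizer_text_summarization.joblib")

text = "الجملة الأولى هنا. الجملة الثانية أطول قليلاً من غيرها. الجملة الثالثة. جملة رابعة أخيرة."

# Extractive summary: the two highest-scoring sentences, kept in original order
result = summ.summarize(text, num_sentences=2)
print(result["summary"])
print(result["selected_indices"], result["top_sentence_scores"])

# Per-sentence scores and ranks, as returned by /sentence-analysis
analysis = summ.get_sentence_analysis(text)
for item in analysis["sentences"]:
    print(item["rank"], item["score"], item["sentence"])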
svm_classifier.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c094fd8389cc0b8cb32b94c2ef4b204fd699d70cb852a311018f435a0b71cbf5
+size 10423139

tfidf_vectorizer_classifier.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5e15bed99d5c6817645995e69cbad7438e337854afa738086458a610d47e4d0
+size 753983

tfidf_vectorizer_text_summarization.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:898e94b15ac59c19cf97d56ca7023c85079617cc3258a031fa107a639dac580b
+size 6389421