moabos committed · Commit 354c6a0 · Parent(s): 909c729
chore: setup fastapi with initial routes and hook up traditional models (phase 1) with preprocessing
Files changed:
- Dockerfile +15 -0
- README.md +1 -1
- app.py +162 -0
- classifier.py +160 -0
- preprocessor.py +158 -0
- requirements.txt +6 -0
- summarizer.py +78 -0
- svm_classifier.joblib +3 -0
- tfidf_vectorizer_classifier.joblib +3 -0
- tfidf_vectorizer_text_summarization.joblib +3 -0
Dockerfile
ADDED
@@ -0,0 +1,15 @@
+FROM python:3.13-slim
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN python -m nltk.downloader stopwords
+
+
+COPY --chown=user . /app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Arabic Summarizer Classifier
-emoji:
+emoji: 📰
 colorFrom: green
 colorTo: green
 sdk: docker
app.py
ADDED
@@ -0,0 +1,162 @@
+from typing import Optional, List, Dict, Any
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+from classifier import ArabicClassifier
+from summarizer import ArabicSummarizer
+from preprocessor import ArabicPreprocessor
+
+app = FastAPI(
+    title="Arabic Text Analysis API",
+    description="API for Arabic text classification, summarization, and preprocessing",
+    version="1.0.0"
+)
+
+classifier = ArabicClassifier("svm_classifier.joblib", "tfidf_vectorizer_classifier.joblib")
+summarizer = ArabicSummarizer("tfidf_vectorizer_text_summarization.joblib")
+preprocessor = ArabicPreprocessor()
+
+
+class TextInput(BaseModel):
+    text: str
+
+
+class TextInputWithSentences(BaseModel):
+    text: str
+    num_sentences: Optional[int] = 3
+
+
+class BatchTextInput(BaseModel):
+    texts: List[str]
+
+
+class PreprocessingInput(BaseModel):
+    text: str
+    task_type: Optional[str] = "classification"
+
+
+@app.get("/")
+def read_root() -> Dict[str, Any]:
+    """API welcome message and endpoint documentation."""
+    return {
+        "message": "Welcome to the Arabic Text Analysis API!",
+        "documentation": {
+            "interactive_docs": "/docs",
+            "redoc": "/redoc",
+            "openapi_schema": "/openapi.json"
+        },
+        "endpoints": {
+            "classify": "POST /classify - Classify Arabic text",
+            "classify_batch": "POST /classify/batch - Classify multiple texts",
+            "summarize": "POST /summarize - Summarize Arabic text",
+            "analyze": "POST /analyze - Both classify and summarize",
+            "preprocess": "POST /preprocess - Preprocess text with detailed steps",
+            "text_analysis": "POST /text-analysis - Analyze text characteristics",
+            "sentence_analysis": "POST /sentence-analysis - Detailed sentence analysis",
+            "model_info": "GET /model-info - Get model information"
+        }
+    }
+
+
+@app.post("/classify")
+def classify_text(data: TextInput) -> Dict[str, Any]:
+    """Classify Arabic text with probability distribution and metadata."""
+    try:
+        result = classifier.predict(data.text)
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
+
+
+@app.post("/classify/batch")
+def classify_texts(data: BatchTextInput) -> Dict[str, Any]:
+    """Classify multiple Arabic texts in batch."""
+    try:
+        results = classifier.predict_batch(data.texts)
+        return {
+            "results": results,
+            "total_texts": len(data.texts),
+            "model_used": classifier.model_name
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Batch classification failed: {str(e)}")
+
+
+@app.post("/summarize")
+def summarize_text(data: TextInputWithSentences) -> Dict[str, Any]:
+    """Summarize Arabic text with sentence analysis."""
+    try:
+        result = summarizer.summarize(data.text, data.num_sentences)
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
+
+
+@app.post("/sentence-analysis")
+def analyze_sentences(data: TextInput) -> Dict[str, Any]:
+    """Analyze all sentences with scores and rankings."""
+    try:
+        result = summarizer.get_sentence_analysis(data.text)
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Sentence analysis failed: {str(e)}")
+
+
+@app.post("/analyze")
+def analyze_text_complete(data: TextInputWithSentences) -> Dict[str, Any]:
+    """Complete analysis: classification, summarization, and text statistics."""
+    try:
+        classification_result = classifier.predict(data.text)
+        summarization_result = summarizer.summarize(data.text, data.num_sentences)
+        text_stats = preprocessor.analyze_text(data.text)
+
+        return {
+            "original_text": data.text,
+            "text_analysis": text_stats,
+            "classification": classification_result,
+            "summarization": summarization_result
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Complete analysis failed: {str(e)}")
+
+
+@app.post("/preprocess")
+def preprocess_text(data: PreprocessingInput) -> Dict[str, Any]:
+    """Preprocess text with step-by-step breakdown."""
+    try:
+        steps = preprocessor.get_preprocessing_steps(data.text, data.task_type)
+        return {
+            "task_type": data.task_type,
+            "preprocessing_steps": steps
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
+
+
+@app.post("/text-analysis")
+def analyze_text_characteristics(data: TextInput) -> Dict[str, Any]:
+    """Analyze text characteristics and statistics."""
+    try:
+        analysis = preprocessor.analyze_text(data.text)
+        return {
+            "text": data.text,
+            "analysis": analysis
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
+
+
+@app.get("/model-info")
+def get_model_info() -> Dict[str, Any]:
+    """Get information about loaded models."""
+    try:
+        classifier_info = classifier.get_model_info()
+        return {
+            "classifier": classifier_info,
+            "summarizer": {
+                "vectorizer_loaded": hasattr(summarizer, 'vectorizer'),
+                "model_type": "TF-IDF based summarization"
+            }
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to get model info: {str(e)}")
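Once the container from the Dockerfile above is running, these routes can be exercised with any HTTP client; uvicorn listens on port 7860 per the CMD. A minimal sketch using the requests library, assuming a local instance and illustrative Arabic input strings (not part of the commit):

import requests

BASE_URL = "http://localhost:7860"  # port taken from the Dockerfile CMD; adjust if deployed elsewhere

# Classify a single text
resp = requests.post(f"{BASE_URL}/classify", json={"text": "نص عربي للتجربة"})
body = resp.json()
print(body["prediction"], body["confidence"])

# Summarize, asking for a 2-sentence summary
resp = requests.post(
    f"{BASE_URL}/summarize",
    json={"text": "الجملة الأولى. الجملة الثانية أطول قليلاً. الجملة الثالثة. جملة رابعة.", "num_sentences": 2},
)
print(resp.json()["summary"])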
classifier.py
ADDED
@@ -0,0 +1,160 @@
+import joblib
+import numpy as np
+from typing import List, Dict, Any
+from preprocessor import preprocess_for_classification
+
+
+class ArabicClassifier:
+    """Arabic text classifier with probability distributions and metadata."""
+
+    def __init__(self,
+                 classifier_path: str = "svm_classifier.joblib",
+                 vectorizer_path: str = "tfidf_vectorizer_classifier.joblib"):
+        self.model = joblib.load(classifier_path)
+        self.vectorizer = joblib.load(vectorizer_path)
+        self.model_name = classifier_path.split("/")[-1].replace(".joblib", "")
+
+    def predict(self, text: str) -> Dict[str, Any]:
+        """Predict class with full probability distribution and metadata."""
+        cleaned_text = preprocess_for_classification(text)
+
+        if self.vectorizer:
+            text_vector = self.vectorizer.transform([cleaned_text])
+        else:
+            text_vector = [cleaned_text]
+
+        prediction = self.model.predict(text_vector)[0]
+
+        classes = getattr(self.model, 'classes_', None)
+        if classes is not None:
+            prediction_index = int(np.where(classes == prediction)[0][0])
+        else:
+            prediction_index = int(prediction) if isinstance(prediction, (int, np.integer)) else 0
+
+        if hasattr(self.model, 'predict_proba'):
+            probabilities = self.model.predict_proba(text_vector)[0]
+            confidence = float(probabilities[prediction_index])
+        else:
+            if hasattr(self.model, 'decision_function'):
+                decision_scores = self.model.decision_function(text_vector)[0]
+                if len(decision_scores.shape) == 0:
+                    probabilities = np.array([1 / (1 + np.exp(decision_scores)), 1 / (1 + np.exp(-decision_scores))])
+                else:
+                    exp_scores = np.exp(decision_scores - np.max(decision_scores))
+                    probabilities = exp_scores / np.sum(exp_scores)
+                confidence = float(probabilities[prediction_index])
+            else:
+                classes = getattr(self.model, 'classes_', None)
+                num_classes = len(classes) if classes is not None else 2
+                probabilities = np.zeros(num_classes)
+                probabilities[prediction_index] = 1.0
+                confidence = 1.0
+
+        classes = getattr(self.model, 'classes_', None)
+
+        prob_distribution = {}
+        if classes is not None:
+            for i, class_label in enumerate(classes):
+                prob_distribution[str(class_label)] = float(probabilities[i])
+        else:
+            for i, prob in enumerate(probabilities):
+                prob_distribution[f"class_{i}"] = float(prob)
+
+        return {
+            "prediction": str(prediction),
+            "prediction_label": str(prediction),
+            "prediction_index": int(prediction_index),
+            "confidence": confidence,
+            "probability_distribution": prob_distribution,
+            "all_probabilities": probabilities.tolist(),
+            "cleaned_text": cleaned_text,
+            "model_used": self.model_name,
+            "prediction_metadata": {
+                "max_probability": float(np.max(probabilities)),
+                "min_probability": float(np.min(probabilities)),
+                "entropy": float(-np.sum(probabilities * np.log(probabilities + 1e-10))),
+                "num_classes": len(probabilities)
+            }
+        }
+
+    def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
+        """Predict classes for multiple texts."""
+        cleaned_texts = [preprocess_for_classification(text) for text in texts]
+
+        if self.vectorizer:
+            text_vectors = self.vectorizer.transform(cleaned_texts)
+        else:
+            text_vectors = cleaned_texts
+
+        predictions = self.model.predict(text_vectors)
+        classes = getattr(self.model, 'classes_', None)
+
+        prediction_indices = []
+        for pred in predictions:
+            if classes is not None:
+                pred_index = int(np.where(classes == pred)[0][0])
+            else:
+                pred_index = int(pred) if isinstance(pred, (int, np.integer)) else 0
+            prediction_indices.append(pred_index)
+
+        if hasattr(self.model, 'predict_proba'):
+            probabilities = self.model.predict_proba(text_vectors)
+        else:
+            if hasattr(self.model, 'decision_function'):
+                decision_scores = self.model.decision_function(text_vectors)
+                if len(decision_scores.shape) == 1:
+                    probabilities = np.column_stack([1 / (1 + np.exp(decision_scores)), 1 / (1 + np.exp(-decision_scores))])
+                else:
+                    exp_scores = np.exp(decision_scores - np.max(decision_scores, axis=1, keepdims=True))
+                    probabilities = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+            else:
+                classes = getattr(self.model, 'classes_', None)
+                num_classes = len(classes) if classes is not None else 2
+                probabilities = np.zeros((len(predictions), num_classes))
+                for i, pred_idx in enumerate(prediction_indices):
+                    probabilities[i, pred_idx] = 1.0
+
+        results = []
+
+        for i, (pred, pred_idx) in enumerate(zip(predictions, prediction_indices)):
+            confidence = float(probabilities[i][pred_idx])
+
+            prob_distribution = {}
+            if classes is not None:
+                for j, class_label in enumerate(classes):
+                    prob_distribution[str(class_label)] = float(probabilities[i][j])
+            else:
+                for j, prob in enumerate(probabilities[i]):
+                    prob_distribution[f"class_{j}"] = float(prob)
+
+            results.append({
+                "prediction": str(pred),
+                "prediction_label": str(pred),
+                "prediction_index": int(pred_idx),
+                "confidence": confidence,
+                "probability_distribution": prob_distribution,
+                "all_probabilities": probabilities[i].tolist(),
+                "cleaned_text": cleaned_texts[i],
+                "model_used": self.model_name,
+                "prediction_metadata": {
+                    "max_probability": float(np.max(probabilities[i])),
+                    "min_probability": float(np.min(probabilities[i])),
+                    "entropy": float(-np.sum(probabilities[i] * np.log(probabilities[i] + 1e-10))),
+                    "num_classes": len(probabilities[i])
+                }
+            })
+
+        return results
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get model information and capabilities."""
+        classes = getattr(self.model, 'classes_', None)
+        return {
+            "model_name": self.model_name,
+            "model_type": type(self.model).__name__,
+            "num_classes": len(classes) if classes is not None else "unknown",
+            "classes": classes.tolist() if classes is not None else None,
+            "has_predict_proba": hasattr(self.model, 'predict_proba'),
+            "has_vectorizer": self.vectorizer is not None,
+            "vectorizer_type": type(self.vectorizer).__name__ if self.vectorizer else None
+        }
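For reference, a minimal sketch of using the class directly (outside FastAPI), assuming the two .joblib artifacts committed here sit in the working directory; the Arabic inputs are illustrative:

from classifier import ArabicClassifier

clf = ArabicClassifier("svm_classifier.joblib", "tfidf_vectorizer_classifier.joblib")

# Single prediction with confidence and full probability distribution
result = clf.predict("نص عربي قصير للتصنيف")  # illustrative input
print(result["prediction"], result["confidence"])
print(result["probability_distribution"])

# Batch prediction returns one result dict per input text
batch = clf.predict_batch(["النص الأول", "النص الثاني"])
print(len(batch), batch[0]["prediction"])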
preprocessor.py
ADDED
@@ -0,0 +1,158 @@
+import re
+from nltk.corpus import stopwords
+from nltk.stem.isri import ISRIStemmer
+
+arabic_stopwords = set(stopwords.words("arabic"))
+stemmer = ISRIStemmer()
+
+char_map = str.maketrans(
+    {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
+)
+
+diacritics_pattern = re.compile(r"[\u064B-\u0652]")
+punctuation_pattern = re.compile(r"[^\w\s]")
+whitespace_pattern = re.compile(r"\s+")
+repeated_char_pattern = re.compile(r"(.)\1+")
+
+
+def normalize_arabic(text: str) -> str:
+    """Normalize Arabic characters."""
+    return text.translate(char_map)
+
+
+def remove_diacritics(text: str) -> str:
+    """Remove Arabic diacritics."""
+    return diacritics_pattern.sub("", text)
+
+
+def remove_punctuation(text: str) -> str:
+    """Remove punctuation marks."""
+    return punctuation_pattern.sub(" ", text)
+
+
+def reduce_repeated_characters(text: str) -> str:
+    """Reduce repeated characters to single occurrence."""
+    return repeated_char_pattern.sub(r"\1", text)
+
+
+def remove_stopwords(tokens: list[str]) -> list[str]:
+    """Remove Arabic stopwords from tokens."""
+    return [word for word in tokens if word not in arabic_stopwords]
+
+
+def stem_tokens(tokens: list[str]) -> list[str]:
+    """Apply ISRI stemming to tokens."""
+    return [stemmer.stem(token) for token in tokens]
+
+
+def preprocess_for_classification(text: str) -> str:
+    """Preprocess text for classification: normalize, clean, tokenize, stem."""
+    text = text.strip().lower()
+    text = normalize_arabic(text)
+    text = remove_diacritics(text)
+    text = remove_punctuation(text)
+    text = reduce_repeated_characters(text)
+    text = whitespace_pattern.sub(" ", text).strip()
+    text = re.sub(r"\d+", "", text)
+    tokens = text.split()
+    tokens = remove_stopwords(tokens)
+    tokens = stem_tokens(tokens)
+    return " ".join(tokens)
+
+
+def preprocess_for_summarization(text: str) -> str:
+    """Light preprocessing for summarization: remove diacritics and numbers."""
+    if not isinstance(text, str):
+        return ""
+    text = text.strip().lower()
+    text = remove_diacritics(text)
+    text = whitespace_pattern.sub(" ", text).strip()
+    return re.sub(r"\d+", "", text)
+
+
+class ArabicPreprocessor:
+    """Arabic text preprocessor with analysis capabilities."""
+
+    def __init__(self):
+        self.arabic_stopwords = arabic_stopwords
+        self.stemmer = stemmer
+        self.char_map = char_map
+
+    def preprocess_for_classification(self, text: str) -> str:
+        """Preprocess text for classification."""
+        return preprocess_for_classification(text)
+
+    def preprocess_for_summarization(self, text: str) -> str:
+        """Preprocess text for summarization."""
+        return preprocess_for_summarization(text)
+
+    def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
+        """Get detailed preprocessing steps for analysis."""
+        steps = {
+            "original": text,
+            "stripped_lowered": text.strip().lower(),
+        }
+
+        current = text.strip().lower()
+
+        if task_type == "classification":
+            steps["normalized"] = normalize_arabic(current)
+            current = normalize_arabic(current)
+
+            steps["diacritics_removed"] = remove_diacritics(current)
+            current = remove_diacritics(current)
+
+            steps["punctuation_removed"] = remove_punctuation(current)
+            current = remove_punctuation(current)
+
+            steps["repeated_chars_reduced"] = reduce_repeated_characters(current)
+            current = reduce_repeated_characters(current)
+
+            steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
+            current = whitespace_pattern.sub(" ", current).strip()
+
+            steps["numbers_removed"] = re.sub(r"\d+", "", current)
+            current = re.sub(r"\d+", "", current)
+
+            tokens = current.split()
+            steps["tokenized"] = tokens
+
+            tokens_no_stop = remove_stopwords(tokens)
+            steps["stopwords_removed"] = tokens_no_stop
+
+            stemmed_tokens = stem_tokens(tokens_no_stop)
+            steps["stemmed"] = stemmed_tokens
+
+            steps["final"] = " ".join(stemmed_tokens)
+
+        elif task_type == "summarization":
+            steps["diacritics_removed"] = remove_diacritics(current)
+            current = remove_diacritics(current)
+
+            steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
+            current = whitespace_pattern.sub(" ", current).strip()
+
+            steps["numbers_removed"] = re.sub(r"\d+", "", current)
+            steps["final"] = re.sub(r"\d+", "", current)
+
+        return steps
+
+    def analyze_text(self, text: str) -> dict:
+        """Analyze text characteristics and statistics."""
+        original_sentences = re.split(r"[.!؟\n]+", text)
+        original_sentences = [s.strip() for s in original_sentences if s.strip()]
+
+        tokens = text.split()
+        arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
+
+        return {
+            "character_count": len(text),
+            "word_count": len(tokens),
+            "sentence_count": len(original_sentences),
+            "arabic_character_count": arabic_chars,
+            "arabic_character_ratio": arabic_chars / len(text) if len(text) > 0 else 0,
+            "average_word_length": sum(len(word) for word in tokens) / len(tokens) if tokens else 0,
+            "average_sentence_length": len(tokens) / len(original_sentences) if original_sentences else 0,
+            "has_diacritics": bool(re.search(r'[\u064B-\u0652]', text)),
+            "punctuation_count": len(re.findall(r'[^\w\s]', text))
+        }
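A small sketch of the module-level pipeline and the ArabicPreprocessor helpers, assuming the NLTK Arabic stopwords corpus is available (the Dockerfile downloads it); input strings are illustrative:

from preprocessor import preprocess_for_classification, ArabicPreprocessor

# Full classification pipeline: normalize, strip diacritics/punctuation/numbers, drop stopwords, stem
cleaned = preprocess_for_classification("هذا نصٌ تجريبيّ، فيه تشكيلٌ وعلامات ترقيم!")
print(cleaned)

pre = ArabicPreprocessor()

# Step-by-step breakdown, as returned by the /preprocess endpoint
steps = pre.get_preprocessing_steps("نص آخر للتجربة.", task_type="classification")
print(steps["tokenized"], steps["final"])

# Text statistics, as returned by /text-analysis
stats = pre.analyze_text("جملة أولى. جملة ثانية!")
print(stats["sentence_count"], stats["word_count"])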
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+fastapi
+uvicorn
+scikit-learn
+nltk
+joblib
+numpy
summarizer.py
ADDED
@@ -0,0 +1,78 @@
+import re
+import numpy as np
+import joblib
+from typing import Dict, Any
+from preprocessor import preprocess_for_summarization
+
+
+class ArabicSummarizer:
+    """Arabic text summarizer using TF-IDF scoring."""
+
+    def __init__(self, vectorizer_path: str = "tfidf_vectorizer_text_summarization.joblib"):
+        self.vectorizer = joblib.load(vectorizer_path)
+
+    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
+        """Summarize text by selecting top-scored sentences."""
+        cleaned_text = preprocess_for_summarization(text)
+
+        sentences = re.split(r"[.!؟\n]+", cleaned_text)
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        if len(sentences) <= num_sentences:
+            return {
+                "summary": text,
+                "original_sentence_count": int(len(sentences)),
+                "summary_sentence_count": int(len(sentences)),
+                "sentences": sentences,
+                "selected_indices": list(range(len(sentences))),
+                "sentence_scores": None
+            }
+
+        tfidf_matrix = self.vectorizer.transform(sentences)
+        sentence_scores = tfidf_matrix.sum(axis=1).A1
+
+        top_indices = np.argsort(sentence_scores)[-num_sentences:][::-1]
+        top_sentences = [sentences[i] for i in sorted(top_indices)]
+
+        return {
+            "summary": " ".join(top_sentences),
+            "original_sentence_count": int(len(sentences)),
+            "summary_sentence_count": int(len(top_sentences)),
+            "sentences": sentences,
+            "selected_indices": [int(i) for i in sorted(top_indices)],
+            "sentence_scores": sentence_scores.tolist(),
+            "top_sentence_scores": [float(sentence_scores[i]) for i in sorted(top_indices)]
+        }
+
+    def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
+        """Get detailed analysis of all sentences with scores and rankings."""
+        cleaned_text = preprocess_for_summarization(text)
+
+        sentences = re.split(r"[.!؟\n]+", cleaned_text)
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        if not sentences:
+            return {"error": "No sentences found in text"}
+
+        tfidf_matrix = self.vectorizer.transform(sentences)
+        sentence_scores = tfidf_matrix.sum(axis=1).A1
+
+        sentence_analysis = []
+        for i, (sentence, score) in enumerate(zip(sentences, sentence_scores)):
+            sentence_analysis.append({
+                "index": int(i),
+                "sentence": sentence,
+                "score": float(score),
+                "rank": int(np.argsort(sentence_scores)[::-1].tolist().index(i) + 1)
+            })
+
+        return {
+            "sentences": sentence_analysis,
+            "total_sentences": int(len(sentences)),
+            "score_statistics": {
+                "mean": float(np.mean(sentence_scores)),
+                "std": float(np.std(sentence_scores)),
+                "min": float(np.min(sentence_scores)),
+                "max": float(np.max(sentence_scores))
+            }
+        }
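And a sketch of driving the summarizer directly, assuming the summarization vectorizer committed here is present locally; the sample text is illustrative:

from summarizer import ArabicSummarizer

summ = ArabicSummarizer("tfidf_vectorizer_text_summarization.joblib")

text = "الجملة الأولى هنا. الجملة الثانية أطول قليلاً من غيرها. الجملة الثالثة. جملة رابعة أخيرة."

# Extractive summary: the two highest-scoring sentences, kept in original order
result = summ.summarize(text, num_sentences=2)
print(result["summary"])
print(result["selected_indices"], result["top_sentence_scores"])

# Per-sentence scores and ranks, as returned by /sentence-analysis
analysis = summ.get_sentence_analysis(text)
for item in analysis["sentences"]:
    print(item["rank"], item["score"], item["sentence"])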
svm_classifier.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c094fd8389cc0b8cb32b94c2ef4b204fd699d70cb852a311018f435a0b71cbf5
+size 10423139

tfidf_vectorizer_classifier.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5e15bed99d5c6817645995e69cbad7438e337854afa738086458a610d47e4d0
+size 753983

tfidf_vectorizer_text_summarization.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:898e94b15ac59c19cf97d56ca7023c85079617cc3258a031fa107a639dac580b
+size 6389421