moabos committed
Commit 354c6a0 · 1 Parent(s): 909c729

chore: set up FastAPI with initial routes and hook up traditional models (phase 1) with preprocessing

Dockerfile ADDED
@@ -0,0 +1,15 @@
+ FROM python:3.13-slim
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+ RUN python -m nltk.downloader stopwords
+
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
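The image installs the pinned requirements, downloads the NLTK Arabic stopword list at build time, and serves the FastAPI app with Uvicorn on port 7860. A minimal smoke test, assuming the image has been built and started locally with the port published (e.g. `docker run -p 7860:7860 ...`), might look like this; the script itself is not part of this commit:

```python
# Hypothetical smoke test against a locally running container (not in this commit).
import json
from urllib.request import urlopen

with urlopen("http://localhost:7860/") as resp:    # root endpoint defined in app.py
    payload = json.load(resp)

print(payload["message"])                          # "Welcome to the Arabic Text Analysis API!"
print(list(payload["endpoints"]))                  # routes wired up in this commit
```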
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Arabic Summarizer Classifier
- emoji: 💻
+ emoji: 📰
  colorFrom: green
  colorTo: green
  sdk: docker
app.py ADDED
@@ -0,0 +1,162 @@
+ from typing import Optional, List, Dict, Any
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+
+ from classifier import ArabicClassifier
+ from summarizer import ArabicSummarizer
+ from preprocessor import ArabicPreprocessor
+
+ app = FastAPI(
+     title="Arabic Text Analysis API",
+     description="API for Arabic text classification, summarization, and preprocessing",
+     version="1.0.0"
+ )
+
+ classifier = ArabicClassifier("svm_classifier.joblib", "tfidf_vectorizer_classifier.joblib")
+ summarizer = ArabicSummarizer("tfidf_vectorizer_text_summarization.joblib")
+ preprocessor = ArabicPreprocessor()
+
+
+ class TextInput(BaseModel):
+     text: str
+
+
+ class TextInputWithSentences(BaseModel):
+     text: str
+     num_sentences: Optional[int] = 3
+
+
+ class BatchTextInput(BaseModel):
+     texts: List[str]
+
+
+ class PreprocessingInput(BaseModel):
+     text: str
+     task_type: Optional[str] = "classification"
+
+
+ @app.get("/")
+ def read_root() -> Dict[str, Any]:
+     """API welcome message and endpoint documentation."""
+     return {
+         "message": "Welcome to the Arabic Text Analysis API!",
+         "documentation": {
+             "interactive_docs": "/docs",
+             "redoc": "/redoc",
+             "openapi_schema": "/openapi.json"
+         },
+         "endpoints": {
+             "classify": "POST /classify - Classify Arabic text",
+             "classify_batch": "POST /classify/batch - Classify multiple texts",
+             "summarize": "POST /summarize - Summarize Arabic text",
+             "analyze": "POST /analyze - Both classify and summarize",
+             "preprocess": "POST /preprocess - Preprocess text with detailed steps",
+             "text_analysis": "POST /text-analysis - Analyze text characteristics",
+             "sentence_analysis": "POST /sentence-analysis - Detailed sentence analysis",
+             "model_info": "GET /model-info - Get model information"
+         }
+     }
+
+
+ @app.post("/classify")
+ def classify_text(data: TextInput) -> Dict[str, Any]:
+     """Classify Arabic text with probability distribution and metadata."""
+     try:
+         result = classifier.predict(data.text)
+         return result
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
+
+
+ @app.post("/classify/batch")
+ def classify_texts(data: BatchTextInput) -> Dict[str, Any]:
+     """Classify multiple Arabic texts in batch."""
+     try:
+         results = classifier.predict_batch(data.texts)
+         return {
+             "results": results,
+             "total_texts": len(data.texts),
+             "model_used": classifier.model_name
+         }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Batch classification failed: {str(e)}")
+
+
+ @app.post("/summarize")
+ def summarize_text(data: TextInputWithSentences) -> Dict[str, Any]:
+     """Summarize Arabic text with sentence analysis."""
+     try:
+         result = summarizer.summarize(data.text, data.num_sentences)
+         return result
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
+
+
+ @app.post("/sentence-analysis")
+ def analyze_sentences(data: TextInput) -> Dict[str, Any]:
+     """Analyze all sentences with scores and rankings."""
+     try:
+         result = summarizer.get_sentence_analysis(data.text)
+         return result
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Sentence analysis failed: {str(e)}")
+
+
+ @app.post("/analyze")
+ def analyze_text_complete(data: TextInputWithSentences) -> Dict[str, Any]:
+     """Complete analysis: classification, summarization, and text statistics."""
+     try:
+         classification_result = classifier.predict(data.text)
+         summarization_result = summarizer.summarize(data.text, data.num_sentences)
+         text_stats = preprocessor.analyze_text(data.text)
+
+         return {
+             "original_text": data.text,
+             "text_analysis": text_stats,
+             "classification": classification_result,
+             "summarization": summarization_result
+         }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Complete analysis failed: {str(e)}")
+
+
+ @app.post("/preprocess")
+ def preprocess_text(data: PreprocessingInput) -> Dict[str, Any]:
+     """Preprocess text with step-by-step breakdown."""
+     try:
+         steps = preprocessor.get_preprocessing_steps(data.text, data.task_type)
+         return {
+             "task_type": data.task_type,
+             "preprocessing_steps": steps
+         }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
+
+
+ @app.post("/text-analysis")
+ def analyze_text_characteristics(data: TextInput) -> Dict[str, Any]:
+     """Analyze text characteristics and statistics."""
+     try:
+         analysis = preprocessor.analyze_text(data.text)
+         return {
+             "text": data.text,
+             "analysis": analysis
+         }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
+
+
+ @app.get("/model-info")
+ def get_model_info() -> Dict[str, Any]:
+     """Get information about loaded models."""
+     try:
+         classifier_info = classifier.get_model_info()
+         return {
+             "classifier": classifier_info,
+             "summarizer": {
+                 "vectorizer_loaded": hasattr(summarizer, 'vectorizer'),
+                 "model_type": "TF-IDF based summarization"
+             }
+         }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Failed to get model info: {str(e)}")
classifier.py ADDED
@@ -0,0 +1,160 @@
+ import joblib
+ import numpy as np
+ from typing import List, Dict, Any
+ from preprocessor import preprocess_for_classification
+
+
+ class ArabicClassifier:
+     """Arabic text classifier with probability distributions and metadata."""
+
+     def __init__(self,
+                  classifier_path: str = "svm_classifier.joblib",
+                  vectorizer_path: str = "tfidf_vectorizer_classifier.joblib"):
+         self.model = joblib.load(classifier_path)
+         self.vectorizer = joblib.load(vectorizer_path)
+         self.model_name = classifier_path.split("/")[-1].replace(".joblib", "")
+
+     def predict(self, text: str) -> Dict[str, Any]:
+         """Predict class with full probability distribution and metadata."""
+         cleaned_text = preprocess_for_classification(text)
+
+         if self.vectorizer:
+             text_vector = self.vectorizer.transform([cleaned_text])
+         else:
+             text_vector = [cleaned_text]
+
+         prediction = self.model.predict(text_vector)[0]
+
+         classes = getattr(self.model, 'classes_', None)
+         if classes is not None:
+             prediction_index = int(np.where(classes == prediction)[0][0])
+         else:
+             prediction_index = int(prediction) if isinstance(prediction, (int, np.integer)) else 0
+
+         if hasattr(self.model, 'predict_proba'):
+             probabilities = self.model.predict_proba(text_vector)[0]
+             confidence = float(probabilities[prediction_index])
+         else:
+             if hasattr(self.model, 'decision_function'):
+                 decision_scores = self.model.decision_function(text_vector)[0]
+                 if len(decision_scores.shape) == 0:
+                     probabilities = np.array([1 / (1 + np.exp(decision_scores)), 1 / (1 + np.exp(-decision_scores))])
+                 else:
+                     exp_scores = np.exp(decision_scores - np.max(decision_scores))
+                     probabilities = exp_scores / np.sum(exp_scores)
+                 confidence = float(probabilities[prediction_index])
+             else:
+                 classes = getattr(self.model, 'classes_', None)
+                 num_classes = len(classes) if classes is not None else 2
+                 probabilities = np.zeros(num_classes)
+                 probabilities[prediction_index] = 1.0
+                 confidence = 1.0
+
+         classes = getattr(self.model, 'classes_', None)
+
+         prob_distribution = {}
+         if classes is not None:
+             for i, class_label in enumerate(classes):
+                 prob_distribution[str(class_label)] = float(probabilities[i])
+         else:
+             for i, prob in enumerate(probabilities):
+                 prob_distribution[f"class_{i}"] = float(prob)
+
+         return {
+             "prediction": str(prediction),
+             "prediction_label": str(prediction),
+             "prediction_index": int(prediction_index),
+             "confidence": confidence,
+             "probability_distribution": prob_distribution,
+             "all_probabilities": probabilities.tolist(),
+             "cleaned_text": cleaned_text,
+             "model_used": self.model_name,
+             "prediction_metadata": {
+                 "max_probability": float(np.max(probabilities)),
+                 "min_probability": float(np.min(probabilities)),
+                 "entropy": float(-np.sum(probabilities * np.log(probabilities + 1e-10))),
+                 "num_classes": len(probabilities)
+             }
+         }
+
+     def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
+         """Predict classes for multiple texts."""
+         cleaned_texts = [preprocess_for_classification(text) for text in texts]
+
+         if self.vectorizer:
+             text_vectors = self.vectorizer.transform(cleaned_texts)
+         else:
+             text_vectors = cleaned_texts
+
+         predictions = self.model.predict(text_vectors)
+         classes = getattr(self.model, 'classes_', None)
+
+         prediction_indices = []
+         for pred in predictions:
+             if classes is not None:
+                 pred_index = int(np.where(classes == pred)[0][0])
+             else:
+                 pred_index = int(pred) if isinstance(pred, (int, np.integer)) else 0
+             prediction_indices.append(pred_index)
+
+         if hasattr(self.model, 'predict_proba'):
+             probabilities = self.model.predict_proba(text_vectors)
+         else:
+             if hasattr(self.model, 'decision_function'):
+                 decision_scores = self.model.decision_function(text_vectors)
+                 if len(decision_scores.shape) == 1:
+                     probabilities = np.column_stack([1 / (1 + np.exp(decision_scores)), 1 / (1 + np.exp(-decision_scores))])
+                 else:
+                     exp_scores = np.exp(decision_scores - np.max(decision_scores, axis=1, keepdims=True))
+                     probabilities = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+             else:
+                 classes = getattr(self.model, 'classes_', None)
+                 num_classes = len(classes) if classes is not None else 2
+                 probabilities = np.zeros((len(predictions), num_classes))
+                 for i, pred_idx in enumerate(prediction_indices):
+                     probabilities[i, pred_idx] = 1.0
+
+         results = []
+
+         for i, (pred, pred_idx) in enumerate(zip(predictions, prediction_indices)):
+             confidence = float(probabilities[i][pred_idx])
+
+             prob_distribution = {}
+             if classes is not None:
+                 for j, class_label in enumerate(classes):
+                     prob_distribution[str(class_label)] = float(probabilities[i][j])
+             else:
+                 for j, prob in enumerate(probabilities[i]):
+                     prob_distribution[f"class_{j}"] = float(prob)
+
+             results.append({
+                 "prediction": str(pred),
+                 "prediction_label": str(pred),
+                 "prediction_index": int(pred_idx),
+                 "confidence": confidence,
+                 "probability_distribution": prob_distribution,
+                 "all_probabilities": probabilities[i].tolist(),
+                 "cleaned_text": cleaned_texts[i],
+                 "model_used": self.model_name,
+                 "prediction_metadata": {
+                     "max_probability": float(np.max(probabilities[i])),
+                     "min_probability": float(np.min(probabilities[i])),
+                     "entropy": float(-np.sum(probabilities[i] * np.log(probabilities[i] + 1e-10))),
+                     "num_classes": len(probabilities[i])
+                 }
+             })
+
+         return results
+
+     def get_model_info(self) -> Dict[str, Any]:
+         """Get model information and capabilities."""
+         classes = getattr(self.model, 'classes_', None)
+         return {
+             "model_name": self.model_name,
+             "model_type": type(self.model).__name__,
+             "num_classes": len(classes) if classes is not None else "unknown",
+             "classes": classes.tolist() if classes is not None else None,
+             "has_predict_proba": hasattr(self.model, 'predict_proba'),
+             "has_vectorizer": self.vectorizer is not None,
+             "vectorizer_type": type(self.vectorizer).__name__ if self.vectorizer else None
+         }
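If the saved SVM was trained without probability estimates, `predict` falls back from `predict_proba` to `decision_function` and squashes the margins into pseudo-probabilities (sigmoid in the binary case, softmax otherwise). A minimal, self-contained sketch of that same fallback on made-up data; `LinearSVC` is chosen here only because it exposes `decision_function` but not `predict_proba`:

```python
# Standalone illustration of the decision_function -> pseudo-probability fallback
# used in ArabicClassifier.predict; the corpus and labels are invented.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

docs = ["economy market trade", "football match goal", "budget inflation bank", "league player coach"]
labels = ["economy", "sports", "economy", "sports"]

vec = TfidfVectorizer()
X = vec.fit_transform(docs)
model = LinearSVC().fit(X, labels)

x = vec.transform(["central bank rates"])
score = model.decision_function(x)[0]            # scalar margin for a binary problem

if np.ndim(score) == 0:                          # binary: sigmoid on the margin
    probs = np.array([1 / (1 + np.exp(score)), 1 / (1 + np.exp(-score))])
else:                                            # multiclass: softmax over margins
    exp = np.exp(score - np.max(score))
    probs = exp / exp.sum()

pred_idx = int(np.where(model.classes_ == model.predict(x)[0])[0][0])
print(dict(zip(model.classes_, probs.round(3))), "confidence:", round(float(probs[pred_idx]), 3))
```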
preprocessor.py ADDED
@@ -0,0 +1,158 @@
+ import re
+ from nltk.corpus import stopwords
+ from nltk.stem.isri import ISRIStemmer
+
+ arabic_stopwords = set(stopwords.words("arabic"))
+ stemmer = ISRIStemmer()
+
+ char_map = str.maketrans(
+     {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
+ )
+
+ diacritics_pattern = re.compile(r"[\u064B-\u0652]")
+ punctuation_pattern = re.compile(r"[^\w\s]")
+ whitespace_pattern = re.compile(r"\s+")
+ repeated_char_pattern = re.compile(r"(.)\1+")
+
+
+ def normalize_arabic(text: str) -> str:
+     """Normalize Arabic characters."""
+     return text.translate(char_map)
+
+
+ def remove_diacritics(text: str) -> str:
+     """Remove Arabic diacritics."""
+     return diacritics_pattern.sub("", text)
+
+
+ def remove_punctuation(text: str) -> str:
+     """Remove punctuation marks."""
+     return punctuation_pattern.sub(" ", text)
+
+
+ def reduce_repeated_characters(text: str) -> str:
+     """Reduce repeated characters to single occurrence."""
+     return repeated_char_pattern.sub(r"\1", text)
+
+
+ def remove_stopwords(tokens: list[str]) -> list[str]:
+     """Remove Arabic stopwords from tokens."""
+     return [word for word in tokens if word not in arabic_stopwords]
+
+
+ def stem_tokens(tokens: list[str]) -> list[str]:
+     """Apply ISRI stemming to tokens."""
+     return [stemmer.stem(token) for token in tokens]
+
+
+ def preprocess_for_classification(text: str) -> str:
+     """Preprocess text for classification: normalize, clean, tokenize, stem."""
+     text = text.strip().lower()
+     text = normalize_arabic(text)
+     text = remove_diacritics(text)
+     text = remove_punctuation(text)
+     text = reduce_repeated_characters(text)
+     text = whitespace_pattern.sub(" ", text).strip()
+     text = re.sub(r"\d+", "", text)
+     tokens = text.split()
+     tokens = remove_stopwords(tokens)
+     tokens = stem_tokens(tokens)
+     return " ".join(tokens)
+
+
+ def preprocess_for_summarization(text: str) -> str:
+     """Light preprocessing for summarization: remove diacritics and numbers."""
+     if not isinstance(text, str):
+         return ""
+     text = text.strip().lower()
+     text = remove_diacritics(text)
+     text = whitespace_pattern.sub(" ", text).strip()
+     return re.sub(r"\d+", "", text)
+
+
+ class ArabicPreprocessor:
+     """Arabic text preprocessor with analysis capabilities."""
+
+     def __init__(self):
+         self.arabic_stopwords = arabic_stopwords
+         self.stemmer = stemmer
+         self.char_map = char_map
+
+     def preprocess_for_classification(self, text: str) -> str:
+         """Preprocess text for classification."""
+         return preprocess_for_classification(text)
+
+     def preprocess_for_summarization(self, text: str) -> str:
+         """Preprocess text for summarization."""
+         return preprocess_for_summarization(text)
+
+     def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
+         """Get detailed preprocessing steps for analysis."""
+         steps = {
+             "original": text,
+             "stripped_lowered": text.strip().lower(),
+         }
+
+         current = text.strip().lower()
+
+         if task_type == "classification":
+             steps["normalized"] = normalize_arabic(current)
+             current = normalize_arabic(current)
+
+             steps["diacritics_removed"] = remove_diacritics(current)
+             current = remove_diacritics(current)
+
+             steps["punctuation_removed"] = remove_punctuation(current)
+             current = remove_punctuation(current)
+
+             steps["repeated_chars_reduced"] = reduce_repeated_characters(current)
+             current = reduce_repeated_characters(current)
+
+             steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
+             current = whitespace_pattern.sub(" ", current).strip()
+
+             steps["numbers_removed"] = re.sub(r"\d+", "", current)
+             current = re.sub(r"\d+", "", current)
+
+             tokens = current.split()
+             steps["tokenized"] = tokens
+
+             tokens_no_stop = remove_stopwords(tokens)
+             steps["stopwords_removed"] = tokens_no_stop
+
+             stemmed_tokens = stem_tokens(tokens_no_stop)
+             steps["stemmed"] = stemmed_tokens
+
+             steps["final"] = " ".join(stemmed_tokens)
+
+         elif task_type == "summarization":
+             steps["diacritics_removed"] = remove_diacritics(current)
+             current = remove_diacritics(current)
+
+             steps["whitespace_normalized"] = whitespace_pattern.sub(" ", current).strip()
+             current = whitespace_pattern.sub(" ", current).strip()
+
+             steps["numbers_removed"] = re.sub(r"\d+", "", current)
+             steps["final"] = re.sub(r"\d+", "", current)
+
+         return steps
+
+     def analyze_text(self, text: str) -> dict:
+         """Analyze text characteristics and statistics."""
+         original_sentences = re.split(r"[.!؟\n]+", text)
+         original_sentences = [s.strip() for s in original_sentences if s.strip()]
+
+         tokens = text.split()
+         arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
+
+         return {
+             "character_count": len(text),
+             "word_count": len(tokens),
+             "sentence_count": len(original_sentences),
+             "arabic_character_count": arabic_chars,
+             "arabic_character_ratio": arabic_chars / len(text) if len(text) > 0 else 0,
+             "average_word_length": sum(len(word) for word in tokens) / len(tokens) if tokens else 0,
+             "average_sentence_length": len(tokens) / len(original_sentences) if original_sentences else 0,
+             "has_diacritics": bool(re.search(r'[\u064B-\u0652]', text)),
+             "punctuation_count": len(re.findall(r'[^\w\s]', text))
+         }
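The classification pipeline normalizes alef/ya/ta-marbuta variants, strips diacritics and punctuation, removes NLTK Arabic stopwords, and applies ISRI stemming. A short, self-contained illustration of just the normalization and diacritic-removal steps, re-creating the same patterns so it runs without the NLTK stopword download; the sample phrase is made up:

```python
# Re-creates the normalization + diacritic-removal steps from preprocessor.py
# so their effect can be seen in isolation; the sample text is illustrative.
import re

char_map = str.maketrans(
    {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
)
diacritics_pattern = re.compile(r"[\u064B-\u0652]")

sample = "الأَطْفَالُ يلعبونَ في الحديقةِ"
normalized = sample.translate(char_map)                  # unify alef/ya/ta-marbuta variants
no_diacritics = diacritics_pattern.sub("", normalized)   # strip short vowels and tanween

print(no_diacritics)  # "الاطفال يلعبون في الحديقه"
```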
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi
+ uvicorn
+ scikit-learn
+ nltk
+ joblib
+ numpy
summarizer.py ADDED
@@ -0,0 +1,78 @@
+ import re
+ import numpy as np
+ import joblib
+ from typing import Dict, Any
+ from preprocessor import preprocess_for_summarization
+
+
+ class ArabicSummarizer:
+     """Arabic text summarizer using TF-IDF scoring."""
+
+     def __init__(self, vectorizer_path: str = "tfidf_vectorizer_text_summarization.joblib"):
+         self.vectorizer = joblib.load(vectorizer_path)
+
+     def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
+         """Summarize text by selecting top-scored sentences."""
+         cleaned_text = preprocess_for_summarization(text)
+
+         sentences = re.split(r"[.!؟\n]+", cleaned_text)
+         sentences = [s.strip() for s in sentences if s.strip()]
+
+         if len(sentences) <= num_sentences:
+             return {
+                 "summary": text,
+                 "original_sentence_count": int(len(sentences)),
+                 "summary_sentence_count": int(len(sentences)),
+                 "sentences": sentences,
+                 "selected_indices": list(range(len(sentences))),
+                 "sentence_scores": None
+             }
+
+         tfidf_matrix = self.vectorizer.transform(sentences)
+         sentence_scores = tfidf_matrix.sum(axis=1).A1
+
+         top_indices = np.argsort(sentence_scores)[-num_sentences:][::-1]
+         top_sentences = [sentences[i] for i in sorted(top_indices)]
+
+         return {
+             "summary": " ".join(top_sentences),
+             "original_sentence_count": int(len(sentences)),
+             "summary_sentence_count": int(len(top_sentences)),
+             "sentences": sentences,
+             "selected_indices": [int(i) for i in sorted(top_indices)],
+             "sentence_scores": sentence_scores.tolist(),
+             "top_sentence_scores": [float(sentence_scores[i]) for i in sorted(top_indices)]
+         }
+
+     def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
+         """Get detailed analysis of all sentences with scores and rankings."""
+         cleaned_text = preprocess_for_summarization(text)
+
+         sentences = re.split(r"[.!؟\n]+", cleaned_text)
+         sentences = [s.strip() for s in sentences if s.strip()]
+
+         if not sentences:
+             return {"error": "No sentences found in text"}
+
+         tfidf_matrix = self.vectorizer.transform(sentences)
+         sentence_scores = tfidf_matrix.sum(axis=1).A1
+
+         sentence_analysis = []
+         for i, (sentence, score) in enumerate(zip(sentences, sentence_scores)):
+             sentence_analysis.append({
+                 "index": int(i),
+                 "sentence": sentence,
+                 "score": float(score),
+                 "rank": int(np.argsort(sentence_scores)[::-1].tolist().index(i) + 1)
+             })
+
+         return {
+             "sentences": sentence_analysis,
+             "total_sentences": int(len(sentences)),
+             "score_statistics": {
+                 "mean": float(np.mean(sentence_scores)),
+                 "std": float(np.std(sentence_scores)),
+                 "min": float(np.min(sentence_scores)),
+                 "max": float(np.max(sentence_scores))
+             }
+         }
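The summarizer scores each sentence by the sum of its TF-IDF weights under the pre-fitted vectorizer and keeps the top `num_sentences` in their original order. A minimal sketch of the same scoring idea, fitting a `TfidfVectorizer` in-process instead of loading the repo's joblib artifact; the English sentences are placeholders:

```python
# Same top-k TF-IDF sentence scoring as ArabicSummarizer.summarize, but with an
# in-process vectorizer rather than the pre-fitted joblib file; sentences are made up.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = [
    "the central bank raised interest rates again",
    "weather stayed mild over the weekend",
    "analysts expect inflation to slow after the rate decision",
]
num_sentences = 2

vec = TfidfVectorizer().fit(sentences)
scores = vec.transform(sentences).sum(axis=1).A1        # one score per sentence

top = np.argsort(scores)[-num_sentences:]                # highest-scoring sentences
summary = " ".join(sentences[i] for i in sorted(top))    # keep original order
print(summary)
```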
svm_classifier.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c094fd8389cc0b8cb32b94c2ef4b204fd699d70cb852a311018f435a0b71cbf5
+ size 10423139
tfidf_vectorizer_classifier.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5e15bed99d5c6817645995e69cbad7438e337854afa738086458a610d47e4d0
+ size 753983
tfidf_vectorizer_text_summarization.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:898e94b15ac59c19cf97d56ca7023c85079617cc3258a031fa107a639dac580b
+ size 6389421