moabos commited on
Commit
123e49c
·
1 Parent(s): 29dedef

feat: integrate 2 modern models for classification and update routes and examples

Browse files
Dockerfile CHANGED
@@ -10,6 +10,5 @@ COPY --chown=user ./requirements.txt requirements.txt
10
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
  RUN python -m nltk.downloader stopwords
12
 
13
-
14
  COPY --chown=user . /app
15
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
10
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
  RUN python -m nltk.downloader stopwords
12
 
 
13
  COPY --chown=user . /app
14
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: green
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
- short_description: 'Arabic text summarization and topic classification app. '
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
+ short_description: 'Arabic text summarization and topic classification app.'
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -3,9 +3,9 @@ from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
  from enum import Enum
5
 
6
- from classifier import ArabicClassifier
7
  from summarizer import ArabicSummarizer
8
  from preprocessor import ArabicPreprocessor
 
9
  from examples import REQUEST_EXAMPLES, RESPONSE_EXAMPLES
10
 
11
 
@@ -14,56 +14,55 @@ class TaskType(str, Enum):
14
  SUMMARIZATION = "summarization"
15
 
16
 
 
 
 
 
 
 
17
  app = FastAPI(
18
  title="Arabic Text Analysis API",
19
- description="API for Arabic text classification, summarization, and preprocessing",
20
- version="1.0.0"
21
  )
22
 
23
- classifier = ArabicClassifier("svm_classifier.joblib", "tfidf_vectorizer_classifier.joblib")
24
- summarizer = ArabicSummarizer("tfidf_vectorizer_text_summarization.joblib")
25
  preprocessor = ArabicPreprocessor()
26
 
27
 
28
  class TextInput(BaseModel):
29
  text: str
30
-
31
- model_config = {
32
- "json_schema_extra": {
33
- "example": REQUEST_EXAMPLES["text_input"]
34
- }
35
- }
36
 
37
 
38
  class TextInputWithSentences(BaseModel):
39
  text: str
40
  num_sentences: Optional[int] = 3
41
-
 
42
  model_config = {
43
- "json_schema_extra": {
44
- "example": REQUEST_EXAMPLES["text_input_with_sentences"]
45
- }
46
  }
47
 
48
 
49
  class BatchTextInput(BaseModel):
50
  texts: List[str]
51
-
 
52
  model_config = {
53
- "json_schema_extra": {
54
- "example": REQUEST_EXAMPLES["batch_text_input"]
55
- }
56
  }
57
 
58
 
59
  class PreprocessingInput(BaseModel):
60
  text: str
61
  task_type: TaskType = TaskType.CLASSIFICATION
62
-
63
  model_config = {
64
- "json_schema_extra": {
65
- "example": REQUEST_EXAMPLES["preprocessing_input"]
66
- }
67
  }
68
 
69
 
@@ -86,8 +85,8 @@ class ClassificationResponse(BaseModel):
86
  "description": "Numerical index of the predicted class (0=culture, 1=economy, 2=international, 3=local, 4=religion, 5=sports)"
87
  }
88
  }
89
- }
90
- }
91
  }
92
 
93
 
@@ -101,9 +100,7 @@ class SummarizationResponse(BaseModel):
101
  top_sentence_scores: Optional[List[float]]
102
 
103
  model_config = {
104
- "json_schema_extra": {
105
- "example": RESPONSE_EXAMPLES["summarization"]
106
- }
107
  }
108
 
109
 
@@ -112,9 +109,7 @@ class TextAnalysisResponse(BaseModel):
112
  analysis: Dict[str, Any]
113
 
114
  model_config = {
115
- "json_schema_extra": {
116
- "example": RESPONSE_EXAMPLES["text_analysis"]
117
- }
118
  }
119
 
120
 
@@ -125,9 +120,7 @@ class BatchClassificationResponse(BaseModel):
125
 
126
  model_config = {
127
  "protected_namespaces": (),
128
- "json_schema_extra": {
129
- "example": RESPONSE_EXAMPLES["batch_classification"]
130
- }
131
  }
132
 
133
 
@@ -137,9 +130,7 @@ class SentenceAnalysisResponse(BaseModel):
137
  score_statistics: Dict[str, float]
138
 
139
  model_config = {
140
- "json_schema_extra": {
141
- "example": RESPONSE_EXAMPLES["sentence_analysis"]
142
- }
143
  }
144
 
145
 
@@ -150,9 +141,7 @@ class CompleteAnalysisResponse(BaseModel):
150
  summarization: SummarizationResponse
151
 
152
  model_config = {
153
- "json_schema_extra": {
154
- "example": RESPONSE_EXAMPLES["complete_analysis"]
155
- }
156
  }
157
 
158
 
@@ -161,9 +150,7 @@ class PreprocessingResponse(BaseModel):
161
  preprocessing_steps: Dict[str, Any]
162
 
163
  model_config = {
164
- "json_schema_extra": {
165
- "example": RESPONSE_EXAMPLES["preprocessing"]
166
- }
167
  }
168
 
169
 
@@ -171,11 +158,25 @@ class ModelInfoResponse(BaseModel):
171
  classifier: Dict[str, Any]
172
  summarizer: Dict[str, Any]
173
 
174
- model_config = {
175
- "json_schema_extra": {
176
- "example": RESPONSE_EXAMPLES["model_info"]
177
- }
178
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
 
181
  @app.get("/")
@@ -186,7 +187,7 @@ def read_root() -> Dict[str, Any]:
186
  "documentation": {
187
  "interactive_docs": "/docs",
188
  "redoc": "/redoc",
189
- "openapi_schema": "/openapi.json"
190
  },
191
  "endpoints": {
192
  "classify": "POST /classify - Classify Arabic text",
@@ -196,8 +197,9 @@ def read_root() -> Dict[str, Any]:
196
  "preprocess": "POST /preprocess - Preprocess text with detailed steps",
197
  "text_analysis": "POST /text-analysis - Analyze text characteristics",
198
  "sentence_analysis": "POST /sentence-analysis - Detailed sentence analysis",
199
- "model_info": "GET /model-info - Get model information"
200
- }
 
201
  }
202
 
203
 
@@ -205,7 +207,8 @@ def read_root() -> Dict[str, Any]:
205
  def classify_text(data: TextInput) -> ClassificationResponse:
206
  """Classify Arabic text with probability distribution and metadata."""
207
  try:
208
- result = classifier.predict(data.text)
 
209
  return result
210
  except Exception as e:
211
  raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
@@ -215,14 +218,19 @@ def classify_text(data: TextInput) -> ClassificationResponse:
215
  def classify_texts(data: BatchTextInput) -> BatchClassificationResponse:
216
  """Classify multiple Arabic texts in batch."""
217
  try:
218
- results = classifier.predict_batch(data.texts)
 
 
 
219
  return {
220
  "results": results,
221
  "total_texts": len(data.texts),
222
- "model_used": classifier.model_name
223
  }
224
  except Exception as e:
225
- raise HTTPException(status_code=500, detail=f"Batch classification failed: {str(e)}")
 
 
226
 
227
 
228
  @app.post("/summarize", response_model=SummarizationResponse)
@@ -242,25 +250,31 @@ def analyze_sentences(data: TextInput) -> SentenceAnalysisResponse:
242
  result = summarizer.get_sentence_analysis(data.text)
243
  return result
244
  except Exception as e:
245
- raise HTTPException(status_code=500, detail=f"Sentence analysis failed: {str(e)}")
 
 
246
 
247
 
248
  @app.post("/analyze", response_model=CompleteAnalysisResponse)
249
  def analyze_text_complete(data: TextInputWithSentences) -> CompleteAnalysisResponse:
250
  """Complete analysis: classification, summarization, and text statistics."""
251
  try:
252
- classification_result = classifier.predict(data.text)
 
 
253
  summarization_result = summarizer.summarize(data.text, data.num_sentences)
254
  text_stats = preprocessor.analyze_text(data.text)
255
-
256
  return {
257
  "original_text": data.text,
258
  "text_analysis": text_stats,
259
  "classification": classification_result,
260
- "summarization": summarization_result
261
  }
262
  except Exception as e:
263
- raise HTTPException(status_code=500, detail=f"Complete analysis failed: {str(e)}")
 
 
264
 
265
 
266
  @app.post("/preprocess", response_model=PreprocessingResponse)
@@ -268,10 +282,7 @@ def preprocess_text(data: PreprocessingInput) -> PreprocessingResponse:
268
  """Preprocess text with step-by-step breakdown."""
269
  try:
270
  steps = preprocessor.get_preprocessing_steps(data.text, data.task_type.value)
271
- return {
272
- "task_type": data.task_type.value,
273
- "preprocessing_steps": steps
274
- }
275
  except Exception as e:
276
  raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
277
 
@@ -281,25 +292,60 @@ def analyze_text_characteristics(data: TextInput) -> TextAnalysisResponse:
281
  """Analyze text characteristics and statistics."""
282
  try:
283
  analysis = preprocessor.analyze_text(data.text)
284
- return {
285
- "text": data.text,
286
- "analysis": analysis
287
- }
288
  except Exception as e:
289
  raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
290
 
291
 
292
  @app.get("/model-info", response_model=ModelInfoResponse)
293
- def get_model_info() -> ModelInfoResponse:
294
- """Get information about loaded models."""
295
  try:
296
- classifier_info = classifier.get_model_info()
 
297
  return {
298
  "classifier": classifier_info,
299
  "summarizer": {
300
- "vectorizer_loaded": hasattr(summarizer, 'vectorizer'),
301
- "model_type": "TF-IDF based summarization"
302
- }
303
  }
304
  except Exception as e:
305
- raise HTTPException(status_code=500, detail=f"Failed to get model info: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from pydantic import BaseModel
4
  from enum import Enum
5
 
 
6
  from summarizer import ArabicSummarizer
7
  from preprocessor import ArabicPreprocessor
8
+ from model_manager import ModelManager
9
  from examples import REQUEST_EXAMPLES, RESPONSE_EXAMPLES
10
 
11
 
 
14
  SUMMARIZATION = "summarization"
15
 
16
 
17
+ class ModelType(str, Enum):
18
+ TRADITIONAL_SVM = "traditional_svm"
19
+ MODERN_BERT = "modern_bert"
20
+ MODERN_LSTM = "modern_lstm"
21
+
22
+
23
  app = FastAPI(
24
  title="Arabic Text Analysis API",
25
+ description="API for Arabic text classification, summarization, and preprocessing with multiple model support",
26
+ version="1.0.0",
27
  )
28
 
29
+ model_manager = ModelManager(default_model="traditional_svm")
30
+ summarizer = ArabicSummarizer("traditional_tfidf_vectorizer_summarization.joblib")
31
  preprocessor = ArabicPreprocessor()
32
 
33
 
34
  class TextInput(BaseModel):
35
  text: str
36
+ model: Optional[ModelType] = None
37
+
38
+ model_config = {"json_schema_extra": {"example": REQUEST_EXAMPLES["text_input"]}}
 
 
 
39
 
40
 
41
  class TextInputWithSentences(BaseModel):
42
  text: str
43
  num_sentences: Optional[int] = 3
44
+ model: Optional[ModelType] = None
45
+
46
  model_config = {
47
+ "json_schema_extra": {"example": REQUEST_EXAMPLES["text_input_with_sentences"]}
 
 
48
  }
49
 
50
 
51
  class BatchTextInput(BaseModel):
52
  texts: List[str]
53
+ model: Optional[ModelType] = None
54
+
55
  model_config = {
56
+ "json_schema_extra": {"example": REQUEST_EXAMPLES["batch_text_input"]}
 
 
57
  }
58
 
59
 
60
  class PreprocessingInput(BaseModel):
61
  text: str
62
  task_type: TaskType = TaskType.CLASSIFICATION
63
+
64
  model_config = {
65
+ "json_schema_extra": {"example": REQUEST_EXAMPLES["preprocessing_input"]}
 
 
66
  }
67
 
68
 
 
85
  "description": "Numerical index of the predicted class (0=culture, 1=economy, 2=international, 3=local, 4=religion, 5=sports)"
86
  }
87
  }
88
+ },
89
+ },
90
  }
91
 
92
 
 
100
  top_sentence_scores: Optional[List[float]]
101
 
102
  model_config = {
103
+ "json_schema_extra": {"example": RESPONSE_EXAMPLES["summarization"]}
 
 
104
  }
105
 
106
 
 
109
  analysis: Dict[str, Any]
110
 
111
  model_config = {
112
+ "json_schema_extra": {"example": RESPONSE_EXAMPLES["text_analysis"]}
 
 
113
  }
114
 
115
 
 
120
 
121
  model_config = {
122
  "protected_namespaces": (),
123
+ "json_schema_extra": {"example": RESPONSE_EXAMPLES["batch_classification"]},
 
 
124
  }
125
 
126
 
 
130
  score_statistics: Dict[str, float]
131
 
132
  model_config = {
133
+ "json_schema_extra": {"example": RESPONSE_EXAMPLES["sentence_analysis"]}
 
 
134
  }
135
 
136
 
 
141
  summarization: SummarizationResponse
142
 
143
  model_config = {
144
+ "json_schema_extra": {"example": RESPONSE_EXAMPLES["complete_analysis"]}
 
 
145
  }
146
 
147
 
 
150
  preprocessing_steps: Dict[str, Any]
151
 
152
  model_config = {
153
+ "json_schema_extra": {"example": RESPONSE_EXAMPLES["preprocessing"]}
 
 
154
  }
155
 
156
 
 
158
  classifier: Dict[str, Any]
159
  summarizer: Dict[str, Any]
160
 
161
+ model_config = {"json_schema_extra": {"example": RESPONSE_EXAMPLES["model_info"]}}
162
+
163
+
164
+ class ModelSwitchInput(BaseModel):
165
+ model: ModelType
166
+
167
+ model_config = {"json_schema_extra": {"example": {"model": "modern_bert"}}}
168
+
169
+
170
+ class ModelSwitchResponse(BaseModel):
171
+ success: bool
172
+ message: str
173
+ previous_model: Optional[str] = None
174
+ current_model: str
175
+
176
+
177
+ class AvailableModelsResponse(BaseModel):
178
+ models: Dict[str, Any]
179
+ current_model: str
180
 
181
 
182
  @app.get("/")
 
187
  "documentation": {
188
  "interactive_docs": "/docs",
189
  "redoc": "/redoc",
190
+ "openapi_schema": "/openapi.json",
191
  },
192
  "endpoints": {
193
  "classify": "POST /classify - Classify Arabic text",
 
197
  "preprocess": "POST /preprocess - Preprocess text with detailed steps",
198
  "text_analysis": "POST /text-analysis - Analyze text characteristics",
199
  "sentence_analysis": "POST /sentence-analysis - Detailed sentence analysis",
200
+ "model_info": "GET /model-info - Get model information",
201
+ "available_models": "GET /models - Get all available models",
202
+ },
203
  }
204
 
205
 
 
207
  def classify_text(data: TextInput) -> ClassificationResponse:
208
  """Classify Arabic text with probability distribution and metadata."""
209
  try:
210
+ model_name = data.model.value if data.model else None
211
+ result = model_manager.predict(data.text, model_name)
212
  return result
213
  except Exception as e:
214
  raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
 
218
  def classify_texts(data: BatchTextInput) -> BatchClassificationResponse:
219
  """Classify multiple Arabic texts in batch."""
220
  try:
221
+ model_name = data.model.value if data.model else None
222
+ results = model_manager.predict_batch(data.texts, model_name)
223
+ used_model = model_name or model_manager.default_model
224
+
225
  return {
226
  "results": results,
227
  "total_texts": len(data.texts),
228
+ "model_used": used_model,
229
  }
230
  except Exception as e:
231
+ raise HTTPException(
232
+ status_code=500, detail=f"Batch classification failed: {str(e)}"
233
+ )
234
 
235
 
236
  @app.post("/summarize", response_model=SummarizationResponse)
 
250
  result = summarizer.get_sentence_analysis(data.text)
251
  return result
252
  except Exception as e:
253
+ raise HTTPException(
254
+ status_code=500, detail=f"Sentence analysis failed: {str(e)}"
255
+ )
256
 
257
 
258
  @app.post("/analyze", response_model=CompleteAnalysisResponse)
259
  def analyze_text_complete(data: TextInputWithSentences) -> CompleteAnalysisResponse:
260
  """Complete analysis: classification, summarization, and text statistics."""
261
  try:
262
+ model_name = data.model.value if data.model else None
263
+
264
+ classification_result = model_manager.predict(data.text, model_name)
265
  summarization_result = summarizer.summarize(data.text, data.num_sentences)
266
  text_stats = preprocessor.analyze_text(data.text)
267
+
268
  return {
269
  "original_text": data.text,
270
  "text_analysis": text_stats,
271
  "classification": classification_result,
272
+ "summarization": summarization_result,
273
  }
274
  except Exception as e:
275
+ raise HTTPException(
276
+ status_code=500, detail=f"Complete analysis failed: {str(e)}"
277
+ )
278
 
279
 
280
  @app.post("/preprocess", response_model=PreprocessingResponse)
 
282
  """Preprocess text with step-by-step breakdown."""
283
  try:
284
  steps = preprocessor.get_preprocessing_steps(data.text, data.task_type.value)
285
+ return {"task_type": data.task_type.value, "preprocessing_steps": steps}
 
 
 
286
  except Exception as e:
287
  raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
288
 
 
292
  """Analyze text characteristics and statistics."""
293
  try:
294
  analysis = preprocessor.analyze_text(data.text)
295
+ return {"text": data.text, "analysis": analysis}
 
 
 
296
  except Exception as e:
297
  raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
298
 
299
 
300
  @app.get("/model-info", response_model=ModelInfoResponse)
301
+ def get_model_info(model: Optional[ModelType] = None) -> ModelInfoResponse:
302
+ """Get information about a specific model or the default model."""
303
  try:
304
+ model_name = model.value if model else None
305
+ classifier_info = model_manager.get_model_info(model_name)
306
  return {
307
  "classifier": classifier_info,
308
  "summarizer": {
309
+ "vectorizer_loaded": hasattr(summarizer, "vectorizer"),
310
+ "model_type": "TF-IDF based summarization",
311
+ },
312
  }
313
  except Exception as e:
314
+ raise HTTPException(
315
+ status_code=500, detail=f"Failed to get model info: {str(e)}"
316
+ )
317
+
318
+
319
+ @app.get("/models", response_model=AvailableModelsResponse)
320
+ def get_available_models() -> AvailableModelsResponse:
321
+ """Get all available classification models."""
322
+ try:
323
+ models = model_manager.get_available_models()
324
+ return {"models": models, "current_model": model_manager.default_model}
325
+ except Exception as e:
326
+ raise HTTPException(
327
+ status_code=500, detail=f"Failed to get available models: {str(e)}"
328
+ )
329
+
330
+
331
+ @app.get("/models/cache")
332
+ def get_cache_status() -> Dict[str, Any]:
333
+ """Get information about cached models."""
334
+ try:
335
+ return model_manager.get_cache_status()
336
+ except Exception as e:
337
+ raise HTTPException(
338
+ status_code=500, detail=f"Failed to get cache status: {str(e)}"
339
+ )
340
+
341
+
342
+ @app.post("/models/cache/clear")
343
+ def clear_model_cache(model: Optional[ModelType] = None) -> Dict[str, Any]:
344
+ """Clear model cache for a specific model or all models."""
345
+ try:
346
+ model_name = model.value if model else None
347
+ return model_manager.clear_cache(model_name)
348
+ except Exception as e:
349
+ raise HTTPException(
350
+ status_code=500, detail=f"Failed to clear cache: {str(e)}"
351
+ )
examples.py CHANGED
@@ -1,10 +1,19 @@
 
1
  """API request and response examples for documentation."""
2
 
3
  EXAMPLE_TEXT = "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم. ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات. لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى."
4
 
 
 
 
 
5
  REQUEST_EXAMPLES = {
6
  "text_input": {"text": EXAMPLE_TEXT},
 
 
 
7
  "text_input_with_sentences": {"text": EXAMPLE_TEXT, "num_sentences": 2},
 
8
  "batch_text_input": {
9
  "texts": [
10
  EXAMPLE_TEXT,
@@ -12,30 +21,38 @@ REQUEST_EXAMPLES = {
12
  "المطاعم في المدينة تقدم أطباق شهية ومتنوعة.",
13
  ]
14
  },
 
 
 
 
 
 
 
 
15
  "preprocessing_input": {"text": EXAMPLE_TEXT, "task_type": "classification"},
16
  }
17
 
18
  RESPONSE_EXAMPLES = {
19
  "classification": {
20
- "prediction": "culture",
21
- "prediction_index": 0,
22
- "confidence": 0.902,
23
  "probability_distribution": {
24
- "culture": 0.902,
25
- "economy": 0.001,
26
- "international": 0.0,
27
- "local": 0.061,
28
- "religion": 0.0,
29
- "sports": 0.036,
30
  },
31
- "cleaned_text": "يكن سعر فاكه خضرو موسم انبات اقل غير موسم",
32
- "model_used": "svm_classifier",
33
  "prediction_metadata": {
34
- "max_probability": 0.902,
35
- "min_probability": 0.0,
36
- "entropy": 0.393,
37
- "num_classes": 6,
38
- },
39
  },
40
  "summarization": {
41
  "summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
@@ -45,116 +62,163 @@ RESPONSE_EXAMPLES = {
45
  "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
46
  "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
47
  "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
48
- "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
49
  ],
50
  "selected_indices": [1, 2],
51
- "sentence_scores": [2.968, 3.224, 3.234, 2.642],
52
- "top_sentence_scores": [3.224, 3.234],
53
  },
54
  "text_analysis": {
55
  "text": EXAMPLE_TEXT,
56
  "analysis": {
57
- "character_count": 282,
58
- "word_count": 46,
59
  "sentence_count": 4,
60
- "arabic_character_count": 252,
61
- "arabic_character_ratio": 0.8936,
62
- "average_word_length": 5.48,
63
- "average_sentence_length": 11.5,
64
  "has_diacritics": False,
65
- "punctuation_count": 3,
66
- },
67
  },
68
  "batch_classification": {
69
  "results": [
70
  {
71
- "prediction": "culture",
72
- "prediction_index": 0,
73
- "confidence": 0.902,
74
  "probability_distribution": {
75
- "culture": 0.902,
76
- "economy": 0.001,
77
- "international": 0.0,
78
- "local": 0.061,
79
- "religion": 0.0,
80
- "sports": 0.036,
81
  },
82
- "cleaned_text": "يكن سعر فاكه خضرو موسم انبات اقل غير موسم",
83
- "model_used": "svm_classifier",
84
  "prediction_metadata": {
85
- "max_probability": 0.902,
86
- "min_probability": 0.0,
87
- "entropy": 0.393,
88
- "num_classes": 6,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  },
 
 
 
 
 
 
 
 
90
  }
91
  ],
92
  "total_texts": 3,
93
- "model_used": "svm_classifier",
94
  },
95
  "sentence_analysis": {
96
  "sentences": [
97
  {
98
  "index": 0,
99
  "sentence": "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
100
- "score": 2.968,
101
- "rank": 3,
102
  },
103
  {
104
  "index": 1,
105
  "sentence": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
106
- "score": 3.224,
107
- "rank": 2,
108
  },
109
  {
110
  "index": 2,
111
  "sentence": "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
112
- "score": 3.234,
113
- "rank": 1,
114
  },
115
  {
116
  "index": 3,
117
  "sentence": "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
118
- "score": 2.642,
119
- "rank": 4,
120
- },
121
  ],
122
  "total_sentences": 4,
123
- "score_statistics": {"mean": 3.017, "std": 0.254, "min": 2.642, "max": 3.234},
 
 
 
 
 
124
  },
125
  "complete_analysis": {
126
  "original_text": EXAMPLE_TEXT,
127
  "text_analysis": {
128
- "character_count": 282,
129
- "word_count": 46,
130
  "sentence_count": 4,
131
- "arabic_character_count": 252,
132
- "arabic_character_ratio": 0.8936,
133
- "average_word_length": 5.48,
134
- "average_sentence_length": 11.5,
135
  "has_diacritics": False,
136
- "punctuation_count": 3,
137
  },
138
  "classification": {
139
- "prediction": "culture",
140
- "prediction_index": 0,
141
- "confidence": 0.902,
142
  "probability_distribution": {
143
- "culture": 0.902,
144
- "economy": 0.001,
145
- "international": 0.0,
146
- "local": 0.061,
147
- "religion": 0.0,
148
- "sports": 0.036,
149
  },
150
- "cleaned_text": "يكن سعر فاكه خضرو موسم انبات اقل غير موسم",
151
- "model_used": "svm_classifier",
152
  "prediction_metadata": {
153
- "max_probability": 0.902,
154
- "min_probability": 0.0,
155
- "entropy": 0.393,
156
- "num_classes": 6,
157
- },
158
  },
159
  "summarization": {
160
  "summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
@@ -164,51 +228,86 @@ RESPONSE_EXAMPLES = {
164
  "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
165
  "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
166
  "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
167
- "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
168
  ],
169
  "selected_indices": [1, 2],
170
- "sentence_scores": [2.968, 3.224, 3.234, 2.642],
171
- "top_sentence_scores": [3.224, 3.234],
172
- },
173
  },
174
  "preprocessing": {
175
  "task_type": "classification",
176
  "preprocessing_steps": {
177
- "original_text": EXAMPLE_TEXT,
178
- "step_1_remove_diacritics": "يكون سعر الفاكهة والخضراوات في موسم انباتها اقل من غيره من المواسم",
179
- "step_2_remove_punctuation": "يكون سعر الفاكهة والخضراوات في موسم انباتها اقل من غيره من المواسم",
180
- "step_3_normalize_text": "يكون سعر الفاكهة والخضراوات في موسم انباتها اقل من غيره من المواسم",
181
- "step_4_remove_stopwords": "سعر فاكهة خضراوات موسم انباتها اقل غيره مواسم",
182
- "step_5_stem_words": "سعر فاكه خضرو موسم انبات اقل غير موسم",
183
- "final_result": "سعر فاكه خضرو موسم انبات اقل غير موسم",
 
184
  "preprocessing_summary": {
185
- "original_length": 282,
186
- "final_length": 47,
187
- "reduction_percentage": 83.3,
188
- "words_removed": 39,
189
- "words_remaining": 7,
190
- },
191
- },
192
  },
193
  "model_info": {
194
  "classifier": {
195
- "model_name": "svm_classifier",
196
- "vectorizer_loaded": True,
197
- "model_loaded": True,
198
- "classes": [
199
- "culture",
200
- "economy",
201
- "international",
202
- "local",
203
- "religion",
204
- "sports",
205
- ],
206
  "num_classes": 6,
207
- "model_type": "SVM with TF-IDF vectorization",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  },
209
  "summarizer": {
210
  "vectorizer_loaded": True,
211
- "model_type": "TF-IDF based summarization",
212
- },
213
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  }
 
1
+
2
  """API request and response examples for documentation."""
3
 
4
  EXAMPLE_TEXT = "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم. ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات. لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى."
5
 
6
+ # Note: For classification endpoints, the 'model' field is optional.
7
+ # Available models: traditional_svm, modern_bert, modern_lstm
8
+ # If not specified, defaults to traditional_svm
9
+
10
  REQUEST_EXAMPLES = {
11
  "text_input": {"text": EXAMPLE_TEXT},
12
+ "text_input_with_model": {"text": EXAMPLE_TEXT, "model": "traditional_svm"},
13
+ "text_input_modern_bert": {"text": EXAMPLE_TEXT, "model": "modern_bert"},
14
+ "text_input_modern_lstm": {"text": EXAMPLE_TEXT, "model": "modern_lstm"},
15
  "text_input_with_sentences": {"text": EXAMPLE_TEXT, "num_sentences": 2},
16
+ "text_input_with_sentences_and_model": {"text": EXAMPLE_TEXT, "num_sentences": 2, "model": "modern_bert"},
17
  "batch_text_input": {
18
  "texts": [
19
  EXAMPLE_TEXT,
 
21
  "المطاعم في المدينة تقدم أطباق شهية ومتنوعة.",
22
  ]
23
  },
24
+ "batch_text_input_with_model": {
25
+ "texts": [
26
+ EXAMPLE_TEXT,
27
+ "هذا نص تجريبي آخر للتصنيف باللغة العربية.",
28
+ "المطاعم في المدينة تقدم أطباق شهية ومتنوعة.",
29
+ ],
30
+ "model": "modern_lstm"
31
+ },
32
  "preprocessing_input": {"text": EXAMPLE_TEXT, "task_type": "classification"},
33
  }
34
 
35
  RESPONSE_EXAMPLES = {
36
  "classification": {
37
+ "prediction": "economy",
38
+ "prediction_index": 1,
39
+ "confidence": 0.851430067618738,
40
  "probability_distribution": {
41
+ "culture": 0.03446028829161909,
42
+ "economy": 0.851430067618738,
43
+ "international": 0.005894582125851457,
44
+ "local": 0.07395284130576442,
45
+ "religion": 0.006103062287269104,
46
+ "sports": 0.028159158370758375
47
  },
48
+ "cleaned_text": "يكون سعر فكه خضراو وسم بات اقل غير وسم تلج محل خضرو الي عرض فكه سعر نسب وسم بسب وفر نتج قصر امر علي سعر اقل سيك طعم اشه ولذ نول وسم فصل خرف وفر تفح تين نجر كمثر",
49
+ "model_used": "traditional_svm_classifier",
50
  "prediction_metadata": {
51
+ "max_probability": 0.851430067618738,
52
+ "min_probability": 0.005894582125851457,
53
+ "entropy": 0.607505444397693,
54
+ "num_classes": 6
55
+ }
56
  },
57
  "summarization": {
58
  "summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
 
62
  "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
63
  "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
64
  "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
65
+ "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى"
66
  ],
67
  "selected_indices": [1, 2],
68
+ "sentence_scores": [2.9675719816913095, 3.2243029195650847, 3.2339935441416645, 2.6416207226871506],
69
+ "top_sentence_scores": [3.2243029195650847, 3.2339935441416645]
70
  },
71
  "text_analysis": {
72
  "text": EXAMPLE_TEXT,
73
  "analysis": {
74
+ "character_count": 278,
75
+ "word_count": 48,
76
  "sentence_count": 4,
77
+ "arabic_character_count": 227,
78
+ "arabic_character_ratio": 0.8165467625899281,
79
+ "average_word_length": 4.8125,
80
+ "average_sentence_length": 12.0,
81
  "has_diacritics": False,
82
+ "punctuation_count": 5
83
+ }
84
  },
85
  "batch_classification": {
86
  "results": [
87
  {
88
+ "prediction": "economy",
89
+ "prediction_index": 1,
90
+ "confidence": 0.851430067618738,
91
  "probability_distribution": {
92
+ "culture": 0.03446028829161909,
93
+ "economy": 0.851430067618738,
94
+ "international": 0.005894582125851457,
95
+ "local": 0.07395284130576442,
96
+ "religion": 0.006103062287269104,
97
+ "sports": 0.028159158370758375
98
  },
99
+ "cleaned_text": "يكون سعر فكه خضراو وسم بات اقل غير وسم تلج محل خضرو الي عرض فكه سعر نسب وسم بسب وفر نتج قصر امر علي سعر اقل سيك طعم اشه ولذ نول وسم فصل خرف وفر تفح تين نجر كمثر",
100
+ "model_used": "traditional_svm_classifier",
101
  "prediction_metadata": {
102
+ "max_probability": 0.851430067618738,
103
+ "min_probability": 0.005894582125851457,
104
+ "entropy": 0.607505444397693,
105
+ "num_classes": 6
106
+ }
107
+ },
108
+ {
109
+ "prediction": "economy",
110
+ "prediction_index": 1,
111
+ "confidence": 0.594920291892315,
112
+ "probability_distribution": {
113
+ "culture": 0.2745678397668158,
114
+ "economy": 0.594920291892315,
115
+ "international": 0.026763844094989445,
116
+ "local": 0.059508468104556384,
117
+ "religion": 0.010869503926651195,
118
+ "sports": 0.033370152114672154
119
+ },
120
+ "cleaned_text": "نص تجرب اخر تصن لغ عرب",
121
+ "model_used": "traditional_svm_classifier",
122
+ "prediction_metadata": {
123
+ "max_probability": 0.594920291892315,
124
+ "min_probability": 0.010869503926651195,
125
+ "entropy": 1.2244037883788486,
126
+ "num_classes": 6
127
+ }
128
+ },
129
+ {
130
+ "prediction": "local",
131
+ "prediction_index": 3,
132
+ "confidence": 0.8469012893956159,
133
+ "probability_distribution": {
134
+ "culture": 0.03610036096593097,
135
+ "economy": 0.02966970458779607,
136
+ "international": 0.00998550970524301,
137
+ "local": 0.8469012893956159,
138
+ "religion": 0.025074648364464147,
139
+ "sports": 0.05237313697694993
140
  },
141
+ "cleaned_text": "طعم دين قدم طب شه تنع",
142
+ "model_used": "traditional_svm_classifier",
143
+ "prediction_metadata": {
144
+ "max_probability": 0.8469012893956159,
145
+ "min_probability": 0.00998550970524301,
146
+ "entropy": 0.5892773509761756,
147
+ "num_classes": 6
148
+ }
149
  }
150
  ],
151
  "total_texts": 3,
152
+ "model_used": "traditional_svm"
153
  },
154
  "sentence_analysis": {
155
  "sentences": [
156
  {
157
  "index": 0,
158
  "sentence": "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
159
+ "score": 2.9675719816913095,
160
+ "rank": 3
161
  },
162
  {
163
  "index": 1,
164
  "sentence": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
165
+ "score": 3.2243029195650847,
166
+ "rank": 2
167
  },
168
  {
169
  "index": 2,
170
  "sentence": "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
171
+ "score": 3.2339935441416645,
172
+ "rank": 1
173
  },
174
  {
175
  "index": 3,
176
  "sentence": "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
177
+ "score": 2.6416207226871506,
178
+ "rank": 4
179
+ }
180
  ],
181
  "total_sentences": 4,
182
+ "score_statistics": {
183
+ "mean": 3.0168722920213025,
184
+ "std": 0.24156436215421892,
185
+ "min": 2.6416207226871506,
186
+ "max": 3.2339935441416645
187
+ }
188
  },
189
  "complete_analysis": {
190
  "original_text": EXAMPLE_TEXT,
191
  "text_analysis": {
192
+ "character_count": 278,
193
+ "word_count": 48,
194
  "sentence_count": 4,
195
+ "arabic_character_count": 227,
196
+ "arabic_character_ratio": 0.8165467625899281,
197
+ "average_word_length": 4.8125,
198
+ "average_sentence_length": 12.0,
199
  "has_diacritics": False,
200
+ "punctuation_count": 5
201
  },
202
  "classification": {
203
+ "prediction": "economy",
204
+ "prediction_index": 1,
205
+ "confidence": 0.851430067618738,
206
  "probability_distribution": {
207
+ "culture": 0.03446028829161909,
208
+ "economy": 0.851430067618738,
209
+ "international": 0.005894582125851457,
210
+ "local": 0.07395284130576442,
211
+ "religion": 0.006103062287269104,
212
+ "sports": 0.028159158370758375
213
  },
214
+ "cleaned_text": "يكون سعر فكه خضراو وسم بات اقل غير وسم تلج محل خضرو الي عرض فكه سعر نسب وسم بسب وفر نتج قصر امر علي سعر اقل سيك طعم اشه ولذ نول وسم فصل خرف وفر تفح تين نجر كمثر",
215
+ "model_used": "traditional_svm_classifier",
216
  "prediction_metadata": {
217
+ "max_probability": 0.851430067618738,
218
+ "min_probability": 0.005894582125851457,
219
+ "entropy": 0.607505444397693,
220
+ "num_classes": 6
221
+ }
222
  },
223
  "summarization": {
224
  "summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
 
228
  "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
229
  "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
230
  "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
231
+ "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى"
232
  ],
233
  "selected_indices": [1, 2],
234
+ "sentence_scores": [2.9675719816913095, 3.2243029195650847, 3.2339935441416645, 2.6416207226871506],
235
+ "top_sentence_scores": [3.2243029195650847, 3.2339935441416645]
236
+ }
237
  },
238
  "preprocessing": {
239
  "task_type": "classification",
240
  "preprocessing_steps": {
241
+ "original": EXAMPLE_TEXT,
242
+ "stripped_lowered": EXAMPLE_TEXT,
243
+ "normalized": "يكون سعر الفاكهه والخضراوات في موسم انباتها اقل من غيره من المواسم. ستلجا محلات الخضروات الي عرض الفاكهه باسعار مناسبه في موسمها بسبب توفر المنتجات. لا يقتصر الامر علي السعر الاقل، بل سيكون طعامك اشهي والذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري.",
244
+ "diacritics_removed": "يكون سعر الفاكهه والخضراوات في موسم انباتها اقل من غيره من المواسم. ستلجا محلات الخضروات الي عرض الفاكهه باسعار مناسبه في موسمها بسبب توفر المنتجات. لا يقتصر الامر علي السعر الاقل، بل سيكون طعامك اشهي والذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري.",
245
+ "punctuation_removed": "يكون سعر الفاكهه والخضراوات في موسم انباتها اقل من غيره من المواسم ستلجا محلات الخضروات الي عرض الفاكهه باسعار مناسبه في موسمها بسبب توفر المنتجات لا يقتصر الامر علي السعر الاقل بل سيكون طعامك اشهي والذ عند تناوله في موسمه في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري ",
246
+ "repeated_chars_reduced": "يكون سعر الفاكه والخضراوات في موسم انباتها اقل من غيره من المواسم ستلجا محلات الخضروات الي عرض الفاكه باسعار مناسبه في موسمها بسب توفر المنتجات لا يقتصر الامر علي السعر الاقل بل سيكون طعامك اشهي والذ عند تناوله في موسمه في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري",
247
+ "whitespace_normalized": "يكون سعر الفاكه والخضراوات في موسم انباتها اقل من غيره من المواسم ستلجا محلات الخضروات الي عرض الفاكه باسعار مناسبه في موسمها بسب توفر المنتجات لا يقتصر الامر علي السعر الاقل بل سيكون طعامك اشهي والذ عند تناوله في موسمه في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري",
248
+ "final_result": "يكون سعر فكه خضراو وسم بات اقل غير وسم تلج محل خضرو الي عرض فكه سعر نسب وسم بسب وفر نتج قصر امر علي سعر اقل سيك طعم اشه ولذ نول وسم فصل خرف وفر تفح تين نجر كمثر",
249
  "preprocessing_summary": {
250
+ "original_length": 278,
251
+ "final_length": 165,
252
+ "reduction_percentage": 40.6,
253
+ "words_removed": 25,
254
+ "words_remaining": 23
255
+ }
256
+ }
257
  },
258
  "model_info": {
259
  "classifier": {
260
+ "model_name": "traditional_svm_classifier",
261
+ "model_type": "SVC",
 
 
 
 
 
 
 
 
 
262
  "num_classes": 6,
263
+ "classes": ["culture", "economy", "international", "local", "religion", "sports"],
264
+ "has_predict_proba": True,
265
+ "has_vectorizer": True,
266
+ "vectorizer_type": "TfidfVectorizer",
267
+ "model_manager": {
268
+ "model_name": "traditional_svm",
269
+ "model_description": "Traditional SVM classifier with TF-IDF vectorization",
270
+ "model_config": {
271
+ "type": "traditional",
272
+ "classifier_path": "traditional_svm_classifier.joblib",
273
+ "vectorizer_path": "traditional_tfidf_vectorizer_classifier.joblib",
274
+ "description": "Traditional SVM classifier with TF-IDF vectorization"
275
+ },
276
+ "is_cached": True
277
+ }
278
  },
279
  "summarizer": {
280
  "vectorizer_loaded": True,
281
+ "model_type": "TF-IDF based summarization"
282
+ }
283
  },
284
+ "available_models": {
285
+ "models": {
286
+ "traditional_svm": {
287
+ "description": "Traditional SVM classifier with TF-IDF vectorization",
288
+ "type": "traditional",
289
+ "available": True,
290
+ "missing_files": [],
291
+ "is_default": True,
292
+ "is_cached": True
293
+ },
294
+ "modern_bert": {
295
+ "description": "Modern BERT-based transformer classifier",
296
+ "type": "modern",
297
+ "available": True,
298
+ "missing_files": [],
299
+ "is_default": False,
300
+ "is_cached": False
301
+ },
302
+ "modern_lstm": {
303
+ "description": "Modern LSTM-based neural network classifier",
304
+ "type": "modern",
305
+ "available": True,
306
+ "missing_files": [],
307
+ "is_default": False,
308
+ "is_cached": False
309
+ }
310
+ },
311
+ "current_model": "traditional_svm"
312
+ }
313
  }
model_manager.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import Any, Dict, List, Optional

from traditional_classifier import TraditionalClassifier

try:
    from modern_classifier import ModernClassifier
    MODERN_MODELS_AVAILABLE = True
except ImportError:
    MODERN_MODELS_AVAILABLE = False
10
+
11
+
12
class ModelManager:
    """Manages different types of Arabic text classification models with per-request model selection and caching."""

    # Registry of every model this manager knows how to load.
    # Keys are the public model names accepted by the API; values describe
    # the artifact files and the loader family:
    #   "traditional" -> joblib classifier + vectorizer (TraditionalClassifier)
    #   "modern"      -> torch checkpoint (ModernClassifier)
    AVAILABLE_MODELS = {
        "traditional_svm": {
            "type": "traditional",
            "classifier_path": "traditional_svm_classifier.joblib",
            "vectorizer_path": "traditional_tfidf_vectorizer_classifier.joblib",
            "description": "Traditional SVM classifier with TF-IDF vectorization"
        },

        "modern_bert": {
            "type": "modern",
            "model_type": "bert",
            "model_path": "modern_bert_classifier.safetensors",
            "config_path": "config.json",
            "description": "Modern BERT-based transformer classifier"
        },

        "modern_lstm": {
            "type": "modern",
            "model_type": "lstm",
            "model_path": "modern_lstm_classifier.pth",
            "description": "Modern LSTM-based neural network classifier"
        }
    }

    def __init__(self, default_model: str = "traditional_svm"):
        """Create a manager that falls back to *default_model* when a request names no model."""
        self.default_model = default_model
        self._model_cache = {}  # model_name -> loaded classifier instance

    def _get_model(self, model_name: str):
        """Return a loaded classifier for *model_name*, reusing the cache when possible.

        Raises:
            ValueError: unknown model name, or a malformed registry entry.
            FileNotFoundError: a required artifact file is missing on disk.
            ImportError: a modern model was requested but torch/transformers
                are not installed (MODERN_MODELS_AVAILABLE is False).
        """
        if model_name not in self.AVAILABLE_MODELS:
            raise ValueError(f"Model '{model_name}' not available. Available models: {list(self.AVAILABLE_MODELS.keys())}")

        if model_name in self._model_cache:
            return self._model_cache[model_name]

        model_config = self.AVAILABLE_MODELS[model_name]

        if model_config["type"] == "traditional":
            classifier_path = model_config["classifier_path"]
            vectorizer_path = model_config["vectorizer_path"]

            # Check both artifacts up front so the error names the missing file.
            if not os.path.exists(classifier_path):
                raise FileNotFoundError(f"Classifier file not found: {classifier_path}")
            if not os.path.exists(vectorizer_path):
                raise FileNotFoundError(f"Vectorizer file not found: {vectorizer_path}")

            model = TraditionalClassifier(classifier_path, vectorizer_path)

        elif model_config["type"] == "modern":
            if not MODERN_MODELS_AVAILABLE:
                raise ImportError("Modern models require PyTorch and transformers")

            model_path = model_config["model_path"]

            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model file not found: {model_path}")

            # An optional config file is silently dropped when absent; the
            # classifier then falls back to its built-in defaults.
            config_path = model_config.get("config_path")
            if config_path and not os.path.exists(config_path):
                config_path = None

            model = ModernClassifier(
                model_type=model_config["model_type"],
                model_path=model_path,
                config_path=config_path
            )

        else:
            # Fix: previously `model` was left unbound here, so a malformed
            # registry entry surfaced as a confusing NameError below.
            raise ValueError(f"Unsupported model config type: {model_config['type']}")

        self._model_cache[model_name] = model
        return model

    def predict(self, text: str, model_name: Optional[str] = None) -> Dict[str, Any]:
        """Predict using the specified model (or default if none specified).

        The classifier's result dict is annotated with a "model_manager"
        section recording which registry entry served the request.
        """
        if model_name is None:
            model_name = self.default_model

        model = self._get_model(model_name)
        result = model.predict(text)

        result["model_manager"] = {
            "model_used": model_name,
            "model_description": self.AVAILABLE_MODELS[model_name]["description"]
        }
        return result

    def predict_batch(self, texts: list, model_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Predict batch using the specified model (or default if none specified).

        Every per-text result dict gets the same "model_manager" annotation
        as :meth:`predict`.
        """
        if model_name is None:
            model_name = self.default_model

        model = self._get_model(model_name)
        results = model.predict_batch(texts)

        for result in results:
            result["model_manager"] = {
                "model_used": model_name,
                "model_description": self.AVAILABLE_MODELS[model_name]["description"]
            }
        return results

    def get_model_info(self, model_name: Optional[str] = None) -> Dict[str, Any]:
        """Get information about a specific model (or default if none specified).

        Note: this loads the model (and caches it) if it is not loaded yet,
        so it can raise the same errors as :meth:`_get_model`.
        """
        if model_name is None:
            model_name = self.default_model

        model = self._get_model(model_name)
        model_info = model.get_model_info()
        model_info.update({
            "model_manager": {
                "model_name": model_name,
                "model_description": self.AVAILABLE_MODELS[model_name]["description"],
                "model_config": self.AVAILABLE_MODELS[model_name],
                "is_cached": model_name in self._model_cache
            }
        })
        return model_info

    def get_available_models(self) -> Dict[str, Any]:
        """Get list of all available models.

        For each registry entry, reports whether its artifact files exist on
        disk (without loading anything), which files are missing, and whether
        the model is the default / already cached.
        """
        available = {}
        for model_name, config in self.AVAILABLE_MODELS.items():
            files_exist = True
            missing_files = []

            if config["type"] == "traditional":
                for file_key in ["classifier_path", "vectorizer_path"]:
                    if not os.path.exists(config[file_key]):
                        files_exist = False
                        missing_files.append(config[file_key])
            elif config["type"] == "modern":
                if not os.path.exists(config["model_path"]):
                    files_exist = False
                    missing_files.append(config["model_path"])

            available[model_name] = {
                "description": config["description"],
                "type": config["type"],
                "available": files_exist,
                "missing_files": missing_files if not files_exist else [],
                "is_default": model_name == self.default_model,
                "is_cached": model_name in self._model_cache
            }

        return available

    def clear_cache(self, model_name: Optional[str] = None) -> Dict[str, Any]:
        """Clear model cache (specific model or all models)."""
        if model_name:
            if model_name in self._model_cache:
                del self._model_cache[model_name]
                return {"message": f"Cache cleared for model: {model_name}"}
            else:
                return {"message": f"Model {model_name} was not cached"}
        else:
            cleared_count = len(self._model_cache)
            self._model_cache.clear()
            return {"message": f"Cache cleared for {cleared_count} models"}

    def get_cache_status(self) -> Dict[str, Any]:
        """Get information about cached models."""
        return {
            "cached_models": list(self._model_cache.keys()),
            "cache_count": len(self._model_cache),
            "default_model": self.default_model
        }
modern_bert_classifier.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd36d704baf314e519a6abbcde8b6239ba3867dc4a44acc9c690701b863a3a7d
3
+ size 442511368
modern_classifier.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from typing import List, Dict, Any, Optional
5
+ from preprocessor import preprocess_for_classification
6
+ import re
7
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
+ from safetensors.torch import load_file
9
+ from transformers import AutoConfig
10
+
11
+
12
class LSTMClassifier(nn.Module):
    """LSTM-based Arabic text classifier.

    Embeds token-index sequences, runs them through a multi-layer
    (optionally bidirectional) LSTM, and maps the final hidden state of the
    last layer to per-class logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=2, bidirectional=False):
        super().__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=0.3,
            bidirectional=self.bidirectional,
        )
        # Bidirectional runs double the feature size fed to the classifier head.
        self.fc = nn.Linear(hidden_dim * 2 if self.bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # hidden has shape (num_layers * num_directions, batch, hidden_dim).
        _, (hidden, _) = self.lstm(self.embedding(x))
        if self.bidirectional:
            # Concatenate the last layer's forward and backward final states.
            summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            summary = hidden[-1]
        return self.fc(self.dropout(summary))
+
44
+
45
+ class ModernClassifier:
46
+ """Modern Arabic text classifier supporting BERT and LSTM models."""
47
+
48
+ def __init__(self, model_type: str, model_path: str, config_path: Optional[str] = None):
49
+ self.model_type = model_type.lower()
50
+ self.model_path = model_path
51
+ self.config_path = config_path
52
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
53
+
54
+ self.classes = np.array(['culture', 'economy', 'international', 'local', 'religion', 'sports'])
55
+
56
+ if self.model_type == 'bert':
57
+ self._load_bert_model()
58
+ elif self.model_type == 'lstm':
59
+ self._load_lstm_model()
60
+ else:
61
+ raise ValueError(f"Unsupported model type: {model_type}")
62
+
63
+ self.model_name = f"{model_type}_classifier"
64
+
65
+ def _load_bert_model(self):
66
+ """Load BERT model from safetensors."""
67
+ try:
68
+ self.tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv2')
69
+ state_dict = load_file(self.model_path)
70
+ embed_key = next(k for k in state_dict if 'embeddings.word_embeddings.weight' in k)
71
+ checkpoint_vocab_size = state_dict[embed_key].shape[0]
72
+ config = AutoConfig.from_pretrained(
73
+ 'aubmindlab/bert-base-arabertv2',
74
+ num_labels=len(self.classes),
75
+ vocab_size=checkpoint_vocab_size
76
+ )
77
+ self.model = AutoModelForSequenceClassification.from_config(config)
78
+ self.model.resize_token_embeddings(checkpoint_vocab_size)
79
+ self.model.load_state_dict(state_dict, strict=False)
80
+ self.model.to(self.device)
81
+ self.model.eval()
82
+ except Exception as e:
83
+ raise RuntimeError(f"Error loading BERT model: {e}")
84
+
85
+ def _load_lstm_model(self):
86
+ """Load LSTM model from .pth file."""
87
+ try:
88
+ checkpoint = torch.load(self.model_path, map_location=self.device)
89
+ state_dict = checkpoint.get('model_state_dict', checkpoint)
90
+ vocab_size, embedding_dim = state_dict['embedding.weight'].shape
91
+ _, hidden_dim = state_dict['lstm.weight_hh_l0'].shape
92
+ layer_nums = set(int(re.match(r'lstm\.weight_ih_l(\d+)', k).group(1))
93
+ for k in state_dict if re.match(r'lstm\.weight_ih_l(\d+)$', k))
94
+ num_layers = len(layer_nums)
95
+ bidirectional = True
96
+ output_dim = len(self.classes)
97
+ self.model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim,
98
+ output_dim, num_layers=num_layers,
99
+ bidirectional=bidirectional)
100
+ self.model.load_state_dict(state_dict, strict=False)
101
+ self.model.to(self.device)
102
+ self.model.eval()
103
+ self.vocab = checkpoint.get('vocab', {})
104
+ except Exception as e:
105
+ raise RuntimeError(f"Error loading LSTM model: {e}")
106
+
107
+ def _preprocess_text_for_bert(self, text: str) -> Dict[str, torch.Tensor]:
108
+ """Preprocess text for BERT model."""
109
+ cleaned_text = preprocess_for_classification(text)
110
+
111
+ inputs = self.tokenizer(
112
+ cleaned_text,
113
+ return_tensors='pt',
114
+ truncation=True,
115
+ padding=True,
116
+ max_length=512
117
+ )
118
+
119
+ return {key: value.to(self.device) for key, value in inputs.items()}
120
+
121
+ def _preprocess_text_for_lstm(self, text: str) -> torch.Tensor:
122
+ """Preprocess text for LSTM model."""
123
+ cleaned_text = preprocess_for_classification(text)
124
+
125
+ tokens = cleaned_text.split()
126
+
127
+ if hasattr(self, 'vocab') and self.vocab:
128
+ indices = [self.vocab.get(token, 0) for token in tokens]
129
+ else:
130
+ indices = [hash(token) % 10000 for token in tokens]
131
+
132
+ max_length = 100
133
+ if len(indices) > max_length:
134
+ indices = indices[:max_length]
135
+ else:
136
+ indices.extend([0] * (max_length - len(indices)))
137
+
138
+ return torch.tensor([indices], dtype=torch.long).to(self.device)
139
+
140
+ def predict(self, text: str) -> Dict[str, Any]:
141
+ """Predict class with full probability distribution and metadata."""
142
+ cleaned_text = preprocess_for_classification(text)
143
+
144
+ with torch.no_grad():
145
+ if self.model_type == 'bert':
146
+ inputs = self._preprocess_text_for_bert(text)
147
+ outputs = self.model(**inputs)
148
+ logits = outputs.logits
149
+ elif self.model_type == 'lstm':
150
+ inputs = self._preprocess_text_for_lstm(text)
151
+ logits = self.model(inputs)
152
+
153
+ probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]
154
+
155
+ prediction_index = int(np.argmax(probabilities))
156
+ prediction = self.classes[prediction_index]
157
+ confidence = float(probabilities[prediction_index])
158
+
159
+ prob_distribution = {}
160
+ for i, class_label in enumerate(self.classes):
161
+ prob_distribution[str(class_label)] = float(probabilities[i])
162
+
163
+ return {
164
+ "prediction": str(prediction),
165
+ "prediction_index": prediction_index,
166
+ "confidence": confidence,
167
+ "probability_distribution": prob_distribution,
168
+ "cleaned_text": cleaned_text,
169
+ "model_used": self.model_name,
170
+ "prediction_metadata": {
171
+ "max_probability": float(np.max(probabilities)),
172
+ "min_probability": float(np.min(probabilities)),
173
+ "entropy": float(-np.sum(probabilities * np.log(probabilities + 1e-10))),
174
+ "num_classes": len(probabilities),
175
+ "model_type": self.model_type,
176
+ "device": str(self.device)
177
+ },
178
+ }
179
+
180
+ def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
181
+ """Predict classes for multiple texts using true batch processing."""
182
+ if not texts:
183
+ return []
184
+
185
+ cleaned_texts = [preprocess_for_classification(text) for text in texts]
186
+
187
+ with torch.no_grad():
188
+ if self.model_type == 'bert':
189
+ inputs = self.tokenizer(
190
+ cleaned_texts,
191
+ return_tensors='pt',
192
+ truncation=True,
193
+ padding=True,
194
+ max_length=512
195
+ )
196
+ inputs = {key: value.to(self.device) for key, value in inputs.items()}
197
+ outputs = self.model(**inputs)
198
+ logits = outputs.logits
199
+
200
+ elif self.model_type == 'lstm':
201
+ batch_indices = []
202
+ max_length = 100
203
+
204
+ for cleaned_text in cleaned_texts:
205
+ tokens = cleaned_text.split()
206
+ if hasattr(self, 'vocab') and self.vocab:
207
+ indices = [self.vocab.get(token, 0) for token in tokens]
208
+ else:
209
+ indices = [hash(token) % 10000 for token in tokens]
210
+
211
+ if len(indices) > max_length:
212
+ indices = indices[:max_length]
213
+ else:
214
+ indices.extend([0] * (max_length - len(indices)))
215
+
216
+ batch_indices.append(indices)
217
+
218
+ batch_tensor = torch.tensor(batch_indices, dtype=torch.long).to(self.device)
219
+ logits = self.model(batch_tensor)
220
+
221
+ probabilities = torch.softmax(logits, dim=-1).cpu().numpy()
222
+
223
+ results = []
224
+ for i, (text, cleaned_text) in enumerate(zip(texts, cleaned_texts)):
225
+ probs = probabilities[i]
226
+ prediction_index = int(np.argmax(probs))
227
+ prediction = self.classes[prediction_index]
228
+ confidence = float(probs[prediction_index])
229
+
230
+ prob_distribution = {}
231
+ for j, class_label in enumerate(self.classes):
232
+ prob_distribution[str(class_label)] = float(probs[j])
233
+
234
+ result = {
235
+ "prediction": str(prediction),
236
+ "prediction_index": prediction_index,
237
+ "confidence": confidence,
238
+ "probability_distribution": prob_distribution,
239
+ "cleaned_text": cleaned_text,
240
+ "model_used": self.model_name,
241
+ "prediction_metadata": {
242
+ "max_probability": float(np.max(probs)),
243
+ "min_probability": float(np.min(probs)),
244
+ "entropy": float(-np.sum(probs * np.log(probs + 1e-10))),
245
+ "num_classes": len(probs),
246
+ "model_type": self.model_type,
247
+ "device": str(self.device)
248
+ },
249
+ }
250
+ results.append(result)
251
+
252
+ return results
253
+
254
+ def get_model_info(self) -> Dict[str, Any]:
255
+ """Get model information and capabilities."""
256
+ return {
257
+ "model_name": self.model_name,
258
+ "model_type": self.model_type,
259
+ "model_path": self.model_path,
260
+ "num_classes": len(self.classes),
261
+ "classes": self.classes.tolist(),
262
+ "device": str(self.device),
263
+ "has_predict_proba": True,
264
+ "framework": "pytorch",
265
+ "modern_model": True
266
+ }
modern_lstm_classifier.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bc096c6cdc0523b378529139c5bcbb9dbcbcd9351ba6babc272a90b1d78bd4f
3
+ size 5921296
requirements.txt CHANGED
@@ -4,3 +4,6 @@ scikit-learn
4
  nltk
5
  joblib
6
  numpy
 
 
 
 
4
  nltk
5
  joblib
6
  numpy
7
+ torch
8
+ transformers
9
+ safetensors
summarizer.py CHANGED
@@ -8,7 +8,7 @@ from preprocessor import preprocess_for_summarization
8
  class ArabicSummarizer:
9
  """Arabic text summarizer using TF-IDF scoring."""
10
 
11
- def __init__(self, vectorizer_path: str = "tfidf_vectorizer_text_summarization.joblib"):
12
  self.vectorizer = joblib.load(vectorizer_path)
13
 
14
  def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
 
8
  class ArabicSummarizer:
9
  """Arabic text summarizer using TF-IDF scoring."""
10
 
11
+ def __init__(self, vectorizer_path: str = "traditional_tfidf_vectorizer_summarization.joblib"):
12
  self.vectorizer = joblib.load(vectorizer_path)
13
 
14
  def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
classifier.py → traditional_classifier.py RENAMED
@@ -1,16 +1,16 @@
1
- import joblib
2
  import numpy as np
 
3
  from typing import List, Dict, Any
4
  from preprocessor import preprocess_for_classification
5
 
6
 
7
- class ArabicClassifier:
8
- """Arabic text classifier with probability distributions and metadata."""
9
 
10
  def __init__(
11
  self,
12
- classifier_path: str = "svm_classifier.joblib",
13
- vectorizer_path: str = "tfidf_vectorizer_classifier.joblib",
14
  ):
15
  self.model = joblib.load(classifier_path)
16
  self.vectorizer = joblib.load(vectorizer_path)
 
 
1
  import numpy as np
2
+ import joblib
3
  from typing import List, Dict, Any
4
  from preprocessor import preprocess_for_classification
5
 
6
 
7
+ class TraditionalClassifier:
8
+ """Traditional text classifier with probability distributions and metadata."""
9
 
10
  def __init__(
11
  self,
12
+ classifier_path: str = "traditional_svm_classifier.joblib",
13
+ vectorizer_path: str = "traditional_tfidf_vectorizer_classifier.joblib",
14
  ):
15
  self.model = joblib.load(classifier_path)
16
  self.vectorizer = joblib.load(vectorizer_path)
svm_classifier.joblib → traditional_svm_classifier.joblib RENAMED
File without changes
tfidf_vectorizer_classifier.joblib → traditional_tfidf_vectorizer_classifier.joblib RENAMED
File without changes
tfidf_vectorizer_text_summarization.joblib → traditional_tfidf_vectorizer_summarization.joblib RENAMED
File without changes