moabos
commited on
Commit
·
123e49c
1
Parent(s):
29dedef
feat: integrate 2 modern models for classification and update routes and examples
Browse files- Dockerfile +0 -1
- README.md +1 -1
- app.py +121 -75
- examples.py +207 -108
- model_manager.py +179 -0
- modern_bert_classifier.safetensors +3 -0
- modern_classifier.py +266 -0
- modern_lstm_classifier.pth +3 -0
- requirements.txt +3 -0
- summarizer.py +1 -1
- classifier.py → traditional_classifier.py +5 -5
- svm_classifier.joblib → traditional_svm_classifier.joblib +0 -0
- tfidf_vectorizer_classifier.joblib → traditional_tfidf_vectorizer_classifier.joblib +0 -0
- tfidf_vectorizer_text_summarization.joblib → traditional_tfidf_vectorizer_summarization.joblib +0 -0
Dockerfile
CHANGED
@@ -10,6 +10,5 @@ COPY --chown=user ./requirements.txt requirements.txt
|
|
10 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
11 |
RUN python -m nltk.downloader stopwords
|
12 |
|
13 |
-
|
14 |
COPY --chown=user . /app
|
15 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
10 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
11 |
RUN python -m nltk.downloader stopwords
|
12 |
|
|
|
13 |
COPY --chown=user . /app
|
14 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
@@ -6,7 +6,7 @@ colorTo: green
|
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
license: mit
|
9 |
-
short_description: 'Arabic text summarization and topic classification app.
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
license: mit
|
9 |
+
short_description: 'Arabic text summarization and topic classification app.'
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -3,9 +3,9 @@ from fastapi import FastAPI, HTTPException
|
|
3 |
from pydantic import BaseModel
|
4 |
from enum import Enum
|
5 |
|
6 |
-
from classifier import ArabicClassifier
|
7 |
from summarizer import ArabicSummarizer
|
8 |
from preprocessor import ArabicPreprocessor
|
|
|
9 |
from examples import REQUEST_EXAMPLES, RESPONSE_EXAMPLES
|
10 |
|
11 |
|
@@ -14,56 +14,55 @@ class TaskType(str, Enum):
|
|
14 |
SUMMARIZATION = "summarization"
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
app = FastAPI(
|
18 |
title="Arabic Text Analysis API",
|
19 |
-
description="API for Arabic text classification, summarization, and preprocessing",
|
20 |
-
version="1.0.0"
|
21 |
)
|
22 |
|
23 |
-
|
24 |
-
summarizer = ArabicSummarizer("
|
25 |
preprocessor = ArabicPreprocessor()
|
26 |
|
27 |
|
28 |
class TextInput(BaseModel):
|
29 |
text: str
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
"example": REQUEST_EXAMPLES["text_input"]
|
34 |
-
}
|
35 |
-
}
|
36 |
|
37 |
|
38 |
class TextInputWithSentences(BaseModel):
|
39 |
text: str
|
40 |
num_sentences: Optional[int] = 3
|
41 |
-
|
|
|
42 |
model_config = {
|
43 |
-
"json_schema_extra": {
|
44 |
-
"example": REQUEST_EXAMPLES["text_input_with_sentences"]
|
45 |
-
}
|
46 |
}
|
47 |
|
48 |
|
49 |
class BatchTextInput(BaseModel):
|
50 |
texts: List[str]
|
51 |
-
|
|
|
52 |
model_config = {
|
53 |
-
"json_schema_extra": {
|
54 |
-
"example": REQUEST_EXAMPLES["batch_text_input"]
|
55 |
-
}
|
56 |
}
|
57 |
|
58 |
|
59 |
class PreprocessingInput(BaseModel):
|
60 |
text: str
|
61 |
task_type: TaskType = TaskType.CLASSIFICATION
|
62 |
-
|
63 |
model_config = {
|
64 |
-
"json_schema_extra": {
|
65 |
-
"example": REQUEST_EXAMPLES["preprocessing_input"]
|
66 |
-
}
|
67 |
}
|
68 |
|
69 |
|
@@ -86,8 +85,8 @@ class ClassificationResponse(BaseModel):
|
|
86 |
"description": "Numerical index of the predicted class (0=culture, 1=economy, 2=international, 3=local, 4=religion, 5=sports)"
|
87 |
}
|
88 |
}
|
89 |
-
}
|
90 |
-
}
|
91 |
}
|
92 |
|
93 |
|
@@ -101,9 +100,7 @@ class SummarizationResponse(BaseModel):
|
|
101 |
top_sentence_scores: Optional[List[float]]
|
102 |
|
103 |
model_config = {
|
104 |
-
"json_schema_extra": {
|
105 |
-
"example": RESPONSE_EXAMPLES["summarization"]
|
106 |
-
}
|
107 |
}
|
108 |
|
109 |
|
@@ -112,9 +109,7 @@ class TextAnalysisResponse(BaseModel):
|
|
112 |
analysis: Dict[str, Any]
|
113 |
|
114 |
model_config = {
|
115 |
-
"json_schema_extra": {
|
116 |
-
"example": RESPONSE_EXAMPLES["text_analysis"]
|
117 |
-
}
|
118 |
}
|
119 |
|
120 |
|
@@ -125,9 +120,7 @@ class BatchClassificationResponse(BaseModel):
|
|
125 |
|
126 |
model_config = {
|
127 |
"protected_namespaces": (),
|
128 |
-
"json_schema_extra": {
|
129 |
-
"example": RESPONSE_EXAMPLES["batch_classification"]
|
130 |
-
}
|
131 |
}
|
132 |
|
133 |
|
@@ -137,9 +130,7 @@ class SentenceAnalysisResponse(BaseModel):
|
|
137 |
score_statistics: Dict[str, float]
|
138 |
|
139 |
model_config = {
|
140 |
-
"json_schema_extra": {
|
141 |
-
"example": RESPONSE_EXAMPLES["sentence_analysis"]
|
142 |
-
}
|
143 |
}
|
144 |
|
145 |
|
@@ -150,9 +141,7 @@ class CompleteAnalysisResponse(BaseModel):
|
|
150 |
summarization: SummarizationResponse
|
151 |
|
152 |
model_config = {
|
153 |
-
"json_schema_extra": {
|
154 |
-
"example": RESPONSE_EXAMPLES["complete_analysis"]
|
155 |
-
}
|
156 |
}
|
157 |
|
158 |
|
@@ -161,9 +150,7 @@ class PreprocessingResponse(BaseModel):
|
|
161 |
preprocessing_steps: Dict[str, Any]
|
162 |
|
163 |
model_config = {
|
164 |
-
"json_schema_extra": {
|
165 |
-
"example": RESPONSE_EXAMPLES["preprocessing"]
|
166 |
-
}
|
167 |
}
|
168 |
|
169 |
|
@@ -171,11 +158,25 @@ class ModelInfoResponse(BaseModel):
|
|
171 |
classifier: Dict[str, Any]
|
172 |
summarizer: Dict[str, Any]
|
173 |
|
174 |
-
model_config = {
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
|
180 |
|
181 |
@app.get("/")
|
@@ -186,7 +187,7 @@ def read_root() -> Dict[str, Any]:
|
|
186 |
"documentation": {
|
187 |
"interactive_docs": "/docs",
|
188 |
"redoc": "/redoc",
|
189 |
-
"openapi_schema": "/openapi.json"
|
190 |
},
|
191 |
"endpoints": {
|
192 |
"classify": "POST /classify - Classify Arabic text",
|
@@ -196,8 +197,9 @@ def read_root() -> Dict[str, Any]:
|
|
196 |
"preprocess": "POST /preprocess - Preprocess text with detailed steps",
|
197 |
"text_analysis": "POST /text-analysis - Analyze text characteristics",
|
198 |
"sentence_analysis": "POST /sentence-analysis - Detailed sentence analysis",
|
199 |
-
"model_info": "GET /model-info - Get model information"
|
200 |
-
|
|
|
201 |
}
|
202 |
|
203 |
|
@@ -205,7 +207,8 @@ def read_root() -> Dict[str, Any]:
|
|
205 |
def classify_text(data: TextInput) -> ClassificationResponse:
|
206 |
"""Classify Arabic text with probability distribution and metadata."""
|
207 |
try:
|
208 |
-
|
|
|
209 |
return result
|
210 |
except Exception as e:
|
211 |
raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
|
@@ -215,14 +218,19 @@ def classify_text(data: TextInput) -> ClassificationResponse:
|
|
215 |
def classify_texts(data: BatchTextInput) -> BatchClassificationResponse:
|
216 |
"""Classify multiple Arabic texts in batch."""
|
217 |
try:
|
218 |
-
|
|
|
|
|
|
|
219 |
return {
|
220 |
"results": results,
|
221 |
"total_texts": len(data.texts),
|
222 |
-
"model_used":
|
223 |
}
|
224 |
except Exception as e:
|
225 |
-
raise HTTPException(
|
|
|
|
|
226 |
|
227 |
|
228 |
@app.post("/summarize", response_model=SummarizationResponse)
|
@@ -242,25 +250,31 @@ def analyze_sentences(data: TextInput) -> SentenceAnalysisResponse:
|
|
242 |
result = summarizer.get_sentence_analysis(data.text)
|
243 |
return result
|
244 |
except Exception as e:
|
245 |
-
raise HTTPException(
|
|
|
|
|
246 |
|
247 |
|
248 |
@app.post("/analyze", response_model=CompleteAnalysisResponse)
|
249 |
def analyze_text_complete(data: TextInputWithSentences) -> CompleteAnalysisResponse:
|
250 |
"""Complete analysis: classification, summarization, and text statistics."""
|
251 |
try:
|
252 |
-
|
|
|
|
|
253 |
summarization_result = summarizer.summarize(data.text, data.num_sentences)
|
254 |
text_stats = preprocessor.analyze_text(data.text)
|
255 |
-
|
256 |
return {
|
257 |
"original_text": data.text,
|
258 |
"text_analysis": text_stats,
|
259 |
"classification": classification_result,
|
260 |
-
"summarization": summarization_result
|
261 |
}
|
262 |
except Exception as e:
|
263 |
-
raise HTTPException(
|
|
|
|
|
264 |
|
265 |
|
266 |
@app.post("/preprocess", response_model=PreprocessingResponse)
|
@@ -268,10 +282,7 @@ def preprocess_text(data: PreprocessingInput) -> PreprocessingResponse:
|
|
268 |
"""Preprocess text with step-by-step breakdown."""
|
269 |
try:
|
270 |
steps = preprocessor.get_preprocessing_steps(data.text, data.task_type.value)
|
271 |
-
return {
|
272 |
-
"task_type": data.task_type.value,
|
273 |
-
"preprocessing_steps": steps
|
274 |
-
}
|
275 |
except Exception as e:
|
276 |
raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
|
277 |
|
@@ -281,25 +292,60 @@ def analyze_text_characteristics(data: TextInput) -> TextAnalysisResponse:
|
|
281 |
"""Analyze text characteristics and statistics."""
|
282 |
try:
|
283 |
analysis = preprocessor.analyze_text(data.text)
|
284 |
-
return {
|
285 |
-
"text": data.text,
|
286 |
-
"analysis": analysis
|
287 |
-
}
|
288 |
except Exception as e:
|
289 |
raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
|
290 |
|
291 |
|
292 |
@app.get("/model-info", response_model=ModelInfoResponse)
|
293 |
-
def get_model_info() -> ModelInfoResponse:
|
294 |
-
"""Get information about
|
295 |
try:
|
296 |
-
|
|
|
297 |
return {
|
298 |
"classifier": classifier_info,
|
299 |
"summarizer": {
|
300 |
-
"vectorizer_loaded": hasattr(summarizer,
|
301 |
-
"model_type": "TF-IDF based summarization"
|
302 |
-
}
|
303 |
}
|
304 |
except Exception as e:
|
305 |
-
raise HTTPException(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from pydantic import BaseModel
|
4 |
from enum import Enum
|
5 |
|
|
|
6 |
from summarizer import ArabicSummarizer
|
7 |
from preprocessor import ArabicPreprocessor
|
8 |
+
from model_manager import ModelManager
|
9 |
from examples import REQUEST_EXAMPLES, RESPONSE_EXAMPLES
|
10 |
|
11 |
|
|
|
14 |
SUMMARIZATION = "summarization"
|
15 |
|
16 |
|
17 |
+
class ModelType(str, Enum):
|
18 |
+
TRADITIONAL_SVM = "traditional_svm"
|
19 |
+
MODERN_BERT = "modern_bert"
|
20 |
+
MODERN_LSTM = "modern_lstm"
|
21 |
+
|
22 |
+
|
23 |
app = FastAPI(
|
24 |
title="Arabic Text Analysis API",
|
25 |
+
description="API for Arabic text classification, summarization, and preprocessing with multiple model support",
|
26 |
+
version="1.0.0",
|
27 |
)
|
28 |
|
29 |
+
model_manager = ModelManager(default_model="traditional_svm")
|
30 |
+
summarizer = ArabicSummarizer("traditional_tfidf_vectorizer_summarization.joblib")
|
31 |
preprocessor = ArabicPreprocessor()
|
32 |
|
33 |
|
34 |
class TextInput(BaseModel):
|
35 |
text: str
|
36 |
+
model: Optional[ModelType] = None
|
37 |
+
|
38 |
+
model_config = {"json_schema_extra": {"example": REQUEST_EXAMPLES["text_input"]}}
|
|
|
|
|
|
|
39 |
|
40 |
|
41 |
class TextInputWithSentences(BaseModel):
|
42 |
text: str
|
43 |
num_sentences: Optional[int] = 3
|
44 |
+
model: Optional[ModelType] = None
|
45 |
+
|
46 |
model_config = {
|
47 |
+
"json_schema_extra": {"example": REQUEST_EXAMPLES["text_input_with_sentences"]}
|
|
|
|
|
48 |
}
|
49 |
|
50 |
|
51 |
class BatchTextInput(BaseModel):
|
52 |
texts: List[str]
|
53 |
+
model: Optional[ModelType] = None
|
54 |
+
|
55 |
model_config = {
|
56 |
+
"json_schema_extra": {"example": REQUEST_EXAMPLES["batch_text_input"]}
|
|
|
|
|
57 |
}
|
58 |
|
59 |
|
60 |
class PreprocessingInput(BaseModel):
|
61 |
text: str
|
62 |
task_type: TaskType = TaskType.CLASSIFICATION
|
63 |
+
|
64 |
model_config = {
|
65 |
+
"json_schema_extra": {"example": REQUEST_EXAMPLES["preprocessing_input"]}
|
|
|
|
|
66 |
}
|
67 |
|
68 |
|
|
|
85 |
"description": "Numerical index of the predicted class (0=culture, 1=economy, 2=international, 3=local, 4=religion, 5=sports)"
|
86 |
}
|
87 |
}
|
88 |
+
},
|
89 |
+
},
|
90 |
}
|
91 |
|
92 |
|
|
|
100 |
top_sentence_scores: Optional[List[float]]
|
101 |
|
102 |
model_config = {
|
103 |
+
"json_schema_extra": {"example": RESPONSE_EXAMPLES["summarization"]}
|
|
|
|
|
104 |
}
|
105 |
|
106 |
|
|
|
109 |
analysis: Dict[str, Any]
|
110 |
|
111 |
model_config = {
|
112 |
+
"json_schema_extra": {"example": RESPONSE_EXAMPLES["text_analysis"]}
|
|
|
|
|
113 |
}
|
114 |
|
115 |
|
|
|
120 |
|
121 |
model_config = {
|
122 |
"protected_namespaces": (),
|
123 |
+
"json_schema_extra": {"example": RESPONSE_EXAMPLES["batch_classification"]},
|
|
|
|
|
124 |
}
|
125 |
|
126 |
|
|
|
130 |
score_statistics: Dict[str, float]
|
131 |
|
132 |
model_config = {
|
133 |
+
"json_schema_extra": {"example": RESPONSE_EXAMPLES["sentence_analysis"]}
|
|
|
|
|
134 |
}
|
135 |
|
136 |
|
|
|
141 |
summarization: SummarizationResponse
|
142 |
|
143 |
model_config = {
|
144 |
+
"json_schema_extra": {"example": RESPONSE_EXAMPLES["complete_analysis"]}
|
|
|
|
|
145 |
}
|
146 |
|
147 |
|
|
|
150 |
preprocessing_steps: Dict[str, Any]
|
151 |
|
152 |
model_config = {
|
153 |
+
"json_schema_extra": {"example": RESPONSE_EXAMPLES["preprocessing"]}
|
|
|
|
|
154 |
}
|
155 |
|
156 |
|
|
|
158 |
classifier: Dict[str, Any]
|
159 |
summarizer: Dict[str, Any]
|
160 |
|
161 |
+
model_config = {"json_schema_extra": {"example": RESPONSE_EXAMPLES["model_info"]}}
|
162 |
+
|
163 |
+
|
164 |
+
class ModelSwitchInput(BaseModel):
|
165 |
+
model: ModelType
|
166 |
+
|
167 |
+
model_config = {"json_schema_extra": {"example": {"model": "modern_bert"}}}
|
168 |
+
|
169 |
+
|
170 |
+
class ModelSwitchResponse(BaseModel):
|
171 |
+
success: bool
|
172 |
+
message: str
|
173 |
+
previous_model: Optional[str] = None
|
174 |
+
current_model: str
|
175 |
+
|
176 |
+
|
177 |
+
class AvailableModelsResponse(BaseModel):
|
178 |
+
models: Dict[str, Any]
|
179 |
+
current_model: str
|
180 |
|
181 |
|
182 |
@app.get("/")
|
|
|
187 |
"documentation": {
|
188 |
"interactive_docs": "/docs",
|
189 |
"redoc": "/redoc",
|
190 |
+
"openapi_schema": "/openapi.json",
|
191 |
},
|
192 |
"endpoints": {
|
193 |
"classify": "POST /classify - Classify Arabic text",
|
|
|
197 |
"preprocess": "POST /preprocess - Preprocess text with detailed steps",
|
198 |
"text_analysis": "POST /text-analysis - Analyze text characteristics",
|
199 |
"sentence_analysis": "POST /sentence-analysis - Detailed sentence analysis",
|
200 |
+
"model_info": "GET /model-info - Get model information",
|
201 |
+
"available_models": "GET /models - Get all available models",
|
202 |
+
},
|
203 |
}
|
204 |
|
205 |
|
|
|
207 |
def classify_text(data: TextInput) -> ClassificationResponse:
|
208 |
"""Classify Arabic text with probability distribution and metadata."""
|
209 |
try:
|
210 |
+
model_name = data.model.value if data.model else None
|
211 |
+
result = model_manager.predict(data.text, model_name)
|
212 |
return result
|
213 |
except Exception as e:
|
214 |
raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
|
|
|
218 |
def classify_texts(data: BatchTextInput) -> BatchClassificationResponse:
|
219 |
"""Classify multiple Arabic texts in batch."""
|
220 |
try:
|
221 |
+
model_name = data.model.value if data.model else None
|
222 |
+
results = model_manager.predict_batch(data.texts, model_name)
|
223 |
+
used_model = model_name or model_manager.default_model
|
224 |
+
|
225 |
return {
|
226 |
"results": results,
|
227 |
"total_texts": len(data.texts),
|
228 |
+
"model_used": used_model,
|
229 |
}
|
230 |
except Exception as e:
|
231 |
+
raise HTTPException(
|
232 |
+
status_code=500, detail=f"Batch classification failed: {str(e)}"
|
233 |
+
)
|
234 |
|
235 |
|
236 |
@app.post("/summarize", response_model=SummarizationResponse)
|
|
|
250 |
result = summarizer.get_sentence_analysis(data.text)
|
251 |
return result
|
252 |
except Exception as e:
|
253 |
+
raise HTTPException(
|
254 |
+
status_code=500, detail=f"Sentence analysis failed: {str(e)}"
|
255 |
+
)
|
256 |
|
257 |
|
258 |
@app.post("/analyze", response_model=CompleteAnalysisResponse)
|
259 |
def analyze_text_complete(data: TextInputWithSentences) -> CompleteAnalysisResponse:
|
260 |
"""Complete analysis: classification, summarization, and text statistics."""
|
261 |
try:
|
262 |
+
model_name = data.model.value if data.model else None
|
263 |
+
|
264 |
+
classification_result = model_manager.predict(data.text, model_name)
|
265 |
summarization_result = summarizer.summarize(data.text, data.num_sentences)
|
266 |
text_stats = preprocessor.analyze_text(data.text)
|
267 |
+
|
268 |
return {
|
269 |
"original_text": data.text,
|
270 |
"text_analysis": text_stats,
|
271 |
"classification": classification_result,
|
272 |
+
"summarization": summarization_result,
|
273 |
}
|
274 |
except Exception as e:
|
275 |
+
raise HTTPException(
|
276 |
+
status_code=500, detail=f"Complete analysis failed: {str(e)}"
|
277 |
+
)
|
278 |
|
279 |
|
280 |
@app.post("/preprocess", response_model=PreprocessingResponse)
|
|
|
282 |
"""Preprocess text with step-by-step breakdown."""
|
283 |
try:
|
284 |
steps = preprocessor.get_preprocessing_steps(data.text, data.task_type.value)
|
285 |
+
return {"task_type": data.task_type.value, "preprocessing_steps": steps}
|
|
|
|
|
|
|
286 |
except Exception as e:
|
287 |
raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
|
288 |
|
|
|
292 |
"""Analyze text characteristics and statistics."""
|
293 |
try:
|
294 |
analysis = preprocessor.analyze_text(data.text)
|
295 |
+
return {"text": data.text, "analysis": analysis}
|
|
|
|
|
|
|
296 |
except Exception as e:
|
297 |
raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
|
298 |
|
299 |
|
300 |
@app.get("/model-info", response_model=ModelInfoResponse)
|
301 |
+
def get_model_info(model: Optional[ModelType] = None) -> ModelInfoResponse:
|
302 |
+
"""Get information about a specific model or the default model."""
|
303 |
try:
|
304 |
+
model_name = model.value if model else None
|
305 |
+
classifier_info = model_manager.get_model_info(model_name)
|
306 |
return {
|
307 |
"classifier": classifier_info,
|
308 |
"summarizer": {
|
309 |
+
"vectorizer_loaded": hasattr(summarizer, "vectorizer"),
|
310 |
+
"model_type": "TF-IDF based summarization",
|
311 |
+
},
|
312 |
}
|
313 |
except Exception as e:
|
314 |
+
raise HTTPException(
|
315 |
+
status_code=500, detail=f"Failed to get model info: {str(e)}"
|
316 |
+
)
|
317 |
+
|
318 |
+
|
319 |
+
@app.get("/models", response_model=AvailableModelsResponse)
|
320 |
+
def get_available_models() -> AvailableModelsResponse:
|
321 |
+
"""Get all available classification models."""
|
322 |
+
try:
|
323 |
+
models = model_manager.get_available_models()
|
324 |
+
return {"models": models, "current_model": model_manager.default_model}
|
325 |
+
except Exception as e:
|
326 |
+
raise HTTPException(
|
327 |
+
status_code=500, detail=f"Failed to get available models: {str(e)}"
|
328 |
+
)
|
329 |
+
|
330 |
+
|
331 |
+
@app.get("/models/cache")
|
332 |
+
def get_cache_status() -> Dict[str, Any]:
|
333 |
+
"""Get information about cached models."""
|
334 |
+
try:
|
335 |
+
return model_manager.get_cache_status()
|
336 |
+
except Exception as e:
|
337 |
+
raise HTTPException(
|
338 |
+
status_code=500, detail=f"Failed to get cache status: {str(e)}"
|
339 |
+
)
|
340 |
+
|
341 |
+
|
342 |
+
@app.post("/models/cache/clear")
|
343 |
+
def clear_model_cache(model: Optional[ModelType] = None) -> Dict[str, Any]:
|
344 |
+
"""Clear model cache for a specific model or all models."""
|
345 |
+
try:
|
346 |
+
model_name = model.value if model else None
|
347 |
+
return model_manager.clear_cache(model_name)
|
348 |
+
except Exception as e:
|
349 |
+
raise HTTPException(
|
350 |
+
status_code=500, detail=f"Failed to clear cache: {str(e)}"
|
351 |
+
)
|
examples.py
CHANGED
@@ -1,10 +1,19 @@
|
|
|
|
1 |
"""API request and response examples for documentation."""
|
2 |
|
3 |
EXAMPLE_TEXT = "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم. ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات. لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى."
|
4 |
|
|
|
|
|
|
|
|
|
5 |
REQUEST_EXAMPLES = {
|
6 |
"text_input": {"text": EXAMPLE_TEXT},
|
|
|
|
|
|
|
7 |
"text_input_with_sentences": {"text": EXAMPLE_TEXT, "num_sentences": 2},
|
|
|
8 |
"batch_text_input": {
|
9 |
"texts": [
|
10 |
EXAMPLE_TEXT,
|
@@ -12,30 +21,38 @@ REQUEST_EXAMPLES = {
|
|
12 |
"المطاعم في المدينة تقدم أطباق شهية ومتنوعة.",
|
13 |
]
|
14 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
"preprocessing_input": {"text": EXAMPLE_TEXT, "task_type": "classification"},
|
16 |
}
|
17 |
|
18 |
RESPONSE_EXAMPLES = {
|
19 |
"classification": {
|
20 |
-
"prediction": "
|
21 |
-
"prediction_index":
|
22 |
-
"confidence": 0.
|
23 |
"probability_distribution": {
|
24 |
-
"culture": 0.
|
25 |
-
"economy": 0.
|
26 |
-
"international": 0.
|
27 |
-
"local": 0.
|
28 |
-
"religion": 0.
|
29 |
-
"sports": 0.
|
30 |
},
|
31 |
-
"cleaned_text": "
|
32 |
-
"model_used": "
|
33 |
"prediction_metadata": {
|
34 |
-
"max_probability": 0.
|
35 |
-
"min_probability": 0.
|
36 |
-
"entropy": 0.
|
37 |
-
"num_classes": 6
|
38 |
-
}
|
39 |
},
|
40 |
"summarization": {
|
41 |
"summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
@@ -45,116 +62,163 @@ RESPONSE_EXAMPLES = {
|
|
45 |
"يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
|
46 |
"ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
|
47 |
"لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
48 |
-
"في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى"
|
49 |
],
|
50 |
"selected_indices": [1, 2],
|
51 |
-
"sentence_scores": [2.
|
52 |
-
"top_sentence_scores": [3.
|
53 |
},
|
54 |
"text_analysis": {
|
55 |
"text": EXAMPLE_TEXT,
|
56 |
"analysis": {
|
57 |
-
"character_count":
|
58 |
-
"word_count":
|
59 |
"sentence_count": 4,
|
60 |
-
"arabic_character_count":
|
61 |
-
"arabic_character_ratio": 0.
|
62 |
-
"average_word_length":
|
63 |
-
"average_sentence_length":
|
64 |
"has_diacritics": False,
|
65 |
-
"punctuation_count":
|
66 |
-
}
|
67 |
},
|
68 |
"batch_classification": {
|
69 |
"results": [
|
70 |
{
|
71 |
-
"prediction": "
|
72 |
-
"prediction_index":
|
73 |
-
"confidence": 0.
|
74 |
"probability_distribution": {
|
75 |
-
"culture": 0.
|
76 |
-
"economy": 0.
|
77 |
-
"international": 0.
|
78 |
-
"local": 0.
|
79 |
-
"religion": 0.
|
80 |
-
"sports": 0.
|
81 |
},
|
82 |
-
"cleaned_text": "
|
83 |
-
"model_used": "
|
84 |
"prediction_metadata": {
|
85 |
-
"max_probability": 0.
|
86 |
-
"min_probability": 0.
|
87 |
-
"entropy": 0.
|
88 |
-
"num_classes": 6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
}
|
91 |
],
|
92 |
"total_texts": 3,
|
93 |
-
"model_used": "
|
94 |
},
|
95 |
"sentence_analysis": {
|
96 |
"sentences": [
|
97 |
{
|
98 |
"index": 0,
|
99 |
"sentence": "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
|
100 |
-
"score": 2.
|
101 |
-
"rank": 3
|
102 |
},
|
103 |
{
|
104 |
"index": 1,
|
105 |
"sentence": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
|
106 |
-
"score": 3.
|
107 |
-
"rank": 2
|
108 |
},
|
109 |
{
|
110 |
"index": 2,
|
111 |
"sentence": "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
112 |
-
"score": 3.
|
113 |
-
"rank": 1
|
114 |
},
|
115 |
{
|
116 |
"index": 3,
|
117 |
"sentence": "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
|
118 |
-
"score": 2.
|
119 |
-
"rank": 4
|
120 |
-
}
|
121 |
],
|
122 |
"total_sentences": 4,
|
123 |
-
"score_statistics": {
|
|
|
|
|
|
|
|
|
|
|
124 |
},
|
125 |
"complete_analysis": {
|
126 |
"original_text": EXAMPLE_TEXT,
|
127 |
"text_analysis": {
|
128 |
-
"character_count":
|
129 |
-
"word_count":
|
130 |
"sentence_count": 4,
|
131 |
-
"arabic_character_count":
|
132 |
-
"arabic_character_ratio": 0.
|
133 |
-
"average_word_length":
|
134 |
-
"average_sentence_length":
|
135 |
"has_diacritics": False,
|
136 |
-
"punctuation_count":
|
137 |
},
|
138 |
"classification": {
|
139 |
-
"prediction": "
|
140 |
-
"prediction_index":
|
141 |
-
"confidence": 0.
|
142 |
"probability_distribution": {
|
143 |
-
"culture": 0.
|
144 |
-
"economy": 0.
|
145 |
-
"international": 0.
|
146 |
-
"local": 0.
|
147 |
-
"religion": 0.
|
148 |
-
"sports": 0.
|
149 |
},
|
150 |
-
"cleaned_text": "
|
151 |
-
"model_used": "
|
152 |
"prediction_metadata": {
|
153 |
-
"max_probability": 0.
|
154 |
-
"min_probability": 0.
|
155 |
-
"entropy": 0.
|
156 |
-
"num_classes": 6
|
157 |
-
}
|
158 |
},
|
159 |
"summarization": {
|
160 |
"summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
@@ -164,51 +228,86 @@ RESPONSE_EXAMPLES = {
|
|
164 |
"يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
|
165 |
"ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
|
166 |
"لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
167 |
-
"في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى"
|
168 |
],
|
169 |
"selected_indices": [1, 2],
|
170 |
-
"sentence_scores": [2.
|
171 |
-
"top_sentence_scores": [3.
|
172 |
-
}
|
173 |
},
|
174 |
"preprocessing": {
|
175 |
"task_type": "classification",
|
176 |
"preprocessing_steps": {
|
177 |
-
"
|
178 |
-
"
|
179 |
-
"
|
180 |
-
"
|
181 |
-
"
|
182 |
-
"
|
183 |
-
"
|
|
|
184 |
"preprocessing_summary": {
|
185 |
-
"original_length":
|
186 |
-
"final_length":
|
187 |
-
"reduction_percentage":
|
188 |
-
"words_removed":
|
189 |
-
"words_remaining":
|
190 |
-
}
|
191 |
-
}
|
192 |
},
|
193 |
"model_info": {
|
194 |
"classifier": {
|
195 |
-
"model_name": "
|
196 |
-
"
|
197 |
-
"model_loaded": True,
|
198 |
-
"classes": [
|
199 |
-
"culture",
|
200 |
-
"economy",
|
201 |
-
"international",
|
202 |
-
"local",
|
203 |
-
"religion",
|
204 |
-
"sports",
|
205 |
-
],
|
206 |
"num_classes": 6,
|
207 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
},
|
209 |
"summarizer": {
|
210 |
"vectorizer_loaded": True,
|
211 |
-
"model_type": "TF-IDF based summarization"
|
212 |
-
}
|
213 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
}
|
|
|
"""API request and response examples for documentation."""

# Canonical Arabic sample paragraph (about seasonal fruit/vegetable prices)
# reused across all request and response examples.
EXAMPLE_TEXT = "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم. ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات. لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى."

# Note: For classification endpoints, the 'model' field is optional.
# Available models: traditional_svm, modern_bert, modern_lstm
# If not specified, defaults to traditional_svm

# Sample request bodies shown in the OpenAPI docs, keyed by endpoint shape.
REQUEST_EXAMPLES = {
    "text_input": {"text": EXAMPLE_TEXT},
    "text_input_with_model": {"text": EXAMPLE_TEXT, "model": "traditional_svm"},
    "text_input_modern_bert": {"text": EXAMPLE_TEXT, "model": "modern_bert"},
    "text_input_modern_lstm": {"text": EXAMPLE_TEXT, "model": "modern_lstm"},
    "text_input_with_sentences": {"text": EXAMPLE_TEXT, "num_sentences": 2},
    "text_input_with_sentences_and_model": {"text": EXAMPLE_TEXT, "num_sentences": 2, "model": "modern_bert"},
    "batch_text_input": {
        "texts": [
            EXAMPLE_TEXT,
            # NOTE(review): this entry was lost in the diff rendering; restored
            # from the batch_text_input_with_model variant below — confirm against VCS.
            "هذا نص تجريبي آخر للتصنيف باللغة العربية.",
            "المطاعم في المدينة تقدم أطباق شهية ومتنوعة.",
        ]
    },
    "batch_text_input_with_model": {
        "texts": [
            EXAMPLE_TEXT,
            "هذا نص تجريبي آخر للتصنيف باللغة العربية.",
            "المطاعم في المدينة تقدم أطباق شهية ومتنوعة.",
        ],
        "model": "modern_lstm"
    },
    "preprocessing_input": {"text": EXAMPLE_TEXT, "task_type": "classification"},
}
|
34 |
|
35 |
RESPONSE_EXAMPLES = {
|
36 |
"classification": {
|
37 |
+
"prediction": "economy",
|
38 |
+
"prediction_index": 1,
|
39 |
+
"confidence": 0.851430067618738,
|
40 |
"probability_distribution": {
|
41 |
+
"culture": 0.03446028829161909,
|
42 |
+
"economy": 0.851430067618738,
|
43 |
+
"international": 0.005894582125851457,
|
44 |
+
"local": 0.07395284130576442,
|
45 |
+
"religion": 0.006103062287269104,
|
46 |
+
"sports": 0.028159158370758375
|
47 |
},
|
48 |
+
"cleaned_text": "يكون سعر فكه خضراو وسم بات اقل غير وسم تلج محل خضرو الي عرض فكه سعر نسب وسم بسب وفر نتج قصر امر علي سعر اقل سيك طعم اشه ولذ نول وسم فصل خرف وفر تفح تين نجر كمثر",
|
49 |
+
"model_used": "traditional_svm_classifier",
|
50 |
"prediction_metadata": {
|
51 |
+
"max_probability": 0.851430067618738,
|
52 |
+
"min_probability": 0.005894582125851457,
|
53 |
+
"entropy": 0.607505444397693,
|
54 |
+
"num_classes": 6
|
55 |
+
}
|
56 |
},
|
57 |
"summarization": {
|
58 |
"summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
|
|
62 |
"يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
|
63 |
"ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
|
64 |
"لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
65 |
+
"في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى"
|
66 |
],
|
67 |
"selected_indices": [1, 2],
|
68 |
+
"sentence_scores": [2.9675719816913095, 3.2243029195650847, 3.2339935441416645, 2.6416207226871506],
|
69 |
+
"top_sentence_scores": [3.2243029195650847, 3.2339935441416645]
|
70 |
},
|
71 |
"text_analysis": {
|
72 |
"text": EXAMPLE_TEXT,
|
73 |
"analysis": {
|
74 |
+
"character_count": 278,
|
75 |
+
"word_count": 48,
|
76 |
"sentence_count": 4,
|
77 |
+
"arabic_character_count": 227,
|
78 |
+
"arabic_character_ratio": 0.8165467625899281,
|
79 |
+
"average_word_length": 4.8125,
|
80 |
+
"average_sentence_length": 12.0,
|
81 |
"has_diacritics": False,
|
82 |
+
"punctuation_count": 5
|
83 |
+
}
|
84 |
},
|
85 |
"batch_classification": {
|
86 |
"results": [
|
87 |
{
|
88 |
+
"prediction": "economy",
|
89 |
+
"prediction_index": 1,
|
90 |
+
"confidence": 0.851430067618738,
|
91 |
"probability_distribution": {
|
92 |
+
"culture": 0.03446028829161909,
|
93 |
+
"economy": 0.851430067618738,
|
94 |
+
"international": 0.005894582125851457,
|
95 |
+
"local": 0.07395284130576442,
|
96 |
+
"religion": 0.006103062287269104,
|
97 |
+
"sports": 0.028159158370758375
|
98 |
},
|
99 |
+
"cleaned_text": "يكون سعر فكه خضراو وسم بات اقل غير وسم تلج محل خضرو الي عرض فكه سعر نسب وسم بسب وفر نتج قصر امر علي سعر اقل سيك طعم اشه ولذ نول وسم فصل خرف وفر تفح تين نجر كمثر",
|
100 |
+
"model_used": "traditional_svm_classifier",
|
101 |
"prediction_metadata": {
|
102 |
+
"max_probability": 0.851430067618738,
|
103 |
+
"min_probability": 0.005894582125851457,
|
104 |
+
"entropy": 0.607505444397693,
|
105 |
+
"num_classes": 6
|
106 |
+
}
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"prediction": "economy",
|
110 |
+
"prediction_index": 1,
|
111 |
+
"confidence": 0.594920291892315,
|
112 |
+
"probability_distribution": {
|
113 |
+
"culture": 0.2745678397668158,
|
114 |
+
"economy": 0.594920291892315,
|
115 |
+
"international": 0.026763844094989445,
|
116 |
+
"local": 0.059508468104556384,
|
117 |
+
"religion": 0.010869503926651195,
|
118 |
+
"sports": 0.033370152114672154
|
119 |
+
},
|
120 |
+
"cleaned_text": "نص تجرب اخر تصن لغ عرب",
|
121 |
+
"model_used": "traditional_svm_classifier",
|
122 |
+
"prediction_metadata": {
|
123 |
+
"max_probability": 0.594920291892315,
|
124 |
+
"min_probability": 0.010869503926651195,
|
125 |
+
"entropy": 1.2244037883788486,
|
126 |
+
"num_classes": 6
|
127 |
+
}
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"prediction": "local",
|
131 |
+
"prediction_index": 3,
|
132 |
+
"confidence": 0.8469012893956159,
|
133 |
+
"probability_distribution": {
|
134 |
+
"culture": 0.03610036096593097,
|
135 |
+
"economy": 0.02966970458779607,
|
136 |
+
"international": 0.00998550970524301,
|
137 |
+
"local": 0.8469012893956159,
|
138 |
+
"religion": 0.025074648364464147,
|
139 |
+
"sports": 0.05237313697694993
|
140 |
},
|
141 |
+
"cleaned_text": "طعم دين قدم طب شه تنع",
|
142 |
+
"model_used": "traditional_svm_classifier",
|
143 |
+
"prediction_metadata": {
|
144 |
+
"max_probability": 0.8469012893956159,
|
145 |
+
"min_probability": 0.00998550970524301,
|
146 |
+
"entropy": 0.5892773509761756,
|
147 |
+
"num_classes": 6
|
148 |
+
}
|
149 |
}
|
150 |
],
|
151 |
"total_texts": 3,
|
152 |
+
"model_used": "traditional_svm"
|
153 |
},
|
154 |
"sentence_analysis": {
|
155 |
"sentences": [
|
156 |
{
|
157 |
"index": 0,
|
158 |
"sentence": "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
|
159 |
+
"score": 2.9675719816913095,
|
160 |
+
"rank": 3
|
161 |
},
|
162 |
{
|
163 |
"index": 1,
|
164 |
"sentence": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
|
165 |
+
"score": 3.2243029195650847,
|
166 |
+
"rank": 2
|
167 |
},
|
168 |
{
|
169 |
"index": 2,
|
170 |
"sentence": "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
171 |
+
"score": 3.2339935441416645,
|
172 |
+
"rank": 1
|
173 |
},
|
174 |
{
|
175 |
"index": 3,
|
176 |
"sentence": "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
|
177 |
+
"score": 2.6416207226871506,
|
178 |
+
"rank": 4
|
179 |
+
}
|
180 |
],
|
181 |
"total_sentences": 4,
|
182 |
+
"score_statistics": {
|
183 |
+
"mean": 3.0168722920213025,
|
184 |
+
"std": 0.24156436215421892,
|
185 |
+
"min": 2.6416207226871506,
|
186 |
+
"max": 3.2339935441416645
|
187 |
+
}
|
188 |
},
|
189 |
"complete_analysis": {
|
190 |
"original_text": EXAMPLE_TEXT,
|
191 |
"text_analysis": {
|
192 |
+
"character_count": 278,
|
193 |
+
"word_count": 48,
|
194 |
"sentence_count": 4,
|
195 |
+
"arabic_character_count": 227,
|
196 |
+
"arabic_character_ratio": 0.8165467625899281,
|
197 |
+
"average_word_length": 4.8125,
|
198 |
+
"average_sentence_length": 12.0,
|
199 |
"has_diacritics": False,
|
200 |
+
"punctuation_count": 5
|
201 |
},
|
202 |
"classification": {
|
203 |
+
"prediction": "economy",
|
204 |
+
"prediction_index": 1,
|
205 |
+
"confidence": 0.851430067618738,
|
206 |
"probability_distribution": {
|
207 |
+
"culture": 0.03446028829161909,
|
208 |
+
"economy": 0.851430067618738,
|
209 |
+
"international": 0.005894582125851457,
|
210 |
+
"local": 0.07395284130576442,
|
211 |
+
"religion": 0.006103062287269104,
|
212 |
+
"sports": 0.028159158370758375
|
213 |
},
|
214 |
+
"cleaned_text": "يكون سعر فكه خضراو وسم بات اقل غير وسم تلج محل خضرو الي عرض فكه سعر نسب وسم بسب وفر نتج قصر امر علي سعر اقل سيك طعم اشه ولذ نول وسم فصل خرف وفر تفح تين نجر كمثر",
|
215 |
+
"model_used": "traditional_svm_classifier",
|
216 |
"prediction_metadata": {
|
217 |
+
"max_probability": 0.851430067618738,
|
218 |
+
"min_probability": 0.005894582125851457,
|
219 |
+
"entropy": 0.607505444397693,
|
220 |
+
"num_classes": 6
|
221 |
+
}
|
222 |
},
|
223 |
"summarization": {
|
224 |
"summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
|
|
228 |
"يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
|
229 |
"ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
|
230 |
"لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
|
231 |
+
"في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى"
|
232 |
],
|
233 |
"selected_indices": [1, 2],
|
234 |
+
"sentence_scores": [2.9675719816913095, 3.2243029195650847, 3.2339935441416645, 2.6416207226871506],
|
235 |
+
"top_sentence_scores": [3.2243029195650847, 3.2339935441416645]
|
236 |
+
}
|
237 |
},
|
238 |
"preprocessing": {
|
239 |
"task_type": "classification",
|
240 |
"preprocessing_steps": {
|
241 |
+
"original": EXAMPLE_TEXT,
|
242 |
+
"stripped_lowered": EXAMPLE_TEXT,
|
243 |
+
"normalized": "يكون سعر الفاكهه والخضراوات في موسم انباتها اقل من غيره من المواسم. ستلجا محلات الخضروات الي عرض الفاكهه باسعار مناسبه في موسمها بسبب توفر المنتجات. لا يقتصر الامر علي السعر الاقل، بل سيكون طعامك اشهي والذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري.",
|
244 |
+
"diacritics_removed": "يكون سعر الفاكهه والخضراوات في موسم انباتها اقل من غيره من المواسم. ستلجا محلات الخضروات الي عرض الفاكهه باسعار مناسبه في موسمها بسبب توفر المنتجات. لا يقتصر الامر علي السعر الاقل، بل سيكون طعامك اشهي والذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري.",
|
245 |
+
"punctuation_removed": "يكون سعر الفاكهه والخضراوات في موسم انباتها اقل من غيره من المواسم ستلجا محلات الخضروات الي عرض الفاكهه باسعار مناسبه في موسمها بسبب توفر المنتجات لا يقتصر الامر علي السعر الاقل بل سيكون طعامك اشهي والذ عند تناوله في موسمه في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري ",
|
246 |
+
"repeated_chars_reduced": "يكون سعر الفاكه والخضراوات في موسم انباتها اقل من غيره من المواسم ستلجا محلات الخضروات الي عرض الفاكه باسعار مناسبه في موسمها بسب توفر المنتجات لا يقتصر الامر علي السعر الاقل بل سيكون طعامك اشهي والذ عند تناوله في موسمه في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري",
|
247 |
+
"whitespace_normalized": "يكون سعر الفاكه والخضراوات في موسم انباتها اقل من غيره من المواسم ستلجا محلات الخضروات الي عرض الفاكه باسعار مناسبه في موسمها بسب توفر المنتجات لا يقتصر الامر علي السعر الاقل بل سيكون طعامك اشهي والذ عند تناوله في موسمه في فصل الخريف يتوفر التفاح والتين والبنجر والكمثري",
|
248 |
+
"final_result": "يكون سعر فكه خضراو وسم بات اقل غير وسم تلج محل خضرو الي عرض فكه سعر نسب وسم بسب وفر نتج قصر امر علي سعر اقل سيك طعم اشه ولذ نول وسم فصل خرف وفر تفح تين نجر كمثر",
|
249 |
"preprocessing_summary": {
|
250 |
+
"original_length": 278,
|
251 |
+
"final_length": 165,
|
252 |
+
"reduction_percentage": 40.6,
|
253 |
+
"words_removed": 25,
|
254 |
+
"words_remaining": 23
|
255 |
+
}
|
256 |
+
}
|
257 |
},
|
258 |
"model_info": {
|
259 |
"classifier": {
|
260 |
+
"model_name": "traditional_svm_classifier",
|
261 |
+
"model_type": "SVC",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
"num_classes": 6,
|
263 |
+
"classes": ["culture", "economy", "international", "local", "religion", "sports"],
|
264 |
+
"has_predict_proba": True,
|
265 |
+
"has_vectorizer": True,
|
266 |
+
"vectorizer_type": "TfidfVectorizer",
|
267 |
+
"model_manager": {
|
268 |
+
"model_name": "traditional_svm",
|
269 |
+
"model_description": "Traditional SVM classifier with TF-IDF vectorization",
|
270 |
+
"model_config": {
|
271 |
+
"type": "traditional",
|
272 |
+
"classifier_path": "traditional_svm_classifier.joblib",
|
273 |
+
"vectorizer_path": "traditional_tfidf_vectorizer_classifier.joblib",
|
274 |
+
"description": "Traditional SVM classifier with TF-IDF vectorization"
|
275 |
+
},
|
276 |
+
"is_cached": True
|
277 |
+
}
|
278 |
},
|
279 |
"summarizer": {
|
280 |
"vectorizer_loaded": True,
|
281 |
+
"model_type": "TF-IDF based summarization"
|
282 |
+
}
|
283 |
},
|
284 |
+
"available_models": {
|
285 |
+
"models": {
|
286 |
+
"traditional_svm": {
|
287 |
+
"description": "Traditional SVM classifier with TF-IDF vectorization",
|
288 |
+
"type": "traditional",
|
289 |
+
"available": True,
|
290 |
+
"missing_files": [],
|
291 |
+
"is_default": True,
|
292 |
+
"is_cached": True
|
293 |
+
},
|
294 |
+
"modern_bert": {
|
295 |
+
"description": "Modern BERT-based transformer classifier",
|
296 |
+
"type": "modern",
|
297 |
+
"available": True,
|
298 |
+
"missing_files": [],
|
299 |
+
"is_default": False,
|
300 |
+
"is_cached": False
|
301 |
+
},
|
302 |
+
"modern_lstm": {
|
303 |
+
"description": "Modern LSTM-based neural network classifier",
|
304 |
+
"type": "modern",
|
305 |
+
"available": True,
|
306 |
+
"missing_files": [],
|
307 |
+
"is_default": False,
|
308 |
+
"is_cached": False
|
309 |
+
}
|
310 |
+
},
|
311 |
+
"current_model": "traditional_svm"
|
312 |
+
}
|
313 |
}
|
model_manager.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
from typing import Any, Dict, List, Optional

from traditional_classifier import TraditionalClassifier

try:
    from modern_classifier import ModernClassifier
    MODERN_MODELS_AVAILABLE = True
except ImportError:
    MODERN_MODELS_AVAILABLE = False
|
10 |
+
|
11 |
+
|
12 |
+
class ModelManager:
    """Manages Arabic text classification models with per-request model selection and caching.

    Models are loaded lazily on first use and kept in an in-memory cache so
    subsequent requests reuse the same instance.
    """

    # Registry of selectable models.  Keys are the public model names accepted
    # by the API; values describe how to construct each model.
    AVAILABLE_MODELS = {
        "traditional_svm": {
            "type": "traditional",
            "classifier_path": "traditional_svm_classifier.joblib",
            "vectorizer_path": "traditional_tfidf_vectorizer_classifier.joblib",
            "description": "Traditional SVM classifier with TF-IDF vectorization"
        },

        "modern_bert": {
            "type": "modern",
            "model_type": "bert",
            "model_path": "modern_bert_classifier.safetensors",
            "config_path": "config.json",
            "description": "Modern BERT-based transformer classifier"
        },

        "modern_lstm": {
            "type": "modern",
            "model_type": "lstm",
            "model_path": "modern_lstm_classifier.pth",
            "description": "Modern LSTM-based neural network classifier"
        }
    }

    def __init__(self, default_model: str = "traditional_svm"):
        """Create a manager whose *default_model* is used when callers pass no model name."""
        self.default_model = default_model
        self._model_cache: Dict[str, Any] = {}  # model name -> loaded model instance

    def _get_model(self, model_name: str):
        """Return a loaded model instance, creating and caching it on first use.

        Raises:
            ValueError: *model_name* is not in the registry, or the registry
                entry has an unknown ``type``.
            FileNotFoundError: a required model artifact is missing on disk.
            ImportError: a modern model was requested but PyTorch/transformers
                could not be imported.
        """
        if model_name not in self.AVAILABLE_MODELS:
            raise ValueError(
                f"Model '{model_name}' not available. "
                f"Available models: {list(self.AVAILABLE_MODELS.keys())}"
            )

        if model_name in self._model_cache:
            return self._model_cache[model_name]

        model_config = self.AVAILABLE_MODELS[model_name]

        if model_config["type"] == "traditional":
            classifier_path = model_config["classifier_path"]
            vectorizer_path = model_config["vectorizer_path"]

            if not os.path.exists(classifier_path):
                raise FileNotFoundError(f"Classifier file not found: {classifier_path}")
            if not os.path.exists(vectorizer_path):
                raise FileNotFoundError(f"Vectorizer file not found: {vectorizer_path}")

            model = TraditionalClassifier(classifier_path, vectorizer_path)

        elif model_config["type"] == "modern":
            if not MODERN_MODELS_AVAILABLE:
                raise ImportError("Modern models require PyTorch and transformers")

            model_path = model_config["model_path"]

            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model file not found: {model_path}")

            # An optional transformer config file; fall back to defaults when absent.
            config_path = model_config.get("config_path")
            if config_path and not os.path.exists(config_path):
                config_path = None

            model = ModernClassifier(
                model_type=model_config["model_type"],
                model_path=model_path,
                config_path=config_path
            )
        else:
            # Defensive: a malformed registry entry would otherwise leave
            # `model` unbound and surface as a confusing NameError below.
            raise ValueError(f"Unknown model type: {model_config['type']}")

        self._model_cache[model_name] = model
        return model

    def predict(self, text: str, model_name: Optional[str] = None) -> Dict[str, Any]:
        """Predict using the specified model (or default if none specified).

        The underlying model's result dict is returned augmented with a
        ``model_manager`` section identifying which model produced it.
        """
        if model_name is None:
            model_name = self.default_model

        model = self._get_model(model_name)
        result = model.predict(text)

        result["model_manager"] = {
            "model_used": model_name,
            "model_description": self.AVAILABLE_MODELS[model_name]["description"]
        }
        return result

    def predict_batch(self, texts: List[str], model_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Predict batch using the specified model (or default if none specified)."""
        if model_name is None:
            model_name = self.default_model

        model = self._get_model(model_name)
        results = model.predict_batch(texts)

        # Tag every per-text result with the model that produced it.
        for result in results:
            result["model_manager"] = {
                "model_used": model_name,
                "model_description": self.AVAILABLE_MODELS[model_name]["description"]
            }
        return results

    def get_model_info(self, model_name: Optional[str] = None) -> Dict[str, Any]:
        """Get information about a specific model (or default if none specified).

        Note: this loads (and caches) the model if it is not already cached.
        """
        if model_name is None:
            model_name = self.default_model

        model = self._get_model(model_name)
        model_info = model.get_model_info()
        model_info.update({
            "model_manager": {
                "model_name": model_name,
                "model_description": self.AVAILABLE_MODELS[model_name]["description"],
                "model_config": self.AVAILABLE_MODELS[model_name],
                "is_cached": model_name in self._model_cache
            }
        })
        return model_info

    def get_available_models(self) -> Dict[str, Any]:
        """Describe every registered model, including on-disk availability of its artifacts."""
        available: Dict[str, Any] = {}
        for model_name, config in self.AVAILABLE_MODELS.items():
            # Collect whichever artifact files this model type requires.
            if config["type"] == "traditional":
                required = [config["classifier_path"], config["vectorizer_path"]]
            elif config["type"] == "modern":
                required = [config["model_path"]]
            else:
                required = []

            missing_files = [path for path in required if not os.path.exists(path)]
            files_exist = not missing_files

            available[model_name] = {
                "description": config["description"],
                "type": config["type"],
                "available": files_exist,
                "missing_files": missing_files,
                "is_default": model_name == self.default_model,
                "is_cached": model_name in self._model_cache
            }

        return available

    def clear_cache(self, model_name: Optional[str] = None) -> Dict[str, Any]:
        """Clear the model cache for one model, or for all models when no name is given."""
        if not model_name:
            cleared_count = len(self._model_cache)
            self._model_cache.clear()
            return {"message": f"Cache cleared for {cleared_count} models"}
        if model_name in self._model_cache:
            del self._model_cache[model_name]
            return {"message": f"Cache cleared for model: {model_name}"}
        return {"message": f"Model {model_name} was not cached"}

    def get_cache_status(self) -> Dict[str, Any]:
        """Get information about cached models."""
        return {
            "cached_models": list(self._model_cache.keys()),
            "cache_count": len(self._model_cache),
            "default_model": self.default_model
        }
|
modern_bert_classifier.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd36d704baf314e519a6abbcde8b6239ba3867dc4a44acc9c690701b863a3a7d
|
3 |
+
size 442511368
|
modern_classifier.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import numpy as np
|
4 |
+
from typing import List, Dict, Any, Optional
|
5 |
+
from preprocessor import preprocess_for_classification
|
6 |
+
import re
|
7 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
8 |
+
from safetensors.torch import load_file
|
9 |
+
from transformers import AutoConfig
|
10 |
+
|
11 |
+
|
12 |
+
class LSTMClassifier(nn.Module):
    """LSTM-based Arabic text classifier.

    Embeds token-index sequences, runs them through a (optionally
    bidirectional) multi-layer LSTM, and maps the final hidden state
    to class logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=2, bidirectional=False):
        super().__init__()
        # Attribute names (embedding/lstm/fc) are part of the checkpoint
        # state-dict contract — do not rename.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=0.3,
            bidirectional=self.bidirectional,
        )
        # Bidirectional runs concatenate forward and backward hidden states.
        self.fc = nn.Linear(hidden_dim * (2 if self.bidirectional else 1), output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits for a (batch, seq_len) tensor of token indices."""
        _, (hidden, _) = self.lstm(self.embedding(x))
        if self.bidirectional:
            # Last layer's forward (-2) and backward (-1) final states, concatenated.
            summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            summary = hidden[-1]
        return self.fc(self.dropout(summary))
|
43 |
+
|
44 |
+
|
45 |
+
class ModernClassifier:
|
46 |
+
"""Modern Arabic text classifier supporting BERT and LSTM models."""
|
47 |
+
|
48 |
+
def __init__(self, model_type: str, model_path: str, config_path: Optional[str] = None):
    """Build a classifier wrapper and eagerly load its weights.

    Args:
        model_type: 'bert' or 'lstm' (matched case-insensitively).
        model_path: path to the weights file (.safetensors for BERT,
            .pth checkpoint for LSTM).
        config_path: optional config path; stored but not read by the
            loaders visible here.

    Raises:
        ValueError: for an unsupported *model_type*.
    """
    self.model_type = model_type.lower()
    self.model_path = model_path
    self.config_path = config_path
    # Prefer GPU when available; inputs and weights are moved to this device.
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Fixed label set; assumed to match the training label order — TODO confirm.
    self.classes = np.array(['culture', 'economy', 'international', 'local', 'religion', 'sports'])

    if self.model_type == 'bert':
        self._load_bert_model()
    elif self.model_type == 'lstm':
        self._load_lstm_model()
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # NOTE(review): uses the raw (non-lowercased) model_type, so e.g. 'BERT'
    # yields model_name 'BERT_classifier' while self.model_type is 'bert' —
    # confirm this asymmetry is intended.
    self.model_name = f"{model_type}_classifier"
|
64 |
+
|
65 |
+
def _load_bert_model(self):
    """Load BERT model from safetensors.

    Downloads the AraBERT tokenizer/config from the Hub, rebuilds the
    architecture from config, and loads the local fine-tuned weights.
    Any failure is re-raised as RuntimeError.
    """
    try:
        # Tokenizer comes from the Hub; only the fine-tuned weights are local.
        self.tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv2')
        state_dict = load_file(self.model_path)
        # Infer the checkpoint's vocabulary size from its word-embedding matrix
        # so the freshly built model matches the saved shapes.
        # NOTE(review): next() raises StopIteration if no embedding key exists;
        # that would surface via the except below as a RuntimeError.
        embed_key = next(k for k in state_dict if 'embeddings.word_embeddings.weight' in k)
        checkpoint_vocab_size = state_dict[embed_key].shape[0]
        config = AutoConfig.from_pretrained(
            'aubmindlab/bert-base-arabertv2',
            num_labels=len(self.classes),
            vocab_size=checkpoint_vocab_size
        )
        self.model = AutoModelForSequenceClassification.from_config(config)
        self.model.resize_token_embeddings(checkpoint_vocab_size)
        # NOTE(review): strict=False silently tolerates missing/unexpected keys,
        # so a partially mismatched checkpoint would load without error.
        self.model.load_state_dict(state_dict, strict=False)
        self.model.to(self.device)
        self.model.eval()
    except Exception as e:
        raise RuntimeError(f"Error loading BERT model: {e}")
|
84 |
+
|
85 |
+
def _load_lstm_model(self):
    """Load LSTM model from a .pth checkpoint, inferring the architecture
    (vocab size, embedding/hidden dims, layer count, directionality) from
    the saved tensor shapes and key names.

    Raises:
        RuntimeError: wrapping any failure during load.
    """
    try:
        checkpoint = torch.load(self.model_path, map_location=self.device)
        # Checkpoints may be a raw state dict or a wrapper with metadata.
        state_dict = checkpoint.get('model_state_dict', checkpoint)
        vocab_size, embedding_dim = state_dict['embedding.weight'].shape
        # weight_hh_l0 has shape (4 * hidden_dim, hidden_dim): second axis
        # gives the hidden size.
        _, hidden_dim = state_dict['lstm.weight_hh_l0'].shape
        layer_nums = set(int(re.match(r'lstm\.weight_ih_l(\d+)', k).group(1))
                         for k in state_dict if re.match(r'lstm\.weight_ih_l(\d+)$', k))
        num_layers = len(layer_nums)
        # Detect directionality from the checkpoint instead of hardcoding it:
        # PyTorch stores extra '..._reverse' parameter tensors for
        # bidirectional LSTMs.  (Previously this was assumed True, which
        # combined with strict=False would silently half-load a
        # unidirectional checkpoint.)
        bidirectional = any(k.startswith('lstm.') and k.endswith('_reverse')
                            for k in state_dict)
        output_dim = len(self.classes)
        self.model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim,
                                    output_dim, num_layers=num_layers,
                                    bidirectional=bidirectional)
        self.model.load_state_dict(state_dict, strict=False)
        self.model.to(self.device)
        self.model.eval()
        # Token -> index mapping used at inference; empty dict triggers the
        # hashed fallback in _preprocess_text_for_lstm.
        self.vocab = checkpoint.get('vocab', {})
    except Exception as e:
        raise RuntimeError(f"Error loading LSTM model: {e}") from e
|
106 |
+
|
107 |
+
def _preprocess_text_for_bert(self, text: str) -> Dict[str, torch.Tensor]:
    """Clean *text* and tokenize it into device-resident BERT input tensors."""
    encoded = self.tokenizer(
        preprocess_for_classification(text),
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Move every tensor (input_ids, attention_mask, ...) to the model's device.
    return {name: tensor.to(self.device) for name, tensor in encoded.items()}
|
120 |
+
|
121 |
+
def _preprocess_text_for_lstm(self, text: str) -> torch.Tensor:
    """Clean, tokenize, index, and pad/truncate *text* into a fixed-length
    (1, 100) tensor of token indices for the LSTM."""
    tokens = preprocess_for_classification(text).split()

    vocab = getattr(self, 'vocab', None)
    if vocab:
        # Unknown tokens map to index 0 (also used for padding).
        indices = [vocab.get(tok, 0) for tok in tokens]
    else:
        # No vocabulary shipped with the checkpoint: hashed bucket fallback.
        indices = [hash(tok) % 10000 for tok in tokens]

    max_length = 100
    # Truncate to max_length, then right-pad with zeros up to max_length.
    fixed = indices[:max_length] + [0] * max(0, max_length - len(indices))

    return torch.tensor([fixed], dtype=torch.long).to(self.device)
|
139 |
+
|
140 |
+
def predict(self, text: str) -> Dict[str, Any]:
    """Predict class with full probability distribution and metadata.

    Returns a dict with the predicted label, its index and confidence,
    per-class probabilities, the cleaned input text, and prediction
    metadata (probability extremes, entropy, model type, device).
    """
    # Echoed back in the result; the BERT path re-cleans internally from `text`.
    cleaned_text = preprocess_for_classification(text)

    with torch.no_grad():
        if self.model_type == 'bert':
            inputs = self._preprocess_text_for_bert(text)
            outputs = self.model(**inputs)
            logits = outputs.logits
        elif self.model_type == 'lstm':
            inputs = self._preprocess_text_for_lstm(text)
            logits = self.model(inputs)
        # __init__ guarantees model_type is 'bert' or 'lstm', so `logits`
        # is always bound here.

        # Single-item batch: take row 0 of the softmax output.
        probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    prediction_index = int(np.argmax(probabilities))
    prediction = self.classes[prediction_index]
    confidence = float(probabilities[prediction_index])

    # Per-class probabilities, keyed by label name (numpy scalars -> floats).
    prob_distribution = {}
    for i, class_label in enumerate(self.classes):
        prob_distribution[str(class_label)] = float(probabilities[i])

    return {
        "prediction": str(prediction),
        "prediction_index": prediction_index,
        "confidence": confidence,
        "probability_distribution": prob_distribution,
        "cleaned_text": cleaned_text,
        "model_used": self.model_name,
        "prediction_metadata": {
            "max_probability": float(np.max(probabilities)),
            "min_probability": float(np.min(probabilities)),
            # Shannon entropy (natural log); epsilon guards log(0).
            "entropy": float(-np.sum(probabilities * np.log(probabilities + 1e-10))),
            "num_classes": len(probabilities),
            "model_type": self.model_type,
            "device": str(self.device)
        },
    }
|
179 |
+
|
180 |
+
def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
|
181 |
+
"""Predict classes for multiple texts using true batch processing."""
|
182 |
+
if not texts:
|
183 |
+
return []
|
184 |
+
|
185 |
+
cleaned_texts = [preprocess_for_classification(text) for text in texts]
|
186 |
+
|
187 |
+
with torch.no_grad():
|
188 |
+
if self.model_type == 'bert':
|
189 |
+
inputs = self.tokenizer(
|
190 |
+
cleaned_texts,
|
191 |
+
return_tensors='pt',
|
192 |
+
truncation=True,
|
193 |
+
padding=True,
|
194 |
+
max_length=512
|
195 |
+
)
|
196 |
+
inputs = {key: value.to(self.device) for key, value in inputs.items()}
|
197 |
+
outputs = self.model(**inputs)
|
198 |
+
logits = outputs.logits
|
199 |
+
|
200 |
+
elif self.model_type == 'lstm':
|
201 |
+
batch_indices = []
|
202 |
+
max_length = 100
|
203 |
+
|
204 |
+
for cleaned_text in cleaned_texts:
|
205 |
+
tokens = cleaned_text.split()
|
206 |
+
if hasattr(self, 'vocab') and self.vocab:
|
207 |
+
indices = [self.vocab.get(token, 0) for token in tokens]
|
208 |
+
else:
|
209 |
+
indices = [hash(token) % 10000 for token in tokens]
|
210 |
+
|
211 |
+
if len(indices) > max_length:
|
212 |
+
indices = indices[:max_length]
|
213 |
+
else:
|
214 |
+
indices.extend([0] * (max_length - len(indices)))
|
215 |
+
|
216 |
+
batch_indices.append(indices)
|
217 |
+
|
218 |
+
batch_tensor = torch.tensor(batch_indices, dtype=torch.long).to(self.device)
|
219 |
+
logits = self.model(batch_tensor)
|
220 |
+
|
221 |
+
probabilities = torch.softmax(logits, dim=-1).cpu().numpy()
|
222 |
+
|
223 |
+
results = []
|
224 |
+
for i, (text, cleaned_text) in enumerate(zip(texts, cleaned_texts)):
|
225 |
+
probs = probabilities[i]
|
226 |
+
prediction_index = int(np.argmax(probs))
|
227 |
+
prediction = self.classes[prediction_index]
|
228 |
+
confidence = float(probs[prediction_index])
|
229 |
+
|
230 |
+
prob_distribution = {}
|
231 |
+
for j, class_label in enumerate(self.classes):
|
232 |
+
prob_distribution[str(class_label)] = float(probs[j])
|
233 |
+
|
234 |
+
result = {
|
235 |
+
"prediction": str(prediction),
|
236 |
+
"prediction_index": prediction_index,
|
237 |
+
"confidence": confidence,
|
238 |
+
"probability_distribution": prob_distribution,
|
239 |
+
"cleaned_text": cleaned_text,
|
240 |
+
"model_used": self.model_name,
|
241 |
+
"prediction_metadata": {
|
242 |
+
"max_probability": float(np.max(probs)),
|
243 |
+
"min_probability": float(np.min(probs)),
|
244 |
+
"entropy": float(-np.sum(probs * np.log(probs + 1e-10))),
|
245 |
+
"num_classes": len(probs),
|
246 |
+
"model_type": self.model_type,
|
247 |
+
"device": str(self.device)
|
248 |
+
},
|
249 |
+
}
|
250 |
+
results.append(result)
|
251 |
+
|
252 |
+
return results
|
253 |
+
|
254 |
+
def get_model_info(self) -> Dict[str, Any]:
|
255 |
+
"""Get model information and capabilities."""
|
256 |
+
return {
|
257 |
+
"model_name": self.model_name,
|
258 |
+
"model_type": self.model_type,
|
259 |
+
"model_path": self.model_path,
|
260 |
+
"num_classes": len(self.classes),
|
261 |
+
"classes": self.classes.tolist(),
|
262 |
+
"device": str(self.device),
|
263 |
+
"has_predict_proba": True,
|
264 |
+
"framework": "pytorch",
|
265 |
+
"modern_model": True
|
266 |
+
}
|
modern_lstm_classifier.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6bc096c6cdc0523b378529139c5bcbb9dbcbcd9351ba6babc272a90b1d78bd4f
|
3 |
+
size 5921296
|
requirements.txt
CHANGED
@@ -4,3 +4,6 @@ scikit-learn
|
|
4 |
nltk
|
5 |
joblib
|
6 |
numpy
|
|
|
|
|
|
|
|
4 |
nltk
|
5 |
joblib
|
6 |
numpy
|
7 |
+
torch
|
8 |
+
transformers
|
9 |
+
safetensors
|
summarizer.py
CHANGED
@@ -8,7 +8,7 @@ from preprocessor import preprocess_for_summarization
|
|
8 |
class ArabicSummarizer:
|
9 |
"""Arabic text summarizer using TF-IDF scoring."""
|
10 |
|
11 |
-
def __init__(self, vectorizer_path: str = "
|
12 |
self.vectorizer = joblib.load(vectorizer_path)
|
13 |
|
14 |
def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
|
|
|
8 |
class ArabicSummarizer:
|
9 |
"""Arabic text summarizer using TF-IDF scoring."""
|
10 |
|
11 |
+
def __init__(self, vectorizer_path: str = "traditional_tfidf_vectorizer_summarization.joblib"):
|
12 |
self.vectorizer = joblib.load(vectorizer_path)
|
13 |
|
14 |
def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
|
classifier.py → traditional_classifier.py
RENAMED
@@ -1,16 +1,16 @@
|
|
1 |
-
import joblib
|
2 |
import numpy as np
|
|
|
3 |
from typing import List, Dict, Any
|
4 |
from preprocessor import preprocess_for_classification
|
5 |
|
6 |
|
7 |
-
class
|
8 |
-
"""
|
9 |
|
10 |
def __init__(
|
11 |
self,
|
12 |
-
classifier_path: str = "
|
13 |
-
vectorizer_path: str = "
|
14 |
):
|
15 |
self.model = joblib.load(classifier_path)
|
16 |
self.vectorizer = joblib.load(vectorizer_path)
|
|
|
|
|
1 |
import numpy as np
|
2 |
+
import joblib
|
3 |
from typing import List, Dict, Any
|
4 |
from preprocessor import preprocess_for_classification
|
5 |
|
6 |
|
7 |
+
class TraditionalClassifier:
|
8 |
+
"""Traditional text classifier with probability distributions and metadata."""
|
9 |
|
10 |
def __init__(
|
11 |
self,
|
12 |
+
classifier_path: str = "traditional_svm_classifier.joblib",
|
13 |
+
vectorizer_path: str = "traditional_tfidf_vectorizer_classifier.joblib",
|
14 |
):
|
15 |
self.model = joblib.load(classifier_path)
|
16 |
self.vectorizer = joblib.load(vectorizer_path)
|
svm_classifier.joblib → traditional_svm_classifier.joblib
RENAMED
File without changes
|
tfidf_vectorizer_classifier.joblib → traditional_tfidf_vectorizer_classifier.joblib
RENAMED
File without changes
|
tfidf_vectorizer_text_summarization.joblib → traditional_tfidf_vectorizer_summarization.joblib
RENAMED
File without changes
|