Update app.py
app.py
CHANGED
@@ -15,6 +15,11 @@ from langchain.docstore.document import Document
 import docx
 import os
 
+from hazm import *
+
+
+
+
 
 st.markdown("""
 <style>
@@ -212,19 +217,23 @@ st.markdown(f"""
 
 
 # ----------------- Load the CSV and build the index -----------------
-
-
-
-
+normalizer = Normalizer()
+
+# hazm tokenizer
+tokenizer = word_tokenize
+
+# Load the WordEmbedding model
+word_embedding = WordEmbedding(model_type='fasttext', model_path='word2vec.bin')
+
+class CustomEmbeddings(Embeddings):
+    def __init__(self, word_embedding: WordEmbedding):
+        self.word_embedding = word_embedding
 
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
-        # Split the texts into smaller batches to avoid a 413 error
-        batch_size = 100  # you can adjust this value
         embeddings = []
-        for
-
-
-        embeddings.extend([item.embedding for item in response.data])
+        for text in texts:
+            # Create an embedding for each word in the text
+            embeddings.append([self.word_embedding.embed(word) for word in tokenizer(text)])
         return embeddings
 
     def embed_query(self, text: str) -> List[float]:
@@ -246,6 +255,10 @@ def get_docx_index(folder_path)
         if file_text.strip():
             texts.append(file_text)
 
+    # Normalize and tokenize the texts
+    normalized_texts = [normalizer.normalize(text) for text in texts]
+    tokenized_texts = [tokenizer(text) for text in normalized_texts]
+
     # Split the texts
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=300,
@@ -254,14 +267,11 @@ def get_docx_index(folder_path):
         separators=["\n\n", "\n", " ", ""]
     )
     split_texts = []
-    for text in
+    for text in normalized_texts:
         split_texts.extend(text_splitter.split_text(text))
 
-    # Create the embedding
-    embeddings =
-        model_name="togethercomputer/m2-bert-80M-8k-retrieval",
-        api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
-    )
+    # Create the embeddings using WordEmbedding
+    embeddings = CustomEmbeddings(word_embedding=word_embedding)
 
     # Build the index
     index_creator = VectorstoreIndexCreator(
@@ -281,8 +291,6 @@ try:
 except Exception as e:
     st.error(f"❌ Error building the index: {e}")
 
-
-
 #------------------------------------------
 llm = ChatOpenAI(
     base_url="https://api.together.xyz/v1",
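A note on the new embed_documents: LangChain's Embeddings interface expects one fixed-length vector per input text, but the loop added here appends a list of per-word vectors for each text, so downstream vector stores will receive ragged data. (The patch also builds tokenized_texts without ever using it; the splitter and the embedding loop both consume normalized_texts.) A minimal mean-pooling sketch, not the committed code: it assumes, as the patch does, that WordEmbedding.embed(word) returns a single 1-D word vector, and the class name MeanPooledEmbeddings is hypothetical.

from typing import List

import numpy as np
from hazm import Normalizer, WordEmbedding, word_tokenize
from langchain.embeddings.base import Embeddings  # old-style langchain path, matching the file's other imports

class MeanPooledEmbeddings(Embeddings):
    """Hypothetical variant: averages hazm word vectors into one vector per text."""

    def __init__(self, word_embedding: WordEmbedding):
        self.word_embedding = word_embedding
        self.normalizer = Normalizer()

    def _embed(self, text: str) -> List[float]:
        # Normalize, tokenize, then mean-pool the per-word vectors
        tokens = word_tokenize(self.normalizer.normalize(text))
        vectors = [self.word_embedding.embed(w) for w in tokens]
        if not vectors:
            # Assumption: probe one token to learn the model's dimensionality
            return [0.0] * len(self.word_embedding.embed("متن"))
        return np.mean(vectors, axis=0).tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self._embed(t) for t in texts]

    def embed_query(self, text: str) -> List[float]:
        return self._embed(text)

Mean pooling keeps every document embedding the same length regardless of token count, which is what FAISS-style stores and similarity search assume.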
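For completeness: the VectorstoreIndexCreator arguments are cut off in the hunks above, so the wiring below is an assumption rather than the committed code. In langchain releases of this vintage the creator accepts an embedding, a text_splitter, and a vectorstore_cls; FAISS is a common choice but is not visible in the diff.

from langchain.docstore.document import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import FAISS

# Assumed wiring; the committed arguments are not visible in the diff
index_creator = VectorstoreIndexCreator(
    embedding=embeddings,         # the CustomEmbeddings instance from the patch
    text_splitter=text_splitter,  # the RecursiveCharacterTextSplitter above
    vectorstore_cls=FAISS,        # assumption: the diff does not show the store class
)
index = index_creator.from_documents(
    [Document(page_content=t) for t in normalized_texts]  # the creator splits these itself
)

Queries would then go through index.query(question, llm=llm), using the ChatOpenAI client pointed at the Together endpoint below.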