Update app.py
app.py
CHANGED
@@ -500,6 +500,17 @@ st.markdown("""
 }
 </style>
 """, unsafe_allow_html=True)
+import streamlit as st
+import os
+import docx
+from hazm import Normalizer, SentenceTokenizer
+from fuzzywuzzy import fuzz
+from concurrent.futures import ThreadPoolExecutor
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import cosine_similarity
+import re
+
 folder_path = '46'
 normalizer = Normalizer()
 sentence_tokenizer = SentenceTokenizer()
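Note: this hunk only moves the imports above the module-level setup, but the two hazm objects created at its end are used throughout the rest of the diff. A minimal sketch of what they do; the sample string is illustrative, not from the app:

from hazm import Normalizer, SentenceTokenizer

normalizer = Normalizer()
sentence_tokenizer = SentenceTokenizer()

# Normalizer unifies Persian character variants and spacing.
text = normalizer.normalize('این متن نمونه است. جمله دوم اینجا است.')
# SentenceTokenizer splits the normalized text into sentences.
sentences = sentence_tokenizer.tokenize(text)
print(sentences)  # expected: a list with one string per sentence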
@@ -517,37 +528,17 @@ def load_and_process_documents(path):
         except Exception as e:
             print(f"Error processing {filename}: {e}")
             return []
-    with
+    with ThreadPoolExecutor() as executor:
         results = executor.map(process_docx, [f for f in os.listdir(path) if f.endswith(".docx")])
-
-
 
     return list(results)
-all_sentences = load_and_process_documents(folder_path)
-
-def clean_text(text):
-    cleaned_text = re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
-
-    return cleaned_text
-
-
-def compute_similarity(sentence, query, threshold):
-    similarity = fuzz.partial_ratio(sentence, query)
-    if similarity >= threshold:
-        return sentence
-    return None
-
-import string
-from hazm import word_tokenize, sent_tokenize
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
-from sklearn.metrics.pairwise import cosine_similarity
-from concurrent.futures import ThreadPoolExecutor
 
+# Load and process the documents
+all_sentences = load_and_process_documents(folder_path)
 
 # Text summarization function using KMeans
 def summarize_text_with_kmeans(text, num_sentences=3):
-    sentences =
+    sentences = sentence_tokenizer.tokenize(text)  # split the text into sentences
     tfidf_vectorizer = TfidfVectorizer()
     tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)  # convert the sentences to a TF-IDF matrix
 
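Note: the hunk above shows only fragments of load_and_process_documents. A sketch of how the pieces plausibly fit together after the with-statement fix; the body of process_docx is an assumption (the diff never shows it), here reading each file with python-docx:

import os
import docx
from concurrent.futures import ThreadPoolExecutor
from hazm import Normalizer, SentenceTokenizer

normalizer = Normalizer()
sentence_tokenizer = SentenceTokenizer()

def load_and_process_documents(path):
    def process_docx(filename):
        try:
            # Assumed body: read the .docx, normalize it, split into sentences.
            document = docx.Document(os.path.join(path, filename))
            text = normalizer.normalize(' '.join(p.text for p in document.paragraphs))
            return sentence_tokenizer.tokenize(text)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            return []

    # The fix: the bare `with` now actually creates the executor that
    # the next line was already referencing.
    with ThreadPoolExecutor() as executor:
        results = executor.map(process_docx, [f for f in os.listdir(path) if f.endswith(".docx")])

    # shutdown(wait=True) runs on leaving the with block, so the lazy
    # map iterator is fully computed by the time it is materialized here.
    return list(results)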
@@ -563,7 +554,19 @@ def summarize_text_with_kmeans(text, num_sentences=3):
     summary = [sentences[i] for i in similar_sentences_indices]
     return ' '.join(summary)
 
-#
+# Text cleaning function
+def clean_text(text):
+    cleaned_text = re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
+    return cleaned_text
+
+# Compute the similarity between a sentence and the query
+def compute_similarity(sentence, query, threshold):
+    similarity = fuzz.partial_ratio(sentence, query)
+    if similarity >= threshold:
+        return sentence
+    return None
+
+# Query processing
 if query:
     threshold = 75
     keywords = query
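Note: a short usage sketch for the two relocated helpers, repeating compute_similarity from the hunk above so it runs standalone. Since load_and_process_documents returns one sentence list per file, the result likely needs flattening before per-sentence filtering; the flattening step, the sample corpus, and the query are assumptions:

from itertools import chain
from fuzzywuzzy import fuzz

def compute_similarity(sentence, query, threshold):
    similarity = fuzz.partial_ratio(sentence, query)
    if similarity >= threshold:
        return sentence
    return None

# all_sentences holds one sentence list per .docx file, so flatten it
# before filtering individual sentences.
all_sentences = [['جمله اول.', 'جمله دوم.'], ['جمله سوم.']]  # illustrative corpus
flat_sentences = list(chain.from_iterable(all_sentences))

query = 'جمله دوم'  # illustrative; the app reads the query from the UI
# fuzz.partial_ratio scores the best matching substring on a 0-100 scale.
matches = [s for s in flat_sentences if compute_similarity(s, query, 75) is not None]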
@@ -656,4 +659,3 @@ if query:
     ])
     rewritten = clean_text(response.content.strip())
     st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
-    think.empty()