Update app.py
Browse files
app.py
CHANGED
@@ -511,11 +511,19 @@ import streamlit as st
|
|
511 |
import concurrent.futures
|
512 |
from hazm import Normalizer
|
513 |
from rapidfuzz import fuzz
|
|
|
|
|
514 |
from langchain.schema import SystemMessage, HumanMessage
|
515 |
|
|
|
516 |
folder_path = '46'
|
517 |
normalizer = Normalizer()
|
518 |
|
|
|
|
|
|
|
|
|
|
|
519 |
@st.cache_data(show_spinner="در حال پردازش اسناد... لطفاً صبور باشید.")
|
520 |
def load_and_process_documents(path):
|
521 |
def process_docx(filename):
|
@@ -538,71 +546,86 @@ def load_and_process_documents(path):
|
|
538 |
|
539 |
return doc_texts
|
540 |
|
541 |
-
|
542 |
-
|
543 |
-
with open('stopwords.txt', 'r', encoding='utf-8') as file:
|
544 |
-
stop_words = set(file.read().splitlines())
|
545 |
-
|
546 |
def remove_stop_words(text, stop_words):
|
547 |
words = text.split()
|
548 |
return " ".join([word for word in words if word not in stop_words])
|
549 |
|
|
|
550 |
def extract_keywords_from_text(text, query_words):
|
551 |
matched_lines = []
|
552 |
lines = text.split("\n")
|
553 |
-
|
554 |
for line in lines:
|
555 |
if any(query_word in line for query_word in query_words):
|
556 |
matched_lines.append(line)
|
557 |
return matched_lines
|
558 |
|
|
|
559 |
def clean_text(text):
|
560 |
return re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
|
561 |
|
|
|
562 |
def find_closest_lines(query, doc_texts, stop_words, top_n=10):
|
563 |
cleaned_query = remove_stop_words(query, stop_words)
|
564 |
query_words = cleaned_query.split()
|
565 |
|
566 |
all_matched_lines = []
|
567 |
-
|
568 |
for filename, text in doc_texts.items():
|
569 |
matched_lines = extract_keywords_from_text(text, query_words)
|
570 |
for line in matched_lines:
|
571 |
-
similarity = fuzz.partial_ratio(query, line)
|
572 |
all_matched_lines.append((line, similarity))
|
573 |
-
|
574 |
-
all_matched_lines.sort(key=lambda x: x[1], reverse=True)
|
575 |
|
|
|
576 |
closest_lines = [line for line, _ in all_matched_lines[:top_n]]
|
577 |
-
|
578 |
return closest_lines
|
579 |
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
587 |
|
588 |
if query:
|
589 |
closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=10)
|
590 |
-
|
591 |
-
# حذف
|
592 |
cleaned_closest_lines = [
|
593 |
clean_text(" ".join([word for word in line.split() if word not in stop_words]))
|
594 |
for line in closest_lines
|
595 |
]
|
596 |
|
597 |
-
|
598 |
-
|
599 |
|
|
|
600 |
prompt = f"""
|
601 |
-
لطفاً با توجه به سؤال زیر و محتوای
|
602 |
سوال:
|
603 |
{query}
|
604 |
-
|
605 |
-
{
|
606 |
پاسخ نهایی:
|
607 |
"""
|
608 |
|
@@ -613,6 +636,5 @@ if query:
|
|
613 |
rewritten = clean_text(response.content.strip())
|
614 |
|
615 |
st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
|
616 |
-
|
617 |
else:
|
618 |
-
st.warning("هیچ
|
|
|
511 |
import concurrent.futures
|
512 |
from hazm import Normalizer
|
513 |
from rapidfuzz import fuzz
|
514 |
+
from collections import Counter
|
515 |
+
import heapq
|
516 |
from langchain.schema import SystemMessage, HumanMessage
|
517 |
|
518 |
# Folder containing the source documents (process_docx below suggests .docx files — TODO confirm)
folder_path = '46'
normalizer = Normalizer()

# Load the stop-word list: one word per line in stopwords.txt (UTF-8).
with open('stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words = set(file.read().splitlines())
525 |
+
|
526 |
+
# کش کردن پردازش اسناد
|
527 |
@st.cache_data(show_spinner="در حال پردازش اسناد... لطفاً صبور باشید.")
|
528 |
def load_and_process_documents(path):
|
529 |
def process_docx(filename):
|
|
|
546 |
|
547 |
return doc_texts
|
548 |
|
549 |
def remove_stop_words(text, stop_words):
    """Drop every stop word from *text*; surviving tokens are re-joined with single spaces."""
    kept = (token for token in text.split() if token not in stop_words)
    return " ".join(kept)
553 |
|
554 |
def extract_keywords_from_text(text, query_words):
    """Return every line of *text* that contains at least one of *query_words* (substring match)."""
    return [
        line
        for line in text.split("\n")
        if any(keyword in line for keyword in query_words)
    ]
562 |
|
563 |
def clean_text(text):
    """Remove every character outside the allowed set (Persian letters, Persian/ASCII digits, listed punctuation, space)."""
    disallowed = re.compile(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+')
    return disallowed.sub('', text)
566 |
|
567 |
def find_closest_lines(query, doc_texts, stop_words, top_n=10):
    """Return up to ``top_n`` document lines most similar to ``query``.

    Candidate lines are those containing at least one non-stop-word token
    of the query; candidates are then ranked by ``fuzz.partial_ratio``
    computed against the raw (uncleaned) query so word context still
    influences the score.

    query      : the user's question (str).
    doc_texts  : mapping of filename -> full document text.
    stop_words : words excluded from keyword matching.
    top_n      : maximum number of lines to return.
    """
    cleaned_query = remove_stop_words(query, stop_words)
    query_words = cleaned_query.split()

    all_matched_lines = []
    # Filenames play no part in ranking, so iterate values only.
    for text in doc_texts.values():
        for line in extract_keywords_from_text(text, query_words):
            similarity = fuzz.partial_ratio(query, line)
            all_matched_lines.append((line, similarity))

    # heapq.nlargest is O(m log top_n) instead of sorting all m candidates,
    # and is documented as equivalent to sorted(..., reverse=True)[:top_n],
    # so the returned order (including ties) is unchanged.
    best = heapq.nlargest(top_n, all_matched_lines, key=lambda pair: pair[1])
    return [line for line, _ in best]
582 |
|
583 |
def summarize_text_by_frequency(text, num_sentences=3, stop_words=None):
    """Extractive summary: keep the ``num_sentences`` highest-scoring lines.

    Each newline-separated line of ``text`` is scored by the summed
    corpus-wide frequency of its non-stop words; the best lines are
    returned joined by newlines, highest score first.

    text          : input text, one candidate sentence per line.
    num_sentences : number of lines to keep.
    stop_words    : words ignored when counting frequencies. Defaults to
                    the module-level ``stop_words`` set (the original
                    behaviour), so existing callers are unaffected.
    """
    if stop_words is None:
        # Backward-compatible fallback to the module-global stop-word set.
        stop_words = globals().get('stop_words', set())

    sentences = text.split('\n')

    # Frequency of every non-stop word across the whole text.
    word_freq = Counter()
    for sentence in sentences:
        for word in sentence.split():
            if word not in stop_words:
                word_freq[word] += 1

    # Score each line by the summed frequency of its counted words.
    # NOTE(review): identical duplicate lines collapse into one dict key
    # and accumulate score per occurrence — original behaviour, kept as-is.
    sentence_scores = {}
    for sentence in sentences:
        for word in sentence.split():
            if word in word_freq:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[word]

    summarized_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return "\n".join(summarized_sentences)
601 |
+
|
602 |
# Load and cache the processed documents once per session.
doc_texts = load_and_process_documents(folder_path)

# Streamlit UI: page title and the user's question input box.
st.title("پاسخ به پرسش از روی اسناد")

query = st.text_input("پرسش خود را وارد کنید:")
609 |
|
610 |
if query:
|
611 |
closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=10)
|
612 |
+
|
613 |
+
# حذف استپورد و پاکسازی خطوط
|
614 |
cleaned_closest_lines = [
|
615 |
clean_text(" ".join([word for word in line.split() if word not in stop_words]))
|
616 |
for line in closest_lines
|
617 |
]
|
618 |
|
619 |
+
# خلاصهسازی
|
620 |
+
summarized_text = summarize_text_by_frequency("\n".join(cleaned_closest_lines), num_sentences=3)
|
621 |
|
622 |
+
if summarized_text.strip():
|
623 |
prompt = f"""
|
624 |
+
لطفاً با توجه به سؤال زیر و محتوای خلاصهشده، یک پاسخ نهایی حرفهای، دقیق و روان تولید کن. فقط از متن استفاده کن. اگر اطلاعات کافی در متن وجود ندارد، صادقانه اعلام کن.
|
625 |
سوال:
|
626 |
{query}
|
627 |
+
خلاصهی مرتبط:
|
628 |
+
{summarized_text}
|
629 |
پاسخ نهایی:
|
630 |
"""
|
631 |
|
|
|
636 |
rewritten = clean_text(response.content.strip())
|
637 |
|
638 |
st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
|
|
|
639 |
else:
|
640 |
+
st.warning("هیچ محتوای خلاصهشدهای برای پاسخ وجود ندارد.")
|