Spaces:

M17idd
/

army

Sleeping

App Files Files Community

M17idd committed on May 4

Commit

d3b344a

1 Parent(s): 5e397a9

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -49

app.py CHANGED Viewed

@@ -511,19 +511,11 @@ import streamlit as st
 import concurrent.futures
 from hazm import Normalizer
 from rapidfuzz import fuzz
-from collections import Counter
-import heapq
 from langchain.schema import SystemMessage, HumanMessage
-# مسیر پوشه فایل‌ها
 folder_path = '46'
 normalizer = Normalizer()
-# بارگذاری استپ‌وردها
-with open('stopwords.txt', 'r', encoding='utf-8') as file:
-    stop_words = set(file.read().splitlines())
-# کش کردن پردازش اسناد
 @st.cache_data(show_spinner="در حال پردازش اسناد... لطفاً صبور باشید.")
 def load_and_process_documents(path):
     def process_docx(filename):
@@ -546,86 +538,71 @@ def load_and_process_documents(path):
     return doc_texts
-# حذف استپ‌ورد از یک متن
 def remove_stop_words(text, stop_words):
     words = text.split()
     return " ".join([word for word in words if word not in stop_words])
-# استخراج خطوطی که شامل کلمات کلیدی هستند
 def extract_keywords_from_text(text, query_words):
     matched_lines = []
     lines = text.split("\n")
     for line in lines:
         if any(query_word in line for query_word in query_words):
             matched_lines.append(line)
     return matched_lines
-# پاکسازی عمومی خطوط
 def clean_text(text):
     return re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
-# پیدا کردن خطوط نزدیک به پرسش
 def find_closest_lines(query, doc_texts, stop_words, top_n=10):
     cleaned_query = remove_stop_words(query, stop_words)
     query_words = cleaned_query.split()
     all_matched_lines = []
     for filename, text in doc_texts.items():
         matched_lines = extract_keywords_from_text(text, query_words)
         for line in matched_lines:
-            similarity = fuzz.partial_ratio(query, line)
             all_matched_lines.append((line, similarity))
     all_matched_lines.sort(key=lambda x: x[1], reverse=True)
-    closest_lines = [line for line, _ in all_matched_lines[:top_n]]
-    return closest_lines
-# خلاصه‌سازی بر اساس فرکانس کلمات
-def summarize_text_by_frequency(text, num_sentences=3):
-    sentences = text.split('\n')
-    word_freq = Counter()
-    for sentence in sentences:
-        for word in sentence.split():
-            if word not in stop_words:
-                word_freq[word] += 1
-    sentence_scores = {}
-    for sentence in sentences:
-        for word in sentence.split():
-            if word in word_freq:
-                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[word]
-    summarized_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
-    return "\n".join(summarized_sentences)
-# بارگذاری اسناد
-doc_texts = load_and_process_documents(folder_path)
-# رابط کاربری
-st.title("پاسخ به پرسش از روی اسناد")
-query = st.text_input("پرسش خود را وارد کنید:")
 if query:
     closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=10)
-    # حذف استپ‌ورد و پاکسازی خطوط
     cleaned_closest_lines = [
         clean_text(" ".join([word for word in line.split() if word not in stop_words]))
         for line in closest_lines
     ]
-    # خلاصه‌سازی
-    summarized_text = summarize_text_by_frequency("\n".join(cleaned_closest_lines), num_sentences=3)
-    if summarized_text.strip():
         prompt = f"""
-        لطفاً با توجه به سؤال زیر و محتوای خلاصه‌شده، یک پاسخ نهایی حرفه‌ای، دقیق و روان تولید کن. فقط از متن استفاده کن. اگر اطلاعات کافی در متن وجود ندارد، صادقانه اعلام کن.
         سوال:
         {query}
-        خلاصه‌ی مرتبط:
-        {summarized_text}
         پاسخ نهایی:
         """
@@ -636,5 +613,6 @@ if query:
         rewritten = clean_text(response.content.strip())
         st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
     else:
-        st.warning("هیچ محتوای خلاصه‌شده‌ای برای پاسخ وجود ندارد.")

 import concurrent.futures
 from hazm import Normalizer
 from rapidfuzz import fuzz
 from langchain.schema import SystemMessage, HumanMessage
 folder_path = '46'
 normalizer = Normalizer()
 @st.cache_data(show_spinner="در حال پردازش اسناد... لطفاً صبور باشید.")
 def load_and_process_documents(path):
     def process_docx(filename):
     return doc_texts
+doc_texts = load_and_process_documents(folder_path)
+with open('stopwords.txt', 'r', encoding='utf-8') as file:
+    stop_words = set(file.read().splitlines())
 def remove_stop_words(text, stop_words):
     words = text.split()
     return " ".join([word for word in words if word not in stop_words])
 def extract_keywords_from_text(text, query_words):
     matched_lines = []
     lines = text.split("\n")
     for line in lines:
         if any(query_word in line for query_word in query_words):
             matched_lines.append(line)
     return matched_lines
 def clean_text(text):
     return re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
 def find_closest_lines(query, doc_texts, stop_words, top_n=10):
     cleaned_query = remove_stop_words(query, stop_words)
     query_words = cleaned_query.split()
     all_matched_lines = []
     for filename, text in doc_texts.items():
         matched_lines = extract_keywords_from_text(text, query_words)
         for line in matched_lines:
+            similarity = fuzz.partial_ratio(query, line)  # محاسبه شباهت خط با سوال
             all_matched_lines.append((line, similarity))
     all_matched_lines.sort(key=lambda x: x[1], reverse=True)
+    closest_lines = [line for line, _ in all_matched_lines[:top_n]]
+    return closest_lines
+def remove_stop_words_from_lines(lines, stop_words):
+    cleaned_lines = []
+    for line in lines:
+        words = line.split()
+        cleaned_words = [word for word in words if word not in stop_words]
+        cleaned_lines.append(" ".join(cleaned_words))
+    return cleaned_lines
 if query:
     closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=10)
+    # حذف استپ‌وردها از خطوط و سپس پاکسازی نهایی متن
     cleaned_closest_lines = [
         clean_text(" ".join([word for word in line.split() if word not in stop_words]))
         for line in closest_lines
     ]
+    if cleaned_closest_lines:
+        cleaned_text = "\n".join(cleaned_closest_lines[:1])
         prompt = f"""
+        لطفاً با توجه به سؤال زیر و محتوای خطوط مرتبط، یک پاسخ نهایی حرفه‌ای، دقیق و روان تولید کن. فقط از متن خطوط مرتبط استفاده کن. اگر اطلاعات کافی در متن وجود ندارد، صادقانه اعلام کن.
         سوال:
         {query}
+        خطوط مرتبط:
+        {cleaned_text}
         پاسخ نهایی:
         """
         rewritten = clean_text(response.content.strip())
         st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
     else:
+        st.warning("هیچ خط مرتبطی پیدا نشد.")