Spaces:

M17idd
/

army

Running

App Files Files Community

M17idd committed on May 4

Commit

9f8a6b6

verified ·

1 Parent(s): 34701b1

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -60

app.py CHANGED Viewed

@@ -509,7 +509,6 @@ st.markdown("""
 """, unsafe_allow_html=True)
 import os
 import re
 import docx
@@ -546,108 +545,81 @@ def load_and_process_documents(path):
 doc_texts = load_and_process_documents(folder_path)
-with open("stopwords.txt", "r", encoding="utf-8") as f:
-    stop_words = set(line.strip() for line in f if line.strip())
 def remove_stop_words(text, stop_words):
     """Return ``text`` without the tokens that appear in ``stop_words``.

     The text is tokenised with ``str.split()`` (any whitespace), each token
     is kept only if it is not a member of ``stop_words``, and the survivors
     are re-joined with single spaces.
     """
     kept = []
     for token in text.split():
         if token in stop_words:
             continue
         kept.append(token)
     return " ".join(kept)
 def extract_keywords_from_text(text, query_words):
     """Return the lines of ``text`` that mention at least one query word.

     ``text`` is split on newline characters; a line is selected when any
     entry of ``query_words`` occurs in it as a substring. The selected lines
     are returned in their original order.
     """
     def mentions_query(candidate):
         # Substring test, stopping at the first query word that hits.
         for term in query_words:
             if term in candidate:
                 return True
         return False

     return [candidate for candidate in text.split("\n") if mentions_query(candidate)]
-from collections import Counter
-import heapq
-def summarize_text_by_frequency(text, num_sentences=1):
-    sentences = text.split('\n')
-    word_freq = Counter()
-    for sentence in sentences:
-        for word in sentence.split():
-            if word not in stop_words:
-                word_freq[word] += 1
-    sentence_scores = {}
-    for sentence in sentences:
-        for word in sentence.split():
-            if word in word_freq:
-                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[word]
-    summarized_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
-    return "\n".join(summarized_sentences)
-def find_closest_lines(query, doc_texts, stop_words, top_n=15):
     cleaned_query = remove_stop_words(query, stop_words)
     query_words = cleaned_query.split()
     all_matched_lines = []
     for filename, text in doc_texts.items():
         matched_lines = extract_keywords_from_text(text, query_words)
         for line in matched_lines:
             similarity = fuzz.partial_ratio(query, line)  # محاسبه شباهت خط با سوال
             all_matched_lines.append((line, similarity))
     all_matched_lines.sort(key=lambda x: x[1], reverse=True)
     closest_lines = [line for line, _ in all_matched_lines[:top_n]]
-    return closest_lines
-def remove_stop_phrases(text, stop_words):
-    for phrase in stop_words:
-        text = text.replace(phrase, "")
-    return text
 if query:
-    closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=15)
-    # حذف استپ‌وردها از خطوط و سپس پاکسازی نهایی متن
-    cleaned_closest_lines = [
-    remove_stop_phrases(line, stop_words)
-    for line in closest_lines
-    ]
-    summarized_text = summarize_text_by_frequency("\n".join(cleaned_closest_lines), num_sentences=1)
-    summarized_cleaned = remove_stop_phrases(summarized_text, stop_words)
-    st.markdown(summarized_text)
-    if summarized_text:
         prompt = f"""
-        لطفاً با توجه به سؤال زیر و محتوای خطوط مرتبط، یک پاسخ نهایی حرفه‌ای، دقیق و روان تولید کن.
-        فقط از متن خطوط مرتبط استفاده کن و خلاصه بنویس. اطلاعات اضافی ننویس و فقط به سوال پاسخ بده.
-        در صورتی که اطلاعات کافی در متن وجود ندارد، صادقانه اعلام کن که اطلاعات کافی برای پاسخ‌دهی موجود نیست.
         سوال:
         {query}
         خطوط مرتبط:
-        {summarized_text}
         پاسخ نهایی:
         """
-        # ارسال پیام به مدل به صورت صحیح
         response = llm([
-            SystemMessage(content="تو رزم یار ارتش هستی و  از کتاب و دیتای موجود به سوالات پاسخ میدی."),
             HumanMessage(content=prompt)
         ])
-        # فرض بر این است که خروجی مدل به صورت دیکشنری است
-        rewritten = response.content.strip()
-        # نمایش نتیجه
         st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
     else:
         st.warning("هیچ خط مرتبطی پیدا نشد.")

 """, unsafe_allow_html=True)
 import os
 import re
 import docx
 doc_texts = load_and_process_documents(folder_path)
+stop_words = [
+    "است", "و", "با", "که", "در", "از", "برای", "به", "بر", "تا", "این", "آن", "یک", "کدام", "کجا", "هم", "همه",
+    "یا", "از", "بر", "همچنین", "می", "باید", "شود", "شد", "گفت", "گویا", "داشت", "داشتن", "کنند", "کنیم",
+    "کرد", "کردن", "نیز", "یا", "اگر", "ای", "اینکه", "نه", "باشید", "باشم", "باشی", "در حالی که", "مگر", "چرا"
+]
+# تابعی برای پاکسازی کلمات اضافی از سوال
 def remove_stop_words(text, stop_words):
     """Return ``text`` with stop words and multi-word stop phrases removed.

     The text is tokenised with ``str.split()``. Every entry of ``stop_words``
     is tokenised the same way, so an entry such as "in spite of" is matched
     as a run of consecutive tokens instead of being silently ignored (a
     plain per-token membership test can never match an entry that contains
     whitespace). At each position the longest matching entry wins. Entries
     made of a single word behave exactly as a per-token filter.

     Args:
         text: The string to clean.
         stop_words: Iterable of stop words / stop phrases (strings).

     Returns:
         The remaining tokens joined by single spaces.
     """
     tokens = text.split()
     # Pre-tokenise the stop entries once; a set gives O(1) lookups.
     phrases = {tuple(entry.split()) for entry in stop_words}
     phrases.discard(())  # blank entries carry no tokens to match
     if not phrases:
         return " ".join(tokens)

     longest = max(len(phrase) for phrase in phrases)
     kept = []
     position = 0
     while position < len(tokens):
         # Try the widest window first so a phrase beats its own sub-words.
         for width in range(min(longest, len(tokens) - position), 0, -1):
             if tuple(tokens[position:position + width]) in phrases:
                 position += width
                 break
         else:
             kept.append(tokens[position])
             position += 1
     return " ".join(kept)
+# تابعی برای استخراج کلمات از متن
 def extract_keywords_from_text(text, query_words):
     matched_lines = []
     lines = text.split("\n")
+    # جستجو برای هر کلمه در هر خط
     for line in lines:
         if any(query_word in line for query_word in query_words):
             matched_lines.append(line)
     return matched_lines
+# تابعی برای پاکسازی متن
+def clean_text(text):
+    return re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
+# تابعی برای پیدا کردن نزدیک‌ترین خطوط به سوال
+def find_closest_lines(query, doc_texts, stop_words, top_n=20, exclude_line=None):
+    # حذف کلمات اضافی از سوال
     cleaned_query = remove_stop_words(query, stop_words)
     query_words = cleaned_query.split()
     all_matched_lines = []
+    # بررسی محتوای فایل‌ها
     for filename, text in doc_texts.items():
         matched_lines = extract_keywords_from_text(text, query_words)
         for line in matched_lines:
             similarity = fuzz.partial_ratio(query, line)  # محاسبه شباهت خط با سوال
             all_matched_lines.append((line, similarity))
+    # مرتب سازی بر اساس شباهت
     all_matched_lines.sort(key=lambda x: x[1], reverse=True)
+    # انتخاب ۲۰ خط نزدیک‌تر
     closest_lines = [line for line, _ in all_matched_lines[:top_n]]
+    # حذف خط خاص از لیست در صورت وجود
+    if exclude_line and exclude_line in closest_lines:
+        closest_lines.remove(exclude_line)
+    return closest_lines
+# حالا این رو در کد اصلی استفاده می‌کنیم:
 if query:
+    # پیدا کردن ۲۰ خط نزدیک‌تر به سوال (و حذف یک خط خاص)
+    closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=20, exclude_line=None)
+    if closest_lines:
         prompt = f"""
+        لطفاً با توجه به سؤال زیر و محتوای خطوط مرتبط، یک پاسخ نهایی حرفه‌ای، دقیق و روان تولید کن. فقط از متن خطوط مرتبط استفاده کن. اگر اطلاعات کافی در متن وجود ندارد، صادقانه اعلام کن.
         سوال:
         {query}
         خطوط مرتبط:
+        {closest_lines}
         پاسخ نهایی:
         """
         response = llm([
+            SystemMessage(content="You are a helpful assistant."),
             HumanMessage(content=prompt)
         ])
+        rewritten = clean_text(response.content.strip())
         st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
     else:
         st.warning("هیچ خط مرتبطی پیدا نشد.")