Spaces:

M17idd
/

army

Running

App Files Community

M17idd committed on May 4

Commit

fa35776

1 Parent(s): c275f5e

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -138

app.py CHANGED Viewed

@@ -387,42 +387,6 @@ llm = ChatOpenAI(
     max_tokens=1024
 )
-# ---------- پردازش فایل‌ها با کش و موازی ----------
-folder_path = '46'
-normalizer = Normalizer()
-sentence_tokenizer = SentenceTokenizer()
-@st.cache_data(show_spinner="در حال پردازش اسناد... لطفاً صبور باشید.")
-def load_and_process_documents(path):
-    def process_docx(filename):
-        try:
-            full_path = os.path.join(path, filename)
-            doc = docx.Document(full_path)
-            text = "\n".join([para.text for para in doc.paragraphs])  # استخراج متن
-            normalized = normalizer.normalize(text)
-            sentences = normalized
-            return sentences
-        except Exception as e:
-            print(f"Error processing {filename}: {e}")
-            return []
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        results = executor.map(process_docx, [f for f in os.listdir(path) if f.endswith(".docx")])
-    return list(results)
-all_sentences = load_and_process_documents(folder_path)
-def clean_text(text):
-    cleaned_text = re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
-    return cleaned_text
-# all_sentences = load_and_process_documents(folder_path)
-# st.markdown(all_sentences[2])
 # ---------- ورودی جستجو ----------
 st.markdown("""
@@ -539,112 +503,76 @@ st.markdown("""
-import string
-from hazm import word_tokenize
-from concurrent.futures import ThreadPoolExecutor
-def extract_keywords(query):
-    tokenizer = WordTokenizer()
-    words = tokenizer.tokenize(query)
-    return [word for word in words if word not in string.punctuation]
 def clean_text(text):
-    return text.strip()
-def compute_similarity(sentence, query, threshold):
-    similarity = fuzz.partial_ratio(query, sentence)
-    if similarity >= threshold:
-        return sentence
-    return None
-# فرض: query و all_sentences قبلاً تعریف شده‌اند
 if query:
-    threshold = 75
-    keywords = extract_keywords(query)
-    # استفاده از پردازش موازی برای افزایش سرعت fuzzy matching
-    with ThreadPoolExecutor(max_workers=8) as executor:
-        futures = [executor.submit(compute_similarity, sentence, query, threshold) for sentence in all_sentences]
-        matched_sentences = [future.result() for future in futures if future.result()]
-    if matched_sentences:
-        found_sentences = [sentence for sentence in matched_sentences if any(keyword in sentence for keyword in keywords)]
-        if found_sentences:
-            matched_text = "\n".join(found_sentences)
-            st.markdown(matched_text)
-            prompt = f"""
-            تعدادی پاسخ برای سوال زیر تولید شده است. لطفاً ابتدا این پاسخ‌ها را بررسی کن، سپس با در نظر گرفتن محتوای سوال و لحن آن، یک پاسخ نهایی حرفه‌ای، دقیق و روان از داخل پاسخ‌ها ارائه کن که هم به سوال پاسخ دهد و هم از نظر نگارشی و ساختاری در سطح بالایی باشد. پاسخ نهایی باید حداکثر 2048 کاراکتر و حداقل 512 باشد، خلاصه و واضح نوشته شود و فقط به زبان فارسی باشد. از تکرار اضافی پرهیز کن و فقط از پاسخ‌های زیر استفاده کن. در صورت نیاز، محتوای چند پاسخ را با هم ترکیب کن.
-            سوال:
-            {query}
-            پاسخ‌ها:
-            {matched_text}
-            پاسخ نهایی حرفه‌ای بازنویسی‌شده:
-            """
-            response = llm([
-                SystemMessage(content="You are a helpful assistant."),
-                HumanMessage(content=prompt)
-            ])
-            rewritten = clean_text(response.content.strip())
-            review_prompt = f"""
-            لطفاً بررسی کن که آیا پاسخ زیر به سوال داده‌شده مرتبط، دقیق و معتبر است یا خیر. اگر پاسخ قابل قبول و دقیق است بنویس 'تأیید شد'. اگر متوسط است بنویس 'کمی خوب'. اگر بی‌ربط یا اشتباه است بنویس 'نیاز به اصلاح دارد'.
-            سوال:
-            {query}
-            پاسخ:
-            {rewritten}
-            """
-            review_response = llm([
-                SystemMessage(content="You are a helpful assistant."),
-                HumanMessage(content=review_prompt)
-            ])
-            review_result = review_response.content.strip()
-            if "تأیید شد" in review_result:
-                st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
-            elif "کمی خوب" in review_result:
-                final_prompt = f"""
-                لطفاً برای سوال زیر پاسخی حرفه‌ای، دقیق و روان تولید کن که مرتبط و معتبر باشد. از زبانی جز فارسی استفاده نکن. از محتوای زیر استفاده کن و یک پاسخ نهایی خوب بنویس:
-                سوال:
-                {query}
-                پاسخ اولیه:
-                {rewritten}
-                پاسخ نهایی:
-                """
-                new_response = llm([
-                    SystemMessage(content="You are a helpful assistant."),
-                    HumanMessage(content=final_prompt)
-                ])
-                final_answer = clean_text(new_response.content.strip())
-                st.markdown(f'<div class="chat-message">{final_answer}</div>', unsafe_allow_html=True)
-            else:
-                fallback_prompt = f"""
-                لطفاً برای سوال زیر پاسخی حرفه‌ای، دقیق و روان تولید کن که مرتبط و معتبر باشد. اگر اطلاعات کافی وجود ندارد، صادقانه بگو. فقط به زبان فارسی پاسخ بده:
-                سوال:
-                {query}
-                """
-                fallback_response = llm([
-                    SystemMessage(content="You are a helpful assistant."),
-                    HumanMessage(content=fallback_prompt)
-                ])
-                final_fallback = clean_text(fallback_response.content.strip())
-                st.markdown(f'<div class="chat-message">{final_fallback}</div>', unsafe_allow_html=True)
-        else:
-            fallback_prompt = f"""
-            لطفاً برای سوال زیر یک متن مرتبط و معتبر تولید کن. اگر اطلاعات کافی وجود ندارد، صادقانه اعلام کن. فقط به زبان فارسی پاسخ بده:
-            سوال:
-            {query}
-            """
-            response = llm([
-                SystemMessage(content="You are a helpful assistant."),
-                HumanMessage(content=fallback_prompt)
-            ])
-            rewritten = clean_text(response.content.strip())
-            st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
-            think.empty()

     max_tokens=1024
 )
 # ---------- ورودی جستجو ----------
 st.markdown("""
+import os
+import re
+import docx
+import streamlit as st
+import concurrent.futures
+from hazm import Normalizer
+from rapidfuzz import fuzz
+from langchain.schema import SystemMessage, HumanMessage
+folder_path = '46'
+normalizer = Normalizer()
+@st.cache_data(show_spinner="در حال پردازش اسناد... لطفاً صبور باشید.")
+def load_and_process_documents(path):
+    def process_docx(filename):
+        try:
+            full_path = os.path.join(path, filename)
+            doc = docx.Document(full_path)
+            text = "\n".join([para.text for para in doc.paragraphs])
+            normalized = normalizer.normalize(text)
+            return filename, normalized
+        except Exception as e:
+            print(f"Error processing {filename}: {e}")
+            return filename, ""
+    filenames = [f for f in os.listdir(path) if f.endswith(".docx")]
+    doc_texts = {}
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        for filename, content in executor.map(process_docx, filenames):
+            doc_texts[filename] = content
+    return doc_texts
+doc_texts = load_and_process_documents(folder_path)
 def clean_text(text):
+    return re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
+def find_closest_filename(query, filenames):
+    scores = [(f, fuzz.partial_ratio(query, f)) for f in filenames]
+    scores.sort(key=lambda x: x[1], reverse=True)
+    return scores[0][0] if scores else None
+# فرض بر این است که متغیر query توسط کاربر مشخص شده است
 if query:
+    closest_file = find_closest_filename(query, list(doc_texts.keys()))
+    if closest_file:
+        matched_text = doc_texts[closest_file]
+        prompt = f"""
+        لطفاً با توجه به سؤال زیر و محتوای سند موجود، یک پاسخ نهایی حرفه‌ای، دقیق و روان تولید کن. فقط از متن سند استفاده کن. اگر اطلاعات کافی در متن وجود ندارد، صادقانه اعلام کن.
+        سوال:
+        {query}
+        محتوای سند:
+        {matched_text}
+        پاسخ نهایی:
+        """
+        response = llm([
+            SystemMessage(content="You are a helpful assistant."),
+            HumanMessage(content=prompt)
+        ])
+        rewritten = clean_text(response.content.strip())
+        st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
+    else:
+        st.warning("هیچ سند مرتبطی پیدا نشد.")