M17idd committed on
Commit
0f721da
·
1 Parent(s): c20bc05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -26
app.py CHANGED
@@ -500,6 +500,17 @@ st.markdown("""
500
  }
501
  </style>
502
  """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
503
  folder_path = '46'
504
  normalizer = Normalizer()
505
  sentence_tokenizer = SentenceTokenizer()
@@ -517,37 +528,17 @@ def load_and_process_documents(path):
517
  except Exception as e:
518
  print(f"Error processing {filename}: {e}")
519
  return []
520
- with concurrent.futures.ThreadPoolExecutor() as executor:
521
  results = executor.map(process_docx, [f for f in os.listdir(path) if f.endswith(".docx")])
522
-
523
-
524
 
525
  return list(results)
526
- all_sentences = load_and_process_documents(folder_path)
527
-
528
- def clean_text(text):
529
- cleaned_text = re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
530
-
531
- return cleaned_text
532
-
533
-
534
- def compute_similarity(sentence, query, threshold):
535
- similarity = fuzz.partial_ratio(sentence, query)
536
- if similarity >= threshold:
537
- return sentence
538
- return None
539
-
540
- import string
541
- from hazm import word_tokenize, sent_tokenize
542
- from sklearn.feature_extraction.text import TfidfVectorizer
543
- from sklearn.cluster import KMeans
544
- from sklearn.metrics.pairwise import cosine_similarity
545
- from concurrent.futures import ThreadPoolExecutor
546
 
 
 
547
 
548
  # تابع خلاصه‌سازی متن با استفاده از KMeans
549
  def summarize_text_with_kmeans(text, num_sentences=3):
550
- sentences = sent_tokenize(text) # تقسیم متن به جملات
551
  tfidf_vectorizer = TfidfVectorizer()
552
  tfidf_matrix = tfidf_vectorizer.fit_transform(sentences) # تبدیل جملات به ماتریس TF-IDF
553
 
@@ -563,7 +554,19 @@ def summarize_text_with_kmeans(text, num_sentences=3):
563
  summary = [sentences[i] for i in similar_sentences_indices]
564
  return ' '.join(summary)
565
 
566
- # فرض بر این است که query و all_sentences قبلاً تعریف شده‌اند
 
 
 
 
 
 
 
 
 
 
 
 
567
  if query:
568
  threshold = 75
569
  keywords = query
@@ -656,4 +659,3 @@ if query:
656
  ])
657
  rewritten = clean_text(response.content.strip())
658
  st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
659
- think.empty()
 
500
  }
501
  </style>
502
  """, unsafe_allow_html=True)
503
+ import streamlit as st
504
+ import os
505
+ import docx
506
+ from hazm import Normalizer, SentenceTokenizer
507
+ from fuzzywuzzy import fuzz
508
+ from concurrent.futures import ThreadPoolExecutor
509
+ from sklearn.feature_extraction.text import TfidfVectorizer
510
+ from sklearn.cluster import KMeans
511
+ from sklearn.metrics.pairwise import cosine_similarity
512
+ import re
513
+
514
  folder_path = '46'
515
  normalizer = Normalizer()
516
  sentence_tokenizer = SentenceTokenizer()
 
528
  except Exception as e:
529
  print(f"Error processing {filename}: {e}")
530
  return []
531
+ with ThreadPoolExecutor() as executor:
532
  results = executor.map(process_docx, [f for f in os.listdir(path) if f.endswith(".docx")])
 
 
533
 
534
  return list(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
 
536
+ # بارگذاری و پردازش اسناد
537
+ all_sentences = load_and_process_documents(folder_path)
538
 
539
  # تابع خلاصه‌سازی متن با استفاده از KMeans
540
  def summarize_text_with_kmeans(text, num_sentences=3):
541
+ sentences = sentence_tokenizer.tokenize(text) # تقسیم متن به جملات
542
  tfidf_vectorizer = TfidfVectorizer()
543
  tfidf_matrix = tfidf_vectorizer.fit_transform(sentences) # تبدیل جملات به ماتریس TF-IDF
544
 
 
554
  summary = [sentences[i] for i in similar_sentences_indices]
555
  return ' '.join(summary)
556
 
557
+ # تابع تمیز کردن متن
558
def clean_text(text: str) -> str:
    """Remove every character outside the allowed Persian/numeric set.

    Characters that are kept: code points in the range ``آ``-``ی``,
    Persian digits ``۰``-``۹``, ASCII digits ``0``-``9``, the punctuation
    marks ``،`` ``.`` ``؟`` ``!`` ``؛`` ``+`` ``-`` ``*``, and the plain
    space character.

    Everything else is deleted, which includes Latin letters, newlines and
    tabs, markup characters such as ``<``, ``>`` and ``/``, and the
    zero-width non-joiner (U+200C).

    Args:
        text: The string to filter.

    Returns:
        ``text`` with each run of disallowed characters removed.
    """
    # Runs of disallowed characters are replaced with the empty string (not
    # with a space), so text on either side of a removed run is joined
    # directly together.
    return re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
561
+
562
+ # محاسبه شباهت بین جملات
563
def compute_similarity(sentence, query, threshold):
    """Return ``sentence`` when it fuzzily matches ``query``, else ``None``.

    The match score comes from ``fuzz.partial_ratio(sentence, query)``; the
    sentence is accepted when that score is greater than or equal to
    ``threshold``.

    Args:
        sentence: Candidate sentence to test.
        query: Text the sentence is compared against.
        threshold: Minimum score (inclusive) required to accept the sentence.

    Returns:
        The unchanged ``sentence`` if the score reaches ``threshold``,
        otherwise ``None``.
    """
    similarity = fuzz.partial_ratio(sentence, query)
    if similarity >= threshold:
        return sentence
    return None
568
+
569
+ # پردازش پرسش
570
  if query:
571
  threshold = 75
572
  keywords = query
 
659
  ])
660
  rewritten = clean_text(response.content.strip())
661
  st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)