ngcanh committed on
Commit
ecbb6ec
·
verified ·
1 Parent(s): 101865a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -30
app.py CHANGED
@@ -3,6 +3,7 @@ import streamlit as st
3
  import subprocess
4
  import openai
5
  import fitz
 
6
  from langchain_community.vectorstores import FAISS
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from openai import OpenAI
@@ -31,37 +32,27 @@ class PDFChatbot:
31
  pdf_directory = "data"
32
 
33
  # Duyệt qua các file trong thư mục và đọc từng file PDF
34
- pdf_texts = []
35
- for filename in os.listdir(pdf_directory):
36
- if filename.endswith(".pdf"):
37
- file_path = os.path.join(pdf_directory, filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # Mở file PDF
40
- doc = fitz.open(file_path)
41
-
42
- # Trích xuất toàn bộ văn bản từ từng trang
43
- full_text = ""
44
- for page_num in range(doc.page_count):
45
- page = doc.load_page(page_num)
46
- full_text += page.get_text("text", flags=11)
47
-
48
- pdf_texts.append({"file": filename, "text": full_text})
49
-
50
- documents = [
51
- Document(page_content=doc['text'], metadata={'file': doc['file']})
52
- for doc in pdf_texts # Assuming pdf_texts is a list of dictionaries like {'file': filename, 'text': full_text}
53
- ]
54
-
55
- semantic_splitter = SemanticChunker(
56
- embeddings= HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'),
57
- buffer_size=1, # number of sentences collected before performing the text split
58
- breakpoint_threshold_type='percentile', # splitting style: split at the given 'percentile' of similarity distances
59
- breakpoint_threshold_amount=95, # split text if similarity score > 95%
60
- # min_chunk_size=500,
61
- add_start_index=True, # assign index for chunk
62
- )
63
-
64
- docs = semantic_splitter.split_documents(documents)
65
  db = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'))
66
 
67
  relevant_chunks = db.similarity_search(user_question, k=3)
 
3
  import subprocess
4
  import openai
5
  import fitz
6
+ import PyPDF2
7
  from langchain_community.vectorstores import FAISS
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
  from openai import OpenAI
 
32
  pdf_directory = "data"
33
 
34
  # Duyệt qua các file trong thư mục và đọc từng file PDF
35
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
36
+ text = ""
37
+ for page_num in range(len(pdf_reader.pages)):
38
+ page = pdf_reader.pages[page_num]
39
+ text += page.extract_text() + "\n"
40
+ words = text.split()
41
+ chunks = []
42
+ current_chunk = []
43
+ current_length = 0
44
+ for word in words:
45
+ if current_length + len(word) + 1 > chunk_size:
46
+ if current_chunk:
47
+ chunks.append(" ".join(current_chunk))
48
+ current_chunk = [word]
49
+ current_length = len(word)
50
+ else:
51
+ current_chunk.append(word)
52
+ current_length += len(word) + 1
53
+ if current_chunk:
54
+ chunks.append(" ".join(current_chunk))
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  db = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'))
57
 
58
  relevant_chunks = db.similarity_search(user_question, k=3)