MBAL_chatbot

Running

ngcanh committed on Jul 21

Commit

6ceb7f9

verified ·

1 Parent(s): bfb4152

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -29,29 +29,19 @@ class PDFChatbot:
         pdf_directory = "data"
-import os
-import PyPDF2
-from langchain.vectorstores import FAISS
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.docstore.document import Document
-pdf_directory = "path_to_your_pdf_folder"
-user_question = "your query here"
-all_text = ""
-# Step 1: Read and extract text from all PDFs
-for filename in os.listdir(pdf_directory):
-    if filename.lower().endswith(".pdf"):
-        pdf_path = os.path.join(pdf_directory, filename)
-        with open(pdf_path, "rb") as pdf_file:
-            pdf_reader = PyPDF2.PdfReader(pdf_file)
-            for page in pdf_reader.pages:
-                page_text = page.extract_text()
-                if page_text:
-                    all_text += page_text + "\n"
         # Step 2: Split text into chunks of ~3000 characters
         words = all_text.split()
         chunks = []
@@ -80,7 +70,6 @@ for filename in os.listdir(pdf_directory):
         # Step 5: Return the content of the top relevant chunks
         return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
-    print(return_text)  # Or return from a function if used inside one
     def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
         """Generate response using Azure OpenAI based on PDF content and user question."""

         pdf_directory = "data"
+        all_text = ""
+        # Step 1: Read and extract text from all PDFs
+        for filename in os.listdir(pdf_directory):
+            if filename.lower().endswith(".pdf"):
+                pdf_path = os.path.join(pdf_directory, filename)
+                with open(pdf_path, "rb") as pdf_file:
+                    pdf_reader = PyPDF2.PdfReader(pdf_file)
+                    for page in pdf_reader.pages:
+                        page_text = page.extract_text()
+                        if page_text:
+                            all_text += page_text + "\n"
         # Step 2: Split text into chunks of ~3000 characters
         words = all_text.split()
         chunks = []
         # Step 5: Return the content of the top relevant chunks
         return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
     def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
         """Generate response using Azure OpenAI based on PDF content and user question."""