ngcanh committed on
Commit
6ceb7f9
·
verified ·
1 Parent(s): bfb4152

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -23
app.py CHANGED
@@ -29,29 +29,19 @@ class PDFChatbot:
29
 
30
 
31
  pdf_directory = "data"
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- import os
34
- import PyPDF2
35
- from langchain.vectorstores import FAISS
36
- from langchain.embeddings import HuggingFaceEmbeddings
37
- from langchain.docstore.document import Document
38
-
39
- pdf_directory = "path_to_your_pdf_folder"
40
- user_question = "your query here"
41
-
42
- all_text = ""
43
-
44
- # Step 1: Read and extract text from all PDFs
45
- for filename in os.listdir(pdf_directory):
46
- if filename.lower().endswith(".pdf"):
47
- pdf_path = os.path.join(pdf_directory, filename)
48
- with open(pdf_path, "rb") as pdf_file:
49
- pdf_reader = PyPDF2.PdfReader(pdf_file)
50
- for page in pdf_reader.pages:
51
- page_text = page.extract_text()
52
- if page_text:
53
- all_text += page_text + "\n"
54
-
55
  # Step 2: Split text into chunks of ~3000 characters
56
  words = all_text.split()
57
  chunks = []
@@ -80,7 +70,6 @@ for filename in os.listdir(pdf_directory):
80
 
81
  # Step 5: Return the content of the top relevant chunks
82
  return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
83
- print(return_text) # Or return from a function if used inside one
84
 
85
  def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
86
  """Generate response using Azure OpenAI based on PDF content and user question."""
 
29
 
30
 
31
  pdf_directory = "data"
32
+ all_text = ""
33
+
34
+ # Step 1: Read and extract text from all PDFs
35
+ for filename in os.listdir(pdf_directory):
36
+ if filename.lower().endswith(".pdf"):
37
+ pdf_path = os.path.join(pdf_directory, filename)
38
+ with open(pdf_path, "rb") as pdf_file:
39
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
40
+ for page in pdf_reader.pages:
41
+ page_text = page.extract_text()
42
+ if page_text:
43
+ all_text += page_text + "\n"
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  # Step 2: Split text into chunks of ~3000 characters
46
  words = all_text.split()
47
  chunks = []
 
70
 
71
  # Step 5: Return the content of the top relevant chunks
72
  return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
 
73
 
74
  def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
75
  """Generate response using Azure OpenAI based on PDF content and user question."""