ngcanh committed (verified)
Commit 6765cfa · 1 Parent(s): 6ceb7f9

Update app.py

Files changed (1):
  app.py  +18 −22
app.py CHANGED
@@ -22,16 +22,12 @@ class PDFChatbot:
         self.azure_client = openai.OpenAI()
         self.conversation_history = []
         self.pdf_content = ""
+        self.faiss_index = self.build_faiss_index("data")
 
-    def get_relevant_context(self, user_question: str) -> List[str]:
-        """Split text into smaller chunks for better processing."""
-        # db = FAISS.load_local('mbaldb', HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'), allow_dangerous_deserialization = True )
-
-
-        pdf_directory = "data"
+    def build_faiss_index(self, pdf_directory: str, chunk_size: int = 3000) -> FAISS:
+        """Read PDFs, split into chunks, and build FAISS index."""
         all_text = ""
-
-        # Step 1: Read and extract text from all PDFs
+
         for filename in os.listdir(pdf_directory):
             if filename.lower().endswith(".pdf"):
                 pdf_path = os.path.join(pdf_directory, filename)
@@ -41,15 +37,15 @@ class PDFChatbot:
                         page_text = page.extract_text()
                         if page_text:
                             all_text += page_text + "\n"
-
-        # Step 2: Split text into chunks of ~3000 characters
+
+        # Split text into ~chunk_size character chunks
         words = all_text.split()
         chunks = []
         current_chunk = []
         current_length = 0
-
+
         for word in words:
-            if current_length + len(word) + 1 > 3000:
+            if current_length + len(word) + 1 > chunk_size:
                 if current_chunk:
                     chunks.append(Document(page_content=" ".join(current_chunk)))
                 current_chunk = [word]
@@ -57,20 +53,20 @@ class PDFChatbot:
             else:
                 current_chunk.append(word)
                 current_length += len(word) + 1
-
+
         if current_chunk:
             chunks.append(Document(page_content=" ".join(current_chunk)))
-
-        # Step 3: Build the FAISS index
+
+        # Embed and index
         embedding_model = HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder')
-        db = FAISS.from_documents(chunks, embedding_model)
-
-        # Step 4: Perform similarity search
-        relevant_chunks = db.similarity_search(user_question, k=3)
-
-        # Step 5: Return the content of the top relevant chunks
-        return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
+        faiss_index = FAISS.from_documents(chunks, embedding_model)
+        return faiss_index
 
+    def get_relevant_context(self, user_question: str) -> List[str]:
+        """Query the FAISS index for the top relevant chunks."""
+        relevant_chunks = self.faiss_index.similarity_search(user_question, k=3)
+        return "\n\n".join([doc.page_content for doc in relevant_chunks])
+
     def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
         """Generate response using Azure OpenAI based on PDF content and user question."""
         # Split PDF content into chunks
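
Net effect of the diff: the FAISS index is built once in __init__ via build_faiss_index("data") instead of being rebuilt inside get_relevant_context on every question, and the old version's unused return_text is replaced by an actual return value. A minimal usage sketch of the refactored class, assuming a no-argument PDFChatbot() constructor and PDF files under ./data; the question string and the way chat_with_pdf is fed the retrieved context are illustrative, not taken from this diff:

    # Sketch under the assumptions above; not the app's actual UI wiring.
    from app import PDFChatbot

    chatbot = PDFChatbot()  # builds the FAISS index once, at construction time

    question = "What does the document say about tuition fees?"  # illustrative query
    context = chatbot.get_relevant_context(question)  # runs only a k=3 similarity search

    # Assumed wiring: pass the retrieved context as pdf_content; the real app may differ.
    answer = chatbot.chat_with_pdf(question, context)
    print(answer)

Building the index up front trades a slower startup (every PDF in data/ is read and embedded once) for cheaper per-question retrieval, since each call to get_relevant_context now only queries the prebuilt index.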