ngcanh committed
Commit 7164d53 · verified · 1 Parent(s): d220e65

Update app.py

Files changed (1)
  1. app.py +44 -1
app.py CHANGED
@@ -7,6 +7,11 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from openai import OpenAI
 from langchain_openai import ChatOpenAI
 from typing import List, Dict
+import fitz  # PyMuPDF
+from langchain.schema import Document
+from langchain_experimental.text_splitter import SemanticChunker  # module for chunking text
+import os
+
 # Load environment variables
 OPENAI_API_KEY = os.getenv("OPENAI_API")
 TOKEN=os.getenv('HF_TOKEN')
@@ -20,7 +25,45 @@ class PDFChatbot:
 
     def get_relevant_context(self, user_question: str) -> List[str]:
         """Split text into smaller chunks for better processing."""
-        db = FAISS.load_local('mbaldb', HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'), allow_dangerous_deserialization=True)
+        # db = FAISS.load_local('mbaldb', HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'), allow_dangerous_deserialization=True)
+
+
+        pdf_directory = "data"
+
+        # Iterate over the files in the directory and read each PDF
+        pdf_texts = []
+        for filename in os.listdir(pdf_directory):
+            if filename.endswith(".pdf"):
+                file_path = os.path.join(pdf_directory, filename)
+
+                # Open the PDF file
+                doc = fitz.open(file_path)
+
+                # Extract the text of every page
+                full_text = ""
+                for page_num in range(doc.page_count):
+                    page = doc.load_page(page_num)
+                    full_text += page.get_text("text", flags=11)
+
+                pdf_texts.append({"file": filename, "text": full_text})
+
+        documents = [
+            Document(page_content=doc['text'], metadata={'file': doc['file']})
+            for doc in pdf_texts  # pdf_texts is a list of dicts like {'file': filename, 'text': full_text}
+        ]
+
+        semantic_splitter = SemanticChunker(
+            embeddings=HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'),
+            buffer_size=1,  # number of sentences grouped together before split points are computed
+            breakpoint_threshold_type='percentile',  # split at percentile-based breakpoints of semantic distance
+            breakpoint_threshold_amount=95,  # split where the distance exceeds the 95th percentile
+            min_chunk_size=500,
+            add_start_index=True,  # record each chunk's start index in its metadata
+        )
+
+        docs = semantic_splitter.split_documents(documents)
+        db = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'))
+
         relevant_chunks = db.similarity_search(user_question, k=3)
         relevant_chunks = [chunk.page_content for chunk in relevant_chunks]
         return "\n\n".join(relevant_chunks)