Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -22,16 +22,12 @@ class PDFChatbot:
|
|
22 |
self.azure_client = openai.OpenAI()
|
23 |
self.conversation_history = []
|
24 |
self.pdf_content = ""
|
|
|
25 |
|
26 |
-
def
|
27 |
-
"""
|
28 |
-
# db = FAISS.load_local('mbaldb', HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'), allow_dangerous_deserialization = True )
|
29 |
-
|
30 |
-
|
31 |
-
pdf_directory = "data"
|
32 |
all_text = ""
|
33 |
-
|
34 |
-
# Step 1: Read and extract text from all PDFs
|
35 |
for filename in os.listdir(pdf_directory):
|
36 |
if filename.lower().endswith(".pdf"):
|
37 |
pdf_path = os.path.join(pdf_directory, filename)
|
@@ -41,15 +37,15 @@ class PDFChatbot:
|
|
41 |
page_text = page.extract_text()
|
42 |
if page_text:
|
43 |
all_text += page_text + "\n"
|
44 |
-
|
45 |
-
#
|
46 |
words = all_text.split()
|
47 |
chunks = []
|
48 |
current_chunk = []
|
49 |
current_length = 0
|
50 |
-
|
51 |
for word in words:
|
52 |
-
if current_length + len(word) + 1 >
|
53 |
if current_chunk:
|
54 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
55 |
current_chunk = [word]
|
@@ -57,20 +53,20 @@ class PDFChatbot:
|
|
57 |
else:
|
58 |
current_chunk.append(word)
|
59 |
current_length += len(word) + 1
|
60 |
-
|
61 |
if current_chunk:
|
62 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
63 |
-
|
64 |
-
#
|
65 |
embedding_model = HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder')
|
66 |
-
|
67 |
-
|
68 |
-
# Step 4: Perform similarity search
|
69 |
-
relevant_chunks = db.similarity_search(user_question, k=3)
|
70 |
-
|
71 |
-
# Step 5: Return the content of the top relevant chunks
|
72 |
-
return_text = "\n\n".join([doc.page_content for doc in relevant_chunks])
|
73 |
|
|
|
|
|
|
|
|
|
|
|
74 |
def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
|
75 |
"""Generate response using Azure OpenAI based on PDF content and user question."""
|
76 |
# Split PDF content into chunks
|
|
|
22 |
self.azure_client = openai.OpenAI()
|
23 |
self.conversation_history = []
|
24 |
self.pdf_content = ""
|
25 |
+
self.faiss_index = self.build_faiss_index("data")
|
26 |
|
27 |
+
def build_faiss_index(self, pdf_directory: str, chunk_size: int = 3000) -> FAISS:
|
28 |
+
"""Read PDFs, split into chunks, and build FAISS index."""
|
|
|
|
|
|
|
|
|
29 |
all_text = ""
|
30 |
+
|
|
|
31 |
for filename in os.listdir(pdf_directory):
|
32 |
if filename.lower().endswith(".pdf"):
|
33 |
pdf_path = os.path.join(pdf_directory, filename)
|
|
|
37 |
page_text = page.extract_text()
|
38 |
if page_text:
|
39 |
all_text += page_text + "\n"
|
40 |
+
|
41 |
+
# Split text into ~chunk_size character chunks
|
42 |
words = all_text.split()
|
43 |
chunks = []
|
44 |
current_chunk = []
|
45 |
current_length = 0
|
46 |
+
|
47 |
for word in words:
|
48 |
+
if current_length + len(word) + 1 > chunk_size:
|
49 |
if current_chunk:
|
50 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
51 |
current_chunk = [word]
|
|
|
53 |
else:
|
54 |
current_chunk.append(word)
|
55 |
current_length += len(word) + 1
|
56 |
+
|
57 |
if current_chunk:
|
58 |
chunks.append(Document(page_content=" ".join(current_chunk)))
|
59 |
+
|
60 |
+
# Embed and index
|
61 |
embedding_model = HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder')
|
62 |
+
faiss_index = FAISS.from_documents(chunks, embedding_model)
|
63 |
+
return faiss_index
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
+
def get_relevant_context(self, user_question: str, k: int = 3) -> str:
    """Return the text of the indexed chunks most similar to the question.

    Runs ``similarity_search`` on ``self.faiss_index`` and concatenates the
    ``page_content`` of every returned document.

    Args:
        user_question: Query text handed to the index's similarity search.
        k: Number of chunks to request from the index. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        One string (not a list) containing the matching chunks separated
        by a blank line; an empty string when the search returns nothing.
    """
    relevant_chunks = self.faiss_index.similarity_search(user_question, k=k)
    # The hits are joined into a single string, so the annotation is `str`.
    return "\n\n".join(doc.page_content for doc in relevant_chunks)
|
69 |
+
|
70 |
def chat_with_pdf(self, user_question: str, pdf_content: str) -> str:
|
71 |
"""Generate response using Azure OpenAI based on PDF content and user question."""
|
72 |
# Split PDF content into chunks
|