Update app.py
app.py CHANGED
@@ -3,6 +3,7 @@ import streamlit as st
 import subprocess
 import openai
 import fitz
+import PyPDF2
 from langchain_community.vectorstores import FAISS
 from langchain.embeddings import HuggingFaceEmbeddings
 from openai import OpenAI
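The change adds PyPDF2 alongside fitz (PyMuPDF), which app.py continues to import. As a minimal sketch of the two extraction APIs the file now has available (the sample path is hypothetical, not taken from app.py):

import fitz  # PyMuPDF
import PyPDF2

path = "data/sample.pdf"  # hypothetical example file

# PyMuPDF: iterate over the document's pages and call get_text()
pymupdf_text = "".join(page.get_text("text") for page in fitz.open(path))

# PyPDF2: iterate over reader.pages and call extract_text()
reader = PyPDF2.PdfReader(path)
pypdf2_text = "\n".join(page.extract_text() or "" for page in reader.pages)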
@@ -31,37 +32,27 @@ class PDFChatbot:
         pdf_directory = "data"

         # Iterate over the files in the directory and read each PDF file
-
-
-
-
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text = ""
+        for page_num in range(len(pdf_reader.pages)):
+            page = pdf_reader.pages[page_num]
+            text += page.extract_text() + "\n"
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_length = 0
+        for word in words:
+            if current_length + len(word) + 1 > chunk_size:
+                if current_chunk:
+                    chunks.append(" ".join(current_chunk))
+                current_chunk = [word]
+                current_length = len(word)
+            else:
+                current_chunk.append(word)
+                current_length += len(word) + 1
+        if current_chunk:
+            chunks.append(" ".join(current_chunk))

-        # Open the PDF file
-        doc = fitz.open(file_path)
-
-        # Extract the full text from each page
-        full_text = ""
-        for page_num in range(doc.page_count):
-            page = doc.load_page(page_num)
-            full_text += page.get_text("text", flags=11)
-
-        pdf_texts.append({"file": filename, "text": full_text})
-
-        documents = [
-            Document(page_content=doc['text'], metadata={'file': doc['file']})
-            for doc in pdf_texts  # assuming pdf_texts is a list of dictionaries like {'file': filename, 'text': full_text}
-        ]
-
-        semantic_splitter = SemanticChunker(
-            embeddings=HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'),
-            buffer_size=1,  # sentences gathered together before the split is computed
-            breakpoint_threshold_type='percentile',  # split at percentile breakpoints of embedding distance
-            breakpoint_threshold_amount=95,  # split where the distance exceeds the 95th percentile
-            # min_chunk_size=500,
-            add_start_index=True,  # record each chunk's start index in the source document
-        )
-
-        docs = semantic_splitter.split_documents(documents)
         db = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'))

         relevant_chunks = db.similarity_search(user_question, k=3)
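The added lines replace the removed SemanticChunker pipeline with a greedy, word-based splitter. Pulled out as a standalone helper for clarity, here is a sketch of that logic; the function name is hypothetical, chunk_size is the length budget the added code references but does not define in this hunk, and the final flush is assumed to run once after the loop:

def split_into_chunks(text, chunk_size):
    # Greedily pack whitespace-separated words into chunks of at most
    # chunk_size characters, counting one joining space per word.
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        if current_length + len(word) + 1 > chunk_size:
            # The current chunk is full: emit it and start a new one.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += len(word) + 1  # +1 for the joining space
    if current_chunk:
        chunks.append(" ".join(current_chunk))  # flush the last partial chunk
    return chunks

Note that a single word longer than chunk_size still becomes its own oversized chunk: the length check only limits where further words are appended, it never splits inside a word.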
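One thing to watch in this commit: the unchanged context line still builds the index with FAISS.from_documents(docs, ...), but docs was produced by the removed semantic_splitter.split_documents(documents) call and nothing in the added code re-creates it. A minimal glue sketch, assuming the new chunks list is what should be indexed (the filename used for metadata is hypothetical):

from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Wrap each plain-text chunk in a Document so the existing FAISS call keeps working.
docs = [
    Document(page_content=chunk, metadata={"file": filename})  # filename is hypothetical
    for chunk in chunks
]
db = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'))
relevant_chunks = db.similarity_search(user_question, k=3)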