ngcanh committed on
Commit
ecbb6ec
·
verified ·
1 Parent(s): 101865a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -30
app.py CHANGED
@@ -3,6 +3,7 @@ import streamlit as st
3
  import subprocess
4
  import openai
5
  import fitz
 
6
  from langchain_community.vectorstores import FAISS
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from openai import OpenAI
@@ -31,37 +32,27 @@ class PDFChatbot:
31
  pdf_directory = "data"
32
 
33
  # Duyệt qua các file trong thư mục và đọc từng file PDF
34
- pdf_texts = []
35
- for filename in os.listdir(pdf_directory):
36
- if filename.endswith(".pdf"):
37
- file_path = os.path.join(pdf_directory, filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # Mở file PDF
40
- doc = fitz.open(file_path)
41
-
42
- # Trích xuất toàn bộ văn bản từ từng trang
43
- full_text = ""
44
- for page_num in range(doc.page_count):
45
- page = doc.load_page(page_num)
46
- full_text += page.get_text("text", flags=11)
47
-
48
- pdf_texts.append({"file": filename, "text": full_text})
49
-
50
- documents = [
51
- Document(page_content=doc['text'], metadata={'file': doc['file']})
52
- for doc in pdf_texts # Assuming pdf_texts is a list of dictionaries like {'file': filename, 'text': full_text}
53
- ]
54
-
55
- semantic_splitter = SemanticChunker(
56
- embeddings= HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'),
57
- buffer_size=1, # number of sentences collected before performing the text split
58
- breakpoint_threshold_type='percentile', # splitting style: split at the given 'percentile' of similarity distances
59
- breakpoint_threshold_amount=95, # split text if similarity score > 95%
60
- # min_chunk_size=500,
61
- add_start_index=True, # assign index for chunk
62
- )
63
-
64
- docs = semantic_splitter.split_documents(documents)
65
  db = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'))
66
 
67
  relevant_chunks = db.similarity_search(user_question, k=3)
 
3
  import subprocess
4
  import openai
5
  import fitz
6
+ import PyPDF2
7
  from langchain_community.vectorstores import FAISS
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
  from openai import OpenAI
 
32
  pdf_directory = "data"
33
 
34
  # Duyệt qua các file trong thư mục và đọc từng file PDF
35
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
36
+ text = ""
37
+ for page_num in range(len(pdf_reader.pages)):
38
+ page = pdf_reader.pages[page_num]
39
+ text += page.extract_text() + "\n"
40
+ words = text.split()
41
+ chunks = []
42
+ current_chunk = []
43
+ current_length = 0
44
+ for word in words:
45
+ if current_length + len(word) + 1 > chunk_size:
46
+ if current_chunk:
47
+ chunks.append(" ".join(current_chunk))
48
+ current_chunk = [word]
49
+ current_length = len(word)
50
+ else:
51
+ current_chunk.append(word)
52
+ current_length += len(word) + 1
53
+ if current_chunk:
54
+ chunks.append(" ".join(current_chunk))
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  db = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name='bkai-foundation-models/vietnamese-bi-encoder'))
57
 
58
  relevant_chunks = db.similarity_search(user_question, k=3)