ngcanh committed on
Commit
2a29635
·
verified ·
1 Parent(s): ab2d264

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -21
app.py CHANGED
@@ -90,27 +90,42 @@ def format_docs(docs):
90
  return "\n\n".join(doc.page_content for doc in docs)
91
 
92
  def process_pdf(uploaded_file):
93
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
94
- tmp_file.write(uploaded_file.getvalue())
95
- tmp_file_path = tmp_file.name
96
-
97
- try:
98
- loader = PyPDFLoader(tmp_file_path)
99
- documents = loader.load()
100
- except Exception as e:
101
- st.error(f"Đọc file thất bại: {e}")
102
- return None, 0
103
-
104
- semantic_splitter = SemanticChunker(
105
- embeddings=st.session_state.embeddings,
106
- buffer_size=1, # total sentence collected before perform text split
107
- breakpoint_threshold_type='percentile', # set splitting style: 'percentage' of similarity
108
- breakpoint_threshold_amount=95, # split text if similarity score > 95%
109
- min_chunk_size=500,
110
- add_start_index=True, # assign index for chunk
111
- )
112
-
113
- docs = semantic_splitter.split_documents(documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  vector_db = Chroma.from_documents(documents=docs,
115
  embedding=st.session_state.embeddings)
116
 
 
90
  return "\n\n".join(doc.page_content for doc in docs)
91
 
92
  def process_pdf(uploaded_file):
93
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
94
+ # tmp_file.write(uploaded_file.getvalue())
95
+ # tmp_file_path = tmp_file.name
96
+
97
+ # try:
98
+ # loader = PyPDFLoader(tmp_file_path)
99
+ # documents = loader.load()
100
+ # except Exception as e:
101
+ # st.error(f"Đọc file thất bại: {e}")
102
+ # return None, 0
103
+
104
+ # semantic_splitter = SemanticChunker(
105
+ # embeddings=st.session_state.embeddings,
106
+ # buffer_size=1, # total sentence collected before perform text split
107
+ # breakpoint_threshold_type='percentile', # set splitting style: 'percentage' of similarity
108
+ # breakpoint_threshold_amount=95, # split text if similarity score > 95%
109
+ # min_chunk_size=500,
110
+ # add_start_index=True, # assign index for chunk
111
+ # )
112
+
113
+ # docs = semantic_splitter.split_documents(documents)
114
+ df = pd.read_excel("chunk_metadata_template.xlsx")
115
+ docs = []
116
+
117
+ # Tạo danh sách các Document có metadata
118
+ for _, row in df.iterrows():
119
+ chunk_with_metadata = Document(
120
+ page_content=row['page_content'],
121
+ metadata={
122
+ 'chunk_id': row['chunk_id'],
123
+ 'document_title': row['document_title']
124
+ # 'topic': row['topic'],
125
+ # 'stakeholder': row['stakeholder']
126
+ }
127
+ )
128
+ docs.append(chunk_with_metadata)
129
  vector_db = Chroma.from_documents(documents=docs,
130
  embedding=st.session_state.embeddings)
131