Update app.py
app.py CHANGED
@@ -90,27 +90,42 @@ def format_docs(docs):
     return "\n\n".join(doc.page_content for doc in docs)
 
 def process_pdf(uploaded_file):
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
-        tmp_file.write(uploaded_file.getvalue())
-        tmp_file_path = tmp_file.name
-
-    try:
-        loader = PyPDFLoader(tmp_file_path)
-        documents = loader.load()
-    except Exception as e:
-        st.error(f"Failed to read the file: {e}")
-        return None, 0
-
-    semantic_splitter = SemanticChunker(
-        embeddings=st.session_state.embeddings,
-        buffer_size=1,  # number of sentences grouped before each split decision
-        breakpoint_threshold_type='percentile',  # split on a percentile of similarity distances
-        breakpoint_threshold_amount=95,  # split where the distance exceeds the 95th percentile
-        min_chunk_size=500,
-        add_start_index=True,  # record each chunk's start index in its metadata
-    )
-
-    docs = semantic_splitter.split_documents(documents)
+    # with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+    #     tmp_file.write(uploaded_file.getvalue())
+    #     tmp_file_path = tmp_file.name
+
+    # try:
+    #     loader = PyPDFLoader(tmp_file_path)
+    #     documents = loader.load()
+    # except Exception as e:
+    #     st.error(f"Failed to read the file: {e}")
+    #     return None, 0
+
+    # semantic_splitter = SemanticChunker(
+    #     embeddings=st.session_state.embeddings,
+    #     buffer_size=1,  # number of sentences grouped before each split decision
+    #     breakpoint_threshold_type='percentile',  # split on a percentile of similarity distances
+    #     breakpoint_threshold_amount=95,  # split where the distance exceeds the 95th percentile
+    #     min_chunk_size=500,
+    #     add_start_index=True,  # record each chunk's start index in its metadata
+    # )
+
+    # docs = semantic_splitter.split_documents(documents)
+    df = pd.read_excel("chunk_metadata_template.xlsx")
+    docs = []
+
+    # Build the list of Documents with metadata
+    for _, row in df.iterrows():
+        chunk_with_metadata = Document(
+            page_content=row['page_content'],
+            metadata={
+                'chunk_id': row['chunk_id'],
+                'document_title': row['document_title']
+                # 'topic': row['topic'],
+                # 'stakeholder': row['stakeholder']
+            }
+        )
+        docs.append(chunk_with_metadata)
     vector_db = Chroma.from_documents(documents=docs,
                                       embedding=st.session_state.embeddings)
 
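For reference, a minimal sketch (not part of the commit) of a chunk_metadata_template.xlsx that the new loading loop above can read. The column names (chunk_id, document_title, page_content, plus the optional topic and stakeholder) come from the diff; the row values and file contents here are hypothetical placeholders, and writing .xlsx assumes openpyxl is installed alongside pandas.

# Illustrative sketch only: build a spreadsheet with the columns process_pdf expects.
import pandas as pd

rows = [
    {
        "chunk_id": "chunk-001",                       # unique id per chunk (hypothetical value)
        "document_title": "Example source document",   # hypothetical title
        "page_content": "First pre-chunked passage of text...",
        # Optional columns, currently commented out in app.py:
        # "topic": "general",
        # "stakeholder": "students",
    },
]

# pandas uses openpyxl as its engine for .xlsx output.
pd.DataFrame(rows).to_excel("chunk_metadata_template.xlsx", index=False)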