quoc-khanh committed on
Commit
3d892df
·
verified ·
1 Parent(s): 5c08ccc

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +66 -1
helpers.py CHANGED
@@ -22,7 +22,8 @@ import logging
22
  # logging.getLogger("langchain").setLevel(logging.ERROR)
23
  logging.getLogger().setLevel(logging.ERROR)
24
 
25
-
 
26
 
27
  # from file_loader import get_vectorstore
28
 
@@ -129,6 +130,70 @@ key = os.getenv("GOOGLE_API_KEY")
129
  # return DoclingLoader(file_path=file_path, chunker=chunker # This will break your doc into manageable pieces.
130
  # ).load()
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
134
  """Fetch content from a list of URLs with a progress bar."""
 
22
  # logging.getLogger("langchain").setLevel(logging.ERROR)
23
  logging.getLogger().setLevel(logging.ERROR)
24
 
25
+ import re
26
+ import ast
27
 
28
  # from file_loader import get_vectorstore
29
 
 
130
  # return DoclingLoader(file_path=file_path, chunker=chunker # This will break your doc into manageable pieces.
131
  # ).load()
132
 
133
def extract_metadata(input_string):
    """Extract the first dict literal embedded in *input_string*.

    The search starts at the first ``{`` and tries each following ``}`` as
    the closing brace, shortest candidate first. This lets nested dicts,
    braces inside quoted values, and dicts spanning several lines parse
    correctly instead of being cut off at the first ``}``.

    Args:
        input_string: Arbitrary text that may contain a Python dict literal,
            e.g. ``"text {'a': 'abc'} more text"``.

    Returns:
        The parsed ``dict`` when a dict literal is found; ``{}`` when braces
        are present but no candidate evaluates to a dict (an error message is
        printed); ``None`` when the text has no ``{ ... }`` pair at all.
    """
    start = input_string.find("{")
    end = input_string.find("}", start + 1) if start != -1 else -1
    if end == -1:
        return None

    first_error = None
    while end != -1:
        candidate = input_string[start:end + 1]
        try:
            # literal_eval only accepts Python literals, so no code is executed.
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            # Remember the error for the shortest candidate; it is the one reported.
            if first_error is None:
                first_error = e
        else:
            if isinstance(parsed, dict):
                return parsed
            # A set literal such as "{1, 2}" parses fine but is not metadata.
            if first_error is None:
                first_error = TypeError(
                    f"expected a dict literal, got {type(parsed).__name__}"
                )
        # Widen the candidate to the next closing brace (handles nesting).
        end = input_string.find("}", end + 1)

    print(f"Error evaluating metadata: {first_error}")
    return {}
147
+
148
# Example usage of extract_metadata: parse the dict literal embedded in a
# sample sentence and print the result.
# NOTE(review): these statements appear to sit at module level, so they would
# run (and print) on every import of this module -- confirm, and consider
# moving them under an `if __name__ == "__main__":` guard.
input_str = "Some random text before and then {'a': 'abc', 'b': 'bcd'} and some random text after."
metadata = extract_metadata(input_str)
print(metadata)
152
+
153
def define_metadata(input_text):
    """Derive document-type and major metadata from keywords in *input_text*.

    Args:
        input_text: Text (e.g. a title or file name) to scan for keywords.

    Returns:
        A dict with up to two keys:

        * ``"Tai lieu ve"`` -- the document type: ``'De cuong'`` if the text
          contains 'Đề cương', else ``'Chuong trinh dao tao'`` if it contains
          'Chương trình', else ``'De an'`` if it contains 'Đề án'.
        * ``"Nganh"`` -- the name of the major found in the text. When several
          majors match, a name that is merely a substring of another matched
          name is ignored, and the last remaining match in list order wins.

        Keys are omitted when nothing matches; empty input yields ``{}``.
    """
    # Local import keeps this block self-contained.
    import unicodedata

    def nfc(text):
        # Vietnamese text can arrive in decomposed (NFD) form; compare in NFC.
        return unicodedata.normalize("NFC", text)

    # (keyword, label) pairs in priority order; the first keyword found wins.
    doc_type_rules = (
        ('Đề cương', 'De cuong'),
        ('Chương trình', 'Chuong trinh dao tao'),
        ('Đề án', 'De an'),
    )
    # The document-type keywords are plain strings; this is the list of major names.
    # NOTE: 'Chương trình Công nghệ thông tin' itself contains 'Chương trình',
    # so a text naming that major also matches the 'Chương trình' keyword.
    majors = [
        'Trí tuệ nhân tạo',
        'Toán kinh tế',
        'Thống kê kinh tế',
        'Phân tích dữ liệu trong Kinh tế',
        'Kỹ thuật phần mềm',
        'Khoa học máy tính',
        'Khoa học dữ liệu',
        'Hệ thống thông tin quản lý',
        'Hệ thống thông tin',
        'Định phí bảo hiểm và Quản trị rủi ro',
        'Chương trình Công nghệ thông tin',
        'An toàn thông tin',
    ]

    result = {}
    if not input_text:
        return result

    text = nfc(input_text)

    for keyword, label in doc_type_rules:
        if nfc(keyword) in text:
            result["Tai lieu ve"] = label
            break

    matched = [major for major in majors if nfc(major) in text]
    # Drop names shadowed by a longer matched name, so that
    # 'Hệ thống thông tin quản lý' is not overwritten by 'Hệ thống thông tin'.
    specific = [
        major for major in matched
        if not any(major != other and major in other for other in matched)
    ]
    if specific:
        result["Nganh"] = specific[-1]
    return result
181
+
182
def update_documents_metadata(documents, new_metadata):
    """Merge *new_metadata* into each document's metadata, keeping ``source``.

    Each document's ``metadata`` dict is updated in place. If a document
    already has a ``"source"`` key, its value is preserved even when
    *new_metadata* also carries a ``"source"`` entry or the existing value is
    empty. A document without a ``"source"`` key takes it from *new_metadata*.

    Args:
        documents: Iterable of objects exposing a mutable ``metadata`` dict.
        new_metadata: Mapping of key/value pairs to add. ``None`` or an empty
            mapping leaves the documents unchanged.

    Returns:
        A list holding the same document objects, in input order.
    """
    # Treat None (e.g. "no metadata found") as nothing to add rather than
    # letting dict.update(None) raise a TypeError.
    additions = dict(new_metadata) if new_metadata else {}

    updated_documents = []
    for doc in documents:
        # Test key presence, not truthiness, so an empty source is also kept.
        had_source = "source" in doc.metadata
        original_source = doc.metadata.get("source")

        doc.metadata.update(additions)

        if had_source:
            doc.metadata["source"] = original_source

        updated_documents.append(doc)
    return updated_documents
197
 
198
  def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
199
  """Fetch content from a list of URLs with a progress bar."""