chatbot4nct_test2

Runtime error

App Files Files Community

quoc-khanh committed on Mar 7

Commit

3d892df

verified ·

1 Parent(s): 5c08ccc

Update helpers.py

Browse files

Files changed (1) hide show

helpers.py +66 -1

helpers.py CHANGED Viewed

@@ -22,7 +22,8 @@ import logging
 # logging.getLogger("langchain").setLevel(logging.ERROR)
 logging.getLogger().setLevel(logging.ERROR)
 # from file_loader import get_vectorstore
@@ -129,6 +130,70 @@ key = os.getenv("GOOGLE_API_KEY")
 #     return DoclingLoader(file_path=file_path, chunker=chunker  # This will break your doc into manageable pieces.
 #                         ).load()
 def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
     """Fetch content from a list of URLs with a progress bar."""

 # logging.getLogger("langchain").setLevel(logging.ERROR)
 logging.getLogger().setLevel(logging.ERROR)
+import re
+import ast
 # from file_loader import get_vectorstore
 #     return DoclingLoader(file_path=file_path, chunker=chunker  # This will break your doc into manageable pieces.
 #                         ).load()
+def extract_metadata(input_string):
+    """Extract the first dict literal embedded in ``input_string``.
+
+    The text is scanned for ``{ ... }`` spans and each span is parsed with
+    ``ast.literal_eval`` (never ``eval``), so only Python literals are
+    accepted. Spans are tried from the earliest ``{`` and, for each opening
+    brace, from the nearest ``}`` outwards; this lets nested dicts, dicts
+    spanning several lines, and ``}`` characters inside string values parse
+    correctly instead of being cut off at the first closing brace.
+
+    Args:
+        input_string: Arbitrary text that may contain a dict literal.
+
+    Returns:
+        The parsed ``dict`` for the first span that evaluates to a dict;
+        ``{}`` if braces are present but no span parses to a dict (the last
+        error is printed); ``None`` if the text has no ``{`` followed by ``}``.
+    """
+    # Every exception type ast.literal_eval is documented to raise on bad input.
+    parse_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)
+    last_error = None
+    saw_candidate = False
+    start = input_string.find('{')
+    while start != -1:
+        end = input_string.find('}', start + 1)
+        while end != -1:
+            saw_candidate = True
+            try:
+                value = ast.literal_eval(input_string[start:end + 1])
+            except parse_errors as e:
+                # Probably a '}' that belongs to a nested dict or a string;
+                # keep widening the span.
+                last_error = e
+            else:
+                if isinstance(value, dict):
+                    return value
+                # A set literal such as {1, 2} is valid Python but not metadata.
+                last_error = TypeError(
+                    f"expected a dict literal, got {type(value).__name__}")
+            end = input_string.find('}', end + 1)
+        start = input_string.find('{', start + 1)
+    if not saw_candidate:
+        return None
+    print(f"Error evaluating metadata: {last_error}")
+    return {}
+# Example usage:
+# NOTE(review): these statements sit at module top level in this diff, so they
+# run (and print the parsed dict) whenever the module is executed or imported;
+# consider moving them under an ``if __name__ == "__main__":`` guard.
+input_str = "Some random text before and then {'a': 'abc', 'b': 'bcd'} and some random text after."
+metadata = extract_metadata(input_str)
+print(metadata)
+def define_metadata(input_text):
+    """Derive document-type and major tags from keywords found in ``input_text``.
+
+    Args:
+        input_text: Text (e.g. a title or file name) to search with plain,
+            case-sensitive substring checks.
+
+    Returns:
+        A dict with up to two keys:
+        ``"Tai lieu ve"`` - 'De cuong', 'Chuong trinh dao tao' or 'De an',
+        chosen by the first marker found in that priority order;
+        ``"Nganh"`` - the matched major name. A key is omitted when nothing
+        matches, so the result may be empty.
+    """
+    # (marker, label) pairs in priority order: the first marker present wins.
+    doc_type_markers = (
+        ('Đề cương', 'De cuong'),
+        ('Chương trình', 'Chuong trinh dao tao'),
+        ('Đề án', 'De an'),
+    )
+    # Names of the majors to look for.
+    majors = ['Trí tuệ nhân tạo',
+              'Toán kinh tế',
+              'Thống kê kinh tế',
+              'Phân tích dữ liệu trong Kinh tế',
+              'Kỹ thuật phần mềm',
+              'Khoa học máy tính',
+              'Khoa học dữ liệu',
+              'Hệ thống thông tin quản lý',
+              'Hệ thống thông tin',
+              'Định phí bảo hiểm và Quản trị rủi ro',
+              'Chương trình Công nghệ thông tin',
+              'An toàn thông tin']
+    result = {}
+    for marker, label in doc_type_markers:
+        if marker in input_text:
+            result["Tai lieu ve"] = label
+            break
+    matched = [major for major in majors if major in input_text]
+    # 'Hệ thống thông tin' is a substring of 'Hệ thống thông tin quản lý', so a
+    # text naming the longer major matches both. Drop any match contained in
+    # another match so the more specific name is not overwritten.
+    specific = [major for major in matched
+                if not any(major != other and major in other for other in matched)]
+    if specific:
+        # When several unrelated majors match, the one latest in the list wins.
+        result["Nganh"] = specific[-1]
+    return result
+def update_documents_metadata(documents, new_metadata):
+    """Merge ``new_metadata`` into each document's metadata, keeping ``source``.
+
+    Each document is modified in place; the returned list holds the same
+    objects in the same order.
+
+    Args:
+        documents: Iterable of objects exposing a dict-like ``metadata``
+            attribute.
+        new_metadata: Mapping of key/value pairs to add. ``None`` is treated
+            as "nothing to add", which is what ``extract_metadata`` returns
+            when it finds no metadata.
+
+    Returns:
+        A list of the updated documents.
+    """
+    additions = new_metadata or {}
+    updated_documents = []
+    for doc in documents:
+        # Test for the key itself rather than the value's truthiness, so an
+        # existing empty-string source is still restored after the merge.
+        had_source = "source" in doc.metadata
+        original_source = doc.metadata.get("source")
+        doc.metadata.update(additions)
+        if had_source:
+            doc.metadata["source"] = original_source
+        updated_documents.append(doc)
+    return updated_documents
 def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
     """Fetch content from a list of URLs with a progress bar."""