rajeshthangaraj1 committed
Commit 6866239 · verified · 1 Parent(s): a440dc6

milvus update

Files changed (5):
  1. .gitignore +107 -0
  2. app.py +145 -130
  3. chat_handler.py +34 -29
  4. file_handler.py +55 -22
  5. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1,107 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Ignore Streamlit secrets
+ .streamlit/secrets.toml
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+ *.egg
+ *.egg-info/
+ dist/
+ build/
+ eggs/
+ lib/
+ libs/
+ parts/
+ var/
+ wheels/
+ share/python-wheels/
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # PyInstaller
+ # Usually these files are written by a Python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ nosetests.xml
+ coverage.xml
+ *.log
+
+ # Environments
+ .env
+ .venv
+ ENV/
+ env/
+ venv/
+
+ # PyCharm
+ .idea/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # pyenv
+ .python-version
+
+ # celery
+ celerybeat-schedule
+ *.pid
+
+ # SageMath
+ *.sage.py
+
+ # Encrypted credentials in the dev environment
+ *.key
+
+ # log files
+ *.log
+
+ # vscode settings
+ .vscode/
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # profiler
+ profiler.log
+
+ # Other files and directories
+ *.swp
+ *~
+ *.bak
+ *.tmp
+ *.temp
+ *.orig
+ *.lock
+ *.log
+
+ # Backup files
+ *_backup.*
app.py CHANGED
@@ -1,130 +1,145 @@
- import streamlit as st
- import os
- from dotenv import load_dotenv
- from file_handler import FileHandler
- from chat_handler import ChatHandler
-
- # Load environment variables
- load_dotenv()
-
- # Static credentials
- USERNAME = os.environ.get("USERNAME")
- PASSWORD = os.environ.get("PASSWORD")
-
- # Initialize Handlers
- VECTOR_DB_PATH = os.environ.get("VECTOR_DB_PATH_DB")
- OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
- HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
- GROQ_API_KEY_TOKEN = os.environ.get("GROQ_API_KEY")
-
- os.makedirs(VECTOR_DB_PATH, exist_ok=True)
-
- file_handler = FileHandler(VECTOR_DB_PATH, HUGGINGFACE_API_TOKEN)
- chat_handler = ChatHandler(VECTOR_DB_PATH, HUGGINGFACE_API_TOKEN, OPENAI_API_KEY, GROQ_API_KEY_TOKEN)
-
- # Streamlit UI
- st.set_page_config(layout="wide", page_title="AI Connect - Smarter Network Planning for the Future")
-
- # Session state to track login status
- if "logged_in" not in st.session_state:
-     st.session_state["logged_in"] = False
-
- # Login page
- # Refined Login Page
- if not st.session_state["logged_in"]:
-     # Customize page title
-     st.markdown(
-         """
-         <style>
-         .title {
-             font-size: 2.5rem;
-             color: #1f77b4;
-             font-weight: bold;
-             text-align: center;
-             margin-bottom: 10px;
-         }
-         .subtitle {
-             font-size: 1.2rem;
-             color: #555;
-             text-align: center;
-             margin-bottom: 20px;
-         }
-         .login-box {
-             margin: auto;
-             width: 50%;
-             padding: 20px;
-             background: #f9f9f9;
-             border: 1px solid #ddd;
-             border-radius: 10px;
-         }
-         .login-box input {
-             margin-bottom: 10px;
-         }
-         </style>
-         <div>
-             <div class="title">Welcome to AI Connect</div>
-             <div class="subtitle">Smarter Network Planning for the Future</div>
-         </div>
-         """,
-         unsafe_allow_html=True,
-     )
-
-     # Centered Login Box
-     # st.markdown('<div class="login-box">', unsafe_allow_html=True)
-     st.subheader("Login to Continue")
-     username = st.text_input("Username")
-     password = st.text_input("Password", type="password")
-     if st.button("Login"):
-         if username == USERNAME and password == PASSWORD:
-             st.session_state["logged_in"] = True
-             st.success("Login successful!")
-             st.rerun()
-         else:
-             st.error("Invalid username or password.")
-     st.markdown("</div>", unsafe_allow_html=True)
- else:
-     # Main app (Chat Interface)
-     st.title("Chatbot - Smarter Network Planning for the Future")
-     st.sidebar.header("Upload Documents")
-     uploaded_file = st.sidebar.file_uploader("Upload PDF, Excel, Docx, or Txt", type=["pdf", "xlsx", "docx", "txt", "csv"])
-     document_name = st.sidebar.text_input("Document Name", "")
-     document_description = st.sidebar.text_area("Document Description", "")
-
-     if st.sidebar.button("Process File"):
-         if uploaded_file:
-             with st.spinner("Processing your file..."):
-                 response = file_handler.handle_file_upload(
-                     file=uploaded_file,
-                     document_name=document_name,
-                     document_description=document_description,
-                 )
-                 st.sidebar.success(f"File processed: {response['message']}")
-         else:
-             st.sidebar.warning("Please upload a file before processing.")
-
-     # Chat Interface
-     if "messages" not in st.session_state:
-         st.session_state["messages"] = []
-
-     # Display chat messages from history
-     for message in st.session_state["messages"]:
-         with st.chat_message(message["role"]):
-             st.markdown(message["content"])
-
-     # Accept user input
-     if prompt := st.chat_input("Type your question here..."):
-         with st.chat_message("user"):
-             st.markdown(prompt)
-         st.session_state["messages"].append({"role": "user", "content": prompt})
-
-         with st.spinner("Processing your question..."):
-             response = chat_handler.answer_question(prompt)
-         with st.chat_message("assistant"):
-             st.markdown(response)
-         st.session_state["messages"].append({"role": "assistant", "content": response})
-
-     # Logout button
-     if st.session_state["logged_in"]:
-         if st.sidebar.button("Logout"):
-             st.session_state["logged_in"] = False
-             st.rerun()
+ import logging
+ import streamlit as st
+ import os
+ from dotenv import load_dotenv
+ from file_handler import FileHandler
+ from chat_handler import ChatHandler
+ # Initialize Milvus connection
+ from pymilvus import connections
+
+ # Load environment variables
+ load_dotenv()
+
+ # Static credentials for login
+ USERNAME = os.environ.get("USERNAME")
+ PASSWORD = os.environ.get("PASSWORD")
+
+ # Configure logging
+ LOG_PATH = os.environ.get("LOG_PATH")
+ os.makedirs(LOG_PATH, exist_ok=True)
+
+ LOG_FILE = os.path.join(LOG_PATH, "chatbot.log")
+ logging.basicConfig(
+     filename=LOG_FILE,
+     level=logging.INFO,
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ )
+ logger = logging.getLogger("AI_Connect")
+
+ # Initialize Handlers
+ MILVUS_HOST = os.environ.get("MILVUS_HOST")
+ MILVUS_PORT = os.environ.get("MILVUS_PORT")
+ HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
+ GROQ_API_KEY_TOKEN = os.environ.get("GROQ_API_KEY")
+
+ connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
+
+ file_handler = FileHandler(HUGGINGFACE_API_TOKEN, logger)
+ chat_handler = ChatHandler(HUGGINGFACE_API_TOKEN, GROQ_API_KEY_TOKEN, logger)
+
+ # Streamlit UI
+ st.set_page_config(layout="wide", page_title="AI Connect - Smarter Network Planning for the Future")
+
+ # Session state to track login status
+ if "logged_in" not in st.session_state:
+     st.session_state["logged_in"] = False
+
+ # Login page
+ # Refined Login Page
+ if not st.session_state["logged_in"]:
+     # Customize page title
+     st.markdown(
+         """
+         <style>
+         .title {
+             font-size: 2.5rem;
+             color: #1f77b4;
+             font-weight: bold;
+             text-align: center;
+             margin-bottom: 10px;
+         }
+         .subtitle {
+             font-size: 1.2rem;
+             color: #555;
+             text-align: center;
+             margin-bottom: 20px;
+         }
+         .login-box {
+             margin: auto;
+             width: 50%;
+             padding: 20px;
+             background: #f9f9f9;
+             border: 1px solid #ddd;
+             border-radius: 10px;
+         }
+         .login-box input {
+             margin-bottom: 10px;
+         }
+         </style>
+         <div>
+             <div class="title">Welcome to AI Connect</div>
+             <div class="subtitle">Smarter Network Planning for the Future</div>
+         </div>
+         """,
+         unsafe_allow_html=True,
+     )
+
+     # Centered Login Box
+     st.subheader("Login to Continue")
+     username = st.text_input("Username")
+     password = st.text_input("Password", type="password")
+     if st.button("Login"):
+         if username == USERNAME and password == PASSWORD:
+             st.session_state["logged_in"] = True
+             st.success("Login successful!")
+             logger.info("User Logged Successfully")
+             st.rerun()
+         else:
+             st.error("Invalid username or password.")
+     st.markdown("</div>", unsafe_allow_html=True)
+ else:
+     # Main app (Chat Interface)
+     st.title("Chatbot - Smarter Network Planning for the Future")
+     st.sidebar.header("Upload Documents")
+     uploaded_file = st.sidebar.file_uploader("Upload PDF, Excel, Docx, or Txt", type=["pdf", "xlsx", "docx", "txt", "csv"])
+     document_name = st.sidebar.text_input("Document Name", "")
+     document_description = st.sidebar.text_area("Document Description", "")
+
+     if st.sidebar.button("Process File"):
+         if uploaded_file:
+             with st.spinner("Processing your file..."):
+                 response = file_handler.handle_file_upload(
+                     file=uploaded_file,
+                     document_name=document_name,
+                     document_description=document_description,
+                 )
+                 st.sidebar.success(f"File processed: {response['message']}")
+         else:
+             st.sidebar.warning("Please upload a file before processing.")
+
+     # Chat Interface
+     if "messages" not in st.session_state:
+         st.session_state["messages"] = []
+
+     # Display chat messages from history
+     for message in st.session_state["messages"]:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     # Accept user input
+     if prompt := st.chat_input("Type your question here..."):
+         with st.chat_message("user"):
+             st.markdown(prompt)
+         st.session_state["messages"].append({"role": "user", "content": prompt})
+
+         with st.spinner("Processing your question..."):
+             response = chat_handler.answer_question(prompt)
+         with st.chat_message("assistant"):
+             st.markdown(response)
+         st.session_state["messages"].append({"role": "assistant", "content": response})
+
+     # Logout button
+     if st.session_state["logged_in"]:
+         if st.sidebar.button("Logout"):
+             st.session_state["logged_in"] = False
+             st.rerun()
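Note: the new app.py reads MILVUS_HOST, MILVUS_PORT, and LOG_PATH straight from the environment; if any is unset, os.makedirs(None, ...) or connections.connect(...) fails with an unhelpful TypeError. A minimal sketch of a fail-fast guard that could run before the connection (variable names match the diff; the guard itself is not part of this commit):

    import os
    from pymilvus import connections

    # Names taken from the diff above; fail fast with a readable message.
    REQUIRED = ["MILVUS_HOST", "MILVUS_PORT", "LOG_PATH", "HUGGINGFACE_API_TOKEN", "GROQ_API_KEY"]
    missing = [name for name in REQUIRED if not os.environ.get(name)]
    if missing:
        raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")

    # Same call as in the diff; "default" is the standard pymilvus connection alias.
    connections.connect("default", host=os.environ["MILVUS_HOST"], port=os.environ["MILVUS_PORT"])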
chat_handler.py CHANGED
@@ -1,26 +1,19 @@
  import os
- from langchain_community.vectorstores import FAISS
- from langchain_openai import ChatOpenAI
  from langchain_huggingface import HuggingFaceEmbeddings
  from groq import Groq
- import requests
+ from pymilvus import connections, Collection

  class ChatHandler:
-     def __init__(self, vector_db_path, api_token, open_api_key, grok_api_token):
-         self.vector_db_path = vector_db_path
+     def __init__(self, api_token, grok_api_token, logger):
+         self.logger = logger
+         self.logger.info("Initializing ChatHandler...")
          self.groq_client = Groq(api_key=grok_api_token)
          # Initialize the embedding model using Hugging Face
          self.embeddings = HuggingFaceEmbeddings(
              model_name="sentence-transformers/all-MiniLM-L6-v2",
              model_kwargs={"token": api_token},
          )
-         self.llm = ChatOpenAI(
-             model_name="gpt-4",
-             api_key=open_api_key,
-             max_tokens=500,
-             temperature=0.2,
-         )
+
      def _query_groq_model(self, prompt):
          """
          Query Groq's Llama model using the SDK.
@@ -33,29 +26,38 @@ class ChatHandler:
              # Return the assistant's response
              return chat_completion.choices[0].message.content
          except Exception as e:
+             self.logger.error(f"Error querying Groq API: {e}")
              return f"Error querying Groq API: {e}"

      def answer_question(self, question):
          # Generate embedding for the question
+         self.logger.info(f"Received question: {question}")
+         collections = connections._fetch_handler().list_collections()
          responses = []
-         for root, dirs, files in os.walk(self.vector_db_path):
-             for dir in dirs:
-                 index_path = os.path.join(root, dir, "index.faiss")
-                 if os.path.exists(index_path):
-                     vector_store = FAISS.load_local(
-                         os.path.join(root, dir), self.embeddings, allow_dangerous_deserialization=True
-                     )
-                     response_with_scores = vector_store.similarity_search_with_relevance_scores(question, k=100)
-                     filtered_responses = [doc.page_content for doc, score in response_with_scores]
-                     responses.extend(filtered_responses)
+
+         for collection_name in collections:
+             collection = Collection(name=collection_name)
+             embeddings = self.embeddings.embed_query(question)
+
+             search_params = {
+                 "metric_type": "IP",
+                 "params": {"nprobe": 10},
+             }
+
+             results = collection.search(
+                 data=[embeddings],
+                 anns_field="embedding",
+                 param=search_params,
+                 limit=5,
+             )
+             # Extract the embeddings or metadata (if needed)
+             for res in results[0]:
+                 # Store the ID or use res.distance if needed for similarity score
+                 responses.append({"id": res.id, "distance": res.distance, "content": res.entity})

          if responses:
-             prompt = self._generate_prompt(question, responses)
-             # response = self.llm.invoke(prompt)
-             # if hasattr(response, "content"):
-             #     return response.content.strip()  # Ensure clean output
-             # else:
-             #     return "Error: 'content' attribute not found in the AI's response."
+             sorted_responses = sorted(responses, key=lambda x: x["distance"], reverse=True)
+             prompt = self._generate_prompt(question, sorted_responses[:5])
              response = self._query_groq_model(prompt)
              return response
@@ -68,7 +70,10 @@ class ChatHandler:
          and answer questions effectively using the provided documents.
          """
          context = "\n".join(
-             [f"Document {i + 1}:\n{doc.strip()}" for i, doc in enumerate(documents[:5])]
+             [
+                 f"Document {i + 1}:\nID: {doc['id']}\nSimilarity: {doc['distance']:.4f}\nContent: {doc['content']}"
+                 for i, doc in enumerate(documents[:5])
+             ]
          )

          prompt = f"""
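Note: connections._fetch_handler() is a private pymilvus helper and may break across releases. A sketch of the same per-collection scan written against the public utility API instead (search parameters copied from the diff; assumes each collection is already loaded, and note that the collections created in file_handler.py store only hashed metadata, so hit.entity carries no raw document text):

    from pymilvus import Collection, utility

    def search_all_collections(embedder, question, top_k=5):
        # Public replacement for connections._fetch_handler().list_collections().
        query_vector = embedder.embed_query(question)
        hits = []
        for name in utility.list_collections():
            results = Collection(name=name).search(
                data=[query_vector],
                anns_field="embedding",
                param={"metric_type": "IP", "params": {"nprobe": 10}},
                limit=top_k,
            )
            for hit in results[0]:
                hits.append({"id": hit.id, "distance": hit.distance, "content": hit.entity})
        # With inner product, a larger distance means a closer match.
        return sorted(hits, key=lambda h: h["distance"], reverse=True)[:top_k]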
file_handler.py CHANGED
@@ -1,16 +1,17 @@
  import os
  import hashlib
  import io
- import json
  import pandas as pd
- from langchain_community.vectorstores import FAISS
  from PyPDF2 import PdfReader
  from docx import Document
  from langchain_huggingface import HuggingFaceEmbeddings
+ from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
+ import json

  class FileHandler:
-     def __init__(self, vector_db_path, api_token):
-         self.vector_db_path = vector_db_path
+     def __init__(self, api_token, logger):
+         self.logger = logger
+         self.logger.info("Initializing FileHandler...")
          # Initialize the embedding model using Hugging Face
          self.embeddings = HuggingFaceEmbeddings(
              model_name="sentence-transformers/all-MiniLM-L6-v2",
@@ -21,12 +22,11 @@ class FileHandler:
          try:
              content = file.read()
              file_hash = hashlib.md5(content).hexdigest()
-             file_key = f"{file.name}_{file_hash}"
-             vector_store_dir = os.path.join(self.vector_db_path, file_key)
-             os.makedirs(vector_store_dir, exist_ok=True)
-             vector_store_path = os.path.join(vector_store_dir, "index.faiss")
+             collection_name = f"collection_{file_hash}"

-             if os.path.exists(vector_store_path):
+             # Check if the collection exists
+             if connections._fetch_handler().has_collection(collection_name):
+                 self.logger.info(f"Collection '{collection_name}' already exists.")
                  return {"message": "File already processed."}

              # Process file based on type
@@ -41,30 +41,63 @@ class FileHandler:
              elif file.name.endswith(".csv"):
                  texts, metadatas = self.load_and_split_csv(content)
              else:
+                 self.logger.info("Unsupported file format.")
                  raise ValueError("Unsupported file format.")

              if not texts:
                  return {"message": "No text extracted from the file. Check the file content."}

-             # Create FAISS vector store using LangChain's from_texts method
-             vector_store = FAISS.from_texts(texts, embedding=self.embeddings, metadatas=metadatas)
-             vector_store.save_local(vector_store_dir)
-
-             metadata = {
-                 "filename": file.name,
-                 "document_name": document_name,
-                 "document_description": document_description,
-                 "file_size": len(content),
-             }
-             metadata_path = os.path.join(vector_store_dir, "metadata.json")
-             with open(metadata_path, 'w') as md_file:
-                 json.dump(metadata, md_file)
+             filename = file.name
+             filelen = len(content)
+             self._store_vectors(collection_name, texts, metadatas, document_name, document_description, filename, filelen)
+             self.logger.info(f"File processed successfully. Collection name: {collection_name}")

              return {"message": "File processed successfully."}
          except Exception as e:
+             self.logger.error(f"Error processing file: {str(e)}")
              return {"message": f"Error processing file: {str(e)}"}

+     def _store_vectors(self, collection_name, texts, metadatas, document_name, document_description, file_name, file_len):
+         fields = [
+             FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
+             FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
+             FieldSchema(name="file_name_hash", dtype=DataType.INT64),  # Hash of file name
+             FieldSchema(name="document_name_hash", dtype=DataType.INT64),  # Hash of document name
+             FieldSchema(name="document_description_hash", dtype=DataType.INT64),  # Hash of document description
+             FieldSchema(name="file_meta_hash", dtype=DataType.INT64),
+             FieldSchema(name="file_size", dtype=DataType.INT64),
+         ]
+         schema = CollectionSchema(fields, description="Document embeddings with metadata")
+         collection = Collection(name=collection_name, schema=schema)
+         # Generate embeddings
+         embeddings = [self.embeddings.embed_query(text) for text in texts]
+
+         # Convert metadata to hashed values
+         file_name_hash = int(hashlib.md5(file_name.encode('utf-8')).hexdigest(), 16) % (10 ** 12)
+         document_name_hash = int(hashlib.md5((document_name or "Unknown Document").encode('utf-8')).hexdigest(), 16) % (10 ** 12)
+         document_description_hash = int(hashlib.md5((document_description or "No Description Provided").encode('utf-8')).hexdigest(), 16) % (10 ** 12)
+         # Convert metadata list to JSON string and hash it
+         metadata_string = json.dumps(metadatas, ensure_ascii=False)
+         file_meta_hash = int(hashlib.md5(metadata_string.encode('utf-8')).hexdigest(), 16) % (10 ** 12)
+
+         # Prepare data for insertion
+         data = [
+             embeddings,
+             [file_name_hash] * len(embeddings),
+             [document_name_hash] * len(embeddings),
+             [document_description_hash] * len(embeddings),
+             [file_meta_hash] * len(embeddings),
+             [file_len or 0] * len(embeddings),
+         ]
+
+         # Insert data into collection
+         collection.insert(data)
+         collection.load()
+
      def load_and_split_pdf(self, file):
          reader = PdfReader(file)
          texts = []
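Note: _store_vectors inserts rows and calls collection.load() without ever building a vector index, and Milvus 2.x generally refuses to load or search a FLOAT_VECTOR field that has no index. A sketch of the missing step, using an IVF_FLAT index matching the IP metric searched in chat_handler.py (nlist=128 is an assumed starting value, not part of this commit):

    from pymilvus import Collection

    def build_index(collection: Collection) -> None:
        # Index the same field that _store_vectors populates and answer_question searches.
        collection.create_index(
            field_name="embedding",
            index_params={"index_type": "IVF_FLAT", "metric_type": "IP", "params": {"nlist": 128}},
        )
        collection.load()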
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ