milvus update
Changed files:
- .gitignore +107 -0
- app.py +145 -130
- chat_handler.py +34 -29
- file_handler.py +55 -22
- requirements.txt +0 -0
.gitignore
ADDED
@@ -0,0 +1,107 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Ignore Streamlit secrets
+.streamlit/secrets.toml
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+*.egg
+*.egg-info/
+dist/
+build/
+eggs/
+lib/
+libs/
+parts/
+var/
+wheels/
+share/python-wheels/
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# PyInstaller
+# Usually these files are written by a Python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+nosetests.xml
+coverage.xml
+*.log
+
+# Environments
+.env
+.venv
+ENV/
+env/
+venv/
+
+# PyCharm
+.idea/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery
+celerybeat-schedule
+*.pid
+
+# SageMath
+*.sage.py
+
+# Encrypted credentials in the dev environment
+*.key
+
+# log files
+*.log
+
+# vscode settings
+.vscode/
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# profiler
+profiler.log
+
+# Other files and directories
+*.swp
+*~
+*.bak
+*.tmp
+*.temp
+*.orig
+*.lock
+*.log
+
+# Backup files
+*_backup.*
app.py
CHANGED
@@ -1,130 +1,145 @@
+import logging
+import streamlit as st
+import os
+from dotenv import load_dotenv
+from file_handler import FileHandler
+from chat_handler import ChatHandler
+# Initialize Milvus connection
+from pymilvus import connections
+
+# Load environment variables
+load_dotenv()
+
+# Static credentials for login
+USERNAME = os.environ.get("USERNAME")
+PASSWORD = os.environ.get("PASSWORD")
+
+# Configure logging
+LOG_PATH = os.environ.get("LOG_PATH")
+os.makedirs(LOG_PATH, exist_ok=True)
+
+LOG_FILE = os.path.join(LOG_PATH, "chatbot.log")
+logging.basicConfig(
+    filename=LOG_FILE,
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger("AI_Connect")
+
+# Initialize handlers
+MILVUS_HOST = os.environ.get("MILVUS_HOST")
+MILVUS_PORT = os.environ.get("MILVUS_PORT")
+HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
+GROQ_API_KEY_TOKEN = os.environ.get("GROQ_API_KEY")
+
+connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
+
+file_handler = FileHandler(HUGGINGFACE_API_TOKEN, logger)
+chat_handler = ChatHandler(HUGGINGFACE_API_TOKEN, GROQ_API_KEY_TOKEN, logger)
+
+# Streamlit UI
+st.set_page_config(layout="wide", page_title="AI Connect - Smarter Network Planning for the Future")
+
+# Session state to track login status
+if "logged_in" not in st.session_state:
+    st.session_state["logged_in"] = False
+
+# Login page
+if not st.session_state["logged_in"]:
+    # Customize page title
+    st.markdown(
+        """
+        <style>
+        .title {
+            font-size: 2.5rem;
+            color: #1f77b4;
+            font-weight: bold;
+            text-align: center;
+            margin-bottom: 10px;
+        }
+        .subtitle {
+            font-size: 1.2rem;
+            color: #555;
+            text-align: center;
+            margin-bottom: 20px;
+        }
+        .login-box {
+            margin: auto;
+            width: 50%;
+            padding: 20px;
+            background: #f9f9f9;
+            border: 1px solid #ddd;
+            border-radius: 10px;
+        }
+        .login-box input {
+            margin-bottom: 10px;
+        }
+        </style>
+        <div>
+            <div class="title">Welcome to AI Connect</div>
+            <div class="subtitle">Smarter Network Planning for the Future</div>
+        </div>
+        """,
+        unsafe_allow_html=True,
+    )
+
+    # Centered login box
+    st.subheader("Login to Continue")
+    username = st.text_input("Username")
+    password = st.text_input("Password", type="password")
+    if st.button("Login"):
+        if username == USERNAME and password == PASSWORD:
+            st.session_state["logged_in"] = True
+            st.success("Login successful!")
+            logger.info("User logged in successfully")
+            st.rerun()
+        else:
+            st.error("Invalid username or password.")
+    st.markdown("</div>", unsafe_allow_html=True)
+else:
+    # Main app (chat interface)
+    st.title("Chatbot - Smarter Network Planning for the Future")
+    st.sidebar.header("Upload Documents")
+    uploaded_file = st.sidebar.file_uploader("Upload PDF, Excel, Docx, or Txt", type=["pdf", "xlsx", "docx", "txt", "csv"])
+    document_name = st.sidebar.text_input("Document Name", "")
+    document_description = st.sidebar.text_area("Document Description", "")
+
+    if st.sidebar.button("Process File"):
+        if uploaded_file:
+            with st.spinner("Processing your file..."):
+                response = file_handler.handle_file_upload(
+                    file=uploaded_file,
+                    document_name=document_name,
+                    document_description=document_description,
+                )
+                st.sidebar.success(f"File processed: {response['message']}")
+        else:
+            st.sidebar.warning("Please upload a file before processing.")
+
+    # Chat interface
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = []
+
+    # Display chat messages from history
+    for message in st.session_state["messages"]:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    # Accept user input
+    if prompt := st.chat_input("Type your question here..."):
+        with st.chat_message("user"):
+            st.markdown(prompt)
+        st.session_state["messages"].append({"role": "user", "content": prompt})
+
+        with st.spinner("Processing your question..."):
+            response = chat_handler.answer_question(prompt)
+        with st.chat_message("assistant"):
+            st.markdown(response)
+        st.session_state["messages"].append({"role": "assistant", "content": response})
+
+# Logout button
+if st.session_state["logged_in"]:
+    if st.sidebar.button("Logout"):
+        st.session_state["logged_in"] = False
+        st.rerun()
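Note: the rewritten app.py takes all configuration from the environment. For reference, a local .env consistent with the os.environ.get calls above might look like the following sketch (variable names are from the code; the values are placeholders, not from this commit). The new .gitignore already excludes .env, so this file stays out of the repo.

USERNAME=admin
PASSWORD=change-me
LOG_PATH=./logs
MILVUS_HOST=localhost
MILVUS_PORT=19530
HUGGINGFACE_API_TOKEN=hf_xxx
GROQ_API_KEY=gsk_xxx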
chat_handler.py
CHANGED
@@ -1,26 +1,19 @@
 import os
-from langchain_community.vectorstores import FAISS
-from langchain_openai import ChatOpenAI
 from langchain_huggingface import HuggingFaceEmbeddings
 from groq import Groq
-import
+from pymilvus import connections, Collection

 class ChatHandler:
-    def __init__(self,
-        self.
+    def __init__(self, api_token, grok_api_token, logger):
+        self.logger = logger
+        self.logger.info("Initializing ChatHandler...")
         self.groq_client = Groq(api_key=grok_api_token)
         # Initialize the embedding model using Hugging Face
         self.embeddings = HuggingFaceEmbeddings(
             model_name="sentence-transformers/all-MiniLM-L6-v2",
             model_kwargs={"token": api_token},
         )
-            model_name="gpt-4",
-            api_key=open_api_key,
-            max_tokens=500,
-            temperature=0.2,
-        )
+
     def _query_groq_model(self, prompt):
         """
         Query Groq's Llama model using the SDK.

@@ -33,29 +26,38 @@ class ChatHandler:
             # Return the assistant's response
             return chat_completion.choices[0].message.content
         except Exception as e:
+            self.logger.error(f"Error querying Groq API: {e}")
             return f"Error querying Groq API: {e}"

    def answer_question(self, question):
        # Generate embedding for the question
+        self.logger.info(f"Received question: {question}")
+        collections = connections._fetch_handler().list_collections()
        responses = []
+
+        for collection_name in collections:
+            collection = Collection(name=collection_name)
+            embeddings = self.embeddings.embed_query(question)
+
+            search_params = {
+                "metric_type": "IP",
+                "params": {"nprobe": 10},
+            }
+
+            results = collection.search(
+                data=[embeddings],
+                anns_field="embedding",
+                param=search_params,
+                limit=5,
+            )
+            # Extract the embeddings or metadata (if needed)
+            for res in results[0]:
+                # Store the ID or use res.distance if needed for similarity score
+                responses.append({"id": res.id, "distance": res.distance, "content": res.entity})

        if responses:
-            # if hasattr(response, "content"):
-            #     return response.content.strip()  # Ensure clean output
-            # else:
-            #     return "Error: 'content' attribute not found in the AI's response."
+            sorted_responses = sorted(responses, key=lambda x: x["distance"], reverse=True)
+            prompt = self._generate_prompt(question, sorted_responses[:5])
            response = self._query_groq_model(prompt)
            return response

@@ -68,7 +70,10 @@
         and answer questions effectively using the provided documents.
         """
         context = "\n".join(
-            [
+            [
+                f"Document {i + 1}:\nID: {doc['id']}\nSimilarity: {doc['distance']:.4f}\nContent: {doc['content']}"
+                for i, doc in enumerate(documents[:5])
+            ]
         )

         prompt = f"""
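A hedged aside on the lookup above: connections._fetch_handler() is a private pymilvus helper and can change between releases. A minimal sketch of the same checks via the public utility module, assuming pymilvus 2.x (the endpoint and collection name are placeholders):

# Sketch only: public pymilvus equivalents of the private calls used in this commit.
from pymilvus import connections, utility

connections.connect("default", host="localhost", port="19530")  # placeholder endpoint

collection_names = utility.list_collections()           # vs connections._fetch_handler().list_collections()
exists = utility.has_collection("collection_abc123")    # vs the has_collection() check in file_handler.py
print(collection_names, exists)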
file_handler.py
CHANGED
@@ -1,16 +1,17 @@
 import os
 import hashlib
 import io
-import json
 import pandas as pd
-from langchain_community.vectorstores import FAISS
 from PyPDF2 import PdfReader
 from docx import Document
 from langchain_huggingface import HuggingFaceEmbeddings
+from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
+import json

 class FileHandler:
-    def __init__(self,
-        self.
+    def __init__(self, api_token, logger):
+        self.logger = logger
+        self.logger.info("Initializing FileHandler...")
         # Initialize the embedding model using Hugging Face
         self.embeddings = HuggingFaceEmbeddings(
             model_name="sentence-transformers/all-MiniLM-L6-v2",

@@ -21,12 +22,11 @@ class FileHandler:
         try:
             content = file.read()
             file_hash = hashlib.md5(content).hexdigest()
-            vector_store_dir = os.path.join(self.vector_db_path, file_key)
-            os.makedirs(vector_store_dir, exist_ok=True)
-            vector_store_path = os.path.join(vector_store_dir, "index.faiss")
+            collection_name = f"collection_{file_hash}"

-            if
+            # Check if the collection exists
+            if connections._fetch_handler().has_collection(collection_name):
+                self.logger.info(f"Collection '{collection_name}' already exists.")
                 return {"message": "File already processed."}

             # Process file based on type

@@ -41,30 +41,63 @@ class FileHandler:
             elif file.name.endswith(".csv"):
                 texts, metadatas = self.load_and_split_csv(content)
             else:
+                self.logger.info("Unsupported file format.")
                 raise ValueError("Unsupported file format.")

             if not texts:
                 return {"message": "No text extracted from the file. Check the file content."}

-            #
-                "filename": file.name,
-                "document_name": document_name,
-                "document_description": document_description,
-                "file_size": len(content),
-            }
-            metadata_path = os.path.join(vector_store_dir, "metadata.json")
-            with open(metadata_path, 'w') as md_file:
-                json.dump(metadata, md_file)
+            # self._store_vectors(collection_name, texts, metadatas)
+            filename = file.name
+            filelen = len(content)
+            self._store_vectors(collection_name, texts, metadatas, document_name, document_description, filename, filelen)
+            self.logger.info(f"File processed successfully. Collection name: {collection_name}")

             return {"message": "File processed successfully."}
         except Exception as e:
+            self.logger.error(f"Error processing file: {str(e)}")
             return {"message": f"Error processing file: {str(e)}"}

+    def _store_vectors(self, collection_name, texts, metadatas, document_name, document_description, file_name, file_len):
+        fields = [
+            FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
+            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
+            FieldSchema(name="file_name_hash", dtype=DataType.INT64),  # Hash of file name
+            FieldSchema(name="document_name_hash", dtype=DataType.INT64),  # Hash of document name
+            FieldSchema(name="document_description_hash", dtype=DataType.INT64),  # Hash of document description
+            FieldSchema(name="file_meta_hash", dtype=DataType.INT64),
+            FieldSchema(name="file_size", dtype=DataType.INT64),
+        ]
+        schema = CollectionSchema(fields, description="Document embeddings with metadata")
+        collection = Collection(name=collection_name, schema=schema)
+        # Generate embeddings
+        embeddings = [self.embeddings.embed_query(text) for text in texts]
+
+        # Convert metadata to hashed values
+        file_name_hash = int(hashlib.md5(file_name.encode('utf-8')).hexdigest(), 16) % (10 ** 12)
+        document_name_hash = int(hashlib.md5((document_name or "Unknown Document").encode('utf-8')).hexdigest(), 16) % (10 ** 12)
+        document_description_hash = int(hashlib.md5((document_description or "No Description Provided").encode('utf-8')).hexdigest(), 16) % (10 ** 12)
+        # Convert metadata list to JSON string and hash it
+        metadata_string = json.dumps(metadatas, ensure_ascii=False)
+        file_meta_hash = int(hashlib.md5(metadata_string.encode('utf-8')).hexdigest(), 16) % (10 ** 12)
+
+        # Prepare data for insertion
+        data = [
+            embeddings,
+            [file_name_hash] * len(embeddings),
+            [document_name_hash] * len(embeddings),
+            [document_description_hash] * len(embeddings),
+            [file_meta_hash] * len(embeddings),
+            [file_len or 0] * len(embeddings),
+        ]

+        # Insert data into collection
+        collection.insert(data)
+        collection.load()

     def load_and_split_pdf(self, file):
         reader = PdfReader(file)
         texts = []
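One hedged caveat on _store_vectors: recent Milvus releases (2.2 and later) refuse to load() a collection whose vector field has no index, so the collection.load() call above may fail on a freshly created collection. A sketch of an index step that could sit just before it, matching the IP metric used by the search in chat_handler.py (the index type and nlist values are illustrative choices, not from this commit):

# Hypothetical addition inside _store_vectors, before collection.load():
collection.create_index(
    field_name="embedding",        # the FLOAT_VECTOR field defined in the schema above
    index_params={
        "metric_type": "IP",       # same metric chat_handler.py searches with
        "index_type": "IVF_FLAT",  # illustrative choice
        "params": {"nlist": 128},  # illustrative choice
    },
)
collection.load()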
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
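The requirements diff is rendered as binary, so the actual contents are not visible here. Judging only from the imports in the files above, the file presumably covers roughly these distributions (an inference, not the real list; versions omitted):

streamlit
python-dotenv
pymilvus
groq
langchain-huggingface
sentence-transformers
PyPDF2
python-docx
pandas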