Vela committed on
Commit
a8dda00
·
1 Parent(s): 6628e9e

added pinecone for project

Browse files
src/backend/__pycache__/main.cpython-313.pyc CHANGED
Binary files a/src/backend/__pycache__/main.cpython-313.pyc and b/src/backend/__pycache__/main.cpython-313.pyc differ
 
src/backend/data/__pycache__/dataset.cpython-313.pyc CHANGED
Binary files a/src/backend/data/__pycache__/dataset.cpython-313.pyc and b/src/backend/data/__pycache__/dataset.cpython-313.pyc differ
 
src/backend/data/__pycache__/pinecone_db.cpython-313.pyc CHANGED
Binary files a/src/backend/data/__pycache__/pinecone_db.cpython-313.pyc and b/src/backend/data/__pycache__/pinecone_db.cpython-313.pyc differ
 
src/backend/data/dataset.py CHANGED
@@ -9,29 +9,33 @@ DATASET_PATH = "src/backend/data/dataset.csv"
9
  PARAQUET_DATASET_PATH = "hf://datasets/lavita/ChatDoctor-HealthCareMagic-100k/data/train-00000-of-00001-5e7cb295b9cff0bf.parquet"
10
 
11
  def get_data_set():
12
-
13
  try:
14
  if not os.path.exists(DATASET_PATH):
15
  logger.info(f"{DATASET_PATH} not found. Reading from Parquet file.")
16
  df = pd.read_parquet(PARAQUET_DATASET_PATH)
17
- df.drop_duplicates(subset=["input", "output"], inplace=True)
18
- df.dropna(subset=["input", "output"], inplace=True) # Remove NaNs first
19
-
20
- # This line is to remove the empty column or column with only spaces
21
- df = df[(df["input"].str.strip() != "") & (df["output"].str.strip() != "")] # Remove empty strings/spaces
22
-
23
- # This line is to remove puncuation and emjois
24
- translator = str.maketrans('', '', string.punctuation)
25
- df["input"] = df["input"].str.lower().str.translate(translator)
26
- df["output"] = df["output"].str.lower().str.translate(translator)
27
- df.to_csv(DATASET_PATH, index=False)
28
- logger.info(f"CSV file created and cleaned at: {DATASET_PATH}")
29
  else:
30
  logger.info(f"Loading existing dataset from: {DATASET_PATH}")
31
- df = pd.read_csv(DATASET_PATH)
32
- logger.info("Dataset loaded successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  return df
34
 
35
  except Exception as e:
36
  logger.error(f"Error while loading dataset: {e}", exc_info=True)
37
- return None
 
9
  PARAQUET_DATASET_PATH = "hf://datasets/lavita/ChatDoctor-HealthCareMagic-100k/data/train-00000-of-00001-5e7cb295b9cff0bf.parquet"
10
 
11
def get_data_set():
    """
    Load the question/answer dataset and clean it identically for both sources.

    Reads the cached CSV at DATASET_PATH when it exists; otherwise reads the
    Parquet file at PARAQUET_DATASET_PATH and writes the cleaned result to
    DATASET_PATH so later calls can use the cache.

    Returns:
    - pd.DataFrame: rows with lower-cased, punctuation-free 'input' and
      'output' text, no empty or duplicate pairs, and a fresh 0..n-1 index.
    - None: if loading or cleaning raised an exception (the error is logged).
    """
    try:
        csv_exists = os.path.exists(DATASET_PATH)
        if csv_exists:
            logger.info(f"Loading existing dataset from: {DATASET_PATH}")
            # read_csv turns empty cells into NaN; map them back to "".
            df = pd.read_csv(DATASET_PATH).fillna("")
        else:
            logger.info(f"{DATASET_PATH} not found. Reading from Parquet file.")
            df = pd.read_parquet(PARAQUET_DATASET_PATH)

        # Normalise text first: lower-case and strip ASCII punctuation.
        translator = str.maketrans('', '', string.punctuation)
        for column in ("input", "output"):
            df[column] = df[column].fillna("").str.lower().str.translate(translator)

        # Filter only after normalising. Rows that become empty or duplicate
        # once punctuation/case is removed would otherwise survive the first
        # (Parquet) run but vanish when the CSV is reloaded, shifting which
        # rows a positional slice of the result selects.
        non_empty = (df["input"].str.strip() != "") & (df["output"].str.strip() != "")
        df = df[non_empty].drop_duplicates(subset=["input", "output"])

        # Keep index labels equal to row positions regardless of the source.
        df = df.reset_index(drop=True)

        # Write the cache only when it did not exist before this call.
        if not csv_exists:
            df.to_csv(DATASET_PATH, index=False)
            logger.info(f"CSV file created and cleaned at: {DATASET_PATH}")

        return df

    except Exception as e:
        logger.error(f"Error while loading dataset: {e}", exc_info=True)
        return None
src/backend/data/pinecone_db.py CHANGED
@@ -61,37 +61,6 @@ def get_index():
61
 
62
  index = get_index()
63
 
64
- def process_and_upsert_data(index, data: pd.DataFrame):
65
-
66
- # Validate if the required columns exist in the row (Series)
67
- try:
68
- logger.info("Started upserting the data to database")
69
- for idx, row in data.iterrows():
70
- logger.info(f"Processing row {row['input']}")
71
- input_text = row['input']
72
- output_text = row['output']
73
- instruction_text = row['instruction']
74
- if not isinstance(input_text, str) or not input_text.strip():
75
- logger.warning(f"Skipping row {idx} due to empty or invalid input text.")
76
- continue
77
- row_dict = {
78
- "question": input_text,
79
- "answer" : output_text,
80
- "instruction": instruction_text
81
- }
82
- embeddings = embedding_model.get_text_embedding(row['input'])
83
- index.upsert(
84
- vectors=[{
85
- "id": f"id{idx}",
86
- "values": embeddings,
87
- "metadata":row_dict
88
- }],
89
- namespace=NAMESPACE,
90
- )
91
- logger.info(f"Successfully upserted data for question {input_text} with answer {output_text}")
92
- except Exception as e:
93
- logger.error(f"Error processing row with index {idx}: {e}")
94
-
95
  def search_vector_store(query, n_result : int = 3) -> list[dict]:
96
  """
97
  Searches the vector store for the most relevant matches based on the given query.
@@ -138,27 +107,51 @@ def get_retrieved_context(prompt: str) -> str:
138
  return "\n".join(retrieved_contexts[:3])
139
  return "No relevant information found in the database."
140
 
141
- df = dataset.get_data_set()[6:200]
142
- # process_and_upsert_data(index, data_set)
143
- # response = search_vector_store("What is the treatment for diabetes?")
144
- # print(response)
145
 
 
 
 
 
 
 
 
 
 
146
 
147
- def upsert_data_in_db(df: pd.DataFrame):
148
- df["embedding"] = [embedding_model.get_text_embedding([q])[0] for q in tqdm(df["input"], desc="Embedding Questions")]
 
 
 
 
 
 
149
 
150
- # Upload data to Pinecone in batches
151
- BATCH_SIZE = 100
152
- vectors = []
153
 
154
- for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Storing Data in Pinecone"):
155
  batch = df.iloc[i : i + BATCH_SIZE]
156
- vectors = [
157
- (f"q_{idx}", emb, {"question": row[0], "answer": row[1], "instruction": row[2]})
158
- for idx, (emb, row) in enumerate(zip(batch["embedding"], batch.iterrows()))
159
- ]
160
- index.upsert(vectors) # Upsert (insert or update) in Pinecone
 
 
 
 
 
 
 
 
 
161
 
162
- print("All question-answer pairs stored successfully!")
163
 
164
- upsert_data_in_db(df)
 
 
 
 
 
61
 
62
  index = get_index()
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def search_vector_store(query, n_result : int = 3) -> list[dict]:
65
  """
66
  Searches the vector store for the most relevant matches based on the given query.
 
107
  return "\n".join(retrieved_contexts[:3])
108
  return "No relevant information found in the database."
109
 
110
def upsert_data_in_db(df: pd.DataFrame):
    """
    Generates embeddings for the given DataFrame and uploads data to Pinecone in batches.

    Parameters:
    - df (pd.DataFrame): DataFrame containing 'input' (question) and 'output'
      (answer) columns. The frame itself is not modified.

    Returns:
    - None
    """
    if df is None or df.empty:
        logger.warning("No rows to upsert; skipping.")
        return

    try:
        # Keep embeddings in a local list instead of adding a column: callers
        # pass a slice of the dataset, and assigning to a slice triggers
        # pandas' SettingWithCopy behaviour.
        embeddings = [
            embedding_model.get_text_embedding([q])[0]
            for q in tqdm(df["input"], desc="Generating Embeddings")
        ]
    except Exception as e:
        logger.error(f"Error generating embeddings: {e}")
        return

    # Upload data to Pinecone in batches
    BATCH_SIZE = 500
    failed_batches = 0

    for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Uploading Data to Pinecone"):
        batch = df.iloc[i : i + BATCH_SIZE]
        batch_embeddings = embeddings[i : i + BATCH_SIZE]

        # Build IDs from the DataFrame's own index labels rather than the
        # position inside this call's slice; position-based IDs restart at
        # q_0 on every call, so a later range would overwrite an earlier one.
        vectors = [
            (
                f"q_{row_label}",
                embedding,
                {
                    "question": row_data.get("input"),
                    "answer": row_data.get("output"),
                },
            )
            for embedding, (row_label, row_data) in zip(batch_embeddings, batch.iterrows())
        ]

        try:
            index.upsert(vectors)
        except Exception as e:
            failed_batches += 1
            logger.error(f"Error uploading batch starting at index {i}: {e}")

    # Only claim success when every batch was actually accepted.
    if failed_batches:
        logger.error(f"{failed_batches} batch(es) failed to upload to Pinecone.")
    else:
        logger.info("All question-answer pairs stored successfully!")
152
 
153
+
154
+ # df = dataset.get_data_set()[19000:21000]
155
+ # upsert_data_in_db(df)
156
+ # response = search_vector_store("What is the treatment for diabetes?")
157
+ # print(response)
src/backend/main.py CHANGED
@@ -1,7 +1,7 @@
1
  from fastapi import FastAPI
2
- from routes import chat_api
3
 
4
  app = FastAPI()
5
 
6
  app.include_router(chat_api.router, prefix="/chat", tags=["chat"])
7
- # app.include_router(upsert_data.router, prefix="/data", tags=["data"])
 
1
  from fastapi import FastAPI
2
+ from routes import chat_api,upsert_data
3
 
4
  app = FastAPI()
5
 
6
  app.include_router(chat_api.router, prefix="/chat", tags=["chat"])
7
+ app.include_router(upsert_data.router, prefix="/data", tags=["data"])
src/backend/models/__pycache__/embedding_model.cpython-313.pyc CHANGED
Binary files a/src/backend/models/__pycache__/embedding_model.cpython-313.pyc and b/src/backend/models/__pycache__/embedding_model.cpython-313.pyc differ
 
src/backend/models/__pycache__/llm_model.cpython-313.pyc CHANGED
Binary files a/src/backend/models/__pycache__/llm_model.cpython-313.pyc and b/src/backend/models/__pycache__/llm_model.cpython-313.pyc differ
 
src/backend/models/__pycache__/schemas.cpython-313.pyc CHANGED
Binary files a/src/backend/models/__pycache__/schemas.cpython-313.pyc and b/src/backend/models/__pycache__/schemas.cpython-313.pyc differ
 
src/backend/models/embedding_model.py CHANGED
@@ -9,9 +9,8 @@ model = SentenceTransformer("all-MiniLM-L6-v2")
9
 
10
  def get_text_embedding(search_query: str):
11
  try:
12
- logger.info(f"Getting embedding for the text: {search_query}")
13
  text_embedding = model.encode(search_query, convert_to_tensor=True).cpu().numpy().tolist()
14
- logger.info("Text embedding successfully retrieved.")
15
  return text_embedding
16
  except Exception as e:
17
  logger.error(f"Error while getting embedding for text: {e}")
 
9
 
10
  def get_text_embedding(search_query: str):
11
  try:
 
12
  text_embedding = model.encode(search_query, convert_to_tensor=True).cpu().numpy().tolist()
13
+ # logger.info("Text embedding successfully retrieved.")
14
  return text_embedding
15
  except Exception as e:
16
  logger.error(f"Error while getting embedding for text: {e}")
src/backend/models/llm_model.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  from groq import Groq
3
  from utils import logger
4
  from data import chroma_db
 
5
  from dotenv import load_dotenv
6
 
7
  load_dotenv()
@@ -25,7 +26,7 @@ def get_medical_assistant_response(prompt: list):
25
  if not prompt or len(prompt[0]) < 5:
26
  return "⚠️ Your question seems too short. Please provide more details so I can assist you better."
27
  query = prompt[-1]
28
- response = chroma_db.search_vector_store(query)
29
 
30
  if response and "metadatas" in response and response["metadatas"]:
31
  retrieved_contexts = [metadata['answer'] for metadata in response["metadatas"][0]]
@@ -68,7 +69,7 @@ def get_medical_assistant_request(conversation_history: list):
68
  return "⚠️ Please provide more details so I can assist you better."
69
  latest_user_message = conversation_history[-1]["content"]
70
  retrieved_contexts = []
71
- chroma_response = chroma_db.search_vector_store(latest_user_message)
72
  if chroma_response and "metadatas" in chroma_response and chroma_response["metadatas"]:
73
  retrieved_contexts = [metadata['answer'] for metadata in chroma_response["metadatas"][0]]
74
  context = "\n".join(retrieved_contexts[:3]) if retrieved_contexts else "No relevant information found in the database."
 
2
  from groq import Groq
3
  from utils import logger
4
  from data import chroma_db
5
+ from data import pinecone_db
6
  from dotenv import load_dotenv
7
 
8
  load_dotenv()
 
26
  if not prompt or len(prompt[0]) < 5:
27
  return "⚠️ Your question seems too short. Please provide more details so I can assist you better."
28
  query = prompt[-1]
29
+ response = pinecone_db.search_vector_store(query)
30
 
31
  if response and "metadatas" in response and response["metadatas"]:
32
  retrieved_contexts = [metadata['answer'] for metadata in response["metadatas"][0]]
 
69
  return "⚠️ Please provide more details so I can assist you better."
70
  latest_user_message = conversation_history[-1]["content"]
71
  retrieved_contexts = []
72
+ chroma_response = pinecone_db.search_vector_store(latest_user_message)
73
  if chroma_response and "metadatas" in chroma_response and chroma_response["metadatas"]:
74
  retrieved_contexts = [metadata['answer'] for metadata in chroma_response["metadatas"][0]]
75
  context = "\n".join(retrieved_contexts[:3]) if retrieved_contexts else "No relevant information found in the database."
src/backend/models/schemas.py CHANGED
@@ -6,4 +6,8 @@ class Chat_Response(BaseModel):
6
  response: Optional[Dict] = None
7
 
8
  class ChatRequest(BaseModel):
9
- conversation_history: List[Dict]
 
 
 
 
 
6
  response: Optional[Dict] = None
7
 
8
class ChatRequest(BaseModel):
    # Request body carrying the chat so far as a list of dicts.
    # NOTE(review): presumably each dict is a message with a "content" key,
    # as read in llm_model.get_medical_assistant_request — confirm against the route.
    conversation_history: List[Dict]
10
+
11
class Add_Data_In_DB(BaseModel):
    # Request body for the data-upsert route: the dataset row range to load.
    # The route applies these as a slice, `get_data_set()[start:end]`.
    start: int  # slice start
    end: int  # slice stop
src/backend/routes/__pycache__/upsert_data.cpython-313.pyc CHANGED
Binary files a/src/backend/routes/__pycache__/upsert_data.cpython-313.pyc and b/src/backend/routes/__pycache__/upsert_data.cpython-313.pyc differ
 
src/backend/routes/upsert_data.py CHANGED
@@ -1,15 +1,19 @@
1
- # from fastapi import APIRouter,HTTPException
2
- # from data import dataset
3
- # from data import pinecone_db
 
4
 
5
- # router = APIRouter()
6
- # index_name = "question-answering-index"
7
 
8
- # @router.post("/upsert_data")
9
- # async def upsert_data():
10
- # try:
11
- # df = dataset.get_data_set()[0:1000]
12
- # pinecone_db.process_and_upsert_data(index_name, df)
13
- # return {"status": "success"}
14
- # except Exception as e:
15
- # raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
1
+ from fastapi import APIRouter,HTTPException
2
+ from data import dataset
3
+ from data import pinecone_db
4
+ from models.schemas import Add_Data_In_DB
5
 
6
+ router = APIRouter()
7
+ index_name = "question-answering-index"
8
 
9
@router.post("/upsert_data")
async def upsert_data(add_data: Add_Data_In_DB):
    """
    Embed a slice of the dataset and upsert it into the vector database.

    Parameters:
    - add_data (Add_Data_In_DB): request body with the 'start' and 'end'
      positions of the dataset slice to load.

    Returns:
    - dict: {"status": "success"} when the call completes.

    Raises:
    - HTTPException(400): if the requested range is negative or inverted.
    - HTTPException(500): if the dataset cannot be loaded or the upsert raises.
    """
    start = add_data.start
    end = add_data.end

    # Reject bad ranges up front; negative values would silently slice from
    # the end of the dataset.
    if start < 0 or end < start:
        raise HTTPException(
            status_code=400,
            detail="'start' must be >= 0 and 'end' must be >= 'start'.",
        )

    try:
        df = dataset.get_data_set()
        # get_data_set() returns None on failure; report that explicitly
        # instead of failing on the slice below.
        if df is None:
            raise RuntimeError("Dataset could not be loaded.")

        # upsert_data_in_db takes only the DataFrame; passing index_name as a
        # second positional argument raised TypeError on every request.
        pinecone_db.upsert_data_in_db(df[start:end])
        return {"status": "success"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
src/frontend/app/__pycache__/common_fuctions.cpython-313.pyc CHANGED
Binary files a/src/frontend/app/__pycache__/common_fuctions.cpython-313.pyc and b/src/frontend/app/__pycache__/common_fuctions.cpython-313.pyc differ
 
src/frontend/app/__pycache__/homepage.cpython-313.pyc CHANGED
Binary files a/src/frontend/app/__pycache__/homepage.cpython-313.pyc and b/src/frontend/app/__pycache__/homepage.cpython-313.pyc differ
 
src/frontend/app/common_fuctions.py CHANGED
@@ -3,6 +3,7 @@ import base64
3
  import requests
4
  from dotenv import load_dotenv
5
  from utils import logger
 
6
 
7
  load_dotenv()
8
  logger = logger.get_logger()
@@ -27,12 +28,29 @@ def get_api_response(endpoint:str, prompt: list):
27
  logger.info(f"Sending user prompt to API endpoint: {API_URL}{endpoint}")
28
  response = requests.post(f"{API_URL}{endpoint}", json={"prompt": prompt})
29
  if response.status_code == 200:
30
- return response.json()["response"]
31
  else:
32
  return "An error occurred while processing your request."
33
  except Exception as e:
34
  return f"An error occurred while processing your request: {str(e)}"
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def initialize_conversation():
37
 
38
  assistant_message = "Hello! I am Yuvabe Care Companion AI. How can I assist you with your health-related queries today?"
 
3
  import requests
4
  from dotenv import load_dotenv
5
  from utils import logger
6
+ import json
7
 
8
  load_dotenv()
9
  logger = logger.get_logger()
 
28
  logger.info(f"Sending user prompt to API endpoint: {API_URL}{endpoint}")
29
  response = requests.post(f"{API_URL}{endpoint}", json={"prompt": prompt})
30
  if response.status_code == 200:
31
+ return response.json()
32
  else:
33
  return "An error occurred while processing your request."
34
  except Exception as e:
35
  return f"An error occurred while processing your request: {str(e)}"
36
 
37
+
38
def upsert_data_request(start, end, timeout=None):
    """
    Ask the backend to upsert dataset rows start..end into the vector database.

    Parameters:
    - start: start position of the dataset slice.
    - end: end position of the dataset slice.
    - timeout: optional timeout in seconds passed to requests.post; None
      (the default) waits indefinitely, as before.

    Returns:
    - requests.Response: the backend's response, whatever its status code.
    - None: if the request could not be sent or completed (the error is logged).
    """
    payload = {"start": start, "end": end}
    # Use the configured backend base URL, like get_api_response, instead of
    # a hard-coded localhost address.
    url = f"{API_URL}/data/upsert_data"

    try:
        # json= serialises the payload and sets the Content-Type header.
        return requests.post(url, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as err:
        # requests.post only raises transport-level errors here; HTTP error
        # statuses come back on the returned response.
        logger.error(f"An error occurred: {err}")
        return None
53
+
54
  def initialize_conversation():
55
 
56
  assistant_message = "Hello! I am Yuvabe Care Companion AI. How can I assist you with your health-related queries today?"
src/frontend/app/homepage.py CHANGED
@@ -60,11 +60,11 @@ def handle_user_input():
60
  response = "⚠️ Oops! Something went wrong. Please try again."
61
 
62
  with st.chat_message("assistant"):
63
- st.markdown(response)
64
 
65
  st.session_state.messages.append({"role": "assistant", "content": response})
66
 
67
- logger.info(f"Assistant response: {response[:100]}...")
68
 
69
  # def handle_user_input():
70
 
 
60
  response = "⚠️ Oops! Something went wrong. Please try again."
61
 
62
  with st.chat_message("assistant"):
63
+ st.markdown(response['response'])
64
 
65
  st.session_state.messages.append({"role": "assistant", "content": response})
66
 
67
+ logger.info(f"Assistant response: {response['response'][:100]}...")
68
 
69
  # def handle_user_input():
70
 
src/frontend/pages/database_response_page.py CHANGED
@@ -19,10 +19,9 @@ if prompt:
19
  endpoint = "/chat/db_response"
20
  response = common_fuctions.get_api_response(endpoint, [prompt])
21
  st.subheader("✅ Relevant question and answer pair found in the database.")
22
- for metadata_group in response["metadatas"]:
23
- for entry in metadata_group:
24
- st.write("Question:", entry["question"])
25
- st.write("Answer:", entry["answer"])
26
  st.write("-" * 80)
27
 
28
  if st.button("Clear chat"):
 
19
  endpoint = "/chat/db_response"
20
  response = common_fuctions.get_api_response(endpoint, [prompt])
21
  st.subheader("✅ Relevant question and answer pair found in the database.")
22
+ for metadata in response:
23
+ st.write("Question:", metadata["question"])
24
+ st.write("Answer:", metadata["answer"])
 
25
  st.write("-" * 80)
26
 
27
  if st.button("Clear chat"):
src/frontend/pages/dataloader_page.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app import common_fuctions
2
+ import streamlit as st
3
+ from app import homepage
4
+ from utils import logger
5
+
6
+ logger = logger.get_logger()
7
+
8
+ homepage.config_homepage()
9
+ st.title("Data Loader")
10
+
11
def load_data():
    """
    Render the sidebar controls for loading a dataset range into the database.

    Reads a start and end index from the sidebar and, when the button is
    clicked, sends them to the backend via common_fuctions.upsert_data_request
    and shows whether the call succeeded.

    Returns:
    - None
    """
    st.sidebar.header("📊 Data Loading Parameters")
    start_index = st.sidebar.number_input("Select start index", min_value=0, value=0)
    end_index = st.sidebar.number_input("Select end index", min_value=0, value=100)

    if start_index > end_index:
        st.sidebar.error("⚠️ Start index must be earlier than the end index.")
        return

    if "load_clicked" not in st.session_state:
        st.session_state.load_clicked = False

    st.sidebar.info(f"Click the button to load data from index **{start_index} to {end_index}**.")
    clicked = st.sidebar.button(
        "🚀 Upsert Data",
        disabled=st.session_state.load_clicked,
        help="Click to insert data into the database",
    )
    if not clicked:
        return

    st.session_state.load_clicked = True
    try:
        with st.spinner("⏳ Upserting data... Please wait"):
            response = common_fuctions.upsert_data_request(start_index, end_index)

        # upsert_data_request returns None when the request itself failed,
        # otherwise the HTTP response from the backend.
        if response is not None and response.status_code == 200:
            st.success("Data upserted successfully!")
        else:
            status = "no response" if response is None else f"HTTP {response.status_code}"
            st.error(f"Failed to upsert data ({status}).")
            logger.error(f"Failed to upsert data ({status}).")
    except Exception as e:
        st.error(f"Error loading data: {e}")
        logger.error(f"Error loading data: {e}")
    finally:
        # Always re-enable the button; previously the flag stayed True after
        # a successful call and the button was disabled for the whole session.
        st.session_state.load_clicked = False
42
+
43
+ load_data()