pratikshahp committed on
Commit
d216b93
·
verified ·
1 Parent(s): bbf555f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -21
app.py CHANGED
@@ -6,29 +6,34 @@ from langchain_huggingface import HuggingFaceEmbeddings
6
  from pinecone import Pinecone, ServerlessSpec
7
  from langchain_community.document_loaders import WhatsAppChatLoader
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
9
 
10
- # Initialize Hugging Face embeddings
11
- embeddings = HuggingFaceEmbeddings()
12
 
13
- def load_chat_content(pinecone_key, file) -> str:
14
- """Load chat content from the uploaded zip file and store it in Pinecone."""
15
- # Initialize Pinecone
16
- pc = Pinecone(api_key=pinecone_key)
17
- index_name = "whatsapp-group-chat-index"
18
-
19
- if index_name not in pc.list_indexes().names():
20
- pc.create_index(
21
- name=index_name,
22
- dimension=768, # change as per embedding model
23
- metric="cosine",
24
- spec=ServerlessSpec(
25
- cloud='aws',
26
- region='us-east-1'
27
- )
28
  )
 
29
 
30
- index = pc.Index(index_name)
31
 
 
 
 
 
 
32
  # Load and process the ZIP file
33
  temp_dir = 'temp_extracted_files'
34
  os.makedirs(temp_dir, exist_ok=True)
@@ -57,20 +62,32 @@ def load_chat_content(pinecone_key, file) -> str:
57
 
58
  # Store chunks in Pinecone with unique IDs
59
  vectors_to_upsert = []
 
60
  for i, chunk in enumerate(chunks):
61
  vector = embeddings.embed_documents([chunk.page_content])[0]
62
  unique_id = str(uuid.uuid4()) # Generate a unique ID
63
  vectors_to_upsert.append((unique_id, vector, {"text": chunk.page_content}))
 
64
 
 
65
  index.upsert(vectors_to_upsert)
66
 
67
- return "Chat content has been successfully upserted to Pinecone."
 
 
 
 
 
 
 
 
 
 
68
 
69
  # Define the Gradio interface
70
  interface = gr.Interface(
71
  fn=load_chat_content,
72
  inputs=[
73
- gr.Textbox(label="Enter Pinecone API Key", type="password"),
74
  gr.File(label="Upload WhatsApp Chat Zip File")
75
  ],
76
  outputs="text",
@@ -79,4 +96,4 @@ interface = gr.Interface(
79
  )
80
 
81
  if __name__ == "__main__":
82
- interface.launch()
 
6
  from pinecone import Pinecone, ServerlessSpec
7
  from langchain_community.document_loaders import WhatsAppChatLoader
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from dotenv import load_dotenv
10
 
11
+ # Load environment variables from the .env file
12
+ load_dotenv()
13
 
14
+ # Initialize Pinecone and the index outside the function
15
+ pinecone_key = os.getenv("PINECONE_API_KEY")
16
+ pc = Pinecone(api_key=pinecone_key)
17
+ index_name = "whatsapp-chat-index"
18
+
19
+ if index_name not in pc.list_indexes().names():
20
+ pc.create_index(
21
+ name=index_name,
22
+ dimension=768, # change as per embedding model
23
+ metric="cosine",
24
+ spec=ServerlessSpec(
25
+ cloud='aws',
26
+ region='us-east-1'
 
 
27
  )
28
+ )
29
 
30
+ index = pc.Index(index_name)
31
 
32
+ # Initialize Hugging Face embeddings
33
+ embeddings = HuggingFaceEmbeddings()
34
+
35
+ def load_chat_content(file) -> str:
36
+ """Load chat content from the uploaded zip file and store it in Pinecone."""
37
  # Load and process the ZIP file
38
  temp_dir = 'temp_extracted_files'
39
  os.makedirs(temp_dir, exist_ok=True)
 
62
 
63
  # Store chunks in Pinecone with unique IDs
64
  vectors_to_upsert = []
65
+ unique_ids = []
66
  for i, chunk in enumerate(chunks):
67
  vector = embeddings.embed_documents([chunk.page_content])[0]
68
  unique_id = str(uuid.uuid4()) # Generate a unique ID
69
  vectors_to_upsert.append((unique_id, vector, {"text": chunk.page_content}))
70
+ unique_ids.append(unique_id)
71
 
72
+ # Upsert vectors to Pinecone
73
  index.upsert(vectors_to_upsert)
74
 
75
+ # Verify insertion by querying Pinecone with the unique IDs
76
+ inserted_ids = []
77
+ for uid in unique_ids:
78
+ result = index.fetch(ids=[uid])
79
+ if uid in result["matches"]:
80
+ inserted_ids.append(uid)
81
+
82
+ if len(inserted_ids) == len(unique_ids):
83
+ return "All chat content has been successfully upserted to Pinecone."
84
+ else:
85
+ return f"Insertion incomplete. Only {len(inserted_ids)} out of {len(unique_ids)} chunks were inserted."
86
 
87
  # Define the Gradio interface
88
  interface = gr.Interface(
89
  fn=load_chat_content,
90
  inputs=[
 
91
  gr.File(label="Upload WhatsApp Chat Zip File")
92
  ],
93
  outputs="text",
 
96
  )
97
 
98
  if __name__ == "__main__":
99
+ interface.launch()