pratikshahp committed on
Commit
352d09d
·
verified ·
1 Parent(s): 7e0a52c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import zipfile
4
+ import uuid
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+ from pinecone import Pinecone, ServerlessSpec
7
+ from langchain_community.document_loaders import WhatsAppChatLoader
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+
10
# Initialize Hugging Face embeddings.
# The model is named explicitly instead of relying on the library default, so a
# future change of that default cannot silently alter the vector size. This
# model produces 768-dimensional vectors.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
12
+
13
def _read_chat_text(file) -> str:
    """Extract the uploaded zip and return the text of its chat export.

    Only ``.txt`` files at the top level of the archive are considered; when
    several exist, the alphabetically first one is used so the choice is
    deterministic.

    Raises:
        ValueError: If the archive contains no top-level ``.txt`` file.
    """
    import tempfile

    # A fresh per-call directory prevents files left over from an earlier
    # upload from being mistaken for this one, and is removed on exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(file, 'r') as z:
            z.extractall(temp_dir)

        chat_files = sorted(
            f for f in os.listdir(temp_dir) if f.endswith('.txt')
        )
        if not chat_files:
            raise ValueError("No chat files found in the zip archive.")

        loader = WhatsAppChatLoader(path=os.path.join(temp_dir, chat_files[0]))
        # Materialize while the extracted file still exists: lazy_load()
        # reads from disk only when iterated.
        messages = list(loader.lazy_load())

    return "\n".join(doc.page_content for doc in messages)


def load_chat_content(pinecone_key, file) -> str:
    """Load chat content from the uploaded zip file and store it in Pinecone.

    The chat text is split into overlapping chunks, each chunk is embedded
    with the module-level ``embeddings`` model, and the resulting vectors are
    upserted (with the chunk text as metadata) into the
    ``whatsapp-group-chat-index`` index, which is created if it is missing.

    Args:
        pinecone_key: Pinecone API key used to create the client.
        file: The uploaded zip archive, as a path or file-like object
            accepted by ``zipfile.ZipFile``.

    Returns:
        A confirmation message once all chunks have been upserted.

    Raises:
        ValueError: If no file was uploaded, the archive holds no chat file,
            or the chat file yields no text to index.
    """
    if file is None:
        raise ValueError("No zip file was uploaded.")

    # Validate and process the upload before touching Pinecone, so a bad
    # archive never results in an index being created.
    chat_content = _read_chat_text(file)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=150,
        length_function=len,
    )
    texts = [
        chunk.page_content
        for chunk in text_splitter.create_documents([chat_content])
    ]
    if not texts:
        raise ValueError("The chat file contains no text to index.")

    # One batched call instead of one call per chunk.
    vectors = embeddings.embed_documents(texts)

    pc = Pinecone(api_key=pinecone_key)
    index_name = "whatsapp-group-chat-index"

    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            # Derived from the actual vectors so the index always matches
            # whichever embedding model is configured.
            dimension=len(vectors[0]),
            metric="cosine",
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )

    index = pc.Index(index_name)

    # Each chunk gets a random unique ID; the chunk text rides along as
    # metadata so it can be recovered from query results.
    records = [
        (str(uuid.uuid4()), vector, {"text": text})
        for text, vector in zip(texts, vectors)
    ]

    # Upsert in bounded batches to stay under Pinecone's per-request limits.
    batch_size = 50
    for start in range(0, len(records), batch_size):
        index.upsert(records[start:start + batch_size])

    return "Chat content has been successfully upserted to Pinecone."
69
+
70
# Gradio front end: a password-style box for the Pinecone API key and a file
# picker for the zipped chat export, both passed to load_chat_content; the
# returned status message is shown as plain text.
interface = gr.Interface(
    title="WhatsApp Chat Upsert to Pinecone",
    description="Upload a zip file containing a WhatsApp chat file and upsert its content to Pinecone.",
    fn=load_chat_content,
    inputs=[
        gr.Textbox(type="password", label="Enter Pinecone API Key"),
        gr.File(label="Upload WhatsApp Chat Zip File"),
    ],
    outputs="text",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()