IProject-10 committed on
Commit
9d9952b
·
verified ·
1 Parent(s): 63cef6d

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +215 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import os
4
+ import uuid
5
+ import nltk
6
+ import trafilatura
7
+ import chromadb
8
+ import tiktoken
9
+ import gradio as gr
10
+
11
+ from langchain_core.prompts import ChatPromptTemplate
12
+ from langchain_core.runnables import RunnableLambda, RunnablePassthrough
13
+ from langchain_core.output_parsers import StrOutputParser
14
+ from langchain_together import ChatTogether
15
+ from langchain_community.vectorstores import Chroma
16
+ from langchain_community.embeddings import HuggingFaceEmbeddings
17
+ from sentence_transformers import SentenceTransformer
18
+ from nltk.tokenize import sent_tokenize
19
+
20
+ # Download NLTK resources
21
# Fetch the sentence-tokenizer data used by `sent_tokenize`.
# Recent NLTK releases load the "punkt_tab" resource instead of the pickled
# "punkt" model, so request both; `nltk.download` is expected to report (not
# raise) when a resource id is unknown to the installed version.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

# Initialize tokenizer (used only to measure chunk length in tokens)
tokenizer = tiktoken.get_encoding("cl100k_base")

# Initialize embedding model.
# `embedding_model` embeds chunks at indexing time; `embedding_function` is
# handed to the LangChain vector store for query-time embedding. Both name the
# same checkpoint so stored and query vectors come from the same model.
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Initialize ChromaDB (persisted on disk under ./chroma_store)
chroma_client = chromadb.PersistentClient(path="./chroma_store")
collection = chroma_client.get_or_create_collection(name="imageonline_chunks")
33
+
34
+ # Sectioned URL List
35
# Pages to index, listed as (section label, page URLs) pairs in display order.
_SECTION_PAGES = (
    (
        "Website Designing",
        (
            "https://www.imageonline.co.in/website-designing-mumbai.html",
            "https://www.imageonline.co.in/domain-hosting-services-india.html",
            "https://www.imageonline.co.in/best-seo-company-mumbai.html",
            "https://www.imageonline.co.in/wordpress-blog-designing-india.html",
            "https://www.imageonline.co.in/social-media-marketing-company-mumbai.html",
            "https://www.imageonline.co.in/website-template-customization-india.html",
            "https://www.imageonline.co.in/regular-website-maintanence-services.html",
            "https://www.imageonline.co.in/mobile-app-designing-mumbai.html",
            "https://www.imageonline.co.in/web-application-screen-designing.html",
        ),
    ),
    (
        "Website Development",
        (
            "https://www.imageonline.co.in/website-development-mumbai.html",
            "https://www.imageonline.co.in/open-source-customization.html",
            "https://www.imageonline.co.in/ecommerce-development-company-mumbai.html",
            "https://www.imageonline.co.in/website-with-content-management-system.html",
            "https://www.imageonline.co.in/web-application-development-india.html",
        ),
    ),
    (
        "Mobile App Development",
        (
            "https://www.imageonline.co.in/mobile-app-development-company-mumbai.html",
        ),
    ),
    (
        "About Us",
        (
            "https://www.imageonline.co.in/about-us.html",
            "https://www.imageonline.co.in/vision.html",
            "https://www.imageonline.co.in/team.html",
        ),
    ),
    (
        "Testimonials",
        (
            "https://www.imageonline.co.in/testimonial.html",
        ),
    ),
)

# Section label -> list of page URLs; the label is stored as chunk metadata.
url_dict = {label: list(pages) for label, pages in _SECTION_PAGES}
66
+
67
+ # Helper functions
68
def extract_clean_text(url):
    """Download *url* and return its main text content.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text produced by ``trafilatura.extract`` (comments and tables
        excluded), or ``None`` when the page could not be fetched, yielded no
        text, or any exception was raised along the way.
    """
    try:
        print(f"πŸ”— Fetching URL: {url}")
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            print(f"⚠️ Failed to fetch content from {url}")
            return None

        content = trafilatura.extract(
            downloaded, include_comments=False, include_tables=False
        )
        # Extraction can come back empty; only announce success when there
        # is actually text for the caller to index.
        if not content:
            print(f"⚠️ No text could be extracted from {url}")
            return None

        print(f"βœ… Extracted text from {url}")
        return content
    except Exception as e:
        # Best-effort scraping: one bad page must not abort the whole load.
        print(f"❌ Error fetching {url}: {e}")
    return None
81
+
82
def chunk_text(text, max_tokens=400, *, split_sentences=None, count_tokens=None):
    """Split *text* into sentence-aligned chunks of at most *max_tokens* tokens.

    Sentences are accumulated greedily; when adding the next sentence would
    push the joined chunk over the budget, the current chunk is closed and the
    sentence starts a new one. A single sentence that is longer than the
    budget is kept whole as its own chunk (it is never split mid-sentence).

    Args:
        text: Text to split. Empty or ``None`` yields no chunks.
        max_tokens: Token budget per chunk.
        split_sentences: Optional callable ``str -> list[str]``. Defaults to
            the module's ``sent_tokenize``.
        count_tokens: Optional callable ``str -> int``. Defaults to the length
            of ``tokenizer.encode(...)`` using the module-level tokenizer.

    Returns:
        List of non-empty chunk strings, in document order.
    """
    # Resolve defaults lazily so the module-level tokenizers are only touched
    # when the caller did not supply replacements.
    if split_sentences is None:
        split_sentences = sent_tokenize
    if count_tokens is None:
        def count_tokens(piece):
            return len(tokenizer.encode(piece))

    chunks = []
    current_chunk = []

    def _close_current():
        # Never emit empty chunks: they carry no content to embed or retrieve.
        joined = " ".join(current_chunk).strip()
        if joined:
            chunks.append(joined)

    sentences = split_sentences(text) if text else []
    for sentence in sentences:
        candidate = current_chunk + [sentence]
        # Only a non-empty chunk can be closed; an oversized first sentence
        # simply becomes the start of the chunk instead of producing "".
        if current_chunk and count_tokens(" ".join(candidate)) > max_tokens:
            _close_current()
            current_chunk = [sentence]
        else:
            current_chunk = candidate

    if current_chunk:
        _close_current()

    print(f"πŸ“„ Text split into {len(chunks)} chunks.")
    return chunks
100
+
101
+ # Check refresh override
102
force_refresh = os.getenv("FORCE_REFRESH", "false").strip().lower() == "true"

# A forced refresh must start from an empty collection: chunk ids are fresh
# UUIDs on every run, so adding on top of existing rows would store every
# chunk a second time and skew retrieval with duplicates.
if force_refresh and collection.count() > 0:
    print("πŸ”„ FORCE_REFRESH set: clearing existing ChromaDB collection...")
    chroma_client.delete_collection(name="imageonline_chunks")
    collection = chroma_client.get_or_create_collection(name="imageonline_chunks")

# Load data into ChromaDB
if collection.count() == 0:
    print("πŸ”„ Loading documents into ChromaDB...")
    for section, urls in url_dict.items():
        for url in urls:
            text = extract_clean_text(url)
            if not text:
                continue

            chunks = chunk_text(text)
            if not chunks:
                # Nothing to embed for this page; skip the add call entirely.
                continue

            embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
            # Every chunk of a page shares the page URL and its section label.
            metadatas = [{"source": url, "section": section} for _ in chunks]
            ids = [str(uuid.uuid4()) for _ in chunks]

            collection.add(
                documents=chunks,
                embeddings=embeddings.tolist(),
                metadatas=metadatas,
                ids=ids,
            )
    print("βœ… Document loading complete.")
else:
    print("βœ… Using existing ChromaDB collection.")
126
+
127
+ # Vectorstore & Retriever
128
# LangChain view over the same persisted collection that was filled above;
# queries are embedded with `embedding_function`.
vectorstore = Chroma(
    collection_name="imageonline_chunks",
    embedding_function=embedding_function,
    client=chroma_client,
)

# Ask the store for the top 3 matches per query.
_search_options = {"k": 3}
retriever = vectorstore.as_retriever(search_kwargs=_search_options)
135
+
136
+ # Together.ai LLM
137
# Generation settings for the Together.ai chat model; the API key is read
# from the TOGETHER_API_KEY environment variable.
_llm_settings = {
    "model": "meta-llama/Llama-3-8b-chat-hf",
    "temperature": 0.3,
    "max_tokens": 1024,
    "top_p": 0.7,
    "together_api_key": os.getenv("TOGETHER_API_KEY"),
}
llm = ChatTogether(**_llm_settings)
144
+
145
+ # Prompt template (refined)
146
# Template with two placeholders: {context} (retrieved chunks) and
# {question} (the user's message).
_PROMPT_TEMPLATE = """
You are a helpful assistant for ImageOnline Web Solutions.

Use ONLY the information provided in the context to answer the user's query.

Context:
{context}

Question:
{question}

If the answer is not found in the context, say "I'm sorry, I don't have enough information to answer that."
"""

prompt = ChatPromptTemplate.from_template(_PROMPT_TEMPLATE)
159
+
160
+ # Context retrieval
161
def retrieve_and_format(query):
    """Retrieve documents for *query* and render them as one context string.

    Args:
        query: The user's question, passed unchanged to the retriever.

    Returns:
        One entry per retrieved document, formatted as
        ``"[<section>] <content>\\n(Source: <source>)"`` and separated by blank
        lines. Missing metadata fields render as empty strings; no documents
        yields an empty string.
    """
    # `invoke` is the Runnable entry point for retrievers;
    # `get_relevant_documents` is deprecated in langchain-core.
    docs = retriever.invoke(query)

    context_strings = []
    for doc in docs:
        metadata = doc.metadata or {}
        section = metadata.get("section", "")
        source = metadata.get("source", "")
        context_strings.append(f"[{section}] {doc.page_content}\n(Source: {source})")
    return "\n\n".join(context_strings)
171
+
172
+ # RAG chain
173
# The incoming message fans out into the two prompt variables: "context" is
# produced by retrieval, "question" is the message passed through as-is.
_chain_inputs = {
    "context": RunnableLambda(retrieve_and_format),
    "question": RunnablePassthrough(),
}

# inputs -> filled prompt -> chat model -> plain string
rag_chain = _chain_inputs | prompt | llm | StrOutputParser()
179
+
180
+ # Gradio Interface
181
def chat_interface(message, history):
    """Answer *message* with the RAG chain and append the exchange to the chat.

    Args:
        message: Text typed by the user.
        history: Previous ``(user_text, bot_text)`` pairs, or ``None``.

    Returns:
        ``(history, history)``: the same updated list twice, one for the
        chatbot display and one for the state component. Blank messages leave
        the history unchanged. Failures while generating are reported as a bot
        message instead of being raised.
    """
    # Work on a copy so the list owned by the caller is never mutated.
    history = list(history or [])

    # Nothing to ask: skip the retrieval and LLM call entirely.
    if not message or not message.strip():
        return history, history

    user_turn = "πŸ§‘ You: " + message
    try:
        bot_turn = "πŸ€– Bot: " + rag_chain.invoke(message)
    except Exception as e:
        # UI boundary: surface the failure in the chat rather than crashing.
        error_msg = f"⚠️ Error: {str(e)}"
        bot_turn = f"πŸ€– Bot: {error_msg}"

    history.append((user_turn, bot_turn))
    return history, history
191
+
192
def launch_gradio():
    """Build the Gradio chat UI and return it without starting the server.

    Returns:
        The ``gr.Blocks`` app; the caller is responsible for ``.launch()``.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# πŸ’¬ ImageOnline RAG Chatbot")
        gr.Markdown("Ask about Website Designing, App Development, SEO, Hosting, etc.")

        chatbot = gr.Chatbot()
        state = gr.State([])

        with gr.Row():
            msg = gr.Textbox(placeholder="Ask your question here...", show_label=False, scale=8)
            send_btn = gr.Button("πŸ“¨ Send", scale=1)

        # Pressing Enter and clicking Send behave the same: answer the
        # message, then empty the textbox so the sent text does not linger.
        for register in (msg.submit, send_btn.click):
            register(
                chat_interface, inputs=[msg, state], outputs=[chatbot, state]
            ).then(fn=lambda: "", inputs=None, outputs=msg)

        with gr.Row():
            clear_btn = gr.Button("🧹 Clear Chat")
            # Reset both the visible chat and the stored history.
            clear_btn.click(fn=lambda: ([], []), outputs=[chatbot, state])

    return demo
212
+
213
# Script entry point: build the UI, then start the Gradio server.
# `demo` is bound at module level before launching.
if __name__ == "__main__":
    demo = launch_gradio()
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-together
3
+ langchain-community
4
+ chromadb
5
+ sentence-transformers
6
+ trafilatura
7
+ beautifulsoup4
8
+ nltk
9
+ tiktoken
10
+ gradio
11
+ together