File size: 8,972 Bytes
8af136f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import nltk
# Fetch the Punkt resources; sent_tokenize (imported further down) is the
# NLTK entry point this script uses for sentence splitting.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

# Pages to scrape, grouped by site section. The section name is stored
# alongside each page's text so results can be tagged by section later.
_WEBSITE_DESIGNING_URLS = [
    "https://www.imageonline.co.in/website-designing-mumbai.html",
    "https://www.imageonline.co.in/domain-hosting-services-india.html",
    "https://www.imageonline.co.in/best-seo-company-mumbai.html",
    "https://www.imageonline.co.in/wordpress-blog-designing-india.html",
    "https://www.imageonline.co.in/social-media-marketing-company-mumbai.html",
    "https://www.imageonline.co.in/website-template-customization-india.html",
    "https://www.imageonline.co.in/regular-website-maintanence-services.html",
    "https://www.imageonline.co.in/mobile-app-designing-mumbai.html",
    "https://www.imageonline.co.in/web-application-screen-designing.html",
]

_WEBSITE_DEVELOPMENT_URLS = [
    "https://www.imageonline.co.in/website-development-mumbai.html",
    "https://www.imageonline.co.in/open-source-customization.html",
    "https://www.imageonline.co.in/ecommerce-development-company-mumbai.html",
    "https://www.imageonline.co.in/website-with-content-management-system.html",
    "https://www.imageonline.co.in/web-application-development-india.html",
]

_MOBILE_APP_DEVELOPMENT_URLS = [
    "https://www.imageonline.co.in/mobile-app-development-company-mumbai.html",
]

_ABOUT_US_URLS = [
    "https://www.imageonline.co.in/about-us.html",
    "https://www.imageonline.co.in/vision.html",
    "https://www.imageonline.co.in/team.html",
]

_TESTIMONIALS_URLS = [
    "https://www.imageonline.co.in/testimonial.html",
]

# Insertion order matters only for the order in which pages are scraped.
url_dict = {
    "Website Designing": _WEBSITE_DESIGNING_URLS,
    "Website Development": _WEBSITE_DEVELOPMENT_URLS,
    "Mobile App Development": _MOBILE_APP_DEVELOPMENT_URLS,
    "About Us": _ABOUT_US_URLS,
    "Testimonials": _TESTIMONIALS_URLS,
}

import trafilatura
import requests

# Function to extract clean text using trafilatura
def extract_clean_text(url):
    """
    Download ``url`` and return the main content extracted by trafilatura.

    Comments and tables are excluded from the extraction. The result is
    ``None`` when the download yields nothing or when any exception is raised
    along the way; in the latter case the error is printed, not propagated.
    """
    try:
        page = trafilatura.fetch_url(url)
        if not page:
            return None
        return trafilatura.extract(page, include_comments=False, include_tables=False)
    except Exception as err:
        print(f"Error fetching {url}: {err}")
        return None

# Scrape every configured page and keep its text together with the metadata
# (source URL + section name) that is later attached to each chunk.
scraped_data = []

for section, urls in url_dict.items():
    for url in urls:
        print(f"🟩 Scraping: {url}")
        text = extract_clean_text(url)

        # Pages that could not be fetched/extracted are reported and skipped.
        if not text:
            print(f"❌ Failed to extract content from {url}.\n")
            continue

        print(f"✅ Extracted {len(text)} characters.\n")
        scraped_data.append({
            "content": text,
            "metadata": {
                "source": url,
                "section": section,
            },
        })

print(f"Total pages scraped: {len(scraped_data)}")

import tiktoken
from nltk.tokenize import sent_tokenize

# Load tiktoken's "cl100k_base" encoding; chunk_text uses it to measure chunk
# length in tokens.
# NOTE(review): the embedding model and the LLM configured later in this
# script presumably use their own tokenizers, so these counts are likely only
# an approximation for them — confirm the 400-token budget is appropriate.
tokenizer = tiktoken.get_encoding("cl100k_base")

def chunk_text(text, max_tokens=400):
    """
    Split text into consecutive, non-overlapping chunks of whole sentences.

    Sentences (from ``sent_tokenize``) are accumulated greedily; a chunk is
    closed just before the sentence that would push its token count, as
    measured by the module-level ``tokenizer``, above ``max_tokens``.

    A single sentence longer than ``max_tokens`` is never split: it becomes a
    chunk of its own, so such a chunk can exceed the limit.

    Args:
        text: The text to split.
        max_tokens: Upper bound on tokens per chunk (see the caveat above).

    Returns:
        A list of non-empty chunk strings, in document order. Empty when
        ``text`` contains no sentences.
    """
    chunks = []
    current_chunk = []

    for sentence in sent_tokenize(text):
        candidate = current_chunk + [sentence]

        # Only close a chunk that already holds something; otherwise an
        # oversized leading sentence would produce an empty chunk.
        if current_chunk and len(tokenizer.encode(" ".join(candidate))) > max_tokens:
            chunks.append(" ".join(current_chunk).strip())
            current_chunk = [sentence]  # Overflow sentence starts the next chunk
        else:
            current_chunk = candidate

    # Flush whatever is left after the last sentence
    if current_chunk:
        chunks.append(" ".join(current_chunk).strip())

    # Drop chunks that are empty after stripping; they carry nothing to embed.
    return [chunk for chunk in chunks if chunk]

# Break every scraped page into chunks. All chunks of a page point at that
# page's metadata dict (same URL + section).
chunked_data = []

for page in scraped_data:
    page_metadata = page["metadata"]
    chunked_data.extend(
        {"content": piece, "metadata": page_metadata}
        for piece in chunk_text(page["content"], max_tokens=400)
    )

# Plain list of chunk texts, in the same order as chunked_data, for embedding
texts_to_embed = [entry["content"] for entry in chunked_data]

from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model used to embed the text chunks and,
# further down, the sample query.
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

def embed_chunks(text_list, model):
    """
    Encode all texts in ``text_list`` with ``model`` and return the result.

    ``model`` only needs an ``encode`` method; it is called once with the
    whole list and ``convert_to_numpy=True``, and its return value is handed
    back unchanged.
    """
    vectors = model.encode(text_list, convert_to_numpy=True)
    return vectors

# Generate one embedding per chunk (same order as texts_to_embed)
embeddings = embed_chunks(texts_to_embed, embedding_model)

print(f"✅ Generated {len(embeddings)} embeddings")
# Only report the shape when there is a first row to index.
if len(embeddings) > 0:
    print(f"🔹 Shape of first embedding: {embeddings[0].shape}")

import chromadb
import uuid

# Initialize ChromaDB client (persistent storage on disk under ./chroma_store)
chroma_client = chromadb.PersistentClient(path="./chroma_store")

# Create or get collection
collection = chroma_client.get_or_create_collection(name="imageonline_chunks")

# Extract documents and metadatas (parallel to `embeddings`)
documents = [item["content"] for item in chunked_data]
metadatas = [item["metadata"] for item in chunked_data]

# Deterministic IDs: the store is persistent, so random IDs would insert a
# fresh copy of every chunk on each run. Deriving the ID from the source URL
# and the chunk's position keeps IDs unique within a run and stable across
# runs, which lets upsert overwrite instead of duplicate.
# NOTE: entries written by earlier runs under random IDs are not removed.
ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{meta['source']}#chunk-{position}"))
    for position, meta in enumerate(metadatas)
]

# Safety check (explicit raise so it is not stripped under `python -O`)
if not (len(documents) == len(embeddings) == len(metadatas)):
    raise ValueError("Data length mismatch!")

# Insert new chunks / overwrite chunks stored under the same ID
collection.upsert(
    documents=documents,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
    ids=ids,
)

# Sample query: embed it with the same model as the chunks and look up the
# three nearest entries as a quick sanity check of the index.
query = "web design company"
query_embedding = embedding_model.encode([query])[0]

# Query ChromaDB
results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=3
)

# Display results. Index [0] selects the hits for the single query sent above.
for rank, (match_document, match_metadata) in enumerate(
    zip(results['documents'][0], results['metadatas'][0]), start=1
):
    print(f"\n🔍 Match {rank}:")
    print(f"Content: {match_document[:200]}...")
    print(f"📎 Metadata: {match_metadata}")

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_together import ChatTogether

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding function for the LangChain vector store. It names the same model
# ("BAAI/bge-base-en-v1.5") that produced the stored chunk embeddings, so
# query vectors and stored vectors come from one model.
embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# LangChain wrapper around the collection populated above: same client object
# and same collection name.
vectorstore = Chroma(
    client=chroma_client,  # the PersistentClient created earlier in this script
    collection_name="imageonline_chunks",
    embedding_function=embedding_function
)

# Retriever configured with k=3, i.e. three documents per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

def retrieve_and_format(query):
    """
    Retrieve the documents relevant to ``query`` and render them as context.

    Each retrieved document becomes one block of the form
    ``[<section>] <content>`` followed by ``(Source: <url>)`` on the next
    line; missing metadata fields are rendered as empty strings. Blocks are
    separated by a blank line.

    Args:
        query: The user's question.

    Returns:
        The formatted context string (empty when nothing was retrieved).
    """
    # `invoke` is the Runnable entry point of the retriever; it replaces the
    # deprecated `get_relevant_documents`.
    docs = retriever.invoke(query)

    return "\n\n".join(
        f"[{doc.metadata.get('section', '')}] {doc.page_content}\n"
        f"(Source: {doc.metadata.get('source', '')})"
        for doc in docs
    )

import os

# The Together API key is read from the environment so that no credential is
# stored in the source file.
# NOTE: the key that used to be hard-coded here has been exposed and should be
# revoked and replaced.
_together_api_key = os.getenv("TOGETHER_API_KEY")
if not _together_api_key:
    raise RuntimeError(
        "TOGETHER_API_KEY environment variable is not set; "
        "export it (or configure it as a secret) before starting the app."
    )

# Chat model that generates the final answer from the prompt.
llm = ChatTogether(
    model="meta-llama/Llama-3-8b-chat-hf",
    temperature=0.3,
    max_tokens=1024,
    top_p=0.7,
    together_api_key=_together_api_key,
)

# Prompt template with two placeholders: {context} (the formatted retrieved
# passages) and {question} (the user's query). The text instructs the model
# to answer only from the supplied context.
prompt = ChatPromptTemplate.from_template("""
You are an expert assistant for ImageOnline Web Solutions.

Answer the user's query based ONLY on the following context:

{context}

Query: {question}
""")

# RAG pipeline. The chain's input is used twice: it is passed to
# retrieve_and_format to build "context" and forwarded unchanged as
# "question". The resulting mapping is piped through the prompt, the LLM and
# finally an output parser whose result chat_interface treats as a string.
rag_chain = (
    {"context": RunnableLambda(retrieve_and_format), "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

import gradio as gr

def chat_interface(message, history):
    """
    Answer one user message through the RAG chain and extend the chat history.

    Args:
        message: Text typed by the user.
        history: List of ``(user_text, bot_text)`` tuples from earlier turns,
            or ``None``/empty for a new conversation. It is not modified.

    Returns:
        The updated history twice: once for the chatbot display and once for
        the stored state. A blank message returns the history unchanged.
        Failures of the chain are reported as the bot's reply, not raised.
    """
    # Work on a copy so the caller's list is never mutated in place.
    history = list(history or [])

    # Nothing to answer: skip the retrieval + LLM round trip entirely.
    if not message or not message.strip():
        return history, history

    try:
        # Call RAG pipeline
        reply = "🤖 Bot: " + rag_chain.invoke(message)
    except Exception as e:  # UI boundary: show the failure instead of crashing
        reply = f"🤖 Bot: ⚠️ Error: {str(e)}"

    history.append(("🧑 You: " + message, reply))
    return history, history

def launch_gradio():
    """
    Build the Gradio Blocks UI for the chatbot.

    The layout has a chat display, a text box with a send button, and a
    button that clears the conversation. Pressing Enter in the text box and
    clicking the send button both run ``chat_interface``.

    Returns:
        The assembled ``gr.Blocks`` app; the caller calls ``launch()`` on it.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# 💬 ImageOnline RAG Chatbot")
        gr.Markdown("Ask about Website Designing, App Development, SEO, Hosting, etc.")

        chatbot = gr.Chatbot()
        state = gr.State([])

        with gr.Row():
            msg = gr.Textbox(placeholder="Ask your question here...", show_label=False, scale=8)
            send_btn = gr.Button("📨 Send", scale=1)

        # Same handler for both triggers; afterwards empty the text box so the
        # previous question does not linger in the input field.
        for trigger in (msg.submit, send_btn.click):
            trigger(chat_interface, inputs=[msg, state], outputs=[chatbot, state]).then(
                lambda: "", inputs=None, outputs=msg
            )

        with gr.Row():
            clear_btn = gr.Button("🧹 Clear Chat")
            # Reset both the visible chat and the stored history
            clear_btn.click(fn=lambda: ([], []), outputs=[chatbot, state])

    return demo

# Build and serve the UI only when this file is executed as a script.
# NOTE: the scraping, embedding and indexing statements earlier in the file
# are at module level, so they run on import as well as on direct execution.
if __name__ == "__main__":
    demo = launch_gradio()
    demo.launch()