Spaces:

IProject-10
/

IOPL-Chatbot-2

Sleeping

App Files Files Community

IOPL-Chatbot-2 / app.py

IProject-10

Upload 2 files

8af136f verified about 1 month ago

raw

history blame

8.97 kB

	import nltk
	# Download the NLTK tokenizer data packages at start-up; sent_tokenize (imported
	# further down in this file) is the consumer of this data.
	for _nltk_resource in ('punkt', 'punkt_tab'):
	    nltk.download(_nltk_resource)

	# Pages to scrape, grouped by site section. The section name is stored as
	# metadata on every chunk taken from the pages listed under it.
	url_dict = {
	    "Website Designing": [
	        "https://www.imageonline.co.in/website-designing-mumbai.html",
	        "https://www.imageonline.co.in/domain-hosting-services-india.html",
	        "https://www.imageonline.co.in/best-seo-company-mumbai.html",
	        "https://www.imageonline.co.in/wordpress-blog-designing-india.html",
	        "https://www.imageonline.co.in/social-media-marketing-company-mumbai.html",
	        "https://www.imageonline.co.in/website-template-customization-india.html",
	        "https://www.imageonline.co.in/regular-website-maintanence-services.html",
	        "https://www.imageonline.co.in/mobile-app-designing-mumbai.html",
	        "https://www.imageonline.co.in/web-application-screen-designing.html",
	    ],
	    "Website Development": [
	        "https://www.imageonline.co.in/website-development-mumbai.html",
	        "https://www.imageonline.co.in/open-source-customization.html",
	        "https://www.imageonline.co.in/ecommerce-development-company-mumbai.html",
	        "https://www.imageonline.co.in/website-with-content-management-system.html",
	        "https://www.imageonline.co.in/web-application-development-india.html",
	    ],
	    "Mobile App Development": [
	        "https://www.imageonline.co.in/mobile-app-development-company-mumbai.html",
	    ],
	    "About Us": [
	        "https://www.imageonline.co.in/about-us.html",
	        "https://www.imageonline.co.in/vision.html",
	        "https://www.imageonline.co.in/team.html",
	    ],
	    "Testimonials": [
	        "https://www.imageonline.co.in/testimonial.html",
	    ],
	}

	import trafilatura
	import requests

	# Function to extract clean text using trafilatura
	def extract_clean_text(url):
	    """
	    Download a page and return its main text as extracted by trafilatura.

	    Comments and tables are excluded from the extraction. Returns None when
	    the download yields nothing or when any exception is raised (the error
	    is printed rather than propagated, so one bad page does not stop a
	    scraping run). Whatever trafilatura.extract returns is passed through
	    unchanged.
	    """
	    try:
	        html = trafilatura.fetch_url(url)
	        if not html:
	            return None
	        return trafilatura.extract(html, include_comments=False, include_tables=False)
	    except Exception as e:
	        print(f"Error fetching {url}: {e}")
	        return None

	# Build one record per successfully scraped page. Each record keeps the page
	# text plus the URL and section it came from, for use as retrieval metadata.
	scraped_data = []

	for section, urls in url_dict.items():
	    for url in urls:
	        print(f"🟩 Scraping: {url}")
	        text = extract_clean_text(url)

	        # Pages that yield no text are reported and skipped.
	        if not text:
	            print(f"❌ Failed to extract content from {url}.\n")
	            continue

	        print(f"✅ Extracted {len(text)} characters.\n")
	        scraped_data.append(
	            {"content": text, "metadata": {"source": url, "section": section}}
	        )

	print(f"Total pages scraped: {len(scraped_data)}")

	import tiktoken
	from nltk.tokenize import sent_tokenize

	# Encoder used by chunk_text to count tokens when sizing chunks.
	# NOTE(review): "cl100k_base" is a tiktoken encoding; the chat model and the
	# embedding model configured elsewhere in this file presumably use their own
	# tokenizers, so these counts are an approximation — confirm the 400-token
	# chunk budget leaves enough headroom.
	tokenizer = tiktoken.get_encoding("cl100k_base")

	def chunk_text(text, max_tokens=400, *, split_sentences=None, count_tokens=None):
	    """
	    Split text into consecutive, non-overlapping chunks of whole sentences.

	    Sentences are accumulated until adding the next one would push the
	    space-joined chunk over ``max_tokens``; the chunk is then closed and the
	    overflowing sentence starts the next one. A single sentence that is
	    longer than ``max_tokens`` on its own is emitted as its own (oversized)
	    chunk, because sentences are never split.

	    Args:
	        text: The text to split.
	        max_tokens: Maximum token count of a chunk.
	        split_sentences: Optional callable ``str -> iterable of str``.
	            Defaults to the module's ``sent_tokenize``.
	        count_tokens: Optional callable ``str -> int``. Defaults to the
	            length of ``tokenizer.encode(...)`` using the module tokenizer.

	    Returns:
	        A list of non-empty chunk strings, in document order.
	    """
	    if split_sentences is None:
	        split_sentences = sent_tokenize
	    if count_tokens is None:
	        def count_tokens(segment):
	            return len(tokenizer.encode(segment))

	    chunks = []
	    current_chunk = []

	    for sentence in split_sentences(text):
	        candidate = current_chunk + [sentence]
	        # Only close a chunk that already holds something. Without this
	        # check an oversized first sentence produced an empty "" chunk.
	        if current_chunk and count_tokens(" ".join(candidate)) > max_tokens:
	            finished = " ".join(current_chunk).strip()
	            if finished:
	                chunks.append(finished)
	            current_chunk = [sentence]  # overflow sentence opens the next chunk
	        else:
	            current_chunk = candidate

	    # Whatever is left over forms the final chunk.
	    if current_chunk:
	        finished = " ".join(current_chunk).strip()
	        if finished:
	            chunks.append(finished)

	    return chunks

	# Flatten every scraped page into chunk records. All chunks of a page point
	# at that page's metadata dict (same URL + section).
	chunked_data = []

	for item in scraped_data:
	    chunked_data.extend(
	        {"content": piece, "metadata": item["metadata"]}
	        for piece in chunk_text(item["content"], max_tokens=400)
	    )

	# Plain list of chunk texts, in the same order as chunked_data, for embedding
	texts_to_embed = [record["content"] for record in chunked_data]

	from sentence_transformers import SentenceTransformer

	# Load the embedding model used to embed the text chunks and the sample query.
	# NOTE(review): the same model name is passed to HuggingFaceEmbeddings further
	# down in this file, which presumably loads a second copy of the weights —
	# confirm whether a single instance could be shared.
	embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

	def embed_chunks(text_list, model):
	    """
	    Encode every text in text_list with the given model.

	    The model's encode method is called once for the whole list with
	    convert_to_numpy=True, and its result is returned unchanged.
	    """
	    vectors = model.encode(text_list, convert_to_numpy=True)
	    return vectors

	# Generate one embedding per chunk, in the same order as texts_to_embed
	embeddings = embed_chunks(texts_to_embed, embedding_model)

	print(f"✅ Generated {len(embeddings)} embeddings")
	# Guard the shape report: when no page could be scraped there is no first
	# embedding, and indexing [0] would abort start-up with an IndexError.
	if len(embeddings) > 0:
	    print(f"🔹 Shape of first embedding: {embeddings[0].shape}")
	else:
	    print("⚠️ No text chunks were available to embed.")

	import chromadb
	import uuid

	# Initialize ChromaDB client (persistent storage on local disk)
	chroma_client = chromadb.PersistentClient(path="./chroma_store")

	# Create or get collection
	collection = chroma_client.get_or_create_collection(name="imageonline_chunks")


	def _stable_chunk_ids(chunk_metadatas):
	    """Return one deterministic id per chunk, derived from its source URL and
	    its position among the chunks of that URL."""
	    seen_per_source = {}
	    stable_ids = []
	    for meta in chunk_metadatas:
	        source = meta["source"]
	        position = seen_per_source.get(source, 0)
	        seen_per_source[source] = position + 1
	        stable_ids.append(
	            str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source}#chunk-{position}"))
	        )
	    return stable_ids


	# Extract documents, embeddings, metadatas
	documents = [item["content"] for item in chunked_data]
	metadatas = [item["metadata"] for item in chunked_data]

	# Ids must be reproducible across runs. The store is persistent, so random
	# ids combined with add() would insert a fresh copy of every chunk on each
	# start-up and retrieval would then return duplicates.
	ids = _stable_chunk_ids(metadatas)

	# Safety check (explicit raise: a bare assert is stripped under `python -O`)
	if not (len(documents) == len(embeddings) == len(metadatas)):
	    raise ValueError("Data length mismatch!")

	# Insert new chunks / overwrite existing ones with the same id.
	# NOTE: chunks left over from a previous run whose page has since produced
	# fewer chunks are not removed by this.
	if documents:
	    collection.upsert(
	        documents=documents,
	        embeddings=embeddings.tolist(),
	        metadatas=metadatas,
	        ids=ids,
	    )
	else:
	    # Nothing was scraped, so there is nothing to index.
	    print("⚠️ No chunks to index; skipping ChromaDB upsert.")

	# Start-up smoke test: embed a fixed query, fetch the 3 nearest chunks from
	# the collection and print a preview of each.
	query = "web design company"
	query_embedding = embedding_model.encode([query])[0]

	# Query ChromaDB
	results = collection.query(query_embeddings=[query_embedding.tolist()], n_results=3)

	# Results are nested per query; index 0 is the single query sent above.
	for match_index, matched_text in enumerate(results['documents'][0]):
	    print(f"\n🔍 Match {match_index + 1}:")
	    print(f"Content: {matched_text[:200]}...")
	    print(f"📎 Metadata: {results['metadatas'][0][match_index]}")

	from langchain_core.prompts import ChatPromptTemplate
	from langchain_core.runnables import RunnableLambda, RunnablePassthrough
	from langchain_core.output_parsers import StrOutputParser
	from langchain_together import ChatTogether

	from langchain_community.vectorstores import Chroma
	from langchain_community.embeddings import HuggingFaceEmbeddings

	# Initialize vectorstore: expose the existing Chroma collection through
	# LangChain's vector-store interface. The embedding model name is the same one
	# used earlier in this file to embed the stored chunks ("BAAI/bge-base-en-v1.5"),
	# presumably so that query vectors and stored vectors are comparable.
	embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

	vectorstore = Chroma(
	client=chroma_client, # reuse the PersistentClient created earlier in this file
	collection_name="imageonline_chunks",
	embedding_function=embedding_function
	)

	# Create retriever that returns the 3 closest chunks per query (k=3)
	retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

	def retrieve_and_format(query):
	    """
	    Retrieve the chunks most relevant to query and render them as one
	    context string for the prompt.

	    Each retrieved document becomes a block of the form
	    "[<section>] <content>\\n(Source: <url>)"; blocks are separated by a
	    blank line. Missing "section" / "source" metadata renders as an empty
	    string. Returns "" when nothing is retrieved.
	    """
	    # invoke() is the Runnable entry point for retrievers and replaces the
	    # deprecated get_relevant_documents().
	    docs = retriever.invoke(query)

	    context_strings = [
	        f"[{doc.metadata.get('section', '')}] {doc.page_content}\n"
	        f"(Source: {doc.metadata.get('source', '')})"
	        for doc in docs
	    ]

	    return "\n\n".join(context_strings)

	import os

	# The Together.ai API key is a secret and must not live in source control.
	# Read it from the environment (e.g. a Space secret / deployment variable)
	# and fail fast with a clear message when it is missing.
	# NOTE: the key that was previously committed here should be revoked.
	_together_api_key = os.getenv("TOGETHER_API_KEY")
	if not _together_api_key:
	    raise RuntimeError(
	        "TOGETHER_API_KEY is not set; provide the Together.ai API key "
	        "through the environment."
	    )

	# Chat model that answers from the retrieved context.
	llm = ChatTogether(
	    model="meta-llama/Llama-3-8b-chat-hf",
	    temperature=0.3,
	    max_tokens=1024,
	    top_p=0.7,
	    together_api_key=_together_api_key,
	)

	# Prompt template for the RAG chain. It has two placeholders: {context} and
	# {question}; the chain defined below fills them with the output of
	# retrieve_and_format and the user's message respectively. The model is
	# instructed to answer only from the supplied context.
	prompt = ChatPromptTemplate.from_template("""
	You are an expert assistant for ImageOnline Web Solutions.

	Answer the user's query based ONLY on the following context:

	{context}

	Query: {question}
	""")

	# RAG pipeline: the incoming question string is passed to retrieve_and_format
	# to build the context and is also forwarded unchanged as the question; the
	# filled prompt goes to the LLM and the reply is reduced to a plain string.
	# The stages are composed with the plain "|" operator (the escaped "\|" form
	# is not valid Python).
	rag_chain = (
	    {"context": RunnableLambda(retrieve_and_format), "question": RunnablePassthrough()}
	    | prompt
	    | llm
	    | StrOutputParser()
	)

	import gradio as gr

	def chat_interface(message, history):
	    """
	    Answer one user message through the RAG chain and extend the chat history.

	    Args:
	        message: The text the user submitted.
	        history: List of (user_text, bot_text) tuples from earlier turns, or
	            None / empty for a new conversation. It is not modified.

	    Returns:
	        A (history, history) tuple: the updated history, once for the chat
	        display and once for the stored state. A blank message returns the
	        history unchanged. Any exception raised while generating the answer
	        is shown as the bot's reply instead of propagating.
	    """
	    # Work on a copy so the caller's list is never mutated in place.
	    history = list(history or [])

	    # Nothing to ask: skip the retrieval + LLM round-trip entirely.
	    if not message or not message.strip():
	        return history, history

	    user_turn = "🧑 You: " + message

	    try:
	        # Call RAG pipeline
	        answer = rag_chain.invoke(message)
	        bot_turn = "🤖 Bot: " + answer
	    except Exception as e:
	        # Surface the failure in the chat window rather than breaking the UI.
	        error_msg = f"⚠️ Error: {str(e)}"
	        bot_turn = f"🤖 Bot: {error_msg}"

	    history.append((user_turn, bot_turn))
	    return history, history

	def launch_gradio():
	    """
	    Build the chatbot UI and return it without launching.

	    The layout is a title, a chat window, a text box with a send button, and
	    a clear button. Pressing Enter in the text box and clicking Send both run
	    chat_interface and then empty the text box.

	    Returns:
	        The gr.Blocks application; the caller is expected to call .launch().
	    """
	    with gr.Blocks() as demo:
	        gr.Markdown("# 💬 ImageOnline RAG Chatbot")
	        gr.Markdown("Ask about Website Designing, App Development, SEO, Hosting, etc.")

	        chatbot = gr.Chatbot()
	        state = gr.State([])

	        with gr.Row():
	            msg = gr.Textbox(placeholder="Ask your question here...", show_label=False, scale=8)
	            send_btn = gr.Button("📨 Send", scale=1)

	        # Same handler for both ways of sending. The chained step clears the
	        # text box, which otherwise kept the submitted question after sending.
	        for trigger in (msg.submit, send_btn.click):
	            trigger(
	                chat_interface, inputs=[msg, state], outputs=[chatbot, state]
	            ).then(lambda: "", inputs=None, outputs=msg)

	        with gr.Row():
	            clear_btn = gr.Button("🧹 Clear Chat")
	        # Reset both the visible chat and the stored history.
	        clear_btn.click(fn=lambda: ([], []), outputs=[chatbot, state])

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the interface, keep it bound to the
	    # module-level name `demo`, then start serving it.
	    demo = launch_gradio()
	    demo.launch()