Spaces:

IProject-10
/

IOPL-Chatbot

Sleeping

App Files Files Community

IProject-10 committed on Jul 21

Commit

9d9952b

verified ·

1 Parent(s): 63cef6d

Upload 2 files

Browse files

Files changed (2) hide show

app.py +215 -0
requirements.txt +11 -0

app.py ADDED Viewed

	@@ -0,0 +1,215 @@

+# app.py
+import os
+import uuid
+import nltk
+import trafilatura
+import chromadb
+import tiktoken
+import gradio as gr
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnableLambda, RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from langchain_together import ChatTogether
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from sentence_transformers import SentenceTransformer
+from nltk.tokenize import sent_tokenize
+# Download NLTK resources
+nltk.download('punkt')
+# Initialize tokenizer
+tokenizer = tiktoken.get_encoding("cl100k_base")
+# Initialize embedding model
+embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+# Initialize ChromaDB
+chroma_client = chromadb.PersistentClient(path="./chroma_store")
+collection = chroma_client.get_or_create_collection(name="imageonline_chunks")
+# Sectioned URL List
+url_dict = {
+    "Website Designing": [
+        "https://www.imageonline.co.in/website-designing-mumbai.html",
+        "https://www.imageonline.co.in/domain-hosting-services-india.html",
+        "https://www.imageonline.co.in/best-seo-company-mumbai.html",
+        "https://www.imageonline.co.in/wordpress-blog-designing-india.html",
+        "https://www.imageonline.co.in/social-media-marketing-company-mumbai.html",
+        "https://www.imageonline.co.in/website-template-customization-india.html",
+        "https://www.imageonline.co.in/regular-website-maintanence-services.html",
+        "https://www.imageonline.co.in/mobile-app-designing-mumbai.html",
+        "https://www.imageonline.co.in/web-application-screen-designing.html"
+    ],
+    "Website Development": [
+        "https://www.imageonline.co.in/website-development-mumbai.html",
+        "https://www.imageonline.co.in/open-source-customization.html",
+        "https://www.imageonline.co.in/ecommerce-development-company-mumbai.html",
+        "https://www.imageonline.co.in/website-with-content-management-system.html",
+        "https://www.imageonline.co.in/web-application-development-india.html"
+    ],
+    "Mobile App Development": [
+        "https://www.imageonline.co.in/mobile-app-development-company-mumbai.html"
+    ],
+    "About Us": [
+        "https://www.imageonline.co.in/about-us.html",
+        "https://www.imageonline.co.in/vision.html",
+        "https://www.imageonline.co.in/team.html"
+    ],
+    "Testimonials": [
+        "https://www.imageonline.co.in/testimonial.html"
+    ]
+}
+# Helper functions
+def extract_clean_text(url):
+    try:
+        print(f"🔗 Fetching URL: {url}")
+        downloaded = trafilatura.fetch_url(url)
+        if downloaded:
+            content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
+            print(f"✅ Extracted text from {url}")
+            return content
+        else:
+            print(f"⚠️ Failed to fetch content from {url}")
+    except Exception as e:
+        print(f"❌ Error fetching {url}: {e}")
+    return None
+def chunk_text(text, max_tokens=400):
+    sentences = sent_tokenize(text)
+    chunks = []
+    current_chunk = []
+    for sentence in sentences:
+        current_chunk.append(sentence)
+        tokens = tokenizer.encode(" ".join(current_chunk))
+        if len(tokens) > max_tokens:
+            current_chunk.pop()
+            chunks.append(" ".join(current_chunk).strip())
+            current_chunk = [sentence]
+    if current_chunk:
+        chunks.append(" ".join(current_chunk).strip())
+    print(f"📄 Text split into {len(chunks)} chunks.")
+    return chunks
+# Check refresh override
+force_refresh = os.getenv("FORCE_REFRESH", "false").lower() == "true"
+# Load data into ChromaDB
+if collection.count() == 0 or force_refresh:
+    print("🔄 Loading documents into ChromaDB...")
+    for section, urls in url_dict.items():
+        for url in urls:
+            text = extract_clean_text(url)
+            if not text:
+                continue
+            chunks = chunk_text(text)
+            embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
+            metadatas = [{"source": url, "section": section} for _ in chunks]
+            ids = [str(uuid.uuid4()) for _ in chunks]
+            collection.add(
+                documents=chunks,
+                embeddings=embeddings.tolist(),
+                metadatas=metadatas,
+                ids=ids
+            )
+    print("✅ Document loading complete.")
+else:
+    print("✅ Using existing ChromaDB collection.")
+# Vectorstore & Retriever
+vectorstore = Chroma(
+    client=chroma_client,
+    collection_name="imageonline_chunks",
+    embedding_function=embedding_function
+)
+retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
+# Together.ai LLM
+llm = ChatTogether(
+    model="meta-llama/Llama-3-8b-chat-hf",
+    temperature=0.3,
+    max_tokens=1024,
+    top_p=0.7,
+    together_api_key=os.getenv("TOGETHER_API_KEY")
+)
+# Prompt template (refined)
+prompt = ChatPromptTemplate.from_template("""
+You are a helpful assistant for ImageOnline Web Solutions.
+Use ONLY the information provided in the context to answer the user's query.
+Context:
+{context}
+Question:
+{question}
+If the answer is not found in the context, say "I'm sorry, I don't have enough information to answer that."
+""")
+# Context retrieval
+def retrieve_and_format(query):
+    docs = retriever.get_relevant_documents(query)
+    context_strings = []
+    for doc in docs:
+        content = doc.page_content
+        metadata = doc.metadata
+        source = metadata.get("source", "")
+        section = metadata.get("section", "")
+        context_strings.append(f"[{section}] {content}\n(Source: {source})")
+    return "\n\n".join(context_strings)
+# RAG chain
+rag_chain = (
+    {"context": RunnableLambda(retrieve_and_format), "question": RunnablePassthrough()}
+    | prompt
+    | llm
+    | StrOutputParser()
+)
+# Gradio Interface
+def chat_interface(message, history):
+    history = history or []
+    history.append(("🧑 You: " + message, "⏳ Generating response..."))
+    try:
+        answer = rag_chain.invoke(message)
+        history[-1] = ("🧑 You: " + message, "🤖 Bot: " + answer)
+    except Exception as e:
+        error_msg = f"⚠️ Error: {str(e)}"
+        history[-1] = ("🧑 You: " + message, f"🤖 Bot: {error_msg}")
+    return history, history
+def launch_gradio():
+    with gr.Blocks() as demo:
+        gr.Markdown("# 💬 ImageOnline RAG Chatbot")
+        gr.Markdown("Ask about Website Designing, App Development, SEO, Hosting, etc.")
+        chatbot = gr.Chatbot()
+        state = gr.State([])
+        with gr.Row():
+            msg = gr.Textbox(placeholder="Ask your question here...", show_label=False, scale=8)
+            send_btn = gr.Button("📨 Send", scale=1)
+        msg.submit(chat_interface, inputs=[msg, state], outputs=[chatbot, state])
+        send_btn.click(chat_interface, inputs=[msg, state], outputs=[chatbot, state])
+        with gr.Row():
+            clear_btn = gr.Button("🧹 Clear Chat")
+            clear_btn.click(fn=lambda: ([], []), outputs=[chatbot, state])
+    return demo
+if __name__ == "__main__":
+    # Build the UI and start the Gradio server only when this file is run as a
+    # script.
+    demo = launch_gradio()
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+langchain
+langchain-together
+langchain-community
+chromadb
+sentence-transformers
+trafilatura
+beautifulsoup4
+nltk
+tiktoken
+gradio
+together