Spaces:
Sleeping
Sleeping
Bharath Gajula
committed on
Commit
·
42cabf2
1
Parent(s):
d0b0e7b
sadas
Browse files- Dockerfile +4 -3
- README.md +20 -17
- agents/__init__.py +6 -0
- agents/__pycache__/__init__.cpython-312.pyc +0 -0
- agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
- agents/__pycache__/sql_agent.cpython-312.pyc +0 -0
- agents/rag_agent.py +116 -0
- agents/sql_agent.py +30 -0
- app.py +110 -0
- chroma_agnews/1ece7c94-65ca-4e00-a18d-81575d0bb13e/data_level0.bin +3 -0
- chroma_agnews/1ece7c94-65ca-4e00-a18d-81575d0bb13e/header.bin +3 -0
- chroma_agnews/1ece7c94-65ca-4e00-a18d-81575d0bb13e/length.bin +3 -0
- chroma_agnews/1ece7c94-65ca-4e00-a18d-81575d0bb13e/link_lists.bin +0 -0
- chroma_agnews/abe39f0b-2fae-49f8-b04a-3877bcadd8ea/data_level0.bin +3 -0
- chroma_agnews/abe39f0b-2fae-49f8-b04a-3877bcadd8ea/header.bin +3 -0
- chroma_agnews/abe39f0b-2fae-49f8-b04a-3877bcadd8ea/length.bin +3 -0
- chroma_agnews/abe39f0b-2fae-49f8-b04a-3877bcadd8ea/link_lists.bin +0 -0
- requirements.txt +14 -3
- setup_data.py +70 -0
Dockerfile
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
FROM python:3.
|
2 |
|
3 |
WORKDIR /app
|
4 |
|
@@ -10,12 +10,13 @@ RUN apt-get update && apt-get install -y \
|
|
10 |
&& rm -rf /var/lib/apt/lists/*
|
11 |
|
12 |
COPY requirements.txt ./
|
13 |
-
COPY
|
14 |
|
15 |
RUN pip3 install -r requirements.txt
|
|
|
16 |
|
17 |
EXPOSE 8501
|
18 |
|
19 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
20 |
|
21 |
-
ENTRYPOINT ["streamlit", "run", "
|
|
|
1 |
+
FROM python:3.12-slim
|
2 |
|
3 |
WORKDIR /app
|
4 |
|
|
|
10 |
&& rm -rf /var/lib/apt/lists/*
|
11 |
|
12 |
COPY requirements.txt ./
|
13 |
+
COPY . .
|
14 |
|
15 |
RUN pip3 install -r requirements.txt
|
16 |
+
RUN python setup_data.py
|
17 |
|
18 |
EXPOSE 8501
|
19 |
|
20 |
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
21 |
|
22 |
+
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
README.md
CHANGED
@@ -1,20 +1,23 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
colorFrom: red
|
5 |
-
colorTo: red
|
6 |
-
sdk: docker
|
7 |
-
app_port: 8501
|
8 |
-
tags:
|
9 |
-
- streamlit
|
10 |
-
pinned: false
|
11 |
-
short_description: Streamlit template space
|
12 |
-
license: apache-2.0
|
13 |
-
---
|
14 |
|
15 |
-
|
16 |
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
|
20 |
-
|
|
|
1 |
+
# Hybrid Search Chatbot
|
2 |
+
|
3 |
+
A Streamlit app for hybrid search: SQL (Sakila DB) and semantic (RAG, AG News/ChromaDB).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
+
## Quick Start
|
6 |
|
7 |
+
1. Install dependencies:
|
8 |
+
```bash
|
9 |
+
pip install -r requirements.txt
|
10 |
+
```
|
11 |
+
2. Initialize data:
|
12 |
+
```bash
|
13 |
+
python setup_data.py
|
14 |
+
```
|
15 |
+
3. Run the app:
|
16 |
+
```bash
|
17 |
+
streamlit run app.py
|
18 |
+
```
|
19 |
+
|
20 |
+
---
|
21 |
|
22 |
+
- Edit `.env` for API keys if needed.
|
23 |
+
- See `requirements.txt` for dependencies.
|
agents/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Agent package: exposes the SQL (Sakila) and RAG (AG News) agents."""

from .sql_agent import SQLAgent
from .rag_agent import RAGAgent

# Fixed: __all__ previously listed the module names ('sql_agent',
# 'rag_agent'), but the names actually imported and re-exported from this
# package are the classes. `from agents import *` would have raised
# AttributeError on the old entries.
__all__ = ['SQLAgent', 'RAGAgent']
__version__ = "1.0.0"
__author__ = "Bharath Gajula"
|
agents/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (346 Bytes). View file
|
|
agents/__pycache__/rag_agent.cpython-312.pyc
ADDED
Binary file (4.78 kB). View file
|
|
agents/__pycache__/sql_agent.cpython-312.pyc
ADDED
Binary file (1.62 kB). View file
|
|
agents/rag_agent.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chromadb
|
2 |
+
from sentence_transformers import SentenceTransformer
|
3 |
+
from typing import List, Dict
|
4 |
+
import os
|
5 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
6 |
+
from langchain.schema import HumanMessage
|
7 |
+
|
8 |
+
class RAGAgent:
    """Retrieval-augmented QA agent over the persisted AG News ChromaDB store.

    Embeds the user query with a SentenceTransformer, retrieves the
    nearest article chunks from the ``ag_news`` collection, and asks
    Gemini to answer the question from that retrieved context.
    """

    def __init__(self):
        # Query embedder — must be the same model used to build the
        # collection (setup_data.py also uses 'all-mpnet-base-v2'),
        # otherwise distances are meaningless.
        self.embedder = SentenceTransformer('all-mpnet-base-v2')
        # Answer generator; reads the API key from the environment
        # (loaded via dotenv in app.py).
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0.3,
            google_api_key=os.getenv("GOOGLE_API_KEY")
        )

        persist_directory = "./chroma_agnews/"
        self.chroma_client = chromadb.PersistentClient(path=persist_directory)

        # get_collection raises if the collection is absent — run
        # setup_data.py first to build it.
        self.collection = self.chroma_client.get_collection(name="ag_news")
        print(f"Connected to ChromaDB with {self.collection.count()} documents")

    def search(self, query: str, top_k: int = 5) -> Dict:
        """Search for relevant chunks and answer the question.

        Returns a dict with keys ``answer`` (str), ``chunks`` (list of
        {'text', 'category', 'score'} dicts), and ``query`` (the
        possibly-normalized query string).
        """
        # Handle empty query base case scenario: fall back to a generic
        # query instead of sending an empty embedding request.
        if not query or query.strip() == "":
            query = "news"

        # Embed the query (list form, as ChromaDB expects plain floats)
        query_embedding = self.embedder.encode(query).tolist()

        # Query the collection; clamp n_results so we never ask for more
        # documents than the collection holds.
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=min(top_k, self.collection.count()),
            include=["documents", "metadatas", "distances"]
        )

        # Format results
        formatted_results = []
        context_chunks = []

        if results['ids'] and len(results['ids'][0]) > 0:
            for i in range(len(results['ids'][0])):
                # Map distance to a similarity score in roughly [0, 1].
                # The collection is built with cosine space (see
                # setup_data.py, hnsw:space=cosine), where distances
                # fall in [0, 2].
                distance = results['distances'][0][i] if results['distances'] else 0
                similarity_score = 1 - (distance / 2)

                doc_text = results['documents'][0][i]

                formatted_results.append({
                    'text': doc_text,
                    'category': results['metadatas'][0][i].get('label_text', 'Unknown'),
                    'score': similarity_score
                })

                context_chunks.append(doc_text)

            # Generate answer based on retrieved chunks
            answer = self._generate_answer(query, context_chunks)

            return {
                "answer": answer,
                "chunks": formatted_results,
                "query": query
            }
        else:
            # Nothing retrieved — return an empty result set with a
            # user-facing message instead of calling the LLM.
            return {
                "answer": "No relevant information found for your question.",
                "chunks": [],
                "query": query
            }

    def _generate_answer(self, query: str, chunks: List[str]) -> str:
        """Generate an answer from *chunks* via Gemini.

        Chunks are numbered [1]..[n] in the prompt so the model can
        reference them.
        """
        # Combine chunks as a single numbered context string
        context = "\n\n".join([f"[{i+1}] {chunk}" for i, chunk in enumerate(chunks)])

        # Create prompt
        prompt = f"""Based on the following information, answer the question.

Context:
{context}

Question: {query}

Answer:"""

        # Generate answer using Gemini
        response = self.llm.invoke([HumanMessage(content=prompt)])
        return response.content

    def get_collection_stats(self) -> Dict:
        """Return {'total_documents': int, 'categories': {name: count}}.

        Category counts are estimated from a sample of at most 100
        documents, not the full collection.
        """
        count = self.collection.count()

        if count > 0:
            sample = self.collection.get(
                limit=min(100, count),
                include=["metadatas"]
            )
            categories = {}

            # Tally label_text occurrences within the sample.
            for metadata in sample['metadatas']:
                cat = metadata.get('label_text', 'Unknown')
                categories[cat] = categories.get(cat, 0) + 1

            return {
                "total_documents": count,
                "categories": categories
            }
        else:
            return {
                "total_documents": 0,
                "categories": {}
            }
|
agents/sql_agent.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
2 |
+
from langchain.agents import create_sql_agent
|
3 |
+
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
|
4 |
+
from langchain.sql_database import SQLDatabase
|
5 |
+
import os
|
6 |
+
|
7 |
+
class SQLAgent:
    """Natural-language Q&A over a SQLite database via a LangChain SQL agent.

    Wraps ``create_sql_agent`` with a Gemini LLM so callers can ask
    plain-English questions about the database at *db_path*.
    """

    def __init__(self, db_path: str):
        # Path kept for reference; the SQLAlchemy URI below is what the
        # toolkit actually connects through.
        self.db_path = db_path

        # Create SQLDatabase instance (read via sqlite driver)
        self.db = SQLDatabase.from_uri(f"sqlite:///{db_path}")

        # temperature=0 for deterministic SQL generation; API key comes
        # from the environment (loaded via dotenv in app.py).
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0,
            google_api_key=os.getenv("GOOGLE_API_KEY")
        )

        # Create SQL toolkit and agent; verbose=True logs the agent's
        # intermediate SQL reasoning to stdout.
        toolkit = SQLDatabaseToolkit(db=self.db, llm=self.llm)
        self.agent = create_sql_agent(
            llm=self.llm,
            toolkit=toolkit,
            verbose=True
        )

    def query(self, question: str) -> str:
        """Run natural language query and return answer."""
        # NOTE(review): Agent.run is deprecated in newer LangChain
        # releases in favor of invoke() — confirm against the pinned
        # langchain version before upgrading.
        return self.agent.run(question)
|
app.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit entry point: two-mode Q&A chatbot.

Mode 1 queries the Sakila movie database through the LangChain SQL
agent; mode 2 does semantic search over AG News via the RAG agent.
"""
import streamlit as st
import pandas as pd  # NOTE(review): imported but unused in this view — confirm before removing
from agents.sql_agent import SQLAgent
from agents.rag_agent import RAGAgent
import os
from dotenv import load_dotenv

# Pull GOOGLE_API_KEY (and any other settings) from .env before the
# agents are constructed.
load_dotenv()

st.set_page_config(page_title="Q&A Chatbot", layout="wide")

st.title(" Q&A Chatbot")

# Initializing agents lazily and caching them across reruns — Streamlit
# re-executes this whole script on every interaction, and agent
# construction (DB connection, model load) is expensive.
@st.cache_resource
def init_sql_agent():
    return SQLAgent("./sakila.db")

@st.cache_resource
def init_rag_agent():
    return RAGAgent()

# Mode selector drives which branch of the page renders below.
mode = st.sidebar.radio(
    "Select Mode:",
    ["Movie Database (SQL)", "News Search (RAG)"]
)

st.markdown("---")

if mode == "Movie Database (SQL)":
    st.subheader(" Movie Database (SQL)")

    sql_question = st.text_input("Ask about movies:", placeholder="Please enter your nlp sql question here ", key="sql_input")

    if sql_question:
        # Agent is only built on first use (cached afterwards).
        sql_agent = init_sql_agent()

        with st.spinner("Querying database..."):
            answer = sql_agent.query(sql_question)

        # Display answer
        st.markdown("### Answer")
        st.write(answer)

else:
    st.subheader(" News Search (RAG)")

    # RAG input
    rag_question = st.text_input("Ask about news:", placeholder="What's happening around the world?", key="rag_input")

    if rag_question:
        rag_agent = init_rag_agent()

        with st.spinner("Searching news..."):
            # result: {'answer': str, 'chunks': [...], 'query': str}
            result = rag_agent.search(rag_question)

        st.markdown("### Answer")
        st.info(result['answer'])

        # Sources section with full chunks
        st.markdown("### Source Articles")

        for j, chunk in enumerate(result['chunks']):
            with st.container():
                col1, col2 = st.columns([3, 1])

                with col1:
                    st.markdown(f"**Article {j+1}**")
                with col2:
                    st.markdown(f"**{chunk['category']}**")

                # Full text in a text area for better readability;
                # unique key per chunk keeps Streamlit widget state
                # from colliding between articles.
                st.text_area(
                    label="",
                    value=chunk['text'],
                    height=150,
                    disabled=True,
                    key=f"chunk_{j}"
                )

                # Score if available (search() may report 0 when
                # distances are absent)
                if chunk.get('score', 0) > 0:
                    st.caption(f"Relevance Score: {chunk['score']:.1%}")

                st.markdown("---")

# Sidebar: example prompts matching the active mode
st.sidebar.markdown("---")
st.sidebar.markdown("### Example Questions")

if mode == "Movie Database (SQL)":
    st.sidebar.markdown("""
    - How many films are there?
    - Show me the top 5 longest films
    - Which actors have the most films?
    - List all film categories
    - How many customers do we have?
    """)
else:
    st.sidebar.markdown("""
    - What's happening with oil prices?
    - Tell me about technology news
    - Any sports updates?
    - Business news today
    - Science discoveries
    """)

st.sidebar.markdown("---")
st.sidebar.caption("Created by Bharath Gajula")
st.sidebar.caption("Powered by Gemini & LangChain")
|
chroma_agnews/1ece7c94-65ca-4e00-a18d-81575d0bb13e/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ebdb4a62fc9c29c5f41ec836bd8856e9526cf002440601ca5f2fed121cb0696c
|
3 |
+
size 32120000
|
chroma_agnews/1ece7c94-65ca-4e00-a18d-81575d0bb13e/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f8c7f00b4415698ee6cb94332eff91aedc06ba8e066b1f200e78ca5df51abb57
|
3 |
+
size 100
|
chroma_agnews/1ece7c94-65ca-4e00-a18d-81575d0bb13e/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:353eeae34f121621a657ab9bf30b59b722029f4415aff8a48f1466ef39bc7211
|
3 |
+
size 40000
|
chroma_agnews/1ece7c94-65ca-4e00-a18d-81575d0bb13e/link_lists.bin
ADDED
File without changes
|
chroma_agnews/abe39f0b-2fae-49f8-b04a-3877bcadd8ea/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:23add52afbe7588391f32d3deffb581b2663d2e2ad8851aba7de25e6b3f66761
|
3 |
+
size 32120000
|
chroma_agnews/abe39f0b-2fae-49f8-b04a-3877bcadd8ea/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f8c7f00b4415698ee6cb94332eff91aedc06ba8e066b1f200e78ca5df51abb57
|
3 |
+
size 100
|
chroma_agnews/abe39f0b-2fae-49f8-b04a-3877bcadd8ea/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e7e2dcff542de95352682dc186432e98f0188084896773f1973276b0577d5305
|
3 |
+
size 40000
|
chroma_agnews/abe39f0b-2fae-49f8-b04a-3877bcadd8ea/link_lists.bin
ADDED
File without changes
|
requirements.txt
CHANGED
@@ -1,3 +1,14 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit>=1.28.0
|
2 |
+
langchain>=0.3.0
|
3 |
+
langchain-community>=0.3.0
|
4 |
+
langchain-google-genai>=1.0.0
|
5 |
+
google-generativeai>=0.3.0
|
6 |
+
chromadb>=0.4.0
|
7 |
+
sentence-transformers>=2.2.0
|
8 |
+
pandas>=2.0.0
|
9 |
+
python-dotenv>=1.0.0
|
10 |
+
pytest>=7.4.0
|
11 |
+
numpy>=1.24.0
|
12 |
+
sqlalchemy>=2.0.0
|
13 |
+
datasets>=2.14.0
|
14 |
+
requests>=2.31.0
|
setup_data.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import urllib.request
|
3 |
+
import chromadb
|
4 |
+
from chromadb.utils import embedding_functions
|
5 |
+
from datasets import load_dataset
|
6 |
+
|
7 |
+
def download_sakila_db():
    """Fetch the Sakila SQLite database into ./sakila.db if missing.

    No-op (prints a notice) when the file is already present.
    """
    db_file = "./sakila.db"

    # Skip the download entirely when a previous run left the file behind.
    if os.path.exists(db_file):
        print("✓ Sakila database already exists")
        return

    print("Downloading Sakila database...")
    source_url = "https://github.com/ivanceras/sakila/raw/master/sqlite-sakila-db/sakila.db"
    urllib.request.urlretrieve(source_url, db_file)
    print("✓ Sakila database downloaded")
|
17 |
+
|
18 |
+
def setup_agnews_chromadb():
    """Load original AG News and compute embeddings.

    Builds (or rebuilds) the persisted ChromaDB collection ``ag_news``
    in ./chroma_agnews/ from the first 500 AG News training articles,
    embedding with the same SentenceTransformer model the RAG agent
    queries with ('all-mpnet-base-v2').
    """
    print("\nLoading AG News dataset...")

    ds = load_dataset("fancyzhx/ag_news", split="train[:500]")
    print(f"✓ Loaded {len(ds)} articles")

    os.makedirs("./chroma_agnews/", exist_ok=True)
    client = chromadb.PersistentClient(path="./chroma_agnews/")

    # Drop any stale collection so reruns start clean. Narrowed from a
    # bare `except:` — that also swallowed SystemExit/KeyboardInterrupt;
    # the only expected failure here is "collection does not exist".
    try:
        client.delete_collection("ag_news")
    except Exception:
        pass

    # Create collection with embedding function; cosine space matches
    # the distance-to-similarity mapping in RAGAgent.search().
    embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-mpnet-base-v2"
    )

    collection = client.create_collection(
        name="ag_news",
        embedding_function=embedding_fn,
        metadata={"hnsw:space": "cosine"}
    )

    # AG News integer labels -> human-readable category names.
    label_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

    # Adding to ChromaDB (embeddings computed by the collection's
    # embedding function during add()).
    print("Computing embeddings and adding to ChromaDB...")

    ids = [f"doc_{i}" for i in range(len(ds))]
    documents = [item['text'] for item in ds]
    metadatas = [{
        "label": item['label'],
        "label_text": label_names[item['label']],
        # Truncated preview of the article text for display purposes.
        "title": item['text'][:100] + "..." if len(item['text']) > 100 else item['text']
    } for item in ds]

    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas
    )

    print(f"✓ Added {len(ds)} articles to ChromaDB")
|
65 |
+
|
66 |
+
if __name__ == "__main__":
    print("=== Setting up databases ===\n")
    download_sakila_db()
    setup_agnews_chromadb()
    # Fixed stale filename in the hint: the Streamlit entry point is
    # app.py (see Dockerfile ENTRYPOINT and README), not chatbot.py.
    print("\n Setup complete! Run 'streamlit run app.py'")
|