Spaces:

RChaubey16
/

web-scraper-and-chatbot-rag-app

Running

App Files Community

RChaubey16 committed on Mar 11

Commit

7ff6802

verified ·

1 Parent(s): 8f4ddfa

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -119

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ from langchain.docstore.document import Document
 import chromadb
 from sentence_transformers import SentenceTransformer
 import google.generativeai as genai
-import uuid
 # Page configuration
 st.set_page_config(layout="wide")
@@ -19,7 +18,7 @@ genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo")
 CHROMA_PATH = "chroma_db"
 chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
-# Initialize session state to track if scraping is complete and collection name
 if 'scraped' not in st.session_state:
     st.session_state.scraped = False
 if 'collection_name' not in st.session_state:
@@ -31,23 +30,17 @@ if 'chat_history' not in st.session_state:
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 def clean_text(text):
-    text = re.sub(r'http\S+', '', text)
-    text = re.sub(r'\s+', ' ', text).strip()
-    return text
 def split_content_into_chunks(content):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
-    documents = [Document(page_content=content)]
-    return text_splitter.split_documents(documents)
 def add_chunks_to_db(chunks, collection_name):
-    # Create or get collection
     collection = chroma_client.get_or_create_collection(name=collection_name)
     documents = [chunk.page_content for chunk in chunks]
-    ids = [f"ID{i}" for i in range(len(chunks))]
     embeddings = embedding_model.encode(documents, convert_to_list=True)
-    collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
 def scrape_text(url):
     try:
@@ -55,141 +48,62 @@ def scrape_text(url):
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
-        # Extract domain for collection name
-        collection_name = st.session_state.collection_name
         text = clean_text(soup.get_text())
         chunks = split_content_into_chunks(text)
-        add_chunks_to_db(chunks, collection_name)
-        # Set scraped state to True
         st.session_state.scraped = True
         return "Scraping and processing complete. You can now ask questions!"
     except requests.exceptions.RequestException as e:
         return f"Error scraping {url}: {e}"
 def ask_question(query, collection_name):
-    # Get the collection
     collection = chroma_client.get_or_create_collection(name=collection_name)
     query_embedding = embedding_model.encode(query, convert_to_list=True)
     results = collection.query(query_embeddings=[query_embedding], n_results=2)
     top_chunks = results.get("documents", [[]])[0]
     system_prompt = f"""
-    You are a helpful assistant. You answer questions based on the provided context.
-    Only answer based on the knowledge I'm providing you. Don't use your internal
-    knowledge and don't make things up.
-    If you don't know the answer based on the provided context, just say: "I don't have enough information to answer that question based on the scraped content."
-    Context information:
     {str(top_chunks)}
     """
-    full_prompt = system_prompt + "\nUser Query: " + query
     model = genai.GenerativeModel('gemini-2.0-flash')
-    response = model.generate_content(full_prompt)
     return response.text
-# Create two columns: sidebar for database and main content
-col1, main_col = st.columns([1, 3])
-# Database management sidebar
-with col1:
     st.header("Database Management")
-    # List available collections
-    try:
-        # Fix for ChromaDB v0.6.0 - list_collections() now returns only names
-        collection_names = chroma_client.list_collections()
-        if collection_names:
-            st.write("Available data collections:")
-            selected_collection = st.selectbox("Select a collection to query:", collection_names)
-            if selected_collection and st.button("Load Selected Collection"):
-                st.session_state.collection_name = selected_collection
-                st.session_state.scraped = True
-                st.success(f"Loaded collection: {selected_collection}")
-                st.rerun()
-    except Exception as e:
-        st.error(f"Error: {str(e)}")
-    # Add a button to clear the session and start over
     if st.button("Clear Chat History"):
         st.session_state.chat_history = []
         st.rerun()
-    # Scraping section
     st.header("Step 1: Scrape a Website")
-    url = st.text_input("Enter the URL to scrape:")
-    if url:
-        if st.button("Scrape & Process"):
-            with st.spinner("Scraping and processing content..."):
-                result = scrape_text(url)
-                st.success(result)
-# Main content area
-with main_col:
-    st.title("Web Scraper & Q&A Chatbot")
-    # Use a container with custom CSS for the scrollable chat area
-    chat_container = st.container()
-    # Apply custom CSS for the chat container
-    st.markdown("""
-    <style>
-    .chat-container {
-        height: 500px;
-        overflow-y: auto;
-        border: 1px solid #ddd;
-        border-radius: 5px;
-        padding: 15px;
-        margin-bottom: 10px;
-        background-color: #f9f9f9;
-    }
-    .stChatInputContainer {
-        position: sticky;
-        bottom: 0;
-        background-color: white;
-        padding-top: 10px;
-        z-index: 100;
-    }
-    </style>
-    """, unsafe_allow_html=True)
-    # Q&A section - only appears after scraping is complete
-    if st.session_state.scraped:
-        st.subheader("Step 2: Ask Questions About the Scraped Content")
-        # Use a div with our custom class for the scrollable area
-        st.markdown('<div class="chat-container">', unsafe_allow_html=True)
-        # Display chat history
-        for message in st.session_state.chat_history:
-            with chat_container.chat_message(message["role"]):
-                st.write(message["content"])
-        st.markdown('</div>', unsafe_allow_html=True)
-        # Input for new question - always at the bottom
-        user_query = st.chat_input("Ask your question here")
-        if user_query:
-            # Add user question to chat history
-            st.session_state.chat_history.append({"role": "user", "content": user_query})
-            # Get answer
-            with st.spinner("Searching database..."):
-                answer = ask_question(user_query, st.session_state.collection_name)
-            # Add answer to chat history
-            st.session_state.chat_history.append({"role": "assistant", "content": answer})
-            # Rerun to update the UI with new messages
-            st.rerun()
-    else:
-        st.info("Please scrape a website or load a collection to start chatting.")

 import chromadb
 from sentence_transformers import SentenceTransformer
 import google.generativeai as genai
 # Page configuration
 st.set_page_config(layout="wide")
 CHROMA_PATH = "chroma_db"
 chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
+# Initialize session state
 if 'scraped' not in st.session_state:
     st.session_state.scraped = False
 if 'collection_name' not in st.session_state:
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 def clean_text(text):
+    return re.sub(r'\s+', ' ', re.sub(r'http\S+', '', text)).strip()
 def split_content_into_chunks(content):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
+    return text_splitter.split_documents([Document(page_content=content)])
 def add_chunks_to_db(chunks, collection_name):
     collection = chroma_client.get_or_create_collection(name=collection_name)
     documents = [chunk.page_content for chunk in chunks]
     embeddings = embedding_model.encode(documents, convert_to_list=True)
+    collection.upsert(documents=documents, ids=[f"ID{i}" for i in range(len(chunks))], embeddings=embeddings)
 def scrape_text(url):
     try:
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         text = clean_text(soup.get_text())
         chunks = split_content_into_chunks(text)
+        add_chunks_to_db(chunks, st.session_state.collection_name)
         st.session_state.scraped = True
         return "Scraping and processing complete. You can now ask questions!"
     except requests.exceptions.RequestException as e:
         return f"Error scraping {url}: {e}"
 def ask_question(query, collection_name):
     collection = chroma_client.get_or_create_collection(name=collection_name)
     query_embedding = embedding_model.encode(query, convert_to_list=True)
     results = collection.query(query_embeddings=[query_embedding], n_results=2)
     top_chunks = results.get("documents", [[]])[0]
     system_prompt = f"""
+    You are a helpful assistant. Answer only from the provided context.
+    If you lack information, say: "I don't have enough information to answer that question."
+    Context:
     {str(top_chunks)}
     """
     model = genai.GenerativeModel('gemini-2.0-flash')
+    response = model.generate_content(system_prompt + "\nUser Query: " + query)
     return response.text
+# Sidebar
+with st.sidebar:
     st.header("Database Management")
     if st.button("Clear Chat History"):
         st.session_state.chat_history = []
         st.rerun()
     st.header("Step 1: Scrape a Website")
+    url = st.text_input("Enter URL:")
+    if url and st.button("Scrape & Process"):
+        with st.spinner("Scraping..."):
+            st.success(scrape_text(url))
+# Main content
+st.title("Web Scraper & Q&A Chatbot")
+if st.session_state.scraped:
+    st.subheader("Step 2: Ask Questions")
+    for message in st.session_state.chat_history:
+        with st.chat_message(message["role"]):
+            st.write(message["content"])
+    user_query = st.chat_input("Ask your question here")
+    if user_query:
+        st.session_state.chat_history.append({"role": "user", "content": user_query})
+        with st.spinner("Searching..."):
+            answer = ask_question(user_query, st.session_state.collection_name)
+        st.session_state.chat_history.append({"role": "assistant", "content": answer})
+        # Limit chat history to 6 messages
+        st.session_state.chat_history = st.session_state.chat_history[-6:]
+        st.rerun()
+else:
+    st.info("Please scrape a website first.")