Spaces:

RChaubey16
/

web-scraper-and-chatbot-rag-app

Running

App Files Community

RChaubey16 committed on Mar 11

Commit

4ba0755

verified ·

1 Parent(s): bd118ce

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -18

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ from langchain.docstore.document import Document
 import chromadb
 from sentence_transformers import SentenceTransformer
 import google.generativeai as genai
 # Initialize Gemini API
 genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo")
@@ -19,7 +20,7 @@ chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
 if 'scraped' not in st.session_state:
     st.session_state.scraped = False
 if 'collection_name' not in st.session_state:
-    st.session_state.collection_name = ""
 # Initialize embedding model
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -43,17 +44,20 @@ def add_chunks_to_db(chunks, collection_name):
     embeddings = embedding_model.encode(documents, convert_to_list=True)
     collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
-def scrape_text(url, collection_name):
     try:
         response = requests.get(url)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         text = clean_text(soup.get_text())
         chunks = split_content_into_chunks(text)
         add_chunks_to_db(chunks, collection_name)
-        # Store collection name and set scraped state to True
-        st.session_state.collection_name = collection_name
         st.session_state.scraped = True
         return "Scraping and processing complete. You can now ask questions!"
@@ -62,7 +66,7 @@ def scrape_text(url, collection_name):
 def ask_question(query, collection_name):
     # Get the collection
-    collection = chroma_client.get_collection(name=collection_name)
     query_embedding = embedding_model.encode(query, convert_to_list=True)
     results = collection.query(query_embeddings=[query_embedding], n_results=2)
@@ -90,24 +94,19 @@ st.title("Web Scraper & Q&A Chatbot")
 with st.container():
     st.subheader("Step 1: Scrape a Website")
-    # Let user create a new database or use existing one
-    collection_name = st.text_input("Enter a name for this data collection:",
-                                    value="my_collection",
-                                    help="This will create a new database or use an existing one with this name")
     url = st.text_input("Enter the URL to scrape:")
-    if url and collection_name:
         if st.button("Scrape & Process"):
             with st.spinner("Scraping and processing content..."):
-                result = scrape_text(url, collection_name)
                 st.success(result)
 # Q&A section - only appears after scraping is complete
 if st.session_state.scraped:
     with st.container():
         st.subheader("Step 2: Ask Questions About the Scraped Content")
-        st.write(f"The database '{st.session_state.collection_name}' contains information scraped from the website. Ask a question:")
         # Chat history
         if 'chat_history' not in st.session_state:
@@ -144,8 +143,8 @@ with st.sidebar:
     # List available collections
     try:
-        all_collections = chroma_client.list_collections()
-        collection_names = [collection.name for collection in all_collections]
         if collection_names:
             st.write("Available data collections:")
@@ -155,11 +154,11 @@ with st.sidebar:
                 st.session_state.collection_name = selected_collection
                 st.session_state.scraped = True
                 st.success(f"Loaded collection: {selected_collection}")
-                st.rerun()  # Updated from experimental_rerun()
     except Exception as e:
-        st.error(f"Error loading collections: {e}")
     # Add a button to clear the session and start over
     if st.button("Clear Chat History"):
         st.session_state.chat_history = []
-        st.rerun()  # Updated from experimental_rerun()

 import chromadb
 from sentence_transformers import SentenceTransformer
 import google.generativeai as genai
+import uuid
 # Initialize Gemini API
 genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo")
 if 'scraped' not in st.session_state:
     st.session_state.scraped = False
 if 'collection_name' not in st.session_state:
+    st.session_state.collection_name = "default_collection"
 # Initialize embedding model
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
     embeddings = embedding_model.encode(documents, convert_to_list=True)
     collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
+def scrape_text(url):
     try:
         response = requests.get(url)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
+        # Use the collection name stored in session state
+        collection_name = st.session_state.collection_name
         text = clean_text(soup.get_text())
         chunks = split_content_into_chunks(text)
         add_chunks_to_db(chunks, collection_name)
+        # Set scraped state to True
         st.session_state.scraped = True
         return "Scraping and processing complete. You can now ask questions!"
 def ask_question(query, collection_name):
     # Get the collection
+    collection = chroma_client.get_or_create_collection(name=collection_name)
     query_embedding = embedding_model.encode(query, convert_to_list=True)
     results = collection.query(query_embeddings=[query_embedding], n_results=2)
 with st.container():
     st.subheader("Step 1: Scrape a Website")
     url = st.text_input("Enter the URL to scrape:")
+    if url:
         if st.button("Scrape & Process"):
             with st.spinner("Scraping and processing content..."):
+                result = scrape_text(url)
                 st.success(result)
 # Q&A section - only appears after scraping is complete
 if st.session_state.scraped:
     with st.container():
         st.subheader("Step 2: Ask Questions About the Scraped Content")
+        st.write("Ask a question about the content you've scraped:")
         # Chat history
         if 'chat_history' not in st.session_state:
     # List available collections
     try:
+        # Fix for ChromaDB v0.6.0 - list_collections() now returns only names
+        collection_names = chroma_client.list_collections()
         if collection_names:
             st.write("Available data collections:")
                 st.session_state.collection_name = selected_collection
                 st.session_state.scraped = True
                 st.success(f"Loaded collection: {selected_collection}")
+                st.rerun()
     except Exception as e:
+        st.error(f"Error: {str(e)}")
     # Add a button to clear the session and start over
     if st.button("Clear Chat History"):
         st.session_state.chat_history = []
+        st.rerun()