RChaubey16 committed on
Commit
4ba0755
·
verified ·
1 Parent(s): bd118ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -18
app.py CHANGED
@@ -7,6 +7,7 @@ from langchain.docstore.document import Document
7
  import chromadb
8
  from sentence_transformers import SentenceTransformer
9
  import google.generativeai as genai
 
10
 
11
  # Initialize Gemini API
12
  genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo")
@@ -19,7 +20,7 @@ chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
19
  if 'scraped' not in st.session_state:
20
  st.session_state.scraped = False
21
  if 'collection_name' not in st.session_state:
22
- st.session_state.collection_name = ""
23
 
24
  # Initialize embedding model
25
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -43,17 +44,20 @@ def add_chunks_to_db(chunks, collection_name):
43
  embeddings = embedding_model.encode(documents, convert_to_list=True)
44
  collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
45
 
46
- def scrape_text(url, collection_name):
47
  try:
48
  response = requests.get(url)
49
  response.raise_for_status()
50
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
 
 
51
  text = clean_text(soup.get_text())
52
  chunks = split_content_into_chunks(text)
53
  add_chunks_to_db(chunks, collection_name)
54
 
55
- # Store collection name and set scraped state to True
56
- st.session_state.collection_name = collection_name
57
  st.session_state.scraped = True
58
 
59
  return "Scraping and processing complete. You can now ask questions!"
@@ -62,7 +66,7 @@ def scrape_text(url, collection_name):
62
 
63
  def ask_question(query, collection_name):
64
  # Get the collection
65
- collection = chroma_client.get_collection(name=collection_name)
66
 
67
  query_embedding = embedding_model.encode(query, convert_to_list=True)
68
  results = collection.query(query_embeddings=[query_embedding], n_results=2)
@@ -90,24 +94,19 @@ st.title("Web Scraper & Q&A Chatbot")
90
  with st.container():
91
  st.subheader("Step 1: Scrape a Website")
92
 
93
- # Let user create a new database or use existing one
94
- collection_name = st.text_input("Enter a name for this data collection:",
95
- value="my_collection",
96
- help="This will create a new database or use an existing one with this name")
97
-
98
  url = st.text_input("Enter the URL to scrape:")
99
 
100
- if url and collection_name:
101
  if st.button("Scrape & Process"):
102
  with st.spinner("Scraping and processing content..."):
103
- result = scrape_text(url, collection_name)
104
  st.success(result)
105
 
106
  # Q&A section - only appears after scraping is complete
107
  if st.session_state.scraped:
108
  with st.container():
109
  st.subheader("Step 2: Ask Questions About the Scraped Content")
110
- st.write(f"The database '{st.session_state.collection_name}' contains information scraped from the website. Ask a question:")
111
 
112
  # Chat history
113
  if 'chat_history' not in st.session_state:
@@ -144,8 +143,8 @@ with st.sidebar:
144
 
145
  # List available collections
146
  try:
147
- all_collections = chroma_client.list_collections()
148
- collection_names = [collection.name for collection in all_collections]
149
 
150
  if collection_names:
151
  st.write("Available data collections:")
@@ -155,11 +154,11 @@ with st.sidebar:
155
  st.session_state.collection_name = selected_collection
156
  st.session_state.scraped = True
157
  st.success(f"Loaded collection: {selected_collection}")
158
- st.rerun() # Updated from experimental_rerun()
159
  except Exception as e:
160
- st.error(f"Error loading collections: {e}")
161
 
162
  # Add a button to clear the session and start over
163
  if st.button("Clear Chat History"):
164
  st.session_state.chat_history = []
165
- st.rerun() # Updated from experimental_rerun()
 
7
  import chromadb
8
  from sentence_transformers import SentenceTransformer
9
  import google.generativeai as genai
10
+ import uuid
11
 
12
  # Initialize Gemini API
13
  genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo")
 
20
  if 'scraped' not in st.session_state:
21
  st.session_state.scraped = False
22
  if 'collection_name' not in st.session_state:
23
+ st.session_state.collection_name = "default_collection"
24
 
25
  # Initialize embedding model
26
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 
44
  embeddings = embedding_model.encode(documents, convert_to_list=True)
45
  collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
46
 
47
+ def scrape_text(url):
48
  try:
49
  response = requests.get(url)
50
  response.raise_for_status()
51
  soup = BeautifulSoup(response.text, 'html.parser')
52
+
53
+ # Use the collection name stored in session state
54
+ collection_name = st.session_state.collection_name
55
+
56
  text = clean_text(soup.get_text())
57
  chunks = split_content_into_chunks(text)
58
  add_chunks_to_db(chunks, collection_name)
59
 
60
+ # Set scraped state to True
 
61
  st.session_state.scraped = True
62
 
63
  return "Scraping and processing complete. You can now ask questions!"
 
66
 
67
  def ask_question(query, collection_name):
68
  # Get the collection
69
+ collection = chroma_client.get_or_create_collection(name=collection_name)
70
 
71
  query_embedding = embedding_model.encode(query, convert_to_list=True)
72
  results = collection.query(query_embeddings=[query_embedding], n_results=2)
 
94
  with st.container():
95
  st.subheader("Step 1: Scrape a Website")
96
 
 
 
 
 
 
97
  url = st.text_input("Enter the URL to scrape:")
98
 
99
+ if url:
100
  if st.button("Scrape & Process"):
101
  with st.spinner("Scraping and processing content..."):
102
+ result = scrape_text(url)
103
  st.success(result)
104
 
105
  # Q&A section - only appears after scraping is complete
106
  if st.session_state.scraped:
107
  with st.container():
108
  st.subheader("Step 2: Ask Questions About the Scraped Content")
109
+ st.write("Ask a question about the content you've scraped:")
110
 
111
  # Chat history
112
  if 'chat_history' not in st.session_state:
 
143
 
144
  # List available collections
145
  try:
146
+ # Fix for ChromaDB v0.6.0 - list_collections() now returns only names
147
+ collection_names = chroma_client.list_collections()
148
 
149
  if collection_names:
150
  st.write("Available data collections:")
 
154
  st.session_state.collection_name = selected_collection
155
  st.session_state.scraped = True
156
  st.success(f"Loaded collection: {selected_collection}")
157
+ st.rerun()
158
  except Exception as e:
159
+ st.error(f"Error: {str(e)}")
160
 
161
  # Add a button to clear the session and start over
162
  if st.button("Clear Chat History"):
163
  st.session_state.chat_history = []
164
+ st.rerun()