RChaubey16 committed on
Commit
7ff6802
·
verified ·
1 Parent(s): 8f4ddfa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -119
app.py CHANGED
@@ -7,7 +7,6 @@ from langchain.docstore.document import Document
7
  import chromadb
8
  from sentence_transformers import SentenceTransformer
9
  import google.generativeai as genai
10
- import uuid
11
 
12
  # Page configuration
13
  st.set_page_config(layout="wide")
@@ -19,7 +18,7 @@ genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo")
19
  CHROMA_PATH = "chroma_db"
20
  chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
21
 
22
- # Initialize session state to track if scraping is complete and collection name
23
  if 'scraped' not in st.session_state:
24
  st.session_state.scraped = False
25
  if 'collection_name' not in st.session_state:
@@ -31,23 +30,17 @@ if 'chat_history' not in st.session_state:
31
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
32
 
33
  def clean_text(text):
34
- text = re.sub(r'http\S+', '', text)
35
- text = re.sub(r'\s+', ' ', text).strip()
36
- return text
37
 
38
  def split_content_into_chunks(content):
39
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
40
- documents = [Document(page_content=content)]
41
- return text_splitter.split_documents(documents)
42
 
43
  def add_chunks_to_db(chunks, collection_name):
44
- # Create or get collection
45
  collection = chroma_client.get_or_create_collection(name=collection_name)
46
-
47
  documents = [chunk.page_content for chunk in chunks]
48
- ids = [f"ID{i}" for i in range(len(chunks))]
49
  embeddings = embedding_model.encode(documents, convert_to_list=True)
50
- collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
51
 
52
  def scrape_text(url):
53
  try:
@@ -55,141 +48,62 @@ def scrape_text(url):
55
  response.raise_for_status()
56
  soup = BeautifulSoup(response.text, 'html.parser')
57
 
58
- # Extract domain for collection name
59
- collection_name = st.session_state.collection_name
60
-
61
  text = clean_text(soup.get_text())
62
  chunks = split_content_into_chunks(text)
63
- add_chunks_to_db(chunks, collection_name)
64
 
65
- # Set scraped state to True
66
  st.session_state.scraped = True
67
-
68
  return "Scraping and processing complete. You can now ask questions!"
69
  except requests.exceptions.RequestException as e:
70
  return f"Error scraping {url}: {e}"
71
 
72
  def ask_question(query, collection_name):
73
- # Get the collection
74
  collection = chroma_client.get_or_create_collection(name=collection_name)
75
-
76
  query_embedding = embedding_model.encode(query, convert_to_list=True)
77
  results = collection.query(query_embeddings=[query_embedding], n_results=2)
78
  top_chunks = results.get("documents", [[]])[0]
79
 
80
  system_prompt = f"""
81
- You are a helpful assistant. You answer questions based on the provided context.
82
- Only answer based on the knowledge I'm providing you. Don't use your internal
83
- knowledge and don't make things up.
84
- If you don't know the answer based on the provided context, just say: "I don't have enough information to answer that question based on the scraped content."
85
-
86
- Context information:
87
  {str(top_chunks)}
88
  """
89
 
90
- full_prompt = system_prompt + "\nUser Query: " + query
91
  model = genai.GenerativeModel('gemini-2.0-flash')
92
- response = model.generate_content(full_prompt)
93
  return response.text
94
 
95
- # Create two columns: sidebar for database and main content
96
- col1, main_col = st.columns([1, 3])
97
-
98
- # Database management sidebar
99
- with col1:
100
  st.header("Database Management")
101
-
102
- # List available collections
103
- try:
104
- # Fix for ChromaDB v0.6.0 - list_collections() now returns only names
105
- collection_names = chroma_client.list_collections()
106
-
107
- if collection_names:
108
- st.write("Available data collections:")
109
- selected_collection = st.selectbox("Select a collection to query:", collection_names)
110
-
111
- if selected_collection and st.button("Load Selected Collection"):
112
- st.session_state.collection_name = selected_collection
113
- st.session_state.scraped = True
114
- st.success(f"Loaded collection: {selected_collection}")
115
- st.rerun()
116
- except Exception as e:
117
- st.error(f"Error: {str(e)}")
118
-
119
- # Add a button to clear the session and start over
120
  if st.button("Clear Chat History"):
121
  st.session_state.chat_history = []
122
  st.rerun()
123
 
124
- # Scraping section
125
  st.header("Step 1: Scrape a Website")
126
-
127
- url = st.text_input("Enter the URL to scrape:")
128
-
129
- if url:
130
- if st.button("Scrape & Process"):
131
- with st.spinner("Scraping and processing content..."):
132
- result = scrape_text(url)
133
- st.success(result)
134
 
135
- # Main content area
136
- with main_col:
137
- st.title("Web Scraper & Q&A Chatbot")
138
-
139
- # Use a container with custom CSS for the scrollable chat area
140
- chat_container = st.container()
 
141
 
142
- # Apply custom CSS for the chat container
143
- st.markdown("""
144
- <style>
145
- .chat-container {
146
- height: 500px;
147
- overflow-y: auto;
148
- border: 1px solid #ddd;
149
- border-radius: 5px;
150
- padding: 15px;
151
- margin-bottom: 10px;
152
- background-color: #f9f9f9;
153
- }
154
- .stChatInputContainer {
155
- position: sticky;
156
- bottom: 0;
157
- background-color: white;
158
- padding-top: 10px;
159
- z-index: 100;
160
- }
161
- </style>
162
- """, unsafe_allow_html=True)
163
-
164
- # Q&A section - only appears after scraping is complete
165
- if st.session_state.scraped:
166
- st.subheader("Step 2: Ask Questions About the Scraped Content")
167
-
168
- # Use a div with our custom class for the scrollable area
169
- st.markdown('<div class="chat-container">', unsafe_allow_html=True)
170
 
171
- # Display chat history
172
- for message in st.session_state.chat_history:
173
- with chat_container.chat_message(message["role"]):
174
- st.write(message["content"])
175
-
176
- st.markdown('</div>', unsafe_allow_html=True)
177
-
178
- # Input for new question - always at the bottom
179
- user_query = st.chat_input("Ask your question here")
180
-
181
- if user_query:
182
- # Add user question to chat history
183
- st.session_state.chat_history.append({"role": "user", "content": user_query})
184
-
185
- # Get answer
186
- with st.spinner("Searching database..."):
187
- answer = ask_question(user_query, st.session_state.collection_name)
188
-
189
- # Add answer to chat history
190
- st.session_state.chat_history.append({"role": "assistant", "content": answer})
191
-
192
- # Rerun to update the UI with new messages
193
- st.rerun()
194
- else:
195
- st.info("Please scrape a website or load a collection to start chatting.")
 
7
  import chromadb
8
  from sentence_transformers import SentenceTransformer
9
  import google.generativeai as genai
 
10
 
11
  # Page configuration
12
  st.set_page_config(layout="wide")
 
18
  CHROMA_PATH = "chroma_db"
19
  chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
20
 
21
+ # Initialize session state
22
  if 'scraped' not in st.session_state:
23
  st.session_state.scraped = False
24
  if 'collection_name' not in st.session_state:
 
30
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
31
 
32
  def clean_text(text):
33
+ return re.sub(r'\s+', ' ', re.sub(r'http\S+', '', text)).strip()
 
 
34
 
35
  def split_content_into_chunks(content):
36
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
37
+ return text_splitter.split_documents([Document(page_content=content)])
 
38
 
39
  def add_chunks_to_db(chunks, collection_name):
 
40
  collection = chroma_client.get_or_create_collection(name=collection_name)
 
41
  documents = [chunk.page_content for chunk in chunks]
 
42
  embeddings = embedding_model.encode(documents, convert_to_list=True)
43
+ collection.upsert(documents=documents, ids=[f"ID{i}" for i in range(len(chunks))], embeddings=embeddings)
44
 
45
  def scrape_text(url):
46
  try:
 
48
  response.raise_for_status()
49
  soup = BeautifulSoup(response.text, 'html.parser')
50
 
 
 
 
51
  text = clean_text(soup.get_text())
52
  chunks = split_content_into_chunks(text)
53
+ add_chunks_to_db(chunks, st.session_state.collection_name)
54
 
 
55
  st.session_state.scraped = True
 
56
  return "Scraping and processing complete. You can now ask questions!"
57
  except requests.exceptions.RequestException as e:
58
  return f"Error scraping {url}: {e}"
59
 
60
  def ask_question(query, collection_name):
 
61
  collection = chroma_client.get_or_create_collection(name=collection_name)
 
62
  query_embedding = embedding_model.encode(query, convert_to_list=True)
63
  results = collection.query(query_embeddings=[query_embedding], n_results=2)
64
  top_chunks = results.get("documents", [[]])[0]
65
 
66
  system_prompt = f"""
67
+ You are a helpful assistant. Answer only from the provided context.
68
+ If you lack information, say: "I don't have enough information to answer that question."
69
+ Context:
 
 
 
70
  {str(top_chunks)}
71
  """
72
 
 
73
  model = genai.GenerativeModel('gemini-2.0-flash')
74
+ response = model.generate_content(system_prompt + "\nUser Query: " + query)
75
  return response.text
76
 
77
+ # Sidebar
78
+ with st.sidebar:
 
 
 
79
  st.header("Database Management")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  if st.button("Clear Chat History"):
81
  st.session_state.chat_history = []
82
  st.rerun()
83
 
 
84
  st.header("Step 1: Scrape a Website")
85
+ url = st.text_input("Enter URL:")
86
+ if url and st.button("Scrape & Process"):
87
+ with st.spinner("Scraping..."):
88
+ st.success(scrape_text(url))
 
 
 
 
89
 
90
+ # Main content
91
+ st.title("Web Scraper & Q&A Chatbot")
92
+ if st.session_state.scraped:
93
+ st.subheader("Step 2: Ask Questions")
94
+ for message in st.session_state.chat_history:
95
+ with st.chat_message(message["role"]):
96
+ st.write(message["content"])
97
 
98
+ user_query = st.chat_input("Ask your question here")
99
+ if user_query:
100
+ st.session_state.chat_history.append({"role": "user", "content": user_query})
101
+ with st.spinner("Searching..."):
102
+ answer = ask_question(user_query, st.session_state.collection_name)
103
+ st.session_state.chat_history.append({"role": "assistant", "content": answer})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
+ # Limit chat history to 6 messages
106
+ st.session_state.chat_history = st.session_state.chat_history[-6:]
107
+ st.rerun()
108
+ else:
109
+ st.info("Please scrape a website first.")