RChaubey16 committed on
Commit
bd118ce
·
verified ·
1 Parent(s): d78024f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -32
app.py CHANGED
@@ -14,12 +14,15 @@ genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo")
14
  # Initialize ChromaDB
15
  CHROMA_PATH = "chroma_db"
16
  chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
17
- collection = chroma_client.get_or_create_collection(name="formula_1")
18
- embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
19
 
20
- # Initialize session state to track if scraping is complete
21
  if 'scraped' not in st.session_state:
22
  st.session_state.scraped = False
 
 
 
 
 
23
 
24
  def clean_text(text):
25
  text = re.sub(r'http\S+', '', text)
@@ -31,37 +34,49 @@ def split_content_into_chunks(content):
31
  documents = [Document(page_content=content)]
32
  return text_splitter.split_documents(documents)
33
 
34
- def add_chunks_to_db(chunks):
 
 
 
35
  documents = [chunk.page_content for chunk in chunks]
36
  ids = [f"ID{i}" for i in range(len(chunks))]
37
  embeddings = embedding_model.encode(documents, convert_to_list=True)
38
  collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
39
 
40
- def scrape_text(url):
41
  try:
42
  response = requests.get(url)
43
  response.raise_for_status()
44
  soup = BeautifulSoup(response.text, 'html.parser')
45
  text = clean_text(soup.get_text())
46
  chunks = split_content_into_chunks(text)
47
- add_chunks_to_db(chunks)
48
- # Set scraped state to True when complete
 
 
49
  st.session_state.scraped = True
 
50
  return "Scraping and processing complete. You can now ask questions!"
51
  except requests.exceptions.RequestException as e:
52
  return f"Error scraping {url}: {e}"
53
 
54
- def ask_question(query):
 
 
 
55
  query_embedding = embedding_model.encode(query, convert_to_list=True)
56
  results = collection.query(query_embeddings=[query_embedding], n_results=2)
57
  top_chunks = results.get("documents", [[]])[0]
58
 
59
- system_prompt = """
60
- You are a Formula 1 expert. You answer questions about Formula 1.
61
- But you only answer based on knowledge I'm providing you. You don't use your internal
62
- knowledge and you don't make things up.
63
- If you don't know the answer, just say: I don't know.
64
- """ + str(top_chunks)
 
 
 
65
 
66
  full_prompt = system_prompt + "\nUser Query: " + query
67
  model = genai.GenerativeModel('gemini-2.0-flash')
@@ -69,24 +84,30 @@ def ask_question(query):
69
  return response.text
70
 
71
  # Main UI
72
- st.title("Formula 1 Web Scraper & Chatbot")
73
 
74
  # Scraping section
75
  with st.container():
76
- st.subheader("Step 1: Scrape a Formula 1 Website")
77
- url = st.text_input("Enter a Formula 1 related URL:")
 
 
 
 
78
 
79
- if url:
 
 
80
  if st.button("Scrape & Process"):
81
  with st.spinner("Scraping and processing content..."):
82
- result = scrape_text(url)
83
  st.success(result)
84
 
85
  # Q&A section - only appears after scraping is complete
86
  if st.session_state.scraped:
87
  with st.container():
88
- st.subheader("Step 2: Ask Questions About Formula 1")
89
- st.write("The database contains information scraped from the website. Ask a question about Formula 1:")
90
 
91
  # Chat history
92
  if 'chat_history' not in st.session_state:
@@ -98,7 +119,7 @@ if st.session_state.scraped:
98
  st.write(message["content"])
99
 
100
  # Input for new question
101
- user_query = st.chat_input("Ask your Formula 1 question here")
102
 
103
  if user_query:
104
  # Add user question to chat history
@@ -110,18 +131,35 @@ if st.session_state.scraped:
110
 
111
  # Get and display answer
112
  with st.chat_message("assistant"):
113
- with st.spinner("Searching Formula 1 database..."):
114
- answer = ask_question(user_query)
115
  st.write(answer)
116
 
117
  # Add answer to chat history
118
  st.session_state.chat_history.append({"role": "assistant", "content": answer})
119
 
120
- else:
121
- st.info("Please scrape a Formula 1 website first to populate the database, then you can ask questions!")
122
-
123
- # Add a button to clear the session and start over
124
- if st.button("Clear Chat History and Data"):
125
- st.session_state.chat_history = []
126
- st.session_state.scraped = False
127
- st.experimental_rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Initialize ChromaDB
15
  CHROMA_PATH = "chroma_db"
16
  chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
 
 
17
 
18
+ # Initialize session state to track if scraping is complete and collection name
19
  if 'scraped' not in st.session_state:
20
  st.session_state.scraped = False
21
+ if 'collection_name' not in st.session_state:
22
+ st.session_state.collection_name = ""
23
+
24
+ # Initialize embedding model
25
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
26
 
27
  def clean_text(text):
28
  text = re.sub(r'http\S+', '', text)
 
34
  documents = [Document(page_content=content)]
35
  return text_splitter.split_documents(documents)
36
 
37
+ def add_chunks_to_db(chunks, collection_name):
38
+ # Create or get collection
39
+ collection = chroma_client.get_or_create_collection(name=collection_name)
40
+
41
  documents = [chunk.page_content for chunk in chunks]
42
  ids = [f"ID{i}" for i in range(len(chunks))]
43
  embeddings = embedding_model.encode(documents, convert_to_list=True)
44
  collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
45
 
46
+ def scrape_text(url, collection_name):
47
  try:
48
  response = requests.get(url)
49
  response.raise_for_status()
50
  soup = BeautifulSoup(response.text, 'html.parser')
51
  text = clean_text(soup.get_text())
52
  chunks = split_content_into_chunks(text)
53
+ add_chunks_to_db(chunks, collection_name)
54
+
55
+ # Store collection name and set scraped state to True
56
+ st.session_state.collection_name = collection_name
57
  st.session_state.scraped = True
58
+
59
  return "Scraping and processing complete. You can now ask questions!"
60
  except requests.exceptions.RequestException as e:
61
  return f"Error scraping {url}: {e}"
62
 
63
+ def ask_question(query, collection_name):
64
+ # Get the collection
65
+ collection = chroma_client.get_collection(name=collection_name)
66
+
67
  query_embedding = embedding_model.encode(query, convert_to_list=True)
68
  results = collection.query(query_embeddings=[query_embedding], n_results=2)
69
  top_chunks = results.get("documents", [[]])[0]
70
 
71
+ system_prompt = f"""
72
+ You are a helpful assistant. You answer questions based on the provided context.
73
+ Only answer based on the knowledge I'm providing you. Don't use your internal
74
+ knowledge and don't make things up.
75
+ If you don't know the answer based on the provided context, just say: "I don't have enough information to answer that question based on the scraped content."
76
+
77
+ Context information:
78
+ {str(top_chunks)}
79
+ """
80
 
81
  full_prompt = system_prompt + "\nUser Query: " + query
82
  model = genai.GenerativeModel('gemini-2.0-flash')
 
84
  return response.text
85
 
86
  # Main UI
87
+ st.title("Web Scraper & Q&A Chatbot")
88
 
89
  # Scraping section
90
  with st.container():
91
+ st.subheader("Step 1: Scrape a Website")
92
+
93
+ # Let user create a new database or use existing one
94
+ collection_name = st.text_input("Enter a name for this data collection:",
95
+ value="my_collection",
96
+ help="This will create a new database or use an existing one with this name")
97
 
98
+ url = st.text_input("Enter the URL to scrape:")
99
+
100
+ if url and collection_name:
101
  if st.button("Scrape & Process"):
102
  with st.spinner("Scraping and processing content..."):
103
+ result = scrape_text(url, collection_name)
104
  st.success(result)
105
 
106
  # Q&A section - only appears after scraping is complete
107
  if st.session_state.scraped:
108
  with st.container():
109
+ st.subheader("Step 2: Ask Questions About the Scraped Content")
110
+ st.write(f"The database '{st.session_state.collection_name}' contains information scraped from the website. Ask a question:")
111
 
112
  # Chat history
113
  if 'chat_history' not in st.session_state:
 
119
  st.write(message["content"])
120
 
121
  # Input for new question
122
+ user_query = st.chat_input("Ask your question here")
123
 
124
  if user_query:
125
  # Add user question to chat history
 
131
 
132
  # Get and display answer
133
  with st.chat_message("assistant"):
134
+ with st.spinner("Searching database..."):
135
+ answer = ask_question(user_query, st.session_state.collection_name)
136
  st.write(answer)
137
 
138
  # Add answer to chat history
139
  st.session_state.chat_history.append({"role": "assistant", "content": answer})
140
 
141
+ # Selection of existing collections
142
+ with st.sidebar:
143
+ st.header("Database Management")
144
+
145
+ # List available collections
146
+ try:
147
+ all_collections = chroma_client.list_collections()
148
+ collection_names = [collection.name for collection in all_collections]
149
+
150
+ if collection_names:
151
+ st.write("Available data collections:")
152
+ selected_collection = st.selectbox("Select a collection to query:", collection_names)
153
+
154
+ if selected_collection and st.button("Load Selected Collection"):
155
+ st.session_state.collection_name = selected_collection
156
+ st.session_state.scraped = True
157
+ st.success(f"Loaded collection: {selected_collection}")
158
+ st.rerun() # Updated from experimental_rerun()
159
+ except Exception as e:
160
+ st.error(f"Error loading collections: {e}")
161
+
162
+ # Add a button to clear the session and start over
163
+ if st.button("Clear Chat History"):
164
+ st.session_state.chat_history = []
165
+ st.rerun() # Updated from experimental_rerun()