RChaubey16 committed on
Commit
d78024f
·
verified ·
1 Parent(s): dc220d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -13
app.py CHANGED
@@ -8,13 +8,19 @@ import chromadb
8
  from sentence_transformers import SentenceTransformer
9
  import google.generativeai as genai
10
 
 
11
  genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo")
12
 
 
13
  CHROMA_PATH = "chroma_db"
14
  chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
15
  collection = chroma_client.get_or_create_collection(name="formula_1")
16
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
17
 
 
 
 
 
18
  def clean_text(text):
19
  text = re.sub(r'http\S+', '', text)
20
  text = re.sub(r'\s+', ' ', text).strip()
@@ -39,6 +45,8 @@ def scrape_text(url):
39
  text = clean_text(soup.get_text())
40
  chunks = split_content_into_chunks(text)
41
  add_chunks_to_db(chunks)
 
 
42
  return "Scraping and processing complete. You can now ask questions!"
43
  except requests.exceptions.RequestException as e:
44
  return f"Error scraping {url}: {e}"
@@ -47,29 +55,73 @@ def ask_question(query):
47
  query_embedding = embedding_model.encode(query, convert_to_list=True)
48
  results = collection.query(query_embeddings=[query_embedding], n_results=2)
49
  top_chunks = results.get("documents", [[]])[0]
 
50
  system_prompt = """
51
  You are a Formula 1 expert. You answer questions about Formula 1.
52
  But you only answer based on knowledge I'm providing you. You don't use your internal
53
  knowledge and you don't make things up.
54
  If you don't know the answer, just say: I don't know.
55
  """ + str(top_chunks)
 
56
  full_prompt = system_prompt + "\nUser Query: " + query
57
  model = genai.GenerativeModel('gemini-2.0-flash')
58
  response = model.generate_content(full_prompt)
59
  return response.text
60
 
61
- st.title("Web Scraping & Chatbot")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
- url = st.text_input("Enter a URL:")
64
- if url:
65
- if st.button("Scrape & Process"):
66
- result = scrape_text(url)
67
- st.success(result)
68
 
69
- if 'scraped' in st.session_state and st.session_state.scraped:
70
- st.subheader("Ask a Question")
71
- query = st.text_input("Enter your question:")
72
- if query:
73
- if st.button("Get Answer"):
74
- answer = ask_question(query)
75
- st.write(answer)
 
8
  from sentence_transformers import SentenceTransformer
9
  import google.generativeai as genai
10
 
11
import os

# Initialize Gemini API.
# SECURITY: the API key is read from the environment (for example a hosting
# secret) instead of being hard-coded. A key committed to source control is
# public and must be revoked and rotated.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    st.error(
        "GOOGLE_API_KEY is not set. Configure it as an environment variable "
        "or secret before starting the app."
    )
    st.stop()
genai.configure(api_key=_gemini_api_key)

# Initialize ChromaDB
CHROMA_PATH = "chroma_db"
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_or_create_collection(name="formula_1")


@st.cache_resource
def _load_embedding_model():
    """Load the sentence-embedding model once instead of on every rerun."""
    return SentenceTransformer("all-MiniLM-L6-v2")


# Streamlit re-executes this script on each interaction; the cached loader
# keeps the model from being re-instantiated every time.
embedding_model = _load_embedding_model()

# Initialize session state to track if scraping is complete
if 'scraped' not in st.session_state:
    st.session_state.scraped = False
23
+
24
  def clean_text(text):
25
  text = re.sub(r'http\S+', '', text)
26
  text = re.sub(r'\s+', ' ', text).strip()
 
45
  text = clean_text(soup.get_text())
46
  chunks = split_content_into_chunks(text)
47
  add_chunks_to_db(chunks)
48
+ # Set scraped state to True when complete
49
+ st.session_state.scraped = True
50
  return "Scraping and processing complete. You can now ask questions!"
51
  except requests.exceptions.RequestException as e:
52
  return f"Error scraping {url}: {e}"
 
55
  query_embedding = embedding_model.encode(query, convert_to_list=True)
56
  results = collection.query(query_embeddings=[query_embedding], n_results=2)
57
  top_chunks = results.get("documents", [[]])[0]
58
+
59
  system_prompt = """
60
  You are a Formula 1 expert. You answer questions about Formula 1.
61
  But you only answer based on knowledge I'm providing you. You don't use your internal
62
  knowledge and you don't make things up.
63
  If you don't know the answer, just say: I don't know.
64
  """ + str(top_chunks)
65
+
66
  full_prompt = system_prompt + "\nUser Query: " + query
67
  model = genai.GenerativeModel('gemini-2.0-flash')
68
  response = model.generate_content(full_prompt)
69
  return response.text
70
 
71
# Main UI
# Top-level Streamlit script: step 1 scrapes a URL into the vector store,
# step 2 (shown only once scraping has succeeded in this session) is a chat
# interface that answers questions from the stored chunks.
st.title("Formula 1 Web Scraper & Chatbot")

# Scraping section
with st.container():
    st.subheader("Step 1: Scrape a Formula 1 Website")
    url = st.text_input("Enter a Formula 1 related URL:")

    # The button is only rendered once a URL has been entered.
    if url and st.button("Scrape & Process"):
        with st.spinner("Scraping and processing content..."):
            result = scrape_text(url)
        # scrape_text reports request failures by returning an
        # "Error scraping ..." message rather than raising, so pick the
        # banner type from the message instead of always showing success.
        if result.startswith("Error scraping"):
            st.error(result)
        else:
            st.success(result)

# Q&A section - only appears after scraping is complete
if st.session_state.scraped:
    with st.container():
        st.subheader("Step 2: Ask Questions About Formula 1")
        st.write("The database contains information scraped from the website. Ask a question about Formula 1:")

        # Chat history
        if 'chat_history' not in st.session_state:
            st.session_state.chat_history = []

        # Display chat history
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])

        # Input for new question
        user_query = st.chat_input("Ask your Formula 1 question here")

        if user_query:
            # Add user question to chat history
            st.session_state.chat_history.append({"role": "user", "content": user_query})

            # Display user question
            with st.chat_message("user"):
                st.write(user_query)

            # Get and display answer
            with st.chat_message("assistant"):
                with st.spinner("Searching Formula 1 database..."):
                    answer = ask_question(user_query)
                st.write(answer)

            # Add answer to chat history
            st.session_state.chat_history.append({"role": "assistant", "content": answer})
else:
    st.info("Please scrape a Formula 1 website first to populate the database, then you can ask questions!")

# Add a button to clear the session and start over
if st.button("Clear Chat History and Data"):
    st.session_state.chat_history = []
    st.session_state.scraped = False

    # The label promises that the data is cleared too, so remove the stored
    # chunks; resetting the flags alone would leave old documents in the
    # persistent collection and they would leak into later answers.
    stored_ids = collection.get()["ids"]
    if stored_ids:
        collection.delete(ids=stored_ids)

    # st.experimental_rerun() is deprecated in favour of st.rerun(); fall
    # back to the old name only on Streamlit versions that predate st.rerun.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()