shukdevdatta123 committed
Commit 0c94523 · verified · 1 Parent(s): 6bb3d54

Update app.py

Files changed (1)
  1. app.py +267 -267
app.py CHANGED
@@ -1,268 +1,268 @@
The diff re-emits the whole file, but the only content change between the two revisions is the page cap in crawl_website:

-def crawl_website(base_url, max_pages=30):
+def crawl_website(base_url, max_pages=80):

The full file as of this commit:

import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from openai import OpenAI
import time
import copy

# Function to check if URL belongs to the website
def is_valid_url(url, base_url):
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    return parsed_url.netloc == parsed_base.netloc
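
# A quick sanity check (hypothetical URLs, not part of the commit): only the
# network location is compared, so paths and query strings never affect the result.
#   is_valid_url("https://innovativeskillsbd.com/courses?id=3", "https://innovativeskillsbd.com/")  # True
#   is_valid_url("https://facebook.com/innovativeskills", "https://innovativeskillsbd.com/")        # False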

# Function to scrape content from a single page
def scrape_page(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script, style, and page-chrome elements (header/footer/nav)
            for element in soup(['script', 'style', 'header', 'footer', 'nav']):
                element.decompose()

            # Get text content
            text = soup.get_text(separator=' ', strip=True)

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            return text
        else:
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
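
# For reference, the cleanup regex collapses any run of spaces, tabs, and
# newlines into a single space (illustrative input, not from the site):
#   re.sub(r'\s+', ' ', "Learn\n\n  Python   today")  # -> "Learn Python today"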

# Function to crawl website and get all links
def crawl_website(base_url, max_pages=80):
    print(f"Starting to crawl {base_url}")
    visited_urls = set()
    urls_to_visit = [base_url]
    site_content = {}

    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop(0)

        if current_url in visited_urls:
            continue

        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)

        try:
            response = requests.get(current_url, timeout=10)
            if response.status_code == 200:
                # Get content of the current page
                content = scrape_page(current_url)
                if content:
                    site_content[current_url] = content

                # Find all links on the page
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    full_url = urljoin(current_url, href)

                    # Only follow links that are part of the same website
                    if is_valid_url(full_url, base_url) and full_url not in visited_urls:
                        urls_to_visit.append(full_url)

            # Add a small delay to be respectful
            time.sleep(0.5)

        except Exception as e:
            print(f"Error visiting {current_url}: {e}")

    print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
    return site_content
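
# A quick smoke test of the cap (hypothetical call, not part of the app): the
# loop is a breadth-first walk from base_url that stops once max_pages URLs
# have been visited, so site_content can never hold more entries than that.
#   content = crawl_website("https://innovativeskillsbd.com/", max_pages=5)
#   print(len(content))  # at most 5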

# Function that creates a context from the scraped content
def create_context(site_content, max_context_length=8000):
    context = "Content from https://innovativeskillsbd.com website:\n\n"

    for url, content in site_content.items():
        # Add URL and a portion of its content (limited to keep context manageable)
        page_content = f"Page: {url}\n{content[:1000]}...\n\n"

        # Check if adding this would exceed max context length
        if len(context) + len(page_content) > max_context_length:
            break

        context += page_content

    return context
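
# Rough budget math for the defaults: each page contributes up to ~1,000
# characters of excerpt plus a "Page: <url>" header, so only about the first 7
# crawled pages fit inside the 8,000-character window; later pages are
# silently dropped.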

# Function to fix URLs in text to ensure they point to the correct domain
def fix_urls_in_text(text):
    # Look for URLs in the text
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, text)

    for url in urls:
        # If the URL contains the wrong domain but appears to be an InnovativeSkills link
        if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
            # Create the correct URL by replacing the domain
            path = urlparse(url).path
            correct_url = f"https://innovativeskillsbd.com{path}"
            # Replace in the text
            text = text.replace(url, correct_url)

    return text
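
# Example rewrite (hypothetical model output): a response containing
#   "Enroll at https://innovative-skill.com/student-job-success"
# is returned as
#   "Enroll at https://innovativeskillsbd.com/student-job-success"
# Note that only urlparse(url).path is kept, so any query string is dropped.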

# Function to query the DeepSeek V3 model
def query_model(api_key, messages):
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://innovativeskillsbd.com",
                "X-Title": "InnovativeSkills ChatBot",
            },
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=messages
        )

        response = completion.choices[0].message.content

        # Fix any incorrect URLs - ensure all links point to the correct domain
        response = fix_urls_in_text(response)

        return response
    except Exception as e:
        return f"Error querying the model: {str(e)}"
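
# The messages list this function expects follows the standard OpenAI chat
# format (illustrative values):
#   [
#       {"role": "system", "content": "...site context..."},
#       {"role": "user", "content": "What courses do you offer?"},
#   ]
# OpenRouter exposes an OpenAI-compatible endpoint, which is why the OpenAI
# client works here with a custom base_url.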

# Function to answer questions based on website content
def answer_question(api_key, question, site_content, history):
    if not api_key:
        return "Please enter your OpenRouter API key.", history

    # Prepare the context from scraped content
    context = create_context(site_content)

    # Create system message with context
    system_message = {
        "role": "system",
        "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills.
        Use the following content from the website to answer user questions. If the question is not related to the website or the
        information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.

        IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
        For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.

        {context}"""
    }

    # Create user message
    user_message = {"role": "user", "content": question}

    # Create message history for the API call
    messages = [system_message]

    # Add conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    # Add current question
    messages.append(user_message)

    # Query the model
    response = query_model(api_key, messages)

    # Update history by adding the new exchange
    new_history = copy.deepcopy(history)
    new_history.append((question, response))
    return response, new_history

# Scrape the website when the app starts
def init_scraper(progress=gr.Progress()):
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    site_content = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return site_content

# Create Gradio interface
def create_interface(site_content):
    with gr.Blocks() as app:
        gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
        gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about the InnovativeSkills Bangladesh website.")

        with gr.Row():
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                placeholder="Enter your OpenRouter API key",
                type="password"
            )

        chatbot = gr.Chatbot(height=500)
        msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")

        # Container for site content (hidden from UI)
        site_content_state = gr.State(site_content)

        # Container for chat history
        chat_history = gr.State([])

        # Button to clear the conversation
        clear = gr.Button("Clear conversation")

        # Events
        def user_input(api_key, message, site_content, history):
            if not message:
                # Leave the display unchanged; returning the chatbot component
                # object here (as the original did) is a bug
                return "", [list(pair) for pair in history], history

            # Process the response
            bot_response, updated_history = answer_question(api_key, message, site_content, history)

            # Format history for chatbot display
            chatbot_display = []
            for user_msg, bot_msg in updated_history:
                chatbot_display.append([user_msg, bot_msg])

            return "", chatbot_display, updated_history

        msg.submit(
            user_input,
            inputs=[api_key_input, msg, site_content_state, chat_history],
            outputs=[msg, chatbot, chat_history]
        )

        def clear_chat():
            return "", [], []

        clear.click(
            clear_chat,
            outputs=[msg, chatbot, chat_history]
        )

    return app

# Initialize and launch the app
def main():
    print("Starting to initialize the InnovativeSkills chatbot...")

    # First, scrape the website content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")

    # Create the Gradio interface with the site content
    app = create_interface(site_content)

    # Launch the app
    app.launch()

if __name__ == "__main__":
    main()
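
# To run locally (a sketch, assuming gradio, requests, beautifulsoup4, and
# openai are installed):
#   pip install gradio requests beautifulsoup4 openai
#   python app.py
# The crawl of up to 80 pages happens before the UI comes up, so with the
# 0.5-second delay per page the startup can take roughly 40 seconds or more.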