shukdevdatta123 committed
Commit 0c94523 · verified · 1 Parent(s): 6bb3d54

Update app.py

Files changed (1)
  1. app.py +267 -267
app.py CHANGED
@@ -1,268 +1,268 @@
The diff re-emits the whole file, but the only content change between the two revisions is the page cap in crawl_website:

-def crawl_website(base_url, max_pages=30):
+def crawl_website(base_url, max_pages=80):

The full file as of this commit:

import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from openai import OpenAI
import time
import copy

# Function to check if URL belongs to the website
def is_valid_url(url, base_url):
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    return parsed_url.netloc == parsed_base.netloc
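
# A quick sanity check (hypothetical URLs, not part of the commit): only the
# network location is compared, so paths and query strings never affect the result.
#   is_valid_url("https://innovativeskillsbd.com/courses?id=3", "https://innovativeskillsbd.com/")  # True
#   is_valid_url("https://facebook.com/innovativeskills", "https://innovativeskillsbd.com/")        # False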

# Function to scrape content from a single page
def scrape_page(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script, style, and page-chrome elements (header/footer/nav)
            for element in soup(['script', 'style', 'header', 'footer', 'nav']):
                element.decompose()

            # Get text content
            text = soup.get_text(separator=' ', strip=True)

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            return text
        else:
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
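
# For reference, the cleanup regex collapses any run of spaces, tabs, and
# newlines into a single space (illustrative input, not from the site):
#   re.sub(r'\s+', ' ', "Learn\n\n  Python   today")  # -> "Learn Python today"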

# Function to crawl website and get all links
def crawl_website(base_url, max_pages=80):
    print(f"Starting to crawl {base_url}")
    visited_urls = set()
    urls_to_visit = [base_url]
    site_content = {}

    while urls_to_visit and len(visited_urls) < max_pages:
        current_url = urls_to_visit.pop(0)

        if current_url in visited_urls:
            continue

        print(f"Crawling: {current_url}")
        visited_urls.add(current_url)

        try:
            response = requests.get(current_url, timeout=10)
            if response.status_code == 200:
                # Get content of the current page
                content = scrape_page(current_url)
                if content:
                    site_content[current_url] = content

                # Find all links on the page
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    full_url = urljoin(current_url, href)

                    # Only follow links that are part of the same website
                    if is_valid_url(full_url, base_url) and full_url not in visited_urls:
                        urls_to_visit.append(full_url)

            # Add a small delay to be respectful
            time.sleep(0.5)

        except Exception as e:
            print(f"Error visiting {current_url}: {e}")

    print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
    return site_content
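
# A quick smoke test of the cap (hypothetical call, not part of the app): the
# loop is a breadth-first walk from base_url that stops once max_pages URLs
# have been visited, so site_content can never hold more entries than that.
#   content = crawl_website("https://innovativeskillsbd.com/", max_pages=5)
#   print(len(content))  # at most 5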

# Function that creates a context from the scraped content
def create_context(site_content, max_context_length=8000):
    context = "Content from https://innovativeskillsbd.com website:\n\n"

    for url, content in site_content.items():
        # Add URL and a portion of its content (limited to keep context manageable)
        page_content = f"Page: {url}\n{content[:1000]}...\n\n"

        # Check if adding this would exceed max context length
        if len(context) + len(page_content) > max_context_length:
            break

        context += page_content

    return context
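
# Rough budget math for the defaults: each page contributes up to ~1,000
# characters of excerpt plus a "Page: <url>" header, so only about the first 7
# crawled pages fit inside the 8,000-character window; later pages are
# silently dropped.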

# Function to fix URLs in text to ensure they point to the correct domain
def fix_urls_in_text(text):
    # Look for URLs in the text
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, text)

    for url in urls:
        # If the URL contains the wrong domain but appears to be an InnovativeSkills link
        if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
            # Create the correct URL by replacing the domain
            path = urlparse(url).path
            correct_url = f"https://innovativeskillsbd.com{path}"
            # Replace in the text
            text = text.replace(url, correct_url)

    return text
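
# Example rewrite (hypothetical model output): a response containing
#   "Enroll at https://innovative-skill.com/student-job-success"
# is returned as
#   "Enroll at https://innovativeskillsbd.com/student-job-success"
# Note that only urlparse(url).path is kept, so any query string is dropped.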

# Function to query the DeepSeek V3 model
def query_model(api_key, messages):
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )

        completion = client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://innovativeskillsbd.com",
                "X-Title": "InnovativeSkills ChatBot",
            },
            model="deepseek/deepseek-chat-v3-0324:free",
            messages=messages
        )

        response = completion.choices[0].message.content

        # Fix any incorrect URLs - ensure all links point to the correct domain
        response = fix_urls_in_text(response)

        return response
    except Exception as e:
        return f"Error querying the model: {str(e)}"
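
# The messages list this function expects follows the standard OpenAI chat
# format (illustrative values):
#   [
#       {"role": "system", "content": "...site context..."},
#       {"role": "user", "content": "What courses do you offer?"},
#   ]
# OpenRouter exposes an OpenAI-compatible endpoint, which is why the OpenAI
# client works here with a custom base_url.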

# Function to answer questions based on website content
def answer_question(api_key, question, site_content, history):
    if not api_key:
        return "Please enter your OpenRouter API key.", history

    # Prepare the context from scraped content
    context = create_context(site_content)

    # Create system message with context
    system_message = {
        "role": "system",
        "content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills.
        Use the following content from the website to answer user questions. If the question is not related to the website or the
        information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.

        IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
        For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.

        {context}"""
    }

    # Create user message
    user_message = {"role": "user", "content": question}

    # Create message history for the API call
    messages = [system_message]

    # Add conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    # Add current question
    messages.append(user_message)

    # Query the model
    response = query_model(api_key, messages)

    # Update history by adding the new exchange
    new_history = copy.deepcopy(history)
    new_history.append((question, response))
    return response, new_history

# Scrape the website when the app starts
def init_scraper(progress=gr.Progress()):
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    site_content = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return site_content

# Create Gradio interface
def create_interface(site_content):
    with gr.Blocks() as app:
        gr.Markdown("# InnovativeSkills Bangladesh Chatbot")
        gr.Markdown("This chatbot uses DeepSeek V3 to answer questions about the InnovativeSkills Bangladesh website.")

        with gr.Row():
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                placeholder="Enter your OpenRouter API key",
                type="password"
            )

        chatbot = gr.Chatbot(height=500)
        msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")

        # Container for site content (hidden from UI)
        site_content_state = gr.State(site_content)

        # Container for chat history
        chat_history = gr.State([])

        # Button to clear the conversation
        clear = gr.Button("Clear conversation")

        # Events
        def user_input(api_key, message, site_content, history):
            if not message:
                # Leave the display unchanged; returning the chatbot component
                # object here (as the original did) is a bug
                return "", [list(pair) for pair in history], history

            # Process the response
            bot_response, updated_history = answer_question(api_key, message, site_content, history)

            # Format history for chatbot display
            chatbot_display = []
            for user_msg, bot_msg in updated_history:
                chatbot_display.append([user_msg, bot_msg])

            return "", chatbot_display, updated_history

        msg.submit(
            user_input,
            inputs=[api_key_input, msg, site_content_state, chat_history],
            outputs=[msg, chatbot, chat_history]
        )

        def clear_chat():
            return "", [], []

        clear.click(
            clear_chat,
            outputs=[msg, chatbot, chat_history]
        )

    return app

# Initialize and launch the app
def main():
    print("Starting to initialize the InnovativeSkills chatbot...")

    # First, scrape the website content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")

    # Create the Gradio interface with the site content
    app = create_interface(site_content)

    # Launch the app
    app.launch()

if __name__ == "__main__":
    main()
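
# To run locally (a sketch, assuming gradio, requests, beautifulsoup4, and
# openai are installed):
#   pip install gradio requests beautifulsoup4 openai
#   python app.py
# The crawl of up to 80 pages happens before the UI comes up, so with the
# 0.5-second delay per page the startup can take roughly 40 seconds or more.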