import streamlit as st
import requests
import re
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document  # langchain.docstore was removed in newer langchain releases
import chromadb
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
import uuid
import os
from urllib.parse import urlparse
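# Quick start (the filename and package list are assumptions, adjust for your setup):
#   pip install streamlit requests beautifulsoup4 langchain-text-splitters \
#       chromadb sentence-transformers google-generativeai
#   GEMINI_API_KEY=<your-key> streamlit run app.py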
# Page configuration
st.set_page_config(layout="wide")
# Initialize Gemini API (read the key from an environment variable rather than hard-coding a secret)
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# Initialize ChromaDB
CHROMA_PATH = "chroma_db"
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
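# PersistentClient writes the vector store to ./chroma_db on disk, so scraped
# collections survive app restarts and reappear in the sidebar on the next run.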
# Initialize session state to track scraping status, the active collection name, and chat history
if 'scraped' not in st.session_state:
    st.session_state.scraped = False
if 'collection_name' not in st.session_state:
    st.session_state.collection_name = "default_collection"
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
# Initialize embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
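# all-MiniLM-L6-v2 produces 384-dimensional embeddings; the weights are
# downloaded and cached by sentence-transformers on first use.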
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # strip URLs
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace
    return text
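# Example: clean_text("Read more at https://example.com  today") -> "Read more at today"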
def split_content_into_chunks(content):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
    documents = [Document(page_content=content)]
    return text_splitter.split_documents(documents)
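# chunk_size=1000 with chunk_overlap=200 means consecutive chunks share ~200
# characters, so a sentence that straddles a chunk boundary still appears
# intact in at least one chunk.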
def add_chunks_to_db(chunks, collection_name):
    # Create or get the collection
    collection = chroma_client.get_or_create_collection(name=collection_name)
    documents = [chunk.page_content for chunk in chunks]
    # Use UUIDs so repeated scrapes don't all reuse "ID0", "ID1", ... and overwrite each other
    ids = [str(uuid.uuid4()) for _ in chunks]
    # SentenceTransformer.encode returns a numpy array; ChromaDB expects plain lists of floats
    embeddings = embedding_model.encode(documents).tolist()
    collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
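# Note: upsert() overwrites entries whose IDs already exist; with random UUIDs
# every call appends instead, so re-scraping the same page stores duplicate chunks.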
def scrape_text(url):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Derive the collection name from the site's domain, sanitized to satisfy
        # ChromaDB's naming rules (alphanumerics, hyphens, underscores)
        domain = re.sub(r'[^a-zA-Z0-9_-]', '-', urlparse(url).netloc)
        collection_name = domain or st.session_state.collection_name
        st.session_state.collection_name = collection_name
        text = clean_text(soup.get_text())
        chunks = split_content_into_chunks(text)
        add_chunks_to_db(chunks, collection_name)
        # Set scraped state to True so the Q&A panel is shown
        st.session_state.scraped = True
        return "Scraping and processing complete. You can now ask questions!"
    except requests.exceptions.RequestException as e:
        return f"Error scraping {url}: {e}"
def ask_question(query, collection_name):
    # Get the collection
    collection = chroma_client.get_or_create_collection(name=collection_name)
    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=2)
    top_chunks = results.get("documents", [[]])[0]
    system_prompt = f"""
    You are a helpful assistant. You answer questions based on the provided context.
    Only answer based on the knowledge I'm providing you. Don't use your internal
    knowledge and don't make things up.
    If you don't know the answer based on the provided context, just say: "I don't have enough information to answer that question based on the scraped content."

    Context information:
    {str(top_chunks)}
    """
    full_prompt = system_prompt + "\nUser Query: " + query
    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(full_prompt)
    return response.text
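# ask_question() implements a minimal retrieval-augmented generation (RAG) loop:
# embed the query, fetch the 2 nearest chunks from ChromaDB, and ground the
# Gemini prompt in those chunks so answers stay limited to the scraped content.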
# Create two columns: sidebar for database and main content
col1, main_col = st.columns([1, 3])
# Database management sidebar
with col1:
    st.header("Database Management")
    # List available collections
    try:
        # ChromaDB v0.6.0 returns collection names directly from list_collections();
        # older versions return Collection objects, so fall back to .name when present
        collection_names = [getattr(c, "name", c) for c in chroma_client.list_collections()]
        if collection_names:
            st.write("Available data collections:")
            selected_collection = st.selectbox("Select a collection to query:", collection_names)
            if selected_collection and st.button("Load Selected Collection"):
                st.session_state.collection_name = selected_collection
                st.session_state.scraped = True
                st.success(f"Loaded collection: {selected_collection}")
                st.rerun()
    except Exception as e:
        st.error(f"Error: {str(e)}")
    # Add a button to clear the chat history and start over
    if st.button("Clear Chat History"):
        st.session_state.chat_history = []
        st.rerun()
    # Scraping section
    st.header("Step 1: Scrape a Website")
    url = st.text_input("Enter the URL to scrape:")
    if url:
        if st.button("Scrape & Process"):
            with st.spinner("Scraping and processing content..."):
                result = scrape_text(url)
            # scrape_text returns an error string on failure, so don't show it as a success
            if result.startswith("Error"):
                st.error(result)
            else:
                st.success(result)
# Main content area
with main_col:
    st.title("Web Scraper & Q&A Chatbot")
    # Use a container with custom CSS for the scrollable chat area
    chat_container = st.container()
    # Apply custom CSS for the chat container
    st.markdown("""
    <style>
    .chat-container {
        height: 500px;
        overflow-y: auto;
        border: 1px solid #ddd;
        border-radius: 5px;
        padding: 15px;
        margin-bottom: 10px;
        background-color: #f9f9f9;
    }
    .stChatInputContainer {
        position: sticky;
        bottom: 0;
        background-color: white;
        padding-top: 10px;
        z-index: 100;
    }
    </style>
    """, unsafe_allow_html=True)
    # Q&A section - only appears after scraping is complete
    if st.session_state.scraped:
        st.subheader("Step 2: Ask Questions About the Scraped Content")
        # Use a div with our custom class for the scrollable area
        st.markdown('<div class="chat-container">', unsafe_allow_html=True)
        # Display chat history
        for message in st.session_state.chat_history:
            with chat_container.chat_message(message["role"]):
                st.write(message["content"])
        st.markdown('</div>', unsafe_allow_html=True)
        # Input for new question - always at the bottom
        user_query = st.chat_input("Ask your question here")
        if user_query:
            # Add user question to chat history
            st.session_state.chat_history.append({"role": "user", "content": user_query})
            # Get answer
            with st.spinner("Searching database..."):
                answer = ask_question(user_query, st.session_state.collection_name)
            # Add answer to chat history
            st.session_state.chat_history.append({"role": "assistant", "content": answer})
            # Rerun to update the UI with new messages
            st.rerun()
    else:
        st.info("Please scrape a website or load a collection to start chatting.")