scrape-with-ai / app.py
PyQuarX's picture
Update app.py
1a0e69a verified
import streamlit as st
from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
from parse import parse, merge_tables_with_llm
import streamlit as st
from Data import markdown_to_csv
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
st.title("AI Web Scraper")

# Multi-URL input: one URL per line. Each line is trimmed and lines that
# are empty after trimming are dropped.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
stripped_lines = (line.strip() for line in urls.splitlines())
urls_list = [line for line in stripped_lines if line]
# Scrape every entered URL when the button is pressed and keep the cleaned
# body text of each page, in input order, for the parse step.
if st.button("Scrape Sites"):
    if not urls_list:
        # Nothing to scrape: tell the user instead of silently storing an
        # empty result list, which would reveal a parse form with no
        # content behind it.
        st.warning("Please enter at least one URL to scrape.")
    else:
        all_results = []
        for url in urls_list:
            st.write(f"Scraping: {url}")
            result = scrape_website(url)
            body_content = extract_body_content(result)
            cleaned_content = clean_body_content(body_content)
            all_results.append(cleaned_content)
        # Stored in session state because the parse section reads
        # st.session_state.all_dom_content on a later script run.
        st.session_state.all_dom_content = all_results
# The parse section is rendered only once scraped content exists in
# session state.
if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")
    parse_clicked = st.button("Parse Content")

    if parse_clicked and not parse_description.strip():
        # An empty (or whitespace-only) description previously did nothing
        # with no feedback; make the requirement explicit.
        st.warning("Please describe what you want to parse.")
    elif parse_clicked:
        # Tables extracted from every site, collected for the merge step.
        all_tables = []
        for i, dom_content in enumerate(st.session_state.all_dom_content):
            st.write(f"Parsing content from site {i+1}")
            dom_chunks = split_dom_content(dom_content)
            result = parse(dom_chunks, parse_description)
            st.write("Raw LLM Output:")
            st.write(result)

            tables = markdown_to_csv(result)
            if tables:
                st.write("Extracted Tables:")
                for table in tables:
                    st.write(table)
                    all_tables.append(table)
            else:
                st.write("No tables found in the output. Displaying raw output instead.")
                # A per-site key is required: without it, two sites that
                # yield identical raw output would create two identical
                # text_area widgets, which Streamlit rejects as duplicates.
                st.text_area("Raw Output", result, height=200, key=f"raw_output_{i}")

        # Merge tables using LLM
        if all_tables:
            st.write("Merging all tables using LLM...")
            merged_table_string = merge_tables_with_llm(all_tables, parse_description)
            st.write("Merged Table (LLM Output):")
            st.write(merged_table_string)

            # Convert the merged table string back through markdown_to_csv
            # and show only its first result.
            merged_tables = markdown_to_csv(merged_table_string)
            if merged_tables:
                st.write("Merged Table (DataFrame):")
                st.write(merged_tables[0])  # Display the first (and hopefully only) merged table
            else:
                st.write("Could not convert merged table string to DataFrame.")
        else:
            st.write("No tables to merge.")