"""Streamlit front end for the "AI Web Scraper".

Workflow, as driven by the widgets below:

1. The user enters one URL per line and clicks "Scrape Sites"; the cleaned
   body content of every page is stored in ``st.session_state``.
2. The user describes what to extract and clicks "Parse Content"; each
   site's content is chunked, passed to ``parse`` and the result is turned
   into tables via ``markdown_to_csv``.
3. All extracted tables are merged with ``merge_tables_with_llm`` and the
   merged result is displayed.
"""

import streamlit as st
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from Data import markdown_to_csv
from parse import merge_tables_with_llm, parse
from scraper import (
    clean_body_content,
    extract_body_content,
    scrape_website,
    split_dom_content,
)

st.title("AI Web Scraper")

# Multi-URL input: one URL per line, blank lines and surrounding
# whitespace are ignored.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    if not urls_list:
        # Do not overwrite previously scraped content with an empty list.
        st.warning("Please enter at least one URL to scrape.")
    else:
        all_results = []
        for url in urls_list:
            st.write(f"Scraping: {url}")
            result = scrape_website(url)
            body_content = extract_body_content(result)
            cleaned_content = clean_body_content(body_content)
            all_results.append(cleaned_content)

        # Keep the scraped content in session state so it is still
        # available on the rerun triggered by the "Parse Content" button.
        st.session_state.all_dom_content = all_results

if "all_dom_content" in st.session_state:
    parse_description = st.text_area(
        "Describe what you want to parse from ALL sites:"
    )

    if st.button("Parse Content"):
        if not parse_description:
            st.warning("Please describe what you want to parse.")
        else:
            all_tables = []
            site_contents = st.session_state.all_dom_content
            for site_number, dom_content in enumerate(site_contents, start=1):
                st.write(f"Parsing content from site {site_number}")
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)

                st.write("Raw LLM Output:")
                st.write(result)

                tables = markdown_to_csv(result)
                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                    all_tables.extend(tables)
                else:
                    st.write(
                        "No tables found in the output. "
                        "Displaying raw output instead."
                    )
                    # An explicit per-site key is required: without it, two
                    # sites producing the same raw output would create
                    # identical widgets and Streamlit would reject the
                    # duplicate.
                    st.text_area(
                        "Raw Output",
                        result,
                        height=200,
                        key=f"raw_output_{site_number}",
                    )

            # Merge the tables collected from every site using the LLM.
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(
                    all_tables, parse_description
                )

                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert the merged table string back into table objects.
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    # Only the first table is shown; the merge step is
                    # expected to yield a single table.
                    st.write(merged_tables[0])
                else:
                    st.write(
                        "Could not convert merged table string to DataFrame."
                    )
            else:
                st.write("No tables to merge.")