File size: 2,702 Bytes
188a2fe
 
b051531
 
d692aee
b051531
 
 
188a2fe
 
 
b051531
 
 
188a2fe
b051531
 
 
 
 
 
 
 
188a2fe
b051531
188a2fe
b051531
 
188a2fe
 
 
b051531
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import streamlit as st
from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
from parse import parse, merge_tables_with_llm
import streamlit as st
from Data import markdown_to_csv
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI


st.title("AI Web Scraper")

# Collect the target sites: the user types one URL per line; each line is
# trimmed and lines that end up empty are discarded.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = list(filter(None, map(str.strip, urls.splitlines())))

# Scrape every submitted URL when the button is clicked and keep the cleaned
# page contents in session state so they remain available on later reruns.
if st.button("Scrape Sites"):
    if not urls_list:
        # Without this guard an empty list would be stored, which both reveals
        # the parse UI with nothing to parse and overwrites earlier results.
        st.warning("Please enter at least one URL to scrape.")
    else:
        all_results = []
        for url in urls_list:
            st.write(f"Scraping: {url}")
            # Pipeline: fetch the page, extract its body, then clean it.
            # NOTE(review): the exact return types of these helpers are not
            # visible here — confirm against the scraper module.
            result = scrape_website(url)
            body_content = extract_body_content(result)
            cleaned_content = clean_body_content(body_content)
            all_results.append(cleaned_content)

        # One entry per URL, in input order; consumed by the parse step.
        st.session_state.all_dom_content = all_results

# Parse step: only offered once scraped content exists in session state.
# Each site's content is chunked, sent to the LLM parser, and any tables found
# in the output are collected and finally merged into a single table.
if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")

    if st.button("Parse Content"):
        if not parse_description:
            # Give feedback instead of silently ignoring the click.
            st.warning("Please describe what you want to parse.")
        else:
            all_tables = []
            for i, dom_content in enumerate(st.session_state.all_dom_content):
                st.write(f"Parsing content from site {i+1}")
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)
                st.write("Raw LLM Output:")
                st.write(result)

                # NOTE(review): markdown_to_csv presumably returns a list of
                # tables (DataFrames) found in the text — confirm in Data.
                tables = markdown_to_csv(result)
                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                    all_tables.extend(tables)
                else:
                    st.write("No tables found in the output.  Displaying raw output instead.")
                    # A per-site key is required: this widget is created inside
                    # a loop, and two sites with identical raw output would
                    # otherwise produce identical widgets, which Streamlit
                    # rejects as a duplicate widget ID.
                    st.text_area(
                        "Raw Output",
                        result,
                        height=200,
                        key=f"raw_output_{i}",
                    )

            # Merge tables using LLM
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert the merged table string back into table objects.
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    # Only the first table is shown; the merge is expected to
                    # yield a single table.
                    st.write(merged_tables[0])
                else:
                    st.write("Could not convert merged table string to DataFrame.")
            else:
                st.write("No tables to merge.")