"""Streamlit front end for scraping several sites and extracting tables via an LLM.

Flow, top to bottom on every Streamlit rerun:
1. Collect URLs, scrape each one and keep the cleaned body text in
   ``st.session_state.all_dom_content``.
2. Ask for a parse description, run ``parse`` over each site's chunks and
   convert the LLM output with ``markdown_to_csv``.
3. Merge every extracted table into one via ``merge_tables_with_llm`` and
   display the result.
"""

import os

import streamlit as st
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from Data import markdown_to_csv
from parse import parse, merge_tables_with_llm
from scraper import (
    scrape_website,
    split_dom_content,
    clean_body_content,
    extract_body_content,
)

# Load the OpenRouter API key from the environment rather than from source.
# SECURITY: a key was previously committed in this file; it must be revoked
# and a fresh one supplied through OPENROUTER_API_KEY.
openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")
if not openrouter_api_key:
    st.error("OPENROUTER_API_KEY environment variable is not set.")
    st.stop()

model = ChatOpenAI(
    openai_api_key=openrouter_api_key,
    model="meta-llama/llama-4-maverick:free",
    base_url="https://openrouter.ai/api/v1",
)

st.title("AI Web Scraper")

# Multi-URL input: one URL per line, blank lines ignored.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    all_results = []
    for url in urls_list:
        st.write(f"Scraping: {url}")
        result = scrape_website(url)
        body_content = extract_body_content(result)
        cleaned_content = clean_body_content(body_content)
        all_results.append(cleaned_content)

    # Persist across reruns so the parse step below survives the next click.
    st.session_state.all_dom_content = all_results

if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")

    if st.button("Parse Content"):
        if parse_description:
            all_tables = []

            for i, dom_content in enumerate(st.session_state.all_dom_content):
                st.write(f"Parsing content from site {i+1}")
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)

                st.write("Raw LLM Output:")
                st.write(result)

                tables = markdown_to_csv(result)

                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                        all_tables.append(table)
                else:
                    st.write("No tables found in the output. Displaying raw output instead.")
                    # A per-site key keeps this widget unique when two sites
                    # produce the same raw output inside this loop.
                    st.text_area(
                        "Raw Output",
                        result,
                        height=200,
                        key=f"raw_output_{i}",
                    )

            # Merge tables using LLM
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert merged table string to DataFrame
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    # Display the first (and hopefully only) merged table
                    st.write(merged_tables[0])
                else:
                    st.write("Could not convert merged table string to DataFrame.")
            else:
                st.write("No tables to merge.")


# NOTE(review): this definition rebinds the name imported from `parse`, but it
# executes after the UI code above, so the UI calls the imported version.
# Confirm which implementation is intended and remove the other.
def merge_tables_with_llm(tables, parse_description):
    """Merge a list of tables into a single Markdown table using the LLM.

    Args:
        tables: Objects exposing ``to_markdown(index=False)``
            (presumably pandas DataFrames from ``markdown_to_csv`` - confirm).
        parse_description: User-supplied description of what the tables
            contain; inserted into the prompt as context.

    Returns:
        The ``content`` attribute of the model response, expected to be a
        Markdown table.
    """
    # Convert DataFrames to Markdown strings
    table_strings = [table.to_markdown(index=False) for table in tables]

    # Only the description line is an f-string; table text is concatenated
    # so braces inside table cells are never treated as placeholders.
    merge_prompt = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        f"The tables contain information related to: {parse_description}.\n"
        "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
        "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
        "Ensure the final output is a single valid Markdown table.\n\n"
        "Here are the tables:\n\n"
        + "\n\n".join(table_strings)
        + "\n\nReturn the merged table in Markdown format:"
    )

    # The chat model takes the prompt text directly; a dict of template
    # variables is only valid input for a prompt template or chain.
    response = model.invoke(merge_prompt)
    return response.content