PyQuarX committed on
Commit
b051531
·
verified ·
1 Parent(s): e540784

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -25
app.py CHANGED
@@ -1,37 +1,96 @@
1
  import streamlit as st
2
  from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
3
- from parse import parse
 
4
  from Data import markdown_to_csv
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  st.title("AI Web Scraper")
7
- url = st.text_input("Enter a Website URL")
8
 
9
- if st.button("Scrape Site"):
10
- st.write("Scraping the website")
 
11
 
12
- result = scrape_website(url)
13
- body_content = extract_body_content(result)
14
- cleaned_content = clean_body_content(body_content)
 
 
 
 
 
15
 
16
- st.session_state.dom_content = cleaned_content
17
 
18
- with st.expander("View DOM Content"):
19
- st.text_area("DOM Content", cleaned_content, height=300)
20
 
21
- if "dom_content" in st.session_state:
22
- parse_description = st.text_area("Describe what you want to parse?")
23
-
24
  if st.button("Parse Content"):
25
  if parse_description:
26
- st.write("Parsing Content")
27
-
28
- dom_chunks = split_dom_content(st.session_state.dom_content)
29
- result = parse(dom_chunks,parse_description)
30
- print(repr(result))
31
-
32
-
33
-
34
- # Appliquer la fonction
35
- tables = markdown_to_csv(result)
36
- for i in tables:
37
- st.write(i)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
3
+ from parse import parse, merge_tables_with_llm
4
+ import streamlit as st
5
  from Data import markdown_to_csv
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_openai import ChatOpenAI
8
+
9
# --- LLM configuration -------------------------------------------------------
# SECURITY: the OpenRouter API key must never be committed to source control.
# The key that was previously hard-coded here is public and should be revoked.
# Read it from the environment instead (e.g. a Space/host secret).
import os

openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")
if not openrouter_api_key:
    st.error(
        "OPENROUTER_API_KEY is not set. Configure it as an environment "
        "variable / secret before using the app."
    )
    st.stop()

# Chat model pointed at the OpenRouter endpoint.
model = ChatOpenAI(
    openai_api_key=openrouter_api_key,
    model="meta-llama/llama-4-maverick:free",
    base_url="https://openrouter.ai/api/v1",
)

st.title("AI Web Scraper")

# --- Step 1: collect URLs and scrape them ------------------------------------
# Multi-URL input: one URL per line, blank lines ignored.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    if not urls_list:
        # Avoid storing an empty result list, which would expose the parse
        # step with nothing to parse.
        st.warning("Please enter at least one URL.")
    else:
        all_results = []
        for url in urls_list:
            st.write(f"Scraping: {url}")
            result = scrape_website(url)
            body_content = extract_body_content(result)
            cleaned_content = clean_body_content(body_content)
            all_results.append(cleaned_content)

        # Persist across Streamlit reruns so the parse step can use it.
        st.session_state.all_dom_content = all_results

# --- Step 2: parse every scraped page and merge the resulting tables ---------
if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")

    if st.button("Parse Content"):
        if parse_description:
            all_tables = []
            for i, dom_content in enumerate(st.session_state.all_dom_content):
                st.write(f"Parsing content from site {i+1}")
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)
                st.write("Raw LLM Output:")
                st.write(result)

                tables = markdown_to_csv(result)
                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                        all_tables.append(table)
                else:
                    st.write("No tables found in the output. Displaying raw output instead.")
                    # A per-site key keeps widgets distinct when several sites
                    # yield no tables in the same run.
                    st.text_area(
                        "Raw Output",
                        result,
                        height=200,
                        key=f"raw_output_{i}",
                    )

            # Merge tables using LLM
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert merged table string to DataFrame
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    # Display the first (and hopefully only) merged table
                    st.write(merged_tables[0])
                else:
                    st.write("Could not convert merged table string to DataFrame.")
            else:
                st.write("No tables to merge.")
        else:
            # Previously an empty description made the button a silent no-op.
            st.warning("Please describe what you want to parse.")
74
+
75
def merge_tables_with_llm(tables, parse_description):
    """Merge several tables into a single Markdown table using the LLM.

    Each table is rendered with ``to_markdown(index=False)``, the renderings
    are embedded in a merge instruction prompt, and the prompt is sent to the
    module-level chat ``model``.

    Args:
        tables: Iterable of objects exposing ``to_markdown(index=False)``
            (pandas DataFrames in this app).
        parse_description: The user's description of what the tables contain;
            interpolated into the prompt for context.

    Returns:
        The model's reply text (expected to be one Markdown table), or an
        empty string when ``tables`` is empty (the model is not called).
    """
    # Nothing to merge: skip the (slow, billable) model call entirely.
    if not tables:
        return ""

    # Convert DataFrames to Markdown strings
    table_strings = [table.to_markdown(index=False) for table in tables]

    # Build the prompt. The description is interpolated here; previously the
    # literal text "{parse_description}" was sent to the model.
    merge_prompt = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        f"The tables contain information related to: {parse_description}.\n"
        "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
        "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
        "Ensure the final output is a single valid Markdown table.\n\n"
        "Here are the tables:\n\n" + "\n\n".join(table_strings) +
        "\n\nReturn the merged table in Markdown format:"
    )

    # A chat model's invoke() takes a string / message list / PromptValue,
    # not a dict of template variables, so pass the finished prompt directly.
    response = model.invoke(merge_prompt)
    return response.content
96
+