# scrape-with-ai / parse.py
# Streamlit app: scrape multiple URLs, parse their content with a local LLM,
# and merge the extracted Markdown tables into a single table.
import pandas as pd
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# --- Local LLM initialisation -------------------------------------------------
# NOTE(review): the original comment called this a "lightweight model", but
# Llama-4-Maverick-17B-128E is a large gated checkpoint — confirm hardware and
# Hugging Face access before deploying.
model_name = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Text-generation pipeline shared by the parsing/merging code below.
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # GPU if available, else CPU
    max_new_tokens=500,  # cap the length of each generated response
)
def merge_tables_with_llm(tables, parse_description):
    """Merge a list of pandas DataFrames into one Markdown table via the local LLM.

    Args:
        tables: list of ``pandas.DataFrame`` objects to merge.
        parse_description: the user's description of the data; used to steer
            the merge prompt.

    Returns:
        str: the merged table in Markdown, or an error message when no
        Markdown table could be found in the model output.
    """
    # Render each DataFrame as Markdown so it can be embedded in the prompt.
    table_strings = [table.to_markdown(index=False) for table in tables]

    merge_prompt = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        "The tables contain information related to: {parse_description}.\n"
        "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
        "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
        "Ensure the final output is a single valid Markdown table.\n\n"
        "Here are the tables:\n\n" + "\n\n".join(table_strings) +
        "\n\nReturn the merged table in Markdown format:"
    ).format(parse_description=parse_description)

    # BUG FIX: the original also passed max_length=2000, which conflicts with
    # the pipeline's max_new_tokens=500; rely on the pipeline's setting.
    response = llm_pipeline(merge_prompt, truncation=True)
    generated = response[0]["generated_text"]

    # BUG FIX: text-generation pipelines echo the prompt, and the prompt
    # itself contains "|" characters (the input tables), so scanning the full
    # output for the first "|" returned the *prompt's* tables. Strip the
    # echoed prompt before looking for the generated table.
    if generated.startswith(merge_prompt):
        generated = generated[len(merge_prompt):]
    start_idx = generated.find("|")
    if start_idx != -1:
        return generated[start_idx:]
    return "No valid Markdown table found in LLM output."


st.title("AI Web Scraper")

# --- Multi-URL input ----------------------------------------------------------
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

# NOTE(review): scrape_website, extract_body_content, clean_body_content,
# split_dom_content, parse and markdown_to_csv are not imported in this file —
# presumably they come from sibling scraping/parsing modules; confirm the
# imports exist in the deployed app.
if st.button("Scrape Sites"):
    all_results = []
    for url in urls_list:
        st.write(f"Scraping: {url}")
        result = scrape_website(url)
        body_content = extract_body_content(result)
        cleaned_content = clean_body_content(body_content)
        all_results.append(cleaned_content)
    # Persist across Streamlit reruns so the parse step can reuse the content.
    st.session_state.all_dom_content = all_results

if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")
    if st.button("Parse Content"):
        if parse_description:
            all_tables = []
            for i, dom_content in enumerate(st.session_state.all_dom_content):
                st.write(f"Parsing content from site {i+1}")
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)
                st.write("Raw LLM Output:")
                st.write(result)
                tables = markdown_to_csv(result)
                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                        all_tables.append(table)
                else:
                    st.write("No tables found in the output. Displaying raw output instead.")
                    st.text_area("Raw Output", result, height=200)

            # Merge all per-site tables into one via the LLM.
            # BUG FIX: merge_tables_with_llm was originally defined *below*
            # this call; Streamlit executes the script top-to-bottom on every
            # rerun, so the call raised NameError. The definition is now
            # hoisted above the UI code.
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert the merged Markdown back into a DataFrame.
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    st.write(merged_tables[0])
                else:
                    st.write("Could not convert merged table string to DataFrame.")
            else:
                st.write("No tables to merge.")