# Page-scrape residue, not part of the program: "Spaces: Running Running"
import pandas as pd
import streamlit as st
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
# Hugging Face model identifier used for the local text-generation pipeline.
# NOTE(review): the original comment described this as a lightweight example
# model, but the checkpoint name indicates a 17B-parameter, 128-expert model.
# Confirm this is the intended checkpoint for the deployment hardware.
model_name = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"


@st.cache_resource
def _load_llm(name):
    """Load the tokenizer, model and text-generation pipeline for *name*.

    Streamlit re-executes the whole script on every user interaction; the
    ``st.cache_resource`` decorator keeps the loaded objects alive between
    reruns so the checkpoint is only loaded once per server process.

    Args:
        name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model, pipeline)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModelForCausalLM.from_pretrained(name)
    generator = pipeline(
        "text-generation",
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        # Use the first GPU when CUDA is available, otherwise run on CPU.
        device=0 if torch.cuda.is_available() else -1,
        # Cap the number of newly generated tokens per call.
        max_new_tokens=500,
    )
    return loaded_tokenizer, loaded_model, generator


# Module-level names kept for the rest of the script.
tokenizer, model, llm_pipeline = _load_llm(model_name)
# Fallback text returned when no Markdown table can be located.
_NO_TABLE_MESSAGE = "No valid Markdown table found in LLM output."


def _build_merge_prompt(table_strings, parse_description):
    """Return the instruction prompt asking the LLM to merge Markdown tables.

    Only the description line is interpolated. The table text is appended
    verbatim so that braces inside scraped content cannot be misread as
    format placeholders.
    """
    return (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        f"The tables contain information related to: {parse_description}.\n"
        "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
        "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
        "Ensure the final output is a single valid Markdown table.\n\n"
        "Here are the tables:\n\n"
        + "\n\n".join(table_strings)
        + "\n\nReturn the merged table in Markdown format:"
    )


def _extract_markdown_table(text):
    """Return the first contiguous run of '|'-prefixed lines found in *text*.

    If no line starts with a pipe, fall back to everything from the first
    '|' character; if there is no pipe at all, return the fallback message.
    """
    table_lines = []
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith("|"):
            table_lines.append(stripped)
        elif table_lines:
            # The table ended; ignore any trailing commentary from the model.
            break
    if table_lines:
        return "\n".join(table_lines)
    start_idx = text.find("|")
    return text[start_idx:] if start_idx != -1 else _NO_TABLE_MESSAGE


def merge_tables_with_llm(tables, parse_description):
    """Merge a list of pandas DataFrames into one Markdown table via the local LLM.

    Args:
        tables: DataFrames to merge; each is rendered with ``to_markdown``.
        parse_description: User description of the extracted data, inserted
            into the prompt.

    Returns:
        The Markdown table text extracted from the model output, or the
        message "No valid Markdown table found in LLM output." when the
        output contains no table or *tables* is empty.
    """
    if not tables:
        # Nothing to merge; skip the model call entirely.
        return _NO_TABLE_MESSAGE

    table_strings = [table.to_markdown(index=False) for table in tables]
    merge_prompt = _build_merge_prompt(table_strings, parse_description)

    # return_full_text=False makes the pipeline return only the completion.
    # Otherwise the echoed prompt (which itself contains the input tables)
    # would be mistaken for the merged table.
    response = llm_pipeline(
        merge_prompt,
        max_length=2000,
        truncation=True,
        return_full_text=False,
    )
    return _extract_markdown_table(response[0]["generated_text"])


def main():
    """Render the scraper UI: collect URLs, scrape them, parse and merge tables."""
    st.title("AI Web Scraper")

    # NOTE(review): scrape_website, extract_body_content, clean_body_content,
    # split_dom_content, parse and markdown_to_csv are not defined or imported
    # in the visible part of this file; confirm they are in scope.

    # Multi-URL input: one URL per line, blank lines ignored.
    urls = st.text_area("Enter Website URLs (one per line)", height=150)
    urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

    if st.button("Scrape Sites"):
        all_results = []
        for url in urls_list:
            st.write(f"Scraping: {url}")
            result = scrape_website(url)
            body_content = extract_body_content(result)
            cleaned_content = clean_body_content(body_content)
            all_results.append(cleaned_content)
        # Persist across reruns so the parse step can use the scraped content.
        st.session_state.all_dom_content = all_results

    if "all_dom_content" not in st.session_state:
        return

    parse_description = st.text_area("Describe what you want to parse from ALL sites:")
    if not st.button("Parse Content"):
        return
    if not parse_description:
        return

    all_tables = []
    for i, dom_content in enumerate(st.session_state.all_dom_content):
        st.write(f"Parsing content from site {i+1}")
        dom_chunks = split_dom_content(dom_content)
        # Make sure this function uses the local model.
        result = parse(dom_chunks, parse_description)
        st.write("Raw LLM Output:")
        st.write(result)

        tables = markdown_to_csv(result)
        if tables:
            st.write("Extracted Tables:")
            for table in tables:
                st.write(table)
                all_tables.append(table)
        else:
            st.write("No tables found in the output. Displaying raw output instead.")
            # A per-site key avoids a duplicate-widget error when two sites
            # produce identical raw output.
            st.text_area("Raw Output", result, height=200, key=f"raw_output_{i}")

    # Merge tables using the LLM.
    if all_tables:
        st.write("Merging all tables using LLM...")
        merged_table_string = merge_tables_with_llm(all_tables, parse_description)
        st.write("Merged Table (LLM Output):")
        st.write(merged_table_string)

        # Convert the merged table string to a DataFrame.
        merged_tables = markdown_to_csv(merged_table_string)
        if merged_tables:
            st.write("Merged Table (DataFrame):")
            st.write(merged_tables[0])
        else:
            st.write("Could not convert merged table string to DataFrame.")
    else:
        st.write("No tables to merge.")


# Streamlit executes the script as "__main__", so the UI still renders on
# every rerun; the helper functions above are defined before they are used.
if __name__ == "__main__":
    main()