# NOTE(review): a scraped web-page header was removed from the top of this
# file ("Spaces: Running", "File size: 4,530 Bytes", commit hashes, and an
# editor line-number gutter) — it was page chrome, not part of the program.
import pandas as pd
import streamlit as st
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
# Initialise the local LLM with transformers.
# NOTE(review): this checkpoint is a very large (17B-parameter, gated) model,
# not the "lightweight" one the original comment claimed — confirm it fits
# the deployment target and that Hub access has been granted.
model_name = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Text-generation pipeline shared by the whole app.
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # use the GPU when present
    max_new_tokens=500,  # cap the length of each generated answer
)
st.title("AI Web Scraper")

# Multi-URL input: one URL per line; blank lines are ignored.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    # NOTE(review): scrape_website / extract_body_content / clean_body_content
    # are not imported anywhere in this file — presumably they live in a local
    # scraping module; confirm and add the import, otherwise this raises
    # NameError on the first click.
    all_results = []
    for url in urls_list:
        st.write(f"Scraping: {url}")
        result = scrape_website(url)
        body_content = extract_body_content(result)
        cleaned_content = clean_body_content(body_content)
        all_results.append(cleaned_content)

    # Persist across Streamlit reruns so the parse step below can use it.
    st.session_state.all_dom_content = all_results
if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")

    if st.button("Parse Content"):
        if parse_description:
            all_tables = []
            for i, dom_content in enumerate(st.session_state.all_dom_content):
                st.write(f"Parsing content from site {i+1}")
                # NOTE(review): split_dom_content / parse / markdown_to_csv are
                # not imported in this file — confirm which module provides
                # them and import it.
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)

                st.write("Raw LLM Output:")
                st.write(result)

                tables = markdown_to_csv(result)
                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                        all_tables.append(table)
                else:
                    st.write("No tables found in the output. Displaying raw output instead.")
                    st.text_area("Raw Output", result, height=200)

            # Merge all per-site tables using the LLM.
            # NOTE(review): merge_tables_with_llm is defined further DOWN this
            # file. Streamlit executes the script top-to-bottom on every rerun,
            # so this call raises NameError before the `def` statement runs —
            # the function definition must be moved above this block.
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert the merged Markdown back into a DataFrame for display.
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    st.write(merged_tables[0])
                else:
                    st.write("Could not convert merged table string to DataFrame.")
            else:
                st.write("No tables to merge.")
def merge_tables_with_llm(tables, parse_description):
    """Merge a list of pandas DataFrames into a single Markdown table via the local LLM.

    Args:
        tables: list of pandas DataFrames extracted from the scraped sites.
        parse_description: the user's description of the data, used to give
            the model context about what the tables contain.

    Returns:
        str: the merged table in Markdown, or an explanatory message when the
        model output contains no table.
    """
    # Render each DataFrame as Markdown for inclusion in the prompt.
    # (pandas.DataFrame.to_markdown requires the `tabulate` package.)
    table_strings = [table.to_markdown(index=False) for table in tables]

    # Build the prompt with plain concatenation / an f-string rather than
    # str.format(): the original formatted the string AFTER appending the
    # scraped tables, so any literal '{' or '}' in table content raised
    # KeyError/ValueError at runtime.
    merge_prompt = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        f"The tables contain information related to: {parse_description}.\n"
        "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
        "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
        "Ensure the final output is a single valid Markdown table.\n\n"
        "Here are the tables:\n\n" + "\n\n".join(table_strings) +
        "\n\nReturn the merged table in Markdown format:"
    )

    # Call the local model. return_full_text=False makes the pipeline return
    # only the completion: the original searched the prompt+completion for the
    # first '|', which matched a pipe inside the prompt's own embedded tables
    # and returned the input instead of the merge. The generation length is
    # governed by the pipeline's max_new_tokens (the original's max_length=2000
    # conflicted with it and counted prompt tokens).
    response = llm_pipeline(merge_prompt, truncation=True, return_full_text=False)
    merged_table = response[0]["generated_text"]

    # Keep only the Markdown-table portion of the completion.
    start_idx = merged_table.find("|")
    if start_idx != -1:
        merged_table = merged_table[start_idx:]
    else:
        merged_table = "No valid Markdown table found in LLM output."
    return merged_table