# NOTE(review): a scraped web-page header was removed from the top of this
# file ("Spaces: Running", "File size: 4,530 Bytes", commit hashes, and an
# editor line-number gutter) — it was page chrome, not part of the program.
import pandas as pd
import streamlit as st
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
# Initialise the local LLM with transformers.
# NOTE(review): this checkpoint is a very large (17B-parameter, gated) model,
# not the "lightweight" one the original comment claimed — confirm it fits
# the deployment target and that Hub access has been granted.
model_name = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Text-generation pipeline shared by the whole app.
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # use the GPU when present
    max_new_tokens=500,  # cap the length of each generated answer
)
st.title("AI Web Scraper")

# Multi-URL input: one URL per line; blank lines are ignored.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    # NOTE(review): scrape_website / extract_body_content / clean_body_content
    # are not imported anywhere in this file — presumably they live in a local
    # scraping module; confirm and add the import, otherwise this raises
    # NameError on the first click.
    all_results = []
    for url in urls_list:
        st.write(f"Scraping: {url}")
        result = scrape_website(url)
        body_content = extract_body_content(result)
        cleaned_content = clean_body_content(body_content)
        all_results.append(cleaned_content)

    # Persist across Streamlit reruns so the parse step below can use it.
    st.session_state.all_dom_content = all_results
if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")

    if st.button("Parse Content"):
        if parse_description:
            all_tables = []
            for i, dom_content in enumerate(st.session_state.all_dom_content):
                st.write(f"Parsing content from site {i+1}")
                # NOTE(review): split_dom_content / parse / markdown_to_csv are
                # not imported in this file — confirm which module provides
                # them and import it.
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)

                st.write("Raw LLM Output:")
                st.write(result)

                tables = markdown_to_csv(result)
                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                        all_tables.append(table)
                else:
                    st.write("No tables found in the output. Displaying raw output instead.")
                    st.text_area("Raw Output", result, height=200)

            # Merge all per-site tables using the LLM.
            # NOTE(review): merge_tables_with_llm is defined further DOWN this
            # file. Streamlit executes the script top-to-bottom on every rerun,
            # so this call raises NameError before the `def` statement runs —
            # the function definition must be moved above this block.
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert the merged Markdown back into a DataFrame for display.
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    st.write(merged_tables[0])
                else:
                    st.write("Could not convert merged table string to DataFrame.")
            else:
                st.write("No tables to merge.")
def merge_tables_with_llm(tables, parse_description):
    """Merge a list of pandas DataFrames into a single Markdown table via the local LLM.

    Args:
        tables: list of pandas DataFrames extracted from the scraped sites.
        parse_description: the user's description of the data, used to give
            the model context about what the tables contain.

    Returns:
        str: the merged table in Markdown, or an explanatory message when the
        model output contains no table.
    """
    # Render each DataFrame as Markdown for inclusion in the prompt.
    # (pandas.DataFrame.to_markdown requires the `tabulate` package.)
    table_strings = [table.to_markdown(index=False) for table in tables]

    # Build the prompt with plain concatenation / an f-string rather than
    # str.format(): the original formatted the string AFTER appending the
    # scraped tables, so any literal '{' or '}' in table content raised
    # KeyError/ValueError at runtime.
    merge_prompt = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        f"The tables contain information related to: {parse_description}.\n"
        "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
        "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
        "Ensure the final output is a single valid Markdown table.\n\n"
        "Here are the tables:\n\n" + "\n\n".join(table_strings) +
        "\n\nReturn the merged table in Markdown format:"
    )

    # Call the local model. return_full_text=False makes the pipeline return
    # only the completion: the original searched the prompt+completion for the
    # first '|', which matched a pipe inside the prompt's own embedded tables
    # and returned the input instead of the merge. The generation length is
    # governed by the pipeline's max_new_tokens (the original's max_length=2000
    # conflicted with it and counted prompt tokens).
    response = llm_pipeline(merge_prompt, truncation=True, return_full_text=False)
    merged_table = response[0]["generated_text"]

    # Keep only the Markdown-table portion of the completion.
    start_idx = merged_table.find("|")
    if start_idx != -1:
        merged_table = merged_table[start_idx:]
    else:
        merged_table = "No valid Markdown table found in LLM output."
    return merged_table