PyQuarX committed
Commit 7d633ab · verified · 1 Parent(s): fb0ee3f

Update parse.py

Files changed (1)
  1. parse.py +102 -82
parse.py CHANGED
@@ -1,105 +1,125 @@
- import streamlit as st
-
- from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
  import pandas as pd
-
- # Initialize the LLM with transformers
- model_name = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"  # Lightweight model for this example
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name)
-
- # Create a text-generation pipeline
- llm_pipeline = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
-     max_new_tokens=500,  # Limit the response length
- )
-
- st.title("AI Web Scraper")
-
- # Multi-URL Input
- urls = st.text_area("Enter Website URLs (one per line)", height=150)
- urls_list = [url.strip() for url in urls.splitlines() if url.strip()]
-
- if st.button("Scrape Sites"):
-     all_results = []
-     for url in urls_list:
-         st.write(f"Scraping: {url}")
-         result = scrape_website(url)
-         body_content = extract_body_content(result)
-         cleaned_content = clean_body_content(body_content)
-         all_results.append(cleaned_content)
-
-     st.session_state.all_dom_content = all_results
-
- if "all_dom_content" in st.session_state:
-     parse_description = st.text_area("Describe what you want to parse from ALL sites:")
-
-     if st.button("Parse Content"):
-         if parse_description:
-             all_tables = []
-             for i, dom_content in enumerate(st.session_state.all_dom_content):
-                 st.write(f"Parsing content from site {i+1}")
-                 dom_chunks = split_dom_content(dom_content)
-                 result = parse(dom_chunks, parse_description)  # Make sure this function uses the local model
-                 st.write("Raw LLM Output:")
-                 st.write(result)
-
-                 tables = markdown_to_csv(result)
-                 if tables:
-                     st.write("Extracted Tables:")
-                     for table in tables:
-                         st.write(table)
-                         all_tables.append(table)
-                 else:
-                     st.write("No tables found in the output. Displaying raw output instead.")
-                     st.text_area("Raw Output", result, height=200)
-
-             # Merge tables using LLM
-             if all_tables:
-                 st.write("Merging all tables using LLM...")
-                 merged_table_string = merge_tables_with_llm(all_tables, parse_description)
-                 st.write("Merged Table (LLM Output):")
-                 st.write(merged_table_string)
-
-                 # Convert merged table string to DataFrame
-                 merged_tables = markdown_to_csv(merged_table_string)
-                 if merged_tables:
-                     st.write("Merged Table (DataFrame):")
-                     st.write(merged_tables[0])
-                 else:
-                     st.write("Could not convert merged table string to DataFrame.")
-             else:
-                 st.write("No tables to merge.")
+ from langchain_core.prompts import ChatPromptTemplate
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+ from huggingface_hub import login
+ import torch
+ import os
+
+ # Hugging Face API Token from Space Secrets
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ if not HF_TOKEN:
+     raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Space Settings under Secrets.")
+
+ # Model configuration
+ MODEL_NAME = "facebook/opt-125m"  # Lightweight model; replace with e.g., mistralai/Mixtral-8x7B-Instruct-v0.1 for paid Spaces with GPU
+
+ # Initialize model and tokenizer
+ try:
+     # Log in to Hugging Face Hub
+     login(token=HF_TOKEN)
+     print("Successfully logged in to Hugging Face Hub")
+
+     # Load tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+     # Create text generation pipeline
+     llm_pipeline = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         device=0 if torch.cuda.is_available() else -1,  # Use GPU if available in Space
+         max_new_tokens=500,  # Limit response length
+         pad_token_id=tokenizer.eos_token_id,  # Ensure proper padding
+     )
+ except Exception as e:
+     print(f"Failed to load model: {str(e)}")
+     llm_pipeline = None
+
+ # Function to parse and extract information from the chunks
+ def parse(dom_chunks, parse_description):
+     """Parse and extract information from DOM chunks using a local LLM."""
+     if llm_pipeline is None:
+         raise ValueError("LLM pipeline not initialized. Check model loading and ensure HF_TOKEN is set in Space Secrets.")
+
+     # Create a prompt template
+     template = (
+         "You are tasked with extracting specific information from the following text content: {dom_content}. "
+         "Please follow these instructions carefully:\n\n"
+         "1. **Task:** Extract data from the provided text that matches the description: {parse_description}.\n"
+         "2. **Output Format:** Return the extracted data ONLY as one or more Markdown tables. Each table MUST be correctly formatted.\n"
+         "3. **Markdown Table Format:** Each table must adhere to the following Markdown format:\n"
+         "   - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
+         "   - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
+         "   - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
+         "   - Use pipes (|) to separate columns in each data row.\n"
+         "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table(s).\n"
+         "5. **Empty Response:** If no information matches the description, return an empty string ('').\n"
+         "6. **Multiple Tables:** If the text contains multiple tables matching the description, return each table separately, following the Markdown format for each.\n"
+         "7. **Accuracy:** The extracted data must be accurate and reflect the information in the provided text.\n"
+     )
+
+     parsed_results = []
+
+     # Loop through the chunks and parse
+     for i, chunk in enumerate(dom_chunks, start=1):
+         # Format the prompt
+         prompt = template.format(dom_content=chunk, parse_description=parse_description)
+
+         # Invoke the LLM pipeline
+         response = llm_pipeline(prompt, max_length=2000, truncation=True)
+         result = response[0]["generated_text"]
+
+         # Clean the output to keep only the Markdown table (remove prompt text)
+         start_idx = result.find("|")
+         if start_idx != -1:
+             result = result[start_idx:]
+         else:
+             result = ""  # Return empty string if no table is found
+
+         print(f"Parsed batch {i} of {len(dom_chunks)}")
+         parsed_results.append(result)
+
+     # Return the parsed results as a single string
+     return "\n".join(parsed_results)
  
  def merge_tables_with_llm(tables, parse_description):
      """Merges a list of Pandas DataFrames into a single Markdown table using a local LLM."""
+     if llm_pipeline is None:
+         raise ValueError("LLM pipeline not initialized. Check model loading and ensure HF_TOKEN is set in Space Secrets.")
+
      # Convert DataFrames to Markdown strings
      table_strings = [table.to_markdown(index=False) for table in tables]
  
-     # Create a prompt for the LLM
+     # Create a prompt for the LLM
      merge_prompt = (
          "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
          "The tables contain information related to: {parse_description}.\n"
-         "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
-         "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
-         "Ensure the final output is a single valid Markdown table.\n\n"
+         "Please follow these instructions carefully:\n\n"
+         "1. **Task:** Merge the data from the following tables into a single table that matches the description: {parse_description}.\n"
+         "2. **Output Format:** Return the merged data ONLY as a single Markdown table. The table MUST be correctly formatted.\n"
+         "3. **Markdown Table Format:** The table must adhere to the following Markdown format:\n"
+         "   - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
+         "   - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
+         "   - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
+         "   - Use pipes (|) to separate columns in each data row.\n"
+         "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table.\n"
+         "5. **Empty Response:** Return an empty string ('') if no data can be merged.\n"
+         "6. **Duplicate Columns:** If there are duplicate columns, rename them to be unique.\n"
+         "7. **Missing Values:** If there are missing values, fill them with 'N/A'.\n\n"
          "Here are the tables:\n\n" + "\n\n".join(table_strings) +
          "\n\nReturn the merged table in Markdown format:"
      ).format(parse_description=parse_description)
  
-     # Call the local model via the pipeline
+     # Invoke the LLM pipeline
      response = llm_pipeline(merge_prompt, max_length=2000, truncation=True)
      merged_table = response[0]["generated_text"]
  
-     # Clean the output to keep only the Markdown table
-     # Assume the table starts right after the prompt
+     # Clean the output to keep only the Markdown table
      start_idx = merged_table.find("|")
      if start_idx != -1:
          merged_table = merged_table[start_idx:]
      else:
-         merged_table = "No valid Markdown table found in LLM output."
+         merged_table = ""  # Return empty string if no table is found
  
      return merged_table
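
With the Streamlit UI gone from parse.py, parse() and merge_tables_with_llm() are plain importable helpers. Below is a minimal sketch of how the UI that previously lived in this file could drive them from a separate script; app.py and the scrape module are hypothetical names, and scrape_website, extract_body_content, clean_body_content, split_dom_content, and markdown_to_csv are the helpers the old code called, assumed to be importable.

# Hypothetical app.py: re-creates the removed UI on top of the refactored helpers.
import streamlit as st

from parse import parse, merge_tables_with_llm
from scrape import (  # assumed module; names match the calls in the removed code
    scrape_website, extract_body_content, clean_body_content,
    split_dom_content, markdown_to_csv,
)

st.title("AI Web Scraper")
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    # Scrape each site and keep the cleaned DOM text in session state
    st.session_state.all_dom_content = [
        clean_body_content(extract_body_content(scrape_website(url)))
        for url in urls_list
    ]

if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")
    if st.button("Parse Content") and parse_description:
        all_tables = []
        for i, dom_content in enumerate(st.session_state.all_dom_content, start=1):
            st.write(f"Parsing content from site {i}")
            result = parse(split_dom_content(dom_content), parse_description)
            all_tables.extend(markdown_to_csv(result))  # Markdown -> DataFrames
        if all_tables:
            st.write(merge_tables_with_llm(all_tables, parse_description))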
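The markdown_to_csv helper referenced above is not part of this diff; merge_tables_with_llm only requires that it return pandas DataFrames (it calls .to_markdown() on each). A rough, hypothetical sketch under that assumption, handling simple pipe tables with one header row and one alignment row:

# Hypothetical markdown_to_csv: parses pipe-delimited Markdown tables into DataFrames.
from io import StringIO
import pandas as pd

def markdown_to_csv(markdown_text):
    """Extract each Markdown pipe table in the text as a pandas DataFrame (sketch)."""
    tables, block = [], []
    for line in markdown_text.splitlines() + [""]:  # trailing "" flushes the last block
        if line.strip().startswith("|"):
            block.append(line.strip().strip("|"))
        elif block:
            # Drop the |---|---| alignment row, then read the rest as '|'-separated CSV
            rows = [r for r in block if not set(r.replace("|", "").strip()) <= set("-: ")]
            df = pd.read_csv(StringIO("\n".join(rows)), sep="|", skipinitialspace=True)
            df.columns = [c.strip() for c in df.columns]
            tables.append(df)
            block = []
    return tables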