PyQuarX committed on
Commit
fb65c7a
·
verified ·
1 Parent(s): f64f0d6

Update parse.py

Browse files
Files changed (1) hide show
  1. parse.py +85 -61
parse.py CHANGED
@@ -1,83 +1,107 @@
1
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
import os

import pandas as pd

# Load the OpenRouter API key from the environment -- never hardcode secrets
# in source control. The literal key previously committed here must be
# considered leaked and should be revoked.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY", "")

model = ChatOpenAI(
    openai_api_key=openrouter_api_key,  # Use OpenRouter API key
    model="meta-llama/llama-4-maverick:free",  # Model served via OpenRouter
    base_url="https://openrouter.ai/api/v1",  # OpenRouter API URL
)
17
 
18
# Create a chat prompt template.
# {dom_content} and {parse_description} are placeholders filled in later by
# ChatPromptTemplate.from_template(...) inside parse().
template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully:\n\n"
    "1. **Task:** Extract data from the provided text that matches the description: {parse_description}.\n"
    "2. **Output Format:** Return the extracted data ONLY as one or more Markdown tables. Each table MUST be correctly formatted.\n"
    "3. **Markdown Table Format:** Each table must adhere to the following Markdown format:\n"
    " - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
    " - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
    " - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
    " - Use pipes (|) to separate columns in each data row.\n"
    "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table(s).\n"
    "5. **Empty Response:** If no information matches the description, return an empty string ('').\n"
    "6. **Multiple Tables:** If the text contains multiple tables matching the description, return each table separately, following the Markdown format for each.\n"
    "7. **Accuracy:** The extracted data must be accurate and reflect the information in the provided text.\n"
)
34
 
35
# Function to parse and extract information from the chunks
def parse(dom_chunks, parse_description):
    """Run the extraction prompt over every DOM chunk and join the answers.

    Args:
        dom_chunks: Sequence of text chunks from a scraped page (must support
            len(), used for progress reporting below).
        parse_description: User description of the data to extract; substituted
            into the module-level ``template``.

    Returns:
        All per-chunk model outputs joined with newlines (each expected to be
        a Markdown table, per ``template``).
    """
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    # Loop through the chunks and parse
    for i, chunk in enumerate(dom_chunks, start=1):
        response = chain.invoke({"dom_content": chunk, "parse_description": parse_description})

        # Extract the content from AIMessage and add it to the results
        print(f"Parsed batch {i} of {len(dom_chunks)}")
        parsed_results.append(response.content)  # Ensure content is extracted properly

    # Return the parsed results as a single string
    return "\n".join(parsed_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
def merge_tables_with_llm(tables, parse_description):
    """Merges a list of Pandas DataFrames into a single Markdown table using LLM.

    Args:
        tables: Pandas DataFrames whose contents should be merged.
        parse_description: Description of the data, substituted into the prompt.

    Returns:
        The model's response content (expected to be a single Markdown table).
    """
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI
    # Convert DataFrames to Markdown strings
    table_strings = [table.to_markdown(index=False) for table in tables]

    # Fix: substitute the description up front. The original never called
    # .format(), so the literal "{parse_description}" placeholder was sent to
    # the model. Formatting only the instruction text (not the appended table
    # data) also keeps braces inside table cells from breaking str.format().
    instructions = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        "The tables contain information related to: {parse_description}.\n"
        "Please follow these instructions carefully:\n\n"
        "1. **Task:** Merge the data from the following tables into a single table that matches the description: {parse_description}.\n"
        "2. **Output Format:** Return the merged data ONLY as a single Markdown table. The table MUST be correctly formatted.\n"
        "3. **Markdown Table Format:** The table must adhere to the following Markdown format:\n"
        " - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
        " - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
        " - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
        " - Use pipes (|) to separate columns in each data row.\n"
        "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table.\n"
        "5. **Empty Response:** If no information matches the description, return an empty string ('') if no data can be merged.\n"
        "6. **Duplicate Columns:** If there are duplicate columns, rename them to be unique.\n"
        "7. **Missing Values:** If there are missing values, fill them with 'N/A'.\n\n"
    ).format(parse_description=parse_description)
    merge_prompt = (
        instructions
        + "Here are the tables:\n\n" + "\n\n".join(table_strings)
        + "\n\nReturn the merged table in Markdown format:"
    )

    # Invoke the LLM
    message = HumanMessage(content=merge_prompt)
    response = model.invoke([message])
    return response.content
 
1
import streamlit as st
from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
# NOTE(review): this commit claims to update parse.py, yet the content imports
# from `parse` -- if this really is parse.py that is a circular self-import;
# it reads like an app.py / main.py script. Confirm the intended target file.
from parse import parse, merge_tables_with_llm
from Data import markdown_to_csv
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import torch  # Fix: torch is used for the CUDA check below but was never imported

# Initialize the LLM with transformers.
# NOTE(review): the original comment called this a lightweight model, but a
# 17B-parameter checkpoint is not -- confirm the target hardware can load it.
model_name = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text-generation pipeline
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # Use the GPU when available
    max_new_tokens=500,  # Limit the response length
)
21
 
22
st.title("AI Web Scraper")

# Multi-URL Input: one URL per line; blank lines are ignored.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    all_results = []
    for url in urls_list:
        st.write(f"Scraping: {url}")
        result = scrape_website(url)
        body_content = extract_body_content(result)
        cleaned_content = clean_body_content(body_content)
        all_results.append(cleaned_content)

    # Persist the cleaned page text across Streamlit reruns so the parsing
    # stage can use it after the next button click.
    st.session_state.all_dom_content = all_results
 
 
 
38
 
39
# Parsing stage: only rendered once some scraped content is in session state.
if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")

    if st.button("Parse Content"):
        if parse_description:
            all_tables = []
            # Parse each scraped site independently, chunking the DOM text so
            # each request fits the model's context window.
            for i, dom_content in enumerate(st.session_state.all_dom_content):
                st.write(f"Parsing content from site {i+1}")
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)  # Make sure this function uses the local model
                st.write("Raw LLM Output:")
                st.write(result)

                # markdown_to_csv presumably extracts DataFrames from the
                # Markdown output -- verify against Data.py.
                tables = markdown_to_csv(result)
                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                        all_tables.append(table)
                else:
                    st.write("No tables found in the output. Displaying raw output instead.")
                    st.text_area("Raw Output", result, height=200)

            # Merge tables using LLM
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert merged table string to DataFrame
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    st.write(merged_tables[0])
                else:
                    st.write("Could not convert merged table string to DataFrame.")
            else:
                st.write("No tables to merge.")
78
 
79
def merge_tables_with_llm(tables, parse_description):
    """Merges a list of Pandas DataFrames into a single Markdown table using a local LLM.

    Args:
        tables: Pandas DataFrames to merge (each must support ``to_markdown``).
        parse_description: Free-text description of the data, interpolated
            into the prompt.

    Returns:
        The merged Markdown table as a string, or a fallback message when the
        model output contains no table.
    """
    # Convert DataFrames to Markdown strings
    table_strings = [table.to_markdown(index=False) for table in tables]

    # Create a prompt for the LLM. Fix: format only the instruction text --
    # the original formatted the whole prompt *including* the table data, so
    # any literal brace inside a table cell raised KeyError/ValueError.
    instructions = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        "The tables contain information related to: {parse_description}.\n"
        "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
        "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
        "Ensure the final output is a single valid Markdown table.\n\n"
    ).format(parse_description=parse_description)
    merge_prompt = (
        instructions
        + "Here are the tables:\n\n" + "\n\n".join(table_strings)
        + "\n\nReturn the merged table in Markdown format:"
    )

    # Call the local model via the pipeline
    response = llm_pipeline(merge_prompt, max_length=2000, truncation=True)
    generated = response[0]["generated_text"]

    # Fix: "text-generation" pipelines echo the prompt in generated_text by
    # default, and the prompt itself contains Markdown tables full of pipes --
    # so searching for the first "|" in the raw output matched a pipe inside
    # the echoed prompt and returned the *input* tables. Strip the prompt
    # prefix before locating the table.
    if generated.startswith(merge_prompt):
        generated = generated[len(merge_prompt):]

    # Keep only the Markdown-table portion of the completion.
    start_idx = generated.find("|")
    if start_idx != -1:
        merged_table = generated[start_idx:]
    else:
        merged_table = "No valid Markdown table found in LLM output."

    return merged_table