Upload 6 files
- Data.py +25 -0
- README.md +45 -14
- app.py +37 -0
- parse.py +47 -0
- requirements.txt +10 -0
- scraper.py +50 -0
Data.py
ADDED
@@ -0,0 +1,25 @@
import re
import pandas as pd


def markdown_to_csv(llm_output):
    # Find all Markdown tables; the separator row may carry alignment
    # colons (e.g. :---:), which the prompt in parse.py asks for
    tables = re.findall(r"(\|(?:[^\n]+\|)+\n\|(?:\s*:?-+:?\s*\|)+\n(?:\|(?:[^\n]+\|)+\n)+)", llm_output)

    dataframes = []

    if tables:  # Guard added to avoid the 'NoneType' issue
        for table in tables:
            # Split lines and extract columns
            lines = table.strip().split("\n")
            headers = [col.strip() for col in lines[0].split("|")[1:-1]]  # Header row
            data_rows = [line.split("|")[1:-1] for line in lines[2:]]  # Skip the separator row

            # Clean and validate rows
            cleaned_data = [[col.strip() for col in row] for row in data_rows if len(row) == len(headers)]

            if cleaned_data:
                df = pd.DataFrame(cleaned_data, columns=headers)
                dataframes.append(df)

    return dataframes  # Fixed: the return sits outside the loop
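A quick way to sanity-check `markdown_to_csv` outside the app (a minimal sketch, not part of the commit; the hand-written `sample_output` string stands in for a real model response):

```python
from Data import markdown_to_csv

# Hand-written stand-in for an LLM response containing one Markdown table
sample_output = (
    "| Name | Price |\n"
    "| --- | --- |\n"
    "| Widget | 9.99 |\n"
    "| Gadget | 19.99 |\n"
)

for df in markdown_to_csv(sample_output):
    print(df.to_csv(index=False))  # each table converts cleanly to CSV text
```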
README.md
CHANGED
@@ -1,14 +1,45 @@
# Project Overview

This project implements a text parsing and information extraction tool built around a chat model. The main functionality lives in `parse.py`, which calls an API to process text content and extract the information that matches a given description.

## Files

- `parse.py`: Implements parsing and information extraction using a chat model. It imports the necessary libraries, sets up the API key, defines a chat prompt template, and provides a `parse` function that processes chunks of text according to a given description.
- `scraper.py`: Loads a page with Selenium, extracts the `<body>`, strips scripts and styles, and splits the cleaned text into chunks.
- `Data.py`: Converts Markdown tables in the model output into pandas DataFrames.
- `app.py`: Streamlit front end that ties scraping, parsing, and table extraction together.
- `requirements.txt`: Python dependencies.

## Setup Instructions

1. **Clone the Repository**
   ```
   git clone <repository-url>
   cd project
   ```

2. **Install Dependencies**
   Ensure you have Python installed, then install the required libraries:
   ```
   pip install -r requirements.txt
   ```

3. **Set Up API Key**
   Set the `OPENROUTER_API_KEY` environment variable (or replace the placeholder in `parse.py`) with your actual OpenRouter API key.

## Usage

To use the parsing functionality, call the `parse` function from `parse.py` with the appropriate parameters:

```python
from parse import parse

dom_chunks = ["Your text content here"]
parse_description = "Description of the information to extract"
results = parse(dom_chunks, parse_description)
print(results)
```
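To launch the Streamlit interface instead, run the app from the project root:

```
streamlit run app.py
```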
## Contributing

Contributions are welcome! Please submit a pull request or open an issue for any enhancements or bug fixes.

## License

This project is licensed under the MIT License.
app.py
ADDED
@@ -0,0 +1,37 @@
import streamlit as st
from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
from parse import parse
from Data import markdown_to_csv  # Fixed: Data.py sits at the repo root, not in a 'project' package

st.title("AI Web Scraper")
url = st.text_input("Enter a Website URL")

if st.button("Scrape Site"):
    st.write("Scraping the website")

    result = scrape_website(url)
    body_content = extract_body_content(result)
    cleaned_content = clean_body_content(body_content)

    st.session_state.dom_content = cleaned_content

    with st.expander("View DOM Content"):
        st.text_area("DOM Content", cleaned_content, height=300)

if "dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse")

    if st.button("Parse Content"):
        if parse_description:
            st.write("Parsing Content")

            dom_chunks = split_dom_content(st.session_state.dom_content)
            result = parse(dom_chunks, parse_description)
            print(repr(result))

            # Apply the Markdown-to-DataFrame conversion
            tables = markdown_to_csv(result)
            for table in tables:
                st.write(table)
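One possible extension, not part of the commit: since `markdown_to_csv` returns pandas DataFrames, each table could also be offered for download. A sketch, assuming it is placed inside the `Parse Content` branch above where `tables` is in scope:

```python
# Hypothetical addition: expose each extracted table as a CSV download
for idx, table in enumerate(tables, start=1):
    st.download_button(
        label=f"Download table {idx} as CSV",
        data=table.to_csv(index=False),
        file_name=f"table_{idx}.csv",
        mime="text/csv",
    )
```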
parse.py
ADDED
@@ -0,0 +1,47 @@
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
import os

# Load the OpenRouter API key from the environment; never commit a real key
openrouter_api_key = os.getenv("OPENROUTER_API_KEY", "YOUR_OPENROUTER_API_KEY")

model = ChatOpenAI(
    openai_api_key=openrouter_api_key,  # Use OpenRouter API key
    model="meta-llama/llama-4-maverick:free",  # Llama 4 Maverick via OpenRouter (free tier)
    base_url="https://openrouter.ai/api/v1"  # OpenRouter API URL
)

# Create a chat prompt template
template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully:\n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}.\n"
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response.\n"
    "3. **Empty Response:** If no information matches the description, return an empty string ('').\n"
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text.\n"
    "5. **Type:** The output should always be a table; if there is more than one table, return each table separately. Use Markdown table format.\n"
    "6. **Standardized Table Format:** Ensure each table is formatted as a Markdown table with clear headers and consistent column alignment.\n"
    "7. **Accuracy:** The output should be as accurate as possible.\n"
    "8. **Column Separators:** Use the pipe symbol (|) to clearly separate columns in the Markdown table.\n"
    "9. **Header Row:** The first row of each table should be the header row, clearly labeling each column.\n"
    "10. **Alignment Row:** The second row should contain hyphens (-) to indicate column alignment (e.g., --- for left alignment, :---: for center alignment, ---: for right alignment).\n"
    "11. **Data Rows:** Subsequent rows should contain the data, with each cell aligned according to the alignment row.\n"
)

# Function to parse and extract information from the chunks
def parse(dom_chunks, parse_description):
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    # Loop through the chunks and parse each one
    for i, chunk in enumerate(dom_chunks, start=1):
        response = chain.invoke({"dom_content": chunk, "parse_description": parse_description})

        # Extract the content from the AIMessage and add it to the results
        print(f"Parsed batch {i} of {len(dom_chunks)}")
        parsed_results.append(response.content)

    # Return the parsed results as a single string
    return "\n".join(parsed_results)
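Rules 5–11 of the template pin the model's output to exactly the shape `Data.markdown_to_csv` can consume. An illustration of that contract (the response below is hand-written, not captured from the model):

```python
from Data import markdown_to_csv

# Hand-written example of a response that follows rules 8-11,
# including alignment colons in the separator row
model_response = (
    "| Product | Price | Rating |\n"
    "| :--- | ---: | :---: |\n"
    "| Widget | 9.99 | 4.5 |\n"
    "| Gadget | 19.99 | 4.8 |\n"
)

tables = markdown_to_csv(model_response)
print(tables[0])  # a DataFrame with columns Product, Price, Rating
```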
requirements.txt
ADDED
@@ -0,0 +1,10 @@
# Install with: pip install -r requirements.txt
bs4
numpy
pandas
selenium
streamlit
langchain-core
langchain-openai
scraper.py
ADDED
@@ -0,0 +1,50 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup


def scrape_website(website):
    print("Launching chrome browser...")

    chrome_driver_path = "/usr/bin/chromedriver"
    options = webdriver.ChromeOptions()
    # Fixed: 'options' belongs to webdriver.Chrome, not to Service
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

    try:
        driver.get(website)
        print("Page Loaded...")
        html = driver.page_source

        return html

    finally:
        driver.quit()


def extract_body_content(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    if body_content:
        return str(body_content)
    return ""


def clean_body_content(body_content):
    soup = BeautifulSoup(body_content, "html.parser")

    # Drop script and style tags before extracting text
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()

    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )

    return cleaned_content


def split_dom_content(dom_content, max_length=60000):
    # Chunk the text so each piece fits within the model's context window
    return [
        dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)
    ]
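On a headless host (for instance a container or a Hugging Face Space), Chrome usually needs extra flags before `scrape_website` will start. Which flags are required depends on the environment, so treat this as an assumption rather than part of the commit:

```python
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")           # no display available
options.add_argument("--no-sandbox")             # commonly required in containers
options.add_argument("--disable-dev-shm-usage")  # work around a small /dev/shm
```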