PyQuarX committed
Commit 188a2fe · verified · 1 Parent(s): 502cf8b

Upload 6 files

Files changed (6)
  1. Data.py +25 -0
  2. README.md +45 -14
  3. app.py +37 -0
  4. parse.py +47 -0
  5. requirements.txt +10 -0
  6. scraper.py +50 -0
Data.py ADDED
@@ -0,0 +1,25 @@
+
+ import re
+ import pandas as pd
+
+ def markdown_to_csv(llm_output):
+     # Find all Markdown tables in the LLM output
+     tables = re.findall(r"(\|(?:[^\n]+\|)+\n\|(?:\s*-+\s*\|)+\n(?:\|(?:[^\n]+\|)+\n)+)", llm_output)
+
+     dataframes = []
+
+     if tables:  # Added to avoid the 'NoneType' issue
+         for table in tables:
+             # Split lines and extract columns
+             lines = table.strip().split("\n")
+             headers = [col.strip() for col in lines[0].split("|")[1:-1]]  # Headers
+             data_rows = [line.split("|")[1:-1] for line in lines[2:]]  # Skip the separator row
+
+             # Clean and validate rows
+             cleaned_data = [[col.strip() for col in row] for row in data_rows if len(row) == len(headers)]
+
+             if cleaned_data:
+                 df = pd.DataFrame(cleaned_data, columns=headers)
+                 dataframes.append(df)
+
+     return dataframes  # Fixed: outside the loop so every table is returned
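
A minimal usage sketch for `markdown_to_csv` (not part of the commit; the table text is made up). One detail worth noting: the regex above only captures data rows terminated by a newline, so the last table row should keep its trailing line break.

```python
from Data import markdown_to_csv  # assumes Data.py sits on the import path

# Made-up LLM output containing one Markdown table.
# The trailing "\n" after the last row matters: the regex only
# matches data rows that end with a newline.
llm_output = (
    "| Name | Price |\n"
    "| --- | --- |\n"
    "| Widget | 9.99 |\n"
    "| Gadget | 19.99 |\n"
)

for df in markdown_to_csv(llm_output):
    print(df)  # a 2x2 DataFrame with columns "Name" and "Price"
```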
README.md CHANGED
@@ -1,14 +1,45 @@
- ---
- title: Scrape With Ai
- emoji: 📉
- colorFrom: pink
- colorTo: blue
- sdk: streamlit
- sdk_version: 1.44.1
- app_file: app.py
- pinned: false
- license: mit
- short_description: Webscraper
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Project Overview
+
+ This project implements a text parsing and information extraction tool built around a chat model. The core parsing logic lives in `parse.py`; `app.py` provides a Streamlit interface, `scraper.py` fetches and cleans the web page, and `Data.py` converts the extracted Markdown tables into pandas DataFrames.
+
+ ## Files
+
+ - `parse.py`: Contains the implementation for parsing and extracting information from text content using a chat model. It imports the necessary libraries, sets up an API key, defines a chat prompt template, and includes a `parse` function that processes chunks of text based on a provided description.
+
+ ## Setup Instructions
+
+ 1. **Clone the Repository**
+    ```
+    git clone <repository-url>
+    cd project
+    ```
+
+ 2. **Install Dependencies**
+    Ensure you have Python installed, then install the required libraries:
+    ```
+    pip install -r requirements.txt
+    ```
+
+ 3. **Set Up API Key**
+    Replace the placeholder API key in `parse.py` with your actual OpenRouter API key.
+
+ ## Usage
+
+ To use the parsing functionality, call the `parse` function from `parse.py` with the appropriate parameters:
+
+ ```python
+ from parse import parse
+
+ dom_chunks = ["Your text content here"]
+ parse_description = "Description of the information to extract"
+ results = parse(dom_chunks, parse_description)
+ print(results)
+ ```
+
+ ## Contributing
+
+ Contributions are welcome! Please submit a pull request or open an issue for any enhancements or bug fixes.
+
+ ## License
+
+ This project is licensed under the MIT License.
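
For context, here is a hedged end-to-end sketch of the workflow the README describes, wiring the uploaded modules together outside Streamlit. The URL and description are placeholders; it assumes a local chromedriver (see `scraper.py`) and a valid OpenRouter API key, and the `parse` call does hit the API.

```python
from scraper import scrape_website, extract_body_content, clean_body_content, split_dom_content
from parse import parse
from Data import markdown_to_csv

url = "https://example.com"  # placeholder URL

html = scrape_website(url)  # needs chromedriver installed
cleaned = clean_body_content(extract_body_content(html))
chunks = split_dom_content(cleaned)

markdown_tables = parse(chunks, "A table of product names and prices")  # calls the OpenRouter API
for df in markdown_to_csv(markdown_tables):
    print(df)
```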
app.py ADDED
@@ -0,0 +1,37 @@
+ import streamlit as st
+ from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
+ from parse import parse
+ from Data import markdown_to_csv
+
+ st.title("AI Web Scraper")
+ url = st.text_input("Enter a Website URL")
+
+ if st.button("Scrape Site"):
+     st.write("Scraping the website")
+
+     result = scrape_website(url)
+     body_content = extract_body_content(result)
+     cleaned_content = clean_body_content(body_content)
+
+     st.session_state.dom_content = cleaned_content
+
+     with st.expander("View DOM Content"):
+         st.text_area("DOM Content", cleaned_content, height=300)
+
+ if "dom_content" in st.session_state:
+     parse_description = st.text_area("Describe what you want to parse")
+
+     if st.button("Parse Content"):
+         if parse_description:
+             st.write("Parsing Content")
+
+             dom_chunks = split_dom_content(st.session_state.dom_content)
+             result = parse(dom_chunks, parse_description)
+             print(repr(result))
+
+             # Convert the Markdown tables in the result into DataFrames and display them
+             tables = markdown_to_csv(result)
+             for i in tables:
+                 st.write(i)
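
One possible refinement of the display step, not part of this commit: Streamlit can render each DataFrame interactively and offer it as a CSV download, which matches the intent of the `markdown_to_csv` name. A sketch, assuming the `tables` list produced above:

```python
# Hypothetical replacement for the final display loop in app.py.
for i, df in enumerate(tables):
    st.dataframe(df)  # interactive table instead of plain st.write
    st.download_button(
        label=f"Download table {i + 1} as CSV",
        data=df.to_csv(index=False),
        file_name=f"table_{i + 1}.csv",
        mime="text/csv",
        key=f"download_{i}",  # unique key per widget inside the loop
    )
```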
parse.py ADDED
@@ -0,0 +1,47 @@
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai import ChatOpenAI
+ import os
+
+ # Load the OpenRouter API key (placeholder: set OPENROUTER_API_KEY or edit this line, see README)
+ openrouter_api_key = os.environ.get("OPENROUTER_API_KEY", "your-openrouter-api-key")
+
+ model = ChatOpenAI(
+     openai_api_key=openrouter_api_key,         # Use OpenRouter API key
+     model="meta-llama/llama-4-maverick:free",  # Llama 4 Maverick (free tier) via OpenRouter
+     base_url="https://openrouter.ai/api/v1"    # OpenRouter API URL
+ )
+
+ # Create a chat prompt template
+ template = (
+     "You are tasked with extracting specific information from the following text content: {dom_content}. "
+     "Please follow these instructions carefully:\n\n"
+     "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}.\n"
+     "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response.\n"
+     "3. **Empty Response:** If no information matches the description, return an empty string ('').\n"
+     "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text.\n"
+     "5. **Type:** The output should always be a table, and if there is more than one table, return every table separately. Use Markdown table format.\n"
+     "6. **Standardized Table Format:** Ensure each table is formatted as a Markdown table with clear headers and consistent column alignment.\n"
+     "7. **Accuracy:** The output should be as accurate as possible.\n"
+     "8. **Column Separators:** Use the pipe symbol (|) to clearly separate columns in the Markdown table.\n"
+     "9. **Header Row:** The first row of each table should be the header row, clearly labeling each column.\n"
+     "10. **Alignment Row:** The second row should contain hyphens (-) to indicate column alignment (e.g., --- for left alignment, :---: for center alignment, ---: for right alignment).\n"
+     "11. **Data Rows:** Subsequent rows should contain the data, with each cell aligned according to the alignment row.\n"
+ )
+
+ # Function to parse and extract information from the chunks
+ def parse(dom_chunks, parse_description):
+     prompt = ChatPromptTemplate.from_template(template)
+     chain = prompt | model
+
+     parsed_results = []
+
+     # Loop through the chunks and parse each one
+     for i, chunk in enumerate(dom_chunks, start=1):
+         response = chain.invoke({"dom_content": chunk, "parse_description": parse_description})
+
+         # Extract the content from the AIMessage and add it to the results
+         print(f"Parsed batch {i} of {len(dom_chunks)}")
+         parsed_results.append(response.content)
+
+     # Return the parsed results as a single string
+     return "\n".join(parsed_results)
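
A minimal offline sketch (no API call) showing how the template's placeholders are filled before the chain sends anything to the model; the chunk and description are made up:

```python
from langchain_core.prompts import ChatPromptTemplate
from parse import template  # importing parse builds the ChatOpenAI client but makes no API call

prompt = ChatPromptTemplate.from_template(template)

# Format the prompt for one chunk without invoking the model.
messages = prompt.format_messages(
    dom_content="Widget costs 9.99. Gadget costs 19.99.",     # made-up chunk
    parse_description="A table of product names and prices",  # made-up description
)
print(messages[0].content)
```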
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ bs4
+ numpy
+ pandas
+ selenium
+ streamlit
+ langchain_core
+ langchain_openai
scraper.py ADDED
@@ -0,0 +1,50 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ import time
+ from bs4 import BeautifulSoup
+
+
+ def scrape_website(website):
+     print("Launching Chrome browser...")
+
+     chrome_driver_path = "/usr/bin/chromedriver"
+     options = webdriver.ChromeOptions()
+     # Note: options must be passed to webdriver.Chrome, not to Service
+     driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
+
+     try:
+         driver.get(website)
+         print("Page Loaded...")
+         html = driver.page_source
+
+         return html
+
+     finally:
+         driver.quit()
+
+
+ def extract_body_content(html_content):
+     soup = BeautifulSoup(html_content, "html.parser")
+     body_content = soup.body
+     if body_content:
+         return str(body_content)
+     return ""
+
+
+ def clean_body_content(body_content):
+     soup = BeautifulSoup(body_content, "html.parser")
+
+     # Drop script and style tags before extracting text
+     for script_or_style in soup(["script", "style"]):
+         script_or_style.extract()
+
+     cleaned_content = soup.get_text(separator="\n")
+     cleaned_content = "\n".join(
+         line.strip() for line in cleaned_content.splitlines() if line.strip()
+     )
+
+     return cleaned_content
+
+
+ def split_dom_content(dom_content, max_length=60000):
+     return [
+         dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)
+     ]
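
In a container or hosted Space there is usually no display attached, so Chrome typically has to run headless. A hedged variant of the driver setup (the extra flags are common Chrome options, not something this commit configures):

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def make_headless_driver(chrome_driver_path="/usr/bin/chromedriver"):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")           # run without a display
    options.add_argument("--no-sandbox")             # often required inside containers
    options.add_argument("--disable-dev-shm-usage")  # avoid /dev/shm size issues
    # Options go to webdriver.Chrome, not to Service.
    return webdriver.Chrome(service=Service(chrome_driver_path), options=options)
```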