Upload 6 files
- Data.py +25 -0
- README.md +45 -14
- app.py +37 -0
- parse.py +47 -0
- requirements.txt +10 -0
- scraper.py +50 -0
Data.py
ADDED
@@ -0,0 +1,25 @@
import re
import pandas as pd


def markdown_to_csv(llm_output):
    # Find all Markdown tables; the separator row may carry alignment
    # colons (e.g. :---:), which the prompt in parse.py asks for
    tables = re.findall(r"(\|(?:[^\n]+\|)+\n\|(?:\s*:?-+:?\s*\|)+\n(?:\|(?:[^\n]+\|)+\n)+)", llm_output)

    dataframes = []

    if tables:  # Guard added to avoid the 'NoneType' issue
        for table in tables:
            # Split lines and extract columns
            lines = table.strip().split("\n")
            headers = [col.strip() for col in lines[0].split("|")[1:-1]]  # Header row
            data_rows = [line.split("|")[1:-1] for line in lines[2:]]  # Skip the separator row

            # Clean and validate rows
            cleaned_data = [[col.strip() for col in row] for row in data_rows if len(row) == len(headers)]

            if cleaned_data:
                df = pd.DataFrame(cleaned_data, columns=headers)
                dataframes.append(df)

    return dataframes  # Fixed: the return sits outside the loop
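A quick way to sanity-check `markdown_to_csv` outside the app (a minimal sketch, not part of the commit; the hand-written `sample_output` string stands in for a real model response):

```python
from Data import markdown_to_csv

# Hand-written stand-in for an LLM response containing one Markdown table
sample_output = (
    "| Name | Price |\n"
    "| --- | --- |\n"
    "| Widget | 9.99 |\n"
    "| Gadget | 19.99 |\n"
)

for df in markdown_to_csv(sample_output):
    print(df.to_csv(index=False))  # each table converts cleanly to CSV text
```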
README.md
CHANGED
@@ -1,14 +1,45 @@
# Project Overview

This project implements a text parsing and information extraction tool built around a chat model. The main functionality lives in `parse.py`, which calls an API to process text content and extract the information that matches a given description.

## Files

- `parse.py`: Implements parsing and information extraction using a chat model. It imports the necessary libraries, sets up the API key, defines a chat prompt template, and provides a `parse` function that processes chunks of text according to a given description.
- `scraper.py`: Loads a page with Selenium, extracts the `<body>`, strips scripts and styles, and splits the cleaned text into chunks.
- `Data.py`: Converts Markdown tables in the model output into pandas DataFrames.
- `app.py`: Streamlit front end that ties scraping, parsing, and table extraction together.
- `requirements.txt`: Python dependencies.

## Setup Instructions

1. **Clone the Repository**
   ```
   git clone <repository-url>
   cd project
   ```

2. **Install Dependencies**
   Ensure you have Python installed, then install the required libraries:
   ```
   pip install -r requirements.txt
   ```

3. **Set Up API Key**
   Set the `OPENROUTER_API_KEY` environment variable (or replace the placeholder in `parse.py`) with your actual OpenRouter API key.

## Usage

To use the parsing functionality, call the `parse` function from `parse.py` with the appropriate parameters:

```python
from parse import parse

dom_chunks = ["Your text content here"]
parse_description = "Description of the information to extract"
results = parse(dom_chunks, parse_description)
print(results)
```
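To launch the Streamlit interface instead, run the app from the project root:

```
streamlit run app.py
```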
## Contributing

Contributions are welcome! Please submit a pull request or open an issue for any enhancements or bug fixes.

## License

This project is licensed under the MIT License.
app.py
ADDED
@@ -0,0 +1,37 @@
import streamlit as st
from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
from parse import parse
from Data import markdown_to_csv  # Fixed: Data.py sits at the repo root, not in a 'project' package

st.title("AI Web Scraper")
url = st.text_input("Enter a Website URL")

if st.button("Scrape Site"):
    st.write("Scraping the website")

    result = scrape_website(url)
    body_content = extract_body_content(result)
    cleaned_content = clean_body_content(body_content)

    st.session_state.dom_content = cleaned_content

    with st.expander("View DOM Content"):
        st.text_area("DOM Content", cleaned_content, height=300)

if "dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse")

    if st.button("Parse Content"):
        if parse_description:
            st.write("Parsing Content")

            dom_chunks = split_dom_content(st.session_state.dom_content)
            result = parse(dom_chunks, parse_description)
            print(repr(result))

            # Apply the Markdown-to-DataFrame conversion
            tables = markdown_to_csv(result)
            for table in tables:
                st.write(table)
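One possible extension, not part of the commit: since `markdown_to_csv` returns pandas DataFrames, each table could also be offered for download. A sketch, assuming it is placed inside the `Parse Content` branch above where `tables` is in scope:

```python
# Hypothetical addition: expose each extracted table as a CSV download
for idx, table in enumerate(tables, start=1):
    st.download_button(
        label=f"Download table {idx} as CSV",
        data=table.to_csv(index=False),
        file_name=f"table_{idx}.csv",
        mime="text/csv",
    )
```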
parse.py
ADDED
@@ -0,0 +1,47 @@
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
import os

# Load the OpenRouter API key from the environment; never commit a real key
openrouter_api_key = os.getenv("OPENROUTER_API_KEY", "YOUR_OPENROUTER_API_KEY")

model = ChatOpenAI(
    openai_api_key=openrouter_api_key,  # Use OpenRouter API key
    model="meta-llama/llama-4-maverick:free",  # Llama 4 Maverick via OpenRouter (free tier)
    base_url="https://openrouter.ai/api/v1"  # OpenRouter API URL
)

# Create a chat prompt template
template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully:\n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}.\n"
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response.\n"
    "3. **Empty Response:** If no information matches the description, return an empty string ('').\n"
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text.\n"
    "5. **Type:** The output should always be a table; if there is more than one table, return each table separately. Use Markdown table format.\n"
    "6. **Standardized Table Format:** Ensure each table is formatted as a Markdown table with clear headers and consistent column alignment.\n"
    "7. **Accuracy:** The output should be as accurate as possible.\n"
    "8. **Column Separators:** Use the pipe symbol (|) to clearly separate columns in the Markdown table.\n"
    "9. **Header Row:** The first row of each table should be the header row, clearly labeling each column.\n"
    "10. **Alignment Row:** The second row should contain hyphens (-) to indicate column alignment (e.g., --- for left alignment, :---: for center alignment, ---: for right alignment).\n"
    "11. **Data Rows:** Subsequent rows should contain the data, with each cell aligned according to the alignment row.\n"
)

# Function to parse and extract information from the chunks
def parse(dom_chunks, parse_description):
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    # Loop through the chunks and parse each one
    for i, chunk in enumerate(dom_chunks, start=1):
        response = chain.invoke({"dom_content": chunk, "parse_description": parse_description})

        # Extract the content from the AIMessage and add it to the results
        print(f"Parsed batch {i} of {len(dom_chunks)}")
        parsed_results.append(response.content)

    # Return the parsed results as a single string
    return "\n".join(parsed_results)
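Rules 5–11 of the template pin the model's output to exactly the shape `Data.markdown_to_csv` can consume. An illustration of that contract (the response below is hand-written, not captured from the model):

```python
from Data import markdown_to_csv

# Hand-written example of a response that follows rules 8-11,
# including alignment colons in the separator row
model_response = (
    "| Product | Price | Rating |\n"
    "| :--- | ---: | :---: |\n"
    "| Widget | 9.99 | 4.5 |\n"
    "| Gadget | 19.99 | 4.8 |\n"
)

tables = markdown_to_csv(model_response)
print(tables[0])  # a DataFrame with columns Product, Price, Rating
```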
requirements.txt
ADDED
@@ -0,0 +1,10 @@
# Install with: pip install -r requirements.txt
bs4
numpy
pandas
selenium
streamlit
langchain-core
langchain-openai
scraper.py
ADDED
@@ -0,0 +1,50 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup


def scrape_website(website):
    print("Launching chrome browser...")

    chrome_driver_path = "/usr/bin/chromedriver"
    options = webdriver.ChromeOptions()
    # Fixed: 'options' belongs to webdriver.Chrome, not to Service
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

    try:
        driver.get(website)
        print("Page Loaded...")
        html = driver.page_source

        return html

    finally:
        driver.quit()


def extract_body_content(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    if body_content:
        return str(body_content)
    return ""


def clean_body_content(body_content):
    soup = BeautifulSoup(body_content, "html.parser")

    # Drop script and style tags before extracting text
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()

    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )

    return cleaned_content


def split_dom_content(dom_content, max_length=60000):
    # Chunk the text so each piece fits within the model's context window
    return [
        dom_content[i:i + max_length] for i in range(0, len(dom_content), max_length)
    ]
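On a headless host (for instance a container or a Hugging Face Space), Chrome usually needs extra flags before `scrape_website` will start. Which flags are required depends on the environment, so treat this as an assumption rather than part of the commit:

```python
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")           # no display available
options.add_argument("--no-sandbox")             # commonly required in containers
options.add_argument("--disable-dev-shm-usage")  # work around a small /dev/shm
```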