import requests
from bs4 import BeautifulSoup
import torch
from transformers import AutoModel, AutoTokenizer

# Set up the Hugging Face model and tokenizer. AutoModel is used here (rather
# than a token-classification head) because distilbert-base-uncased is a plain
# pretrained encoder: it outputs hidden states, and an untrained classification
# head would produce meaningless logits.
model_name = "distilbert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()

def scrape_website(url):
    # Send an HTTP request to the website and fail fast on HTTP errors
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the text content from the HTML
    text = soup.get_text(separator=' ', strip=True)
    # Tokenize the text, truncating to the model's 512-token limit
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt'
    )
    # Run the model to get a document embedding (the [CLS] vector). An
    # embedding is a 768-dimensional vector, not token IDs, so it cannot be
    # decoded back into text; see the embed_text sketch below for one way to
    # put it to use.
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :]  # shape: (1, 768)
    # Recover the (truncated) text by decoding the token IDs instead
    content_str = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
    return content_str
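
# A narrower variant, sketched here as an addition (not in the original code):
# soup.get_text() grabs everything on the page, including menus and footers,
# so pulling text from <p> tags only often yields cleaner article content.
# find_all is standard BeautifulSoup API; the choice of tag is an assumption
# about the page's structure.
def scrape_paragraphs(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Keep only paragraph text, one block per <p> element
    return '\n'.join(p.get_text(strip=True) for p in soup.find_all('p'))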
# Define a function to scrape multiple URLs
def scrape_multiple_websites(urls):
    contents = []
    for url in urls:
        content = scrape_website(url)
        contents.append(content)
    # Join the contents of the individual URLs
    joined_content = '\n\n'.join(contents)
    return joined_content
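
# Sketch of a downstream use for the model (an addition, not original code):
# turn scraped text into a single vector by mean-pooling the encoder's last
# hidden state over non-padding tokens. embed_text is a hypothetical helper
# named here for illustration.
def embed_text(text):
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    # Weight each token embedding by the attention mask, then average
    mask = inputs['attention_mask'].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    return summed / mask.sum(dim=1)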
# Example usage: Scrape a single URL
url = "https://www.example.com"
content = scrape_website(url)
print(content)

# Example usage: Scrape multiple URLs
urls = ["https://www.example.com", "https://www.example2.com"]
content = scrape_multiple_websites(urls)
print(content)
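
# Example usage of the embed_text sketch above: cosine similarity between the
# embeddings of the two scraped pages
emb_a = embed_text(scrape_website(urls[0]))
emb_b = embed_text(scrape_website(urls[1]))
print(torch.nn.functional.cosine_similarity(emb_a, emb_b).item())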