import requests
import torch
from bs4 import BeautifulSoup
from transformers import AutoModel, AutoTokenizer

# Set up the Hugging Face model and tokenizer for text extraction.
# distilbert-base-uncased is a base encoder, so AutoModel (which exposes
# last_hidden_state) is the right loader; AutoModelForTokenClassification
# would bolt a randomly initialized classification head onto it.
model_name = "distilbert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def scrape_website(url):
    # Send an HTTP request to the website and fail fast on HTTP errors
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the visible text content from the HTML
    text = soup.get_text(separator=" ", strip=True)

    # Tokenize the text; DistilBERT accepts at most 512 tokens,
    # so longer pages are truncated
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    # Run the model to get a document-level representation: the [CLS]
    # vector. This is a float tensor, useful for downstream tasks such
    # as similarity search, but it cannot be decoded back into text.
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    # Recover a readable string by decoding the (truncated) token IDs
    content_str = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
    return content_str


# Define a function to scrape multiple URLs
def scrape_multiple_websites(urls):
    contents = [scrape_website(url) for url in urls]
    # Join the contents of the individual pages
    return "\n\n".join(contents)


# Example usage: scrape a single URL
url = "https://www.example.com"
content = scrape_website(url)
print(content)

# Example usage: scrape multiple URLs
urls = ["https://www.example.com", "https://www.example2.com"]
content = scrape_multiple_websites(urls)
print(content)
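
# A minimal sketch, assuming you also want embeddings for pages longer
# than DistilBERT's 512-token window: split the token IDs into
# overlapping chunks, embed each chunk, and mean-pool the per-chunk
# [CLS] vectors. embed_long_text, chunk_size, and stride are
# illustrative names chosen here, not part of the snippet above.
def embed_long_text(text, chunk_size=510, stride=256):
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    if not ids:
        return torch.zeros(model.config.hidden_size)
    embeddings = []
    for start in range(0, len(ids), stride):
        chunk = ids[start:start + chunk_size]
        # prepare_for_model re-adds [CLS]/[SEP] and builds the tensors,
        # so each chunk looks like a normal 512-token input
        batch = tokenizer.prepare_for_model(
            chunk, return_tensors="pt", prepend_batch_axis=True
        )
        with torch.no_grad():
            out = model(**batch)
        embeddings.append(out.last_hidden_state[:, 0, :])
        if start + chunk_size >= len(ids):
            break
    # Average the per-chunk [CLS] vectors into one document embedding
    return torch.cat(embeddings, dim=0).mean(dim=0)

# Example usage (hypothetical): embed a long page's raw text
# vector = embed_long_text(some_long_page_text)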