File size: 1,741 Bytes
6b8a953
 
 
8d892b3
6b8a953
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import requests
from bs4 import BeautifulSoup
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Set up the Hugging Face model and tokenizer used by scrape_website below.
# NOTE(review): "distilbert-base-uncased" is a *base* checkpoint, not one
# fine-tuned for token classification, so the classification head loaded here
# is randomly initialized — confirm this is the intended model.
model_name = "distilbert-base-uncased"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def scrape_website(url):
    """Fetch *url*, strip its HTML, and return the page's visible text.

    The text is tokenized and run through the Hugging Face model (the forward
    pass is preserved from the original pipeline), and the returned string is
    decoded from the truncated token ids — so the result is the page text,
    limited to the model's 512-token window.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The page's visible text, truncated to at most 512 tokens.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 10 seconds.
    """
    # Bound the request and fail fast on error pages instead of silently
    # scraping the server's HTML error body.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Parse the HTML content and extract just the visible text.
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()

    # Tokenize. truncation=True is required: without it, any page longer
    # than 512 tokens overruns max_length and the model call fails.
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Run the model. A token-classification output exposes .logits — the
    # original code read .last_hidden_state, which raises AttributeError.
    outputs = model(**inputs)
    _ = outputs.logits  # forward pass kept; predictions currently unused

    # Decode the (truncated) token ids back into a string. tokenizer.decode
    # expects integer ids — the original passed a float embedding tensor,
    # which cannot be decoded.
    content_str = tokenizer.decode(
        inputs['input_ids'][0], skip_special_tokens=True
    )

    return content_str

# Define a function to scrape multiple URLs
def scrape_multiple_websites(urls):
    """Scrape every URL in *urls* and return their contents as one string.

    Args:
        urls: Iterable of absolute URLs to scrape, in order.

    Returns:
        The scraped contents, in input order, separated by blank lines.
    """
    # Scrape each page in turn, then stitch the results together.
    scraped_pages = [scrape_website(page_url) for page_url in urls]
    return '\n\n'.join(scraped_pages)

# Demo code is guarded so importing this module no longer fires network
# requests as a side effect; behavior when run as a script is unchanged.
if __name__ == "__main__":
    # Example usage: scrape a single URL.
    url = "https://www.example.com"
    content = scrape_website(url)
    print(content)

    # Example usage: scrape multiple URLs.
    urls = ["https://www.example.com", "https://www.example2.com"]
    content = scrape_multiple_websites(urls)
    print(content)