import requests
import torch
from bs4 import BeautifulSoup
from transformers import AutoModel, AutoTokenizer

# Set up the Hugging Face tokenizer (used to truncate scraped text) and
# encoder model (see the embedding sketch after scrape_website).
# distilbert-base-uncased is a plain encoder checkpoint, so it is loaded with
# AutoModel; AutoModelForTokenClassification would attach a randomly
# initialized classification head on top of it.
model_name = "distilbert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def scrape_website(url):
    # Send an HTTP request to the website and fail fast on errors
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the visible text content from the HTML
    text = soup.get_text(separator=' ', strip=True)
    # Tokenize the text, truncating to DistilBERT's 512-token limit
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    # Model hidden states cannot be decoded back into text, so the extracted
    # content is recovered by decoding the (truncated) input token IDs
    content_str = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
    return content_str
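# The encoder's [CLS] hidden state is a fixed-size vector embedding of a page
# rather than decodable text. A minimal sketch of one way to use it, e.g.
# comparing two pages by cosine similarity; embed_page is a hypothetical
# helper name, not a library API.
def embed_page(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    text = BeautifulSoup(response.content, 'html.parser').get_text(separator=' ', strip=True)
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    # [CLS] token embedding, shape (1, 768) for distilbert-base-uncased
    return outputs.last_hidden_state[:, 0, :]

# Example: a cosine similarity close to 1.0 suggests similar page content
# similarity = torch.nn.functional.cosine_similarity(
#     embed_page("https://www.example.com"), embed_page("https://www.example.org")
# )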
# Define a function to scrape multiple URLs
def scrape_multiple_websites(urls):
    contents = []
    for url in urls:
        content = scrape_website(url)
        contents.append(content)
    # Join the contents of multiple URLs
    joined_content = '\n\n'.join(contents)
    return joined_content
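# A minimal sketch of a more defensive batch scraper that skips URLs whose
# requests fail instead of aborting the whole batch; the name
# scrape_multiple_websites_safe is illustrative, not an existing API.
def scrape_multiple_websites_safe(urls):
    contents = []
    for url in urls:
        try:
            contents.append(scrape_website(url))
        except requests.RequestException as exc:
            # Log the failure and move on to the next URL
            print(f"Skipping {url}: {exc}")
    return '\n\n'.join(contents)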
if __name__ == "__main__":
    # Example usage: scrape a single URL
    url = "https://www.example.com"
    content = scrape_website(url)
    print(content)

    # Example usage: scrape multiple URLs
    urls = ["https://www.example.com", "https://www.example2.com"]
    content = scrape_multiple_websites(urls)
    print(content)