leetuan023 commited on
Commit
6b8a953
·
verified ·
1 Parent(s): 8d892b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -2
app.py CHANGED
@@ -1,3 +1,58 @@
1
- from diffusers import DiffusionPipeline
 
 
2
 
3
- pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
4
 
5
# Load the Hugging Face tokenizer and token-classification model used to
# bound extracted page text to the model's input window.
# NOTE(review): "distilbert-base-uncased" has no fine-tuned token-classification
# head, so the classification head is randomly initialized — confirm this is
# the intended checkpoint.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
9
+
10
def scrape_website(url):
    """Fetch *url*, strip its HTML, and return the page text capped at the
    model's 512-token input window.

    Parameters
    ----------
    url : str
        Fully-qualified URL to fetch.

    Returns
    -------
    str
        The visible text of the page, truncated to at most 512 tokens and
        re-assembled by the tokenizer (whitespace/casing normalized per the
        tokenizer's rules).

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    # Fail fast on network hangs and on HTTP error pages instead of
    # silently parsing an error body as content.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup and extract the text.
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()

    # Tokenize with explicit truncation: max_length alone does NOT cap the
    # sequence — without truncation=True, pages longer than 512 tokens
    # overflow the model's input size.
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # BUG FIX: the original read `outputs.last_hidden_state` from a
    # token-classification model — whose output object only exposes
    # `logits` — and then passed that raw float tensor to
    # `tokenizer.decode`, which requires integer token ids. Both steps
    # raised at runtime. The coherent intent is "page text bounded to the
    # model window", so decode the truncated token ids back to a string.
    content_str = tokenizer.decode(
        inputs["input_ids"][0], skip_special_tokens=True
    )

    return content_str
37
+
38
+ # Define a function to scrape multiple URLs
39
def scrape_multiple_websites(urls):
    """Scrape every URL in *urls* and return their contents joined together.

    Parameters
    ----------
    urls : iterable of str
        URLs to fetch, in order.

    Returns
    -------
    str
        The per-page contents, separated by a blank line.
    """
    # Delegate each fetch to scrape_website and stitch the results together.
    page_contents = [scrape_website(page_url) for page_url in urls]
    return '\n\n'.join(page_contents)
49
+
50
# Guard the demo so importing this module does not trigger network
# requests and model inference as an import side effect.
if __name__ == "__main__":
    # Example usage: Scrape a single URL
    url = "https://www.example.com"
    content = scrape_website(url)
    print(content)

    # Example usage: Scrape multiple URLs
    urls = ["https://www.example.com", "https://www.example2.com"]
    content = scrape_multiple_websites(urls)
    print(content)