Spaces:
Sleeping
Sleeping
Commit
·
24d01e0
1
Parent(s):
e8893c1
✨ Add HTML purification functionality: integrate PurifyHtml in app and create Gradio interface for user input; update requirements to include transformers.
Browse files
- Purify.py +41 -42
- app.py +30 -3
- requirements.txt +1 -0
Purify.py
CHANGED
@@ -1,10 +1,9 @@
|
|
|
|
1 |
from bs4 import BeautifulSoup, Tag
|
2 |
import datetime
|
3 |
import requests
|
4 |
import re
|
5 |
|
6 |
-
Url = 'https://huggingface.co'
|
7 |
-
|
8 |
NoisePatterns = {
|
9 |
'(No)Script': r'<[ ]*(script|noscript)[^>]*?>.*?<\/[ ]*\1[ ]*>',
|
10 |
'Style': r'<[ ]*(style)[^>]*?>.*?<\/[ ]*\1[ ]*>',
|
@@ -32,14 +31,11 @@ def RemoveNoise(RawHtml: str) -> str:
|
|
32 |
str: Cleaned HTML content without noise.
|
33 |
'''
|
34 |
CleanedHtml = RawHtml
|
35 |
-
OriginalCharCount = len(RawHtml)
|
36 |
for PatternName, Pattern in NoisePatterns.items():
|
37 |
if PatternName in ['EmptyLines', 'EmptyTags']: # These patterns are line-based
|
38 |
CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.MULTILINE)
|
39 |
else:
|
40 |
CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
41 |
-
print(f'• Removed {PatternName} noise. Removed {OriginalCharCount - len(CleanedHtml)} characters.')
|
42 |
-
OriginalCharCount = len(CleanedHtml)
|
43 |
return CleanedHtml
|
44 |
|
45 |
def FetchHtmlContent(Url: str) -> str | int:
|
@@ -58,43 +54,46 @@ def FetchHtmlContent(Url: str) -> str | int:
|
|
58 |
else:
|
59 |
return Response.status_code
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
-
|
80 |
-
|
|
|
|
|
81 |
|
82 |
-
|
83 |
-
'<!-- --- Purification Summary ---',
|
84 |
-
f'URL: {Url}',
|
85 |
-
f'Title: {Title}',
|
86 |
-
f'Description: {Description}',
|
87 |
-
f'Time of Fetch: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} (Took {datetime.datetime.now() - Start})',
|
88 |
-
f'Noise Removal Ratio: {Ratio:.2%} (lower is better)',
|
89 |
-
f'Characters: {RawCharCount} -> {CleanedCharCount} ({RawCharCount - CleanedCharCount} characters removed)',
|
90 |
-
'----------------------------- -->'
|
91 |
-
]
|
92 |
-
for Line in Summary:
|
93 |
-
print(Line)
|
94 |
|
95 |
-
|
96 |
-
|
97 |
-
File.write(Line + '\n')
|
98 |
-
File.write(CleanedHtml)
|
99 |
-
else:
|
100 |
-
print(f'Failed to fetch HTML content. Status code: {RawHtml}')
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
2 |
from bs4 import BeautifulSoup, Tag
|
3 |
import datetime
|
4 |
import requests
|
5 |
import re
|
6 |
|
|
|
|
|
7 |
NoisePatterns = {
|
8 |
'(No)Script': r'<[ ]*(script|noscript)[^>]*?>.*?<\/[ ]*\1[ ]*>',
|
9 |
'Style': r'<[ ]*(style)[^>]*?>.*?<\/[ ]*\1[ ]*>',
|
|
|
31 |
str: Cleaned HTML content without noise.
|
32 |
'''
|
33 |
CleanedHtml = RawHtml
|
|
|
34 |
for PatternName, Pattern in NoisePatterns.items():
|
35 |
if PatternName in ['EmptyLines', 'EmptyTags']: # These patterns are line-based
|
36 |
CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.MULTILINE)
|
37 |
else:
|
38 |
CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.DOTALL | re.IGNORECASE | re.MULTILINE)
|
|
|
|
|
39 |
return CleanedHtml
|
40 |
|
41 |
def FetchHtmlContent(Url: str) -> str | int:
|
|
|
54 |
else:
|
55 |
return Response.status_code
|
56 |
|
57 |
+
def PurifyHtml(Url: str) -> str:
    '''Fetch a page, strip noise from its HTML and ask a language model for a markdown summary.
    Args:
        Url (str): Address of the page to fetch.
    Returns:
        str: The model output converted to a string, or an error message when the fetch did not yield HTML text.
    '''
    Start = datetime.datetime.now()
    RawHtml = FetchHtmlContent(Url)

    # FetchHtmlContent is annotated `str | int`; anything that is not text is
    # reported as a status code. Return the message (instead of only printing
    # it and falling through to an implicit None) so callers always get a str.
    if not isinstance(RawHtml, str):
        ErrorMessage = f'Failed to fetch HTML content. Status code: {RawHtml}'
        print(ErrorMessage)
        return ErrorMessage

    RawCharCount = len(RawHtml)

    Soup = BeautifulSoup(RawHtml, 'html.parser')
    PrettifiedHtml = str(Soup.prettify())

    Title = Soup.title.string if Soup.title else 'No title found'
    MetaDesc = Soup.find('meta', attrs={'name': 'description'})
    Description = MetaDesc.get('content', 'No description found') if isinstance(MetaDesc, Tag) else 'No description found'

    CleanedHtml = RemoveNoise(PrettifiedHtml)

    CleanedCharCount = len(CleanedHtml)
    # Ratio is cleaned length over raw (pre-prettify) length, i.e. the share of
    # characters kept. NOTE(review): the cleaned text derives from the
    # prettified HTML, so the two lengths are not measured on the same base.
    Ratio = CleanedCharCount / RawCharCount if RawCharCount > 0 else 0

    Summary = [
        '<!-- --- Purification Summary ---',
        f'URL: {Url}',
        f'Title: {Title}',
        f'Description: {Description}',
        f'Time of Fetch: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} (Took {datetime.datetime.now() - Start})',
        f'Noise Removal Ratio: {Ratio:.2%} (lower is better)',
        f'Characters: {RawCharCount} -> {CleanedCharCount} ({RawCharCount - CleanedCharCount} characters removed)',
        '----------------------------- -->'
    ]
    for Line in Summary:
        print(Line)

    # The tokenizer and model are loaded on every call.
    # NOTE(review): consider loading them once if this is called repeatedly.
    Tokenizer = AutoTokenizer.from_pretrained('jinaai/ReaderLM-v2')
    Model = AutoModelForCausalLM.from_pretrained('jinaai/ReaderLM-v2')

    Message = [
        {'role': 'user', 'content': f'Please summarize the following HTML content in clean markdown:\n\n{CleanedHtml}'},
    ]
    # NOTE(review): confirm the loaded model class actually exposes chat();
    # if it does not, this needs the tokenizer chat template plus generate().
    SummaryOutput = Model.chat(Message, tokenizer=Tokenizer, max_new_tokens=500, do_sample=False)

    return str(SummaryOutput)
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from pymongo import MongoClient
|
2 |
from dotenv import load_dotenv
|
|
|
3 |
from typing import Literal
|
4 |
from bson import ObjectId
|
5 |
from io import StringIO
|
@@ -143,6 +144,16 @@ def Ping(Host: str, Count: int = 8) -> str:
|
|
143 |
else:
|
144 |
return f'Ping to {Host} failed: No successful responses'
|
145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
# ╭───────────────────────────────────╮
|
147 |
# │ Fun and Entertainment Tools │
|
148 |
# ╰───────────────────────────────────╯
|
@@ -163,15 +174,22 @@ def Fact() -> str:
|
|
163 |
'''
|
164 |
return requests.get('https://uselessfacts.jsph.pl/random.json?language=en').json()['text']
|
165 |
|
166 |
-
def Plot() -> str:
|
167 |
'''Generate a random plot for a movie or story.
|
|
|
|
|
168 |
Returns:
|
169 |
str: A random plot description.
|
170 |
'''
|
171 |
with open(r'Data/human-writing-dpo.json', 'r', encoding='utf-8') as PlotFile:
|
172 |
Data = json.load(PlotFile)
|
173 |
Plot = random.choice(Data)
|
174 |
-
|
|
|
|
|
|
|
|
|
|
|
175 |
|
176 |
# ╭─────────────────────────────╮
|
177 |
# │ Text Processing Tools │
|
@@ -398,6 +416,13 @@ with gradio.Blocks(
|
|
398 |
PingOutput = gradio.Text(label='Ping Result 📡', interactive=False)
|
399 |
PingBtn = gradio.Button('Ping Host 📡', variant='primary')
|
400 |
PingBtn.click(Ping, inputs=[PingInput, PingCount], outputs=PingOutput)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
|
402 |
with gradio.TabItem('Fun & Entertainment 🎭'):
|
403 |
with gradio.TabItem('Random Joke 😂'):
|
@@ -416,8 +441,10 @@ with gradio.Blocks(
|
|
416 |
with gradio.TabItem('Random Plot 🎬'):
|
417 |
with gradio.Group():
|
418 |
PlotOutput = gradio.Text(label='Random Plot 🎬', interactive=False)
|
|
|
|
|
419 |
PlotBtn = gradio.Button('Get Plot 🎥', variant='primary')
|
420 |
-
PlotBtn.click(Plot, outputs=PlotOutput)
|
421 |
|
422 |
with gradio.TabItem('Text Processing 📝'):
|
423 |
with gradio.TabItem('Text Reversal 🔄'):
|
|
|
1 |
from pymongo import MongoClient
|
2 |
from dotenv import load_dotenv
|
3 |
+
from Purify import PurifyHtml
|
4 |
from typing import Literal
|
5 |
from bson import ObjectId
|
6 |
from io import StringIO
|
|
|
144 |
else:
|
145 |
return f'Ping to {Host} failed: No successful responses'
|
146 |
|
147 |
+
def Purify(Url: str) -> str:
    '''Run PurifyHtml for a URL entered in the interface.
    Args:
        Url (str): The URL whose HTML should be fetched and purified.
    Returns:
        str: Whatever PurifyHtml returns for that URL.
    '''
    Result = PurifyHtml(Url)
    return Result
|
156 |
+
|
157 |
# ╭───────────────────────────────────╮
|
158 |
# │ Fun and Entertainment Tools │
|
159 |
# ╰───────────────────────────────────╯
|
|
|
174 |
'''
|
175 |
return requests.get('https://uselessfacts.jsph.pl/random.json?language=en').json()['text']
|
176 |
|
177 |
+
def Plot(GiveExamplePrompt: bool = True) -> list[str]:
    '''Pick a random plot prompt, optionally paired with its example text.
    Args:
        GiveExamplePrompt (bool): If True, the entry's 'chosen' text is returned next to the prompt; if False, an empty string takes its place.
    Returns:
        list[str]: Two items, [prompt, chosen-or-empty], read from a random entry of Data/human-writing-dpo.json.
    '''
    with open(r'Data/human-writing-dpo.json', 'r', encoding='utf-8') as PlotFile:
        Data = json.load(PlotFile)

    # Named Entry rather than Plot so the local does not shadow this function.
    Entry = random.choice(Data)
    Prompt = Entry['prompt']
    # Read unconditionally, so an entry without 'chosen' still raises KeyError
    # regardless of the flag.
    Chosen = Entry['chosen']
    return [Prompt, Chosen if GiveExamplePrompt else '']
|
193 |
|
194 |
# ╭─────────────────────────────╮
|
195 |
# │ Text Processing Tools │
|
|
|
416 |
PingOutput = gradio.Text(label='Ping Result 📡', interactive=False)
|
417 |
PingBtn = gradio.Button('Ping Host 📡', variant='primary')
|
418 |
PingBtn.click(Ping, inputs=[PingInput, PingCount], outputs=PingOutput)
|
419 |
+
|
420 |
+
with gradio.TabItem('Web Scraping & Purification 🌐'):
|
421 |
+
with gradio.Group():
|
422 |
+
PurifyInput = gradio.Textbox(label='URL to Purify 🌐', placeholder='Enter URL to fetch and purify HTML', lines=1, max_lines=1)
|
423 |
+
PurifyOutput = gradio.Text(label='Purified HTML Content 📝', interactive=False)
|
424 |
+
PurifyBtn = gradio.Button('Purify HTML 🧹', variant='primary')
|
425 |
+
PurifyBtn.click(Purify, inputs=PurifyInput, outputs=PurifyOutput)
|
426 |
|
427 |
with gradio.TabItem('Fun & Entertainment 🎭'):
|
428 |
with gradio.TabItem('Random Joke 😂'):
|
|
|
441 |
with gradio.TabItem('Random Plot 🎬'):
|
442 |
with gradio.Group():
|
443 |
PlotOutput = gradio.Text(label='Random Plot 🎬', interactive=False)
|
444 |
+
PlotExample = gradio.Checkbox(label='Give Example Plot Prompt 📜', value=True, interactive=True)
|
445 |
+
PlotExampleOutput = gradio.Text(label='Example Plot Prompt 📜', interactive=False)
|
446 |
PlotBtn = gradio.Button('Get Plot 🎥', variant='primary')
|
447 |
+
PlotBtn.click(Plot, inputs=[PlotExample], outputs=[PlotOutput, PlotExampleOutput])
|
448 |
|
449 |
with gradio.TabItem('Text Processing 📝'):
|
450 |
with gradio.TabItem('Text Reversal 🔄'):
|
requirements.txt
CHANGED
@@ -1 +1,2 @@
|
|
|
|
1 |
pymongo
|
|
|
1 |
+
transformers
|
2 |
pymongo
|