Hyphonical committed on
Commit
24d01e0
·
1 Parent(s): e8893c1

✨ Add HTML purification functionality: integrate PurifyHtml in app and create Gradio interface for user input; update requirements to include transformers.

Browse files
Files changed (3) hide show
  1. Purify.py +41 -42
  2. app.py +30 -3
  3. requirements.txt +1 -0
Purify.py CHANGED
@@ -1,10 +1,9 @@
 
1
  from bs4 import BeautifulSoup, Tag
2
  import datetime
3
  import requests
4
  import re
5
 
6
- Url = 'https://huggingface.co'
7
-
8
  NoisePatterns = {
9
  '(No)Script': r'<[ ]*(script|noscript)[^>]*?>.*?<\/[ ]*\1[ ]*>',
10
  'Style': r'<[ ]*(style)[^>]*?>.*?<\/[ ]*\1[ ]*>',
@@ -32,14 +31,11 @@ def RemoveNoise(RawHtml: str) -> str:
32
  str: Cleaned HTML content without noise.
33
  '''
34
  CleanedHtml = RawHtml
35
- OriginalCharCount = len(RawHtml)
36
  for PatternName, Pattern in NoisePatterns.items():
37
  if PatternName in ['EmptyLines', 'EmptyTags']: # These patterns are line-based
38
  CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.MULTILINE)
39
  else:
40
  CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.DOTALL | re.IGNORECASE | re.MULTILINE)
41
- print(f'• Removed {PatternName} noise. Removed {OriginalCharCount - len(CleanedHtml)} characters.')
42
- OriginalCharCount = len(CleanedHtml)
43
  return CleanedHtml
44
 
45
  def FetchHtmlContent(Url: str) -> str | int:
@@ -58,43 +54,46 @@ def FetchHtmlContent(Url: str) -> str | int:
58
  else:
59
  return Response.status_code
60
 
61
- Start = datetime.datetime.now()
62
- RawHtml = FetchHtmlContent(Url)
63
- if isinstance(RawHtml, str):
64
- RawCharCount = len(RawHtml)
65
-
66
- print('Prettifying HTML content...')
67
-
68
- Soup = BeautifulSoup(RawHtml, 'html.parser')
69
- PrettifiedHtml = str(Soup.prettify())
70
-
71
- Title = Soup.title.string if Soup.title else 'No title found'
72
- MetaDesc = Soup.find('meta', attrs={'name': 'description'})
73
- Description = MetaDesc.get('content', 'No description found') if isinstance(MetaDesc, Tag) else 'No description found'
74
-
75
- print('Purifying HTML content...')
76
-
77
- CleanedHtml = RemoveNoise(PrettifiedHtml)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- CleanedCharCount = len(CleanedHtml)
80
- Ratio = CleanedCharCount / RawCharCount if RawCharCount > 0 else 0
 
 
81
 
82
- Summary = [
83
- '<!-- --- Purification Summary ---',
84
- f'URL: {Url}',
85
- f'Title: {Title}',
86
- f'Description: {Description}',
87
- f'Time of Fetch: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} (Took {datetime.datetime.now() - Start})',
88
- f'Noise Removal Ratio: {Ratio:.2%} (lower is better)',
89
- f'Characters: {RawCharCount} -> {CleanedCharCount} ({RawCharCount - CleanedCharCount} characters removed)',
90
- '----------------------------- -->'
91
- ]
92
- for Line in Summary:
93
- print(Line)
94
 
95
- with open('CleanedHtml.html', 'w', encoding='utf-8') as File:
96
- for Line in Summary:
97
- File.write(Line + '\n')
98
- File.write(CleanedHtml)
99
- else:
100
- print(f'Failed to fetch HTML content. Status code: {RawHtml}')
 
1
+ from transformers import AutoTokenizer, AutoModelForCausalLM
2
  from bs4 import BeautifulSoup, Tag
3
  import datetime
4
  import requests
5
  import re
6
 
 
 
7
  NoisePatterns = {
8
  '(No)Script': r'<[ ]*(script|noscript)[^>]*?>.*?<\/[ ]*\1[ ]*>',
9
  'Style': r'<[ ]*(style)[^>]*?>.*?<\/[ ]*\1[ ]*>',
 
31
  str: Cleaned HTML content without noise.
32
  '''
33
  CleanedHtml = RawHtml
 
34
  for PatternName, Pattern in NoisePatterns.items():
35
  if PatternName in ['EmptyLines', 'EmptyTags']: # These patterns are line-based
36
  CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.MULTILINE)
37
  else:
38
  CleanedHtml = re.sub(Pattern, '', CleanedHtml, flags=re.DOTALL | re.IGNORECASE | re.MULTILINE)
 
 
39
  return CleanedHtml
40
 
41
  def FetchHtmlContent(Url: str) -> str | int:
 
54
  else:
55
  return Response.status_code
56
 
57
# Holds the (tokenizer, model) pair so the weights are loaded once per process
# instead of on every PurifyHtml call.
_ReaderCache: dict = {}


def _LoadReader():
    '''Return the ReaderLM-v2 tokenizer and model, loading them on first use only.'''
    if 'Reader' not in _ReaderCache:
        _ReaderCache['Reader'] = (
            AutoTokenizer.from_pretrained('jinaai/ReaderLM-v2'),
            AutoModelForCausalLM.from_pretrained('jinaai/ReaderLM-v2'),
        )
    return _ReaderCache['Reader']


def PurifyHtml(Url: str) -> str:
    '''Fetch a page, strip noise from its HTML and ask ReaderLM-v2 for a markdown summary.

    Args:
        Url (str): Address of the page to fetch.
    Returns:
        str: The model output converted with str(), or an error message when
            the fetch did not produce HTML text.
    '''
    Start = datetime.datetime.now()
    RawHtml = FetchHtmlContent(Url)
    if not isinstance(RawHtml, str):
        # FetchHtmlContent returns the HTTP status code instead of text on failure.
        # Return the message too, so callers never receive None.
        Failure = f'Failed to fetch HTML content. Status code: {RawHtml}'
        print(Failure)
        return Failure

    RawCharCount = len(RawHtml)

    Soup = BeautifulSoup(RawHtml, 'html.parser')
    PrettifiedHtml = str(Soup.prettify())

    # .string is None when <title> is empty or has nested children, so fall back then too.
    Title = (Soup.title.string if Soup.title else None) or 'No title found'
    MetaDesc = Soup.find('meta', attrs={'name': 'description'})
    Description = MetaDesc.get('content', 'No description found') if isinstance(MetaDesc, Tag) else 'No description found'

    CleanedHtml = RemoveNoise(PrettifiedHtml)

    CleanedCharCount = len(CleanedHtml)
    Ratio = CleanedCharCount / RawCharCount if RawCharCount > 0 else 0

    # Read the clock once so the timestamp and the duration describe the same instant.
    Finished = datetime.datetime.now()
    Summary = [
        '<!-- --- Purification Summary ---',
        f'URL: {Url}',
        f'Title: {Title}',
        f'Description: {Description}',
        f'Time of Fetch: {Finished.strftime("%Y-%m-%d %H:%M:%S")} (Took {Finished - Start})',
        f'Noise Removal Ratio: {Ratio:.2%} (lower is better)',
        f'Characters: {RawCharCount} -> {CleanedCharCount} ({RawCharCount - CleanedCharCount} characters removed)',
        '----------------------------- -->'
    ]
    for Line in Summary:
        print(Line)

    Tokenizer, Model = _LoadReader()

    Message = [
        {'role': 'user', 'content': f'Please summarize the following HTML content in clean markdown:\n\n{CleanedHtml}'},
    ]
    # NOTE(review): confirm the loaded model class actually exposes .chat();
    # if it does not, this needs the tokenizer chat template plus Model.generate.
    SummaryOutput = Model.chat(Message, tokenizer=Tokenizer, max_new_tokens=500, do_sample=False)

    return str(SummaryOutput)
 
 
 
 
app.py CHANGED
@@ -1,5 +1,6 @@
1
  from pymongo import MongoClient
2
  from dotenv import load_dotenv
 
3
  from typing import Literal
4
  from bson import ObjectId
5
  from io import StringIO
@@ -143,6 +144,16 @@ def Ping(Host: str, Count: int = 8) -> str:
143
  else:
144
  return f'Ping to {Host} failed: No successful responses'
145
 
 
 
 
 
 
 
 
 
 
 
146
  # ╭───────────────────────────────────╮
147
  # │ Fun and Entertainment Tools │
148
  # ╰───────────────────────────────────╯
@@ -163,15 +174,22 @@ def Fact() -> str:
163
  '''
164
  return requests.get('https://uselessfacts.jsph.pl/random.json?language=en').json()['text']
165
 
166
- def Plot() -> str:
167
  '''Generate a random plot for a movie or story.
 
 
168
  Returns:
169
  str: A random plot description.
170
  '''
171
  with open(r'Data/human-writing-dpo.json', 'r', encoding='utf-8') as PlotFile:
172
  Data = json.load(PlotFile)
173
  Plot = random.choice(Data)
174
- return Plot['prompt'] if isinstance(Plot, dict) and 'prompt' in Plot else 'Error: Invalid plot data format'
 
 
 
 
 
175
 
176
  # ╭─────────────────────────────╮
177
  # │ Text Processing Tools │
@@ -398,6 +416,13 @@ with gradio.Blocks(
398
  PingOutput = gradio.Text(label='Ping Result 📡', interactive=False)
399
  PingBtn = gradio.Button('Ping Host 📡', variant='primary')
400
  PingBtn.click(Ping, inputs=[PingInput, PingCount], outputs=PingOutput)
 
 
 
 
 
 
 
401
 
402
  with gradio.TabItem('Fun & Entertainment 🎭'):
403
  with gradio.TabItem('Random Joke 😂'):
@@ -416,8 +441,10 @@ with gradio.Blocks(
416
  with gradio.TabItem('Random Plot 🎬'):
417
  with gradio.Group():
418
  PlotOutput = gradio.Text(label='Random Plot 🎬', interactive=False)
 
 
419
  PlotBtn = gradio.Button('Get Plot 🎥', variant='primary')
420
- PlotBtn.click(Plot, outputs=PlotOutput)
421
 
422
  with gradio.TabItem('Text Processing 📝'):
423
  with gradio.TabItem('Text Reversal 🔄'):
 
1
  from pymongo import MongoClient
2
  from dotenv import load_dotenv
3
+ from Purify import PurifyHtml
4
  from typing import Literal
5
  from bson import ObjectId
6
  from io import StringIO
 
144
  else:
145
  return f'Ping to {Host} failed: No successful responses'
146
 
147
def Purify(Url: str) -> str:
    '''Purify HTML content from a URL.
    Args:
        Url (str): The URL to fetch and purify HTML content from.
    Returns:
        str: The text produced by PurifyHtml, or an error message when no URL
            was given or PurifyHtml produced no text.
    '''
    # A blank textbox would otherwise be passed to the fetcher as a URL.
    if not Url or not Url.strip():
        return 'Error: No URL provided'

    Result = PurifyHtml(Url.strip())
    # Guard the UI against a None (or other non-text) result so the output
    # box always receives a string.
    if not isinstance(Result, str):
        return 'Error: Failed to fetch or purify the HTML content'
    return Result
156
+
157
  # ╭───────────────────────────────────╮
158
  # │ Fun and Entertainment Tools │
159
  # ╰───────────────────────────────────╯
 
174
  '''
175
  return requests.get('https://uselessfacts.jsph.pl/random.json?language=en').json()['text']
176
 
177
def Plot(GiveExamplePrompt: bool = True) -> list[str]:
    '''Pick a random plot prompt for a movie or story.
    Args:
        GiveExamplePrompt (bool): If True, the record's 'chosen' text is
            returned alongside the prompt; otherwise the second item is ''.
    Returns:
        list[str]: Two items, [prompt, example]. When the dataset is empty or
            the selected record is malformed, the first item is an error
            message and the second is ''.
    '''
    InvalidData = ['Error: Invalid plot data format', '']

    with open(r'Data/human-writing-dpo.json', 'r', encoding='utf-8') as PlotFile:
        Data = json.load(PlotFile)

    # random.choice raises on an empty sequence and cannot index a mapping.
    if not isinstance(Data, list) or not Data:
        return InvalidData

    # Named Entry rather than Plot so the function's own name is not shadowed.
    Entry = random.choice(Data)
    if not isinstance(Entry, dict) or 'prompt' not in Entry:
        return InvalidData

    Example = Entry.get('chosen', '') if GiveExamplePrompt else ''
    return [Entry['prompt'], Example]
193
 
194
  # ╭─────────────────────────────╮
195
  # │ Text Processing Tools │
 
416
  PingOutput = gradio.Text(label='Ping Result 📡', interactive=False)
417
  PingBtn = gradio.Button('Ping Host 📡', variant='primary')
418
  PingBtn.click(Ping, inputs=[PingInput, PingCount], outputs=PingOutput)
419
+
420
+ with gradio.TabItem('Web Scraping & Purification 🌐'):
421
+ with gradio.Group():
422
+ PurifyInput = gradio.Textbox(label='URL to Purify 🌐', placeholder='Enter URL to fetch and purify HTML', lines=1, max_lines=1)
423
+ PurifyOutput = gradio.Text(label='Purified HTML Content 📝', interactive=False)
424
+ PurifyBtn = gradio.Button('Purify HTML 🧹', variant='primary')
425
+ PurifyBtn.click(Purify, inputs=PurifyInput, outputs=PurifyOutput)
426
 
427
  with gradio.TabItem('Fun & Entertainment 🎭'):
428
  with gradio.TabItem('Random Joke 😂'):
 
441
  with gradio.TabItem('Random Plot 🎬'):
442
  with gradio.Group():
443
  PlotOutput = gradio.Text(label='Random Plot 🎬', interactive=False)
444
+ PlotExample = gradio.Checkbox(label='Give Example Plot Prompt 📜', value=True, interactive=True)
445
+ PlotExampleOutput = gradio.Text(label='Example Plot Prompt 📜', interactive=False)
446
  PlotBtn = gradio.Button('Get Plot 🎥', variant='primary')
447
+ PlotBtn.click(Plot, inputs=[PlotExample], outputs=[PlotOutput, PlotExampleOutput])
448
 
449
  with gradio.TabItem('Text Processing 📝'):
450
  with gradio.TabItem('Text Reversal 🔄'):
requirements.txt CHANGED
@@ -1 +1,2 @@
 
1
  pymongo
 
1
+ transformers
2
  pymongo