Spaces:
Running
Running
Commit 3fa127c · 1 Parent(s): cac9909
✨ Update PurifyHtml function: change model loading to use 'jinaai/ReaderLM-v2' for improved performance.
Browse files
Purify.py
CHANGED
@@ -86,8 +86,8 @@ def PurifyHtml(Url: str) -> str: # type: ignore
86 |   for Line in Summary:
87 |   print(Line)
88 |
89 | - Tokenizer = AutoTokenizer.from_pretrained('jinaai/… [remainder of line truncated in this view]
90 | - Model = AutoModelForCausalLM.from_pretrained('jinaai/… [remainder of line truncated in this view]
91 |
92 |   Prompt = f'Convert this HTML to markdown:\n\n{CleanedHtml}'
93 |   Inputs = Tokenizer(Prompt, return_tensors='pt', truncation=True, max_length=8192)
86 |   for Line in Summary:
87 |   print(Line)
88 |
89 | + Tokenizer = AutoTokenizer.from_pretrained('jinaai/ReaderLM-v2')
90 | + Model = AutoModelForCausalLM.from_pretrained('jinaai/ReaderLM-v2', torch_dtype=torch.float32, device_map='cpu')
91 |
92 |   Prompt = f'Convert this HTML to markdown:\n\n{CleanedHtml}'
93 |   Inputs = Tokenizer(Prompt, return_tensors='pt', truncation=True, max_length=8192)
app.py
CHANGED
@@ -419,7 +419,7 @@ with gradio.Blocks(
419 |
420 |   with gradio.TabItem('Web Scraping & Purification 🌐'):
421 |   with gradio.Group():
422 | - PurifyInput = gradio.Textbox(label='URL to Purify 🌐', placeholder='Enter URL to fetch and purify HTML', lines=1, max_lines=1)
423 |   PurifyOutput = gradio.Text(label='Purified HTML Content 📝', interactive=False)
424 |   PurifyBtn = gradio.Button('Purify HTML 🧹', variant='primary')
425 |   PurifyBtn.click(Purify, inputs=PurifyInput, outputs=PurifyOutput)
419 |
420 |   with gradio.TabItem('Web Scraping & Purification 🌐'):
421 |   with gradio.Group():
422 | + PurifyInput = gradio.Textbox(label='URL to Purify 🌐', placeholder='Enter URL to fetch and purify HTML (e.g., https://huggingface.co)', lines=1, max_lines=1)
423 |   PurifyOutput = gradio.Text(label='Purified HTML Content 📝', interactive=False)
424 |   PurifyBtn = gradio.Button('Purify HTML 🧹', variant='primary')
425 |   PurifyBtn.click(Purify, inputs=PurifyInput, outputs=PurifyOutput)