Hyphonical committed on
Commit
cac9909
·
1 Parent(s): bcf4c09

✨ Update PurifyHtml function: modify model loading to use 'jinaai/jina-readerLM-1.5B' with torch support, and enhance markdown conversion process.

Browse files
Files changed (1) hide show
  1. Purify.py +9 -8
Purify.py CHANGED
@@ -2,6 +2,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
2
  from bs4 import BeautifulSoup, Tag
3
  import datetime
4
  import requests
 
5
  import re
6
 
7
  NoisePatterns = {
@@ -84,16 +85,16 @@ def PurifyHtml(Url: str) -> str: # type: ignore
84
  ]
85
  for Line in Summary:
86
  print(Line)
87
-
88
- Tokenizer = AutoTokenizer.from_pretrained('jinaai/ReaderLM-v2')
89
- Model = AutoModelForCausalLM.from_pretrained('jinaai/ReaderLM-v2')
90
 
91
- Message = [
92
- {'role': 'user', 'content': f'Please summarize the following HTML content in clean markdown:\n\n{CleanedHtml}'},
93
- ]
94
- SummaryOutput = Model.chat(Message, tokenizer=Tokenizer, max_new_tokens=500, do_sample=False)
 
 
 
95
 
96
- return str(SummaryOutput)
97
 
98
  else:
99
  print(f'Failed to fetch HTML content. Status code: {RawHtml}')
 
2
  from bs4 import BeautifulSoup, Tag
3
  import datetime
4
  import requests
5
+ import torch
6
  import re
7
 
8
  NoisePatterns = {
 
85
  ]
86
  for Line in Summary:
87
  print(Line)
 
 
 
88
 
89
+ Tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-readerLM-1.5B')
90
+ Model = AutoModelForCausalLM.from_pretrained('jinaai/jina-readerLM-1.5B', torch_dtype=torch.float32, device_map='cpu')
91
+
92
+ Prompt = f'Convert this HTML to markdown:\n\n{CleanedHtml}'
93
+ Inputs = Tokenizer(Prompt, return_tensors='pt', truncation=True, max_length=8192)
94
+ Outputs = Model.generate(Inputs.input_ids, max_new_tokens=8192, do_sample=False)
95
+ SummaryOutput = Tokenizer.decode(Outputs[0], skip_special_tokens=True)
96
 
97
+ return SummaryOutput[len(Prompt):].strip()
98
 
99
  else:
100
  print(f'Failed to fetch HTML content. Status code: {RawHtml}')