Hyphonical committed on
Commit
cac9909
·
1 Parent(s): bcf4c09

✨ Update PurifyHtml function: modify model loading to use 'jinaai/jina-readerLM-1.5B' with torch support, and enhance markdown conversion process.

Browse files
Files changed (1) hide show
  1. Purify.py +9 -8
Purify.py CHANGED
@@ -2,6 +2,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
2
  from bs4 import BeautifulSoup, Tag
3
  import datetime
4
  import requests
 
5
  import re
6
 
7
  NoisePatterns = {
@@ -84,16 +85,16 @@ def PurifyHtml(Url: str) -> str: # type: ignore
84
  ]
85
  for Line in Summary:
86
  print(Line)
87
-
88
- Tokenizer = AutoTokenizer.from_pretrained('jinaai/ReaderLM-v2')
89
- Model = AutoModelForCausalLM.from_pretrained('jinaai/ReaderLM-v2')
90
 
91
- Message = [
92
- {'role': 'user', 'content': f'Please summarize the following HTML content in clean markdown:\n\n{CleanedHtml}'},
93
- ]
94
- SummaryOutput = Model.chat(Message, tokenizer=Tokenizer, max_new_tokens=500, do_sample=False)
 
 
 
95
 
96
- return str(SummaryOutput)
97
 
98
  else:
99
  print(f'Failed to fetch HTML content. Status code: {RawHtml}')
 
2
  from bs4 import BeautifulSoup, Tag
3
  import datetime
4
  import requests
5
+ import torch
6
  import re
7
 
8
  NoisePatterns = {
 
85
  ]
86
  for Line in Summary:
87
  print(Line)
 
 
 
88
 
89
+ Tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-readerLM-1.5B')
90
+ Model = AutoModelForCausalLM.from_pretrained('jinaai/jina-readerLM-1.5B', torch_dtype=torch.float32, device_map='cpu')
91
+
92
+ Prompt = f'Convert this HTML to markdown:\n\n{CleanedHtml}'
93
+ Inputs = Tokenizer(Prompt, return_tensors='pt', truncation=True, max_length=8192)
94
+ Outputs = Model.generate(Inputs.input_ids, max_new_tokens=8192, do_sample=False)
95
+ SummaryOutput = Tokenizer.decode(Outputs[0], skip_special_tokens=True)
96
 
97
+ return SummaryOutput[len(Prompt):].strip()
98
 
99
  else:
100
  print(f'Failed to fetch HTML content. Status code: {RawHtml}')