from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re
import random
# Load the paraphraser checkpoint once at import time; prefer GPU when available
model_name = "humarin/chatgpt_paraphraser_on_T5_base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

def post_process_humanize(text: str) -> str:
    # Step 1: strip generic AI-style prefixes the paraphraser sometimes emits
    generic_starts = [
        "the following text is",
        "here is a",
        "this is the",
        "paraphrased version",
        "rephrased version",
        "reworded version",
        "rewritten version",
    ]
    for start in generic_starts:
        if text.lower().startswith(start):
            text = text[len(start):].strip(" :.-\"\n")
    # Step 2: light contractions and filler-word swaps
    replacements = {
        "do not": "don't",
        "cannot": "can't",
        "will not": "won't",
        "should not": "shouldn't",
        "has not": "hasn't",
        "have not": "haven't",
        "it is": "it's",
        "we are": "we're",
        "you are": "you're",
        "they are": "they're",
        "i am": "I'm",
        "there is": "there's",
        "that is": "that's",
        "because": "since",
    }

    def keep_case(repl: str):
        # Preserve sentence-initial capitalization when substituting, so
        # "There is" becomes "There's" rather than "there's"
        def inner(match: re.Match) -> str:
            return repl[0].upper() + repl[1:] if match.group(0)[0].isupper() else repl
        return inner

    for orig, repl in replacements.items():
        text = re.sub(rf"\b{orig}\b", keep_case(repl), text, flags=re.IGNORECASE)
    # Step 3: occasionally drop a conversational softener into a sentence,
    # inserting it before the closing punctuation rather than after it
    softeners = ["frankly", "actually", "honestly", "to be fair", "in fact", "well"]
    if random.random() < 0.5:
        sentences = re.split(r"(?<=[.!?]) +", text)
        if len(sentences) > 1:
            insert_idx = random.randint(0, len(sentences) - 2)
            s = sentences[insert_idx]
            sentences[insert_idx] = f"{s[:-1]}, {random.choice(softeners)}{s[-1]}"
            text = " ".join(sentences)

    return text.strip()

def rephrase_text(text: str, tone: str = "general") -> str:
    # The T5 tokenizer appends the EOS token itself, so no manual "</s>" is needed
    if tone == "general":
        prompt = f"paraphrase: {text}"
    else:
        prompt = f"Rephrase in a {tone} tone without introductory phrases:\n{text}"

    inputs = tokenizer([prompt], return_tensors="pt", padding=True, truncation=True).to(device)

    # Cap the output length relative to the input so the paraphrase stays
    # roughly the same size as the original text
    input_length = inputs["input_ids"].shape[1]
    max_length = min(int(input_length * 1.3), 1024)

    outputs = model.generate(
        **inputs,
        do_sample=True,
        top_k=50,
        top_p=0.92,
        temperature=0.8,
        max_length=max_length,
        repetition_penalty=1.1,
    )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Post-process to make the output read more naturally
    return post_process_humanize(result)
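
# Minimal usage sketch for running this module directly; the sample sentence
# below is illustrative only and not part of the original Space.
if __name__ == "__main__":
    sample = (
        "We cannot guarantee that the results will be accurate, "
        "because the model does not always follow instructions."
    )
    print(rephrase_text(sample, tone="casual"))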