from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re
import random

model_name = "humarin/chatgpt_paraphraser_on_T5_base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)


def post_process_humanize(text: str) -> str:
    # Step 1: Strip generic AI-style prefixes the model sometimes emits.
    generic_starts = [
        "the following text is", "here is a", "this is the",
        "paraphrased version", "rephrased version",
        "reworded version", "rewritten version",
    ]
    for start in generic_starts:
        if text.lower().startswith(start):
            text = text[len(start):].strip(" :.-\"\n")

    # Step 2: Apply light contractions and word swaps. The replacement
    # preserves the capitalization of the matched text, so a mid-sentence
    # "there is" becomes "there's" rather than "There's".
    replacements = {
        "do not": "don't", "cannot": "can't", "will not": "won't",
        "should not": "shouldn't", "has not": "hasn't", "have not": "haven't",
        "it is": "it's", "we are": "we're", "you are": "you're",
        "they are": "they're", "i am": "I'm", "there is": "there's",
        "that is": "that's", "because": "since",
    }

    def match_case(repl: str):
        def sub(m: re.Match) -> str:
            return repl[0].upper() + repl[1:] if m.group(0)[0].isupper() else repl
        return sub

    for orig, repl in replacements.items():
        text = re.sub(rf"\b{re.escape(orig)}\b", match_case(repl),
                      text, flags=re.IGNORECASE)

    # Step 3: Occasionally insert a softener before a sentence's closing
    # punctuation (not after it) so the output reads less uniformly.
    softeners = ["frankly", "actually", "honestly", "to be fair", "in fact", "well"]
    if random.random() < 0.5:
        sentences = re.split(r"(?<=[.!?]) +", text)
        if len(sentences) > 1:
            insert_idx = random.randint(0, len(sentences) - 2)
            sentences[insert_idx] = re.sub(
                r"([.!?])$", rf", {random.choice(softeners)}\1", sentences[insert_idx]
            )
            text = " ".join(sentences)

    return text.strip()


def rephrase_text(text: str, tone: str = "general") -> str:
    # Build the task prefix the paraphraser model was fine-tuned on.
    if tone == "general":
        prompt = f"paraphrase: {text}"
    else:
        prompt = f"Rephrase in a {tone} tone without introductory phrases:\n{text}"

    inputs = tokenizer([prompt], return_tensors="pt",
                       padding=True, truncation=True).to(device)

    # Cap the output at roughly 1.3x the input length so the paraphrase
    # cannot ramble far beyond the original, up to a 1024-token ceiling.
    input_length = inputs["input_ids"].shape[1]
    max_length = min(int(input_length * 1.3), 1024)

    outputs = model.generate(
        **inputs,
        do_sample=True,          # sample instead of greedy decoding for variety
        top_k=50,
        top_p=0.92,
        temperature=0.8,
        max_length=max_length,
        repetition_penalty=1.1,
    )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Post-process to make the paraphrase read more naturally.
    return post_process_humanize(result)
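
# A minimal usage sketch. Assumption: the file is run directly as a script;
# the sample text and the "casual" tone value below are illustrative only,
# not part of the original code.
if __name__ == "__main__":
    sample = (
        "We are pleased to share the results. It is clear that the team "
        "has not missed a deadline because everyone stayed focused."
    )
    print(rephrase_text(sample))                 # default "paraphrase:" prompt
    print(rephrase_text(sample, tone="casual"))  # tone-specific prompt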