from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re
import random
# Load the paraphraser checkpoint once at import time; prefer GPU when available
model_name = "humarin/chatgpt_paraphraser_on_T5_base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

def post_process_humanize(text: str) -> str:
    # Step 1: strip generic AI-style prefixes the paraphraser sometimes emits
    generic_starts = [
        "the following text is",
        "here is a",
        "this is the",
        "paraphrased version",
        "rephrased version",
        "reworded version",
        "rewritten version",
    ]
    for start in generic_starts:
        if text.lower().startswith(start):
            text = text[len(start):].strip(" :.-\"\n")
    # Step 2: light contractions and filler-word swaps
    replacements = {
        "do not": "don't",
        "cannot": "can't",
        "will not": "won't",
        "should not": "shouldn't",
        "has not": "hasn't",
        "have not": "haven't",
        "it is": "it's",
        "we are": "we're",
        "you are": "you're",
        "they are": "they're",
        "i am": "I'm",
        "there is": "there's",
        "that is": "that's",
        "because": "since",
    }

    def keep_case(repl: str):
        # Preserve sentence-initial capitalization when substituting, so
        # "There is" becomes "There's" rather than "there's"
        def inner(match: re.Match) -> str:
            return repl[0].upper() + repl[1:] if match.group(0)[0].isupper() else repl
        return inner

    for orig, repl in replacements.items():
        text = re.sub(rf"\b{orig}\b", keep_case(repl), text, flags=re.IGNORECASE)
    # Step 3: occasionally drop a conversational softener into a sentence,
    # inserting it before the closing punctuation rather than after it
    softeners = ["frankly", "actually", "honestly", "to be fair", "in fact", "well"]
    if random.random() < 0.5:
        sentences = re.split(r"(?<=[.!?]) +", text)
        if len(sentences) > 1:
            insert_idx = random.randint(0, len(sentences) - 2)
            s = sentences[insert_idx]
            sentences[insert_idx] = f"{s[:-1]}, {random.choice(softeners)}{s[-1]}"
            text = " ".join(sentences)

    return text.strip()

def rephrase_text(text: str, tone: str = "general") -> str:
    # The T5 tokenizer appends the EOS token itself, so no manual "</s>" is needed
    if tone == "general":
        prompt = f"paraphrase: {text}"
    else:
        prompt = f"Rephrase in a {tone} tone without introductory phrases:\n{text}"

    inputs = tokenizer([prompt], return_tensors="pt", padding=True, truncation=True).to(device)

    # Cap the output length relative to the input so the paraphrase stays
    # roughly the same size as the original text
    input_length = inputs["input_ids"].shape[1]
    max_length = min(int(input_length * 1.3), 1024)

    outputs = model.generate(
        **inputs,
        do_sample=True,
        top_k=50,
        top_p=0.92,
        temperature=0.8,
        max_length=max_length,
        repetition_penalty=1.1,
    )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Post-process to make the output read more naturally
    return post_process_humanize(result)
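
# Minimal usage sketch for running this module directly; the sample sentence
# below is illustrative only and not part of the original Space.
if __name__ == "__main__":
    sample = (
        "We cannot guarantee that the results will be accurate, "
        "because the model does not always follow instructions."
    )
    print(rephrase_text(sample, tone="casual"))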