File size: 6,096 Bytes
d00901f
 
 
 
 
 
 
 
 
 
 
fdf2776
d00901f
 
 
 
fdf2776
 
 
 
 
d00901f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdf2776
d00901f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import torch
import joblib

import numpy as np
import pandas as pd
import gradio as gr

from nltk.data import load as nltk_load
from transformers import AutoTokenizer, AutoModelForCausalLM


print("Loading model & Tokenizer...")
model_id  = 'gpt2-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model     = AutoModelForCausalLM.from_pretrained(model_id)

print("Loading NLTL & and scikit-learn model...")
NLTK = nltk_load('data/english.pickle')
sent_cut_en = NLTK.tokenize
clf = joblib.load(f'data/gpt2-large-model', 'rb')

CROSS_ENTROPY = torch.nn.CrossEntropyLoss(reduction='none')


def gpt2_features(text, tokenizer, model, sent_cut):
    # Tokenize
    input_max_length = tokenizer.model_max_length - 2
    token_ids, offsets = list(), list()
    sentences = sent_cut(text)
    for s in sentences:
        tokens = tokenizer.tokenize(s)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        difference = len(token_ids) + len(ids) - input_max_length
        if difference > 0:
            ids = ids[:-difference]
        offsets.append((len(token_ids), len(token_ids) + len(ids)))
        token_ids.extend(ids)
        if difference >= 0:
            break

    input_ids = torch.tensor([tokenizer.bos_token_id] + token_ids)
    logits = model(input_ids).logits
    # Shift so that n-1 predict n
    shift_logits = logits[:-1].contiguous()
    shift_target = input_ids[1:].contiguous()
    loss = CROSS_ENTROPY(shift_logits, shift_target)

    all_probs = torch.softmax(shift_logits, dim=-1)
    sorted_ids = torch.argsort(all_probs, dim=-1, descending=True)  # stable=True
    expanded_tokens = shift_target.unsqueeze(-1).expand_as(sorted_ids)
    indices = torch.where(sorted_ids == expanded_tokens)
    rank = indices[-1]
    counter = [
        rank < 10,
        (rank >= 10) & (rank < 100),
        (rank >= 100) & (rank < 1000),
        rank >= 1000
    ]
    counter = [c.long().sum(-1).item() for c in counter]


    # compute different-level ppl
    text_ppl = loss.mean().exp().item()
    sent_ppl = list()
    for start, end in offsets:
        nll = loss[start: end].sum() / (end - start)
        sent_ppl.append(nll.exp().item())
    max_sent_ppl = max(sent_ppl)
    sent_ppl_avg = sum(sent_ppl) / len(sent_ppl)
    if len(sent_ppl) > 1:
        sent_ppl_std = torch.std(torch.tensor(sent_ppl)).item()
    else:
        sent_ppl_std = 0

    mask = torch.tensor([1] * loss.size(0))
    step_ppl = loss.cumsum(dim=-1).div(mask.cumsum(dim=-1)).exp()
    max_step_ppl = step_ppl.max(dim=-1)[0].item()
    step_ppl_avg = step_ppl.sum(dim=-1).div(loss.size(0)).item()
    if step_ppl.size(0) > 1:
        step_ppl_std = step_ppl.std().item()
    else:
        step_ppl_std = 0
    ppls = [
        text_ppl, max_sent_ppl, sent_ppl_avg, sent_ppl_std,
        max_step_ppl, step_ppl_avg, step_ppl_std
    ]
    return ppls + counter  # type: ignore


def predict(features, classifier, id_to_label):
    x = np.asarray([features])
    pred = classifier.predict(x)[0]
    prob = classifier.predict_proba(x)[0, pred]
    return [id_to_label[pred], prob]


def predict(text):
    with torch.no_grad():
        feats = gpt2_features(text, tokenizer, model, sent_cut_en)
    out = predict(*feats, clf, ['Human Written', 'LLM Generated'])
    return out


print("Building Gradio Interface...")
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## ChatGPT Detector 🔬 (Linguistic version / 语言学版)

        Visit our project on Github: [chatgpt-comparison-detection project](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)<br>
        欢迎在 Github 上关注我们的 [ChatGPT 对比与检测项目](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)<br>
        We provide three kinds of detectors, all in Bilingual / 我们提供了三个版本的检测器,且都支持中英文:
        - [QA version / 问答版](https://www.modelscope.cn/studios/simpleai/chatgpt-detector-qa)<br>
            detect whether an **answer** is generated by ChatGPT for certain **question**, using PLM-based classifiers / 判断某个**问题的回答**是否由ChatGPT生成,使用基于PTM的分类器来开发;
        - [Sinlge-text version / 独立文本版](https://www.modelscope.cn/studios/simpleai/chatgpt-detector-single)<br>
            detect whether a piece of text is ChatGPT generated, using PLM-based classifiers / 判断**单条文本**是否由ChatGPT生成,使用基于PTM的分类器来开发;
        - [**Linguistic version / 语言学版** (👈 Current / 当前使用)](https://www.modelscope.cn/studios/simpleai/chatgpt-detector-ling)<br>
            detect whether a piece of text is ChatGPT generated, using linguistic features / 判断**单条文本**是否由ChatGPT生成,使用基于语言学特征的模型来开发;

        """
    )

    gr.Markdown(
        """
        ## Introduction:
        Two Logistic regression models trained with two kinds of features:
        1. [GLTR](https://aclanthology.org/P19-3019) Test-2, Language model predict token rank top-k buckets, top 10, 10-100, 100-1000, 1000+.
        2. PPL-based, text ppl, sentence ppl, etc.

        English LM is [GPT2-small](https://huggingface.co/gpt2).

        Note: Providing more text to the `Text` box can make the prediction more accurate!
        """
    )
    a1 = gr.Textbox(
        lines=5, label='Text',
        value="There are a few things that can help protect your credit card information from being misused when you give it to a restaurant or any other business:\n\nEncryption: Many businesses use encryption to protect your credit card information when it is being transmitted or stored. This means that the information is transformed into a code that is difficult for anyone to read without the right key."
    )
    button1 = gr.Button("🤖 Predict!")
    gr.Markdown("GLTR")
    label1_gltr = gr.Textbox(lines=1, label='GLTR Predicted Label 🎃')
    score1_gltr = gr.Textbox(lines=1, label='GLTR Probability')

    button1.click(predict, inputs=[a1], outputs=[label1_gltr, score1_gltr])

demo.launch()