Spaces:
Running
Running
File size: 6,096 Bytes
d00901f fdf2776 d00901f fdf2776 d00901f fdf2776 d00901f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import torch
import joblib
import numpy as np
import pandas as pd
import gradio as gr
from nltk.data import load as nltk_load
from transformers import AutoTokenizer, AutoModelForCausalLM
# ---------------------------------------------------------------------------
# Module-level setup: everything below runs once at import time and loads the
# language model, the sentence splitter and the trained classifier that the
# prediction functions read as globals.
# ---------------------------------------------------------------------------
print("Loading model & Tokenizer...")
model_id = 'gpt2-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

print("Loading NLTK and scikit-learn model...")
# NOTE(review): presumably a pickled NLTK Punkt sentence tokenizer -- only its
# ``tokenize`` method is used below, as the sentence splitter.
NLTK = nltk_load('data/english.pickle')
sent_cut_en = NLTK.tokenize

# joblib.load()'s second positional parameter is ``mmap_mode`` (a numpy memmap
# mode such as 'r' or 'c'), not a file-open mode.  The previous value 'rb' is
# not a valid memmap mode, so the argument is dropped and the default (no
# memory mapping) is used.
clf = joblib.load('data/gpt2-large-model')

# Per-token (unreduced) cross-entropy, kept as a module-level constant.
CROSS_ENTROPY = torch.nn.CrossEntropyLoss(reduction='none')
def gpt2_features(text, tokenizer, model, sent_cut):
    """Compute perplexity and token-rank features of ``text`` under ``model``.

    The text is split into sentences with ``sent_cut``; each sentence is
    tokenized and the token ids are concatenated (truncated to
    ``tokenizer.model_max_length - 2`` tokens), prefixed with the BOS token and
    scored by the causal language model.

    Args:
        text: The input string to analyse.
        tokenizer: Object exposing ``model_max_length``, ``bos_token_id``,
            ``tokenize`` and ``convert_tokens_to_ids``.
        model: Callable taking a ``(1, seq_len)`` tensor of token ids and
            returning an object whose ``logits`` attribute has shape
            ``(1, seq_len, vocab_size)``.
        sent_cut: Callable mapping a string to an iterable of sentences.

    Returns:
        A list of 11 numbers, in this order:
        ``[text_ppl, max_sent_ppl, sent_ppl_avg, sent_ppl_std,
        max_step_ppl, step_ppl_avg, step_ppl_std,
        n_rank_lt_10, n_rank_10_100, n_rank_100_1000, n_rank_ge_1000]``.

    Raises:
        ValueError: If ``text`` yields no tokens at all (e.g. it is empty).
    """
    # Tokenize sentence by sentence, remembering each sentence's token span so
    # that per-sentence perplexities can be computed from the per-token loss.
    input_max_length = tokenizer.model_max_length - 2
    token_ids, offsets = [], []
    for sentence in sent_cut(text):
        ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        if not ids:
            # A sentence without tokens would create an empty span and a 0/0
            # (NaN) sentence perplexity further down.
            continue
        overflow = len(token_ids) + len(ids) - input_max_length
        if overflow > 0:
            # Keep only the part of this sentence that still fits.
            ids = ids[:-overflow]
        offsets.append((len(token_ids), len(token_ids) + len(ids)))
        token_ids.extend(ids)
        if overflow >= 0:
            # The length budget is exhausted; ignore the remaining sentences.
            break

    if not token_ids:
        raise ValueError(
            "gpt2_features() needs text that yields at least one token"
        )

    input_ids = torch.tensor([tokenizer.bos_token_id] + token_ids)
    # Add the batch dimension explicitly instead of relying on the model to
    # reshape a 1-D input, then drop it again from the output.
    logits = model(input_ids.unsqueeze(0)).logits[0]

    # Shift so that position n-1 predicts token n.
    shift_logits = logits[:-1].contiguous()
    shift_target = input_ids[1:].contiguous()
    loss = torch.nn.functional.cross_entropy(
        shift_logits, shift_target, reduction='none'
    )

    # Rank of every observed token among all vocabulary entries, ordered by
    # predicted probability (rank 0 == most likely token).
    all_probs = torch.softmax(shift_logits, dim=-1)
    sorted_ids = torch.argsort(all_probs, dim=-1, descending=True)
    rank = torch.where(sorted_ids == shift_target.unsqueeze(-1))[-1]

    # Number of tokens falling into each rank bucket.
    bucket_masks = [
        rank < 10,
        (rank >= 10) & (rank < 100),
        (rank >= 100) & (rank < 1000),
        rank >= 1000,
    ]
    counter = [mask.long().sum(-1).item() for mask in bucket_masks]

    # Perplexity of the whole text.
    text_ppl = loss.mean().exp().item()

    # Perplexity of each sentence.
    sent_ppl = [
        (loss[start:end].sum() / (end - start)).exp().item()
        for start, end in offsets
    ]
    max_sent_ppl = max(sent_ppl)
    sent_ppl_avg = sum(sent_ppl) / len(sent_ppl)
    if len(sent_ppl) > 1:
        sent_ppl_std = torch.std(torch.tensor(sent_ppl)).item()
    else:
        # Standard deviation is undefined for a single sentence; use 0.
        sent_ppl_std = 0

    # Running ("step") perplexity: perplexity of the prefix ending at each token.
    n_steps = loss.size(0)
    prefix_lengths = torch.arange(1, n_steps + 1, device=loss.device)
    step_ppl = loss.cumsum(dim=-1).div(prefix_lengths).exp()
    max_step_ppl = step_ppl.max(dim=-1)[0].item()
    step_ppl_avg = step_ppl.sum(dim=-1).div(n_steps).item()
    if n_steps > 1:
        step_ppl_std = step_ppl.std().item()
    else:
        step_ppl_std = 0

    ppls = [
        text_ppl, max_sent_ppl, sent_ppl_avg, sent_ppl_std,
        max_step_ppl, step_ppl_avg, step_ppl_std,
    ]
    return ppls + counter
def classify_features(features, classifier, id_to_label):
    """Classify one feature vector and report the predicted label's probability.

    This helper used to be called ``predict`` as well, which meant the
    text-level ``predict`` below silently replaced it and then called itself.

    Args:
        features: Flat sequence of numeric features for a single sample.
        classifier: Object exposing ``predict`` and ``predict_proba``.
        id_to_label: Sequence mapping a predicted class id to its display name.

    Returns:
        ``[label, probability]`` for the predicted class.
    """
    x = np.asarray([features])
    pred = classifier.predict(x)[0]
    # NOTE(review): indexing the probability row with the predicted value
    # assumes the class labels are the column indices 0..n-1 -- confirm
    # against the trained classifier.
    prob = classifier.predict_proba(x)[0, pred]
    return [id_to_label[pred], prob]


def predict(text):
    """Classify ``text`` as human written or LLM generated.

    Uses the module-level ``tokenizer``, ``model``, ``sent_cut_en`` and ``clf``.

    Args:
        text: The text to analyse.

    Returns:
        ``[label, probability]`` where ``label`` is ``'Human Written'`` or
        ``'LLM Generated'``.
    """
    # Only inference is needed, so gradient tracking is disabled.
    with torch.no_grad():
        feats = gpt2_features(text, tokenizer, model, sent_cut_en)
    # The whole feature list is a single sample: pass it as one argument
    # rather than unpacking it into separate positional arguments.
    return classify_features(feats, clf, ['Human Written', 'LLM Generated'])
# ---------------------------------------------------------------------------
# Gradio user interface: two explanatory Markdown panels, a text input, a
# button, and two output boxes that the button fills by calling ``predict``.
# The Markdown text is indented along with the code; this relies on
# gr.Markdown stripping the common leading indentation before rendering.
# ---------------------------------------------------------------------------
print("Building Gradio Interface...")
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## ChatGPT Detector 🔬 (Linguistic version / 语言学版)
        Visit our project on Github: [chatgpt-comparison-detection project](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)<br>
        欢迎在 Github 上关注我们的 [ChatGPT 对比与检测项目](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)<br>
        We provide three kinds of detectors, all in Bilingual / 我们提供了三个版本的检测器,且都支持中英文:
        - [QA version / 问答版](https://www.modelscope.cn/studios/simpleai/chatgpt-detector-qa)<br>
        detect whether an **answer** is generated by ChatGPT for certain **question**, using PLM-based classifiers / 判断某个**问题的回答**是否由ChatGPT生成,使用基于PTM的分类器来开发;
        - [Single-text version / 独立文本版](https://www.modelscope.cn/studios/simpleai/chatgpt-detector-single)<br>
        detect whether a piece of text is ChatGPT generated, using PLM-based classifiers / 判断**单条文本**是否由ChatGPT生成,使用基于PTM的分类器来开发;
        - [**Linguistic version / 语言学版** (👈 Current / 当前使用)](https://www.modelscope.cn/studios/simpleai/chatgpt-detector-ling)<br>
        detect whether a piece of text is ChatGPT generated, using linguistic features / 判断**单条文本**是否由ChatGPT生成,使用基于语言学特征的模型来开发;
        """
    )
    # The language model named here must match the checkpoint loaded at the
    # top of the file ('gpt2-large').
    gr.Markdown(
        """
        ## Introduction:
        Two Logistic regression models trained with two kinds of features:
        1. [GLTR](https://aclanthology.org/P19-3019) Test-2, Language model predict token rank top-k buckets, top 10, 10-100, 100-1000, 1000+.
        2. PPL-based, text ppl, sentence ppl, etc.
        English LM is [GPT2-large](https://huggingface.co/gpt2-large).
        Note: Providing more text to the `Text` box can make the prediction more accurate!
        """
    )
    # Input box, pre-filled with an example passage.
    a1 = gr.Textbox(
        lines=5, label='Text',
        value="There are a few things that can help protect your credit card information from being misused when you give it to a restaurant or any other business:\n\nEncryption: Many businesses use encryption to protect your credit card information when it is being transmitted or stored. This means that the information is transformed into a code that is difficult for anyone to read without the right key."
    )
    button1 = gr.Button("🤖 Predict!")
    gr.Markdown("GLTR")
    label1_gltr = gr.Textbox(lines=1, label='GLTR Predicted Label 🎃')
    score1_gltr = gr.Textbox(lines=1, label='GLTR Probability')
    # ``predict`` returns [label, probability], matching the two outputs.
    button1.click(predict, inputs=[a1], outputs=[label1_gltr, score1_gltr])

demo.launch()