import gradio as gr
from transformers import pipeline, AutoTokenizer

# --- MODEL LOADING ---
# Load both the pipeline and the tokenizer for the model.
# The tokenizer is needed to split the text into chunks the model can understand.
model_name = "openai-community/roberta-base-openai-detector"
pipe = pipeline("text-classification", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
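
# Sanity check of the pipeline's output format. Per this model's config, the
# labels are 'Real' (human-written) and 'Fake' (AI-generated), not the generic
# LABEL_0/LABEL_1 names; the logic below relies on this mapping.
# Uncomment to verify locally (the scores shown are illustrative, not real output):
# print(pipe("A short test sentence.", top_k=None))
# -> [{'label': 'Real', 'score': 0.97}, {'label': 'Fake', 'score': 0.03}]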
def detect_ai_text(text):
    """
    Analyzes input text, handling long texts by chunking them into smaller pieces.
    """
    # Get the model's max length, leaving a small safety margin for the
    # special tokens the pipeline adds when it re-tokenizes the text.
    max_length = tokenizer.model_max_length - 2

    # Tokenize the entire input text, without special tokens so that chunk
    # boundaries are not polluted by stray <s>/</s> markers.
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # If the text is short enough, process it in one go.
    # top_k=None makes the pipeline return a score for every label.
    if len(tokens) <= max_length:
        results = pipe(text, top_k=None)
        return {item['label']: item['score'] for item in results}
    # --- CHUNKING LOGIC FOR LONG TEXT ---
    # If the text is too long, we process it in overlapping chunks.
    all_scores = []

    # Create chunks with a 50-token overlap to maintain context between them.
    for i in range(0, len(tokens), max_length - 50):
        chunk_tokens = tokens[i:i + max_length]

        # Decode the chunk tokens back to a string for the pipeline.
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)

        # Run the model on the chunk, requesting scores for all labels.
        chunk_results = pipe(chunk_text, top_k=None)

        # Find the score for the AI-generated label. This model reports
        # 'Fake' (AI-generated) and 'Real' (human-written), not LABEL_0/LABEL_1.
        for item in chunk_results:
            if item['label'] == 'Fake':  # 'Fake' is the AI score
                all_scores.append(item['score'])
                break  # Move to the next chunk
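
    # Note: overlapping chunks can also be produced by the tokenizer itself
    # (fast tokenizers only); a sketch of the equivalent call, not used here:
    # enc = tokenizer(text, max_length=max_length, stride=50,
    #                 truncation=True, return_overflowing_tokens=True)
    # for input_ids in enc["input_ids"]: ...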
    # If for some reason no scores were collected, return an error state.
    if not all_scores:
        return {"error": "Could not process text."}

    # Average the AI scores from all chunks to get a final score.
    average_ai_score = sum(all_scores) / len(all_scores)

    # Return the aggregated result in the same format as a single run.
    return {
        'Fake': average_ai_score,      # AI score
        'Real': 1 - average_ai_score,  # Human score
        'note': f'Result aggregated from {len(all_scores)} chunks.'
    }
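
# Minimal usage sketch for the function above; the values shown are
# placeholders, not real model output:
# detect_ai_text("Some paragraph to check...")
# -> {'Real': 0.88, 'Fake': 0.12} for short inputs, or the aggregated
#    dict with a 'note' key for inputs longer than the model's window.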
# --- GRADIO INTERFACE ---
iface = gr.Interface(
    fn=detect_ai_text,
    inputs=gr.Textbox(lines=15, placeholder="Paste the text you want to analyze here..."),
    outputs="json",
    title="AI Content Detector (Robust Version)",
    description="This version handles long texts by breaking them into chunks. It analyzes text for AI generation using the roberta-base-openai-detector model."
)

# Launch the app
iface.launch()
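
# Optional: to get a temporary public link (a standard Gradio flag), launch with:
# iface.launch(share=True)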