# app.py — AI content detector Gradio app
# (Hugging Face Space by Pratham0409, commit 2ef2e08, verified)
import gradio as gr
from transformers import pipeline, AutoTokenizer
# --- MODEL LOADING ---
# Identifier of the detector model on the Hugging Face Hub.
model_name = "openai-community/roberta-base-openai-detector"

# The tokenizer is loaded alongside the pipeline so that long inputs can be
# split into chunks that fit the model's context window.
tokenizer = AutoTokenizer.from_pretrained(model_name)
pipe = pipeline("text-classification", model=model_name)
def detect_ai_text(text):
    """Classify *text* as AI-generated or human-written.

    Short texts are scored in a single pipeline call. Texts longer than the
    model's context window are split into overlapping token chunks; each
    chunk is scored independently and the per-label scores are averaged.

    Args:
        text: The input text to analyze.

    Returns:
        A dict mapping the model's label names to scores. For long inputs
        the dict also carries a ``'note'`` key describing the aggregation.
        Returns ``{"error": ...}`` if no chunk produced a score.
    """
    # Leave headroom for the special tokens the pipeline adds itself.
    max_length = tokenizer.model_max_length - 2

    # Encode WITHOUT special tokens: the pipeline re-tokenizes each chunk and
    # inserts its own special tokens, so including them here would duplicate
    # them (and slightly skew the per-chunk token budget).
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Short enough for a single pass — return the pipeline result directly.
    if len(tokens) <= max_length:
        results = pipe(text)
        return {item['label']: item['score'] for item in results}

    # --- CHUNKING LOGIC FOR LONG TEXT ---
    # Overlap consecutive chunks by 50 tokens to preserve context at the
    # boundaries. Accumulate score sums per label name rather than matching a
    # hard-coded label: this model reports labels as "Real"/"Fake" (not
    # "LABEL_0"/"LABEL_1"), and a hard-coded check would never fire.
    score_sums = {}
    num_chunks = 0
    for i in range(0, len(tokens), max_length - 50):
        chunk_tokens = tokens[i:i + max_length]
        # Decode back to a string for the pipeline; drop any special tokens
        # so they are not fed to the model as literal text.
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        # top_k=None returns a score for EVERY label, not just the winner —
        # otherwise a chunk contributes nothing for the losing label.
        for item in pipe(chunk_text, top_k=None):
            score_sums[item['label']] = (
                score_sums.get(item['label'], 0.0) + item['score']
            )
        num_chunks += 1

    # Defensive: if no chunk yielded a score, report an error state.
    if not score_sums:
        return {"error": "Could not process text."}

    # Average each label's score over all chunks for the final result.
    result = {label: total / num_chunks for label, total in score_sums.items()}
    result['note'] = f'Result aggregated from {num_chunks} chunks.'
    return result
# --- GRADIO INTERFACE ---
# Build the UI components first, then assemble the Interface.
text_input = gr.Textbox(
    lines=15,
    placeholder="Paste the text you want to analyze here...",
)

iface = gr.Interface(
    fn=detect_ai_text,
    inputs=text_input,
    outputs="json",
    title="AI Content Detector (Robust Version)",
    description="This version handles long texts by breaking them into chunks. It analyzes text for AI generation using the roberta-base-openai-detector model.",
)

# Launch the app
iface.launch()