import gradio as gr
from transformers import pipeline, AutoTokenizer

# --- MODEL LOADING ---
# Load both the pipeline and the tokenizer for the model.
# The tokenizer is needed to split the text into chunks the model can understand.
model_name = "openai-community/roberta-base-openai-detector"
pipe = pipeline("text-classification", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
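
# Sanity check of the pipeline's output format. Per this model's config, the
# labels are 'Real' (human-written) and 'Fake' (AI-generated), not the generic
# LABEL_0/LABEL_1 names; the logic below relies on this mapping.
# Uncomment to verify locally (the scores shown are illustrative, not real output):
# print(pipe("A short test sentence.", top_k=None))
# -> [{'label': 'Real', 'score': 0.97}, {'label': 'Fake', 'score': 0.03}]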
def detect_ai_text(text):
    """
    Analyzes input text, handling long texts by chunking them into smaller pieces.
    """
    # Get the model's max length, leaving a small safety margin for the
    # special tokens the pipeline adds when it re-tokenizes the text.
    max_length = tokenizer.model_max_length - 2

    # Tokenize the entire input text, without special tokens so that chunk
    # boundaries are not polluted by stray <s>/</s> markers.
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # If the text is short enough, process it in one go.
    # top_k=None makes the pipeline return a score for every label.
    if len(tokens) <= max_length:
        results = pipe(text, top_k=None)
        return {item['label']: item['score'] for item in results}
    # --- CHUNKING LOGIC FOR LONG TEXT ---
    # If the text is too long, we process it in overlapping chunks.
    all_scores = []

    # Create chunks with a 50-token overlap to maintain context between them.
    for i in range(0, len(tokens), max_length - 50):
        chunk_tokens = tokens[i:i + max_length]

        # Decode the chunk tokens back to a string for the pipeline.
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)

        # Run the model on the chunk, requesting scores for all labels.
        chunk_results = pipe(chunk_text, top_k=None)

        # Find the score for the AI-generated label. This model reports
        # 'Fake' (AI-generated) and 'Real' (human-written), not LABEL_0/LABEL_1.
        for item in chunk_results:
            if item['label'] == 'Fake':  # 'Fake' is the AI score
                all_scores.append(item['score'])
                break  # Move to the next chunk
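
    # Note: overlapping chunks can also be produced by the tokenizer itself
    # (fast tokenizers only); a sketch of the equivalent call, not used here:
    # enc = tokenizer(text, max_length=max_length, stride=50,
    #                 truncation=True, return_overflowing_tokens=True)
    # for input_ids in enc["input_ids"]: ...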
    # If for some reason no scores were collected, return an error state.
    if not all_scores:
        return {"error": "Could not process text."}

    # Average the AI scores from all chunks to get a final score.
    average_ai_score = sum(all_scores) / len(all_scores)

    # Return the aggregated result in the same format as a single run.
    return {
        'Fake': average_ai_score,      # AI score
        'Real': 1 - average_ai_score,  # Human score
        'note': f'Result aggregated from {len(all_scores)} chunks.'
    }
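
# Minimal usage sketch for the function above; the values shown are
# placeholders, not real model output:
# detect_ai_text("Some paragraph to check...")
# -> {'Real': 0.88, 'Fake': 0.12} for short inputs, or the aggregated
#    dict with a 'note' key for inputs longer than the model's window.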
# --- GRADIO INTERFACE ---
iface = gr.Interface(
    fn=detect_ai_text,
    inputs=gr.Textbox(lines=15, placeholder="Paste the text you want to analyze here..."),
    outputs="json",
    title="AI Content Detector (Robust Version)",
    description="This version handles long texts by breaking them into chunks. It analyzes text for AI generation using the roberta-base-openai-detector model."
)

# Launch the app
iface.launch()
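
# Optional: to get a temporary public link (a standard Gradio flag), launch with:
# iface.launch(share=True)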