File size: 2,701 Bytes
2dc17aa
2ef2e08
2dc17aa
2ef2e08
 
 
 
 
 
2dc17aa
 
 
2ef2e08
2dc17aa
2ef2e08
 
 
 
 
 
 
 
 
 
2dc17aa
2ef2e08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2dc17aa
 
2ef2e08
2dc17aa
2ef2e08
 
2dc17aa
 
2ef2e08
2dc17aa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import gradio as gr
from transformers import pipeline, AutoTokenizer

# --- MODEL LOADING ---
# The tokenizer is loaded alongside the classification pipeline: the
# pipeline does the scoring, while the tokenizer is needed separately to
# measure input length and split long texts into model-sized chunks.
model_name = "openai-community/roberta-base-openai-detector"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pipe = pipeline("text-classification", model=model_name)

def detect_ai_text(text):
    """Classify text as AI-generated vs. human-written, chunking long inputs.

    Args:
        text: The text to analyze.

    Returns:
        A dict mapping the model's labels to scores. For inputs longer than
        the model's context window, scores are averaged over overlapping
        chunks and a 'note' key reports the chunk count. Empty input (or a
        text that yields no usable scores) returns an ``{"error": ...}`` dict.
    """
    # Guard: the pipeline raises on empty/whitespace input, so fail gracefully.
    if not text or not text.strip():
        return {"error": "Could not process text."}

    # Leave headroom for the <s>/</s> special tokens the pipeline adds itself.
    max_length = tokenizer.model_max_length - 2

    # Tokenize WITHOUT special tokens: we only need raw content tokens for
    # length measurement and slicing. The original encode() added <s>/</s>,
    # which ended up sliced into the middle of chunks and decoded back as
    # literal "<s>"/"</s>" strings polluting the text the model scored.
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Short text: single pass through the classifier.
    if len(tokens) <= max_length:
        results = pipe(text)
        return {item['label']: item['score'] for item in results}

    # --- CHUNKING LOGIC FOR LONG TEXT ---
    # Overlapping windows (50-token overlap) preserve context across chunk
    # boundaries; each chunk's AI score is collected and averaged.
    all_scores = []
    stride = max_length - 50  # hoisted loop-invariant step

    for start in range(0, len(tokens), stride):
        chunk_tokens = tokens[start:start + max_length]
        # Decode back to a plain string, skipping any special tokens so the
        # chunk contains only original content.
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)

        chunk_results = pipe(chunk_text)

        # LABEL_1 is treated as the 'AI-generated' class here.
        # NOTE(review): this checkpoint's config may emit 'Real'/'Fake'
        # labels instead of LABEL_0/LABEL_1 — verify against the model card.
        # The empty-scores fallback below catches a mismatch.
        for item in chunk_results:
            if item['label'] == 'LABEL_1':
                all_scores.append(item['score'])
                break  # Found the AI score; move to the next chunk.

    # If no AI-label scores were collected (e.g. label-name mismatch),
    # return an explicit error state rather than dividing by zero.
    if not all_scores:
        return {"error": "Could not process text."}

    # Average the AI scores from all chunks to get a final score.
    average_ai_score = sum(all_scores) / len(all_scores)

    # Return the aggregated result in the same format as a single run.
    return {
        'LABEL_1': average_ai_score,      # AI score
        'LABEL_0': 1 - average_ai_score,  # Human score
        'note': f'Result aggregated from {len(all_scores)} chunks.'
    }

# --- GRADIO INTERFACE ---
# Wire the detector function into a simple text-in / JSON-out web UI.
_input_box = gr.Textbox(
    lines=15,
    placeholder="Paste the text you want to analyze here...",
)

iface = gr.Interface(
    fn=detect_ai_text,
    inputs=_input_box,
    outputs="json",
    title="AI Content Detector (Robust Version)",
    description=(
        "This version handles long texts by breaking them into chunks. "
        "It analyzes text for AI generation using the "
        "roberta-base-openai-detector model."
    ),
)

# Launch the app
iface.launch()