File size: 11,137 Bytes
ca4d658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b0d3b2
ca4d658
 
 
 
 
 
5b0d3b2
 
 
 
 
 
 
 
 
 
 
ca4d658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5d75ac
ca4d658
d5d75ac
ca4d658
d5d75ac
ca4d658
d5d75ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca4d658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81d2076
ca4d658
 
81d2076
 
 
 
 
 
 
 
ca4d658
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import gradio as gr
import json
import re
from google.cloud import language_v1
from google.oauth2 import service_account
import os
import pandas as pd

# Initialize the Google Cloud Natural Language client
def init_client():
    """Build and return a ``LanguageServiceClient``.

    Returns:
        The client instance on success, or a ``(None, error_message)`` tuple
        on failure — callers distinguish the two with ``isinstance(..., tuple)``.
    """
    try:
        # Option 1: Using service account key file
        # Uncomment and modify the path to your service account key
        # credentials = service_account.Credentials.from_service_account_file(
        #     "path/to/your/service-account-key.json"
        # )
        # client = language_v1.LanguageServiceClient(credentials=credentials)

        # Option 2: Using environment variable (recommended for Hugging Face Spaces)
        # Set GOOGLE_APPLICATION_CREDENTIALS_JSON as a Hugging Face Space secret
        raw_credentials = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
        if raw_credentials is not None:
            info = json.loads(raw_credentials)
            creds = service_account.Credentials.from_service_account_info(info)
            return language_v1.LanguageServiceClient(credentials=creds)
        # Fallback: Application Default Credentials from the environment.
        return language_v1.LanguageServiceClient()
    except Exception as exc:
        return None, str(exc)

def classify_text(client, text_content):
    """Classify *text_content* with the Natural Language API V2 model.

    Args:
        client: an initialized ``LanguageServiceClient``.
        text_content: plain text to classify.

    Returns:
        A list of ``{'category', 'confidence'}`` dicts (confidence rounded to
        4 places), or a single-element list ``[{'error': message}]`` when the
        API call raises.
    """
    try:
        doc = language_v1.Document(
            content=text_content,
            type_=language_v1.Document.Type.PLAIN_TEXT,
        )
        # Explicitly request the V2 classification model (expanded taxonomy).
        model_opts = language_v1.ClassificationModelOptions(
            v2_model=language_v1.ClassificationModelOptions.V2Model()
        )
        response = client.classify_text(
            request={
                'document': doc,
                'classification_model_options': model_opts,
            }
        )
        return [
            {'category': cat.name, 'confidence': round(cat.confidence, 4)}
            for cat in response.categories
        ]
    except Exception as exc:
        return [{'error': str(exc)}]

def extract_words(text):
    """Return the unique lowercase words of *text* in first-seen order.

    Words are ``\\b\\w+\\b`` runs (alphanumerics/underscore); punctuation is
    discarded. ``dict.fromkeys`` dedupes while keeping insertion order — the
    previous ``list(set(...))`` returned the words in arbitrary order, making
    downstream output nondeterministic.

    Args:
        text: arbitrary input text (may be empty).

    Returns:
        List of unique lowercase word strings, ordered by first occurrence.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    return list(dict.fromkeys(words))

def process_paragraph(paragraph):
    """Run word-level and paragraph-level classification on *paragraph*.

    Returns:
        A ``(word_markdown, paragraph_markdown, annotated_markdown)`` tuple
        feeding the three Gradio output panes. On empty input or client
        initialization failure, the first element carries the message and the
        other two are empty strings.
    """
    if not paragraph.strip():
        return "Please enter some text to analyze.", "", ""

    # init_client() returns the client itself, or a (None, message) tuple
    # on failure — discriminate with isinstance.
    maybe_client = init_client()
    if isinstance(maybe_client, tuple):
        _, error = maybe_client
        return f"Error initializing Google Cloud client: {error}", "", ""
    client = maybe_client

    # Classify each unique word individually, skipping single characters and
    # any word the API errored on.
    word_results = {}
    for word in extract_words(paragraph):
        if len(word) > 1:
            hits = classify_text(client, word)
            if hits and 'error' not in hits[0]:
                word_results[word] = hits

    # Classify the paragraph as one document.
    paragraph_hits = classify_text(client, paragraph)

    # Word-level markdown: accumulate pieces and join once.
    word_pieces = ["## Word-Level Classification Results\n\n"]
    if word_results:
        for word, hits in word_results.items():
            word_pieces.append(f"**{word.upper()}**\n")
            for i, hit in enumerate(hits, 1):
                word_pieces.append(f"  {i}. Category: `{hit['category']}`\n")
                word_pieces.append(f"     Confidence: {hit['confidence']:.4f}\n")
            word_pieces.append("\n")
    else:
        word_pieces.append("No classifications found for individual words.\n")
    word_analysis = "".join(word_pieces)

    # Paragraph-level markdown.
    para_pieces = ["## Paragraph-Level Classification Results\n\n"]
    if paragraph_hits and 'error' not in paragraph_hits[0]:
        for i, hit in enumerate(paragraph_hits, 1):
            para_pieces.append(f"**{i}. Category:** `{hit['category']}`\n")
            para_pieces.append(f"   **Confidence:** {hit['confidence']:.4f}\n\n")
    elif paragraph_hits and 'error' in paragraph_hits[0]:
        para_pieces.append(f"Error: {paragraph_hits[0]['error']}\n")
    else:
        para_pieces.append("No classifications found for the paragraph.\n")
    paragraph_analysis = "".join(para_pieces)

    annotated_text = create_annotated_text(paragraph, word_results)
    return word_analysis, paragraph_analysis, annotated_text

def create_annotated_text(original_text, word_results):
    """Render *original_text* as markdown with inline classification tags.

    Each whitespace-delimited token whose punctuation-stripped lowercase form
    is a key of *word_results* is bolded and suffixed with its top category's
    trailing path segment and confidence (3 decimals); other tokens pass
    through unchanged.
    """
    pieces = []
    for token in original_text.split():
        key = re.sub(r'[^\w]', '', token.lower())
        if key in word_results:
            # Annotate with the highest-ranked classification only.
            best = word_results[key][0]
            label = best['category'].split('/')[-1]  # last segment of the path
            pieces.append(f"**{token}** `[{label}: {best['confidence']:.3f}]`")
        else:
            pieces.append(token)
    return "## Annotated Text\n\n" + " ".join(pieces)

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI and wire the analyze button to
    process_paragraph.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="Google Cloud Natural Language Classifier") as app:
        # Static introductory/help text shown above the inputs.
        gr.Markdown("""
        # Google Cloud Natural Language Text Classifier (V2 Model)
        
        This intelligent text classification tool leverages Google Cloud's powerful Natural Language API V2 model to analyze and categorize your content with high accuracy and confidence scores.
        
        ## How to Use This Tool
        
        1. **Enter Your Text**: Type or paste any paragraph, article excerpt, or text content into the input box on the left
        2. **Click "Analyze Text"**: Hit the analyze button to process your content
        3. **Review Results**: The tool provides three types of analysis:
           - **Word-Level Classification**: Each unique word gets categorized individually with confidence scores
           - **Annotated Text**: Your original text with inline category labels and confidence scores
           - **Paragraph-Level Classification**: The entire text analyzed as a cohesive unit
        
        ## What You'll Get
        
        - **Detailed Categories**: Content is classified into specific categories like `/Arts & Entertainment/Movies`, `/Business & Industrial/Finance`, `/Health/Medical`, etc.
        - **Confidence Scores**: Each classification includes a confidence score (0-1) indicating how certain the AI is about the categorization
        - **Multiple Classifications**: Words and paragraphs can belong to multiple categories simultaneously
        - **Comprehensive Analysis**: Both granular (word-level) and holistic (paragraph-level) insights
        
        ## Perfect For
        
        - Content creators organizing their material
        - Researchers analyzing text themes
        - Marketers understanding content categories
        - Students exploring text classification
        - Anyone curious about how AI categorizes written content
        
        ## Powered By
        
        This tool uses Google Cloud's Natural Language API V2 model for superior performance and accuracy. The V2 model supports an expanded set of content categories for more precise classification.
        
        **Source**: [Google Cloud Natural Language API - Content Categories](https://cloud.google.com/natural-language/docs/categories#version_2)
        
        ---
        
        **Try the examples below or enter your own text to get started!**
        """)
        
        # Top row: text input + button on the left, word-level results on the right.
        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="Enter your paragraph",
                    placeholder="Type or paste your text here...",
                    lines=8,
                    max_lines=20
                )
                
                analyze_btn = gr.Button("Analyze Text", variant="primary")
            
            with gr.Column(scale=1):
                word_results = gr.Markdown(
                    label="Word-Level Results",
                    value="Results will appear here after analysis..."
                )
        
        # Full-width pane: original text annotated with inline category tags.
        with gr.Row():
            with gr.Column():
                annotated_output = gr.Markdown(
                    label="Annotated Text",
                    value="Annotated text with classifications will appear here..."
                )
        
        # Full-width pane: whole-paragraph classification results.
        with gr.Row():
            with gr.Column():
                paragraph_results = gr.Markdown(
                    label="Paragraph-Level Results",
                    value="Paragraph classification results will appear here..."
                )
        
        # Event handlers
        # NOTE: process_paragraph returns (word, paragraph, annotated) markdown —
        # the outputs list below maps them to the matching panes in that order.
        analyze_btn.click(
            fn=process_paragraph,
            inputs=[input_text],
            outputs=[word_results, paragraph_results, annotated_output]
        )
        
        # Example inputs - designed to show different confidence score ranges
        gr.Examples(
            examples=[
                # High confidence example - clear, focused topic
                ["Apple Inc. reported record quarterly earnings today, with iPhone sales driving revenue growth of 15% year-over-year. The company's stock price surged 8% in after-hours trading as investors celebrated the strong financial performance and optimistic guidance for the upcoming fiscal quarter."],
                
                # Low confidence example - mixed/ambiguous content
                ["The thing about stuff is that it matters sometimes. People often think about various topics and ideas when they consider different aspects of life and situations that may or may not be relevant to their daily experiences and general thoughts."],
                
                # Medium confidence example - specific but niche topic
                ["Quantum computing researchers at MIT have developed a new algorithm that could potentially solve certain optimization problems exponentially faster than classical computers, though practical applications remain years away from commercial implementation."]
            ],
            inputs=[input_text]
        )
        
    return app

# Launch the app
if __name__ == "__main__":
    # Build the Gradio Blocks UI and serve it on the default host/port.
    demo = create_interface()
    demo.launch()