File size: 11,137 Bytes
ca4d658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b0d3b2
ca4d658
 
 
 
 
 
5b0d3b2
 
 
 
 
 
 
 
 
 
 
ca4d658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5d75ac
ca4d658
d5d75ac
ca4d658
d5d75ac
ca4d658
d5d75ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca4d658
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81d2076
ca4d658
 
81d2076
 
 
 
 
 
 
 
ca4d658
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import gradio as gr
import json
import re
from google.cloud import language_v1
from google.oauth2 import service_account
import os
import pandas as pd

# Initialize the Google Cloud Natural Language client
def init_client():
    """Build and return a ``LanguageServiceClient``.

    Returns:
        The client instance on success, or a ``(None, error_message)`` tuple
        on failure — callers distinguish the two with ``isinstance(..., tuple)``.
    """
    try:
        # Option 1: Using service account key file
        # Uncomment and modify the path to your service account key
        # credentials = service_account.Credentials.from_service_account_file(
        #     "path/to/your/service-account-key.json"
        # )
        # client = language_v1.LanguageServiceClient(credentials=credentials)

        # Option 2: Using environment variable (recommended for Hugging Face Spaces)
        # Set GOOGLE_APPLICATION_CREDENTIALS_JSON as a Hugging Face Space secret
        raw_credentials = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
        if raw_credentials is not None:
            info = json.loads(raw_credentials)
            creds = service_account.Credentials.from_service_account_info(info)
            return language_v1.LanguageServiceClient(credentials=creds)
        # Fallback: Application Default Credentials from the environment.
        return language_v1.LanguageServiceClient()
    except Exception as exc:
        return None, str(exc)

def classify_text(client, text_content):
    """Classify *text_content* with the Natural Language API V2 model.

    Args:
        client: an initialized ``LanguageServiceClient``.
        text_content: plain text to classify.

    Returns:
        A list of ``{'category', 'confidence'}`` dicts (confidence rounded to
        4 places), or a single-element list ``[{'error': message}]`` when the
        API call raises.
    """
    try:
        doc = language_v1.Document(
            content=text_content,
            type_=language_v1.Document.Type.PLAIN_TEXT,
        )
        # Explicitly request the V2 classification model (expanded taxonomy).
        model_opts = language_v1.ClassificationModelOptions(
            v2_model=language_v1.ClassificationModelOptions.V2Model()
        )
        response = client.classify_text(
            request={
                'document': doc,
                'classification_model_options': model_opts,
            }
        )
        return [
            {'category': cat.name, 'confidence': round(cat.confidence, 4)}
            for cat in response.categories
        ]
    except Exception as exc:
        return [{'error': str(exc)}]

def extract_words(text):
    """Return the unique lowercase words of *text* in first-seen order.

    Words are ``\\b\\w+\\b`` runs (alphanumerics/underscore); punctuation is
    discarded. ``dict.fromkeys`` dedupes while keeping insertion order — the
    previous ``list(set(...))`` returned the words in arbitrary order, making
    downstream output nondeterministic.

    Args:
        text: arbitrary input text (may be empty).

    Returns:
        List of unique lowercase word strings, ordered by first occurrence.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    return list(dict.fromkeys(words))

def process_paragraph(paragraph):
    """Run word-level and paragraph-level classification on *paragraph*.

    Returns:
        A ``(word_markdown, paragraph_markdown, annotated_markdown)`` tuple
        feeding the three Gradio output panes. On empty input or client
        initialization failure, the first element carries the message and the
        other two are empty strings.
    """
    if not paragraph.strip():
        return "Please enter some text to analyze.", "", ""

    # init_client() returns the client itself, or a (None, message) tuple
    # on failure — discriminate with isinstance.
    maybe_client = init_client()
    if isinstance(maybe_client, tuple):
        _, error = maybe_client
        return f"Error initializing Google Cloud client: {error}", "", ""
    client = maybe_client

    # Classify each unique word individually, skipping single characters and
    # any word the API errored on.
    word_results = {}
    for word in extract_words(paragraph):
        if len(word) > 1:
            hits = classify_text(client, word)
            if hits and 'error' not in hits[0]:
                word_results[word] = hits

    # Classify the paragraph as one document.
    paragraph_hits = classify_text(client, paragraph)

    # Word-level markdown: accumulate pieces and join once.
    word_pieces = ["## Word-Level Classification Results\n\n"]
    if word_results:
        for word, hits in word_results.items():
            word_pieces.append(f"**{word.upper()}**\n")
            for i, hit in enumerate(hits, 1):
                word_pieces.append(f"  {i}. Category: `{hit['category']}`\n")
                word_pieces.append(f"     Confidence: {hit['confidence']:.4f}\n")
            word_pieces.append("\n")
    else:
        word_pieces.append("No classifications found for individual words.\n")
    word_analysis = "".join(word_pieces)

    # Paragraph-level markdown.
    para_pieces = ["## Paragraph-Level Classification Results\n\n"]
    if paragraph_hits and 'error' not in paragraph_hits[0]:
        for i, hit in enumerate(paragraph_hits, 1):
            para_pieces.append(f"**{i}. Category:** `{hit['category']}`\n")
            para_pieces.append(f"   **Confidence:** {hit['confidence']:.4f}\n\n")
    elif paragraph_hits and 'error' in paragraph_hits[0]:
        para_pieces.append(f"Error: {paragraph_hits[0]['error']}\n")
    else:
        para_pieces.append("No classifications found for the paragraph.\n")
    paragraph_analysis = "".join(para_pieces)

    annotated_text = create_annotated_text(paragraph, word_results)
    return word_analysis, paragraph_analysis, annotated_text

def create_annotated_text(original_text, word_results):
    """Render *original_text* as markdown with inline classification tags.

    Each whitespace-delimited token whose punctuation-stripped lowercase form
    is a key of *word_results* is bolded and suffixed with its top category's
    trailing path segment and confidence (3 decimals); other tokens pass
    through unchanged.
    """
    pieces = []
    for token in original_text.split():
        key = re.sub(r'[^\w]', '', token.lower())
        if key in word_results:
            # Annotate with the highest-ranked classification only.
            best = word_results[key][0]
            label = best['category'].split('/')[-1]  # last segment of the path
            pieces.append(f"**{token}** `[{label}: {best['confidence']:.3f}]`")
        else:
            pieces.append(token)
    return "## Annotated Text\n\n" + " ".join(pieces)

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI and wire the analyze button to
    process_paragraph.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="Google Cloud Natural Language Classifier") as app:
        # Static introductory/help text shown above the inputs.
        gr.Markdown("""
        # Google Cloud Natural Language Text Classifier (V2 Model)
        
        This intelligent text classification tool leverages Google Cloud's powerful Natural Language API V2 model to analyze and categorize your content with high accuracy and confidence scores.
        
        ## How to Use This Tool
        
        1. **Enter Your Text**: Type or paste any paragraph, article excerpt, or text content into the input box on the left
        2. **Click "Analyze Text"**: Hit the analyze button to process your content
        3. **Review Results**: The tool provides three types of analysis:
           - **Word-Level Classification**: Each unique word gets categorized individually with confidence scores
           - **Annotated Text**: Your original text with inline category labels and confidence scores
           - **Paragraph-Level Classification**: The entire text analyzed as a cohesive unit
        
        ## What You'll Get
        
        - **Detailed Categories**: Content is classified into specific categories like `/Arts & Entertainment/Movies`, `/Business & Industrial/Finance`, `/Health/Medical`, etc.
        - **Confidence Scores**: Each classification includes a confidence score (0-1) indicating how certain the AI is about the categorization
        - **Multiple Classifications**: Words and paragraphs can belong to multiple categories simultaneously
        - **Comprehensive Analysis**: Both granular (word-level) and holistic (paragraph-level) insights
        
        ## Perfect For
        
        - Content creators organizing their material
        - Researchers analyzing text themes
        - Marketers understanding content categories
        - Students exploring text classification
        - Anyone curious about how AI categorizes written content
        
        ## Powered By
        
        This tool uses Google Cloud's Natural Language API V2 model for superior performance and accuracy. The V2 model supports an expanded set of content categories for more precise classification.
        
        **Source**: [Google Cloud Natural Language API - Content Categories](https://cloud.google.com/natural-language/docs/categories#version_2)
        
        ---
        
        **Try the examples below or enter your own text to get started!**
        """)
        
        # Top row: text input + button on the left, word-level results on the right.
        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="Enter your paragraph",
                    placeholder="Type or paste your text here...",
                    lines=8,
                    max_lines=20
                )
                
                analyze_btn = gr.Button("Analyze Text", variant="primary")
            
            with gr.Column(scale=1):
                word_results = gr.Markdown(
                    label="Word-Level Results",
                    value="Results will appear here after analysis..."
                )
        
        # Full-width pane: original text annotated with inline category tags.
        with gr.Row():
            with gr.Column():
                annotated_output = gr.Markdown(
                    label="Annotated Text",
                    value="Annotated text with classifications will appear here..."
                )
        
        # Full-width pane: whole-paragraph classification results.
        with gr.Row():
            with gr.Column():
                paragraph_results = gr.Markdown(
                    label="Paragraph-Level Results",
                    value="Paragraph classification results will appear here..."
                )
        
        # Event handlers
        # NOTE: process_paragraph returns (word, paragraph, annotated) markdown —
        # the outputs list below maps them to the matching panes in that order.
        analyze_btn.click(
            fn=process_paragraph,
            inputs=[input_text],
            outputs=[word_results, paragraph_results, annotated_output]
        )
        
        # Example inputs - designed to show different confidence score ranges
        gr.Examples(
            examples=[
                # High confidence example - clear, focused topic
                ["Apple Inc. reported record quarterly earnings today, with iPhone sales driving revenue growth of 15% year-over-year. The company's stock price surged 8% in after-hours trading as investors celebrated the strong financial performance and optimistic guidance for the upcoming fiscal quarter."],
                
                # Low confidence example - mixed/ambiguous content
                ["The thing about stuff is that it matters sometimes. People often think about various topics and ideas when they consider different aspects of life and situations that may or may not be relevant to their daily experiences and general thoughts."],
                
                # Medium confidence example - specific but niche topic
                ["Quantum computing researchers at MIT have developed a new algorithm that could potentially solve certain optimization problems exponentially faster than classical computers, though practical applications remain years away from commercial implementation."]
            ],
            inputs=[input_text]
        )
        
    return app

# Launch the app
if __name__ == "__main__":
    # Build the Gradio Blocks UI and serve it on the default host/port.
    demo = create_interface()
    demo.launch()