Spaces:

blazingbunny
/

google-word-classification

Sleeping

App Files Files Community

google-word-classification / app.py

blazingbunny

Update app.py

81d2076 verified 2 months ago

raw

history blame contribute delete

11.1 kB

	import gradio as gr
	import json
	import re
	from google.cloud import language_v1
	from google.oauth2 import service_account
	import os
	import pandas as pd

	# Build the Google Cloud Natural Language client used by the classifier
	def init_client():
	    """Create a ``LanguageServiceClient`` for the Natural Language API.

	    When the ``GOOGLE_APPLICATION_CREDENTIALS_JSON`` environment variable is
	    set (e.g. as a Hugging Face Space secret), its value is parsed as JSON
	    service-account info and used as explicit credentials. Otherwise the
	    client is constructed with no explicit credentials.

	    To use a key file on disk instead, build the credentials with
	    ``service_account.Credentials.from_service_account_file(<path>)`` and
	    pass them as ``credentials=`` to ``LanguageServiceClient``.

	    Returns:
	        The client on success, or a ``(None, error_message)`` tuple if any
	        exception was raised while building it.
	    """
	    env_key = 'GOOGLE_APPLICATION_CREDENTIALS_JSON'
	    try:
	        if env_key not in os.environ:
	            # No secret configured: fall back to default credentials.
	            return language_v1.LanguageServiceClient()

	        key_info = json.loads(os.environ[env_key])
	        creds = service_account.Credentials.from_service_account_info(key_info)
	        return language_v1.LanguageServiceClient(credentials=creds)
	    except Exception as exc:
	        # Callers detect failure by checking for a tuple result.
	        return None, str(exc)

	def classify_text(client, text_content):
	    """Classify ``text_content`` using the API's V2 classification model.

	    Args:
	        client: Object exposing ``classify_text(request=...)``.
	        text_content: Plain text to classify.

	    Returns:
	        A list of ``{'category': name, 'confidence': score}`` dicts, one per
	        category in the response, with the confidence rounded to 4 decimal
	        places. If anything raises, a one-element list
	        ``[{'error': message}]`` is returned instead.
	    """
	    try:
	        request = {
	            'document': language_v1.Document(
	                content=text_content,
	                type_=language_v1.Document.Type.PLAIN_TEXT,
	            ),
	            # Explicitly select the V2 classification model.
	            'classification_model_options': language_v1.ClassificationModelOptions(
	                v2_model=language_v1.ClassificationModelOptions.V2Model()
	            ),
	        }
	        response = client.classify_text(request=request)
	        return [
	            {'category': cat.name, 'confidence': round(cat.confidence, 4)}
	            for cat in response.categories
	        ]
	    except Exception as exc:
	        return [{'error': str(exc)}]

	def extract_words(text):
	    """Return the unique lowercase words found in ``text``.

	    The text is lowercased and every maximal run of regex word characters
	    is taken as a word; punctuation and whitespace are discarded.

	    Returns:
	        A list of distinct words in order of first occurrence, so the
	        result is the same on every run for the same input.
	    """
	    found = re.findall(r'\b\w+\b', text.lower())
	    # dict keys keep insertion order, unlike set(), whose iteration order
	    # for strings varies between interpreter runs.
	    return list(dict.fromkeys(found))

	def process_paragraph(paragraph):
	    """Classify a paragraph word by word and as a whole for the UI.

	    Args:
	        paragraph: Text entered by the user.

	    Returns:
	        A 3-tuple of Markdown strings: (word-level results, paragraph-level
	        results, annotated text). For blank input or a client
	        initialisation failure the first element carries the message and
	        the other two are empty strings.
	    """
	    if not paragraph.strip():
	        return "Please enter some text to analyze.", "", ""

	    client = init_client()
	    if isinstance(client, tuple):
	        # init_client reports failure as (None, message).
	        _, init_error = client
	        return f"Error initializing Google Cloud client: {init_error}", "", ""

	    # One classification call per unique word longer than one character;
	    # words whose call errored or returned nothing are left out.
	    per_word = {}
	    for token in extract_words(paragraph):
	        if len(token) <= 1:
	            continue
	        hits = classify_text(client, token)
	        if hits and 'error' not in hits[0]:
	            per_word[token] = hits

	    whole_text_hits = classify_text(client, paragraph)

	    # Word-level report
	    word_parts = ["## Word-Level Classification Results\n\n"]
	    if per_word:
	        for token, hits in per_word.items():
	            word_parts.append(f"{token.upper()}\n")
	            for rank, hit in enumerate(hits, 1):
	                word_parts.append(f" {rank}. Category: `{hit['category']}`\n")
	                word_parts.append(f" Confidence: {hit['confidence']:.4f}\n")
	            word_parts.append("\n")
	    else:
	        word_parts.append("No classifications found for individual words.\n")
	    word_analysis = "".join(word_parts)

	    # Paragraph-level report
	    para_parts = ["## Paragraph-Level Classification Results\n\n"]
	    if not whole_text_hits:
	        para_parts.append("No classifications found for the paragraph.\n")
	    elif 'error' in whole_text_hits[0]:
	        para_parts.append(f"Error: {whole_text_hits[0]['error']}\n")
	    else:
	        for rank, hit in enumerate(whole_text_hits, 1):
	            para_parts.append(f"{rank}. Category: `{hit['category']}`\n")
	            para_parts.append(f" Confidence: {hit['confidence']:.4f}\n\n")
	    paragraph_analysis = "".join(para_parts)

	    annotated_text = create_annotated_text(paragraph, per_word)
	    return word_analysis, paragraph_analysis, annotated_text

	def create_annotated_text(original_text, word_results):
	    """Return the text with each classified word followed by its top label.

	    Each whitespace-delimited token is lowercased and stripped of non-word
	    characters to form a lookup key. If ``word_results`` has a non-empty
	    entry for that key, the token is followed by
	    `` `[<last category segment>: <confidence to 3 decimals>]` ``.

	    Args:
	        original_text: The text to annotate.
	        word_results: Mapping of lowercase word to a list of
	            ``{'category': ..., 'confidence': ...}`` dicts; only the first
	            entry of each list is used.

	    Returns:
	        A Markdown string starting with the ``## Annotated Text`` heading.
	        Leading and trailing whitespace of the input is dropped; whitespace
	        between tokens (including paragraph breaks) is kept as written.
	    """
	    def annotate(match):
	        token = match.group(0)
	        key = re.sub(r'[^\w]', '', token.lower())
	        hits = word_results.get(key)
	        if not hits:
	            # Unknown word, or an empty result list: leave the token as is.
	            return token
	        top = hits[0]
	        label = top['category'].split('/')[-1]  # last path segment only
	        return f"{token} `[{label}: {top['confidence']:.3f}]`"

	    # Substituting tokens in place keeps the original spacing and line
	    # breaks, which split()/join() would collapse to single spaces.
	    body = re.sub(r'\S+', annotate, original_text.strip())
	    return "## Annotated Text\n\n" + body

	# Assemble the Gradio UI
	def create_interface():
	    """Build and return the Gradio ``Blocks`` app for the classifier.

	    The page has an introductory Markdown section, a text input with an
	    "Analyze Text" button, three Markdown output panels (word-level,
	    annotated text, paragraph-level) and a set of example inputs. Clicking
	    the button calls ``process_paragraph`` with the input text.

	    Returns:
	        The ``gr.Blocks`` instance; the caller is responsible for launching it.
	    """
	    intro_markdown = """
	# Google Cloud Natural Language Text Classifier (V2 Model)

	This intelligent text classification tool leverages Google Cloud's powerful Natural Language API V2 model to analyze and categorize your content with high accuracy and confidence scores.

	## How to Use This Tool

	1. Enter Your Text: Type or paste any paragraph, article excerpt, or text content into the input box on the left
	2. Click "Analyze Text": Hit the analyze button to process your content
	3. Review Results: The tool provides three types of analysis:
	- Word-Level Classification: Each unique word gets categorized individually with confidence scores
	- Annotated Text: Your original text with inline category labels and confidence scores
	- Paragraph-Level Classification: The entire text analyzed as a cohesive unit

	## What You'll Get

	- Detailed Categories: Content is classified into specific categories like `/Arts & Entertainment/Movies`, `/Business & Industrial/Finance`, `/Health/Medical`, etc.
	- Confidence Scores: Each classification includes a confidence score (0-1) indicating how certain the AI is about the categorization
	- Multiple Classifications: Words and paragraphs can belong to multiple categories simultaneously
	- Comprehensive Analysis: Both granular (word-level) and holistic (paragraph-level) insights

	## Perfect For

	- Content creators organizing their material
	- Researchers analyzing text themes
	- Marketers understanding content categories
	- Students exploring text classification
	- Anyone curious about how AI categorizes written content

	## Powered By

	This tool uses Google Cloud's Natural Language API V2 model for superior performance and accuracy. The V2 model supports an expanded set of content categories for more precise classification.

	Source: [Google Cloud Natural Language API - Content Categories](https://cloud.google.com/natural-language/docs/categories#version_2)

	---

	Try the examples below or enter your own text to get started!
	"""

	    # Sample inputs chosen to show different confidence score ranges
	    sample_inputs = [
	        # High confidence example - clear, focused topic
	        ["Apple Inc. reported record quarterly earnings today, with iPhone sales driving revenue growth of 15% year-over-year. The company's stock price surged 8% in after-hours trading as investors celebrated the strong financial performance and optimistic guidance for the upcoming fiscal quarter."],

	        # Low confidence example - mixed/ambiguous content
	        ["The thing about stuff is that it matters sometimes. People often think about various topics and ideas when they consider different aspects of life and situations that may or may not be relevant to their daily experiences and general thoughts."],

	        # Medium confidence example - specific but niche topic
	        ["Quantum computing researchers at MIT have developed a new algorithm that could potentially solve certain optimization problems exponentially faster than classical computers, though practical applications remain years away from commercial implementation."]
	    ]

	    # NOTE(review): the nesting of rows/columns below is reconstructed from
	    # a source whose indentation was flattened - confirm against the
	    # intended page layout.
	    with gr.Blocks(title="Google Cloud Natural Language Classifier") as app:
	        gr.Markdown(intro_markdown)

	        # Input on one side, word-level output on the other
	        with gr.Row():
	            with gr.Column(scale=1):
	                paragraph_box = gr.Textbox(
	                    label="Enter your paragraph",
	                    placeholder="Type or paste your text here...",
	                    lines=8,
	                    max_lines=20
	                )

	                run_button = gr.Button("Analyze Text", variant="primary")

	            with gr.Column(scale=1):
	                word_panel = gr.Markdown(
	                    label="Word-Level Results",
	                    value="Results will appear here after analysis..."
	                )

	        with gr.Row():
	            with gr.Column():
	                annotated_panel = gr.Markdown(
	                    label="Annotated Text",
	                    value="Annotated text with classifications will appear here..."
	                )

	        with gr.Row():
	            with gr.Column():
	                paragraph_panel = gr.Markdown(
	                    label="Paragraph-Level Results",
	                    value="Paragraph classification results will appear here..."
	                )

	        # Output order matches the tuple returned by process_paragraph:
	        # (word-level, paragraph-level, annotated text).
	        run_button.click(
	            fn=process_paragraph,
	            inputs=[paragraph_box],
	            outputs=[word_panel, paragraph_panel, annotated_panel]
	        )

	        gr.Examples(
	            examples=sample_inputs,
	            inputs=[paragraph_box]
	        )

	    return app

	# Script entry point: build the interface and start serving it
	if __name__ == "__main__":
	    app = create_interface()
	    app.launch()