Spaces:

iimran
/

Abuse-Detection

Running

App Files Files Community

Abuse-Detection / app.py

iimran

Update app.py

0f3fb53 verified 6 months ago

raw

history blame

4.42 kB

	import os
	import json
	import numpy as np
	from tokenizers import Tokenizer
	import onnxruntime as ort
	from huggingface_hub import hf_hub_download
	import gradio as gr

	class ONNXInferencePipeline:
	def __init__(self, repo_id):
	# Retrieve the Hugging Face token from the environment variable
	hf_token = os.getenv("HF_TOKEN")
	if hf_token is None:
	raise ValueError("HF_TOKEN environment variable is not set.")

	# Download files from Hugging Face Hub using the token
	self.onnx_path = hf_hub_download(repo_id=repo_id, filename="RudeRater.onnx", use_auth_token=hf_token)
	self.tokenizer_path = hf_hub_download(repo_id=repo_id, filename="train_bpe_tokenizer.json", use_auth_token=hf_token)
	self.config_path = hf_hub_download(repo_id=repo_id, filename="hyperparameters.json", use_auth_token=hf_token)

	# Load configuration
	with open(self.config_path) as f:
	self.config = json.load(f)

	# Initialize tokenizer
	self.tokenizer = Tokenizer.from_file(self.tokenizer_path)
	self.max_len = self.config["tokenizer"]["max_len"]

	# Initialize ONNX runtime session
	self.session = ort.InferenceSession(self.onnx_path)
	self.providers = ['CPUExecutionProvider'] # Use CUDA if available
	if 'CUDAExecutionProvider' in ort.get_available_providers():
	self.providers = ['CUDAExecutionProvider']
	self.session.set_providers(self.providers)

	def preprocess(self, text):
	encoding = self.tokenizer.encode(text)
	ids = encoding.ids[:self.max_len]
	padding = [0] * (self.max_len - len(ids))
	return np.array(ids + padding, dtype=np.int64).reshape(1, -1)

	def predict(self, text):
	# Preprocess
	input_array = self.preprocess(text)

	# Run inference
	results = self.session.run(
	None,
	{'input': input_array}
	)

	# Post-process
	logits = results[0]
	probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
	predicted_class = int(np.argmax(probabilities))

	# Map to labels
	class_labels = ['Inappropriate Content', 'Not Inappropriate']
	return {
	'label': class_labels[predicted_class],
	'confidence': float(probabilities[0][predicted_class]),
	'probabilities': probabilities[0].tolist()
	}

	# Example usage
	if __name__ == "__main__":
	# Initialize the pipeline with the Hugging Face repository ID
	pipeline = ONNXInferencePipeline(repo_id="iimran/RudeRater")

	# Example texts for testing
	example_texts = [
	"You're a worthless piece of garbage who should die"
	]

	for text in example_texts:
	result = pipeline.predict(text)
	print(f"Input: {text}")
	print(f"Prediction: {result['label']} ")
	#print(f"Probabilities: Inappropriate={result['probabilities'][0]:.2%}, Not Inappropriate={result['probabilities'][1]:.2%}")
	print("-" * 80)

	# Define a function for Gradio to use
	def gradio_predict(text):
	result = pipeline.predict(text)
	return (
	f"Prediction: {result['label']} \n"
	#f"Probabilities: Inappropriate={result['probabilities'][0]:.2%}, Not Inappropriate={result['probabilities'][1]:.2%}"
	)

	# Create a Gradio interface
	iface = gr.Interface(
	fn=gradio_predict,
	inputs=gr.Textbox(lines=7, placeholder="Enter text here..."),
	outputs="text",
	title="RudeRater - Content Appropriateness Classifier",
	description="RudeRater is designed to identify inappropriate content in text. It analyzes input for offensive language and explicit content. It's trained on a tiny database so it might not be able to detect something fancy, but should easily detect day to day used offensive language.",
	examples=[
	"Congrats, you fuckbrain arsehole, you’ve outdone yourself in stupidity. A real cock-up of a human—should we clap for your bollocks-faced greatness or just pity you?",
	"You’re a mad bastard, but I’d still grab a beer with you! Fuck around all you like, you cockheaded legend—your arsehole antics are bloody brilliant.",
	"Your mother should have done better raising such a useless idiot"
	]
	)

	# Launch the Gradio app
	iface.launch()