Spaces:

rwillats
/

Contextual-Policy-Engine-Hate-Speech-Classification

Running

App Files Files Community

Contextual-Policy-Engine-Hate-Speech-Classification / perspective.py

rwillats

Upload folder using huggingface_hub

0886c09 verified 7 months ago

raw

history blame

5.34 kB

	import json
	import requests
	import csv
	import os
	import time
	import sys
	from pathlib import Path
	from datetime import datetime
	from dotenv import load_dotenv

	# Load the Perspective API key. load_dotenv reads the local "key.env" file
	# into the process environment, and os.getenv then looks the key up there.
	load_dotenv("key.env")
	API_KEY = os.getenv("perspective_key")

	# Fail fast: every API request needs the key. The message names the actual
	# file and variable this script reads so the user knows exactly what to set.
	if not API_KEY:
	    print("Error: Perspective API key is missing. Please set perspective_key in key.env or in the environment.")
	    sys.exit(1)

	# Name of the CSV file that receives the moderation results.
	OUTPUT_CSV = "perspective_moderation_output.csv"

	def analyze_text(text, timeout=30):
	    """Send text to the Perspective API ``comments:analyze`` endpoint.

	    Args:
	        text: The comment text to score.
	        timeout: Seconds to wait for the HTTP request before giving up, so a
	            stalled connection cannot hang the caller indefinitely.

	    Returns:
	        A ``(result, elapsed_seconds)`` tuple. On success ``result`` is the
	        decoded JSON response, which contains an ``attributeScores`` key.
	        On failure ``result`` is a flat ``{attribute: 0.0}`` mapping that
	        lacks that key, so callers can detect failure by checking for
	        ``attributeScores``. ``elapsed_seconds`` is 0.0 when the request
	        or the JSON decoding raised.
	    """
	    url = 'https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze'
	    requested_attributes = {
	        'TOXICITY': {},
	        'SEVERE_TOXICITY': {},
	        'IDENTITY_ATTACK': {},
	        'INSULT': {},
	        'PROFANITY': {},
	        'THREAT': {},
	        'SEXUALLY_EXPLICIT': {},
	    }
	    payload = {
	        'comment': {'text': text},
	        'requestedAttributes': requested_attributes,
	    }

	    try:
	        # perf_counter is monotonic, so the measured duration cannot be
	        # skewed by system clock adjustments.
	        started = time.perf_counter()
	        # json= serializes the payload and sets the JSON Content-Type header.
	        response = requests.post(
	            url,
	            params={'key': API_KEY},
	            json=payload,
	            timeout=timeout,
	        )
	        elapsed = time.perf_counter() - started
	        response_json = response.json()
	    except (requests.RequestException, ValueError) as e:
	        # RequestException covers network/timeout failures; ValueError covers
	        # a response body that is not valid JSON.
	        print(f"Exception during API call: {e}")
	        return dict.fromkeys(requested_attributes, 0.0), 0.0

	    # A non-dict body (e.g. JSON null) or an API error object has no scores.
	    if not isinstance(response_json, dict) or 'attributeScores' not in response_json:
	        print(f"Error in API response: {response_json}")
	        return dict.fromkeys(requested_attributes, 0.0), elapsed

	    return response_json, elapsed

	def _count_data_rows(input_file):
	    """Return the number of non-blank CSV records that follow the header row."""
	    with open(input_file, 'r', newline='', encoding='utf-8-sig') as infile:
	        non_blank = sum(1 for record in csv.reader(infile) if record)
	    return max(non_blank - 1, 0)


	def _extract_scores(analysis, prefix, attributes):
	    """Build ``{"<prefix>_<attr>": score}`` from an analysis result; missing scores are 0.0."""
	    scores = analysis.get('attributeScores', {}) if isinstance(analysis, dict) else {}
	    return {
	        f"{prefix}_{attr}": scores.get(attr, {}).get('summaryScore', {}).get('value', 0.0)
	        for attr in attributes
	    }


	def process_csv(input_file):
	    """Process CSV file, analyze each prompt-response pair, and write results to a CSV.

	    The input must have ``prompt`` and ``response`` columns. Each row with
	    both values present is scored via ``analyze_text`` (once for the prompt,
	    once for the response) and appended to ``OUTPUT_CSV``. Each result row
	    is flushed immediately so finished rows survive an interrupted run.
	    A header is written only when the output file is missing or empty, so
	    an existing results file is extended rather than overwritten.

	    Args:
	        input_file: Path to the input CSV file.

	    Returns:
	        None. Problems with the input (missing file, missing columns) are
	        reported on stdout and the function returns early without creating
	        the output file.
	    """
	    # Local import: the file-level imports only bring in ``datetime`` itself.
	    from datetime import timezone

	    if not os.path.exists(input_file):
	        print(f"Input file not found: {input_file}")
	        return

	    attributes = [
	        'TOXICITY', 'SEVERE_TOXICITY', 'IDENTITY_ATTACK', 'INSULT',
	        'PROFANITY', 'THREAT', 'SEXUALLY_EXPLICIT'
	    ]
	    fieldnames = (
	        ['prompt', 'response']
	        + [f"prompt_{attr}" for attr in attributes]
	        + [f"response_{attr}" for attr in attributes]
	        + ["Processing_Time (s)", "Timestamp"]
	    )

	    # newline='' is what the csv module expects for correct handling of
	    # quoted multi-line fields; utf-8-sig transparently drops a leading BOM
	    # that would otherwise corrupt the first header name.
	    with open(input_file, 'r', newline='', encoding='utf-8-sig') as infile:
	        reader = csv.DictReader(infile)

	        # fieldnames is None for an empty file; treat that as "no columns".
	        input_columns = reader.fieldnames or []
	        if 'prompt' not in input_columns or 'response' not in input_columns:
	            print("Error: Input CSV must contain 'prompt' and 'response' columns")
	            return

	        # Count the records once up front instead of re-reading the whole
	        # file for every row's progress message.
	        total_rows = _count_data_rows(input_file)

	        write_header = (
	            not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0
	        )

	        with open(OUTPUT_CSV, 'a', newline='', encoding='utf-8') as outfile:
	            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
	            if write_header:
	                writer.writeheader()
	                outfile.flush()

	            print(f"Processing started. Writing results to {OUTPUT_CSV}")

	            for row_number, row in enumerate(reader, start=1):
	                prompt = row['prompt']
	                response = row['response']

	                if not prompt or not response:
	                    print(f"Skipping row {row_number} due to empty prompt or response")
	                    continue

	                print(f"\nProcessing row {row_number}/{total_rows}")
	                print(f"Prompt: {prompt[:50]}...")
	                print(f"Response: {response[:50]}...")

	                print("Analyzing prompt...")
	                prompt_analysis, prompt_time = analyze_text(prompt)
	                time.sleep(1)  # pause between the two API calls

	                print("Analyzing response...")
	                response_analysis, response_time = analyze_text(response)

	                result_row = {
	                    'prompt': prompt,
	                    'response': response,
	                    "Processing_Time (s)": round(prompt_time + response_time, 4),
	                    "Timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
	                }
	                result_row.update(_extract_scores(prompt_analysis, 'prompt', attributes))
	                result_row.update(_extract_scores(response_analysis, 'response', attributes))

	                writer.writerow(result_row)
	                outfile.flush()  # persist each row as soon as it is done

	                print(f"Row {row_number} processed and written to {OUTPUT_CSV}")
	                time.sleep(1)  # pause before the next row's API calls

	    print(f"\nProcessing complete. Results saved to {OUTPUT_CSV}")

	# Script entry point: expects the input CSV path as the first CLI argument.
	if __name__ == "__main__":
	    cli_args = sys.argv[1:]
	    if not cli_args:
	        # No path supplied: show usage and exit with a failure status.
	        print("Usage: python script.py <input_csv>")
	        sys.exit(1)

	    input_file = cli_args[0]
	    process_csv(input_file)