Spaces:

Kuberwastaken
/

Syllabus-Formatter

Sleeping

App Files Files Community

Syllabus-Formatter / model /analyzer.py

Kuberwastaken

Initial Commit

2cadc3d 7 months ago

raw

history blame

7.93 kB

	import os
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import torch
	from datetime import datetime
	import gradio as gr
	from typing import Dict, List, Union, Optional
	import logging
	import traceback

	# Configure logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	class ContentAnalyzer:
	def __init__(self):
	self.device = "cuda" if torch.cuda.is_available() else "cpu"
	self.model = None
	self.tokenizer = None
	logger.info(f"Initialized analyzer with device: {self.device}")

	async def load_model(self, progress=None) -> None:
	"""Load the model and tokenizer with progress updates and detailed logging."""
	try:
	print("\n=== Starting Model Loading ===")
	print(f"Time: {datetime.now()}")

	if progress:
	progress(0.1, "Loading tokenizer...")

	print("Loading tokenizer...")
	self.tokenizer = AutoTokenizer.from_pretrained(
	"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
	use_fast=True
	)

	if progress:
	progress(0.3, "Loading model...")

	print(f"Loading model on {self.device}...")
	self.model = AutoModelForCausalLM.from_pretrained(
	"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
	torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
	device_map="auto"
	)

	if progress:
	progress(0.5, "Model loaded successfully")

	print("Model and tokenizer loaded successfully")
	logger.info(f"Model loaded successfully on {self.device}")
	except Exception as e:
	logger.error(f"Error loading model: {str(e)}")
	print(f"\nERROR DURING MODEL LOADING: {str(e)}")
	print("Stack trace:")
	traceback.print_exc()
	raise

	def _chunk_text(self, text: str, chunk_size: int = 2048, overlap: int = 256) -> List[str]:
	"""Split text into overlapping chunks for processing."""
	chunks = []
	for i in range(0, len(text), chunk_size - overlap):
	chunk = text[i:i + chunk_size]
	chunks.append(chunk)
	print(f"Split text into {len(chunks)} chunks with {overlap} token overlap")
	return chunks

	async def analyze_chunk(
	self,
	chunk: str,
	progress: Optional[gr.Progress] = None,
	current_progress: float = 0,
	progress_step: float = 0
	) -> List[str]:
	"""Analyze a single chunk of text for triggers with detailed logging."""
	print(f"\n--- Processing Chunk ---")
	print(f"Chunk text (preview): {chunk[:50]}...")

	# Comprehensive trigger categories
	categories = [
	"Violence", "Death", "Substance Use", "Gore",
	"Vomit", "Sexual Content", "Sexual Abuse",
	"Self-Harm", "Gun Use", "Animal Cruelty",
	"Mental Health Issues"
	]

	# Comprehensive prompt for single-pass analysis
	prompt = f"""Comprehensive Content Sensitivity Analysis

	Carefully analyze the following text for sensitive content categories:
	{', '.join(categories)}

	Detailed Requirements:
	1. Thoroughly examine entire text chunk
	2. Identify presence of ANY of these categories
	3. Provide clear, objective assessment
	4. Minimal subjective interpretation

	TEXT CHUNK:
	{chunk}

	RESPONSE FORMAT:
	- List categories DEFINITIVELY present
	- Brief objective justification for each
	- Strict YES/NO categorization"""

	try:
	print("Sending prompt to model...")
	inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
	inputs = {k: v.to(self.device) for k, v in inputs.items()}

	with torch.no_grad():
	print("Generating response...")
	outputs = self.model.generate(
	**inputs,
	max_new_tokens=256,
	do_sample=True,
	temperature=0.2,
	top_p=0.9,
	pad_token_id=self.tokenizer.eos_token_id
	)

	response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
	print("Full Model Response:", response_text)

	# Parse detected triggers
	detected_triggers = []
	for category in categories:
	if category.upper() in response_text.upper():
	detected_triggers.append(category)

	print(f"Detected triggers in chunk: {detected_triggers}")

	if progress:
	current_progress += progress_step
	progress(min(current_progress, 0.9), "Analyzing chunk...")

	return detected_triggers

	except Exception as e:
	logger.error(f"Error analyzing chunk: {str(e)}")
	print(f"Error during chunk analysis: {str(e)}")
	traceback.print_exc()
	return []

	async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> List[str]:
	"""Analyze the entire script for triggers with progress updates."""
	print("\n=== Starting Script Analysis ===")
	print(f"Time: {datetime.now()}")

	if not self.model or not self.tokenizer:
	await self.load_model(progress)

	chunks = self._chunk_text(script)
	identified_triggers = set()
	progress_step = 0.4 / len(chunks)
	current_progress = 0.5 # Starting after model loading

	for chunk_idx, chunk in enumerate(chunks, 1):
	chunk_triggers = await self.analyze_chunk(
	chunk,
	progress,
	current_progress,
	progress_step
	)
	identified_triggers.update(chunk_triggers)

	if progress:
	progress(0.95, "Finalizing results...")

	final_triggers = list(identified_triggers)
	print("\n=== Analysis Complete ===")
	print("Final Results:", final_triggers)

	return final_triggers if final_triggers else ["None"]

	async def analyze_content(
	script: str,
	progress: Optional[gr.Progress] = None
	) -> Dict[str, Union[List[str], str]]:
	"""Main analysis function for the Gradio interface."""
	print("\n=== Starting Content Analysis ===")
	print(f"Time: {datetime.now()}")

	analyzer = ContentAnalyzer()

	try:
	triggers = await analyzer.analyze_script(script, progress)

	if progress:
	progress(1.0, "Analysis complete!")

	result = {
	"detected_triggers": triggers,
	"confidence": "High - Content detected" if triggers != ["None"] else "High - No concerning content detected",
	"model": "DeepSeek-R1-Distill-Qwen-1.5B",
	"analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	}

	print("\nFinal Result Dictionary:", result)
	return result

	except Exception as e:
	logger.error(f"Analysis error: {str(e)}")
	print(f"\nERROR OCCURRED: {str(e)}")
	print("Stack trace:")
	traceback.print_exc()
	return {
	"detected_triggers": ["Error occurred during analysis"],
	"confidence": "Error",
	"model": "DeepSeek-R1-Distill-Qwen-1.5B",
	"analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	"error": str(e)
	}

	if __name__ == "__main__":
	# Gradio interface
	iface = gr.Interface(
	fn=analyze_content,
	inputs=gr.Textbox(lines=8, label="Input Text"),
	outputs=gr.JSON(),
	title="Content Sensitivity Analysis",
	description="Analyze text content for sensitive topics using DeepSeek R1"
	)
	iface.launch()