# GaiaAgentEvaluator/utils/llama_cpp_model.py
"""
Fallback model implementation for testing when llama-cpp-python is not available.
This provides a compatible model class that doesn't require any external dependencies,
allowing the rest of the application to function while we solve the llama-cpp-python
installation issues.
"""
import os
import logging
from typing import Dict, List, Optional, Any, Union
import requests
from smolagents import Model
from pathlib import Path
# Try to import llama_cpp, but don't fail if not available
try:
    from llama_cpp import Llama
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    LLAMA_CPP_AVAILABLE = False
    print("llama_cpp module not available, using fallback implementation")
logger = logging.getLogger(__name__)
class LlamaCppModel(Model):
"""Model using llama.cpp Python bindings for efficient local inference without PyTorch.
Falls back to a simple text generation if llama_cpp is not available."""
    def __init__(
        self,
        model_path: Optional[str] = None,
        model_url: Optional[str] = None,
        n_ctx: int = 2048,
        n_gpu_layers: int = 0,
        max_tokens: int = 512,
        temperature: float = 0.7,
        verbose: bool = True
    ):
        """
        Initialize a local llama.cpp model, or fall back to a simple implementation.

        Args:
            model_path: Path to a local GGUF model file
            model_url: URL to download the model from if model_path doesn't exist
            n_ctx: Context window size
            n_gpu_layers: Number of layers to offload to GPU (0 means CPU only)
            max_tokens: Maximum number of new tokens to generate
            temperature: Sampling temperature
            verbose: Whether to print verbose messages
        """
        super().__init__()
        self.model_path = model_path
        self.model_url = model_url
        self.n_ctx = n_ctx
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.verbose = verbose
        self.llm = None

        # Check if we can use llama_cpp
        if LLAMA_CPP_AVAILABLE:
            try:
                if self.verbose:
                    print("Attempting to initialize LlamaCpp model...")
                # Try to initialize the real model
                if model_path and os.path.exists(model_path):
                    if self.verbose:
                        print(f"Loading model from {model_path}...")
                    # Initialize the llama-cpp model
                    self.llm = Llama(
                        model_path=model_path,
                        n_ctx=n_ctx,
                        n_gpu_layers=n_gpu_layers,
                        verbose=verbose
                    )
                    if self.verbose:
                        print("LlamaCpp model loaded successfully")
                else:
                    if self.verbose:
                        print("Model path not found or not specified. Using fallback mode.")
            except Exception as e:
                logger.error(f"Error initializing LlamaCpp model: {e}")
                if self.verbose:
                    print(f"Error initializing LlamaCpp model: {e}")
                self.llm = None
        else:
            if self.verbose:
                print("LlamaCpp not available, using fallback implementation")

        if not self.llm and self.verbose:
            print("Using fallback text generation mode")
    def _resolve_model_path(self, model_path: Optional[str] = None, model_url: Optional[str] = None) -> str:
        """
        Resolve the model path, downloading the model if necessary.

        Returns:
            Absolute path to the model file
        """
        # Default to a small model if none specified
        if not model_path:
            models_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
            os.makedirs(models_dir, exist_ok=True)
            model_path = os.path.join(models_dir, "ggml-model-q4_0.bin")

        # Convert to Path for easier handling
        path = Path(model_path)

        # If the model already exists, return it
        if path.exists():
            return str(path.absolute())

        # Download if a URL was provided
        if model_url and not path.exists():
            try:
                print(f"Downloading model from {model_url}...")
                os.makedirs(path.parent, exist_ok=True)
                try:
                    # Try a streaming download first
                    with requests.get(model_url, stream=True, timeout=30) as r:
                        r.raise_for_status()
                        total_size = int(r.headers.get('content-length', 0))
                        block_size = 8192
                        with open(path, 'wb') as f:
                            downloaded = 0
                            for chunk in r.iter_content(chunk_size=block_size):
                                if chunk:
                                    f.write(chunk)
                                    downloaded += len(chunk)
                                    if total_size > 0:
                                        percent = (downloaded / total_size) * 100
                                        # Report progress roughly every 10%
                                        if percent % 10 < (block_size / total_size) * 100:
                                            print(f"Download progress: {int(percent)}%")
                except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
                    print(f"Streaming download failed: {e}. Using a simpler approach...")
                    # Fall back to a simple, non-streaming download
                    r = requests.get(model_url, timeout=60)
                    r.raise_for_status()
                    with open(path, 'wb') as f:
                        f.write(r.content)
                    print("Download complete with simple method")
                print(f"Model download complete: {path}")
                return str(path.absolute())
            except Exception as e:
                logger.error(f"Error downloading model: {e}")
                print(f"Error downloading model: {e}")
                print("Continuing with dummy model instead...")
                # Create a small dummy model file so we can continue
                with open(path, 'wb') as f:
                    f.write(b"DUMMY MODEL")
                return str(path.absolute())

        # If we get here without a model, create a dummy one
        print(f"Model file not found at {model_path} and no URL provided. Creating dummy model...")
        os.makedirs(path.parent, exist_ok=True)
        with open(path, 'wb') as f:
            f.write(b"DUMMY MODEL")
        return str(path.absolute())
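
    # With llama-cpp-python, calling the Llama object typically returns a completion
    # dict of the form {"choices": [{"text": "...", ...}], ...}; generate() below
    # extracts choices[0]["text"] and also tolerates a list-shaped response.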
    def generate(self, prompt: str, **kwargs) -> str:
        """
        Generate text completion for the given prompt.

        Args:
            prompt: Input text

        Returns:
            Generated text completion
        """
        try:
            if self.verbose:
                print(f"Generating with prompt: {prompt[:50]}...")

            # If we have a real model, use it
            if self.llm:
                # Actual generation with llama-cpp
                response = self.llm(
                    prompt=prompt,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                    echo=False  # Don't include the prompt in the response
                )

                # Extract the generated text
                if not response:
                    return ""
                if isinstance(response, dict):
                    generated_text = response.get('choices', [{}])[0].get('text', '')
                else:
                    # List of responses
                    generated_text = response[0].get('text', '')
                return generated_text.strip()
            else:
                # Fallback simple generation
                if self.verbose:
                    print("Using fallback text generation")

                # Extract key information from the prompt
                words = prompt.strip().split()
                last_words = ' '.join(words[-10:]) if len(words) > 10 else prompt

                # Simple response generation based on prompt content
                if "?" in prompt:
                    return f"Based on the information provided, I believe the answer is related to {last_words}. This is a fallback response as the LLM model could not be loaded."
                else:
                    return f"I understand you're asking about {last_words}. Since I'm running in fallback mode without a proper language model, I can only acknowledge your query but not provide a detailed response."
        except Exception as e:
            logger.error(f"Error generating text: {e}")
            if self.verbose:
                print(f"Error generating text: {e}")
            return f"Error generating response: {str(e)}"
    def generate_with_tools(
        self,
        messages: List[Dict[str, Any]],
        tools: Optional[List[Dict[str, Any]]] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Generate a response with tool-calling capabilities.

        This method implements the smolagents Model interface for tool-calling.

        Args:
            messages: List of message objects with role and content
            tools: List of tool definitions

        Returns:
            Response with message and optional tool calls
        """
        try:
            # Format the messages into a prompt
            prompt = self._format_messages_to_prompt(messages, tools)

            # Generate a response
            completion = self.generate(prompt)

            # For now, just return the text without tool parsing
            return {
                "message": {
                    "role": "assistant",
                    "content": completion
                }
            }
        except Exception as e:
            logger.error(f"Error generating with tools: {e}")
            print(f"Error generating with tools: {e}")
            return {
                "message": {
                    "role": "assistant",
                    "content": f"Error: {str(e)}"
                }
            }
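
    # Tool entries are assumed to be plain dicts exposing at least "name" and
    # "description", e.g. {"name": "web_search", "description": "Search the web"}
    # (illustrative example; the exact schema comes from the caller).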
    def _format_messages_to_prompt(
        self,
        messages: List[Dict[str, Any]],
        tools: Optional[List[Dict[str, Any]]] = None
    ) -> str:
        """Format chat messages into a text prompt for the model."""
        formatted_prompt = ""

        # Include tool descriptions if available
        if tools and len(tools) > 0:
            tool_descriptions = "\n".join([
                f"Tool {i+1}: {tool['name']} - {tool['description']}"
                for i, tool in enumerate(tools)
            ])
            formatted_prompt += f"Available tools:\n{tool_descriptions}\n\n"

        # Add the conversation history
        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", "")
            if role == "system":
                formatted_prompt += f"System: {content}\n\n"
            elif role == "user":
                formatted_prompt += f"User: {content}\n\n"
            elif role == "assistant":
                formatted_prompt += f"Assistant: {content}\n\n"

        # Add the final prompt for the assistant
        formatted_prompt += "Assistant: "
        return formatted_prompt
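

# Minimal usage sketch. Assumptions: the GGUF path and the tool definition below are
# hypothetical; if the file is missing or llama_cpp is not installed, the class simply
# runs in its fallback text-generation mode, so this block stays runnable either way.
if __name__ == "__main__":
    model = LlamaCppModel(
        model_path="models/example-q4_0.gguf",  # hypothetical local path
        n_ctx=2048,
        max_tokens=128,
        temperature=0.7,
        verbose=True,
    )

    # Plain text completion
    print(model.generate("What is the capital of France?"))

    # Chat-style call with a (hypothetical) tool definition
    result = model.generate_with_tools(
        messages=[{"role": "user", "content": "Summarize the GAIA benchmark in one sentence."}],
        tools=[{"name": "web_search", "description": "Search the web for a query"}],
    )
    print(result["message"]["content"])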