Final_Assignment_Template

Running

App Files Files Community

Final_Assignment_Template / agent.py

dawid-lorek

Update agent.py

e70ca94 verified 3 days ago

raw

history blame

4.67 kB

	# agent.py

	import os
	import requests
	from smolagents import LiteLLMModel, CodeAgent, tool, DuckDuckGoSearchTool, SpeechToTextTool, VisitWebpageTool
	import speech_recognition as sr
	from pydub import AudioSegment
	from PIL import Image

	# Set the API endpoint (adjust if yours is different).
	# download_question_file builds its "/files/<task_id>" URL from this base.
	api_url = "https://agents-course-unit4-scoring.hf.space"

	# ==== Custom tools to plug into the agent ====

	@tool
	def download_question_file(task_id: str, file_name: str = "", save_dir: str = ".") -> str:
	    """
	    Downloads the file associated with a given task ID and saves it to disk.
	    The saved name comes from the response's Content-Disposition header when it
	    carries one, otherwise from file_name, otherwise it is "<task_id>.dat". Only
	    the last path component of that name is kept, so the file is always written
	    directly inside save_dir.

	    Args:
	        task_id: Unique question/task identifier.
	        file_name: Optional fallback file name used when the server does not send one.
	        save_dir: Directory to save the file in (created if it does not exist).

	    Returns:
	        Path to the saved file, or a message starting with "HTTP error" or "Network error" on failure.
	    """
	    # Imported locally so the tool stays self-contained.
	    from email.message import Message

	    url = f"{api_url}/files/{task_id}"
	    try:
	        resp = requests.get(url, timeout=15)
	        resp.raise_for_status()
	    except requests.exceptions.HTTPError as e:
	        return f"HTTP error: {e.response.status_code}"
	    except Exception as e:
	        return f"Network error: {e}"

	    # Let the stdlib header parser extract the filename parameter: it copes with
	    # quoted and unquoted values, trailing parameters and RFC 2231 encoding,
	    # none of which a plain split on 'filename="' handles.
	    header = Message()
	    header["Content-Disposition"] = resp.headers.get("Content-Disposition", "")
	    served_name = header.get_filename()

	    # Take the first usable candidate. Directory components are dropped (for
	    # both "/" and "\\" separators) so a name such as "../../x" cannot make the
	    # write escape save_dir.
	    filename = f"{task_id}.dat"
	    for candidate in (served_name, file_name, f"{task_id}.dat"):
	        if not candidate:
	            continue
	        base = os.path.basename(candidate.replace("\\", "/"))
	        if base not in ("", ".", ".."):
	            filename = base
	            break

	    os.makedirs(save_dir, exist_ok=True)
	    file_path = os.path.join(save_dir, filename)
	    with open(file_path, "wb") as f:
	        f.write(resp.content)
	    return file_path

	@tool
	def read_image(image_path: str) -> Image.Image:
	    """
	    Loads image from disk.

	    Args:
	        image_path: Path to the image file.

	    Returns:
	        The opened PIL image.
	    """
	    # The return annotation is the Image.Image class: the bare name "Image" is
	    # the PIL.Image module (see the file's "from PIL import Image"), not a type.
	    return Image.open(image_path)

	@tool
	def audio_to_text(audio_path: str) -> str:
	    """
	    Converts audio (mp3/wav) to text using Google Speech Recognition.
	    An mp3 input is first converted to a wav file written next to it (same
	    name, ".wav" extension); any other input is passed to the recognizer as is.

	    Args:
	        audio_path: Path to the audio file.

	    Returns:
	        Recognized text, or a message starting with "Error" when the speech could not be recognized.
	    """
	    # Split off only the real extension: str.replace(".mp3", ".wav") would also
	    # rewrite ".mp3" occurring earlier in the path (e.g. in a directory name),
	    # and a case-sensitive endswith() would miss "FILE.MP3".
	    stem, extension = os.path.splitext(audio_path)
	    if extension.lower() == ".mp3":
	        source_file = stem + ".wav"
	        AudioSegment.from_mp3(audio_path).export(source_file, format="wav")
	    else:
	        source_file = audio_path

	    recognizer = sr.Recognizer()
	    with sr.AudioFile(source_file) as source:
	        audio = recognizer.record(source)

	    # Report recognition failures as text, like the other tools in this file do,
	    # instead of letting them propagate as exceptions.
	    try:
	        return recognizer.recognize_google(audio)
	    except sr.UnknownValueError:
	        return "Error: speech in the audio could not be understood."
	    except sr.RequestError as e:
	        return f"Error: speech recognition request failed: {e}"

	@tool
	def extract_text_from_image(image_path: str) -> str:
	    """
	    Run OCR on an image file with pytesseract and return the text it finds.

	    Args:
	        image_path: Path to the image file.

	    Returns:
	        Extracted text or error message.
	    """
	    try:
	        # Imported here so a missing OCR dependency is reported as a message
	        # by the ImportError handler below.
	        import pytesseract
	        from PIL import Image

	        ocr_result = pytesseract.image_to_string(Image.open(image_path))
	    except ImportError:
	        return "Error: pytesseract is not installed."
	    except Exception as exc:
	        return f"Error extracting text from image: {exc!s}"
	    else:
	        return ocr_result

	# ==== AGENT ====

	class GaiaAgent:
	    """Callable wrapper around a smolagents CodeAgent equipped with this file's tools.

	    Calling an instance with a question string runs the underlying agent and
	    returns its answer as a string.
	    """

	    def __init__(self, model=None, max_steps=8):
	        """Build the underlying CodeAgent.

	        Args:
	            model: Model object handed to CodeAgent. When None, a LiteLLMModel
	                for "gpt-4o" is created using the OPENAI_API_KEY environment
	                variable (empty string if unset).
	            max_steps: Passed to CodeAgent as max_steps.
	        """
	        # If no model was passed in, default to OpenAI GPT-4o (or another model).
	        if model is None:
	            api_key = os.getenv("OPENAI_API_KEY", "")
	            model = LiteLLMModel(
	                model_id="gpt-4o",  # Change to your own model if needed
	                api_key=api_key,
	            )
	        self.gaia_agent = CodeAgent(
	            model=model,
	            tools=[
	                DuckDuckGoSearchTool(),
	                download_question_file,
	                read_image,
	                audio_to_text,
	                extract_text_from_image,
	                VisitWebpageTool(),
	                SpeechToTextTool(),
	            ],
	            additional_authorized_imports=["pandas", "numpy", "math", "statistics", "scipy"],
	            max_steps=max_steps,
	        )
	        # Additional prompt configuration can be added here if desired.

	    def __call__(self, question: str) -> str:
	        """Run the agent on a question and return its answer as text.

	        Args:
	            question: The question to answer.

	        Returns:
	            The agent's answer converted to str, or a fixed error message when
	            the agent is missing or raised while processing the question.
	        """
	        print(f"Agent received question (first 50 chars): {question[:50]}...")
	        if not self.gaia_agent:
	            return "The agent is not properly initialized. Please check your API keys and configuration."
	        try:
	            # The agent's final answer is not necessarily a string (it can be a
	            # number, for example). Convert it up front: calling len() or
	            # slicing on such a value would raise and a valid answer would be
	            # replaced by the generic error message below.
	            answer = str(self.gaia_agent.run(question))
	        except Exception as e:
	            print(f"Error processing question: {e}")
	            return "An error occurred while processing your question. Please check the agent logs for details."
	        # Log at most the first 50 characters of the answer.
	        preview = f"{answer[:50]}..." if len(answer) > 50 else answer
	        print(f"Agent generated answer: {preview}")
	        return answer