import base64
import os

from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.tools import Tool
from langchain_core.tools import tool

api_key = os.getenv("GEMINI_API_KEY")

# Create the multimodal LLM
vision_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-preview-05-20",
    temperature=0,
    max_retries=2,
    google_api_key=api_key,
)


def extract_text(img_path: str) -> str:
    """
    Extract text from an image file using a multimodal model.

    Input needs to be the path of the image.
    """
    all_text = ""
    try:
        # Read the image and encode it as base64
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()

        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Prepare the prompt, including the base64 image data
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Extract all the text from this image. "
                            "Return only the extracted text, no explanations."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision-capable model
        response = vision_llm.invoke(message)

        # Append the extracted text
        all_text += response.content + "\n\n"

        return all_text.strip()
    except Exception as e:
        # A butler should handle errors gracefully
        error_msg = f"Error extracting text: {str(e)}"
        print(error_msg)
        return ""


@tool("analyze_image_tool", parse_docstring=True)
def analyze_image_tool(user_query: str, img_path: str) -> str:
    """
    Answer the question by reasoning on the image.

    Args:
        user_query (str): The question to be answered.
        img_path (str): Path to the image file.
    """
    all_text = ""
    try:
        # Read the image and encode it as base64
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()

        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Prepare the prompt, including the base64 image data
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": f"User query: {user_query}",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision-capable model
        response = vision_llm.invoke(message)

        # Append the model's answer
        all_text += response.content + "\n\n"

        return all_text.strip()
    except Exception as e:
        # A butler should handle errors gracefully
        error_msg = f"Error analyzing image: {str(e)}"
        print(error_msg)
        return ""


@tool("analyze_audio_tool", parse_docstring=True)
def analyze_audio_tool(user_query: str, audio_path: str) -> str:
    """
    Answer the question by reasoning on the provided audio file.

    Args:
        user_query (str): The question to be answered.
        audio_path (str): Path to the audio file (e.g., .mp3, .wav, .flac, .aac, .ogg).
    """
    try:
        # Determine the MIME type from the file extension
        _filename, file_extension = os.path.splitext(audio_path)
        file_extension = file_extension.lower()

        supported_formats = {
            ".mp3": "audio/mp3",
            ".wav": "audio/wav",
            ".flac": "audio/flac",
            ".aac": "audio/aac",
            ".ogg": "audio/ogg",
        }

        if file_extension not in supported_formats:
            return (
                f"Error: Unsupported audio file format '{file_extension}'. "
                f"Supported extensions: {', '.join(supported_formats.keys())}."
            )

        mime_type = supported_formats[file_extension]

        # Read the audio file and encode it as base64
        with open(audio_path, "rb") as audio_file:
            audio_bytes = audio_file.read()

        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

        # Prepare the prompt, including the base64 audio data
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": f"User query: {user_query}",
                    },
                    {
                        "type": "audio",
                        "source_type": "base64",
                        "mime_type": mime_type,
                        "data": audio_base64,
                    },
                ]
            )
        ]

        # Call the audio-capable model
        response = vision_llm.invoke(message)

        return response.content.strip()
    except Exception as e:
        error_msg = f"Error analyzing audio: {str(e)}"
        print(error_msg)
        return ""


extract_text_tool = Tool(
    name="extract_text_tool",
    func=extract_text,
    description="Extract text from an image file using a multimodal model.",
)