Spaces:

samyakshrestha
/

multiagent-xray-assistant

Running

App Files Files Community

multiagent-xray-assistant / src /tools /vision_caption_tool.py

samyakshrestha

Deploy multi-agent radiology assistant

d8e0712 12 days ago

raw

history blame contribute delete

4.96 kB

	from crewai.tools import BaseTool, tool
	from pydantic import BaseModel, Field
	from typing import Optional
	from google import genai
	from google.genai.types import Part, GenerateContentConfig
	import os

	# Schema for tool arguments, specifying image path and optional prompt
	class VisionCaptionToolSchema(BaseModel):
	    """Argument schema for the vision caption tool.

	    The field descriptions are part of the schema exposed through the tool's
	    ``args_schema``, so they document each argument for whoever builds the
	    tool call.
	    """

	    # Required: location of the image to analyse.
	    image_path: str = Field(
	        ...,
	        description="Filesystem path to the chest X-ray image (.png, .jpg or .jpeg).",
	    )
	    # Optional: when omitted, the tool falls back to its built-in prompt.
	    prompt: Optional[str] = Field(
	        default=None,
	        description=(
	            "Optional instruction that replaces the tool's default "
	            "radiology reporting prompt."
	        ),
	    )

	# Main tool class for generating radiology reports from chest X-ray images
	class VisionCaptionTool(BaseTool):
	    """Tool that sends a chest X-ray image to Gemini 2.5 Flash for a report.

	    The Gemini API key is looked up in ``metadata["GEMINI_API_KEY"]`` each
	    time the tool runs, so it has to be stored there before the first call.
	    """

	    name: str = "vision_caption_tool"
	    description: str = (
	        "Generates a structured radiology report from a chest X-ray image using optimized Gemini 2.5 Flash."
	    )
	    args_schema: type = VisionCaptionToolSchema
	    metadata: dict = {}

	    @staticmethod
	    def _mime_type_for(image_path: str) -> str:
	        """Map the file extension of *image_path* to its image MIME type.

	        Raises:
	            ValueError: If the extension is not ``.png``, ``.jpg`` or ``.jpeg``
	                (compared case-insensitively).
	        """
	        mime_by_extension = {
	            ".png": "image/png",
	            ".jpg": "image/jpeg",
	            ".jpeg": "image/jpeg",
	        }
	        ext = os.path.splitext(image_path)[-1].lower()
	        if ext not in mime_by_extension:
	            raise ValueError(f"Unsupported image type: {ext}")
	        return mime_by_extension[ext]

	    def _run(self, image_path: str, prompt: Optional[str] = None) -> str:
	        """Generate a radiology report for the image at *image_path*.

	        Args:
	            image_path: Path to a ``.png``, ``.jpg`` or ``.jpeg`` image file.
	            prompt: Instruction sent alongside the image. When ``None``, empty
	                or whitespace-only, the built-in reporting prompt is used.

	        Returns:
	            The model's response text with surrounding whitespace removed.

	        Raises:
	            ValueError: If ``GEMINI_API_KEY`` is missing from ``metadata`` or
	                the image extension is unsupported.
	            OSError: If the image file cannot be opened or read.
	            RuntimeError: If the model response contains no text.
	        """
	        # Retrieve Gemini API key from metadata
	        api_key = self.metadata.get("GEMINI_API_KEY")
	        if not api_key:
	            raise ValueError("GEMINI_API_KEY not found in metadata.")
	        client = genai.Client(api_key=api_key)

	        # System prompt defines the expert persona and approach for the model
	        system_prompt = (
	            "You are a board-certified thoracic radiologist with over 20 years of experience in interpreting chest X-rays. "
	            "You are known for your meticulous attention to detail, clinical restraint, and deep respect for image-grounded reasoning. "
	            "You prioritize accuracy over speculation and communicate with diagnostic clarity, always aligning your impressions with what is visibly demonstrable in the radiograph."
	        )

	        # Fall back to the default prompt when none is supplied. A blank string
	        # is treated like None so the model is never sent an empty instruction.
	        if prompt is None or not prompt.strip():
	            prompt = (
	                "Examine the chest X-ray step by step, following a structured A–G radiological workflow (Airway & Mediastinum, Bones & soft tissues, Cardiac silhouette, Diaphragm, Lung fields, Pleura, any Devices/foreign objects, and \"Global\" sanity checks). "
	                "For each region, mentally assess both normal and abnormal findings before synthesizing them into a cohesive narrative report.\n"
	                "All directional terms (left/right) must refer strictly to the PATIENT'S perspective.\n"
	                "When evaluating cardiac size on AP films, assume that mild to moderate enlargement may be projectional unless the heart silhouette is clearly disproportionate or supported by additional findings (e.g., pulmonary congestion). "
	                "If abnormal patterns are seen (e.g., opacities, effusion, pneumothorax, atelectasis, consolidation), propose the most likely clinical significance in a professional and cautious tone, consistent with expert radiology language. "
	                "Weigh the clinical importance of each finding, and simulate a brief SECOND-PASS REVIEW of the image to verify that no significant abnormalities were overlooked.\n"
	                "Your final output must be formatted with two sections only:\n"
	                "FINDINGS: Structured prose following (but not explicitly labeling) the A–G sweep. Mention technical limitations only if they meaningfully impact interpretation. Always explicitly comment on signs of chronic lung disease (emphysema, fibrosis, interstitial changes) as either present or absent.\n"
	                "IMPRESSION: A concise, prioritized interpretation that integrates key findings into a diagnostic hypothesis. Rank findings by CLINICAL SIGNIFICANCE, listing the most urgent or actionable abnormalities first. Remaining key diagnoses/differentials, each with a probability qualifier (\"probable\", \"possible\", etc.). Use confident, direct language—but avoid speculation beyond what is visibly supported."
	            )

	        # Validate the extension before touching the file system
	        mime_type = self._mime_type_for(image_path)

	        # Read the image file as bytes
	        with open(image_path, "rb") as image_file:
	            image_bytes = image_file.read()

	        # Prepare input parts for the Gemini model: image first, then prompt
	        parts = [
	            Part.from_bytes(data=image_bytes, mime_type=mime_type),
	            prompt,
	        ]

	        # Generate content using Gemini 2.5 Flash with specified parameters
	        response = client.models.generate_content(
	            model="gemini-2.5-flash",
	            contents=parts,
	            config=GenerateContentConfig(
	                system_instruction=system_prompt,
	                temperature=0.2,
	                max_output_tokens=2048,
	            ),
	        )

	        # `response.text` is None when the response carries no text parts
	        # (NOTE(review): presumably blocked or truncated output — confirm
	        # against the SDK docs). Fail with a clear message instead of an
	        # AttributeError on `.strip()`.
	        report = response.text
	        if report is None:
	            candidates = getattr(response, "candidates", None) or []
	            finish_reason = (
	                getattr(candidates[0], "finish_reason", None) if candidates else None
	            )
	            raise RuntimeError(
	                f"Gemini returned no report text (finish_reason={finish_reason})."
	            )

	        # Return the generated report text, stripped of leading/trailing whitespace
	        return report.strip()