thanhkt committed on
Commit 8fb7841 · verified · 1 Parent(s): 4efafe0

Upload 26 files

eval_suite/__init__.py ADDED
File without changes
eval_suite/image_utils.py ADDED
@@ -0,0 +1,104 @@
1
+ import os
2
+ import tempfile
3
+
4
+ import numpy as np
5
+ from PIL import Image, ImageOps
6
+ from moviepy import VideoFileClip
7
+
8
+ from eval_suite.prompts_raw import _image_eval
9
+ from eval_suite.utils import extract_json, convert_score_fields, calculate_geometric_mean
10
+ from mllm_tools.utils import _prepare_text_image_inputs
11
+ from src.core.parse_video import image_with_most_non_black_space
12
+
13
+ def extract_key_frames(video_path, output_dir, num_chunks):
14
+ """Extract key frames from a video by dividing it into chunks and selecting representative frames.
15
+
16
+ Args:
17
+ video_path (str): Path to the input video file
18
+ output_dir (str): Directory where extracted frames will be saved
19
+ num_chunks (int): Number of chunks to divide the video into
20
+
21
+ Returns:
22
+ list: List of paths to the extracted key frames
23
+ """
24
+ # Create output directory if it doesn't exist
25
+ os.makedirs(output_dir, exist_ok=True)
26
+
27
+ # Extract all frames from the video
28
+ clip = VideoFileClip(video_path)
29
+ frames = list(clip.iter_frames(fps=1)) # one frame every second
30
+
31
+ total_frames = len(frames)
32
+ if total_frames == 0:
33
+ print("No frames extracted from the video.")
34
+ return []
35
+
36
+ # Determine the number of frames per chunk
37
+ frames_per_chunk = max(1, total_frames // num_chunks)  # guard against zero when the video has fewer frames than chunks
38
+ num_chunks = min(num_chunks, (total_frames + frames_per_chunk - 1) // frames_per_chunk)
39
+
40
+ key_frames = []
41
+
42
+ # Process each chunk of frames
43
+ for i in range(num_chunks):
44
+ start_idx = i * frames_per_chunk
45
+ end_idx = min((i + 1) * frames_per_chunk, total_frames)
46
+ chunk_frames = frames[start_idx:end_idx]
47
+
48
+ if chunk_frames:
49
+ # Save the frame with most non-black space
50
+ output_path = os.path.join(output_dir, f"key_frame_{i+1}.jpg")
51
+ result = image_with_most_non_black_space(chunk_frames, output_path)
52
+ else:
53
+ print(f"No frames in chunk {i+1}. Skipping.")
54
+ result = None
55
+
56
+ if result is not None:
57
+ key_frames.append(output_path)
58
+ clip.close()
59
+
60
+ return key_frames
61
+
62
+
63
+ def evaluate_sampled_images(model, video_path, description="No description provided", num_chunks=10, output_folder=None):
64
+ """Evaluate sampled frames from a video using an image evaluation model.
65
+
66
+ Args:
67
+ model: The image evaluation model to use
68
+ video_path (str): Path to the input video file
69
+ description (str, optional): Description of the video content. Defaults to "No description provided"
70
+ num_chunks (int, optional): Number of chunks to divide the video into. Defaults to 10
71
+ output_folder (str, optional): Directory for temporary files. Defaults to None
72
+
73
+ Returns:
74
+ dict: Dictionary containing evaluation scores and individual frame assessments with keys:
75
+ - evaluation: Dictionary of averaged scores for each criterion
76
+ - image_chunks: List of individual frame evaluation results
77
+ """
78
+ with tempfile.TemporaryDirectory(dir=output_folder) as temp_dir:
79
+ key_frames = extract_key_frames(video_path, temp_dir, num_chunks)
80
+
81
+ prompt = _image_eval.format(description=description)
82
+
83
+ responses = []
84
+ for key_frame in key_frames:
85
+ inputs = _prepare_text_image_inputs(prompt, key_frame)
86
+ response = model(inputs)
87
+ response_json = extract_json(response)
88
+ response_json = convert_score_fields(response_json)
89
+ responses.append(response_json)
90
+
91
+ criteria = list(responses[0]["evaluation"].keys())
92
+ scores_dict = {c: [] for c in criteria}
93
+ for response in responses:
94
+ for key, val in response["evaluation"].items():
95
+ scores_dict[key].append(val["score"])
96
+
97
+ res_score = {}
98
+ for key, scores in scores_dict.items():
99
+ res_score[key] = {"score": calculate_geometric_mean(scores)}
100
+
101
+ return {
102
+ "evaluation": res_score,
103
+ "image_chunks": responses
104
+ }
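
For context, here is a minimal usage sketch of the two helpers above; the video path, output folder, and model wrapper are placeholders, not values taken from this commit.

```python
# Hypothetical usage of eval_suite/image_utils.py; paths and model name are illustrative only.
from mllm_tools.litellm import LiteLLMWrapper
from eval_suite.image_utils import extract_key_frames, evaluate_sampled_images

model = LiteLLMWrapper(model_name="gemini/gemini-1.5-pro-002")  # any vision-capable wrapper

# One representative (most non-black) frame per chunk, written to ./frames
frames = extract_key_frames("output/theorem_video.mp4", "frames", num_chunks=10)
print(f"Extracted {len(frames)} key frames")

# Score the sampled frames with the _image_eval prompt and aggregate per-criterion geometric means
result = evaluate_sampled_images(model, "output/theorem_video.mp4",
                                 description="Pythagorean theorem", num_chunks=10)
print(result["evaluation"])         # e.g. {"visual_relevance": {"score": ...}, "element_layout": {...}}
print(len(result["image_chunks"]))  # one raw per-frame evaluation per key frame
```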
eval_suite/parse_prompt.py ADDED
@@ -0,0 +1,54 @@
1
+ import os
2
+ from tqdm import tqdm
3
+
4
+
5
+ def call_parse_prompt():
6
+ """
7
+ Locates the prompts_raw directory and generates an __init__.py file containing prompt texts.
8
+
9
+ Searches for prompts_raw directory in current and parent directories. Once found, calls
10
+ create_python_file_with_texts() to generate the __init__.py file.
11
+ """
12
+ current_file_path = os.path.abspath(__file__)
13
+ current_folder_path = os.path.dirname(current_file_path)
14
+ folder_path = os.path.join(current_folder_path, "prompts_raw")
15
+
16
+ # If prompts_raw not found in current directory, search parent directories
17
+ if not os.path.exists(folder_path):
18
+ parent_dir = current_folder_path
19
+ while parent_dir != os.path.dirname(parent_dir): # Stop at root directory
20
+ parent_dir = os.path.dirname(parent_dir)
21
+ test_path = os.path.join(parent_dir, "prompts_raw")
22
+ if os.path.exists(test_path):
23
+ folder_path = test_path
24
+ break
25
+
26
+ output_file = os.path.join(folder_path, "__init__.py")
27
+ create_python_file_with_texts(folder_path, output_file)
28
+
29
+
30
+ def create_python_file_with_texts(folder_path, output_file):
31
+ """
32
+ Creates a Python file containing prompt texts from .txt files.
33
+
34
+ Args:
35
+ folder_path (str): Path to directory containing prompt .txt files
36
+ output_file (str): Path where the output __init__.py file will be created
37
+
38
+ The function reads all .txt files in the given folder, converts their contents into
39
+ Python variables, and writes them to the output file. Variable names are derived from
40
+ file paths with special characters replaced.
41
+ """
42
+ with open(output_file, 'w', encoding='utf-8') as out_file:
43
+ out_file.write("# This file is generated automatically through parse_prompt.py\n\n")
44
+ txt_files = [file for root, dirs, files in os.walk(folder_path) for file in files if file.endswith(".txt")]
45
+ for file in tqdm(txt_files, desc="Processing files"):
46
+ file_path = os.path.join(folder_path, file)
47
+ var_name = "_" + file_path.replace(folder_path, "").replace(os.sep, "_").replace(".txt", "").strip("_")
48
+ with open(file_path, 'r', encoding='utf-8') as f:
49
+ content = f.read().replace('"""', '\"\"\"')
50
+ out_file.write(f'{var_name} = """{content}"""\n\n')
51
+
52
+
53
+ if __name__ == "__main__":
54
+ call_parse_prompt()
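
As a quick illustration of the naming rule used in create_python_file_with_texts above, the following standalone sketch reproduces how a .txt file name becomes a module-level variable (the folder and file names are examples):

```python
import os

# Example: eval_suite/prompts_raw/image_eval.txt -> variable "_image_eval"
folder_path = os.path.join("eval_suite", "prompts_raw")
file_path = os.path.join(folder_path, "image_eval.txt")
var_name = "_" + file_path.replace(folder_path, "").replace(os.sep, "_").replace(".txt", "").strip("_")
print(var_name)  # -> _image_eval
```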
eval_suite/prompts_raw/__init__.py ADDED
@@ -0,0 +1,145 @@
1
+ # This file is generated automatically through parse_prompt.py
2
+
3
+ _video_eval_new = """# Task: Video Frame Quality Evaluation
4
+
5
+ You are tasked with analyzing and scoring a chunk of a theorem explanation video. Note that you may not have the full context of the video. Your job is to assign a score from 1 to 5 for each criterion. Please provide a brief justification for your scores.
6
+
7
+ ## Evaluation Criteria
8
+
9
+ 1. **Visual Consistency**
10
+ - Style Consistency: Does the visual style remain consistent across frames?
11
+ - Smoothness: Are the motions and transitions smooth?
12
+
13
+ ## Scoring Instructions
14
+ 1. Assign a score from **1 to 5** for each dimension:
15
+ - **1**: Very poor quality, completely fails to meet the criteria.
16
+ - **2**: Below average, significant issues present.
17
+ - **3**: Acceptable, meets the basic criteria with minor issues.
18
+ - **4**: Good, performs well with no major issues.
19
+ - **5**: Excellent, fully meets or exceeds expectations.
20
+ 2. Provide a comprehensive evaluation for each dimension.
21
+ 3. Format your output in **JSON**
22
+
23
+ ### JSON Output Format
24
+ ```json
25
+ {{
26
+ "overall_analysis": "[Provide a general assessment of the video's quality]",
27
+ "evaluation": {{
28
+ "visual_consistency": {{
29
+ "comprehensive_evaluation": "[Analysis of visual consistency]",
30
+ "score": [1-5]
31
+ }}
32
+ }}
33
+ }}
34
+ ```
35
+
36
+ Description of the theorem:
37
+ {description}
38
+
39
+ Video chunk:"""
40
+
41
+ _text_eval_new = """You are a specialist in evaluating theorem explanation videos, known for giving clear and objective feedback. You will be given the transcript of a video. Your task is to evaluate and score the content of the video in several dimensions.
42
+
43
+ ### Task Objective
44
+ 1. Perform an overall analysis of the video.
45
+ * Identify the topic of the video.
46
+ * Note your general thoughts and impression of the video, and any findings and observations.
47
+ 2. Conduct a comprehensive evaluation and score each criterion in the given dimensions.
48
+ * Analyze how well or poorly the video meets each criterion.
49
+ * Assign a score from **1 to 5** for each dimension:
50
+ - **1**: Very poor quality, completely fails to meet the criteria.
51
+ - **2**: Below average, significant issues present.
52
+ - **3**: Acceptable, meets the basic criteria with minor issues.
53
+ - **4**: Good, performs well with no major issues.
54
+ - **5**: Excellent, fully meets or exceeds expectations.
55
+ 3. Output the results in the specified JSON format.
56
+
57
+ ### Evaluation Criteria
58
+ 1. **Accuracy and Depth**
59
+ - Does the narration explain the theorem accurately?
60
+ - Does the video provide intuitive and/or rigorous explanations for why the theorem holds?
61
+ 2. **Logical Flow**
62
+ - Does the video follow a clear and logical structure?
63
+ - Does the video present a coherent buildup of ideas?
64
+
65
+ ### Notes
66
+ * You do not have access to the visual portion of the video as you are given only the textual portion. Do not reference or commentate on the visuals as they will be evaluated separately - just assume that there are reasonable visuals (e.g., geometric objects, graphs of functions, and calculations) to accompany the narration.
67
+ * The evaluation criteria are intended to be independent of each other. Do not restate the same violation in multiple criteria; only consider it in the most relevant criterion.
68
+
69
+ ### Output Format
70
+ ```json
71
+ {{
72
+ "overall_analysis": "[Overall analysis]",
73
+ "evaluation": {{
74
+ "accuracy_and_depth": {{
75
+ "comprehensive_evaluation": "[Analysis of accuracy and depth]",
76
+ "score": [1-5]
77
+ }},
78
+ "logical_flow": {{
79
+ "comprehensive_evaluation": "[Analysis of logical flow]",
80
+ "score": [1-5]
81
+ }}
82
+ }}
83
+ }}
84
+ ```
85
+
86
+ The transcript of the video is as follows:
87
+ {transcript}
88
+ """
89
+
90
+ _fix_transcript = """You are an expert in YouTube video transcripts. There is a transcript that was automatically generated through YouTube, so it lacks proper capitalization and punctuation. Your task is to fix the transcript so that there is proper punctuation, capitalization, and spacing. Do not make other modifications (e.g., keep the original word choice).
91
+
92
+ You should enclose the fixed transcript with a <SCRIPT></SCRIPT> block, i.e.:
93
+ <SCRIPT>
94
+ (Fixed transcript here)
95
+ </SCRIPT>
96
+
97
+ Original transcript: {transcript}
98
+ """
99
+
100
+ _image_eval = """# Task: Video Frame Quality Evaluation
101
+
102
+ You are tasked with analyzing and scoring a frame taken from a theorem explanation video. Note that you may not have the context of the video, so the captured frame may be a frame where some motion of visual elements is taking place. Your job is to assign a score from 1 to 5 for each criterion. Please provide a brief justification for your scores.
103
+
104
+ ## Evaluation Criteria
105
+
106
+ 1. **Visual Relevance**
107
+ - Does the video frame align with the theorem's concepts and derivations?
108
+
109
+ 2. **Element Layout**
110
+ - Placement and Size: Are the visual elements well-placed and appropriately sized within the frame?
111
+ - Overlap: Are the visual elements free of unintentional overlap?
112
+ - Clarity: Is the visual information conveyed in the frame clear and easy to understand?
113
+
114
+ ## Scoring Instructions
115
+ 1. Assign a score from **1 to 5** for each dimension:
116
+ - **1**: Very poor quality, completely fails to meet the criteria.
117
+ - **2**: Below average, significant issues present.
118
+ - **3**: Acceptable, meets the basic criteria with minor issues.
119
+ - **4**: Good, performs well with no major issues.
120
+ - **5**: Excellent, fully meets or exceeds expectations.
121
+ 2. Provide a comprehensive evaluation for each dimension.
122
+ 3. Format your output in **JSON**
123
+
124
+ ### JSON Output Format
125
+ ```json
126
+ {{
127
+ "overall_analysis": "[Provide a general assessment of the image's quality]",
128
+ "evaluation": {{
129
+ "visual_relevance": {{
130
+ "comprehensive_evaluation": "[Analysis of visual relevance]",
131
+ "score": [1-5]
132
+ }},
133
+ "element_layout": {{
134
+ "comprehensive_evaluation": "[Analysis of element layout]",
135
+ "score": [1-5]
136
+ }}
137
+ }}
138
+ }}
139
+ ```
140
+
141
+ Description of the theorem:
142
+ {description}
143
+
144
+ Image:"""
145
+
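
Note that the templates above escape literal JSON braces as {{ and }} so that str.format() only fills the named placeholders; a small self-contained sketch of that behaviour (the template text here is illustrative, not one of the real prompts):

```python
template = '{{"evaluation": {{"score": [1-5]}}}}\nDescription: {description}'
print(template.format(description="Bayes' theorem"))
# {"evaluation": {"score": [1-5]}}
# Description: Bayes' theorem
```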
eval_suite/prompts_raw/fix_transcript.txt ADDED
@@ -0,0 +1,8 @@
1
+ You are an expert in YouTube video transcripts. There is a transcript that was automatically generated through YouTube, so it lacks proper capitalization and punctuation. Your task is to fix the transcript so that there is proper punctuation, capitalization, and spacing. Do not make other modifications (e.g., keep the original word choice).
2
+
3
+ You should enclose the fixed transcript with a <SCRIPT></SCRIPT> block, i.e.:
4
+ <SCRIPT>
5
+ (Fixed transcript here)
6
+ </SCRIPT>
7
+
8
+ Original transcript: {transcript}
eval_suite/prompts_raw/image_eval.txt ADDED
@@ -0,0 +1,45 @@
1
+ # Task: Video Frame Quality Evaluation
2
+
3
+ You are tasked with analyzing and scoring a frame taken from a theorem explanation video. Note that you may not have the context of the video, so the captured frame may be a frame where some motion of visual elements is taking place. Your job is to assign a score from 1 to 5 for each criterion. Please provide a brief justification for your scores.
4
+
5
+ ## Evaluation Criteria
6
+
7
+ 1. **Visual Relevance**
8
+ - Does the video frame align with the theorem's concepts and derivations?
9
+
10
+ 2. **Element Layout**
11
+ - Placement and Size: Are the visual elements well-placed and appropriately sized within the frame?
12
+ - Overlap: Are the visual elements free of unintentional overlap?
13
+ - Clarity: Is the visual information conveyed in the frame clear and easy to understand?
14
+
15
+ ## Scoring Instructions
16
+ 1. Assign a score from **1 to 5** for each dimension:
17
+ - **1**: Very poor quality, completely fails to meet the criteria.
18
+ - **2**: Below average, significant issues present.
19
+ - **3**: Acceptable, meets the basic criteria with minor issues.
20
+ - **4**: Good, performs well with no major issues.
21
+ - **5**: Excellent, fully meets or exceeds expectations.
22
+ 2. Provide a comprehensive evaluation for each dimension.
23
+ 3. Format your output in **JSON**
24
+
25
+ ### JSON Output Format
26
+ ```json
27
+ {{
28
+ "overall_analysis": "[Provide a general assessment of the image's quality]",
29
+ "evaluation": {{
30
+ "visual_relevance": {{
31
+ "comprehensive_evaluation": "[Analysis of visual relevance]",
32
+ "score": [1-5]
33
+ }},
34
+ "element_layout": {{
35
+ "comprehensive_evaluation": "[Analysis of element layout]",
36
+ "score": [1-5]
37
+ }}
38
+ }}
39
+ }}
40
+ ```
41
+
42
+ Description of the theorem:
43
+ {description}
44
+
45
+ Image:
eval_suite/prompts_raw/text_eval_new.txt ADDED
@@ -0,0 +1,47 @@
1
+ You are a specialist in evaluating theorem explanation videos, known for giving clear and objective feedback. You will be given the transcript of a video. Your task is to evaluate and score the content of the video in several dimensions.
2
+
3
+ ### Task Objective
4
+ 1. Perform an overall analysis of the video.
5
+ * Identify the topic of the video.
6
+ * Note your general thoughts and impression of the video, and any findings and observations.
7
+ 2. Conduct a comprehensive evaluation and score each criterion in the given dimensions.
8
+ * Analyze how well or poorly the video meets each criterion.
9
+ * Assign a score from **1 to 5** for each dimension:
10
+ - **1**: Very poor quality, completely fails to meet the criteria.
11
+ - **2**: Below average, significant issues present.
12
+ - **3**: Acceptable, meets the basic criteria with minor issues.
13
+ - **4**: Good, performs well with no major issues.
14
+ - **5**: Excellent, fully meets or exceeds expectations.
15
+ 3. Output the results in the specified JSON format.
16
+
17
+ ### Evaluation Criteria
18
+ 1. **Accuracy and Depth**
19
+ - Does the narration explain the theorem accurately?
20
+ - Does the video provide intuitive and/or rigorous explanations for why the theorem holds?
21
+ 2. **Logical Flow**
22
+ - Does the video follow a clear and logical structure?
23
+ - Does the video present a coherent buildup of ideas?
24
+
25
+ ### Notes
26
+ * You do not have access to the visual portion of the video as you are given only the textual portion. Do not reference or commentate on the visuals as they will be evaluated separately - just assume that there are reasonable visuals (e.g., geometric objects, graphs of functions, and calculations) to accompany the narration.
27
+ * The evaluation criteria are intended to be independent of each other. Do not restate the same violation in multiple criteria; only consider it in the most relevant criterion.
28
+
29
+ ### Output Format
30
+ ```json
31
+ {{
32
+ "overall_analysis": "[Overall analysis]",
33
+ "evaluation": {{
34
+ "accuracy_and_depth": {{
35
+ "comprehensive_evaluation": "[Analysis of accuracy and depth]",
36
+ "score": [1-5]
37
+ }},
38
+ "logical_flow": {{
39
+ "comprehensive_evaluation": "[Analysis of logical flow]",
40
+ "score": [1-5]
41
+ }}
42
+ }}
43
+ }}
44
+ ```
45
+
46
+ The transcript of the video is as follows:
47
+ {transcript}
eval_suite/prompts_raw/video_eval_new.txt ADDED
@@ -0,0 +1,37 @@
1
+ # Task: Video Frame Quality Evaluation
2
+
3
+ You are tasked with analyzing and scoring a chunk of a theorem explanation video. Note that you may not have the full context of the video. Your job is to assign a score from 1 to 5 for each criterion. Please provide a brief justification for your scores.
4
+
5
+ ## Evaluation Criteria
6
+
7
+ 1. **Visual Consistency**
8
+ - Style Consistency: Does the visual style remain consistent across frames?
9
+ - Smoothness: Are the motions and transitions smooth?
10
+
11
+ ## Scoring Instructions
12
+ 1. Assign a score from **1 to 5** for each dimension:
13
+ - **1**: Very poor quality, completely fails to meet the criteria.
14
+ - **2**: Below average, significant issues present.
15
+ - **3**: Acceptable, meets the basic criteria with minor issues.
16
+ - **4**: Good, performs well with no major issues.
17
+ - **5**: Excellent, fully meets or exceeds expectations.
18
+ 2. Provide a comprehensive evaluation for each dimension.
19
+ 3. Format your output in **JSON**
20
+
21
+ ### JSON Output Format
22
+ ```json
23
+ {{
24
+ "overall_analysis": "[Provide a general assessment of the video's quality]",
25
+ "evaluation": {{
26
+ "visual_consistency": {{
27
+ "comprehensive_evaluation": "[Analysis of visual consistency]",
28
+ "score": [1-5]
29
+ }}
30
+ }}
31
+ }}
32
+ ```
33
+
34
+ Description of the theorem:
35
+ {description}
36
+
37
+ Video chunk:
eval_suite/text_utils.py ADDED
@@ -0,0 +1,80 @@
1
+ from typing import Union
2
+
3
+ import pysrt
4
+
5
+ from mllm_tools.litellm import LiteLLMWrapper
6
+ from mllm_tools.gemini import GeminiWrapper
7
+ from mllm_tools.utils import _prepare_text_inputs
8
+ from eval_suite.prompts_raw import _fix_transcript, _text_eval_new
9
+ from eval_suite.utils import extract_json, convert_score_fields
10
+
11
+
12
+ def parse_srt_to_text(srt_path) -> str:
13
+ """
14
+ Parse an SRT subtitle file into plain text.
15
+
16
+ Args:
17
+ srt_path: Path to the SRT subtitle file.
18
+
19
+ Returns:
20
+ str: The subtitle text with duplicates removed and ellipses replaced.
21
+ """
22
+ subs = pysrt.open(srt_path)
23
+ full_text = []
24
+ for sub in subs:
25
+ sub.text = sub.text.replace("...", ".")
26
+ for line in sub.text.splitlines():
27
+ # .srt can contain repeated lines
28
+ if full_text and full_text[-1] == line:
29
+ continue
30
+ full_text.append(line)
31
+ return "\n".join(full_text)
32
+
33
+
34
+ def fix_transcript(text_eval_model: Union[LiteLLMWrapper, GeminiWrapper], transcript: str) -> str:
35
+ """
36
+ Fix and clean up a transcript using an LLM model.
37
+
38
+ Args:
39
+ text_eval_model: The LLM model wrapper to use for fixing the transcript.
40
+ transcript: The input transcript text to fix.
41
+
42
+ Returns:
43
+ str: The fixed and cleaned transcript text.
44
+ """
45
+ print("Fixing transcript...")
46
+
47
+ prompt = _fix_transcript.format(transcript=transcript)
48
+ response = text_eval_model(_prepare_text_inputs(prompt))
49
+ fixed_script = response.split("<SCRIPT>", maxsplit=1)[1].split("</SCRIPT>")[0]
50
+
51
+ return fixed_script
52
+
53
+
54
+ def evaluate_text(text_eval_model: LiteLLMWrapper, transcript: str, retry_limit: int) -> dict:
55
+ """
56
+ Evaluate transcript text using an LLM model with retry logic.
57
+
58
+ Args:
59
+ text_eval_model: The LLM model wrapper to use for evaluation.
60
+ transcript: The transcript text to evaluate.
61
+ retry_limit: Maximum number of retry attempts on failure.
62
+
63
+ Returns:
64
+ dict: The evaluation results as a JSON object.
65
+
66
+ Raises:
67
+ ValueError: If all retry attempts fail.
68
+ """
69
+ # prompt = _text_eval.format(transcript=transcript)
70
+ prompt = _text_eval_new.format(transcript=transcript)
71
+ for attempt in range(retry_limit):
72
+ try:
73
+ evaluation = text_eval_model(_prepare_text_inputs(prompt))
74
+ evaluation_json = extract_json(evaluation)
75
+ evaluation_json = convert_score_fields(evaluation_json)
76
+ return evaluation_json
77
+ except Exception as e:
78
+ print(f"Attempt {attempt + 1} failed: {e.__class__.__name__}: {e}")
79
+ if attempt + 1 == retry_limit:
80
+ raise ValueError("Reached maximum retry limit. Evaluation failed.") from None
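
A hypothetical end-to-end use of the helpers above, from raw subtitles to scores; the .srt path and model name are placeholders.

```python
from mllm_tools.litellm import LiteLLMWrapper
from eval_suite.text_utils import parse_srt_to_text, fix_transcript, evaluate_text

model = LiteLLMWrapper(model_name="gpt-4o")

raw_text = parse_srt_to_text("output/theorem_video.srt")   # de-duplicated subtitle text
clean_text = fix_transcript(model, raw_text)               # punctuation and capitalization restored
scores = evaluate_text(model, clean_text, retry_limit=3)   # JSON scores for the text criteria
print(scores["evaluation"]["accuracy_and_depth"]["score"])
```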
eval_suite/utils.py ADDED
@@ -0,0 +1,81 @@
1
+ import json
2
+ import re
3
+ from math import prod
4
+ from typing import List
5
+
6
+ def extract_json(response: str) -> dict:
7
+ """
8
+ Extract JSON content from a string response.
9
+
10
+ Args:
11
+ response (str): String containing JSON content, possibly within code blocks.
12
+
13
+ Returns:
14
+ dict: Extracted and parsed JSON content.
15
+
16
+ Raises:
17
+ ValueError: If no valid JSON content could be extracted.
18
+ """
19
+ try:
20
+ evaluation_json = json.loads(response)
21
+ except json.JSONDecodeError:
22
+ # If JSON parsing fails, try to extract the content between ```json and ```
23
+ match = re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
24
+ if not match:
25
+ # If no match for ```json, try to extract content between ``` and ```
26
+ match = re.search(r'```\n(.*?)\n```', response, re.DOTALL)
27
+
28
+ if match:
29
+ evaluation_content = match.group(1)
30
+ evaluation_json = json.loads(evaluation_content)
31
+ else:
32
+ raise ValueError("Failed to extract valid JSON content")
33
+ return evaluation_json
34
+
35
+
36
+ def convert_score_fields(data: dict) -> dict:
37
+ """
38
+ Convert score fields in a dictionary to integers recursively.
39
+
40
+ Args:
41
+ data (dict): Dictionary containing score fields to convert.
42
+
43
+ Returns:
44
+ dict: Dictionary with score fields converted to integers.
45
+
46
+ Raises:
47
+ ValueError: If a score value cannot be converted to integer.
48
+ """
49
+ # Create a new dictionary with the converted values
50
+ converted_data = {}
51
+ for key, value in data.items():
52
+ if key == "score":
53
+ if isinstance(value, int):
54
+ converted_data[key] = value
55
+ elif isinstance(value, str) and value.isdigit():
56
+ converted_data[key] = int(value)
57
+ else:
58
+ raise ValueError(f"Invalid score value: {value!r}")
59
+ elif isinstance(value, dict):
60
+ converted_data[key] = convert_score_fields(value)
61
+ else:
62
+ converted_data[key] = value
63
+ return converted_data
64
+
65
+
66
+ def calculate_geometric_mean(scores: List[int]) -> float:
67
+ """
68
+ Calculate the geometric mean of a list of scores.
69
+
70
+ Args:
71
+ scores (List[int]): List of integer scores, may contain None values.
72
+
73
+ Returns:
74
+ float: Geometric mean of non-None scores. Returns 0.0 if list is empty
75
+ or contains only None values.
76
+ """
77
+ scores = [s for s in scores if s is not None]
78
+ if not scores:
79
+ return 0.0
80
+ product = prod(scores)
81
+ return product ** (1 / len(scores))
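
A quick illustration of the three helpers above on a hand-written model response; the values are made up.

```python
from eval_suite.utils import extract_json, convert_score_fields, calculate_geometric_mean

raw = '```json\n{"evaluation": {"visual_relevance": {"score": "4"}}}\n```'
parsed = convert_score_fields(extract_json(raw))
print(parsed)                                  # {'evaluation': {'visual_relevance': {'score': 4}}}
print(calculate_geometric_mean([4, 2, None]))  # sqrt(4 * 2) ≈ 2.83; None entries are dropped
```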
eval_suite/video_utils.py ADDED
@@ -0,0 +1,167 @@
1
+ import os
2
+ import cv2
3
+ import tempfile
4
+
5
+ from dotenv import load_dotenv
6
+
7
+ from mllm_tools.utils import _prepare_text_video_inputs
8
+ from eval_suite.prompts_raw import _video_eval_new
9
+ from eval_suite.utils import extract_json, convert_score_fields
10
+
11
+ load_dotenv()
12
+
13
+
14
+ def reduce_video_framerate(input_path, target_fps=1, output_path=None):
15
+ """
16
+ Reduces the frame rate of a video by only keeping frames at the target interval.
17
+
18
+ Args:
19
+ input_path (str): Path to the input video
20
+ target_fps (int): Target frames per second (default: 1)
21
+ output_path (str, optional): Path to save the processed video. If None, uses a temporary file.
22
+
23
+ Returns:
24
+ str: Path to the processed video
25
+
26
+ Raises:
27
+ ValueError: If input video cannot be opened or has invalid FPS
28
+ RuntimeError: If video writer initialization fails or output video creation fails
29
+ """
30
+ cap = cv2.VideoCapture(input_path)
31
+ if not cap.isOpened():
32
+ raise ValueError(f"Could not open input video: {input_path}")
33
+
34
+ original_fps = cap.get(cv2.CAP_PROP_FPS)
35
+ if original_fps <= 0:
36
+ raise ValueError(f"Invalid FPS ({original_fps}) detected in input video")
37
+
38
+ frame_interval = max(1, int(original_fps / target_fps))  # avoid a zero interval when original_fps < target_fps
39
+
40
+ # Get video properties
41
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
42
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
43
+
44
+ # Use provided output path or create temporary file
45
+ if output_path is None:
46
+ temp_output = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
47
+ output_path = temp_output.name
48
+
49
+ # Ensure output directory exists
50
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
51
+
52
+ # Try different codecs in order of preference
53
+ codecs = [
54
+ ('avc1', '.mp4'), # H.264 codec
55
+ ('mp4v', '.mp4'), # MP4V codec
56
+ ('XVID', '.avi'), # XVID codec
57
+ ('MJPG', '.avi'), # Motion JPEG codec
58
+ ]
59
+
60
+ success = False
61
+ for codec, ext in codecs:
62
+ if output_path.endswith('.mp4') and not ext.endswith('.mp4'):
63
+ # If we're switching to AVI format, change the extension
64
+ output_path = output_path[:-4] + ext
65
+
66
+ fourcc = cv2.VideoWriter_fourcc(*codec)
67
+ out = cv2.VideoWriter(output_path, fourcc, target_fps, (width, height))
68
+
69
+ if out.isOpened():
70
+ success = True
71
+ print(f"Successfully initialized video writer with codec: {codec}")
72
+ break
73
+ else:
74
+ out.release()
75
+ if os.path.exists(output_path):
76
+ os.remove(output_path)
77
+
78
+ if not success:
79
+ raise RuntimeError("Could not initialize video writer with any available codec")
80
+
81
+ frame_count = 0
82
+ frames_written = 0
83
+ while cap.isOpened():
84
+ ret, frame = cap.read()
85
+ if not ret:
86
+ break
87
+
88
+ # Only write frames at the specified interval
89
+ if frame_count % frame_interval == 0:
90
+ out.write(frame)
91
+ frames_written += 1
92
+ frame_count += 1
93
+
94
+ cap.release()
95
+ out.release()
96
+
97
+ # Verify the output
98
+ verify_cap = cv2.VideoCapture(output_path)
99
+ if not verify_cap.isOpened():
100
+ raise RuntimeError(f"Failed to create output video at {output_path}")
101
+
102
+ actual_fps = verify_cap.get(cv2.CAP_PROP_FPS)
103
+ total_frames = verify_cap.get(cv2.CAP_PROP_FRAME_COUNT)
104
+ verify_cap.release()
105
+
106
+ if actual_fps <= 0:
107
+ print("Warning: Output video reports invalid FPS. This might be a codec issue.")
108
+ actual_fps = target_fps # Use target FPS for duration calculation
109
+
110
+ print(f"Created video with {frames_written} frames at {actual_fps} FPS")
111
+ print(f"Total duration: {total_frames/actual_fps:.2f} seconds")
112
+ print(f"Video saved to: {output_path}")
113
+
114
+ return output_path
115
+
116
+
117
+ def evaluate_video_chunk_new(model, video_path, transcript="No transcript provided", description="No description provided",
118
+ save_processed_video=None, target_fps=None, retry_limit=5):
119
+ """
120
+ Evaluate a single video chunk using a multimodal model.
121
+
122
+ Args:
123
+ model: The multimodal model to use for evaluation
124
+ video_path (str): Path to the video file to evaluate
125
+ transcript (str, optional): Video transcript text. Defaults to "No transcript provided"
126
+ description (str, optional): Video description text. Defaults to "No description provided"
127
+ save_processed_video (str, optional): Path to save processed video. If None, uses temporary file
128
+ target_fps (int, optional): Target frames per second for video processing. If None, no processing
129
+ retry_limit (int, optional): Maximum number of retry attempts. Defaults to 5
130
+
131
+ Returns:
132
+ dict: Evaluation results as a JSON object with scores converted to integers
133
+
134
+ Raises:
135
+ FileNotFoundError: If video file does not exist
136
+ Exception: If evaluation fails after all retry attempts
137
+ """
138
+ if not os.path.exists(video_path):
139
+ raise FileNotFoundError(f"Video file not found: {video_path}")
140
+
141
+ # Only process video if target_fps is specified
142
+ if target_fps is not None:
143
+ processed_video_path = reduce_video_framerate(video_path, target_fps=target_fps, output_path=save_processed_video)
144
+ video_to_use = processed_video_path
145
+ else:
146
+ video_to_use = video_path
147
+
148
+ prompt = _video_eval_new.format(description=description)
149
+ inputs = _prepare_text_video_inputs(prompt, video_to_use)
150
+
151
+ try:
152
+ for attempt in range(retry_limit):
153
+ try:
154
+ response = model(inputs)
155
+ response_json = extract_json(response)
156
+ response_json = convert_score_fields(response_json)
157
+
158
+ return response_json
159
+ except Exception as e:
160
+ print(f"Attempt {attempt + 1} failed: {e}")
161
+ if attempt + 1 == retry_limit:
162
+ print("Reached maximum retry limit. Evaluation failed.")
163
+ raise
164
+ finally:
165
+ # Clean up the temporary processed video if we created one
166
+ if target_fps is not None and save_processed_video is None and os.path.exists(processed_video_path):
167
+ os.unlink(processed_video_path)
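
A minimal sketch of scoring one rendered chunk with the function above, assuming a video-capable model wrapper; the file path is a placeholder.

```python
from mllm_tools.gemini import GeminiWrapper
from eval_suite.video_utils import evaluate_video_chunk_new

model = GeminiWrapper(model_name="gemini-1.5-pro-002")
result = evaluate_video_chunk_new(
    model,
    "output/scene1/chunk_000.mp4",
    description="Pythagorean theorem",
    target_fps=1,       # downsample before upload; omit (None) to send the original file
    retry_limit=3,
)
print(result["evaluation"]["visual_consistency"]["score"])
```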
mllm_tools/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Empty file to make this directory a Python package
mllm_tools/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (154 Bytes).
mllm_tools/__pycache__/gemini.cpython-312.pyc ADDED
Binary file (8.04 kB).
mllm_tools/__pycache__/litellm.cpython-312.pyc ADDED
Binary file (7.58 kB).
mllm_tools/__pycache__/openai.cpython-312.pyc ADDED
Binary file (21.9 kB).
mllm_tools/__pycache__/openrouter.cpython-312.pyc ADDED
Binary file (11.1 kB).
mllm_tools/__pycache__/utils.cpython-312.pyc ADDED
Binary file (7.32 kB).
mllm_tools/__pycache__/vertex_ai.cpython-312.pyc ADDED
Binary file (3.64 kB).
mllm_tools/gemini.py ADDED
@@ -0,0 +1,176 @@
1
+ from typing import List, Dict, Any, Union, Optional
2
+ import io
3
+ import os
4
+ import base64
5
+ from PIL import Image
6
+ import mimetypes
7
+ import google.generativeai as genai
8
+ import tempfile
9
+ import time
10
+ from urllib.parse import urlparse
11
+ import requests
12
+ from io import BytesIO
13
+
14
+ class GeminiWrapper:
15
+ """Wrapper for Gemini to support multiple models and logging"""
16
+
17
+ def __init__(
18
+ self,
19
+ model_name: str = "gemini-1.5-pro-002",
20
+ temperature: float = 0.7,
21
+ print_cost: bool = False,
22
+ verbose: bool = False,
23
+ use_langfuse: bool = False
24
+ ):
25
+ """
26
+ Initialize the Gemini wrapper
27
+
28
+ Args:
29
+ model_name: Name of the model to use
30
+ temperature: Temperature for completion
31
+ print_cost: Whether to print the cost of the completion
32
+ verbose: Whether to print verbose output
33
+ use_langfuse: Whether to enable Langfuse logging
34
+ """
35
+ self.model_name = model_name.split('/')[-1] if '/' in model_name else model_name
36
+ self.temperature = temperature
37
+ self.print_cost = print_cost
38
+ self.verbose = verbose
39
+ self.accumulated_cost = 0
40
+
41
+ api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
42
+ if not api_key:
43
+ raise ValueError("No API_KEY found. Please set the `GEMINI_API_KEY` or `GOOGLE_API_KEY` environment variable.")
44
+ genai.configure(api_key=api_key)
45
+
46
+ generation_config = {
47
+ "temperature": self.temperature,
48
+ "top_p": 0.95,
49
+ "response_mime_type": "text/plain",
50
+ }
51
+ safety_settings = [
52
+ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
53
+ {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
54
+ {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
55
+ {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
56
+ ]
57
+ self.model = genai.GenerativeModel(
58
+ model_name=self.model_name,
59
+ safety_settings=safety_settings,
60
+ generation_config=generation_config,
61
+ )
62
+
63
+ def _get_mime_type(self, file_path: str) -> str:
64
+ """
65
+ Get the MIME type of a file based on its extension
66
+
67
+ Args:
68
+ file_path: Path to the file
69
+
70
+ Returns:
71
+ MIME type as a string (e.g., "image/jpeg", "audio/mp3")
72
+ """
73
+ mime_type, _ = mimetypes.guess_type(file_path)
74
+ if mime_type is None:
75
+ raise ValueError(f"Unsupported file type: {file_path}")
76
+ return mime_type
77
+
78
+ def _download_file(self, url: str) -> str:
79
+ """
80
+ Download a file from a URL and save it as a temporary file
81
+
82
+ Args:
83
+ url: URL of the file to download
84
+
85
+ Returns:
86
+ Path to the temporary file
87
+ """
88
+ response = requests.get(url)
89
+ if response.status_code == 200:
90
+ temp_file = tempfile.NamedTemporaryFile(delete=False)
91
+ temp_file.write(response.content)
92
+ temp_file.close()
93
+ return temp_file.name
94
+ else:
95
+ raise ValueError(f"Failed to download file from URL: {url}")
96
+
97
+ def _save_image_to_temp(self, image: Image.Image) -> str:
98
+ """
99
+ Save a PIL Image to a temporary file
100
+
101
+ Args:
102
+ image: PIL Image object
103
+
104
+ Returns:
105
+ Path to the temporary file
106
+ """
107
+ temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
108
+ image.save(temp_file, format="PNG")
109
+ temp_file.close()
110
+ return temp_file.name
111
+
112
+ def _upload_to_gemini(self, file_path: str, mime_type: Optional[str] = None):
113
+ """
114
+ Uploads the given file to Gemini.
115
+
116
+ Args:
117
+ file_path: Path to the file
118
+ mime_type: MIME type of the file
119
+
120
+ Returns:
121
+ Uploaded file object
122
+ """
123
+ return genai.upload_file(file_path, mime_type=mime_type)
124
+
125
+ def __call__(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None) -> str:
126
+ """
127
+ Process messages and return completion
128
+
129
+ Args:
130
+ messages: List of message dictionaries with 'type' and 'content' keys
131
+ metadata: Optional metadata to pass to Gemini completion
132
+
133
+ Returns:
134
+ Generated text response
135
+ """
136
+ contents = []
137
+ for msg in messages:
138
+ if msg["type"] == "text":
139
+ contents.append(msg["content"])
140
+ elif msg["type"] in ["image", "audio", "video"]:
141
+ if isinstance(msg["content"], Image.Image):
142
+ file_path = self._save_image_to_temp(msg["content"])
143
+ mime_type = "image/png"
144
+ elif isinstance(msg["content"], str):
145
+ if msg["content"].startswith("http"):
146
+ file_path = self._download_file(msg["content"])
147
+ mime_type = self._get_mime_type(msg["content"])
148
+ else:
149
+ file_path = msg["content"]
150
+ mime_type = self._get_mime_type(file_path)
151
+ else:
152
+ raise ValueError("Unsupported content type")
153
+
154
+ uploaded_file = self._upload_to_gemini(file_path, mime_type)
155
+
156
+ while uploaded_file.state.name == "PROCESSING":
157
+ print('.', end='')
158
+ time.sleep(3)
159
+ uploaded_file = genai.get_file(uploaded_file.name)
160
+ if uploaded_file.state.name == "FAILED":
161
+ raise ValueError(uploaded_file.state.name)
162
+ print("Upload successful")
163
+ contents.append(uploaded_file)
164
+ else:
165
+ raise ValueError("Unsupported message type")
166
+
167
+ response = self.model.generate_content(contents, request_options={"timeout": 600})
168
+ try:
169
+ return response.text
170
+ except Exception as e:
171
+ print(e)
172
+ print(response.prompt_feedback)
173
+ return str(response.prompt_feedback)
174
+
175
+ if __name__ == "__main__":
176
+ pass
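
For reference, the message format GeminiWrapper.__call__ expects; a sketch that assumes GEMINI_API_KEY or GOOGLE_API_KEY is set and that the image path exists.

```python
from mllm_tools.gemini import GeminiWrapper

model = GeminiWrapper(model_name="gemini-1.5-pro-002", temperature=0.7)
response = model([
    {"type": "text", "content": "Describe what this frame shows."},
    {"type": "image", "content": "frames/key_frame_1.jpg"},  # local path, URL, or PIL.Image
])
print(response)
```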
mllm_tools/github.py ADDED
@@ -0,0 +1,305 @@
1
+ # filepath: d:\Theory2Manim-2\Theory2Manim\mllm_tools\github.py
2
+ import json
3
+ import re
4
+ from typing import List, Dict, Any, Union, Optional
5
+ import io
6
+ import os
7
+ import base64
8
+ from PIL import Image
9
+ import mimetypes
10
+ import litellm
11
+ from litellm import completion, completion_cost
12
+ from dotenv import load_dotenv
13
+
14
+ load_dotenv()
15
+
16
+ class GitHubModelsWrapper:
17
+ """Wrapper for GitHub Models using LiteLLM to support multiple GitHub hosted models"""
18
+
19
+ def __init__(
20
+ self,
21
+ model_name: str = "github/gpt-4o",
22
+ temperature: float = 0.7,
23
+ print_cost: bool = False,
24
+ verbose: bool = False,
25
+ use_langfuse: bool = True,
26
+ github_token: Optional[str] = None
27
+ ):
28
+ """
29
+ Initialize the GitHub Models wrapper
30
+
31
+ Args:
32
+ model_name: Name of the GitHub model to use (e.g. "github/gpt-4o", "github/gpt-4o-mini",
33
+ "github/o1-preview", "github/claude-3-5-sonnet", "github/phi-3.5-mini-instruct")
34
+ temperature: Temperature for completion
35
+ print_cost: Whether to print the cost of the completion
36
+ verbose: Whether to print verbose output
37
+ use_langfuse: Whether to enable Langfuse logging
38
+ github_token: GitHub token for authentication (if not provided, will use GITHUB_TOKEN env var)
39
+ """
40
+ self.model_name = model_name
41
+ self.temperature = temperature
42
+ self.print_cost = print_cost
43
+ self.verbose = verbose
44
+ self.accumulated_cost = 0
45
+
46
+ # Set up GitHub token
47
+ self.github_token = github_token or os.getenv('GITHUB_TOKEN')
48
+ if not self.github_token:
49
+ raise ValueError("GitHub token is required. Please set GITHUB_TOKEN environment variable or pass github_token parameter.")
50
+
51
+ # Set environment variable for LiteLLM
52
+ os.environ['GITHUB_TOKEN'] = self.github_token
53
+
54
+ if self.verbose:
55
+ os.environ['LITELLM_LOG'] = 'DEBUG'
56
+
57
+ # Set langfuse callback only if enabled
58
+ if use_langfuse:
59
+ litellm.success_callback = ["langfuse"]
60
+ litellm.failure_callback = ["langfuse"]
61
+
62
+ def _encode_file(self, file_path: Union[str, Image.Image]) -> str:
63
+ """
64
+ Encode local file or PIL Image to base64 string
65
+
66
+ Args:
67
+ file_path: Path to local file or PIL Image object
68
+
69
+ Returns:
70
+ Base64 encoded file string
71
+ """
72
+ if isinstance(file_path, Image.Image):
73
+ buffered = io.BytesIO()
74
+ file_path.save(buffered, format="PNG")
75
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
76
+ else:
77
+ with open(file_path, "rb") as file:
78
+ return base64.b64encode(file.read()).decode("utf-8")
79
+
80
+ def _get_mime_type(self, file_path: str) -> str:
81
+ """
82
+ Get the MIME type of a file based on its extension
83
+
84
+ Args:
85
+ file_path: Path to the file
86
+
87
+ Returns:
88
+ MIME type as a string (e.g., "image/jpeg", "audio/mp3")
89
+ """
90
+ mime_type, _ = mimetypes.guess_type(file_path)
91
+ if mime_type is None:
92
+ raise ValueError(f"Unsupported file type: {file_path}")
93
+ return mime_type
94
+
95
+ def _supports_vision(self, model_name: str) -> bool:
96
+ """
97
+ Check if the model supports vision/image processing
98
+
99
+ Args:
100
+ model_name: Name of the model
101
+
102
+ Returns:
103
+ True if model supports vision, False otherwise
104
+ """
105
+ vision_models = [
106
+ "gpt-4o",
107
+ "gpt-4o-mini",
108
+ "claude-3-5-sonnet",
109
+ "claude-3-haiku"
110
+ ]
111
+
112
+ # Extract model name without the github/ prefix
113
+ clean_model_name = model_name.replace("github/", "")
114
+ return any(vision_model in clean_model_name for vision_model in vision_models)
115
+
116
+ def __call__(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None) -> str:
117
+ """
118
+ Process messages and return completion
119
+
120
+ Args:
121
+ messages: List of message dictionaries with 'type' and 'content' keys
122
+ metadata: Optional metadata to pass to litellm completion, e.g. for Langfuse tracking
123
+
124
+ Returns:
125
+ Generated text response
126
+ """
127
+ if metadata is None:
128
+ metadata = {}
129
+ metadata["trace_name"] = f"github-models-completion-{self.model_name}"
130
+
131
+ # Convert messages to LiteLLM format
132
+ formatted_messages = []
133
+
134
+ for msg in messages:
135
+ if msg["type"] == "text":
136
+ formatted_messages.append({
137
+ "role": "user",
138
+ "content": [{"type": "text", "text": msg["content"]}]
139
+ })
140
+ elif msg["type"] == "image":
141
+ # Check if model supports vision
142
+ if not self._supports_vision(self.model_name):
143
+ raise ValueError(f"Model {self.model_name} does not support image processing")
144
+
145
+ # Check if content is a local file path or PIL Image
146
+ if isinstance(msg["content"], Image.Image) or os.path.isfile(msg["content"]):
147
+ try:
148
+ if isinstance(msg["content"], Image.Image):
149
+ mime_type = "image/png"
150
+ else:
151
+ mime_type = self._get_mime_type(msg["content"])
152
+ base64_data = self._encode_file(msg["content"])
153
+ data_url = f"data:{mime_type};base64,{base64_data}"
154
+ except ValueError as e:
155
+ print(f"Error processing file {msg['content']}: {e}")
156
+ continue
157
+ else:
158
+ data_url = msg["content"]
159
+
160
+ # Format for vision-capable models
161
+ formatted_messages.append({
162
+ "role": "user",
163
+ "content": [
164
+ {
165
+ "type": "image_url",
166
+ "image_url": {
167
+ "url": data_url,
168
+ "detail": "high"
169
+ }
170
+ }
171
+ ]
172
+ })
173
+ else:
174
+ raise ValueError(f"Unsupported message type: {msg['type']}. GitHub models currently support 'text' and 'image' types.")
175
+
176
+ try:
177
+ # Check if it's an o-series model (like o1-preview, o1-mini)
178
+ if (re.match(r".*o1.*", self.model_name)):
179
+ # O-series models don't support temperature and have reasoning_effort
180
+ response = completion(
181
+ model=self.model_name,
182
+ messages=formatted_messages,
183
+ reasoning_effort="medium", # Options: "low", "medium", "high"
184
+ metadata=metadata,
185
+ max_retries=3
186
+ )
187
+ else:
188
+ response = completion(
189
+ model=self.model_name,
190
+ messages=formatted_messages,
191
+ temperature=self.temperature,
192
+ metadata=metadata,
193
+ max_retries=3
194
+ )
195
+
196
+ if self.print_cost:
197
+ try:
198
+ # Note: GitHub Models may not provide cost information
199
+ cost = completion_cost(completion_response=response)
200
+ if cost is not None:
201
+ self.accumulated_cost += cost
202
+ print(f"Cost: ${float(cost):.10f}")
203
+ print(f"Accumulated Cost: ${self.accumulated_cost:.10f}")
204
+ else:
205
+ print("Cost information not available for GitHub Models")
206
+ except Exception as e:
207
+ print(f"Could not calculate cost: {e}")
208
+
209
+ content = response.choices[0].message.content
210
+ if content is None:
211
+ print(f"Got null response from GitHub model. Full response: {response}")
212
+ return ""
213
+ return content
214
+
215
+ except Exception as e:
216
+ print(f"Error in GitHub model completion: {e}")
217
+ return str(e)
218
+
219
+ def create_github_model_wrapper(model_name: str = "github/gpt-4o", **kwargs) -> GitHubModelsWrapper:
220
+ """
221
+ Convenience function to create a GitHub Models wrapper
222
+
223
+ Args:
224
+ model_name: GitHub model name (e.g., "github/gpt-4o", "github/claude-3-5-sonnet")
225
+ **kwargs: Additional arguments passed to GitHubModelsWrapper
226
+
227
+ Returns:
228
+ Configured GitHubModelsWrapper instance
229
+
230
+ Example:
231
+ >>> # Create a wrapper for GPT-4o
232
+ >>> wrapper = create_github_model_wrapper("github/gpt-4o", temperature=0.3)
233
+ >>>
234
+ >>> # Use it for text generation
235
+ >>> response = wrapper([{"type": "text", "content": "Explain quantum computing"}])
236
+ >>>
237
+ >>> # Use it for vision (if model supports it)
238
+ >>> response = wrapper([
239
+ ... {"type": "text", "content": "What's in this image?"},
240
+ ... {"type": "image", "content": "path/to/image.jpg"}
241
+ ... ])
242
+ """
243
+ return GitHubModelsWrapper(model_name=model_name, **kwargs)
244
+
245
+ # Available GitHub Models (as of the documentation)
246
+ AVAILABLE_MODELS = {
247
+ # GPT Models
248
+ "gpt-4o": "github/gpt-4o",
249
+ "gpt-4o-mini": "github/gpt-4o-mini",
250
+ "o1-preview": "github/o1-preview",
251
+ "o1-mini": "github/o1-mini",
252
+ "gpt-4.1": "github/gpt-4.1",
253
+
254
+
255
+ # Phi Models
256
+ "phi-3-5-mini-instruct": "github/phi-3.5-mini-instruct",
257
+ "phi-3-5-moe-instruct": "github/phi-3.5-moe-instruct",
258
+
259
+ # Llama Models
260
+ "llama-3.1-405b-instruct": "github/llama-3.1-405b-instruct",
261
+ "llama-3.1-70b-instruct": "github/llama-3.1-70b-instruct",
262
+ "llama-3.1-8b-instruct": "github/llama-3.1-8b-instruct",
263
+
264
+ # Mistral Models
265
+ "mistral-large": "github/mistral-large",
266
+ "mistral-large-2407": "github/mistral-large-2407",
267
+ "mistral-nemo": "github/mistral-nemo",
268
+ "mistral-small": "github/mistral-small",
269
+
270
+ # Cohere Models
271
+ "cohere-command-r": "github/cohere-command-r",
272
+ "cohere-command-r-plus": "github/cohere-command-r-plus",
273
+
274
+ # AI21 Models
275
+ "ai21-jamba-1.5-large": "github/ai21-jamba-1.5-large",
276
+ "ai21-jamba-1.5-mini": "github/ai21-jamba-1.5-mini"
277
+ }
278
+
279
+ def list_available_models() -> Dict[str, str]:
280
+ """
281
+ Get a dictionary of available GitHub models
282
+
283
+ Returns:
284
+ Dictionary mapping friendly names to full model names
285
+ """
286
+ return AVAILABLE_MODELS.copy()
287
+
288
+ if __name__ == "__main__":
289
+ # Example usage
290
+ print("Available GitHub Models:")
291
+ for friendly_name, full_name in AVAILABLE_MODELS.items():
292
+ print(f" {friendly_name}: {full_name}")
293
+
294
+ # Example of creating a wrapper (requires GITHUB_TOKEN environment variable)
295
+ try:
296
+ wrapper = create_github_model_wrapper("github/gpt-4o-mini", temperature=0.3)
297
+ print("\nGitHub Models wrapper created successfully!")
298
+
299
+ # Test with a simple text prompt
300
+ response = wrapper([{"type": "text", "content": "Hello! Can you confirm you're working?"}])
301
+ print(f"Response: {response}")
302
+
303
+ except Exception as e:
304
+ print(f"Error creating wrapper: {e}")
305
+ print("Make sure to set GITHUB_TOKEN environment variable")
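
A hedged sketch of picking a model from AVAILABLE_MODELS and guarding image input with the vision check above; it assumes a valid GITHUB_TOKEN and that the image path exists.

```python
from mllm_tools.github import GitHubModelsWrapper, list_available_models

models = list_available_models()
wrapper = GitHubModelsWrapper(model_name=models["gpt-4o-mini"], temperature=0.3)

if wrapper._supports_vision(wrapper.model_name):
    reply = wrapper([
        {"type": "text", "content": "Summarise this diagram."},
        {"type": "image", "content": "frames/key_frame_1.jpg"},
    ])
else:
    reply = wrapper([{"type": "text", "content": "Summarise the Pythagorean theorem."}])
print(reply)
```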
mllm_tools/litellm.py ADDED
@@ -0,0 +1,193 @@
1
+ import json
2
+ import re
3
+ from typing import List, Dict, Any, Union, Optional
4
+ import io
5
+ import os
6
+ import base64
7
+ from PIL import Image
8
+ import mimetypes
9
+ import litellm
10
+ from litellm import completion, completion_cost
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv()
14
+
15
+ class LiteLLMWrapper:
16
+ """Wrapper for LiteLLM to support multiple models and logging"""
17
+
18
+ def __init__(
19
+ self,
20
+ model_name: str = "gpt-4-vision-preview",
21
+ temperature: float = 0.7,
22
+ print_cost: bool = False,
23
+ verbose: bool = False,
24
+ use_langfuse: bool = True,
25
+ ):
26
+ """
27
+ Initialize the LiteLLM wrapper
28
+
29
+ Args:
30
+ model_name: Name of the model to use (e.g. "azure/gpt-4", "vertex_ai/gemini-pro")
31
+ temperature: Temperature for completion
32
+ print_cost: Whether to print the cost of the completion
33
+ verbose: Whether to print verbose output
34
+ use_langfuse: Whether to enable Langfuse logging
35
+ """
36
+ self.model_name = model_name
37
+ self.temperature = temperature
38
+ self.print_cost = print_cost
39
+ self.verbose = verbose
40
+ self.accumulated_cost = 0
41
+
42
+ if self.verbose:
43
+ os.environ['LITELLM_LOG'] = 'DEBUG'
44
+
45
+ # Set langfuse callback only if enabled
46
+ if use_langfuse:
47
+ litellm.success_callback = ["langfuse"]
48
+ litellm.failure_callback = ["langfuse"]
49
+
50
+ def _encode_file(self, file_path: Union[str, Image.Image]) -> str:
51
+ """
52
+ Encode local file or PIL Image to base64 string
53
+
54
+ Args:
55
+ file_path: Path to local file or PIL Image object
56
+
57
+ Returns:
58
+ Base64 encoded file string
59
+ """
60
+ if isinstance(file_path, Image.Image):
61
+ buffered = io.BytesIO()
62
+ file_path.save(buffered, format="PNG")
63
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
64
+ else:
65
+ with open(file_path, "rb") as file:
66
+ return base64.b64encode(file.read()).decode("utf-8")
67
+
68
+ def _get_mime_type(self, file_path: str) -> str:
69
+ """
70
+ Get the MIME type of a file based on its extension
71
+
72
+ Args:
73
+ file_path: Path to the file
74
+
75
+ Returns:
76
+ MIME type as a string (e.g., "image/jpeg", "audio/mp3")
77
+ """
78
+ mime_type, _ = mimetypes.guess_type(file_path)
79
+ if mime_type is None:
80
+ raise ValueError(f"Unsupported file type: {file_path}")
81
+ return mime_type
82
+
83
+ def __call__(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None) -> str:
84
+ """
85
+ Process messages and return completion
86
+
87
+ Args:
88
+ messages: List of message dictionaries with 'type' and 'content' keys
89
+ metadata: Optional metadata to pass to litellm completion, e.g. for Langfuse tracking
90
+
91
+ Returns:
92
+ Generated text response
93
+ """
94
+ if metadata is None:
95
+ print("No metadata provided, using empty metadata")
96
+ metadata = {}
97
+ metadata["trace_name"] = f"litellm-completion-{self.model_name}"
98
+ # Convert messages to LiteLLM format
99
+ formatted_messages = []
100
+ for msg in messages:
101
+ if msg["type"] == "text":
102
+ formatted_messages.append({
103
+ "role": "user",
104
+ "content": [{"type": "text", "text": msg["content"]}]
105
+ })
106
+ elif msg["type"] in ["image", "audio", "video"]:
107
+ # Check if content is a local file path or PIL Image
108
+ if isinstance(msg["content"], Image.Image) or os.path.isfile(msg["content"]):
109
+ try:
110
+ if isinstance(msg["content"], Image.Image):
111
+ mime_type = "image/png"
112
+ else:
113
+ mime_type = self._get_mime_type(msg["content"])
114
+ base64_data = self._encode_file(msg["content"])
115
+ data_url = f"data:{mime_type};base64,{base64_data}"
116
+ except ValueError as e:
117
+ print(f"Error processing file {msg['content']}: {e}")
118
+ continue
119
+ else:
120
+ data_url = msg["content"]
121
+
122
+ # Append the formatted message based on the model
123
+ if "gemini" in self.model_name:
124
+ formatted_messages.append({
125
+ "role": "user",
126
+ "content": [
127
+ {
128
+ "type": "image_url",
129
+ "image_url": data_url
130
+ }
131
+ ]
132
+ })
133
+ elif "gpt" in self.model_name:
134
+ # GPT and other models expect a different format
135
+ if msg["type"] == "image":
136
+ # Default format for images and videos in GPT
137
+ formatted_messages.append({
138
+ "role": "user",
139
+ "content": [
140
+ {
141
+ "type": f"image_url",
142
+ f"{msg['type']}_url": {
143
+ "url": data_url,
144
+ "detail": "high"
145
+ }
146
+ }
147
+ ]
148
+ })
149
+ else:
150
+ raise ValueError("For GPT, only text and image inferencing are supported")
151
+ else:
152
+ raise ValueError("Only support Gemini and Gpt for Multimodal capability now")
153
+
154
+ try:
155
+ # if it's openai o series model, set temperature to None and reasoning_effort to "medium"
156
+ if (re.match(r"^o\d+.*$", self.model_name) or re.match(r"^openai/o.*$", self.model_name)):
157
+ self.temperature = None
158
+ self.reasoning_effort = "medium"
159
+ response = completion(
160
+ model=self.model_name,
161
+ messages=formatted_messages,
162
+ temperature=self.temperature,
163
+ reasoning_effort=self.reasoning_effort,
164
+ metadata=metadata,
165
+ max_retries=99
166
+ )
167
+ else:
168
+ response = completion(
169
+ model=self.model_name,
170
+ messages=formatted_messages,
171
+ temperature=self.temperature,
172
+ metadata=metadata,
173
+ max_retries=99
174
+ )
175
+ if self.print_cost:
176
+ # pass your response from completion to completion_cost
177
+ cost = completion_cost(completion_response=response)
178
+ formatted_string = f"Cost: ${float(cost):.10f}"
179
+ # print(formatted_string)
180
+ self.accumulated_cost += cost
181
+ print(f"Accumulated Cost: ${self.accumulated_cost:.10f}")
182
+
183
+ content = response.choices[0].message.content
184
+ if content is None:
185
+ print(f"Got null response from model. Full response: {response}")
186
+ return content
187
+
188
+ except Exception as e:
189
+ print(f"Error in model completion: {e}")
190
+ return str(e)
191
+
192
+ if __name__ == "__main__":
193
+ pass
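
And a short sketch of the same unified message interface through LiteLLMWrapper; the model name, image path, and metadata values are placeholders.

```python
from mllm_tools.litellm import LiteLLMWrapper

model = LiteLLMWrapper(model_name="gemini/gemini-1.5-pro-002", print_cost=True, use_langfuse=False)
answer = model(
    [
        {"type": "text", "content": "Rate the layout of this frame from 1 to 5."},
        {"type": "image", "content": "frames/key_frame_2.jpg"},
    ],
    metadata={"run_id": "eval-run-42"},  # forwarded to litellm (and Langfuse when enabled)
)
print(answer)
```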
mllm_tools/openai.py ADDED
@@ -0,0 +1,594 @@
1
+ # filepath: d:\Theory2Manim-2\Theory2Manim\mllm_tools\openai.py
2
+ import json
3
+ import re
4
+ from typing import List, Dict, Any, Union, Optional
5
+ import io
6
+ import os
7
+ import base64
8
+ from PIL import Image
9
+ import mimetypes
10
+ import litellm
11
+ from litellm import completion, completion_cost
12
+ from dotenv import load_dotenv
13
+
14
+ # Load environment variables from .env file
15
+ load_dotenv()
16
+
17
+ # Note: Environment variables should be loaded from .env file or set manually using os.environ
18
+
19
+ class OpenAIWrapper:
20
+ """Wrapper for OpenAI using LiteLLM to support all OpenAI models with unified interface"""
21
+
22
+ def __init__(
23
+ self,
24
+ model_name: str = "gpt-4o",
25
+ temperature: float = 0.7,
26
+ print_cost: bool = False,
27
+ verbose: bool = False,
28
+ use_langfuse: bool = True,
29
+ api_key: Optional[str] = None,
30
+ organization: Optional[str] = None,
31
+ base_url: Optional[str] = None,
32
+ use_github_token: bool = True,
33
+ github_token: Optional[str] = os.getenv('GITHUB_TOKEN')
34
+ ):
35
+ """
36
+ Initialize the OpenAI wrapper
37
+
38
+ Args:
39
+ model_name: Name of the OpenAI model to use (e.g. "gpt-4o", "gpt-4o-mini",
40
+ "gpt-3.5-turbo", "o1-preview", "o1-mini", "dall-e-3")
41
+ temperature: Temperature for completion (ignored for o1 models)
42
+ print_cost: Whether to print the cost of the completion
43
+ verbose: Whether to print verbose output
44
+ use_langfuse: Whether to enable Langfuse logging
45
+ api_key: OpenAI API key (if not provided, will use OPENAI_API_KEY env var)
46
+ organization: OpenAI organization ID (optional)
47
+ base_url: Custom base URL for OpenAI API (optional, for proxies)
48
+ use_github_token: Whether to use GitHub AI model inference endpoint
49
+ github_token: GitHub token (if not provided, will use GITHUB_TOKEN env var)
50
+ """
51
+ self.model_name = model_name
52
+ self.temperature = temperature
53
+ self.print_cost = print_cost
54
+ self.verbose = verbose
55
+ self.accumulated_cost = 0
56
+ self.use_github_token = use_github_token
57
+
58
+ # Configure API based on whether using GitHub token or OpenAI API
59
+ if use_github_token:
60
+ # Set up GitHub token and endpoint
61
+ self.github_token = github_token or os.getenv('GITHUB_TOKEN')
62
+ if not self.github_token:
63
+ raise ValueError("GitHub token is required when use_github_token=True. Please set GITHUB_TOKEN environment variable or pass github_token parameter.")
64
+
65
+ # Set GitHub AI inference endpoint
66
+ self.base_url = "https://models.github.ai/inference"
67
+ self.api_key = self.github_token
68
+
69
+ # Set environment variables for LiteLLM to use GitHub endpoint
70
+ os.environ['OPENAI_API_KEY'] = self.github_token
71
+ os.environ['OPENAI_BASE_URL'] = self.base_url
72
+
73
+ # Adjust model name for GitHub endpoint (add openai/ prefix if not present)
74
+ if not self.model_name.startswith("openai/"):
75
+ self.model_name = f"openai/{self.model_name}"
76
+
77
+ else:
78
+ # Original OpenAI API setup
79
+ self.api_key = api_key or os.getenv('OPENAI_API_KEY')
80
+ if not self.api_key:
81
+ raise ValueError("OpenAI API key is required. Please set OPENAI_API_KEY environment variable or pass api_key parameter.")
82
+
83
+ # Set environment variables for LiteLLM
84
+ os.environ['OPENAI_API_KEY'] = self.api_key
85
+
86
+ # Set optional custom base URL
87
+ if base_url:
88
+ os.environ['OPENAI_BASE_URL'] = base_url
89
+ self.base_url = base_url
90
+ else:
91
+ self.base_url = os.getenv('OPENAI_BASE_URL')
92
+
93
+ # Set optional organization (only for OpenAI, not GitHub)
94
+ if not use_github_token:
95
+ if organization:
96
+ os.environ['OPENAI_ORGANIZATION'] = organization
97
+ self.organization = organization
98
+ else:
99
+ self.organization = os.getenv('OPENAI_ORGANIZATION')
100
+ else:
101
+ self.organization = None
102
+
103
+ if self.verbose:
104
+ os.environ['LITELLM_LOG'] = 'DEBUG'
105
+
106
+ # Set langfuse callback only if enabled
107
+ if use_langfuse:
108
+ litellm.success_callback = ["langfuse"]
109
+ litellm.failure_callback = ["langfuse"]
110
+
111
+ def _encode_file(self, file_path: Union[str, Image.Image]) -> str:
112
+ """
113
+ Encode local file or PIL Image to base64 string
114
+
115
+ Args:
116
+ file_path: Path to local file or PIL Image object
117
+
118
+ Returns:
119
+ Base64 encoded file string
120
+ """
121
+ if isinstance(file_path, Image.Image):
122
+ buffered = io.BytesIO()
123
+ file_path.save(buffered, format="PNG")
124
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
125
+ else:
126
+ with open(file_path, "rb") as file:
127
+ return base64.b64encode(file.read()).decode("utf-8")
128
+
129
+ def _get_mime_type(self, file_path: str) -> str:
130
+ """
131
+ Get the MIME type of a file based on its extension
132
+
133
+ Args:
134
+ file_path: Path to the file
135
+
136
+ Returns:
137
+ MIME type as a string (e.g., "image/jpeg", "application/pdf")
138
+ """
139
+ mime_type, _ = mimetypes.guess_type(file_path)
140
+ if mime_type is None:
141
+ raise ValueError(f"Unsupported file type: {file_path}")
142
+ return mime_type
143
+
144
+ def _supports_vision(self, model_name: str) -> bool:
145
+ """
146
+ Check if the model supports vision/image processing
147
+
148
+ Args:
149
+ model_name: Name of the model
150
+
151
+ Returns:
152
+ True if model supports vision, False otherwise
153
+ """
154
+ vision_models = [
155
+ "gpt-4o",
156
+ "gpt-4o-mini",
157
+ "gpt-4-vision-preview",
158
+ "gpt-4-turbo",
159
+ "gpt-4-turbo-vision"
160
+ ]
161
+
162
+ return any(vision_model in model_name for vision_model in vision_models)
163
+
164
+ def _supports_files(self, model_name: str) -> bool:
165
+ """
166
+ Check if the model supports file processing (PDFs, documents)
167
+
168
+ Args:
169
+ model_name: Name of the model
170
+
171
+ Returns:
172
+ True if model supports file processing, False otherwise
173
+ """
174
+ file_models = [
175
+ "gpt-4o",
176
+ "gpt-4o-mini",
177
+ "gpt-4-turbo"
178
+ ]
179
+
180
+ return any(file_model in model_name for file_model in file_models)
181
+
182
+ def _is_o1_model(self, model_name: str) -> bool:
183
+ """
184
+ Check if the model is an o1 series model (reasoning models)
185
+
186
+ Args:
187
+ model_name: Name of the model
188
+
189
+ Returns:
190
+ True if it's an o1 model, False otherwise
191
+ """
192
+ return "o1" in model_name
193
+
194
+ def __call__(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None, **kwargs) -> str:
195
+ """
196
+ Process messages and return completion
197
+
198
+ Args:
199
+ messages: List of message dictionaries with 'type' and 'content' keys
200
+ metadata: Optional metadata to pass to litellm completion, e.g. for Langfuse tracking
201
+ **kwargs: Additional parameters for completion (max_tokens, stream, etc.)
202
+
203
+ Returns:
204
+ Generated text response
205
+ """
206
+ if metadata is None:
207
+ metadata = {}
208
+ metadata["trace_name"] = f"openai-completion-{self.model_name}"
209
+
210
+ # Convert messages to LiteLLM format
211
+ formatted_messages = []
212
+
213
+ for msg in messages:
214
+ if msg["type"] == "text":
215
+ formatted_messages.append({
216
+ "role": "user",
217
+ "content": [{"type": "text", "text": msg["content"]}]
218
+ })
219
+ elif msg["type"] == "image":
220
+ # Check if model supports vision
221
+ if not self._supports_vision(self.model_name):
222
+ raise ValueError(f"Model {self.model_name} does not support image processing")
223
+
224
+ # Check if content is a local file path or PIL Image
225
+ if isinstance(msg["content"], Image.Image) or (isinstance(msg["content"], str) and os.path.isfile(msg["content"])):
226
+ try:
227
+ if isinstance(msg["content"], Image.Image):
228
+ mime_type = "image/png"
229
+ else:
230
+ mime_type = self._get_mime_type(msg["content"])
231
+ base64_data = self._encode_file(msg["content"])
232
+ data_url = f"data:{mime_type};base64,{base64_data}"
233
+ except ValueError as e:
234
+ print(f"Error processing file {msg['content']}: {e}")
235
+ continue
236
+ else:
237
+ # Assume it's already a URL or base64 string
238
+ data_url = msg["content"]
239
+
240
+ # Format for vision-capable models
241
+ formatted_messages.append({
242
+ "role": "user",
243
+ "content": [
244
+ {
245
+ "type": "image_url",
246
+ "image_url": {
247
+ "url": data_url,
248
+ "detail": "high"
249
+ }
250
+ }
251
+ ]
252
+ })
253
+ elif msg["type"] == "file":
254
+ # Check if model supports file processing
255
+ if not self._supports_files(self.model_name):
256
+ raise ValueError(f"Model {self.model_name} does not support file processing")
257
+
258
+ # Handle file content (PDF, documents, etc.)
259
+ if os.path.isfile(msg["content"]):
260
+ try:
261
+ mime_type = self._get_mime_type(msg["content"])
262
+ base64_data = self._encode_file(msg["content"])
263
+
264
+ # Use the file format for document processing
265
+ formatted_messages.append({
266
+ "role": "user",
267
+ "content": [
268
+ {
269
+ "type": "file",
270
+ "file": {
271
+ "filename": os.path.basename(msg["content"]),
272
+ "file_data": f"data:{mime_type};base64,{base64_data}",
273
+ }
274
+ }
275
+ ]
276
+ })
277
+ except ValueError as e:
278
+ print(f"Error processing file {msg['content']}: {e}")
279
+ continue
280
+ else:
281
+ raise ValueError(f"File not found: {msg['content']}")
282
+ else:
283
+ raise ValueError(f"Unsupported message type: {msg['type']}. OpenAI models support 'text', 'image', and 'file' types.")
284
+
285
+ try:
286
+ # Prepare completion parameters
287
+ completion_params = {
288
+ "model": self.model_name,
289
+ "messages": formatted_messages,
290
+ "metadata": metadata,
291
+ "max_retries": 3
292
+ }
293
+
294
+ # Add additional kwargs
295
+ completion_params.update(kwargs)
296
+
297
+ # Check if it's an o1 series model (reasoning models)
298
+ if self._is_o1_model(self.model_name):
299
+ # O1 models don't support temperature and have reasoning_effort
300
+ if "reasoning_effort" not in completion_params:
301
+ completion_params["reasoning_effort"] = "medium" # Options: "low", "medium", "high"
302
+ # Remove temperature if it was added via kwargs
303
+ completion_params.pop("temperature", None)
304
+ else:
305
+ # Regular models support temperature
306
+ if "temperature" not in completion_params:
307
+ completion_params["temperature"] = self.temperature
308
+
309
+ response = completion(**completion_params)
310
+
311
+ if self.print_cost:
312
+ try:
313
+ cost = completion_cost(completion_response=response)
314
+ if cost is not None:
315
+ self.accumulated_cost += cost
316
+ print(f"Cost: ${float(cost):.10f}")
317
+ print(f"Accumulated Cost: ${self.accumulated_cost:.10f}")
318
+ else:
319
+ print("Cost information not available")
320
+ except Exception as e:
321
+ print(f"Could not calculate cost: {e}")
322
+
323
+ content = response.choices[0].message.content
324
+ if content is None:
325
+ print(f"Got null response from OpenAI model. Full response: {response}")
326
+ return ""
327
+ return content
328
+
329
+ except Exception as e:
330
+ print(f"Error in OpenAI model completion: {e}")
331
+ return str(e)
332
+
333
+ def stream_completion(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None, **kwargs):
334
+ """
335
+ Process messages and return streaming completion
336
+
337
+ Args:
338
+ messages: List of message dictionaries with 'type' and 'content' keys
339
+ metadata: Optional metadata to pass to litellm completion
340
+ **kwargs: Additional parameters for completion
341
+
342
+ Yields:
343
+ Streaming response chunks
344
+ """
345
+ kwargs["stream"] = True
346
+
347
+ # Use the same message formatting as regular completion
348
+ if metadata is None:
349
+ metadata = {}
350
+ metadata["trace_name"] = f"openai-streaming-{self.model_name}"
351
+
352
+ try:
353
+ # Convert messages to the same format as __call__
354
+ formatted_messages = []
355
+
356
+ for msg in messages:
357
+ if msg["type"] == "text":
358
+ formatted_messages.append({
359
+ "role": "user",
360
+ "content": [{"type": "text", "text": msg["content"]}]
361
+ })
362
+ elif msg["type"] == "image":
363
+ if not self._supports_vision(self.model_name):
364
+ raise ValueError(f"Model {self.model_name} does not support image processing")
365
+
366
+ if isinstance(msg["content"], Image.Image) or (isinstance(msg["content"], str) and os.path.isfile(msg["content"])):
367
+ try:
368
+ if isinstance(msg["content"], Image.Image):
369
+ mime_type = "image/png"
370
+ else:
371
+ mime_type = self._get_mime_type(msg["content"])
372
+ base64_data = self._encode_file(msg["content"])
373
+ data_url = f"data:{mime_type};base64,{base64_data}"
374
+ except ValueError as e:
375
+ print(f"Error processing file {msg['content']}: {e}")
376
+ continue
377
+ else:
378
+ data_url = msg["content"]
379
+
380
+ formatted_messages.append({
381
+ "role": "user",
382
+ "content": [
383
+ {
384
+ "type": "image_url",
385
+ "image_url": {
386
+ "url": data_url,
387
+ "detail": "high"
388
+ }
389
+ }
390
+ ]
391
+ })
392
+
393
+ # Prepare completion parameters
394
+ completion_params = {
395
+ "model": self.model_name,
396
+ "messages": formatted_messages,
397
+ "metadata": metadata,
398
+ "max_retries": 3,
399
+ "stream": True
400
+ }
401
+
402
+ # Add additional kwargs
403
+ completion_params.update(kwargs)
404
+
405
+ # Handle o1 models
406
+ if self._is_o1_model(self.model_name):
407
+ if "reasoning_effort" not in completion_params:
408
+ completion_params["reasoning_effort"] = "medium"
409
+ completion_params.pop("temperature", None)
410
+ else:
411
+ if "temperature" not in completion_params:
412
+ completion_params["temperature"] = self.temperature
413
+
414
+ response = completion(**completion_params)
415
+
416
+ # Yield streaming chunks
417
+ for chunk in response:
418
+ yield chunk
419
+
420
+ except Exception as e:
421
+ print(f"Error in OpenAI streaming completion: {e}")
422
+ yield {"error": str(e)}
423
+
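
A short sketch of consuming stream_completion above; the chunk fields assume LiteLLM's OpenAI-compatible streaming delta format.

# Sketch: consuming OpenAIWrapper.stream_completion (chunk structure assumed
# to follow LiteLLM's OpenAI-compatible streaming deltas).
wrapper = OpenAIWrapper(model_name="gpt-4o-mini", use_github_token=False)
for chunk in wrapper.stream_completion([{"type": "text", "content": "Tell me a one-line joke"}]):
    if isinstance(chunk, dict) and "error" in chunk:  # error dict yielded by the method above
        print(chunk["error"])
        break
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)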
424
+ def create_openai_wrapper(model_name: str = "gpt-4o", use_github: bool = False, **kwargs) -> OpenAIWrapper:
425
+ """
426
+ Convenience function to create an OpenAI wrapper
427
+
428
+ Args:
429
+ model_name: OpenAI model name (e.g., "gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo")
430
+ use_github: Whether to use GitHub's AI model inference endpoint
431
+ **kwargs: Additional arguments passed to OpenAIWrapper
432
+
433
+ Returns:
434
+ Configured OpenAIWrapper instance
435
+
436
+ Example:
437
+ >>> # Create a wrapper for GPT-4o using regular OpenAI
438
+ >>> wrapper = create_openai_wrapper("gpt-4o", temperature=0.3)
439
+ >>>
440
+ >>> # Create a wrapper for GPT-4o using GitHub AI models
441
+ >>> wrapper = create_openai_wrapper("gpt-4o", use_github=True, temperature=0.3)
442
+ >>>
443
+ >>> # Use it for text generation
444
+ >>> response = wrapper([{"type": "text", "content": "Explain quantum computing"}])
445
+ >>>
446
+ >>> # Use it for vision (if model supports it)
447
+ >>> response = wrapper([
448
+ ... {"type": "text", "content": "What's in this image?"},
449
+ ... {"type": "image", "content": "path/to/image.jpg"}
450
+ ... ])
451
+ >>>
452
+ >>> # Use it for file processing (PDFs, etc.)
453
+ >>> response = wrapper([
454
+ ... {"type": "text", "content": "Summarize this document"},
455
+ ... {"type": "file", "content": "path/to/document.pdf"}
456
+ ... ])
457
+ """
458
+ return OpenAIWrapper(model_name=model_name, use_github_token=use_github, **kwargs)
459
+
460
+ # Available OpenAI Models
461
+ AVAILABLE_MODELS = {
462
+ # GPT-4 Models
463
+ "gpt-4o": "gpt-4o",
464
+ "gpt-4o-mini": "gpt-4o-mini",
465
+ "gpt-4-turbo": "gpt-4-turbo",
466
+ "gpt-4": "gpt-4",
467
+ "gpt-4-vision-preview": "gpt-4-vision-preview",
468
+
469
+ # O1 Reasoning Models
470
+ "o1-preview": "o1-preview",
471
+ "o1-mini": "o1-mini",
472
+
473
+ # GPT-3.5 Models
474
+ "gpt-3.5-turbo": "gpt-3.5-turbo",
475
+ "gpt-3.5-turbo-instruct": "gpt-3.5-turbo-instruct",
476
+
477
+ # Image Generation Models
478
+ "dall-e-3": "dall-e-3",
479
+ "dall-e-2": "dall-e-2",
480
+
481
+ # Embedding Models
482
+ "text-embedding-3-large": "text-embedding-3-large",
483
+ "text-embedding-3-small": "text-embedding-3-small",
484
+ "text-embedding-ada-002": "text-embedding-ada-002",
485
+
486
+ # Audio Models
487
+ "whisper-1": "whisper-1",
488
+ "tts-1": "tts-1",
489
+ "tts-1-hd": "tts-1-hd",
490
+ }
491
+
492
+ def create_github_openai_wrapper(model_name: str = "gpt-4o", **kwargs) -> OpenAIWrapper:
493
+ """
494
+ Convenience function to create an OpenAI wrapper using GitHub's AI model inference
495
+
496
+ Args:
497
+ model_name: OpenAI model name (e.g., "gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo")
498
+ **kwargs: Additional arguments passed to OpenAIWrapper
499
+
500
+ Returns:
501
+ Configured OpenAIWrapper instance using GitHub endpoint
502
+
503
+ Example:
504
+ >>> # Create a wrapper for GPT-4o using GitHub AI models
505
+ >>> wrapper = create_github_openai_wrapper("gpt-4o", temperature=0.3)
506
+ >>>
507
+ >>> # Use it for text generation
508
+ >>> response = wrapper([{"type": "text", "content": "What is the capital of France?"}])
509
+ """
510
+ return OpenAIWrapper(model_name=model_name, use_github_token=True, **kwargs)
511
+
512
+ def list_available_models() -> Dict[str, str]:
513
+ """
514
+ Get a dictionary of available OpenAI models
515
+
516
+ Returns:
517
+ Dictionary mapping model names to their identifiers
518
+ """
519
+ return AVAILABLE_MODELS.copy()
520
+
521
+ def get_model_capabilities(model_name: str) -> Dict[str, bool]:
522
+ """
523
+ Get the capabilities of a specific model
524
+
525
+ Args:
526
+ model_name: Name of the model
527
+
528
+ Returns:
529
+ Dictionary of capabilities (vision, files, reasoning, etc.)
530
+ """
531
+ wrapper = OpenAIWrapper(model_name=model_name)
532
+
533
+ return {
534
+ "vision": wrapper._supports_vision(model_name),
535
+ "files": wrapper._supports_files(model_name),
536
+ "reasoning": wrapper._is_o1_model(model_name),
537
+ "streaming": not wrapper._is_o1_model(model_name), # O1 models don't support streaming
538
+ "temperature": not wrapper._is_o1_model(model_name), # O1 models don't support temperature
539
+ }
540
+
541
+ if __name__ == "__main__":
542
+ # Example usage
543
+ print("Available OpenAI Models:")
544
+ for model_name, model_id in AVAILABLE_MODELS.items():
545
+ capabilities = get_model_capabilities(model_name)
546
+ print(f" {model_name} ({model_id}): {capabilities}")
547
+
548
+ print("\n" + "="*50)
549
+ print("Testing OpenAI wrapper...")
550
+
551
+ # Example 1: Regular OpenAI (requires OPENAI_API_KEY environment variable)
552
+ try:
553
+ print("\n1. Testing regular OpenAI wrapper:")
554
+ wrapper = create_openai_wrapper("gpt-4o-mini", temperature=0.3)
555
+ print("Regular OpenAI wrapper created successfully!")
556
+
557
+ # Test with a simple text prompt
558
+ response = wrapper([{"type": "text", "content": "Hello! Can you confirm you're working?"}])
559
+ print(f"Response: {response}")
560
+
561
+ except Exception as e:
562
+ print(f"Error creating regular OpenAI wrapper: {e}")
563
+ print("Make sure to set OPENAI_API_KEY environment variable")
564
+
565
+ # Example 2: GitHub AI models (requires GITHUB_TOKEN environment variable)
566
+ try:
567
+ print("\n2. Testing GitHub AI models wrapper:")
568
+ github_wrapper = create_github_openai_wrapper("gpt-4o", temperature=1.0)
569
+ print("GitHub OpenAI wrapper created successfully!")
570
+
571
+ # Test with a simple text prompt
572
+ response = github_wrapper([{
573
+ "type": "text",
574
+ "content": "What is the capital of France?"
575
+ }])
576
+ print(f"GitHub Response: {response}")
577
+
578
+ except Exception as e:
579
+ print(f"Error creating GitHub wrapper: {e}")
580
+ print("Make sure to set GITHUB_TOKEN environment variable")
581
+ # Example 3: Manual GitHub configuration
582
+ try:
583
+ print("\n3. Testing manual GitHub configuration:")
584
+ manual_wrapper = OpenAIWrapper(
585
+ model_name="openai/gpt-4o",
586
+ use_github_token=True,
587
+ temperature=1.0,
588
+ verbose=False
589
+ )
590
+ print("Manual GitHub wrapper created successfully!")
591
+
592
+ except Exception as e:
593
+ print(f"Error creating manual GitHub wrapper: {e}")
594
+ print("Make sure to set GITHUB_TOKEN environment variable")
mllm_tools/openrouter.py ADDED
@@ -0,0 +1,266 @@
1
+ import os
2
+ import re
3
+ from typing import List, Dict, Any, Optional, Union
4
+ import io
5
+ import base64
6
+ from PIL import Image
7
+ import mimetypes
8
+ from litellm import completion, completion_cost
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+
13
+ class OpenRouterWrapper:
14
+ """
15
+ OpenRouter wrapper using LiteLLM for various language models.
16
+ Compatible with the existing wrapper interface.
17
+ """
18
+
19
+ def __init__(
20
+ self,
21
+ model_name: str = "openrouter/deepseek/deepseek-chat-v3-0324:free",
22
+ temperature: float = 0.7,
23
+ print_cost: bool = False,
24
+ verbose: bool = False,
25
+ use_langfuse: bool = True,
26
+ site_url: str = "",
27
+ app_name: str = "Theory2Manim"
28
+ ):
29
+ """
30
+ Initialize OpenRouter wrapper.
31
+
32
+ Args:
33
+ model_name: OpenRouter model name (with openrouter/ prefix)
34
+ temperature: Temperature for completion
35
+ print_cost: Whether to print the cost of the completion
36
+ verbose: Whether to print verbose output
37
+ use_langfuse: Whether to enable Langfuse logging
38
+ site_url: Optional site URL for tracking
39
+ app_name: Optional app name for tracking
40
+ """
41
+ self.model_name = model_name
42
+ self.temperature = temperature
43
+ self.print_cost = print_cost
44
+ self.verbose = verbose
45
+ self.accumulated_cost = 0
46
+
47
+ # Setup OpenRouter environment variables
48
+ api_key = os.getenv("OPENROUTER_API_KEY")
49
+ if not api_key:
50
+ raise ValueError("No OPENROUTER_API_KEY found. Please set the environment variable.")
51
+
52
+ os.environ["OPENROUTER_API_KEY"] = api_key
53
+ os.environ["OPENROUTER_API_BASE"] = "https://openrouter.ai/api/v1"
54
+
55
+ if site_url or os.getenv("OR_SITE_URL"):
56
+ os.environ["OR_SITE_URL"] = site_url or os.getenv("OR_SITE_URL", "")
57
+ if app_name:
58
+ os.environ["OR_APP_NAME"] = app_name
59
+
60
+ if self.verbose:
61
+ os.environ['LITELLM_LOG'] = 'DEBUG'
62
+
63
+ # Set langfuse callback only if enabled
64
+ if use_langfuse:
65
+ import litellm
66
+ litellm.success_callback = ["langfuse"]
67
+ litellm.failure_callback = ["langfuse"]
68
+
69
+ def _encode_file(self, file_path: Union[str, Image.Image]) -> str:
70
+ """
71
+ Encode local file or PIL Image to base64 string
72
+
73
+ Args:
74
+ file_path: Path to local file or PIL Image object
75
+
76
+ Returns:
77
+ Base64 encoded file string
78
+ """
79
+ if isinstance(file_path, Image.Image):
80
+ buffered = io.BytesIO()
81
+ file_path.save(buffered, format="PNG")
82
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
83
+ else:
84
+ with open(file_path, "rb") as file:
85
+ return base64.b64encode(file.read()).decode("utf-8")
86
+
87
+ def _get_mime_type(self, file_path: str) -> str:
88
+ """
89
+ Get the MIME type of a file based on its extension
90
+
91
+ Args:
92
+ file_path: Path to the file
93
+
94
+ Returns:
95
+ MIME type as a string (e.g., "image/jpeg", "audio/mp3")
96
+ """
97
+ mime_type, _ = mimetypes.guess_type(file_path)
98
+ if mime_type is None:
99
+ raise ValueError(f"Unsupported file type: {file_path}")
100
+ return mime_type
101
+
102
+ def __call__(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None) -> str:
103
+ """
104
+ Process messages and return completion
105
+
106
+ Args:
107
+ messages: List of message dictionaries with 'type' and 'content' keys
108
+ metadata: Optional metadata to pass to completion
109
+
110
+ Returns:
111
+ Generated text response
112
+ """
113
+ if metadata is None:
114
+ metadata = {}
115
+ metadata["trace_name"] = f"openrouter-completion-{self.model_name}"
116
+
117
+ # Convert messages to LiteLLM format
118
+ formatted_messages = []
119
+ for msg in messages:
120
+ if msg["type"] == "text":
121
+ formatted_messages.append({
122
+ "role": "user",
123
+ "content": [{"type": "text", "text": msg["content"]}]
124
+ })
125
+ elif msg["type"] in ["image", "audio", "video"]:
126
+ # Check if content is a local file path or PIL Image
127
+ if isinstance(msg["content"], Image.Image) or os.path.isfile(msg["content"]):
128
+ try:
129
+ if isinstance(msg["content"], Image.Image):
130
+ mime_type = "image/png"
131
+ else:
132
+ mime_type = self._get_mime_type(msg["content"])
133
+ base64_data = self._encode_file(msg["content"])
134
+ data_url = f"data:{mime_type};base64,{base64_data}"
135
+ except ValueError as e:
136
+ print(f"Error processing file {msg['content']}: {e}")
137
+ continue
138
+ else:
139
+ data_url = msg["content"]
140
+
141
+ # Format for vision models
142
+ if msg["type"] == "image":
143
+ formatted_messages.append({
144
+ "role": "user",
145
+ "content": [
146
+ {
147
+ "type": "image_url",
148
+ "image_url": {
149
+ "url": data_url,
150
+ "detail": "high"
151
+ }
152
+ }
153
+ ]
154
+ })
155
+ else:
156
+ # For audio/video, treat as text for now
157
+ formatted_messages.append({
158
+ "role": "user",
159
+ "content": [{"type": "text", "text": f"[{msg['type'].upper()}]: {msg['content']}"}]
160
+ })
161
+
162
+ try:
163
+ response = completion(
164
+ model=self.model_name,
165
+ messages=formatted_messages,
166
+ temperature=self.temperature,
167
+ metadata=metadata,
168
+ max_retries=99
169
+ )
170
+ if self.print_cost:
171
+ # Calculate and print cost
172
+ cost = completion_cost(completion_response=response)
173
+ self.accumulated_cost += cost
174
+ print(f"Accumulated Cost: ${self.accumulated_cost:.10f}")
175
+
176
+ content = response.choices[0].message.content
177
+ if content is None:
178
+ print(f"Got null response from model. Full response: {response}")
179
+ return "Error: Received null response from model"
180
+
181
+ # Check if the response contains error messages about unmapped models
182
+ if "This model isn't mapped yet" in content or "model isn't mapped" in content.lower():
183
+ error_msg = f"Error: Model {self.model_name} is not supported by LiteLLM. Please use a supported model."
184
+ print(error_msg)
185
+ return error_msg
186
+
187
+ return content
188
+
189
+ except Exception as e:
190
+ print(f"Error in OpenRouter completion: {e}")
191
+ return f"Error: {str(e)}"
192
+
193
+
194
+ class OpenRouterClient:
195
+ """
196
+ Legacy OpenRouter client for backward compatibility.
197
+ """
198
+
199
+ def __init__(self, api_key: str, site_url: str = "", app_name: str = "Theory2Manim"):
200
+ """
201
+ Initialize OpenRouter client.
202
+
203
+ Args:
204
+ api_key: OpenRouter API key
205
+ site_url: Optional site URL for tracking
206
+ app_name: Optional app name for tracking
207
+ """
208
+ os.environ["OPENROUTER_API_KEY"] = api_key
209
+ os.environ["OPENROUTER_API_BASE"] = "https://openrouter.ai/api/v1"
210
+
211
+ if site_url:
212
+ os.environ["OR_SITE_URL"] = site_url
213
+ if app_name:
214
+ os.environ["OR_APP_NAME"] = app_name
215
+
216
+ def complete(
217
+ self,
218
+ messages: List[Dict[str, str]],
219
+ model: str = "openrouter/openai/gpt-3.5-turbo",
220
+ transforms: Optional[List[str]] = None,
221
+ route: Optional[str] = None,
222
+ **kwargs
223
+ ) -> Any:
224
+ """
225
+ Generate completion using OpenRouter model.
226
+
227
+ Args:
228
+ messages: List of message dictionaries with 'role' and 'content'
229
+ model: Model name (with openrouter/ prefix)
230
+ transforms: Optional transforms to apply
231
+ route: Optional route specification
232
+ **kwargs: Additional parameters for completion
233
+
234
+ Returns:
235
+ Completion response
236
+ """
237
+ params = {
238
+ "model": model,
239
+ "messages": messages,
240
+ **kwargs
241
+ }
242
+
243
+ if transforms:
244
+ params["transforms"] = transforms
245
+ if route:
246
+ params["route"] = route
247
+
248
+ return completion(**params)
249
+
250
+ # Convenience functions for common models
251
+ def ds_r1(messages: List[Dict[str, str]], **kwargs) -> Any:
252
+ """Use GPT-3.5 Turbo via OpenRouter"""
253
+ client = OpenRouterClient(os.environ.get("OPENROUTER_API_KEY", ""))
254
+ return client.complete(messages, "deepseek/deepseek-r1:free", **kwargs)
255
+
256
+ def ds_v3(messages: List[Dict[str, str]], **kwargs) -> Any:
257
+ """Use GPT-4 via OpenRouter"""
258
+ client = OpenRouterClient(os.environ.get("OPENROUTER_API_KEY", ""))
259
+ return client.complete(messages, "deepseek/deepseek-chat-v3-0324:free", **kwargs)
260
+
261
+ def qwen3(messages: List[Dict[str, str]], **kwargs) -> Any:
262
+ """Use Claude-2 via OpenRouter"""
263
+ client = OpenRouterClient(os.environ.get("OPENROUTER_API_KEY", ""))
264
+ return client.complete(messages, "qwen/qwen3-235b-a22b:free", **kwargs)
265
+
266
+
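
A basic usage sketch for OpenRouterWrapper above; it assumes OPENROUTER_API_KEY is set in the environment, and the free-tier model id is only an example.

# Sketch: OpenRouterWrapper usage (requires OPENROUTER_API_KEY).
from mllm_tools.openrouter import OpenRouterWrapper

or_llm = OpenRouterWrapper(
    model_name="openrouter/deepseek/deepseek-chat-v3-0324:free",
    temperature=0.3,
    print_cost=True,
)
print(or_llm([{"type": "text", "content": "State the Pythagorean theorem in one line."}]))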
mllm_tools/utils.py ADDED
@@ -0,0 +1,177 @@
1
+ from typing import Union, List, Dict, Any, Optional
2
+ from PIL import Image
3
+ import google.generativeai as genai
4
+ import tempfile
5
+ import os
6
+ from .gemini import GeminiWrapper
7
+ from .vertex_ai import VertexAIWrapper
8
+ from .openrouter import OpenRouterWrapper
9
+
10
+
11
+ def _prepare_text_inputs(texts: List[str]) -> List[Dict[str, str]]:
12
+ """
13
+ Converts a list of text strings into the input format for the Agent model.
14
+
15
+ Args:
16
+ texts (List[str]): The list of text strings to be processed.
17
+
18
+ Returns:
19
+ List[Dict[str, str]]: A list of dictionaries formatted for the Agent model.
20
+ """
21
+ inputs = []
22
+ # Add each text string to the inputs
23
+ if isinstance(texts, str):
24
+ texts = [texts]
25
+ for text in texts:
26
+ inputs.append({
27
+ "type": "text",
28
+ "content": text
29
+ })
30
+ return inputs
31
+
32
+ def _prepare_text_image_inputs(texts: Union[str, List[str]], images: Union[str, Image.Image, List[Union[str, Image.Image]]]) -> List[Dict[str, str]]:
33
+ """
34
+ Converts text strings and images into the input format for the Agent model.
35
+
36
+ Args:
37
+ texts (Union[str, List[str]]): Text string(s) to be processed.
38
+ images (Union[str, Image.Image, List[Union[str, Image.Image]]]): Image file path(s) or PIL Image object(s).
39
+ Returns:
40
+ List[Dict[str, str]]: A list of dictionaries formatted for the Agent model.
41
+ """
42
+ inputs = []
43
+ # Add each text string to the inputs
44
+ if isinstance(texts, str):
45
+ texts = [texts]
46
+ for text in texts:
47
+ inputs.append({
48
+ "type": "text",
49
+ "content": text
50
+ })
51
+ if isinstance(images, (str, Image.Image)):
52
+ images = [images]
53
+ for image in images:
54
+ inputs.append({
55
+ "type": "image",
56
+ "content": image
57
+ })
58
+ return inputs
59
+
60
+ def _prepare_text_video_inputs(texts: Union[str, List[str]], videos: Union[str, List[str]]) -> List[Dict[str, str]]:
61
+ """
62
+ Converts text strings and video file paths into the input format for the Agent model.
63
+
64
+ Args:
65
+ texts (Union[str, List[str]]): Text string(s) to be processed.
66
+ videos (Union[str, List[str]]): Video file path(s).
67
+ Returns:
68
+ List[Dict[str, str]]: A list of dictionaries formatted for the Agent model.
69
+ """
70
+ inputs = []
71
+ # Add each text string to the inputs
72
+ if isinstance(texts, str):
73
+ texts = [texts]
74
+ for text in texts:
75
+ inputs.append({
76
+ "type": "text",
77
+ "content": text
78
+ })
79
+ # Add each video file path to the inputs
80
+ if isinstance(videos, str):
81
+ videos = [videos]
82
+ for video in videos:
83
+ inputs.append({
84
+ "type": "video",
85
+ "content": video
86
+ })
87
+ return inputs
88
+
89
+ def _prepare_text_audio_inputs(texts: Union[str, List[str]], audios: Union[str, List[str]]) -> List[Dict[str, str]]:
90
+ """
91
+ Converts text strings and audio file paths into the input format for the Agent model.
92
+
93
+ Args:
94
+ texts (Union[str, List[str]]): Text string(s) to be processed.
95
+ audios (Union[str, List[str]]): Audio file path(s).
96
+ Returns:
97
+ List[Dict[str, str]]: A list of dictionaries formatted for the Agent model.
98
+ """
99
+ inputs = []
100
+ # Add each text string to the inputs
101
+ if isinstance(texts, str):
102
+ texts = [texts]
103
+ for text in texts:
104
+ inputs.append({
105
+ "type": "text",
106
+ "content": text
107
+ })
108
+ # Add each audio file path to the inputs
109
+ if isinstance(audios, str):
110
+ audios = [audios]
111
+ for audio in audios:
112
+ inputs.append({
113
+ "type": "audio",
114
+ "content": audio
115
+ })
116
+ return inputs
117
+
118
+ def _extract_code(text: str) -> str:
119
+ """Helper to extract code block from model response, support Gemini style and OpenAI style"""
120
+ try:
121
+ # Find code between ```python and ``` tags
122
+ start = text.split("```python\n")[-1]
123
+ end = start.split("```")[0]
124
+ return end.strip()
125
+ except IndexError:
126
+ return text
127
+
128
+ def _upload_to_gemini(input, mime_type=None):
129
+ """Uploads the given file or PIL image to Gemini.
130
+
131
+ See https://ai.google.dev/gemini-api/docs/prompting_with_media
132
+ """
133
+ if isinstance(input, str):
134
+ # Input is a file path
135
+ file = genai.upload_file(input, mime_type=mime_type)
136
+ elif isinstance(input, Image.Image):
137
+ # Input is a PIL image
138
+ with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
139
+ input.save(tmp_file, format="JPEG")
140
+ tmp_file_path = tmp_file.name
141
+ file = genai.upload_file(tmp_file_path, mime_type=mime_type or "image/jpeg")
142
+ os.remove(tmp_file_path)
143
+ else:
144
+ raise ValueError("Unsupported input type. Must be a file path or PIL Image.")
145
+
146
+ #print(f"Uploaded file '{file.display_name}' as: {file.uri}")
147
+ return file
148
+
149
+ def get_media_wrapper(model_name: str) -> Optional[Union[GeminiWrapper, VertexAIWrapper, OpenRouterWrapper]]:
150
+ """Get appropriate wrapper for media handling based on model name"""
151
+ if model_name.startswith('gemini/'):
152
+ return GeminiWrapper(model_name=model_name.split('/')[-1])
153
+ elif model_name.startswith('vertex_ai/'):
154
+ return VertexAIWrapper(model_name=model_name.split('/')[-1])
155
+ elif model_name.startswith('openrouter/'):
156
+ return OpenRouterWrapper(model_name=model_name)
157
+ return None
158
+
159
+ def prepare_media_messages(prompt: str, media_path: Union[str, Image.Image], model_name: str) -> List[Dict[str, Any]]:
160
+ """Prepare messages for media input based on model type"""
161
+ is_video = isinstance(media_path, str) and media_path.endswith('.mp4')
162
+
163
+ if is_video and (model_name.startswith('gemini/') or model_name.startswith('vertex_ai/') or model_name.startswith('openrouter/')):
164
+ return [
165
+ {"type": "text", "content": prompt},
166
+ {"type": "video", "content": media_path}
167
+ ]
168
+ else:
169
+ # For images or non-video content
170
+ if isinstance(media_path, str):
171
+ media = Image.open(media_path)
172
+ else:
173
+ media = media_path
174
+ return [
175
+ {"type": "text", "content": prompt},
176
+ {"type": "image", "content": media}
177
+ ]
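
A sketch of the helpers above working together; the model name and video path are placeholders, and the returned wrapper is assumed to follow the same message-list call convention as the other wrappers in this commit.

# Sketch: routing a video prompt through the helpers above.
from mllm_tools.utils import get_media_wrapper, prepare_media_messages, _extract_code

model_name = "gemini/gemini-1.5-pro"
media_llm = get_media_wrapper(model_name)  # GeminiWrapper for gemini/ models
msgs = prepare_media_messages("Summarize what happens in this clip.", "scene_01.mp4", model_name)
reply = media_llm(msgs)
print(_extract_code(reply))  # returns the reply unchanged if it contains no fenced code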
mllm_tools/vertex_ai.py ADDED
@@ -0,0 +1,86 @@
1
+ import os
2
+ from typing import List, Dict, Any, Optional
3
+ import vertexai
4
+ from vertexai.generative_models import GenerativeModel, Part
5
+ from google.auth import default
6
+ from google.auth.transport import requests
7
+
8
+
9
+ # TODO: check if this is the correct way to use Vertex AI
10
+ # TODO: add langfuse support
11
+ class VertexAIWrapper:
12
+ """Wrapper for Vertex AI to support Gemini models."""
13
+
14
+ def __init__(
15
+ self,
16
+ model_name: str = "gemini-1.5-pro",
17
+ temperature: float = 0.7,
18
+ print_cost: bool = False,
19
+ verbose: bool = False,
20
+ use_langfuse: bool = False
21
+ ):
22
+ """Initialize the Vertex AI wrapper.
23
+
24
+ Args:
25
+ model_name: Name of the model to use (e.g. "gemini-1.5-pro")
26
+ temperature: Temperature for generation between 0 and 1
27
+ print_cost: Whether to print the cost of the completion
28
+ verbose: Whether to print verbose output
29
+ use_langfuse: Whether to enable Langfuse logging
30
+ """
31
+ self.model_name = model_name
32
+ self.temperature = temperature
33
+ self.print_cost = print_cost
34
+ self.verbose = verbose
35
+
36
+ # Initialize Vertex AI
37
+ project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
38
+ location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
39
+ if not project_id:
40
+ raise ValueError("No GOOGLE_CLOUD_PROJECT found in environment variables")
41
+
42
+ vertexai.init(project=project_id, location=location)
43
+ self.model = GenerativeModel(model_name)
44
+
45
+ def __call__(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None) -> str:
46
+ """Process messages and return completion.
47
+
48
+ Args:
49
+ messages: List of message dictionaries containing type and content
50
+ metadata: Optional metadata dictionary to pass to the model
51
+
52
+ Returns:
53
+ Generated text response from the model
54
+
55
+ Raises:
56
+ ValueError: If message type is not supported
57
+ """
58
+ parts = []
59
+
60
+ for msg in messages:
61
+ if msg["type"] == "text":
62
+ parts.append(Part.from_text(msg["content"]))
63
+ elif msg["type"] in ["image", "video"]:
64
+ mime_type = "video/mp4" if msg["type"] == "video" else "image/jpeg"
65
+ if isinstance(msg["content"], str):
66
+ # Handle GCS URI
67
+ parts.append(Part.from_uri(
68
+ msg["content"],
69
+ mime_type=mime_type
70
+ ))
71
+ else:
72
+ # Handle file path or bytes
73
+ parts.append(Part.from_data(
74
+ msg["content"],
75
+ mime_type=mime_type
76
+ ))
77
+
78
+ response = self.model.generate_content(
79
+ parts,
80
+ generation_config={
81
+ "temperature": self.temperature,
82
+ "top_p": 0.95,
83
+ }
84
+ )
85
+
86
+ return response.text
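
A usage sketch for VertexAIWrapper above; it requires GOOGLE_CLOUD_PROJECT and application-default credentials to be configured, and the GCS URI is a placeholder.

# Sketch: VertexAIWrapper usage (placeholder GCS URI).
from mllm_tools.vertex_ai import VertexAIWrapper

vertex = VertexAIWrapper(model_name="gemini-1.5-pro", temperature=0.4)
answer = vertex([
    {"type": "text", "content": "What happens in this clip?"},
    {"type": "video", "content": "gs://my-bucket/scenes/scene_01.mp4"},
])
print(answer)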