watsonx.ai_Function_Deployment_MNB_V2

Sleeping

App Files Files Community

MilanM committed on Jul 20

Commit

e22be34

verified ·

1 Parent(s): af4cfe4

Delete new_templates

Browse files

Files changed (2) hide show

new_templates/vision_llm_text_extraction.py +0 -228
new_templates/website_monitor_function_v2 +0 -123

new_templates/vision_llm_text_extraction.py DELETED Viewed

@@ -1,228 +0,0 @@
-def extract_text_from_images_deployable():
-    """
-    Deployable watsonx.ai function that extracts text from multiple images/PDFs using foundation models.
-    Expected input payload:
-    {
-        "input_data": [{
-            "values": [["<image_url_1>", "<image_url_2>", ...], ["<optional_extraction_prompt>"]]
-        }]
-    }
-    Returns:
-    {
-        "predictions": [{
-            "fields": ["extracted_texts"],
-            "values": [[["<extracted_text_1>", "<extracted_text_2>", ...]]]
-        }]
-    }
-    """
-    import mimetypes
-    import base64
-    import requests
-    from urllib.parse import urlparse
-    import fitz
-    from ibm_watsonx_ai import APIClient, Credentials
-    from ibm_watsonx_ai.foundation_models import ModelInference
-    # Initialize watsonx client (these should be set as environment variables)
-    import os
-    WX_URL = os.getenv('WX_URL', "")
-    WX_APIKEY = os.getenv('WX_APIKEY', "")
-    PROJECT_ID = os.getenv('PROJECT_ID', "")
-    CHAT_MODEL = os.getenv('CHAT_MODEL', 'mistralai/mistral-medium-2505')
-    DEFAULT_EXTRACTION_PROMPT = '''Extract all text within the image in a markdown form as close as possible to the original, free of any additional outputs that are not in the text, including descriptions of the element, comments about making outputs, etc.'''
-    wx_credentials = Credentials(
-        url=WX_URL,
-        api_key=WX_APIKEY
-    )
-    client = APIClient(credentials=wx_credentials, project_id=PROJECT_ID)
-    def create_data_url(source, filename=None):
-        """Create data URL from bytes, file path, or URL. Returns list for PDFs."""
-        if isinstance(source, str) and source.startswith(('http://', 'https://')):
-            content = requests.get(source).content
-            filename = filename or urlparse(source).path.split('/')[-1] or 'file'
-        elif isinstance(source, str):
-            with open(source, 'rb') as f:
-                content = f.read()
-            filename = filename or source
-        else:
-            content = source
-            if not filename:
-                raise ValueError("filename required for bytes input")
-        mime_type = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
-        if mime_type == 'application/pdf':
-            doc = fitz.open(stream=content, filetype="pdf")
-            result = []
-            for page in doc:
-                pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
-                img_data = pix.tobytes("png")
-                encoded = base64.b64encode(img_data).decode('utf-8')
-                result.append(f"data:image/png;base64,{encoded}")
-            doc.close()
-            return result
-        encoded = base64.b64encode(content).decode('utf-8')
-        return f"data:{mime_type};base64,{encoded}"
-    def score(payload):
-        """
-        Score function called for each prediction request.
-        Args:
-            payload: Input payload containing list of image URLs/paths and optional extraction prompt
-        Returns:
-            Dictionary with predictions containing list of extracted texts
-        """
-        try:
-            # Extract input data from payload
-            input_values = payload.get("input_data")[0].get("values")
-            image_urls = input_values[0]  # List of URLs
-            extraction_prompt = input_values[1] if len(input_values) > 1 else DEFAULT_EXTRACTION_PROMPT
-            # Model parameters
-            params = {
-                "temperature": 1.0,
-                "max_tokens": 6553,
-                "top_p": 1.0,
-                "stop": [
-                    "</s>",
-                    "<|end_of_text|>"
-                ]
-            }
-            extracted_texts = []
-            # Process each image URL
-            for image_url in image_urls:
-                # Convert image to data URL
-                image_data_url = create_data_url(image_url)
-                # Handle PDF case (multiple pages)
-                if isinstance(image_data_url, list):
-                    all_extracted_text = []
-                    for page_num, page_url in enumerate(image_data_url):
-                        messages = [
-                            {
-                                "role": "user",
-                                "content": [
-                                    {
-                                        "type": "text",
-                                        "text": f"Page {page_num + 1}:\n{extraction_prompt}"
-                                    },
-                                    {
-                                        "type": "image_url",
-                                        "image_url": {
-                                            "url": page_url,
-                                        }
-                                    }
-                                ]
-                            }
-                        ]
-                        chat_model = ModelInference(api_client=client, model_id=CHAT_MODEL, params=params)
-                        model_response = chat_model.chat(messages=messages)
-                        page_text = model_response["choices"][0]["message"]["content"]
-                        all_extracted_text.append(f"## Page {page_num + 1}\n\n{page_text}")
-                    extracted_text = "\n\n".join(all_extracted_text)
-                else:
-                    # Single image case
-                    messages = [
-                        {
-                            "role": "user",
-                            "content": [
-                                {
-                                    "type": "text",
-                                    "text": extraction_prompt
-                                },
-                                {
-                                    "type": "image_url",
-                                    "image_url": {
-                                        "url": image_data_url,
-                                    }
-                                }
-                            ]
-                        }
-                    ]
-                    chat_model = ModelInference(api_client=client, model_id=CHAT_MODEL, params=params)
-                    model_response = chat_model.chat(messages=messages)
-                    extracted_text = model_response["choices"][0]["message"]["content"]
-                extracted_texts.append(extracted_text)
-            # Return in required format
-            return {
-                'predictions': [{
-                    'fields': ['extracted_texts'],
-                    'values': [extracted_texts]
-                }]
-            }
-        except Exception as e:
-            # Return error in predictions format
-            return {
-                'predictions': [{
-                    'fields': ['extracted_texts', 'error'],
-                    'values': [[], str(e)]
-                }]
-            }
-    return score
-# Create the deployable score function. Calling the factory builds the watsonx.ai client from the WX_URL / WX_APIKEY / PROJECT_ID environment variables and returns the inner score(payload) closure.
-score = extract_text_from_images_deployable()
-input_schema_example = [  # Example input schema: one struct holding the image/PDF sources and the optional prompt
-    {
-        "id": "1",
-        "type": "struct",
-        "fields": [
-            {
-                "name": "image_urls",
-                "type": "array<string>",
-                "nullable": False,
-                "metadata": {
-                    "description": "List of image/PDF URLs or file paths to process"
-                },
-            },
-            {
-                "name": "extraction_prompt",
-                "type": "string",
-                "nullable": True,
-                "metadata": {
-                    "description": "Optional custom extraction prompt. Uses default if null."
-                },
-            },
-        ],
-    }
-]
-output_schema_example = [  # Example output schema: a single extracted_texts field; presumably supplied as deployment metadata (not referenced elsewhere in the visible code)
-    {
-        "id": "1",
-        "type": "struct",
-        "fields": [
-            {
-                "name": "extracted_texts",
-                "type": "array<string>",
-                "nullable": False,
-                "metadata": {
-                    "description": "Array of extracted text strings, one per input URL in same order"
-                },
-            }
-        ],
-    }
-]

new_templates/website_monitor_function_v2 DELETED Viewed

@@ -1,123 +0,0 @@
-def website_monitor_function_v2():
-    import requests
-    import threading
-    import time
-    from datetime import datetime
-    from difflib import SequenceMatcher
-    import re
-    # Configuration
-    URL = "<add your target url here>"  # Replace with your target URL
-    CHECK_INTERVAL = 4  # in seconds
-    CHANGE_THRESHOLD = 0.01  # 1% difference threshold
-    # Shared state variables
-    current_state = "Initializing..."
-    previous_html = None
-    last_check_time = None
-    monitor_thread = None
-    def extract_text_content(html):
-        """Extract text content from HTML, removing tags and normalizing whitespace"""
-        # Remove HTML tags
-        text = re.sub(r'<[^>]+>', ' ', html)
-        # Normalize whitespace
-        text = ' '.join(text.split())
-        return text.lower().strip()
-    def calculate_text_difference(text1, text2):
-        """Calculate percentage difference between two text strings"""
-        if not text1 and not text2:
-            return 0.0
-        if not text1 or not text2:
-            return 1.0
-        similarity = SequenceMatcher(None, text1, text2).ratio()
-        sim_score = 1.0 - similarity
-        return sim_score
-    def fetch_and_compare():
-        """Fetch URL content and compare with previous version"""
-        nonlocal current_state, previous_html, last_check_time
-        try:
-            response = requests.get(URL, timeout=30)
-            response.raise_for_status()
-            current_html = response.text
-            current_time = datetime.now()
-            if previous_html is None:
-                # First run
-                previous_html = current_html
-                last_check_time = current_time
-                current_state = f"Initial check completed at {current_time.strftime('%Y-%m-%d %H:%M:%S')}"
-                return
-            # Extract and compare text content
-            previous_text = extract_text_content(previous_html)
-            current_text = extract_text_content(current_html)
-            difference_ratio = calculate_text_difference(previous_text, current_text)
-            if difference_ratio >= CHANGE_THRESHOLD:
-                current_state = f"Changes occurred between {last_check_time.strftime('%Y-%m-%d %H:%M:%S')} - {current_time.strftime('%Y-%m-%d %H:%M:%S')}"
-                previous_html = current_html
-            else:
-                current_state = f"No changes occurred since {last_check_time.strftime('%Y-%m-%d %H:%M:%S')}"
-            last_check_time = current_time
-        except requests.RequestException as e:
-            current_state = f"Error fetching URL at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: {str(e)}"
-        except Exception as e:
-            current_state = f"Unexpected error at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: {str(e)}"
-    def monitor_loop():
-        """Continuous monitoring loop"""
-        while True:
-            fetch_and_compare()
-            time.sleep(CHECK_INTERVAL)
-    def start_monitoring():
-        """Start the monitoring thread if not already running"""
-        nonlocal monitor_thread
-        if monitor_thread is None or not monitor_thread.is_alive():
-            monitor_thread = threading.Thread(target=monitor_loop, daemon=True)
-            monitor_thread.start()
-    # Start monitoring when function is deployed
-    start_monitoring()
-    def score(input_data):
-        """Score function that returns current monitoring state"""
-        try:
-            # Extract any parameters from input if needed (optional)
-            # For now, just return current state
-            score_response = {
-                'predictions': [{
-                    'fields': ['monitoring_state', 'check_interval_seconds', 'target_url', 'last_updated'],
-                    'values': [[
-                        current_state,
-                        CHECK_INTERVAL,
-                        URL,
-                        datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-                    ]]
-                }]
-            }
-            return score_response
-        except Exception as e:
-            error_response = {
-                'predictions': [{
-                    'fields': ['error'],
-                    'values': [[f"Error in score function: {str(e)}"]]
-                }]
-            }
-            return error_response
-    return score
-# Create the deployable score function. Calling the factory also starts the background daemon thread that polls URL, so monitoring begins as soon as this line runs.
-score = website_monitor_function_v2()