# Hugging Face Space: imageomics/bioclip-2-demo (status: Running)
# File size: 5,854 Bytes

import io
import boto3
import requests
import numpy as np
import polars as pl
from PIL import Image
from botocore.config import Config
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# S3 client used to fetch the sample images.
my_config = Config(region_name="us-east-1")
s3_client = boto3.client("s3", config=my_config)

# Base URLs of the EOL and GBIF taxon pages that are linked for more info.
EOL_URL = "https://eol.org/pages/"
GBIF_URL = "https://gbif.org/species/"

# Taxonomic ranks; functions in this module receive a rank as an index into this list.
RANKS = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]

# Seconds to wait for the sample-image download before giving up, so a stalled
# connection cannot hang the request indefinitely.
_IMAGE_REQUEST_TIMEOUT = 30


def _build_ref_page(pred_taxon, full_name, gbif_taxon_id, eol_page_id, is_exact):
    '''
    Function to build the HTML snippet linking to the EOL and/or GBIF entry of a sample.

    Parameters:
    -----------
    pred_taxon : str
        Predicted taxon of the uploaded image.
    full_name : str
        Full taxonomic name of the selected sample (shown when the match is not exact).
    gbif_taxon_id : str or None
        GBIF taxon ID of the sample; a falsy value means no GBIF link is available.
    eol_page_id : str or None
        EOL page ID of the sample; a falsy value means no EOL link is available.
    is_exact : bool
        Whether the sample matches the predicted taxon exactly (rather than a lower rank within it).

    Returns:
    --------
    ref_page : str
        HTML paragraph with whichever links are available, or a short notice if neither ID is present.
    '''
    gbif_url = GBIF_URL + gbif_taxon_id if gbif_taxon_id else None
    eol_url = EOL_URL + eol_page_id if eol_page_id else None

    if gbif_url and eol_url:
        if is_exact:
            return f"<p>Check out the <a href={eol_url} target='_blank'>EOL</a> or <a href={gbif_url} target='_blank'>GBIF</a> entry for {pred_taxon} to learn more.</p>"
        return f"<p>Check out an example entry within {pred_taxon} to learn more: {full_name} at <a href={eol_url} target='_blank'>EOL</a> or <a href={gbif_url} target='_blank'>GBIF</a>.</p>"

    if gbif_url:
        if is_exact:
            return f"<p>Check out the <a href={gbif_url} target='_blank'>GBIF</a> entry for {pred_taxon} to learn more.</p>"
        return f"<p>Check out an example GBIF entry within {pred_taxon} to learn more: <a href={gbif_url} target='_blank'>{full_name}</a>.</p>"

    if eol_url:
        if is_exact:
            return f"<p>Check out the <a href={eol_url} target='_blank'>EOL</a> entry for {pred_taxon} to learn more.</p>"
        return f"<p>Check out an example EOL entry within {pred_taxon} to learn more: <a href={eol_url} target='_blank'>{full_name}</a>.</p>"

    # Neither ID is available: still let the caller show the image instead of
    # failing on a None concatenation.
    return f"<p>Sorry, we do not have a GBIF or EOL entry for {pred_taxon}.</p>"


def get_sample(df, pred_taxon, rank):
    '''
    Function to retrieve a sample image of the predicted taxon and GBIF or EOL page link for more info.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    img : PIL.Image or None
        Sample image of predicted taxon for display; None if no sample exists or retrieval failed.
    ref_page : str
        HTML snippet linking to the GBIF and/or EOL page for the taxon (may be a lower rank,
        e.g., species sample), or a plain message explaining why no image could be returned.
    '''
    logger.info(f"Getting sample for taxon: {pred_taxon} at rank: {rank}")
    try:
        sample = get_sample_data(df, pred_taxon, rank)
    except Exception as e:
        logger.error(f"Error retrieving sample data: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."

    # Check the filepath before unpacking the rest, so a "no sample" result is
    # reported as such regardless of how many placeholder values accompany it.
    filepath = sample[0]
    if filepath is None:
        logger.warning(f"No sample image found for taxon: {pred_taxon}")
        return None, f"Sorry, our GBIF and EOL images do not include {pred_taxon}."
    _, gbif_taxon_id, eol_page_id, full_name, is_exact = sample

    # Get sample image of selected individual
    try:
        img_src = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': 'treeoflife-200m-sample-images', 'Key': filepath},
        )
        img_resp = requests.get(img_src, timeout=_IMAGE_REQUEST_TIMEOUT)
        # Fail on HTTP errors here rather than handing an error body to PIL.
        img_resp.raise_for_status()
        img = Image.open(io.BytesIO(img_resp.content))
    except Exception as e:
        logger.error(f"Error retrieving sample image: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."

    ref_page = _build_ref_page(pred_taxon, full_name, gbif_taxon_id, eol_page_id, is_exact)
    logger.info(f"Successfully retrieved sample image and page for {pred_taxon}")
    return img, ref_page

def get_sample_data(df, pred_taxon, rank):
    '''
    Function to randomly select a sample individual of the given taxon and return its
    image filepath, reference page IDs, and full taxonomic name.

    Rows whose ranks below `rank` are all empty (an exact match for the taxon) are
    preferred; if there are none, any row within the taxon is sampled instead.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image, as space-separated names ordered as in RANKS.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    filepath : str or None
        Filepath of selected sample image for predicted taxon; None if no row matches.
    gbif_taxon_id : str or None
        GBIF page ID associated with the selected sample for more information.
    eol_page_id : str or None
        EOL page ID associated with the selected sample for more information.
    full_name : str
        Full taxonomic name of the selected sample (empty string if no row matches).
    is_exact : bool
        Flag indicating if the match is exact (i.e., with empty lower ranks).

    Raises:
    -------
    IndexError
        If pred_taxon contains fewer space-separated names than rank + 1.
    '''
    # Narrow down to rows that agree with the prediction at every rank up to `rank`.
    taxon_names = pred_taxon.split(" ")
    for idx, rank_name in enumerate(RANKS[:rank + 1]):
        df = df.filter(pl.col(rank_name) == taxon_names[idx])

    if df.height == 0:
        # Always return all five values so callers can unpack uniformly.
        return None, None, None, "", False

    # First, try to find entries with empty lower ranks
    exact_df = df
    for lower_rank in RANKS[rank + 1:]:
        exact_df = exact_df.filter((pl.col(lower_rank).is_null()) | (pl.col(lower_rank) == ""))

    # If no exact matches, fall back to any entry within the specified taxon
    is_exact = exact_df.height > 0
    chosen = (exact_df if is_exact else df).sample()

    # Exact matches are named down to `rank`; other samples use every rank they have.
    # Null or empty ranks are skipped so the join cannot fail or leave double spaces.
    name_ranks = RANKS[:rank + 1] if is_exact else RANKS
    full_name = " ".join(part for part in chosen.select(name_ranks).row(0) if part)

    return (
        chosen["file_path"][0],
        chosen["gbif_taxon_id"].cast(pl.String)[0],
        chosen["eol_page_id"].cast(pl.String)[0],
        full_name,
        is_exact,
    )