File size: 5,854 Bytes
d4ca384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb3900a
d4ca384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb3900a
 
d4ca384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb3900a
d4ca384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd1374f
d4ca384
 
 
 
 
 
cb3900a
d4ca384
 
 
 
cb3900a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import io
import boto3
import requests
import numpy as np
import polars as pl
from PIL import Image
from botocore.config import Config
import logging

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# S3 for sample images
# The client is created once at import time and reused for every presigned-URL request.
my_config = Config(
    region_name='us-east-1'
)
s3_client = boto3.client('s3', config=my_config)

# Base URLs for EOL and GBIF taxon pages; a page/taxon ID is appended to build the link.
EOL_URL = "https://eol.org/pages/"
GBIF_URL = "https://gbif.org/species/"
# Taxonomic rank column names, ordered from most general to most specific.
# The integer `rank` arguments used in this module are indices into this list.
RANKS = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]

def get_sample(df, pred_taxon, rank):
    '''
    Function to retrieve a sample image of the predicted taxon and an HTML snippet linking to
    its GBIF and/or EOL page for more info.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    img : PIL.Image or None
        Sample image of predicted taxon for display. None if no sample exists or retrieval failed.
    ref_page : str
        HTML snippet linking to the GBIF and/or EOL page for the taxon (may be a lower rank,
        e.g., species sample). When img is None, this is a plain-text message explaining why.
    '''
    logger.info(f"Getting sample for taxon: {pred_taxon} at rank: {rank}")
    try:
        sample = get_sample_data(df, pred_taxon, rank)
    except Exception as e:
        logger.error(f"Error retrieving sample data: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."

    # Check for "no match" BEFORE unpacking, so a short "not found" tuple is reported as a
    # missing taxon rather than as an unpacking error.
    if not sample or sample[0] is None:
        logger.warning(f"No sample image found for taxon: {pred_taxon}")
        return None, f"Sorry, our GBIF and EOL images do not include {pred_taxon}."
    filepath, gbif_taxon_id, eol_page_id, full_name, is_exact = sample

    # Get sample image of selected individual
    try:
        img_src = s3_client.generate_presigned_url('get_object',
                                                   Params={'Bucket': 'treeoflife-200m-sample-images',
                                                           'Key': filepath}
                                                   )
        # Bound the wait so a stalled download cannot hang the caller indefinitely.
        img_resp = requests.get(img_src, timeout=30)
        # Surface HTTP errors (e.g., 403/404) directly instead of handing an error body to PIL.
        img_resp.raise_for_status()
        img = Image.open(io.BytesIO(img_resp.content))
    except Exception as e:
        logger.error(f"Error retrieving sample image: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."

    ref_page = _build_ref_page(pred_taxon, gbif_taxon_id, eol_page_id, full_name, is_exact)
    logger.info(f"Successfully retrieved sample image and page for {pred_taxon}")
    return img, ref_page

def _build_ref_page(pred_taxon, gbif_taxon_id, eol_page_id, full_name, is_exact):
    '''
    Helper to build the HTML snippet pointing to the EOL and/or GBIF entry of a sample.

    Parameters:
    -----------
    pred_taxon : str
        Predicted taxon of the uploaded image.
    gbif_taxon_id : str or None
        GBIF taxon ID of the sample; falsy when unavailable.
    eol_page_id : str or None
        EOL page ID of the sample; falsy when unavailable.
    full_name : str
        Full taxonomic name of the selected sample.
    is_exact : bool
        True when the sample is the predicted taxon itself, False when it is an example within it.

    Returns:
    --------
    ref_page : str
        HTML paragraph with the available link(s), or a note that no entry is available.
    '''
    # Missing IDs map to None so that no link is ever built from a missing ID.
    gbif_url = f"{GBIF_URL}{gbif_taxon_id}" if gbif_taxon_id else None
    eol_url = f"{EOL_URL}{eol_page_id}" if eol_page_id else None

    if gbif_url and eol_url:
        if is_exact:
            return f"<p>Check out the <a href={eol_url} target='_blank'>EOL</a> or <a href={gbif_url} target='_blank'>GBIF</a> entry for {pred_taxon} to learn more.</p>"
        return f"<p>Check out an example entry within {pred_taxon} to learn more: {full_name} at <a href={eol_url} target='_blank'>EOL</a> or <a href={gbif_url} target='_blank'>GBIF</a>.</p>"
    if gbif_url:
        if is_exact:
            return f"<p>Check out the <a href={gbif_url} target='_blank'>GBIF</a> entry for {pred_taxon} to learn more.</p>"
        return f"<p>Check out an example GBIF entry within {pred_taxon} to learn more: <a href={gbif_url} target='_blank'>{full_name}</a>.</p>"
    if eol_url:
        if is_exact:
            return f"<p>Check out the <a href={eol_url} target='_blank'>EOL</a> entry for {pred_taxon} to learn more.</p>"
        return f"<p>Check out an example EOL entry within {pred_taxon} to learn more: <a href={eol_url} target='_blank'>{full_name}</a>.</p>"
    # Neither ID is available: still let the caller show the image, just without a link.
    return f"<p>Sorry, we do not have an EOL or GBIF entry to link for {pred_taxon}.</p>"

def get_sample_data(df, pred_taxon, rank):
    '''
    Function to randomly select a sample individual of the given taxon and provide its image
    filepath, GBIF/EOL IDs, and full taxonomic name.

    Samples whose ranks below the requested one are all empty (i.e., labeled exactly at the
    predicted taxon) are preferred; otherwise any sample within the taxon is used.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image: space-separated names, one per rank from
        kingdom down to the chosen rank.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    filepath : str or None
        Filepath of selected sample image for predicted taxon. None if no sample matches, in
        which case both IDs are None, full_name is "" and is_exact is False.
    gbif_taxon_id: str or None
        GBIF page ID associated with the selected sample for more information.
    eol_page_id : str or None
        EOL page ID associated with the selected sample for more information.
    full_name : str
        Full taxonomic name of the selected sample (empty ranks are skipped).
    is_exact : bool
        Flag indicating if the match is exact (i.e., with empty lower ranks).

    Raises:
    -------
    IndexError
        If pred_taxon contains fewer than rank + 1 space-separated names.
    '''
    # Split once; indexing (rather than zip) keeps the IndexError on a too-short pred_taxon.
    taxon_names = pred_taxon.split(" ")
    for idx in range(rank + 1):
        df = df.filter(pl.col(RANKS[idx]) == taxon_names[idx])

    if df.height == 0:
        # Same 5-element shape as the success path so callers can always unpack it.
        # None (not NaN) marks the missing IDs because NaN is truthy.
        return None, None, None, "", False

    # First, try to find entries with empty lower ranks
    exact_df = df
    for lower_rank in RANKS[rank + 1:]:
        exact_df = exact_df.filter(pl.col(lower_rank).is_null() | (pl.col(lower_rank) == ""))

    # If no exact matches, fall back to any entry within the specified taxon
    is_exact = exact_df.height > 0
    chosen = (exact_df if is_exact else df).sample(n=1)

    # Skip null/empty ranks: joining None raises TypeError and "" would leave double spaces.
    # For exact matches the lower ranks are empty, so this yields just the names up to `rank`.
    full_name = " ".join(name for name in chosen.select(RANKS).row(0) if name)

    return (
        chosen["file_path"][0],
        chosen["gbif_taxon_id"].cast(pl.String)[0],
        chosen["eol_page_id"].cast(pl.String)[0],
        full_name,
        is_exact,
    )