Spaces:

giswqs
/

scholar

Sleeping

File size: 8,704 Bytes

import os
import scholarpy
import pandas as pd
import streamlit as st
import leafmap.foliumap as leafmap
import plotly.express as px
from leafmap.common import temp_file_path

# Create the scholarpy Dsl object only when this session has none yet;
# app() later reads it back from st.session_state["dsl"].
if "dsl" not in st.session_state:
    st.session_state["dsl"] = scholarpy.Dsl()


@st.cache_data
def get_geonames():
    """Return the result of ``scholarpy.get_geonames()``.

    The function is decorated with ``st.cache_data`` so Streamlit caches the
    returned value. ``app`` passes it on as the ``geonames_df`` argument.
    """
    geonames = scholarpy.get_geonames()
    return geonames


def json_to_df(json_data, transpose=False):
    """Convert a query result into a DataFrame re-parsed from CSV text.

    Args:
        json_data: Object exposing ``as_dataframe()`` which returns a pandas
            DataFrame.
        transpose: When True, the frame is transposed first and its index is
            written out, so it comes back as an ordinary first column.

    Returns:
        The re-parsed DataFrame, or None when ``as_dataframe()`` is empty.
    """
    import io  # local import keeps this function self-contained

    df = json_data.as_dataframe()
    if df.empty:
        return None

    if transpose:
        df = df.transpose()

    # Round-trip through CSV so every value is re-parsed from text and (when
    # transposing) the index becomes a regular column. An in-memory buffer is
    # used so no temporary file can be left behind if parsing fails.
    buffer = io.StringIO()
    df.to_csv(buffer, index=transpose)
    buffer.seek(0)
    return pd.read_csv(buffer)


def annual_pubs(pubs, col="year"):
    """Count publications per distinct value of ``col``.

    Args:
        pubs: DataFrame of publications containing column ``col``, or None.
        col: Name of the column to count by.

    Returns:
        None when ``pubs`` is None; otherwise a DataFrame with a "year"
        column (the distinct values of ``col`` in sorted order) and a
        "publications" column (how many rows carry each value).
    """
    if pubs is None:
        return None

    counts = pubs[col].value_counts().sort_index()
    return pd.DataFrame({"year": counts.index, "publications": counts.values})


def annual_collaborators(pubs, col="year"):
    """Build a bar chart of summed ``authors_count`` per value of ``col``.

    Args:
        pubs: DataFrame with columns ``col`` and "authors_count", or None.
        col: Name of the column to group by.

    Returns:
        None when ``pubs`` is None; otherwise a plotly express bar figure
        with "year" on the x axis and "collaborators" on the y axis.
    """
    if pubs is None:
        return None

    # Pick the one needed column before summing. Summing the whole grouped
    # frame also tries to add up every unrelated column, which is wasted work
    # and can raise TypeError for non-summable columns in pandas >= 2.0.
    totals = pubs.groupby([col])["authors_count"].sum()
    yearly = pd.DataFrame({"year": totals.index, "collaborators": totals.values})
    return px.bar(
        yearly,
        x="year",
        y="collaborators",
    )


def annual_citations(pubs, col="year"):
    """Build a bar chart of summed ``times_cited`` per value of ``col``.

    Args:
        pubs: DataFrame with columns ``col`` and "times_cited", or None.
        col: Name of the column to group by.

    Returns:
        None when ``pubs`` is None; otherwise a plotly express bar figure
        with "year" on the x axis and "citations" on the y axis.
    """
    if pubs is None:
        return None

    # Pick the one needed column before summing. Summing the whole grouped
    # frame also tries to add up every unrelated column, which is wasted work
    # and can raise TypeError for non-summable columns in pandas >= 2.0.
    totals = pubs.groupby([col])["times_cited"].sum()
    yearly = pd.DataFrame({"year": totals.index, "citations": totals.values})
    return px.bar(
        yearly,
        x="year",
        y="citations",
    )


def the_H_function(sorted_citations_list, n=1):
    """Return the h-index for citation counts sorted in descending order.

    Ranks start at ``n``. The list is walked while the citation count at the
    current rank is at least that rank, and the last rank that satisfied the
    condition is returned (``n - 1`` when none did).

    Args:
        sorted_citations_list: Citation counts, highest first.
        n: Rank assigned to the first element; the default of 1 yields the
            usual h-index.

    Returns:
        The largest rank ``r`` such that every element up to rank ``r`` is
        ``>= `` its own rank, or ``n - 1`` for an empty input.

    Examples:
        >>> the_H_function([10, 8, 5, 4, 3])
        4
        >>> the_H_function([25, 8, 5, 3, 3])
        3
        >>> the_H_function([1000, 20])
        2
    """
    if not sorted_citations_list:
        return n - 1

    # Iterative on purpose: a recursive walk that slices the list at each step
    # is quadratic and exceeds the recursion limit on long publication lists.
    rank = n
    for cited in sorted_citations_list:
        # Written as "not >=" so values that compare False (e.g. NaN) stop
        # the walk.
        if not cited >= rank:
            break
        rank += 1
    return rank - 1


def app():
    """Render the "Search Researchers" Streamlit page.

    Reads the Dsl object from ``st.session_state["dsl"]``, asks for a
    researcher name and lets the user pick one of the matches. For the
    selected researcher the page shows, in two columns:

    * left: researcher information, the publication list with citation
      metrics (total citations, i10-index, h-index), publication counts by
      journal, and grants;
    * right: the annual statistics bar plot, a map of collaborator
      institutions, and publication counts with collaborators.

    Every table is accompanied by a download button.
    """

    st.title("Search Researchers")
    dsl = st.session_state["dsl"]
    row1_col1, row1_col2 = st.columns([1, 1])

    with row1_col1:
        name = st.text_input("Enter a researcher name:", "")

    if name:

        ids, names = dsl.search_researcher_by_name(name, return_list=True)
        if ids.count_total > 0:
            with row1_col1:
                name = st.selectbox("Select a researcher id:", names)

            if name:
                # The id is taken from the second "|"-separated field of the
                # selected option (presumably "<name> | <id>" -- TODO confirm
                # against scholarpy).
                researcher_id = name.split("|")[1].strip()
                id_info = dsl.search_researcher_by_id(researcher_id, return_df=False)

                # json_to_df returns None for an empty result, so only rename
                # the columns when there is a frame to rename.
                info_df = json_to_df(id_info, transpose=True)
                if info_df is not None:
                    info_df.rename(
                        columns={
                            info_df.columns[0]: "Type",
                            info_df.columns[1]: "Value",
                        },
                        inplace=True,
                    )
                with row1_col1:
                    st.header("Researcher Information")
                    if info_df is not None and not info_df.empty:
                        st.dataframe(info_df)
                        leafmap.st_download_button(
                            "Download data", info_df, csv_sep="\t"
                        )
                    else:
                        st.text("No information found")

                pubs = dsl.search_pubs_by_researcher_id(researcher_id)
                df = json_to_df(pubs)
                if df is not None:
                    df1, df2 = dsl.researcher_annual_stats(
                        pubs, geonames_df=get_geonames()
                    )
                    df3 = scholarpy.collaborator_locations(df2)

                    with row1_col2:
                        st.header("Researcher statistics")
                        columns = ["pubs", "collaborators", "institutions", "cities"]
                        selected_columns = st.multiselect(
                            "Select attributes to display:", columns, columns
                        )
                        if selected_columns:
                            fig = scholarpy.annual_stats_barplot(df1, selected_columns)
                            st.plotly_chart(fig)
                        leafmap.st_download_button(
                            "Download data",
                            df1,
                            file_name="data.csv",
                            csv_sep="\t",
                        )

                        st.header("Map of collaborator institutions")
                        markdown = f"""
                        - Total number of collaborator institutions: **{len(df3)}**
                        """
                        st.markdown(markdown)
                        m = leafmap.Map(
                            center=[0, 0],
                            zoom_start=1,
                            latlon_control=False,
                            draw_control=False,
                            measure_control=False,
                            locate_control=True,
                        )
                        m.add_points_from_xy(df3)
                        m.to_streamlit(height=420)
                        leafmap.st_download_button(
                            "Download data",
                            df3,
                            file_name="data.csv",
                            csv_sep="\t",
                        )

                        st.header("Publication counts with collaborators")
                        collaborators = dsl.search_researcher_collaborators(
                            researcher_id, pubs
                        )
                        markdown = f"""
                        - Total number of collaborators: **{len(collaborators)}**
                        """
                        st.markdown(markdown)
                        st.dataframe(collaborators)
                        leafmap.st_download_button(
                            "Download data",
                            collaborators,
                            file_name="data.csv",
                            csv_sep="\t",
                        )
                else:
                    st.text("No publications found")

                with row1_col1:
                    st.header("Publications")
                    if df is not None:
                        # the_H_function expects citation counts sorted from
                        # highest to lowest.
                        citations = df["times_cited"].values.tolist()
                        citations.sort(reverse=True)
                        h_index = the_H_function(citations)
                        markdown = f"""
                        - Total number of publications: **{len(df)}**
                        - Total number of citations: **{df["times_cited"].sum()}**
                        - i10-index: **{len(df[df["times_cited"]>=10])}**
                        - h-index: **{h_index}**
                        """
                        st.markdown(markdown)
                        st.dataframe(df)
                        leafmap.st_download_button(
                            "Download data", df, file_name="data.csv", csv_sep="\t"
                        )

                        if "journal.title" in df.columns:
                            st.header("Publication counts by journal")
                            journals = df["journal.title"].value_counts()
                            summary = pd.DataFrame(
                                {"Journal": journals.index, "Count": journals}
                            ).reset_index(drop=True)
                            markdown = f"""
                            - Total number of journals: **{len(summary)}**
                            """
                            st.markdown(markdown)
                            st.dataframe(summary)
                            leafmap.st_download_button(
                                "Download data",
                                summary,
                                file_name="data.csv",
                                csv_sep="\t",
                            )
                        else:
                            st.text("No journal publications")

                    else:
                        st.text("No publications found")

                    # Grants are looked up whether or not publications exist.
                    grants = dsl.search_grants_by_researcher(researcher_id)
                    grants_df = grants.as_dataframe()
                    if not grants_df.empty:
                        st.header("Grants")
                        st.dataframe(grants_df)
                        leafmap.st_download_button(
                            "Download data",
                            grants_df,
                            file_name="data.csv",
                            csv_sep="\t",
                        )
        else:
            st.text("No results found.")