# Hugging Face Space: giswqs/scholar (status: Sleeping)
# File size: 10,798 Bytes

import os
import json
import dimcli
import pandas as pd
import plotly.express as px
import streamlit as st
import scholarpy
import leafmap.foliumap as leafmap
import datetime

# Upper bound used by the year-range sliders on this page.
current_year = datetime.date.today().year

# Create the scholarpy Dsl client only if this session does not hold one yet.
if "dsl" not in st.session_state:
    st.session_state["dsl"] = scholarpy.Dsl()

# Output folder for the CSV files written by save() and loaded by read().
FOLDER_NAME = "data"
# exist_ok=True avoids the check-then-create race of exists() + mkdir() when
# several sessions import this module at the same time.
os.makedirs(FOLDER_NAME, exist_ok=True)


def save(df, filename_dot_csv):
    """Write ``df`` as CSV (index omitted) to ``FOLDER_NAME/<filename_dot_csv>``.

    Args:
        df: DataFrame to serialise.
        filename_dot_csv: File name, including the ``.csv`` suffix, relative
            to ``FOLDER_NAME``.
    """
    target = "/".join((FOLDER_NAME, filename_dot_csv))
    df.to_csv(target, index=False)


def read(filename_dot_csv):
    """Load ``FOLDER_NAME/<filename_dot_csv>`` and return it as a DataFrame.

    Args:
        filename_dot_csv: File name, including the ``.csv`` suffix, relative
            to ``FOLDER_NAME``.
    """
    source = "/".join((FOLDER_NAME, filename_dot_csv))
    return pd.read_csv(source)


@st.cache_data
def get_token():
    """Return the value of the ``DIM_TOKEN`` environment variable, or ``None`` if unset."""
    return os.getenv("DIM_TOKEN")


@st.cache_data
def get_journals():
    """Load and return the parsed contents of ``data/journals.json``.

    ``app()`` uses the result as a mapping of category name to a mapping of
    journal title to journal id.

    Returns:
        The object decoded from the JSON file.

    Raises:
        FileNotFoundError: If ``data/journals.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; without an explicit encoding, open()
    # falls back to the platform locale and can mis-decode non-ASCII titles.
    with open("data/journals.json", encoding="utf-8") as f:
        return json.load(f)


@st.cache_data
def read_excel(sheet_name):
    """Read one sheet of ``data/journals.xlsx`` and index it by its ``Rank`` column.

    Args:
        sheet_name: Name of the worksheet to load.

    Returns:
        DataFrame for the sheet with ``Rank`` as the index.
    """
    sheet = pd.read_excel(
        "data/journals.xlsx",
        sheet_name=sheet_name,
        index_col=False,
        engine="openpyxl",
    )
    return sheet.set_index("Rank")


def app():
    """Render the "Search Journals" Streamlit page.

    The page reads the Dsl client from ``st.session_state["dsl"]`` and offers
    two modes, chosen with a radio button:

    * "Search by journal title": look up journals by title, pick one, and
      list its publications (optionally filtered by keyword, scope and
      year range), with a download button.
    * "List Google Scholar journal categories": pick a category and journal
      from ``get_journals()``, query its publications for a year range, and
      show publications, authors, affiliations and ID statistics. The sorted
      ORCID list is stored in ``st.session_state["orcids"]``.
    """
    st.title("Search Journals")
    dsl = st.session_state["dsl"]
    search_type = st.radio(
        "Select a search type",
        ["Search by journal title", "List Google Scholar journal categories"],
    )

    if search_type == "Search by journal title":
        row1_col1, row1_col2, row1_col3, _ = st.columns([1, 1, 2, 1])
        with row1_col1:
            name = st.text_input("Enter a journal title")

        with row1_col2:
            exact_match = st.checkbox("Exact match")

        with row1_col3:
            options = [
                "book",
                "book_series",
                "proceeding",
                "journal",
                "preprint_platform",
            ]
            types = st.multiselect(
                "Select journal types", options, ["journal", "book_series"]
            )

        if name:
            result = dsl.search_journal_by_title(name, exact_match=exact_match)
            titles = result.as_dataframe() if result is not None else None
            if titles is None or titles.empty:
                # An empty result may lack the "type"/"title" columns, so it
                # must not be filtered or sorted.
                titles = pd.DataFrame()
            else:
                # Chain filter + sort so a new frame is produced instead of
                # sorting a slice in place (avoids SettingWithCopyWarning).
                titles = titles[titles["type"].isin(types)].sort_values("title")

            if not titles.empty:

                markdown = f"""
                Returned Journals: {len(titles)}        
                
                """
                st.markdown(markdown)

                st.dataframe(titles)
                # Selection labels are kept in a separate Series so the
                # displayed frame is not mutated; the id is the first field.
                uids = titles["id"] + " | " + titles["type"] + " | " + titles["title"]

                row2_col1, row2_col2, row2_col3, row2_col4, row2_col5 = st.columns(
                    [2.4, 1, 0.6, 1, 1]
                )

                with row2_col1:
                    title = st.selectbox("Select a journal title", uids.tolist())

                with row2_col2:
                    keyword = st.text_input("Enter a keyword to search for")

                with row2_col3:
                    # Distinct name: this flag applies to the keyword search,
                    # not to the journal-title search above.
                    keyword_exact_match = st.checkbox("Exact match", True)

                with row2_col4:
                    scope = st.selectbox(
                        "Select a search scope",
                        [
                            "authors",
                            "concepts",
                            "full_data",
                            "full_data_exact",
                            "title_abstract_only",
                            "title_only",
                        ],
                        index=5,
                    )

                with row2_col5:
                    years = st.slider(
                        "Select the start and end year:",
                        1950,
                        current_year,
                        (1980, current_year),
                    )

                if title:
                    journal_id = title.split(" | ")[0]
                    if keyword:
                        pubs = dsl.search_pubs_by_keyword(
                            keyword,
                            keyword_exact_match,
                            scope,
                            years[0],
                            years[1],
                            journal_id,
                        )
                    else:
                        pubs = dsl.search_pubs_by_journal_id(
                            journal_id, years[0], years[1]
                        )
                    # Treat a missing result like an empty one instead of
                    # failing on attribute access.
                    pubs_df = pubs.as_dataframe() if pubs is not None else None
                    if pubs_df is not None and (not pubs_df.empty):
                        st.write(
                            f"Total number of publications: {pubs.count_total:,}. "
                            f"Display {min(pubs.count_total, 1000)} publications below."
                        )
                        try:
                            st.dataframe(pubs_df)
                        except Exception:
                            # Best-effort fallback: if Streamlit cannot render
                            # the frame, show scholarpy's own conversion of
                            # the result instead.
                            st.dataframe(scholarpy.json_to_df(pubs))
                        leafmap.st_download_button(
                            "Download data", pubs_df, csv_sep="\t"
                        )
            else:
                st.text("No results found")

    elif search_type == "List Google Scholar journal categories":

        st.markdown(
            """
        The journal categories are adopted from [Google Scholar](https://scholar.google.com/citations?view_op=top_venues&hl=en&inst=9897619243961157265).
        See the list of journals [here](https://docs.google.com/spreadsheets/d/1uCEi3TsJCWl9QEZimvjlM8wjt7hNq3QvMqHGeT44HXQ/edit?usp=sharing).
        """
        )

        # Reset the ORCID list; it is filled again below once results arrive.
        st.session_state["orcids"] = None

        categories = get_journals()

        row1_col1, row1_col2, _, row1_col3 = st.columns([1, 1, 0.05, 1])

        with row1_col1:
            category = st.selectbox("Select a category:", categories.keys())

        # Bind journal up front so it is defined even when no category is
        # selected (previously a NameError).
        journal = None
        if category:
            with row1_col2:
                journal = st.selectbox("Select a journal:", categories[category].keys())

        with row1_col3:
            years = st.slider(
                "Select the start and end year:",
                1950,
                current_year,
                (1980, current_year),
            )

        if not journal:
            # Nothing to look up; categories[category][journal] would raise.
            st.warning("No journal selected")
            return

        metrics = read_excel(sheet_name=category)
        with st.expander("Show journal metrics"):
            st.dataframe(metrics)

        # Must stay in sync with the counts reported to the user below.
        max_pubs = 1000
        q_template = """search publications where
                journal.{}="{}" and
                year>={} and 
                year<={}
                return publications[id+title+doi+year+authors+type+pages+journal+issue+volume+altmetric+times_cited]
                limit {}"""

        journal_id = categories[category][journal]
        # Query by id when the stored value starts with "jour"; otherwise
        # fall back to matching on the journal title.
        if journal_id is not None and str(journal_id).startswith("jour"):
            field, value = "id", journal_id
        else:
            field, value = "title", journal
        q = q_template.format(field, value, years[0], years[1], max_pubs)

        pubs = dsl.query(q)
        # NOTE(review): count_total is treated as 0 when it is None so a
        # result without a count yields the warning instead of a TypeError.
        total = pubs.count_total or 0
        if total > 0:
            shown = min(total, max_pubs)

            st.header("Publications")
            st.write(
                f"Total number of publications: {total:,}. "
                f"Display {shown:,} publications below."
            )
            df_pubs = pubs.as_dataframe()
            # NOTE(review): the CSV round-trip presumably flattens nested
            # values into plain strings before display - confirm.
            save(df_pubs, "publications.csv")
            df_pubs = read("publications.csv")
            st.dataframe(df_pubs)

            st.header("Authors")
            authors = pubs.as_dataframe_authors()
            st.write(
                f"Total number of authors of the {shown:,} pubs shown above: "
                f"{authors.shape[0]:,}"
            )
            save(authors, "authors.csv")
            df_authors = read("authors.csv")
            st.dataframe(df_authors)

            # Characters 2..20 of each non-null value are kept; this assumes
            # the round-tripped values look like "['0000-0000-0000-0000']"
            # - TODO confirm. De-duplicated after slicing, then sorted.
            st.session_state["orcids"] = sorted(
                {value[2:21] for value in df_authors["orcid"].dropna()}
            )

            st.header("Affiliations")
            affiliations = pubs.as_dataframe_authors_affiliations()
            st.write(
                f"Total number of affiliations of the {shown:,} pubs shown above: "
                f"{affiliations.shape[0]:,}"
            )
            save(affiliations, "affiliations.csv")
            df_affiliations = read("affiliations.csv")
            st.dataframe(df_affiliations)

            researchers = authors.query("researcher_id!=''")
            df_researchers = pd.DataFrame(
                {
                    "measure": [
                        "Authors in total (non unique)",
                        "Authors with a researcher ID",
                        "Authors with a researcher ID (unique)",
                    ],
                    "count": [
                        len(authors),
                        len(researchers),
                        researchers["researcher_id"].nunique(),
                    ],
                }
            )
            fig_researchers = px.bar(
                df_researchers,
                x="measure",
                y="count",
                title=f"Author Research ID stats for {journal} ({years[0]}-{years[1]})",
            )

            authors_with_orcid = authors.query("orcid!=''")
            df_orcids = pd.DataFrame(
                {
                    "measure": [
                        "Authors in total (non unique)",
                        "Authors with a ORCID",
                        "Authors with a ORCID (unique)",
                    ],
                    "count": [
                        len(authors),
                        len(authors_with_orcid),
                        authors_with_orcid["orcid"].nunique(),
                    ],
                }
            )
            fig_orcids = px.bar(
                df_orcids,
                x="measure",
                y="count",
                title=f"Author ORCID stats for {journal} ({years[0]}-{years[1]})",
            )

            st.header("Stats")

            stats_col1, stats_col2 = st.columns(2)
            with stats_col1:
                st.plotly_chart(fig_researchers)
            with stats_col2:
                st.plotly_chart(fig_orcids)

        else:
            st.warning("No publications found")