scholar / apps /researcher.py
giswqs's picture
Fix temp file issue
d3021f0
import io
import os

import leafmap.foliumap as leafmap
import pandas as pd
import plotly.express as px
import scholarpy
import streamlit as st
from leafmap.common import temp_file_path
# Create the scholarpy Dsl object only when the session does not already hold
# one, and keep it in Streamlit's session state under the "dsl" key so that
# app() can read it back instead of constructing a new object.
if "dsl" not in st.session_state:
    st.session_state["dsl"] = scholarpy.Dsl()
@st.cache_data
def get_geonames():
    """Return whatever ``scholarpy.get_geonames()`` produces.

    The function is wrapped in ``st.cache_data``, so Streamlit stores the
    returned value and serves it from its cache on later calls.
    """
    geonames = scholarpy.get_geonames()
    return geonames
def json_to_df(json_data, transpose=False):
    """Convert a query result to a DataFrame normalised by a CSV round trip.

    The frame obtained from ``json_data.as_dataframe()`` is serialised to CSV
    text and parsed back with ``pd.read_csv``, so the caller receives exactly
    what re-reading an exported CSV would give. The round trip happens in
    memory, so no temporary file has to be created or cleaned up (the
    previous on-disk version left the file behind whenever writing or
    parsing raised).

    Args:
        json_data: Any object exposing an ``as_dataframe()`` method that
            returns a pandas DataFrame.
        transpose: When True, the frame is transposed first and its index
            (the former column labels) is written as the first CSV column.
            When False, the index is not written.

    Returns:
        The re-parsed DataFrame, or ``None`` when ``as_dataframe()`` yields
        an empty frame.
    """
    df = json_data.as_dataframe()
    if df.empty:
        return None

    if transpose:
        df = df.transpose()

    # The index is only meaningful after transposing, hence index=transpose.
    buffer = io.StringIO()
    df.to_csv(buffer, index=transpose)
    buffer.seek(0)
    return pd.read_csv(buffer)
def annual_pubs(pubs, col="year"):
    """Count publications per distinct value of ``col``.

    Args:
        pubs: DataFrame of publications containing the column ``col``, or
            ``None``.
        col: Name of the column whose values are counted.

    Returns:
        A DataFrame with a "year" column (the distinct values of ``col`` in
        ascending order) and a "publications" column (how many rows carry
        each value), or ``None`` when ``pubs`` is ``None``.
    """
    if pubs is None:
        return None

    counts = pubs[col].value_counts().sort_index()
    return pd.DataFrame({"year": counts.index, "publications": counts.values})
def annual_collaborators(pubs, col="year"):
    """Build a bar chart of the summed ``authors_count`` per value of ``col``.

    Args:
        pubs: DataFrame of publications containing ``col`` and an
            ``authors_count`` column, or ``None``.
        col: Name of the column to group by.

    Returns:
        A Plotly Express bar figure with "year" on the x axis and
        "collaborators" on the y axis, or ``None`` when ``pubs`` is ``None``.
    """
    if pubs is None:
        return None

    # Aggregate only the column that is plotted. Summing every column is
    # wasted work and, on pandas >= 2.0, can raise TypeError when an
    # unrelated object column holds values that cannot be added together.
    totals = pubs.groupby(col)["authors_count"].sum()
    df2 = pd.DataFrame({"year": totals.index, "collaborators": totals.values})
    return px.bar(df2, x="year", y="collaborators")
def annual_citations(pubs, col="year"):
    """Build a bar chart of the summed ``times_cited`` per value of ``col``.

    Args:
        pubs: DataFrame of publications containing ``col`` and a
            ``times_cited`` column, or ``None``.
        col: Name of the column to group by.

    Returns:
        A Plotly Express bar figure with "year" on the x axis and
        "citations" on the y axis, or ``None`` when ``pubs`` is ``None``.
    """
    if pubs is None:
        return None

    # Aggregate only the column that is plotted. Summing every column is
    # wasted work and, on pandas >= 2.0, can raise TypeError when an
    # unrelated object column holds values that cannot be added together.
    totals = pubs.groupby(col)["times_cited"].sum()
    df2 = pd.DataFrame({"year": totals.index, "citations": totals.values})
    return px.bar(df2, x="year", y="citations")
def the_H_function(sorted_citations_list, n=1):
    """Return the h-index for citation counts sorted in descending order.

    Walks the list while the citation count at 1-based position ``p`` is at
    least ``p`` and returns the last position for which that held. The scan
    is iterative, so long lists neither copy a slice per step nor run into
    Python's recursion limit.

    Args:
        sorted_citations_list: Citation counts, one per publication, sorted
            from most to least cited. An empty list or ``None`` yields
            ``n - 1``.
        n: Position assigned to the first element. Callers normally leave
            this at 1; other values only shift the starting position.

    Returns:
        The largest position (counted from ``n``) such that every element up
        to and including it is >= its own position; ``n - 1`` if even the
        first element fails.

    Examples:
        >>> the_H_function([10, 8, 5, 4, 3])
        4
        >>> the_H_function([25, 8, 5, 3, 3])
        3
        >>> the_H_function([1000, 20])
        2
    """
    position = n
    for citations in sorted_citations_list or ():
        if citations < position:
            break
        position += 1
    return position - 1
def _show_researcher_info(info_df):
    """Render the "Researcher Information" section, or a placeholder if there is no data."""
    st.header("Researcher Information")
    if info_df is None or info_df.empty:
        st.text("No information found")
        return
    st.dataframe(info_df)
    leafmap.st_download_button("Download data", info_df, csv_sep="\t")


def _show_researcher_statistics(dsl, researcher_id, pubs, annual_df, locations_df):
    """Render the annual statistics chart, the institution map and the collaborator table."""
    st.header("Researcher statistics")
    columns = ["pubs", "collaborators", "institutions", "cities"]
    selected_columns = st.multiselect(
        "Select attributes to display:", columns, columns
    )
    if selected_columns:
        fig = scholarpy.annual_stats_barplot(annual_df, selected_columns)
        st.plotly_chart(fig)
        leafmap.st_download_button(
            "Download data",
            annual_df,
            file_name="data.csv",
            csv_sep="\t",
        )

    st.header("Map of collaborator institutions")
    st.markdown(
        f"\n- Total number of collaborator institutions: **{len(locations_df)}**\n"
    )
    m = leafmap.Map(
        center=[0, 0],
        zoom_start=1,
        latlon_control=False,
        draw_control=False,
        measure_control=False,
        locate_control=True,
    )
    m.add_points_from_xy(locations_df)
    m.to_streamlit(height=420)
    leafmap.st_download_button(
        "Download data",
        locations_df,
        file_name="data.csv",
        csv_sep="\t",
    )

    st.header("Publication counts with collaborators")
    collaborators = dsl.search_researcher_collaborators(researcher_id, pubs)
    st.markdown(f"\n- Total number of collaborators: **{len(collaborators)}**\n")
    st.dataframe(collaborators)
    leafmap.st_download_button(
        "Download data",
        collaborators,
        file_name="data.csv",
        csv_sep="\t",
    )


def _show_publications(pubs_df):
    """Render publication totals, the publication table and the per-journal counts."""
    citations = sorted(pubs_df["times_cited"].values.tolist(), reverse=True)
    h_index = the_H_function(citations)
    total_citations = pubs_df["times_cited"].sum()
    i10_index = len(pubs_df[pubs_df["times_cited"] >= 10])
    st.markdown(
        "\n"
        f"- Total number of publications: **{len(pubs_df)}**\n"
        f"- Total number of citations: **{total_citations}**\n"
        f"- i10-index: **{i10_index}**\n"
        f"- h-index: **{h_index}**\n"
    )
    st.dataframe(pubs_df)
    leafmap.st_download_button(
        "Download data", pubs_df, file_name="data.csv", csv_sep="\t"
    )

    if "journal.title" not in pubs_df.columns:
        st.text("No journal publications")
        return

    st.header("Publication counts by journal")
    journals = pubs_df["journal.title"].value_counts()
    summary = pd.DataFrame(
        {"Journal": journals.index, "Count": journals}
    ).reset_index(drop=True)
    st.markdown(f"\n- Total number of journals: **{len(summary)}**\n")
    st.dataframe(summary)
    leafmap.st_download_button(
        "Download data",
        summary,
        file_name="data.csv",
        csv_sep="\t",
    )


def _show_grants(grants_df):
    """Render the "Grants" section; nothing is shown when the frame is empty."""
    if grants_df.empty:
        return
    st.header("Grants")
    st.dataframe(grants_df)
    leafmap.st_download_button(
        "Download data", grants_df, file_name="data.csv", csv_sep="\t"
    )


def app():
    """Render the "Search Researchers" page.

    Flow, as driven by the widgets:

    1. A text input in the left column takes a researcher name, which is
       passed to ``dsl.search_researcher_by_name``.
    2. If the search reports at least one hit, a select box lists the
       returned names; the text after the first "|" of the chosen entry is
       used as the researcher id.
    3. The left column shows the researcher's information table, the right
       column shows annual statistics, a map of collaborator institutions
       and the collaborator table, and the left column then continues with
       publications, per-journal counts and grants.

    The ``dsl`` object is read from ``st.session_state["dsl"]``. Returns
    ``None``; all output goes to the Streamlit page.
    """
    st.title("Search Researchers")
    dsl = st.session_state["dsl"]
    row1_col1, row1_col2 = st.columns([1, 1])

    with row1_col1:
        name = st.text_input("Enter a researcher name:", "")
    if not name:
        return

    ids, names = dsl.search_researcher_by_name(name, return_list=True)
    if not ids.count_total > 0:
        st.text("No results found.")
        return

    with row1_col1:
        selection = st.selectbox("Select a researcher id:", names)
    if not selection:
        return

    # The id is whatever follows the first "|" in the selected entry.
    researcher_id = selection.split("|")[1].strip()
    id_info = dsl.search_researcher_by_id(researcher_id, return_df=False)

    # json_to_df returns None for an empty result, so only rename when there
    # is a frame; otherwise the "No information found" branch handles it.
    info_df = json_to_df(id_info, transpose=True)
    if info_df is not None:
        info_df.rename(
            columns={info_df.columns[0]: "Type", info_df.columns[1]: "Value"},
            inplace=True,
        )
    with row1_col1:
        _show_researcher_info(info_df)

    pubs = dsl.search_pubs_by_researcher_id(researcher_id)
    pubs_df = json_to_df(pubs)
    if pubs_df is not None:
        # Computed before entering the column so anything these calls emit
        # lands in the same container as before.
        annual_df, collab_df = dsl.researcher_annual_stats(
            pubs, geonames_df=get_geonames()
        )
        locations_df = scholarpy.collaborator_locations(collab_df)
        with row1_col2:
            _show_researcher_statistics(
                dsl, researcher_id, pubs, annual_df, locations_df
            )
    else:
        st.text("No publications found")

    with row1_col1:
        st.header("Publications")
        if pubs_df is not None:
            _show_publications(pubs_df)
        else:
            st.text("No publications found")

        grants = dsl.search_grants_by_researcher(researcher_id)
        _show_grants(grants.as_dataframe())