Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
added embedding plot
Browse files- app.py +40 -3
- pfdr_arxiv_cutoff_distances.npy +3 -0
app.py
CHANGED
|
@@ -276,12 +276,15 @@ class RetrievalSystem():
|
|
| 276 |
def return_formatted_df(self, top_results, small_df):
|
| 277 |
|
| 278 |
df = pd.DataFrame(small_df)
|
| 279 |
-
df = df.drop(columns=['
|
| 280 |
links = ['https://ui.adsabs.harvard.edu/abs/'+i+'/abstract' for i in small_df['bibcode']]
|
| 281 |
scores = [top_results[i] for i in top_results]
|
|
|
|
| 282 |
df.insert(1,'ADS Link',links,True)
|
| 283 |
df.insert(2,'Relevance',scores,True)
|
| 284 |
-
df
|
|
|
|
|
|
|
| 285 |
return df
|
| 286 |
|
| 287 |
# @st.cache_resource
|
|
@@ -547,7 +550,39 @@ def evaluate_overall_consensus(query: str, abstracts: List[str]) -> OverallConse
|
|
| 547 |
|
| 548 |
return response
|
| 549 |
|
|
|
|
| 550 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
|
| 553 |
# ---------------------------------------
|
|
@@ -599,7 +634,6 @@ if st.session_state.get('runpfdr'):
|
|
| 599 |
question_type_gen = question_type_gen.replace('\n',' \n')
|
| 600 |
st.markdown(question_type_gen)
|
| 601 |
|
| 602 |
-
with col2:
|
| 603 |
with st.spinner("Evaluating abstract consensus"):
|
| 604 |
with st.expander("Abstract consensus", expanded=True):
|
| 605 |
consensus_answer = evaluate_overall_consensus(query, [papers_df['abstract'][i] for i in range(len(papers_df))])
|
|
@@ -607,6 +641,9 @@ if st.session_state.get('runpfdr'):
|
|
| 607 |
st.markdown(consensus_answer.explanation)
|
| 608 |
st.markdown('Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score)
|
| 609 |
|
|
|
|
|
|
|
|
|
|
| 610 |
session_vars = {
|
| 611 |
"runtime": "pathfinder_v1_online",
|
| 612 |
"query": query,
|
|
|
|
| 276 |
def return_formatted_df(self, top_results, small_df):
    """Build a display-ready DataFrame for the retrieved papers.

    Parameters
    ----------
    top_results : dict
        Mapping of corpus index -> relevance score, in ranked order.
        (Assumes a plain dict, so iteration order is insertion order.)
    small_df : mapping or DataFrame
        Per-paper fields for the retrieved set; must include 'bibcode',
        'umap_x', 'umap_y', 'cite_bibcodes', 'ref_bibcodes', 'date',
        'cites', 'title', 'authors', 'abstract', 'keywords', 'ads_id'
        and 'embed'.

    Returns
    -------
    pandas.DataFrame
        One row per paper with an ADS link, relevance score and the
        originating corpus index, using a 1-based index for display.
    """
    df = pd.DataFrame(small_df)
    # UMAP coordinates and citation graphs are not shown in the results table.
    df = df.drop(columns=['umap_x','umap_y','cite_bibcodes','ref_bibcodes'])
    links = ['https://ui.adsabs.harvard.edu/abs/'+i+'/abstract' for i in small_df['bibcode']]
    # Keys and values come from the same dict, so they stay aligned row-wise.
    scores = list(top_results.values())
    indices = list(top_results)
    df.insert(1,'ADS Link',links,True)
    df.insert(2,'Relevance',scores,True)
    df.insert(3,'Indices',indices,True)
    # Fix the column order presented to the user.
    df = df[['ADS Link','Relevance','date','cites','title','authors','abstract','keywords','ads_id','Indices','embed']]
    # 1-based row numbers read better in the UI.
    df.index += 1
    return df
|
| 289 |
|
| 290 |
# @st.cache_resource
|
|
|
|
| 550 |
|
| 551 |
return response
|
| 552 |
|
| 553 |
+
def calc_outlier_flag(papers_df, top_k, cutoff_adjust = 0.1):
    """Flag retrieved papers whose embedding lies far from the set's centroid.

    Parameters
    ----------
    papers_df : pandas.DataFrame
        Must carry an 'embed' column of equal-length embedding vectors.
    top_k : int
        Number of retrieved papers; selects the precomputed cutoff
        (uses entry top_k - 1 of the cutoff array).
    cutoff_adjust : float, optional
        Amount subtracted from the stored cutoff distances to tighten
        the threshold (default 0.1).

    Returns
    -------
    numpy.ndarray
        Boolean array, True where a paper's distance to the centroid
        exceeds the adjusted cutoff.
    """
    # Precomputed per-k cutoff distances, shifted down by cutoff_adjust.
    thresholds = np.load('pfdr_arxiv_cutoff_distances.npy') - cutoff_adjust
    embeddings = np.array(papers_df['embed'].tolist())
    centroid = embeddings.mean(axis=0)
    # Euclidean distance of each embedding from the centroid.
    distances = np.linalg.norm(embeddings - centroid, axis=1)
    outlier_flag = distances > thresholds[top_k - 1]
    return outlier_flag
|
| 562 |
+
|
| 563 |
+
def make_embedding_plot(papers_df, consensus_answer):
    """Render a UMAP scatter of the whole arXiv corpus with the retrieved
    papers highlighted, and display it in the Streamlit app.

    Parameters
    ----------
    papers_df : pandas.DataFrame
        Retrieved papers; must carry an 'Indices' column of corpus row
        indices (used to look up plot coordinates) and an 'embed'
        column (consumed by calc_outlier_flag).
    consensus_answer :
        Object exposing .consensus and .relevance_score, shown in the title.

    Side effects: lazily loads and caches the arXiv corpus in
    st.session_state, and draws the figure via st.pyplot.
    NOTE(review): reads module-level `top_k` and st.session_state.query —
    assumes both are set before this runs; confirm with callers.
    """

    plt_indices = np.array(papers_df['Indices'].tolist())

    # Load the corpus once per session; reused across Streamlit reruns.
    if 'arxiv_corpus' not in st.session_state:
        st.session_state.arxiv_corpus = load_arxiv_corpus()

    # 2-D UMAP coordinates for every paper in the corpus.
    xax = np.array(st.session_state.arxiv_corpus['umap_x'])
    yax = np.array(st.session_state.arxiv_corpus['umap_y'])

    # 0.25 loosens the cutoff more than calc_outlier_flag's 0.1 default.
    outlier_flag = calc_outlier_flag(papers_df, top_k, cutoff_adjust=0.25)
    # Outliers get a lower alpha (and, via s=...*alphas**2, a smaller marker).
    alphas = np.ones((len(plt_indices),)) * 0.9
    alphas[outlier_flag] = 0.5

    fig = plt.figure(figsize=(9,12))
    # Background layer: the entire corpus as faint black dots.
    plt.scatter(xax,yax, s=1, alpha=0.01, c='k')
    # Retrieved papers: white halo underneath a blue marker.
    plt.scatter(xax[plt_indices], yax[plt_indices], s=300*alphas**2, alpha=alphas, c='w')
    plt.scatter(xax[plt_indices], yax[plt_indices], s=100*alphas**2, alpha=alphas, c='dodgerblue')
    # plt.scatter(xax[plt_indices][outlier_flag], yax[plt_indices][outlier_flag], s=100, alpha=1., c='firebrick')
    # Fixed viewport; presumably matched to this corpus's UMAP layout — TODO confirm.
    plt.axis([0,20,-4.2,18])
    plt.axis('off')
    plt.title('Query: '+st.session_state.query+'\n'+r'N$_{\rm outliers}: %.0f/%.0f$, Consensus: ' %(np.sum(outlier_flag), len(outlier_flag)) + consensus_answer.consensus + ' (%.1f)' %consensus_answer.relevance_score)
    st.pyplot(fig)
|
| 586 |
|
| 587 |
|
| 588 |
# ---------------------------------------
|
|
|
|
| 634 |
question_type_gen = question_type_gen.replace('\n',' \n')
|
| 635 |
st.markdown(question_type_gen)
|
| 636 |
|
|
|
|
| 637 |
with st.spinner("Evaluating abstract consensus"):
|
| 638 |
with st.expander("Abstract consensus", expanded=True):
|
| 639 |
consensus_answer = evaluate_overall_consensus(query, [papers_df['abstract'][i] for i in range(len(papers_df))])
|
|
|
|
| 641 |
st.markdown(consensus_answer.explanation)
|
| 642 |
st.markdown('Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score)
|
| 643 |
|
| 644 |
+
with col2:
|
| 645 |
+
make_embedding_plot(papers_df, consensus_answer)
|
| 646 |
+
|
| 647 |
session_vars = {
|
| 648 |
"runtime": "pathfinder_v1_online",
|
| 649 |
"query": query,
|
pfdr_arxiv_cutoff_distances.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64edda3cf9c3cde63a6dc818f0e6df573dc1ce32217acac1e2bcdfe7f3a4e0e3
|
| 3 |
+
size 928
|