Spaces:
Running
Running
Upload 2 files
Browse files
- app.py +18 -11
- kw_tags.npz +3 -0
app.py
CHANGED
|
@@ -43,7 +43,7 @@ from openai import OpenAI
|
|
| 43 |
# import anthropic
|
| 44 |
import cohere
|
| 45 |
import faiss
|
| 46 |
-
|
| 47 |
import spacy
|
| 48 |
from string import punctuation
|
| 49 |
import pytextrank
|
|
@@ -282,8 +282,8 @@ class RetrievalSystem():
|
|
| 282 |
indices = [i for i in top_results]
|
| 283 |
df.insert(1,'ADS Link',links,True)
|
| 284 |
df.insert(2,'Relevance',scores,True)
|
| 285 |
-
df.insert(3,'
|
| 286 |
-
df = df[['ADS Link','Relevance','date','cites','title','authors','abstract','keywords','ads_id','
|
| 287 |
df.index += 1
|
| 288 |
return df
|
| 289 |
|
|
@@ -391,7 +391,7 @@ def Library(query):
|
|
| 391 |
papers_df = run_query_ret(st.session_state.query)
|
| 392 |
op_docs = ''
|
| 393 |
for i in range(len(papers_df)):
|
| 394 |
-
op_docs = op_docs + 'Paper %.0f:' %(i+1) + papers_df['title'][i] + '\n' + papers_df['abstract'][i] + '\n\n'
|
| 395 |
|
| 396 |
return op_docs
|
| 397 |
|
|
@@ -451,7 +451,7 @@ def run_rag_qa(query, papers_df):
|
|
| 451 |
metadata = {"source": row['ads_id']}
|
| 452 |
doc = Document(page_content=content, metadata=metadata)
|
| 453 |
documents.append(doc)
|
| 454 |
-
my_bar.progress((i
|
| 455 |
|
| 456 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=50, add_start_index=True)
|
| 457 |
|
|
@@ -562,7 +562,7 @@ def calc_outlier_flag(papers_df, top_k, cutoff_adjust = 0.1):
|
|
| 562 |
|
| 563 |
def make_embedding_plot(papers_df, consensus_answer):
|
| 564 |
|
| 565 |
-
plt_indices = np.array(papers_df['
|
| 566 |
|
| 567 |
if 'arxiv_corpus' not in st.session_state:
|
| 568 |
st.session_state.arxiv_corpus = load_arxiv_corpus()
|
|
@@ -574,10 +574,17 @@ def make_embedding_plot(papers_df, consensus_answer):
|
|
| 574 |
alphas = np.ones((len(plt_indices),)) * 0.9
|
| 575 |
alphas[outlier_flag] = 0.5
|
| 576 |
|
| 577 |
-
fig = plt.figure(figsize=(9
|
| 578 |
plt.scatter(xax,yax, s=1, alpha=0.01, c='k')
|
| 579 |
-
|
| 580 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
# plt.scatter(xax[plt_indices][outlier_flag], yax[plt_indices][outlier_flag], s=100, alpha=1., c='firebrick')
|
| 582 |
plt.axis([0,20,-4.2,18])
|
| 583 |
plt.axis('off')
|
|
@@ -589,7 +596,7 @@ def make_embedding_plot(papers_df, consensus_answer):
|
|
| 589 |
|
| 590 |
if st.session_state.get('runpfdr'):
|
| 591 |
with st.spinner(search_text_list[np.random.choice(len(search_text_list))]):
|
| 592 |
-
st.write('Settings: [Kw:',toggle_a, 'Time:',toggle_b, 'Cite:',toggle_c, '] top_k:',top_k, 'retrieval:',method)
|
| 593 |
|
| 594 |
papers_df = run_query_ret(st.session_state.query)
|
| 595 |
st.header(st.session_state.query)
|
|
@@ -636,7 +643,7 @@ if st.session_state.get('runpfdr'):
|
|
| 636 |
|
| 637 |
with st.spinner("Evaluating abstract consensus"):
|
| 638 |
with st.expander("Abstract consensus", expanded=True):
|
| 639 |
-
consensus_answer = evaluate_overall_consensus(query, [papers_df['abstract'][i] for i in range(len(papers_df))])
|
| 640 |
st.subheader("Consensus: "+consensus_answer.consensus)
|
| 641 |
st.markdown(consensus_answer.explanation)
|
| 642 |
st.markdown('Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score)
|
|
|
|
| 43 |
# import anthropic
|
| 44 |
import cohere
|
| 45 |
import faiss
|
| 46 |
+
import matplotlib.pyplot as plt
|
| 47 |
import spacy
|
| 48 |
from string import punctuation
|
| 49 |
import pytextrank
|
|
|
|
| 282 |
indices = [i for i in top_results]
|
| 283 |
df.insert(1,'ADS Link',links,True)
|
| 284 |
df.insert(2,'Relevance',scores,True)
|
| 285 |
+
df.insert(3,'indices',indices,True)
|
| 286 |
+
df = df[['ADS Link','Relevance','date','cites','title','authors','abstract','keywords','ads_id','indices','embed']]
|
| 287 |
df.index += 1
|
| 288 |
return df
|
| 289 |
|
|
|
|
| 391 |
papers_df = run_query_ret(st.session_state.query)
|
| 392 |
op_docs = ''
|
| 393 |
for i in range(len(papers_df)):
|
| 394 |
+
op_docs = op_docs + 'Paper %.0f:' %(i+1) + papers_df['title'][i+1] + '\n' + papers_df['abstract'][i+1] + '\n\n'
|
| 395 |
|
| 396 |
return op_docs
|
| 397 |
|
|
|
|
| 451 |
metadata = {"source": row['ads_id']}
|
| 452 |
doc = Document(page_content=content, metadata=metadata)
|
| 453 |
documents.append(doc)
|
| 454 |
+
my_bar.progress((i)/len(papers_df), text='adding documents to LLM context')
|
| 455 |
|
| 456 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=50, add_start_index=True)
|
| 457 |
|
|
|
|
| 562 |
|
| 563 |
def make_embedding_plot(papers_df, consensus_answer):
|
| 564 |
|
| 565 |
+
plt_indices = np.array(papers_df['indices'].tolist())
|
| 566 |
|
| 567 |
if 'arxiv_corpus' not in st.session_state:
|
| 568 |
st.session_state.arxiv_corpus = load_arxiv_corpus()
|
|
|
|
| 574 |
alphas = np.ones((len(plt_indices),)) * 0.9
|
| 575 |
alphas[outlier_flag] = 0.5
|
| 576 |
|
| 577 |
+
fig = plt.figure(figsize=(9*2.,12*2.))
|
| 578 |
plt.scatter(xax,yax, s=1, alpha=0.01, c='k')
|
| 579 |
+
|
| 580 |
+
clkws = np.load('kw_tags.npz')
|
| 581 |
+
all_x, all_y, all_topics, repeat_flag = clkws['all_x'], clkws['all_y'], clkws['all_topics'], clkws['repeat_flag']
|
| 582 |
+
for i in range(len(all_topics)):
|
| 583 |
+
if repeat_flag[i] == False:
|
| 584 |
+
plt.text(all_x[i], all_y[i], all_topics[i],fontsize=9,ha="center", va="center",
|
| 585 |
+
bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.3',alpha=0.81))
|
| 586 |
+
plt.scatter(xax[plt_indices], yax[plt_indices], s=300*alphas**2, alpha=alphas, c='w',zorder=1000)
|
| 587 |
+
plt.scatter(xax[plt_indices], yax[plt_indices], s=100*alphas**2, alpha=alphas, c='dodgerblue',zorder=1001)
|
| 588 |
# plt.scatter(xax[plt_indices][outlier_flag], yax[plt_indices][outlier_flag], s=100, alpha=1., c='firebrick')
|
| 589 |
plt.axis([0,20,-4.2,18])
|
| 590 |
plt.axis('off')
|
|
|
|
| 596 |
|
| 597 |
if st.session_state.get('runpfdr'):
|
| 598 |
with st.spinner(search_text_list[np.random.choice(len(search_text_list))]):
|
| 599 |
+
st.write('Settings: [Kw:',toggle_a, 'Time:',toggle_b, 'Cite:',toggle_c, '] top_k:',top_k, 'retrieval: `',method+'`')
|
| 600 |
|
| 601 |
papers_df = run_query_ret(st.session_state.query)
|
| 602 |
st.header(st.session_state.query)
|
|
|
|
| 643 |
|
| 644 |
with st.spinner("Evaluating abstract consensus"):
|
| 645 |
with st.expander("Abstract consensus", expanded=True):
|
| 646 |
+
consensus_answer = evaluate_overall_consensus(query, [papers_df['abstract'][i+1] for i in range(len(papers_df))])
|
| 647 |
st.subheader("Consensus: "+consensus_answer.consensus)
|
| 648 |
st.markdown(consensus_answer.explanation)
|
| 649 |
st.markdown('Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score)
|
kw_tags.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d7068524d3d3029b8d36f4dd2fdf20d5c8a12fc69d8d1a404921aa54a6b40a8
|
| 3 |
+
size 17849
|