Kartheik Iyer committed
Commit 18e51e3 · 1 Parent(s): 036767e

update dataset and security fixes

Files changed: app_gradio.py (+75 -45)
app_gradio.py
CHANGED
@@ -34,7 +34,7 @@ from typing import List, Literal
 
 from nltk.corpus import stopwords
 import nltk
-from openai import OpenAI
+from openai import OpenAI, moderations
 # import anthropic
 import cohere
 import faiss
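Note: the bare `moderations` import leans on the module-level convenience client that recent versions of the openai package expose alongside the `OpenAI` class; the client-scoped equivalent is sketched under the `check_mod` hunk below.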
@@ -64,6 +64,12 @@ embed_model = "text-embedding-3-small"
 embeddings = OpenAIEmbeddings(model = embed_model, api_key = openai_key)
 nlp = load_nlp()
 
+def check_mod(query):
+    mod_report = moderations.create(input=query)
+    for i in mod_report.results[0].categories:
+        if i[1] == True:
+            return True
+    return False
 
 def get_keywords(text, nlp=nlp):
     result = []
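Note: the loop above works because the moderation result's `categories` is a pydantic model, so iterating it yields `(category_name, bool)` pairs. The response also carries an aggregate `flagged` field that the API sets when any category fires. A minimal equivalent sketch, assuming an openai>=1.x client with `OPENAI_API_KEY` in the environment:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def check_mod(query: str) -> bool:
    # results[0].flagged is True when any moderation category fires,
    # matching the category-by-category loop in the committed version.
    mod_report = client.moderations.create(input=query)
    return mod_report.results[0].flagged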
@@ -77,8 +83,12 @@ def get_keywords(text, nlp=nlp):
     return result
 
 def load_arxiv_corpus():
-    arxiv_corpus = load_from_disk('data/')
-    arxiv_corpus.load_faiss_index('embed', 'data/astrophindex.faiss')
+    # arxiv_corpus = load_from_disk('data/')
+    # arxiv_corpus.load_faiss_index('embed', 'data/astrophindex.faiss')
+
+    # keeping it up to date with the dataset
+    arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
+    arxiv_corpus.add_faiss_index(column='embed')
     print('loading arxiv corpus from disk')
     return arxiv_corpus
 
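Note: switching from `load_from_disk` to `load_dataset` pulls the latest `kiyer/pathfinder_arxiv_data` from the Hub, at the cost of rebuilding the FAISS index in memory on every startup instead of loading the prebuilt `astrophindex.faiss`. A usage sketch for querying the index via the `datasets` FAISS integration — `query_vector` is assumed to be a float32 vector from the same embedding model that produced the 'embed' column:

import numpy as np
from datasets import load_dataset

corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
corpus.add_faiss_index(column='embed')  # built in memory at startup

# `embeddings` is the OpenAIEmbeddings instance defined earlier in app_gradio.py;
# the query string here is purely illustrative.
query_vector = np.asarray(embeddings.embed_query('quenching in massive galaxies'), dtype=np.float32)
scores, examples = corpus.get_nearest_examples('embed', query_vector, k=10)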
@@ -344,6 +354,23 @@ def guess_question_type(query: str):
     messages = [("system",question_categorization_prompt,),("human", query),]
     return gen_client.invoke(messages).content
 
+def log_to_gist(strings):
+    # Adding query logs to prevent and account for possible malicious use.
+    # Logs will be deleted periodically if not needed.
+    github_token = os.environ['github_token']
+    gist_id = os.environ['gist_id']
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    content = f"\n{timestamp}: {' '.join(strings)}\n"
+    headers = {'Authorization': f'token {github_token}','Accept': 'application/vnd.github.v3+json'}
+    response = requests.get(f'https://api.github.com/gists/{gist_id}', headers=headers)
+    if response.status_code == 200:
+        existing_content = response.json()['files']['log.txt']['content']
+        content = existing_content + content
+    data = {"description": "Logged Strings","public": False,"files": {"log.txt": {"content": content}}}
+    headers = {'Authorization': f'token {github_token}','Accept': 'application/vnd.github.v3+json'}
+    response = requests.patch(f'https://api.github.com/gists/{gist_id}', headers=headers, data=json.dumps(data)) # Update existing gist
+    return
+
 class OverallConsensusEvaluation(BaseModel):
     rewritten_statement: str = Field(
         ...,
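Note: `log_to_gist` appends by reading the current `log.txt` with a GET and writing the whole file back with a PATCH, since a gist PATCH replaces file contents outright; the read-modify-write is not atomic under concurrent queries. One small simplification: `requests` can serialize the payload itself, and `json=` also sets the Content-Type header.

response = requests.patch(f'https://api.github.com/gists/{gist_id}', headers=headers, json=data)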
@@ -459,48 +486,51 @@ def run_pathfinder(query, top_k, extra_keywords, toggles, prompt_type, rag_type,
     search_text_list = ['rooting around in the paper pile...','looking for clarity...','scanning the event horizon...','peering into the abyss...','potatoes power this ongoing search...']
     gen_text_list = ['making the LLM talk to the papers...','invoking arcane rituals...','gone to library, please wait...','is there really an answer to this...']
 
-    input_keywords = [kw.strip() for kw in extra_keywords.split(',')] if extra_keywords else []
-    query_keywords = get_keywords(query)
-    ec.query_input_keywords = input_keywords+query_keywords
-    ec.toggles = toggles
-    if rag_type == "Semantic Search":
-        ec.hyde = False
-        ec.rerank = False
-    elif rag_type == "Semantic + HyDE":
-        ec.hyde = True
-        ec.rerank = False
-    elif rag_type == "Semantic + HyDE + CoHERE":
-        ec.hyde = True
-        ec.rerank = True
-
-    progress(0.2, desc=search_text_list[np.random.choice(len(search_text_list))])
-    rs, small_df = ec.retrieve(query, top_k = top_k, return_scores=True)
-    formatted_df = ec.return_formatted_df(rs, small_df)
-    yield formatted_df, None, None, None, None
-
-    progress(0.4, desc=gen_text_list[np.random.choice(len(gen_text_list))])
-    rag_answer = run_rag_qa(query, formatted_df, prompt_type)
-    yield formatted_df, rag_answer['answer'], None, None, None
-
-    progress(0.6, desc="Generating consensus")
-    consensus_answer = evaluate_overall_consensus(query, [formatted_df['abstract'][i+1] for i in range(len(formatted_df))])
-    consensus = '## Consensus \n'+consensus_answer.consensus + '\n\n'+consensus_answer.explanation + '\n\n > Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score
-    yield formatted_df, rag_answer['answer'], consensus, None, None
-
-    progress(0.8, desc="Analyzing question type")
-    question_type_gen = guess_question_type(query)
-    if '<categorization>' in question_type_gen:
-        question_type_gen = question_type_gen.split('<categorization>')[1]
-    if '</categorization>' in question_type_gen:
-        question_type_gen = question_type_gen.split('</categorization>')[0]
-    question_type_gen = question_type_gen.replace('\n',' \n')
-    qn_type = question_type_gen
-    yield formatted_df, rag_answer['answer'], consensus, qn_type, None
-
-    progress(1.0, desc="Visualizing embeddings")
-    fig = make_embedding_plot(formatted_df, top_k, consensus_answer)
-
-    yield formatted_df, rag_answer['answer'], consensus, qn_type, fig
+    log_to_gist(['[mod flag: '+str(check_mod(query))+']', query])
+    if check_mod(query) == False:
+
+        input_keywords = [kw.strip() for kw in extra_keywords.split(',')] if extra_keywords else []
+        query_keywords = get_keywords(query)
+        ec.query_input_keywords = input_keywords+query_keywords
+        ec.toggles = toggles
+        if rag_type == "Semantic Search":
+            ec.hyde = False
+            ec.rerank = False
+        elif rag_type == "Semantic + HyDE":
+            ec.hyde = True
+            ec.rerank = False
+        elif rag_type == "Semantic + HyDE + CoHERE":
+            ec.hyde = True
+            ec.rerank = True
+
+        progress(0.2, desc=search_text_list[np.random.choice(len(search_text_list))])
+        rs, small_df = ec.retrieve(query, top_k = top_k, return_scores=True)
+        formatted_df = ec.return_formatted_df(rs, small_df)
+        yield formatted_df, None, None, None, None
+
+        progress(0.4, desc=gen_text_list[np.random.choice(len(gen_text_list))])
+        rag_answer = run_rag_qa(query, formatted_df, prompt_type)
+        yield formatted_df, rag_answer['answer'], None, None, None
+
+        progress(0.6, desc="Generating consensus")
+        consensus_answer = evaluate_overall_consensus(query, [formatted_df['abstract'][i+1] for i in range(len(formatted_df))])
+        consensus = '## Consensus \n'+consensus_answer.consensus + '\n\n'+consensus_answer.explanation + '\n\n > Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score
+        yield formatted_df, rag_answer['answer'], consensus, None, None
+
+        progress(0.8, desc="Analyzing question type")
+        question_type_gen = guess_question_type(query)
+        if '<categorization>' in question_type_gen:
+            question_type_gen = question_type_gen.split('<categorization>')[1]
+        if '</categorization>' in question_type_gen:
+            question_type_gen = question_type_gen.split('</categorization>')[0]
+        question_type_gen = question_type_gen.replace('\n',' \n')
+        qn_type = question_type_gen
+        yield formatted_df, rag_answer['answer'], consensus, qn_type, None
+
+        progress(1.0, desc="Visualizing embeddings")
+        fig = make_embedding_plot(formatted_df, top_k, consensus_answer)
+
+        yield formatted_df, rag_answer['answer'], consensus, qn_type, fig
 
 def create_interface():
     custom_css = """
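Note: `check_mod(query)` is evaluated twice per request (once for the log entry, once for the gate), doubling the moderation round-trips, and a flagged query falls through the `if` without yielding anything, so the UI gets no feedback. A hedged single-call sketch using the same names as the diff (the flagged-branch message is illustrative, not from the commit):

flagged = check_mod(query)
log_to_gist([f'[mod flag: {flagged}]', query])
if flagged:
    # Give the interface something to render instead of returning silently.
    yield None, 'Query flagged by the moderation endpoint; please rephrase.', None, None, None
    return
# ...otherwise proceed with retrieval, RAG answer, consensus, question type, and the plot.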