hamza50 Rams901 committed on
Commit
b75382a
·
0 Parent(s):

Duplicate from Rams901/Cicero-interactive-QA

Browse files

Co-authored-by: Ramsis Hammadi <[email protected]>

Files changed (6) hide show
  1. .env +1 -0
  2. .gitattributes +34 -0
  3. README.md +13 -0
  4. app.py +116 -0
  5. entire_data.pkl +3 -0
  6. requirements.txt +21 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_KEY=<REDACTED-ROTATE-THIS-KEY>  # SECURITY: a live OpenAI secret key was committed here; it must be revoked/rotated and supplied via Space secrets, never checked into the repo
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Cicero Semantic Search
3
+ emoji: 🐢
4
+ colorFrom: green
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.23.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: Rams901/Cicero-interactive-QA
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import tiktoken
4
+ import pandas as pd
5
+ import time
6
+ import spacy
7
+ from spacy.lang.en.stop_words import STOP_WORDS
8
+ from string import punctuation
9
+ from collections import Counter
10
+ from heapq import nlargest
11
+ import nltk
12
+ import numpy as np
13
+ from tqdm import tqdm
14
+ from sentence_transformers import SentenceTransformer, util
15
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
16
+ import gzip
17
+ import os
18
+ import torch
19
+ import re
20
+ import openai
21
+ from openai.embeddings_utils import get_embedding, cosine_similarity
22
+ import os
23
+ from dotenv import load_dotenv
24
+
25
+ load_dotenv()
26
+ print(os.getcwd())
27
+ openai.api_key = os.environ['OPENAI_KEY']
28
+ df = pd.read_pickle('entire_data.pkl') #to load 123.pkl back to the dataframe df
29
+ model = SentenceTransformer('all-mpnet-base-v2')
30
+
31
+ def remove_html_tags(text):
32
+ clean = re.compile('<.*?>')
33
+ return re.sub(clean, '', text)
34
+
35
+ df['content'] = df.content.apply(lambda x: remove_html_tags(x))
36
+ df['summary_html'] = df.summary_html.apply(lambda x: remove_html_tags(x))
37
+ session_prompt = """ A bot that is open to discussions about different cultural, philosophical and political exchanges. I will use do different analysis to the articles provided to me. Stay truthful and if you weren't provided any resources give your oppinion only."""
38
+
39
+ def new_ask(user_input):
40
+ response = openai.ChatCompletion.create(model ="gpt-3.5-turbo",
41
+ messages = [{'role': 'system', 'content': session_prompt},{'role': 'user', 'content': user_input}],
42
+ temperature = 0
43
+
44
+ )
45
+ # print(response)
46
+ return response['choices'][0]['message']['content']
47
+
48
+ def search(query):
49
+ n = 10
50
+ query_embedding = model.encode(query)
51
+ df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, query_embedding.reshape(768,-1)))
52
+
53
+ results = (df.sort_values("similarity", ascending=False).head(n))
54
+ r_groupby = pd.DataFrame(results.groupby(['title','url','keywords','summary_html']).similarity.max())
55
+ #results = results[['title','url','keywords','summary_html']].drop_duplicates()
56
+ results = r_groupby.reset_index()
57
+ results = results.sort_values("similarity", ascending=False)
58
+ tier_1 = []
59
+ tier_2 = []
60
+ for r in results.index:
61
+
62
+ if results.similarity[r][0] > 0.5:
63
+
64
+ tier_1.append(
65
+ {
66
+ "title":results.title[r],
67
+ "url":results.url[r],
68
+ "score": str(results.similarity[r][0]),
69
+ "summary": results.summary_html[r][:200],
70
+ "keywords": results.keywords[r]
71
+ }
72
+ )
73
+
74
+ elif results.similarity[r][0] > 0.4:
75
+ tier_2.append(
76
+ {
77
+ "title":results.title[r],
78
+ "url":results.url[r],
79
+ "score": str(results.similarity[r][0]),
80
+ "summary": results.summary_html[r][:200],
81
+ "keywords": results.keywords[r]
82
+ }
83
+ )
84
+ print(tier_1)
85
+ print(tier_2)
86
+ ln = "\n"
87
+ prefix = f"tier 1:\n{ln.join([x['title'] for x in tier_1])}"
88
+ print(prefix)
89
+ answer = new_ask(f"Answer the following query by giving arguments from the different arguments provided below. Make sure to quote the article used if the argument corrseponds to the query: Query: {query} Articles {ln.join([x['title'] + ': ' + x['summary'] for i, x in enumerate(tier_1)])}\nUse careful reasoning to explain your answer and give your conclusion about this.")
90
+
91
+ if len(tier_2):
92
+ suffix = f"tier 2:\n{ln.join([x['title'] for x in tier_2])}"
93
+ related_questions = new_ask(f"Give general questions related the following articles: {ln.join([str(i) + ' ' + x['summary'] for i, x in enumerate(tier_2)])}")
94
+
95
+ return f"{answer}\n\nRelated Questions:\n{related_questions}"
96
+
97
+ return f"{answer}"
98
+
99
+ def greet(query):
100
+
101
+ bm25 = search(query)
102
+ return bm25
103
+
104
+ examples = [
105
+ ["Climate Change Challenges in Europe"],
106
+ ["Philosophy in the world of Minimalism"],
107
+ ["Hate Speech vs Freedom of Speech"],
108
+ ["The importance of values and reflection"]
109
+ ]
110
+
111
+ demo = gr.Interface(fn=greet, title="cicero-interactive-qa",
112
+ outputs = "text",inputs=gr.inputs.Textbox(lines=5, label="what would you like to learn about?"),examples=examples)
113
+
114
+ demo.launch(share = True, debug = True)
115
+
116
+
entire_data.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d719ff7c8e72ee0f56541a05b3eac5241adb7f19c7237ac3d6546af12f6dde22
3
+ size 51891614
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ scipy
3
+ tqdm
4
+ gensim
5
+ plotly
6
+ scikit-learn
7
+ numpy
8
+ wordcloud
9
+ matplotlib
10
+ openai
11
+ langchain
12
+ faiss-cpu
13
+ tiktoken
14
+ sentence_transformers
15
+ scipy
16
+ tqdm
17
+ matplotlib
18
+ spacy
19
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
20
+ rank-bm25
21
+ python-dotenv