Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import zipfile
|
3 |
+
import streamlit as st
|
4 |
+
import nltk
|
5 |
+
from nltk.corpus import stopwords
|
6 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
7 |
+
import plotly.express as px
|
8 |
+
|
9 |
+
# Fetch the NLTK data this module relies on: tokenizer models for
# nltk.word_tokenize and the stop-word lists for nltk.corpus.stopwords.
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' instead of
# 'punkt', so both ids are requested to work with either generation.
# quiet=True keeps the download log out of the app output on every rerun.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource, quiet=True)
|
11 |
+
|
12 |
+
def preprocess_text(text):
    """Lower-case and tokenize *text*, dropping stop words and punctuation.

    Args:
        text: Raw text, e.g. a prompt or the contents of a markdown file.

    Returns:
        A list of lower-cased tokens in their original order, with English
        stop words and punctuation-only tokens removed.
    """
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    # The tokenizer emits punctuation ('.', ',', '?', ...) as tokens of its
    # own. Keeping them would let files "match" a prompt on punctuation alone,
    # so only tokens containing at least one letter or digit are kept.
    return [
        token
        for token in tokens
        if token not in stop_words and any(ch.isalnum() for ch in token)
    ]
|
18 |
+
|
19 |
+
def _read_markdown(path):
    """Return the text of *path*, decoded as UTF-8."""
    # An explicit encoding avoids depending on the platform default;
    # errors='replace' keeps one badly encoded file from aborting the run.
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()


def _highlight_matches(content, phrases):
    """Wrap whole-word occurrences of *phrases* in markdown bold.

    *phrases* is an iterable of word tuples, e.g. ('ai',) or
    ('generative', 'ai'). Matching ignores case and is done in a single pass
    that tries longer phrases first, so a phrase is bolded as one unit instead
    of being broken up by the bold markers of its individual words.
    """
    import re  # local import: the file's import header does not provide it

    # Drop phrases containing a punctuation-only word; sort longest first so
    # the regex alternation prefers the longest phrase at each position.
    candidates = sorted(
        {
            tuple(words)
            for words in phrases
            if words and all(any(ch.isalnum() for ch in w) for w in words)
        },
        key=lambda words: (-len(words), words),
    )
    if not candidates:
        return content

    alternatives = '|'.join(
        r'\s+'.join(re.escape(word) for word in words) for words in candidates
    )
    # The lookarounds stop a match from starting or ending inside a longer
    # word (e.g. 'ai' inside 'main').
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.IGNORECASE)
    return pattern.sub(lambda m: f"**{m.group(0)}**", content)


def get_context_files(prompt, md_files, top_n=10):
    """Rank markdown files by token overlap with *prompt* and build a context.

    Each file is scored as (shared words) + 4 * (shared bigrams)
    + 9 * (shared trigrams), computed on the output of ``preprocess_text``.
    A bar chart of the best-scoring files is rendered with Streamlit as a
    side effect.

    Args:
        prompt: The user's prompt text.
        md_files: Paths of the markdown files to evaluate.
        top_n: How many of the best-scoring files to chart and append to the
            context prompt. Defaults to 10.

    Returns:
        A ``(outline, context_prompt)`` tuple. ``outline`` is a markdown list
        of every file with its score and matched words/phrases, best first.
        ``context_prompt`` is the prompt followed by the contents of the top
        files, with matched words and phrases wrapped in ``**bold**``.
    """
    # The prompt's n-gram sets do not depend on the file, so build them once.
    prompt_tokens = preprocess_text(prompt)
    prompt_words = set(prompt_tokens)
    prompt_bigrams = set(nltk.bigrams(prompt_tokens))
    prompt_trigrams = set(nltk.trigrams(prompt_tokens))

    # Score every file; raw contents are kept so the top files need not be
    # read from disk a second time.
    contents = {}
    file_matches = {}
    for file in md_files:
        contents[file] = _read_markdown(file)
        tokens = preprocess_text(contents[file])
        single_matches = set(tokens) & prompt_words
        double_matches = set(nltk.bigrams(tokens)) & prompt_bigrams
        triple_matches = set(nltk.trigrams(tokens)) & prompt_trigrams
        file_matches[file] = {
            'single_matches': single_matches,
            'double_matches': double_matches,
            'triple_matches': triple_matches,
            'match_count': (
                len(single_matches)
                + len(double_matches) * 4
                + len(triple_matches) * 9
            ),
        }

    # Best score first; the sort is stable, so ties keep their input order.
    sorted_files = sorted(
        file_matches.items(), key=lambda x: x[1]['match_count'], reverse=True
    )

    # Markdown outline. Matches are sorted so the text is reproducible, and
    # sub-items are indented two spaces so they nest under their file entry.
    outline_lines = ["## Outline"]
    for file, matches in sorted_files:
        outline_lines.append(f"- {file}: {matches['match_count']} matches")
        if matches['single_matches']:
            words = ', '.join(sorted(matches['single_matches']))
            outline_lines.append(f"  - Single word matches: {words}")
        if matches['double_matches']:
            pairs = ', '.join(' '.join(pair) for pair in sorted(matches['double_matches']))
            outline_lines.append(f"  - Double word matches: {pairs}")
        if matches['triple_matches']:
            trios = ', '.join(' '.join(trio) for trio in sorted(matches['triple_matches']))
            outline_lines.append(f"  - Triple word matches: {trios}")
    outline = '\n'.join(outline_lines) + '\n'

    # Context prompt: the original prompt followed by the highlighted
    # contents of the top files, separated by blank lines.
    top_files = sorted_files[:top_n]
    sections = [prompt]
    for file, matches in top_files:
        phrases = [(word,) for word in matches['single_matches']]
        phrases.extend(matches['double_matches'])
        phrases.extend(matches['triple_matches'])
        sections.append(_highlight_matches(contents[file], phrases))
    context_prompt = '\n\n'.join(sections)

    # Bar chart of the match counts for the top files (skipped when there is
    # nothing to plot).
    if top_files:
        fig = px.bar(
            x=[file for file, _ in top_files],
            y=[matches['match_count'] for _, matches in top_files],
        )
        fig.update_layout(xaxis_title='File', yaxis_title='Match Count')
        st.plotly_chart(fig)

    return outline, context_prompt
|
76 |
+
|
77 |
+
# Streamlit app
|
78 |
+
def main():
    """Run the Streamlit page: upload a zip of .md files and evaluate a prompt.

    The uploaded archive is extracted into the ``uploaded_files`` directory,
    the markdown files found there are listed, and on "Evaluate" the outline
    and context prompt produced by ``get_context_files`` are rendered.
    Nothing below the uploader is shown until a valid archive containing at
    least one markdown file has been uploaded.
    """
    import shutil  # local import: the file's import header does not provide it

    st.title("Context-Aware Prompt Evaluation")

    # File upload
    uploaded_file = st.file_uploader("Upload a zip file with .md files", type="zip")
    if uploaded_file is None:
        return

    # Start from an empty directory so files from an earlier upload cannot
    # leak into this evaluation.
    extract_dir = 'uploaded_files'
    shutil.rmtree(extract_dir, ignore_errors=True)
    try:
        with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
    except zipfile.BadZipFile:
        st.error("The uploaded file is not a valid zip archive.")
        return

    # Collect .md files recursively: archives made from a folder keep their
    # files one or more levels below the extraction root.
    md_files = []
    for root, dirs, names in os.walk(extract_dir):
        # Skip the metadata folder macOS adds to zip archives.
        dirs[:] = [d for d in dirs if d != '__MACOSX']
        md_files.extend(
            os.path.join(root, name)
            for name in names
            if name.lower().endswith('.md')
        )
    md_files.sort()

    if not md_files:
        st.warning("No .md files were found in the uploaded archive.")
        return

    # Show the list of files
    st.subheader("Uploaded Files")
    for file in md_files:
        st.write(file)

    # Prompt input
    prompt = st.session_state.get('prompt', 'What are the main use cases of generative AI in healthcare that are currently unsolved?')
    prompt = st.text_area("Enter your prompt", value=prompt, key='prompt')

    # Evaluate the files for the prompt
    if st.button("Evaluate"):
        outline, context_prompt = get_context_files(prompt, md_files)
        st.subheader("Outline")
        st.markdown(outline)
        st.subheader("Context Prompt")
        st.markdown(context_prompt)
|
108 |
+
|
109 |
+
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
|