awacke1 committed on
Commit 2da2f3b · verified · 1 Parent(s): 0756cf0

Create app.py

Files changed (1)
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
+ import os
+ import zipfile
+ import streamlit as st
+ import nltk
+ from nltk.corpus import stopwords
+ import plotly.express as px
+
+ # Download the tokenizer model and stopword list (no-op if already cached)
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ def preprocess_text(text):
+     # Tokenize the text and remove stopwords
+     tokens = nltk.word_tokenize(text.lower())
+     stop_words = set(stopwords.words('english'))
+     filtered_tokens = [token for token in tokens if token not in stop_words]
+     return filtered_tokens
+
+ def get_context_files(prompt, md_files):
+     # Preprocess the prompt and context files
+     processed_prompt = preprocess_text(prompt)
+     processed_files = {}
+     for file in md_files:
+         with open(file, 'r') as f:
+             content = f.read()
+             processed_files[file] = preprocess_text(content)
+
+     # Count unigram, bigram, and trigram overlaps with the prompt (weighted 1x, 4x, 9x)
+     file_matches = {}
+     for file, tokens in processed_files.items():
+         single_matches = set(tokens) & set(processed_prompt)
+         double_matches = set(nltk.bigrams(tokens)) & set(nltk.bigrams(processed_prompt))
+         triple_matches = set(nltk.trigrams(tokens)) & set(nltk.trigrams(processed_prompt))
+         match_count = len(single_matches) + len(double_matches) * 4 + len(triple_matches) * 9
+         file_matches[file] = {
+             'single_matches': single_matches,
+             'double_matches': double_matches,
+             'triple_matches': triple_matches,
+             'match_count': match_count
+         }
+
+     # Sort the files by match count, best match first
+     sorted_files = sorted(file_matches.items(), key=lambda x: x[1]['match_count'], reverse=True)
+
+     # Create a markdown outline with match counts and word matches
+     outline = "## Outline\n"
+     for file, matches in sorted_files:
+         outline += f"- {file}: {matches['match_count']} matches\n"
+         if matches['single_matches']:
+             outline += f"  - Single word matches: {', '.join(matches['single_matches'])}\n"
+         if matches['double_matches']:
+             outline += f"  - Double word matches: {', '.join(' '.join(pair) for pair in matches['double_matches'])}\n"
+         if matches['triple_matches']:
+             outline += f"  - Triple word matches: {', '.join(' '.join(trio) for trio in matches['triple_matches'])}\n"
+
+     # Create a single prompt by concatenating the original prompt and the content of the top ten files
+     context_prompt = prompt
+     for file, _ in sorted_files[:10]:
+         with open(file, 'r') as f:
+             content = f.read()
+         # Highlight the matching words in bold
+         for word in file_matches[file]['single_matches']:
+             content = content.replace(word, f"**{word}**")
+         for pair in file_matches[file]['double_matches']:
+             content = content.replace(' '.join(pair), f"**{' '.join(pair)}**")
+         for trio in file_matches[file]['triple_matches']:
+             content = content.replace(' '.join(trio), f"**{' '.join(trio)}**")
+         context_prompt += '\n\n' + content
+
+     # Create a plotly bar chart showing the match counts for the top ten files
+     fig = px.bar(x=[file for file, _ in sorted_files[:10]], y=[matches['match_count'] for _, matches in sorted_files[:10]])
+     fig.update_layout(xaxis_title='File', yaxis_title='Match Count')
+     st.plotly_chart(fig)
+
+     return outline, context_prompt
+
+ # Streamlit app
+ def main():
+     st.title("Context-Aware Prompt Evaluation")
+
+     # File upload
+     uploaded_file = st.file_uploader("Upload a zip file with .md files", type="zip")
+
+     if uploaded_file is not None:
+         # Unzip the uploaded file
+         with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
+             zip_ref.extractall('uploaded_files')
+
+         # Get the list of .md files from the uploaded directory
+         md_files = [os.path.join('uploaded_files', file) for file in os.listdir('uploaded_files') if file.endswith('.md')]
+
+         # Show the list of files
+         st.subheader("Uploaded Files")
+         for file in md_files:
+             st.write(file)
+
+         # Prompt input
+         default_prompt = 'What are the main use cases of generative AI in healthcare that are currently unsolved?'
+         prompt = st.text_area("Enter your prompt", value=default_prompt, key='prompt')
+
+         # Evaluate the files against the prompt
+         if st.button("Evaluate"):
+             outline, context_prompt = get_context_files(prompt, md_files)
+             st.subheader("Outline")
+             st.markdown(outline)
+             st.subheader("Context Prompt")
+             st.markdown(context_prompt)
+
+ if __name__ == '__main__':
+     main()
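
As a sanity check on the scoring logic in `get_context_files`, here is a minimal, self-contained sketch of how the weighted n-gram matching ranks a document against a prompt. The sample prompt and document strings are made up purely for illustration; the preprocessing and the 1x/4x/9x weights mirror the code in app.py above.

```python
import nltk
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Same preprocessing as app.py: lowercase, tokenize, drop stopwords
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

# Hypothetical prompt and file content, for illustration only
prompt = "generative AI use cases in healthcare"
doc = "Generative AI use cases in healthcare include triage and clinical note drafting."

p_tokens = preprocess_text(prompt)
d_tokens = preprocess_text(doc)

singles = set(d_tokens) & set(p_tokens)
doubles = set(nltk.bigrams(d_tokens)) & set(nltk.bigrams(p_tokens))
triples = set(nltk.trigrams(d_tokens)) & set(nltk.trigrams(p_tokens))

# Same weighting as app.py: single matches x1, bigram matches x4, trigram matches x9
score = len(singles) + 4 * len(doubles) + 9 * len(triples)
print(f"singles={singles}\ndoubles={doubles}\ntriples={triples}\nscore={score}")
```

Each uploaded .md file gets such a score, the files are sorted by it, and the top ten are concatenated (with matches bolded) into the context prompt. Locally, the app itself can be started with `streamlit run app.py` once streamlit, nltk, and plotly are installed.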