Spaces:

awacke1
/

BetterThanRAGPattern

Sleeping

App Files Files Community

awacke1 committed on May 29, 2024

Commit

2da2f3b

verified ·

1 Parent(s): 0756cf0

Create app.py

Browse files

Files changed (1) hide show

app.py +110 -0

app.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import os
+import re
+import shutil
+import zipfile
+
+import nltk
+import plotly.express as px
+import streamlit as st
+from nltk.corpus import stopwords
+from sklearn.feature_extraction.text import CountVectorizer
+nltk.download('punkt')
+nltk.download('stopwords')
+def preprocess_text(text):
+    # Tokenize the text and remove stopwords
+    tokens = nltk.word_tokenize(text.lower())
+    stop_words = set(stopwords.words('english'))
+    filtered_tokens = [token for token in tokens if token not in stop_words]
+    return filtered_tokens
+def get_context_files(prompt, md_files):
+    # Preprocess the prompt and context files
+    processed_prompt = preprocess_text(prompt)
+    processed_files = {}
+    for file in md_files:
+        with open(file, 'r') as f:
+            content = f.read()
+            processed_files[file] = preprocess_text(content)
+    # Calculate word matches and LCS bonus
+    file_matches = {}
+    for file, tokens in processed_files.items():
+        single_matches = set(tokens) & set(processed_prompt)
+        double_matches = set(nltk.bigrams(tokens)) & set(nltk.bigrams(processed_prompt))
+        triple_matches = set(nltk.trigrams(tokens)) & set(nltk.trigrams(processed_prompt))
+        match_count = len(single_matches) + len(double_matches) * 4 + len(triple_matches) * 9
+        file_matches[file] = {
+            'single_matches': single_matches,
+            'double_matches': double_matches,
+            'triple_matches': triple_matches,
+            'match_count': match_count
+        }
+    # Sort the files by the match count
+    sorted_files = sorted(file_matches.items(), key=lambda x: x[1]['match_count'], reverse=True)
+    # Create a markdown outline with match counts and word matches
+    outline = "## Outline\n"
+    for file, matches in sorted_files:
+        outline += f"- {file}: {matches['match_count']} matches\n"
+        if matches['single_matches']:
+            outline += f"  - Single word matches: {', '.join(matches['single_matches'])}\n"
+        if matches['double_matches']:
+            outline += f"  - Double word matches: {', '.join(' '.join(pair) for pair in matches['double_matches'])}\n"
+        if matches['triple_matches']:
+            outline += f"  - Triple word matches: {', '.join(' '.join(trio) for trio in matches['triple_matches'])}\n"
+    # Create a single prompt by concatenating the original prompt and the content of the top ten files
+    context_prompt = prompt
+    for file, _ in sorted_files[:10]:
+        with open(file, 'r') as f:
+            content = f.read()
+            # Highlight the matching words in bold
+            for word in file_matches[file]['single_matches']:
+                content = content.replace(word, f"**{word}**")
+            for pair in file_matches[file]['double_matches']:
+                content = content.replace(' '.join(pair), f"**{' '.join(pair)}**")
+            for trio in file_matches[file]['triple_matches']:
+                content = content.replace(' '.join(trio), f"**{' '.join(trio)}**")
+            context_prompt += '\n\n' + content
+    # Create a plotly graph showing the match counts for the top ten files
+    fig = px.bar(x=[file for file, _ in sorted_files[:10]], y=[matches['match_count'] for _, matches in sorted_files[:10]])
+    fig.update_layout(xaxis_title='File', yaxis_title='Match Count')
+    st.plotly_chart(fig)
+    return outline, context_prompt
+# Streamlit app
+def main():
+    st.title("Context-Aware Prompt Evaluation")
+    # File upload
+    uploaded_file = st.file_uploader("Upload a zip file with .md files", type="zip")
+    if uploaded_file is not None:
+        # Unzip the uploaded file
+        with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
+            zip_ref.extractall('uploaded_files')
+        # Get the list of .md files from the uploaded directory
+        md_files = [os.path.join('uploaded_files', file) for file in os.listdir('uploaded_files') if file.endswith('.md')]
+        # Show the list of files
+        st.subheader("Uploaded Files")
+        for file in md_files:
+            st.write(file)
+        # Prompt input
+        prompt = st.session_state.get('prompt', 'What are the main use cases of generative AI in healthcare that are currently unsolved?')
+        prompt = st.text_area("Enter your prompt", value=prompt, key='prompt')
+        # Evaluate the files for the prompt
+        if st.button("Evaluate"):
+            outline, context_prompt = get_context_files(prompt, md_files)
+            st.subheader("Outline")
+            st.markdown(outline)
+            st.subheader("Context Prompt")
+            st.markdown(context_prompt)
+if __name__ == '__main__':
+    main()