Spaces:

awacke1
/

BetterThanRAGPattern

Sleeping

App Files Files Community

awacke1 committed on May 29, 2024

Commit

491710e

verified ·

1 Parent(s): 2f3de1b

Create app.py

Browse files

Files changed (1) hide show

app.py +56 -0

app.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import os
+import nltk
+from nltk.corpus import stopwords
+from sklearn.feature_extraction.text import CountVectorizer
+import plotly.express as px
+def preprocess_text(text):
+    # Tokenize the text and remove stopwords
+    tokens = nltk.word_tokenize(text.lower())
+    stop_words = set(stopwords.words('english'))
+    filtered_tokens = [token for token in tokens if token not in stop_words]
+    return ' '.join(filtered_tokens)
+def get_context_files(prompt):
+    # Get all .md files in the current directory
+    md_files = [file for file in os.listdir() if file.endswith('.md')]
+    # Preprocess the prompt and context files
+    processed_prompt = preprocess_text(prompt)
+    processed_files = {}
+    for file in md_files:
+        with open(file, 'r') as f:
+            content = f.read()
+            processed_files[file] = preprocess_text(content)
+    # Create a CountVectorizer to calculate word counts
+    vectorizer = CountVectorizer()
+    file_vectors = vectorizer.fit_transform(processed_files.values())
+    prompt_vector = vectorizer.transform([processed_prompt])
+    # Calculate the number of matching words for each file
+    match_counts = prompt_vector.dot(file_vectors.T).toarray()[0]
+    # Sort the files by the number of matching words
+    sorted_files = sorted(zip(md_files, match_counts), key=lambda x: x[1], reverse=True)
+    # Get the top ten files
+    top_ten_files = [file for file, count in sorted_files[:10]]
+    # Create a single prompt by concatenating the original prompt and the content of the top ten files
+    context_prompt = prompt
+    for file in top_ten_files:
+        with open(file, 'r') as f:
+            context_prompt += '\n\n' + f.read()
+    # Create a plotly graph showing the counts of matching words for the top ten files
+    fig = px.bar(x=[file for file, count in sorted_files[:10]], y=[count for file, count in sorted_files[:10]])
+    fig.update_layout(xaxis_title='File', yaxis_title='Number of Matching Words')
+    fig.show()
+    return context_prompt
+# Example usage
+prompt = "What is the importance of machine learning in healthcare?"
+context_prompt = get_context_files(prompt)
+print(context_prompt)