import os

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px

# The NLTK tokenizer model and stopword list must be downloaded once.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)


def preprocess_text(text):
    """Lowercase, tokenize, and remove English stopwords."""
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)


def get_context_files(prompt):
    # Gather all .md files in the current directory.
    md_files = [file for file in os.listdir() if file.endswith('.md')]
    if not md_files:
        raise FileNotFoundError('No .md files found in the current directory.')

    # Preprocess the prompt and the context files.
    processed_prompt = preprocess_text(prompt)
    processed_files = {}
    for file in md_files:
        with open(file, 'r', encoding='utf-8') as f:
            processed_files[file] = preprocess_text(f.read())

    # Build count vectors for the files, then project the prompt
    # into the same vocabulary.
    vectorizer = CountVectorizer()
    file_vectors = vectorizer.fit_transform(processed_files.values())
    prompt_vector = vectorizer.transform([processed_prompt])

    # Score each file by its count-weighted word overlap with the prompt
    # (the dot product of the two count vectors).
    match_counts = prompt_vector.dot(file_vectors.T).toarray()[0]

    # Sort the files by score, descending, and keep the top ten.
    # processed_files preserves insertion order, so its keys line up
    # with the rows of file_vectors.
    sorted_files = sorted(zip(processed_files.keys(), match_counts),
                          key=lambda x: x[1], reverse=True)
    top_ten = sorted_files[:10]

    # Build a single prompt: the original prompt followed by the full
    # content of the top ten files.
    context_prompt = prompt
    for file, _ in top_ten:
        with open(file, 'r', encoding='utf-8') as f:
            context_prompt += '\n\n' + f.read()

    # Plot the overlap scores for the top ten files.
    fig = px.bar(x=[file for file, count in top_ten],
                 y=[count for file, count in top_ten])
    fig.update_layout(xaxis_title='File', yaxis_title='Number of Matching Words')
    fig.show()

    return context_prompt


# Example usage
prompt = "What is the importance of machine learning in healthcare?"
context_prompt = get_context_files(prompt)
print(context_prompt)