Spaces:
Sleeping
Sleeping
import os | |
import nltk | |
from nltk.corpus import stopwords | |
from sklearn.feature_extraction.text import CountVectorizer | |
import plotly.express as px | |
def preprocess_text(text):
    """Normalize *text* for word-count matching.

    The text is lower-cased, split into tokens with ``nltk.word_tokenize``,
    and every token found in NLTK's English stopword list is discarded.

    Returns:
        The remaining tokens joined by single spaces.
    """
    words = nltk.word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    kept_words = filter(lambda word: word not in english_stopwords, words)
    return ' '.join(kept_words)
def get_context_files(prompt):
    """Build a context-augmented prompt from the most relevant ``.md`` files.

    Every file in the current working directory whose name ends in ``.md`` is
    preprocessed with ``preprocess_text`` and scored against the preprocessed
    prompt. The score is the dot product of the two word-count vectors, i.e.
    the sum over shared words of (count in prompt) * (count in file). The raw
    text of the ten best-scoring files is appended to the prompt, and a bar
    chart of their scores is displayed with Plotly.

    Args:
        prompt: The user's question or instruction.

    Returns:
        The original prompt followed by the contents of up to ten files, each
        preceded by a blank line. If the directory contains no ``.md`` files,
        the prompt is returned unchanged and no chart is shown.

    Note:
        If files exist but none contains an indexable word, CountVectorizer
        still raises ValueError (empty vocabulary); that case is not handled
        here.
    """
    # Sorted so ties in the ranking are broken deterministically;
    # os.listdir() returns entries in arbitrary order.
    md_files = sorted(file for file in os.listdir() if file.endswith('.md'))
    if not md_files:
        # CountVectorizer cannot be fitted on an empty corpus, so there is
        # nothing to rank or plot.
        return prompt

    # Read each file exactly once; the raw text is reused for the result.
    raw_contents = {}
    for file in md_files:
        with open(file, 'r', encoding='utf-8') as f:
            raw_contents[file] = f.read()

    processed_prompt = preprocess_text(prompt)
    processed_files = [preprocess_text(raw_contents[file]) for file in md_files]

    # Vocabulary is learned from the files only; prompt words that appear in
    # no file are ignored by transform().
    vectorizer = CountVectorizer()
    file_vectors = vectorizer.fit_transform(processed_files)
    prompt_vector = vectorizer.transform([processed_prompt])

    # One score per file, in the same order as md_files.
    match_counts = prompt_vector.dot(file_vectors.T).toarray()[0]

    # sorted() is stable, so equally scored files keep their alphabetical order.
    top_ten = sorted(
        zip(md_files, match_counts), key=lambda pair: pair[1], reverse=True
    )[:10]

    context_prompt = '\n\n'.join(
        [prompt] + [raw_contents[file] for file, _ in top_ten]
    )

    fig = px.bar(
        x=[file for file, _ in top_ten],
        y=[count for _, count in top_ten],
    )
    fig.update_layout(xaxis_title='File', yaxis_title='Number of Matching Words')
    fig.show()
    return context_prompt
# Example usage. Guarded so that importing this module does not read files,
# open a chart, or print; the example only runs when executed as a script.
if __name__ == "__main__":
    prompt = "What is the importance of machine learning in healthcare?"
    context_prompt = get_context_files(prompt)
    print(context_prompt)