Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import nltk
|
3 |
+
from nltk.corpus import stopwords
|
4 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
5 |
+
import plotly.express as px
|
6 |
+
|
7 |
+
_STOP_WORDS = None  # lazily-built cache of NLTK's English stop words


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Lower-case *text*, tokenize it and drop English stop words.

    Args:
        text: The raw string to normalise.

    Returns:
        The remaining tokens joined by single spaces.

    NOTE(review): nothing in this file downloads NLTK data; the tokenizer
    models and the 'stopwords' corpus presumably have to be installed
    beforehand -- confirm for the deployment environment.
    """
    tokens = nltk.word_tokenize(text.lower())
    # The stop-word set is cached because this function runs once per file.
    stop_words = _english_stop_words()
    return ' '.join(token for token in tokens if token not in stop_words)
|
13 |
+
|
14 |
+
def get_context_files(prompt, top_n=10):
    """Append the best-matching Markdown files to *prompt*.

    Every ``.md`` file in the current working directory is preprocessed with
    ``preprocess_text`` and scored against the preprocessed prompt using
    ``CountVectorizer`` term counts.  The score of a file is the dot product
    of the prompt's count vector with the file's count vector.  The contents
    of the highest-scoring files are appended to the prompt, and a Plotly bar
    chart of their scores is displayed via ``fig.show()``.

    Args:
        prompt: The question or instruction to augment.
        top_n: Maximum number of files to append and plot (default 10).

    Returns:
        ``prompt`` followed by the raw contents of the selected files, each
        preceded by a blank line.  When the directory contains no ``.md``
        files, ``prompt`` is returned unchanged and no chart is shown.
    """
    # Keep regular files only: a directory named "*.md" cannot be opened.
    md_files = [
        name for name in os.listdir()
        if name.endswith('.md') and os.path.isfile(name)
    ]
    if not md_files:
        # Fitting the vectorizer on an empty corpus would raise, so bail out.
        return prompt

    # Read every file once; the raw text is reused when building the result.
    raw_contents = {}
    for name in md_files:
        with open(name, 'r', encoding='utf-8') as f:
            raw_contents[name] = f.read()

    # Preprocess the prompt and the files the same way before counting.
    processed_prompt = preprocess_text(prompt)
    processed_docs = [preprocess_text(raw_contents[name]) for name in md_files]

    # The vocabulary comes from the files; prompt words outside it are ignored.
    vectorizer = CountVectorizer()
    file_vectors = vectorizer.fit_transform(processed_docs)
    prompt_vector = vectorizer.transform([processed_prompt])

    # One score per file, in the same order as md_files.
    match_counts = prompt_vector.dot(file_vectors.T).toarray()[0]

    # Highest score first; sorted() is stable, so ties keep listing order.
    ranked = sorted(
        zip(md_files, match_counts), key=lambda pair: pair[1], reverse=True
    )
    top_files = ranked[:top_n]

    # Original prompt, then each selected file separated by a blank line.
    context_prompt = '\n\n'.join(
        [prompt] + [raw_contents[name] for name, _ in top_files]
    )

    # Bar chart of the scores of the selected files.
    fig = px.bar(
        x=[name for name, _ in top_files],
        y=[count for _, count in top_files],
    )
    fig.update_layout(xaxis_title='File', yaxis_title='Number of Matching Words')
    fig.show()

    return context_prompt
|
52 |
+
|
53 |
+
# Example usage. Guarded so that importing this module does not scan the
# working directory or open a chart; running the file as a script behaves
# exactly as before.
if __name__ == "__main__":
    prompt = "What is the importance of machine learning in healthcare?"
    context_prompt = get_context_files(prompt)
    print(context_prompt)
|