awacke1 committed on
Commit
491710e
·
verified ·
1 Parent(s): 2f3de1b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import nltk
3
+ from nltk.corpus import stopwords
4
+ from sklearn.feature_extraction.text import CountVectorizer
5
+ import plotly.express as px
6
+
7
def preprocess_text(text):
    """Lower-case *text*, tokenize it with NLTK, and drop English stopwords.

    Returns the remaining tokens joined back together with single spaces.
    """
    words = nltk.word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for word in words:
        if word in english_stopwords:
            continue
        kept.append(word)
    return ' '.join(kept)
13
+
14
def get_context_files(prompt):
    """Build a context-augmented prompt from the best-matching ``.md`` files.

    Every ``.md`` file in the current working directory is run through
    ``preprocess_text`` and scored against the preprocessed prompt. The score
    is the dot product of the two bag-of-words count vectors, so a word that
    occurs several times in a file contributes more than once. The contents
    of the (at most) ten highest-scoring files are appended to the prompt,
    separated by blank lines, and a Plotly bar chart of those scores is shown.

    Args:
        prompt: The user prompt to enrich with file context.

    Returns:
        The original prompt followed by the text of the selected files. When
        the working directory holds no ``.md`` files the prompt is returned
        unchanged and no chart is displayed.
    """
    # Sorted so that files with equal scores rank the same way on every run
    # (os.listdir() makes no ordering promise); directories are skipped
    # because they cannot be opened as text.
    md_files = sorted(
        name for name in os.listdir()
        if name.endswith('.md') and os.path.isfile(name)
    )
    if not md_files:
        # Fitting CountVectorizer on zero documents raises, so there is
        # nothing to score and no context to append.
        return prompt

    processed_prompt = preprocess_text(prompt)

    # Read each file exactly once; the raw text is reused below when the
    # final prompt is assembled.
    raw_contents = {}
    for name in md_files:
        with open(name, 'r', encoding='utf-8') as f:
            raw_contents[name] = f.read()
    processed_files = [preprocess_text(raw_contents[name]) for name in md_files]

    # Vocabulary comes from the files; prompt words outside it are ignored.
    vectorizer = CountVectorizer()
    file_vectors = vectorizer.fit_transform(processed_files)
    prompt_vector = vectorizer.transform([processed_prompt])

    # One score per file, in the same order as md_files.
    match_counts = prompt_vector.dot(file_vectors.T).toarray()[0]

    sorted_files = sorted(
        zip(md_files, match_counts), key=lambda pair: pair[1], reverse=True
    )
    top_files = sorted_files[:10]

    context_prompt = '\n\n'.join(
        [prompt] + [raw_contents[name] for name, _ in top_files]
    )

    fig = px.bar(
        x=[name for name, _ in top_files],
        y=[count for _, count in top_files],
    )
    fig.update_layout(xaxis_title='File', yaxis_title='Number of Matching Words')
    fig.show()

    return context_prompt
52
+
53
# Example usage: enrich a sample question with context from the local .md
# files and print the combined prompt.
prompt = "What is the importance of machine learning in healthcare?"
context_prompt = get_context_files(prompt)
print(context_prompt)