Spaces:

awacke1
/

BetterThanRAGPattern

Sleeping

App Files Files Community

BetterThanRAGPattern / app.py

awacke1

Create app.py

491710e verified about 1 year ago

raw

history blame

2.19 kB

	import os
	import nltk
	from nltk.corpus import stopwords
	from sklearn.feature_extraction.text import CountVectorizer
	import plotly.express as px

	def preprocess_text(text):
	    """Normalize *text* for word matching.

	    The text is lowercased and tokenized with ``nltk.word_tokenize``; tokens
	    found in NLTK's English stopword list are discarded.

	    Returns:
	        The remaining tokens, in their original order, joined by single spaces.
	    """
	    words = nltk.word_tokenize(text.lower())
	    english_stopwords = set(stopwords.words('english'))
	    return ' '.join(word for word in words if word not in english_stopwords)

	def get_context_files(prompt, *, top_n=10):
	    """Build a context-augmented prompt from the best-matching Markdown files.

	    Every ``.md`` file in the current working directory is scored against
	    *prompt*. Both are passed through ``preprocess_text`` and converted to
	    term-count vectors; the score is the dot product of the two vectors, so a
	    word that occurs several times contributes more than once. The raw text
	    of the highest-scoring files is appended to *prompt*, and a bar chart of
	    their scores is displayed with Plotly.

	    Args:
	        prompt: The question to augment with context.
	        top_n: Maximum number of files to include. Defaults to 10.

	    Returns:
	        *prompt* followed by the content of each selected file, separated by
	        blank lines. When the directory holds no ``.md`` files, *prompt* is
	        returned unchanged and no chart is shown.

	    Raises:
	        ValueError: Propagated from ``CountVectorizer`` when the files yield
	            an empty vocabulary (e.g. every file is empty after
	            preprocessing).
	    """
	    # Sorted so that equal scores rank the same way on every filesystem;
	    # os.listdir() returns entries in arbitrary order.
	    md_files = sorted(name for name in os.listdir() if name.endswith('.md'))
	    if not md_files:
	        # CountVectorizer cannot be fitted on an empty corpus.
	        return prompt

	    # Read each file once; the raw text is reused when building the result.
	    raw_contents = {}
	    for name in md_files:
	        with open(name, 'r', encoding='utf-8') as f:
	            raw_contents[name] = f.read()

	    processed_prompt = preprocess_text(prompt)
	    processed_docs = [preprocess_text(raw_contents[name]) for name in md_files]

	    # The vocabulary is learned from the files only, so prompt words that
	    # appear in no file are ignored by transform().
	    vectorizer = CountVectorizer()
	    file_vectors = vectorizer.fit_transform(processed_docs)
	    prompt_vector = vectorizer.transform([processed_prompt])

	    # One score per file, aligned with md_files.
	    match_counts = prompt_vector.dot(file_vectors.T).toarray()[0]

	    ranked = sorted(
	        zip(md_files, match_counts), key=lambda pair: pair[1], reverse=True
	    )
	    top_files = ranked[:top_n]

	    context_prompt = '\n\n'.join(
	        [prompt, *(raw_contents[name] for name, _ in top_files)]
	    )

	    fig = px.bar(
	        x=[name for name, _ in top_files],
	        y=[count for _, count in top_files],
	    )
	    fig.update_layout(xaxis_title='File', yaxis_title='Number of Matching Words')
	    fig.show()

	    return context_prompt

	# Example usage. These statements sit at module level, so they run whenever
	# this file is executed or imported: the current directory is scanned for
	# .md files, the bar chart is displayed, and the combined prompt is printed.
	prompt = "What is the importance of machine learning in healthcare?"
	context_prompt = get_context_files(prompt)
	print(context_prompt)