# Torch-Git-Markdown-NLP / version1-app.py
# Uploaded by awacke1 -- commit 5cfa291 ("Create version1-app.py"), 2.09 kB
import streamlit as st
import requests
from transformers import pipeline
import plotly.express as px
import pandas as pd
from collections import Counter
import re
def get_markdown_from_github(url):
    """Download a Markdown document and return its text.

    A GitHub "blob" page URL
    (``https://github.com/<owner>/<repo>/blob/<ref>/<path>``) serves the HTML
    file viewer rather than the file itself, so it is rewritten to the
    equivalent ``raw.githubusercontent.com`` URL before fetching. Any other
    URL is requested unchanged.

    Args:
        url: Address of the Markdown file.

    Returns:
        The response body decoded as text.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    raw_url = re.sub(
        r'^https?://github\.com/([^/]+)/([^/]+)/blob/',
        r'https://raw.githubusercontent.com/\1/\2/',
        url,
    )
    # Without a timeout a stalled connection would hang the app indefinitely.
    response = requests.get(raw_url, timeout=10)
    # Fail loudly instead of analysing the text of an error page.
    response.raise_for_status()
    return response.text
def preprocess_text(text):
    """Normalise *text* for word counting.

    The text is lowercased, then every run of characters outside
    ``A-Z``, ``a-z`` and ``0-9`` is replaced by a single space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    non_alnum_run = re.compile('[^A-Za-z0-9]+')
    return non_alnum_run.sub(' ', text.lower())
def get_most_frequent_words(text, n):
    """Return the *n* most frequent whitespace-separated words in *text*.

    Args:
        text: String that is split on whitespace into words.
        n: Maximum number of entries to return.

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common(n)``, highest count first.
    """
    return Counter(text.split()).most_common(n)
def get_sentences_with_common_words(text, common_words):
    """Return the sentences of *text* that contain any of *common_words*.

    The text is split into sentences at ``.``, ``?`` and ``!``. A sentence is
    selected when one of the words occurs in it as a whole word, compared
    case-insensitively. A plain substring test would make "the" match
    "Other" and miss a capitalised "The".

    Args:
        text: The text to split into sentences.
        common_words: Iterable of words to look for. Empty strings are
            ignored.

    Returns:
        A list of matching sentences, stripped of surrounding whitespace, in
        their original order. Blank fragments are never returned, and the
        list is empty when there are no words to look for.
    """
    words = [word for word in common_words if word]
    if not words:
        return []

    # One alternation handles every word in a single pass per sentence. The
    # look-arounds demand a non-word character (or a string edge) on both sides.
    alternatives = '|'.join(re.escape(word) for word in words)
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.IGNORECASE)

    selected_sentences = []
    for sentence in re.split('[.?!]', text):
        stripped = sentence.strip()
        if stripped and pattern.search(stripped):
            selected_sentences.append(stripped)
    return selected_sentences
def render_heatmap(words, sentences):
    """Draw a treemap of word frequencies, then list the given sentences.

    Args:
        words: Rows of ``(word, frequency)`` used to build the chart's
            DataFrame.
        sentences: Strings written to the page one per line, each prefixed
            with ``'- '``.
    """
    frame = pd.DataFrame(words, columns=['word', 'frequency'])
    treemap = px.treemap(
        frame,
        path=['word'],
        values='frequency',
        color='frequency',
        hover_data=['frequency'],
        color_continuous_scale='reds',
    )
    st.plotly_chart(treemap, use_container_width=True)

    st.write('Sentences containing the most common words:')
    for line in sentences:
        st.write('- ' + line)
def main():
    """Run the Streamlit app: fetch a README, count its words, and chart them.

    The page shows a treemap of the most frequent words (the count is chosen
    with a sidebar slider) followed by the sentences that contain any of
    those words. If the download fails, an error message is shown instead.
    """
    st.title('Markdown Analyzer')

    # Fetch the raw file: the github.com "/blob/" page is an HTML viewer,
    # not the Markdown source.
    markdown_url = 'https://raw.githubusercontent.com/AaronCWacker/Yggdrasil/main/README.md'
    try:
        markdown = get_markdown_from_github(markdown_url)
    except requests.RequestException as error:
        st.error('Could not download the Markdown file: {}'.format(error))
        return

    # Normalised text is used for word counting only.
    text = preprocess_text(markdown)

    n_most_frequent_words = st.sidebar.slider(
        'Number of most frequent words to display', 1, 20, 10
    )
    most_frequent_words = get_most_frequent_words(text, n_most_frequent_words)

    # Sentences must come from the original Markdown: preprocessing removes
    # the '.', '?' and '!' characters that sentence splitting relies on.
    common_words = [word for word, _ in most_frequent_words]
    sentences = get_sentences_with_common_words(markdown, common_words)

    render_heatmap(most_frequent_words, sentences)


if __name__ == '__main__':
    main()