julian-schelb committed
Commit f81bf22 · verified · 1 Parent(s): b5a9da7

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +59 -39
src/streamlit_app.py CHANGED
@@ -1,40 +1,60 @@
- import altair as alt
- import numpy as np
- import pandas as pd
  import streamlit as st
-
- """
- # Welcome to Streamlit!
-
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
-
- In the meantime, below is an example of what you can do with just a few lines of code:
- """
-
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
- indices = np.linspace(0, 1, num_points)
- theta = 2 * np.pi * num_turns * indices
- radius = indices
-
- x = radius * np.cos(theta)
- y = radius * np.sin(theta)
-
- df = pd.DataFrame({
-     "x": x,
-     "y": y,
-     "idx": indices,
-     "rand": np.random.randn(num_points),
- })
-
- st.altair_chart(alt.Chart(df, height=700, width=700)
-     .mark_point(filled=True)
-     .encode(
-         x=alt.X("x", axis=None),
-         y=alt.Y("y", axis=None),
-         color=alt.Color("idx", legend=None, scale=alt.Scale()),
-         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-     ))
+ # streamlit_app.py
  import streamlit as st
+ from datasets import load_dataset
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import pandas as pd
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem.snowball import SnowballStemmer
+ from nltk.tokenize import word_tokenize
+ import re
+
+ # ---------- initial setup ----------
+ nltk.download("stopwords", quiet=True)
+ nltk.download("punkt", quiet=True)
+
+ stemmer = SnowballStemmer("english")
+ stop_words = set(stopwords.words("english"))
+
+ def tokenizer(text: str):
+     # basic cleanup → NLTK tokenize → stem
+     text = re.sub(r"[^a-zA-Z0-9\s]", " ", text.lower())
+     tokens = word_tokenize(text)
+     return [stemmer.stem(tok) for tok in tokens if tok not in stop_words and tok.isalnum()]
+
+ @st.cache_data(show_spinner="Loading data & building index…")
+ def load_and_index():
+     # first 1,000 docs only
+     ds = load_dataset("webis/tldr-17", split="train[:1000]")
+     docs = ds["content"]
+     vec = TfidfVectorizer(tokenizer=tokenizer)  # custom tokenizer replaces the default token_pattern
+     matrix = vec.fit_transform(docs)
+     return docs, vec, matrix
+
+ docs, vectorizer, tfidf_matrix = load_and_index()
+
+ # ---------- UI ----------
+ st.markdown(
+     """
+     <style>
+     .stTextInput > div {width:100%; display:flex; justify-content:center;}
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ st.markdown("## TF-IDF Reddit Search")
+ query = st.text_input(" ", key="query", placeholder="Search…", label_visibility="hidden")
+
+ # ---------- search ----------
+ if query:
+     q_vec = vectorizer.transform([query])
+     sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
+     top_idx = sims.argsort()[::-1]  # high → low
+     res_df = pd.DataFrame(
+         {"similarity": sims[top_idx], "document": [docs[i] for i in top_idx]}
+     )
+     st.dataframe(
+         res_df.style.format({"similarity": "{:.3f}"}), use_container_width=True
+     )