Update src/streamlit_app.py
src/streamlit_app.py  CHANGED  (+59 -39)
@@ -1,40 +1,60 @@
-
-import numpy as np
-import pandas as pd
+# streamlit_app.py
 import streamlit as st
+from datasets import load_dataset
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import pandas as pd
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem.snowball import SnowballStemmer
+from nltk.tokenize import word_tokenize
+import re
+
+# ---------- initial setup ----------
+nltk.download("stopwords", quiet=True)
+nltk.download("punkt", quiet=True)
+
+stemmer = SnowballStemmer("english")
+stop_words = set(stopwords.words("english"))
+
+def tokenizer(text: str):
+    # basic cleanup → NLTK tokenize → stem
+    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text.lower())
+    tokens = word_tokenize(text)
+    return [stemmer.stem(tok) for tok in tokens if tok not in stop_words and tok.isalnum()]
+
+@st.cache_data(show_spinner="Loading data & building index…")
+def load_and_index():
+    # first 1,000 docs only
+    ds = load_dataset("webis/tldr-17", split="train[:1000]")
+    docs = ds["content"]
+    vec = TfidfVectorizer(tokenizer=tokenizer)
+    matrix = vec.fit_transform(docs)
+    return docs, vec, matrix
+
+docs, vectorizer, tfidf_matrix = load_and_index()
+
+# ---------- UI ----------
+st.markdown(
+    """
+    <style>
+    .stTextInput > div {width:100%; display:flex; justify-content:center;}
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+
+st.markdown("## TF-IDF Reddit Search")
+query = st.text_input(" ", key="query", placeholder="Search…", label_visibility="hidden")
+
+# ---------- search ----------
+if query:
+    q_vec = vectorizer.transform([query])
+    sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
+    top_idx = sims.argsort()[::-1]  # high→low
+    res_df = pd.DataFrame(
+        {"similarity": sims[top_idx], "document": [docs[i] for i in top_idx]}
+    )
+    st.dataframe(
+        res_df.style.format({"similarity": "{:.3f}"}), use_container_width=True
+    )