Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -41,14 +41,28 @@ def calculate_distances(embeddings, query_embedding, metric):
|
|
| 41 |
distances = -dot_similarity(embeddings, [query_embedding]) # Negated for consistency with other metrics
|
| 42 |
return distances.flatten()
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
def generate_plotly_figure(query, pdf_file, metric):
|
| 45 |
logging.info("Generating plot with Plotly.")
|
| 46 |
query_embedding = model.encode([query])[0]
|
| 47 |
text = process_pdf(pdf_file.name)
|
| 48 |
embeddings, sentences = create_embeddings(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
all_embeddings = np.vstack([embeddings, query_embedding])
|
| 50 |
-
|
| 51 |
-
|
| 52 |
umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
|
| 53 |
umap_embeddings = umap_transform.fit_transform(all_embeddings)
|
| 54 |
|
|
@@ -59,10 +73,10 @@ def generate_plotly_figure(query, pdf_file, metric):
|
|
| 59 |
colors.append('red') # For the query
|
| 60 |
|
| 61 |
fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
|
| 62 |
-
marker=dict(color=colors[:-1]), text=
|
| 63 |
-
name='Sentences'))
|
| 64 |
fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
|
| 65 |
-
marker=dict(color='red'), text=[
|
| 66 |
fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
|
| 67 |
|
| 68 |
logging.info("Plotly figure created successfully.")
|
|
@@ -82,7 +96,7 @@ iface = gr.Interface(
|
|
| 82 |
gr.Radio(choices=["cosine", "euclidean", "manhattan", "dot"], label="Distance Metric")
|
| 83 |
],
|
| 84 |
outputs=gr.Plot(),
|
| 85 |
-
title="
|
| 86 |
description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
|
| 87 |
as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
|
| 88 |
within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
|
|
|
|
| 41 |
distances = -dot_similarity(embeddings, [query_embedding]) # Negated for consistency with other metrics
|
| 42 |
return distances.flatten()
|
| 43 |
|
| 44 |
+
def wrap_text(text, width=40):
    """
    Inserts HTML line breaks for Plotly hover text.

    Lines are broken at word boundaries so that words are not cut in half;
    only a single word longer than ``width`` is split across lines. Runs of
    whitespace (including newlines) are collapsed to single spaces, so no
    wrapped line starts or ends with a space.

    :param text: The text to wrap.
    :param width: The maximum line width before wrapping (must be positive).
    :return: Text with ``<br>`` inserted between wrapped lines; an empty
        string when ``text`` is empty or contains only whitespace.
    :raises ValueError: If ``width`` is not a positive integer.
    """
    # Imported locally so this helper does not depend on the module's
    # top-level import block.
    import textwrap

    # textwrap.wrap returns one string per output line (an empty list for
    # blank input); Plotly renders '<br>' in hover text as a line break.
    return '<br>'.join(textwrap.wrap(text, width=width))
|
| 53 |
+
|
| 54 |
def generate_plotly_figure(query, pdf_file, metric):
|
| 55 |
logging.info("Generating plot with Plotly.")
|
| 56 |
query_embedding = model.encode([query])[0]
|
| 57 |
text = process_pdf(pdf_file.name)
|
| 58 |
embeddings, sentences = create_embeddings(text)
|
| 59 |
+
|
| 60 |
+
# Wrap text for each sentence
|
| 61 |
+
sentences_wrapped = [wrap_text(sentence) for sentence in sentences]
|
| 62 |
+
all_sentences_wrapped = sentences_wrapped + [wrap_text(query)] # Apply wrapping to the query as well
|
| 63 |
+
|
| 64 |
all_embeddings = np.vstack([embeddings, query_embedding])
|
| 65 |
+
|
|
|
|
| 66 |
umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
|
| 67 |
umap_embeddings = umap_transform.fit_transform(all_embeddings)
|
| 68 |
|
|
|
|
| 73 |
colors.append('red') # For the query
|
| 74 |
|
| 75 |
fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
|
| 76 |
+
marker=dict(color=colors[:-1]), text=all_sentences_wrapped[:-1],
|
| 77 |
+
name='Sentences', hoverinfo='text'))
|
| 78 |
fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
|
| 79 |
+
marker=dict(color='red'), text=[all_sentences_wrapped[-1]], name='Query', hoverinfo='text'))
|
| 80 |
fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
|
| 81 |
|
| 82 |
logging.info("Plotly figure created successfully.")
|
|
|
|
| 96 |
gr.Radio(choices=["cosine", "euclidean", "manhattan", "dot"], label="Distance Metric")
|
| 97 |
],
|
| 98 |
outputs=gr.Plot(),
|
| 99 |
+
title="Semantic Search Visualizer",
|
| 100 |
description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
|
| 101 |
as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
|
| 102 |
within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
|