Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,14 +1,13 @@
|
|
1 |
import gradio as gr
|
2 |
import fitz # PyMuPDF for reading PDFs
|
3 |
import numpy as np
|
4 |
-
from bokeh.plotting import figure, output_file, save
|
5 |
-
from bokeh.models import HoverTool, ColumnDataSource
|
6 |
-
import umap
|
7 |
import pandas as pd
|
8 |
-
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
|
9 |
-
from sentence_transformers import SentenceTransformer
|
10 |
-
import tempfile
|
11 |
import logging
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
# Set up logging
|
14 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
@@ -19,74 +18,76 @@ logging.info("Model loaded successfully.")
|
|
19 |
|
20 |
def process_pdf(pdf_path):
|
21 |
logging.info(f"Processing PDF: {pdf_path}")
|
22 |
-
# Open the PDF
|
23 |
doc = fitz.open(pdf_path)
|
24 |
texts = [page.get_text() for page in doc]
|
25 |
-
|
26 |
return " ".join(texts)
|
27 |
|
28 |
def create_embeddings(text):
|
29 |
-
|
30 |
sentences = text.split(". ") # A simple split; consider a more robust sentence splitter
|
31 |
embeddings = model.encode(sentences)
|
32 |
-
|
33 |
return embeddings, sentences
|
34 |
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
-
def generate_plotly_figure(query, pdf_file):
|
39 |
logging.info("Generating plot with Plotly.")
|
40 |
-
# Generate embeddings for the query
|
41 |
query_embedding = model.encode([query])[0]
|
42 |
-
|
43 |
-
# Process the PDF and create embeddings
|
44 |
text = process_pdf(pdf_file.name)
|
45 |
embeddings, sentences = create_embeddings(text)
|
46 |
-
|
47 |
-
logging.info("Data prepared for UMAP.")
|
48 |
-
# Prepare the data for UMAP and visualization
|
49 |
all_embeddings = np.vstack([embeddings, query_embedding])
|
50 |
all_sentences = sentences + [query]
|
51 |
-
|
52 |
-
# UMAP transformation
|
53 |
umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
|
54 |
umap_embeddings = umap_transform.fit_transform(all_embeddings)
|
55 |
|
56 |
-
|
57 |
-
#
|
58 |
-
distances = cosine_similarity([query_embedding], embeddings)[0]
|
59 |
-
closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed
|
60 |
|
61 |
-
|
62 |
-
colors
|
63 |
-
colors.append('red') # Query point in red
|
64 |
|
65 |
-
# Add the scatter plot for sentences and query
|
66 |
fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
|
67 |
marker=dict(color=colors[:-1]), text=all_sentences[:-1],
|
68 |
name='Sentences'))
|
69 |
-
|
70 |
-
# Add the scatter plot for the query point
|
71 |
fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
|
72 |
marker=dict(color='red'), text=[query], name='Query'))
|
73 |
-
|
74 |
-
fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
|
75 |
|
76 |
logging.info("Plotly figure created successfully.")
|
77 |
return fig
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
81 |
logging.info("Returning Plotly figure.")
|
82 |
return fig
|
|
|
83 |
iface = gr.Interface(
|
84 |
fn=gradio_interface,
|
85 |
-
inputs=[
|
86 |
-
|
|
|
|
|
|
|
|
|
87 |
title="PDF Content Visualizer",
|
88 |
-
description="
|
|
|
|
|
|
|
89 |
)
|
90 |
|
91 |
if __name__ == "__main__":
|
92 |
-
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import fitz # PyMuPDF for reading PDFs
|
3 |
import numpy as np
|
|
|
|
|
|
|
4 |
import pandas as pd
|
|
|
|
|
|
|
5 |
import logging
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
|
8 |
+
from sklearn.metrics.pairwise import linear_kernel as dot_similarity # For dot product
|
9 |
+
import umap
|
10 |
+
import plotly.graph_objects as go
|
11 |
|
12 |
# Set up logging
|
13 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
18 |
|
19 |
def process_pdf(pdf_path):
    """Extract the text of every page of a PDF and join it into one string.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        The text of all pages, joined with single spaces.
    """
    logging.info(f"Processing PDF: {pdf_path}")
    # Open the document as a context manager so it is closed again even when
    # text extraction fails part-way through; the original never closed it.
    with fitz.open(pdf_path) as doc:
        texts = [page.get_text() for page in doc]
    logging.info("PDF processed successfully.")
    return " ".join(texts)
|
25 |
|
26 |
def create_embeddings(text):
    """Cut *text* into sentence-like pieces and embed each piece.

    Args:
        text: The text to split and embed.

    Returns:
        A ``(embeddings, sentences)`` tuple: whatever ``model.encode``
        returns for the pieces, followed by the list of pieces themselves.
    """
    logging.info("Creating embeddings.")
    # Naive splitter on ". "; a real sentence tokenizer would be more robust.
    pieces = text.split(". ")
    vectors = model.encode(pieces)
    logging.info("Embeddings created successfully.")
    return vectors, pieces
|
32 |
|
33 |
+
def calculate_distances(embeddings, query_embedding, metric):
    """Compute the distance from every embedding to the query embedding.

    All metrics are oriented so that a smaller value means "closer":
    cosine similarity is turned into ``1 - similarity`` and the dot product
    is negated.

    Args:
        embeddings: The sentence embeddings to compare against the query.
        query_embedding: The embedding of the query.
        metric: One of ``"cosine"``, ``"euclidean"``, ``"manhattan"`` or
            ``"dot"``.

    Returns:
        The pairwise result flattened to a 1-D array.

    Raises:
        ValueError: If ``metric`` is not one of the supported names (this
            includes ``None``, e.g. when no option was selected in the UI).
    """
    if metric == "cosine":
        distances = 1 - cosine_similarity(embeddings, [query_embedding])
    elif metric == "euclidean":
        distances = euclidean_distances(embeddings, [query_embedding])
    elif metric == "manhattan":
        distances = manhattan_distances(embeddings, [query_embedding])
    elif metric == "dot":
        # Negated for consistency with other metrics (smaller == closer).
        distances = -dot_similarity(embeddings, [query_embedding])
    else:
        # Previously an unknown metric fell through every branch and crashed
        # with UnboundLocalError on the return statement below.
        raise ValueError(
            f"Unsupported distance metric: {metric!r}; "
            "expected 'cosine', 'euclidean', 'manhattan' or 'dot'"
        )
    return distances.flatten()
|
43 |
|
44 |
+
def generate_plotly_figure(query, pdf_file, metric, top_k=5):
    """Plot a 2-D UMAP projection of a PDF's sentences together with a query.

    Args:
        query: Query text; embedded with ``model.encode`` and drawn in red.
        pdf_file: Uploaded-file object whose ``name`` attribute is the path
            handed to ``process_pdf``.
        metric: Distance metric name forwarded to ``calculate_distances``.
        top_k: Number of closest sentences to highlight in green. Defaults
            to 5, the value that used to be hard-coded.

    Returns:
        A ``plotly.graph_objects.Figure`` with one trace for the sentences
        and one for the query point.
    """
    logging.info("Generating plot with Plotly.")
    query_embedding = model.encode([query])[0]
    text = process_pdf(pdf_file.name)
    embeddings, sentences = create_embeddings(text)

    # The query is stacked as the last row so it is projected in the same
    # UMAP fit as the sentences.
    all_embeddings = np.vstack([embeddings, query_embedding])
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    sentence_points = umap_embeddings[:-1]
    query_point = umap_embeddings[-1]

    # Distances are computed on the original embeddings, not the 2-D
    # projection; smaller means closer for every supported metric.
    distances = calculate_distances(embeddings, query_embedding, metric)
    closest_indices = set(np.argsort(distances)[:max(top_k, 0)].tolist())
    colors = ['green' if i in closest_indices else 'blue' for i in range(len(sentences))]

    fig = go.Figure(data=go.Scatter(x=sentence_points[:, 0], y=sentence_points[:, 1], mode='markers',
                                    marker=dict(color=colors), text=sentences,
                                    name='Sentences'))
    fig.add_trace(go.Scatter(x=[query_point[0]], y=[query_point[1]], mode='markers',
                             marker=dict(color='red'), text=[query], name='Query'))
    fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")

    logging.info("Plotly figure created successfully.")
    return fig
|
70 |
+
|
71 |
+
def gradio_interface(pdf_file, query, metric):
    """Gradio callback: build the Plotly figure for an uploaded PDF and query.

    Args:
        pdf_file: The uploaded file object from the file input.
        query: The query text from the textbox.
        metric: The selected distance metric name, or ``None`` when no radio
            option was chosen.

    Returns:
        The figure produced by ``generate_plotly_figure``.
    """
    if metric is None:
        # No option selected in the radio group: fall back to cosine rather
        # than crashing. The original concatenated the value into the log
        # message, which raised TypeError for None.
        metric = "cosine"
    logging.info("Gradio interface called with metric: %s", metric)
    fig = generate_plotly_figure(query, pdf_file, metric)
    logging.info("Returning Plotly figure.")
    return fig
|
76 |
+
|
77 |
# Gradio UI: PDF upload + query + metric selector in, Plotly figure out.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Query"),
        # Preselect a metric so the callback never receives None when the
        # user submits without touching the radio group.
        gr.Radio(
            choices=["cosine", "euclidean", "manhattan", "dot"],
            value="cosine",
            label="Distance Metric",
        ),
    ],
    outputs=gr.Plot(),
    title="PDF Content Visualizer",
    description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
to explore different aspects of textual similarity."""
)

if __name__ == "__main__":
    iface.launch()
|