umarigan committed on
Commit
6115839
·
verified ·
1 Parent(s): 97c72c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -40
app.py CHANGED
@@ -1,14 +1,13 @@
1
  import gradio as gr
2
  import fitz # PyMuPDF for reading PDFs
3
  import numpy as np
4
- from bokeh.plotting import figure, output_file, save
5
- from bokeh.models import HoverTool, ColumnDataSource
6
- import umap
7
  import pandas as pd
8
- from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
9
- from sentence_transformers import SentenceTransformer
10
- import tempfile
11
  import logging
 
 
 
 
 
12
 
13
  # Set up logging
14
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -19,74 +18,76 @@ logging.info("Model loaded successfully.")
19
 
20
  def process_pdf(pdf_path):
21
  logging.info(f"Processing PDF: {pdf_path}")
22
- # Open the PDF
23
  doc = fitz.open(pdf_path)
24
  texts = [page.get_text() for page in doc]
25
- print("PDF processed successfully.")
26
  return " ".join(texts)
27
 
28
  def create_embeddings(text):
29
- print("Creating embeddings.")
30
  sentences = text.split(". ") # A simple split; consider a more robust sentence splitter
31
  embeddings = model.encode(sentences)
32
- print("Embeddings created successfully.")
33
  return embeddings, sentences
34
 
35
- import plotly.express as px
36
- import plotly.graph_objects as go
 
 
 
 
 
 
 
 
37
 
38
- def generate_plotly_figure(query, pdf_file):
39
  logging.info("Generating plot with Plotly.")
40
- # Generate embeddings for the query
41
  query_embedding = model.encode([query])[0]
42
-
43
- # Process the PDF and create embeddings
44
  text = process_pdf(pdf_file.name)
45
  embeddings, sentences = create_embeddings(text)
46
-
47
- logging.info("Data prepared for UMAP.")
48
- # Prepare the data for UMAP and visualization
49
  all_embeddings = np.vstack([embeddings, query_embedding])
50
  all_sentences = sentences + [query]
51
-
52
- # UMAP transformation
53
  umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
54
  umap_embeddings = umap_transform.fit_transform(all_embeddings)
55
 
56
- logging.info("UMAP transformation completed.")
57
- # Find the closest sentences to the query
58
- distances = cosine_similarity([query_embedding], embeddings)[0]
59
- closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed
60
 
61
- # Prepare data for plotting
62
- colors = ['green' if i in closest_indices else 'blue' for i in range(len(sentences))] # Target points in green
63
- colors.append('red') # Query point in red
64
 
65
- # Add the scatter plot for sentences and query
66
  fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
67
  marker=dict(color=colors[:-1]), text=all_sentences[:-1],
68
  name='Sentences'))
69
-
70
- # Add the scatter plot for the query point
71
  fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
72
  marker=dict(color='red'), text=[query], name='Query'))
73
-
74
- fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
75
 
76
  logging.info("Plotly figure created successfully.")
77
  return fig
78
- def gradio_interface(pdf_file, query):
79
- logging.info("Gradio interface called.")
80
- fig = generate_plotly_figure(query, pdf_file)
 
81
  logging.info("Returning Plotly figure.")
82
  return fig
 
83
  iface = gr.Interface(
84
  fn=gradio_interface,
85
- inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")],
86
- outputs=gr.Plot(), # Updated to use gr.Plot() for Plotly figures
 
 
 
 
87
  title="PDF Content Visualizer",
88
- description="Upload a PDF and enter a query to visualize the content."
 
 
 
89
  )
90
 
91
  if __name__ == "__main__":
92
- iface.launch()
 
1
  import gradio as gr
2
  import fitz # PyMuPDF for reading PDFs
3
  import numpy as np
 
 
 
4
  import pandas as pd
 
 
 
5
  import logging
6
+ from sentence_transformers import SentenceTransformer
7
+ from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
8
+ from sklearn.metrics.pairwise import linear_kernel as dot_similarity # For dot product
9
+ import umap
10
+ import plotly.graph_objects as go
11
 
12
  # Set up logging
13
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
18
 
19
def process_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The ``get_text()`` result of each page, joined with single spaces.
    """
    logging.info(f"Processing PDF: {pdf_path}")
    # Use the document as a context manager so it is closed even when text
    # extraction raises; previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        texts = [page.get_text() for page in doc]
    logging.info("PDF processed successfully.")
    return " ".join(texts)
25
 
26
def create_embeddings(text):
    """Split text into sentences and embed them with the module-level model.

    Args:
        text: Raw document text.

    Returns:
        A ``(embeddings, sentences)`` tuple where ``embeddings`` is whatever
        ``model.encode`` returns for ``sentences`` and the two are
        index-aligned.

    Raises:
        ValueError: If the text contains no non-blank sentence fragment.
    """
    logging.info("Creating embeddings.")
    # A simple split; consider a more robust sentence splitter.
    # Blank fragments (runs of whitespace between separators, trailing
    # remainders) carry no content, so they are dropped instead of being
    # embedded and plotted as if they were sentences.
    stripped = (fragment.strip() for fragment in text.split(". "))
    sentences = [fragment for fragment in stripped if fragment]
    if not sentences:
        raise ValueError("No sentences could be extracted from the text.")
    embeddings = model.encode(sentences)
    logging.info("Embeddings created successfully.")
    return embeddings, sentences
32
 
33
def calculate_distances(embeddings, query_embedding, metric):
    """Return the distance from every embedding to the query embedding.

    Smaller values always mean "closer", whichever metric is chosen.

    Args:
        embeddings: Sentence embeddings, one row per sentence.
        query_embedding: Embedding of the query; wrapped in a list so the
            pairwise helpers see it as a single-row matrix.
        metric: One of ``"cosine"``, ``"euclidean"``, ``"manhattan"`` or
            ``"dot"``.

    Returns:
        The flattened result of the chosen pairwise computation.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # Validate up front: an unknown metric used to fall through every branch
    # and fail with an UnboundLocalError on ``distances``.
    if metric not in ("cosine", "euclidean", "manhattan", "dot"):
        raise ValueError(
            f"Unsupported metric {metric!r}; expected one of "
            "'cosine', 'euclidean', 'manhattan', 'dot'."
        )

    query_matrix = [query_embedding]
    if metric == "cosine":
        # Turn the similarity into a distance.
        distances = 1 - cosine_similarity(embeddings, query_matrix)
    elif metric == "euclidean":
        distances = euclidean_distances(embeddings, query_matrix)
    elif metric == "manhattan":
        distances = manhattan_distances(embeddings, query_matrix)
    else:
        # "dot": negated for consistency with other metrics.
        distances = -dot_similarity(embeddings, query_matrix)
    return distances.flatten()
43
 
44
def generate_plotly_figure(query, pdf_file, metric="cosine"):
    """Build a Plotly scatter of a PDF's sentences and a query in UMAP space.

    The sentences and the query are embedded, projected to 2-D together with
    UMAP, and plotted. The five sentences closest to the query under
    ``metric`` are drawn in green, the remaining sentences in blue and the
    query itself in red.

    Args:
        query: Query text to embed and highlight.
        pdf_file: Uploaded file object exposing the path as ``.name``, or the
            path itself.
        metric: Metric name forwarded to ``calculate_distances``.

    Returns:
        The Plotly figure.

    Raises:
        ValueError: If no PDF was provided.
    """
    logging.info("Generating plot with Plotly.")
    # Fail early with a clear message instead of an AttributeError on
    # ``None.name`` when the form is submitted without an upload.
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Accept both an upload object (path in ``.name``) and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    query_embedding = model.encode([query])[0]
    text = process_pdf(pdf_path)
    embeddings, sentences = create_embeddings(text)

    # The query is stacked as the last row so it is projected into the same
    # 2-D space as the sentences.
    all_embeddings = np.vstack([embeddings, query_embedding])
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)

    distances = calculate_distances(embeddings, query_embedding, metric)
    # Indices of the 5 closest sentences, kept as a set for cheap membership.
    closest_indices = set(np.argsort(distances)[:5].tolist())
    colors = ['green' if i in closest_indices else 'blue' for i in range(len(sentences))]

    # All rows except the last are sentences; the last row is the query.
    fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
                                    marker=dict(color=colors), text=sentences,
                                    name='Sentences'))
    fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
                             marker=dict(color='red'), text=[query], name='Query'))
    fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")

    logging.info("Plotly figure created successfully.")
    return fig
70
+
71
def gradio_interface(pdf_file, query, metric="cosine"):
    """Gradio callback: build the figure for an uploaded PDF and a query.

    Args:
        pdf_file: The uploaded PDF as delivered by the file input.
        query: Query text from the textbox.
        metric: Selected distance metric; ``None`` (nothing selected) falls
            back to ``"cosine"``.

    Returns:
        The Plotly figure produced by ``generate_plotly_figure``.
    """
    # A radio input with no selection delivers None; the previous string
    # concatenation in the log call raised TypeError in that case.
    if metric is None:
        metric = "cosine"
    logging.info("Gradio interface called with metric: %s", metric)
    fig = generate_plotly_figure(query, pdf_file, metric)
    logging.info("Returning Plotly figure.")
    return fig
76
+
77
# Gradio UI: PDF upload + query + distance metric in, Plotly figure out.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Query"),
        # Preselect a metric so the callback never receives None when the
        # user submits without touching the radio buttons.
        gr.Radio(
            choices=["cosine", "euclidean", "manhattan", "dot"],
            value="cosine",
            label="Distance Metric",
        ),
    ],
    outputs=gr.Plot(),
    title="PDF Content Visualizer",
    description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
to explore different aspects of textual similarity."""
)

# Start the web app only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()