umarigan committed on
Commit
c38bbc6
·
verified ·
1 Parent(s): 02e77ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -36
app.py CHANGED
@@ -1,69 +1,83 @@
1
  import gradio as gr
2
- from sentence_transformers import SentenceTransformer
3
- import fitz # PyMuPDF
4
  import numpy as np
5
  from bokeh.plotting import figure, output_file, save
6
- from bokeh.io import export_png
7
- from bokeh.embed import file_html
8
- from bokeh.resources import CDN
 
 
9
  import tempfile
10
- import os
11
 
12
- # Load your model
13
  model = SentenceTransformer('all-MiniLM-L6-v2')
14
 
15
  def process_pdf(pdf_path):
16
  # Open the PDF
17
  doc = fitz.open(pdf_path)
18
- texts = []
19
- for page in doc:
20
- texts.append(page.get_text())
21
  return " ".join(texts)
22
 
23
  def create_embeddings(text):
24
- # Split the text into sentences/chunks and generate embeddings
25
- # This is a placeholder for your actual text splitting and embedding code
26
- sentences = text.split(".") # Simplistic split, consider using a better sentence splitter
27
  embeddings = model.encode(sentences)
28
  return embeddings, sentences
29
 
30
  def generate_plot(query, pdf_file):
 
 
 
31
  # Process the PDF and create embeddings
32
- text = process_pdf(pdf_file)
33
  embeddings, sentences = create_embeddings(text)
34
 
35
- # Here, you'll integrate the UMAP and Bokeh visualization code you have,
36
- # and then save the Bokeh plot to a file.
37
- # For simplicity, let's assume it's saved to 'plot.html'
38
 
39
- output_file("plot.html")
40
- # Your Bokeh plot creation code here...
41
- save(p) # Assuming 'p' is your Bokeh figure
42
-
43
- # Alternatively, you can save as PNG
44
- # export_png(p, filename="plot.png")
45
-
46
- # Return the path to the saved file
47
- return "plot.html" # or "plot.png"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  def gradio_interface(pdf_file, query):
50
- plot_path = generate_plot(query, pdf_file.name)
51
-
52
- # If returning HTML file
53
  with open(plot_path, "r") as f:
54
  html_content = f.read()
55
  return html_content
56
-
57
- # If returning an image
58
- # return plot_path
59
 
60
- # Set up the Gradio app
61
  iface = gr.Interface(
62
  fn=gradio_interface,
63
- inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")], # Updated to new API
64
- outputs=gr.HTML(label="Visualization"), # or gr.Image() depending on your output
65
  title="PDF Content Visualizer",
66
  description="Upload a PDF and enter a query to visualize the content."
67
  )
68
 
69
- iface.launch()
 
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF for reading PDFs
 
3
  import numpy as np
4
  from bokeh.plotting import figure, output_file, save
5
+ from bokeh.models import HoverTool, ColumnDataSource
6
+ import umap
7
+ import pandas as pd
8
+ from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
9
+ from sentence_transformers import SentenceTransformer
10
  import tempfile
 
11
 
12
+ # Initialize the model globally
13
  model = SentenceTransformer('all-MiniLM-L6-v2')
14
 
15
  def process_pdf(pdf_path):
16
  # Open the PDF
17
  doc = fitz.open(pdf_path)
18
+ texts = [page.get_text() for page in doc]
 
 
19
  return " ".join(texts)
20
 
21
  def create_embeddings(text):
22
+ sentences = text.split(". ") # A simple split; consider a more robust sentence splitter
 
 
23
  embeddings = model.encode(sentences)
24
  return embeddings, sentences
25
 
26
  def generate_plot(query, pdf_file):
27
+ # Generate embeddings for the query
28
+ query_embedding = model.encode([query])[0]
29
+
30
  # Process the PDF and create embeddings
31
+ text = process_pdf(pdf_file.name)
32
  embeddings, sentences = create_embeddings(text)
33
 
34
+ # Prepare the data for UMAP and visualization
35
+ all_embeddings = np.vstack([embeddings, query_embedding])
36
+ all_sentences = sentences + [query]
37
 
38
+ # UMAP transformation
39
+ umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
40
+ umap_embeddings = umap_transform.fit_transform(all_embeddings)
41
+
42
+ # Find the closest sentences to the query
43
+ distances = cosine_similarity([query_embedding], embeddings)[0]
44
+ closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed
45
+
46
+ # Prepare data for plotting
47
+ data = {
48
+ 'x': umap_embeddings[:-1, 0], # Exclude the query point itself
49
+ 'y': umap_embeddings[:-1, 1], # Exclude the query point itself
50
+ 'content': all_sentences[:-1], # Exclude the query sentence itself
51
+ 'color': ['red' if i in closest_indices else 'blue' for i in range(len(sentences))],
52
+ }
53
+ source = ColumnDataSource(data)
54
+
55
+ # Create the Bokeh plot
56
+ p = figure(title="UMAP Projection of Sentences", width=700, height=700)
57
+ p.scatter('x', 'y', color='color', source=source)
58
+
59
+ hover = HoverTool(tooltips=[("Content", "@content")])
60
+ p.add_tools(hover)
61
+
62
+ # Save the plot to an HTML file
63
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".html")
64
+ output_file(temp_file.name)
65
+ save(p)
66
+ return temp_file.name
67
 
68
  def gradio_interface(pdf_file, query):
69
+ plot_path = generate_plot(query, pdf_file)
 
 
70
  with open(plot_path, "r") as f:
71
  html_content = f.read()
72
  return html_content
 
 
 
73
 
 
74
  iface = gr.Interface(
75
  fn=gradio_interface,
76
+ inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")],
77
+ outputs=gr.HTML(label="Visualization"),
78
  title="PDF Content Visualizer",
79
  description="Upload a PDF and enter a query to visualize the content."
80
  )
81
 
82
+ if __name__ == "__main__":
83
+ iface.launch()