import gradio as gr
import fitz  # PyMuPDF for reading PDFs
import numpy as np
from bokeh.plotting import figure, output_file, save
from bokeh.models import HoverTool, ColumnDataSource
import umap
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sentence_transformers import SentenceTransformer
import tempfile
import logging
import plotly.express as px
import plotly.graph_objects as go

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the model globally
model = SentenceTransformer('all-MiniLM-L6-v2')
logging.info("Model loaded successfully.")

def process_pdf(pdf_path):
    logging.info(f"Processing PDF: {pdf_path}")
    # Open the PDF and extract the text of every page
    doc = fitz.open(pdf_path)
    texts = [page.get_text() for page in doc]
    logging.info("PDF processed successfully.")
    return " ".join(texts)

def create_embeddings(text):
    logging.info("Creating embeddings.")
    sentences = text.split(". ")  # A simple split; consider a more robust sentence splitter
    embeddings = model.encode(sentences)
    logging.info("Embeddings created successfully.")
    return embeddings, sentences

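
# The ". " split in create_embeddings() is deliberately naive. The helper below is a
# hypothetical sketch of the "more robust sentence splitter" its comment alludes to;
# it is an illustration, not part of the original app. It splits on '.', '!' or '?'
# followed by whitespace and drops empty fragments, and could be swapped in for
# text.split(". ") if the simple split proves too coarse.
def split_sentences(text):
    import re  # local import keeps this optional sketch self-contained
    parts = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in parts if s.strip()]
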
# Bokeh-based variant of the visualization; not wired into the Gradio interface below,
# which uses the Plotly version instead.
def generate_plot_bokeh(query, pdf_file):
    logging.info("Generating plot.")
    # Generate an embedding for the query
    query_embedding = model.encode([query])[0]

    # Process the PDF and create embeddings for its sentences
    text = process_pdf(pdf_file.name)
    embeddings, sentences = create_embeddings(text)
    logging.info("Data prepared for UMAP.")

    # Stack sentence and query embeddings so they share one UMAP projection
    all_embeddings = np.vstack([embeddings, query_embedding])
    all_sentences = sentences + [query]

    # UMAP transformation down to 2D
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    logging.info("UMAP transformation completed.")

    # Find the sentences closest to the query (highest cosine similarity)
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    closest_indices = similarities.argsort()[-5:][::-1]  # Top 5; adjust the number as needed

    # Prepare data for plotting
    data = {
        'x': umap_embeddings[:-1, 0],  # Exclude the query point itself
        'y': umap_embeddings[:-1, 1],  # Exclude the query point itself
        'content': all_sentences[:-1],  # Exclude the query sentence itself
        'color': ['red' if i in closest_indices else 'blue' for i in range(len(sentences))],
    }
    source = ColumnDataSource(data)

    # Create the Bokeh plot
    p = figure(title="UMAP Projection of Sentences", width=700, height=700)
    p.scatter('x', 'y', color='color', source=source)
    hover = HoverTool(tooltips=[("Content", "@content")])
    p.add_tools(hover)
    logging.info("Plot created successfully.")

    # Save the plot to an HTML file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".html")
    logging.info(f"temp file is {temp_file.name}")
    output_file(temp_file.name)
    save(p)
    logging.info("Plot saved to file.")
    return temp_file.name

def generate_plotly_figure(query, pdf_file):
    logging.info("Generating plot with Plotly.")
    # Generate an embedding for the query
    query_embedding = model.encode([query])[0]

    # Process the PDF and create embeddings for its sentences
    text = process_pdf(pdf_file.name)
    embeddings, sentences = create_embeddings(text)
    logging.info("Data prepared for UMAP.")

    # Stack sentence and query embeddings so they share one UMAP projection
    all_embeddings = np.vstack([embeddings, query_embedding])
    all_sentences = sentences + [query]

    # UMAP transformation down to 2D
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    logging.info("UMAP transformation completed.")

    # Find the sentences closest to the query (highest cosine similarity)
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    closest_indices = similarities.argsort()[-5:][::-1]  # Top 5; adjust the number as needed

    # Prepare data for plotting: highlight the closest sentences in red
    colors = ['red' if i in closest_indices else 'blue' for i in range(len(sentences))]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
                             marker=dict(color=colors), text=all_sentences[:-1]))
    fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
    logging.info("Plotly figure created successfully.")
    return fig

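
# A hypothetical way to exercise generate_plotly_figure() locally without launching the
# Gradio UI. The app only relies on the uploaded file object exposing a .name path, so a
# SimpleNamespace works as a stand-in; "sample.pdf" and the query string are assumed
# example values, not files or data shipped with this Space.
#
#   from types import SimpleNamespace
#   fig = generate_plotly_figure("neural networks", SimpleNamespace(name="sample.pdf"))
#   fig.show()
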
def gradio_interface(pdf_file, query):
    logging.info("Gradio interface called.")
    fig = generate_plotly_figure(query, pdf_file)
    logging.info("Returning Plotly figure.")
    return fig


iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")],
    outputs=gr.Plot(),  # Updated to use gr.Plot() for Plotly figures
    title="PDF Content Visualizer",
    description="Upload a PDF and enter a query to visualize the content."
)

if __name__ == "__main__":
    iface.launch()