Spaces:
Sleeping
Sleeping
import gradio as gr | |
import fitz # PyMuPDF for reading PDFs | |
import numpy as np | |
import pandas as pd | |
import logging | |
from sentence_transformers import SentenceTransformer | |
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances | |
from sklearn.metrics.pairwise import linear_kernel as dot_similarity # For dot product | |
import umap | |
import plotly.graph_objects as go | |
# Configure root logging once for the whole app: timestamp, level, message.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Load the sentence-embedding model a single time at import; the functions
# below all reuse this module-level instance.
_EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
logging.info("Model loaded successfully.")
def process_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path of the PDF, passed directly to ``fitz.open``.

    Returns:
        The text of each page (``page.get_text()``), in page order, joined
        with single spaces.
    """
    logging.info(f"Processing PDF: {pdf_path}")
    # Open through a context manager so the document handle is released even
    # if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]
    logging.info("PDF processed successfully.")
    return " ".join(page_texts)
def create_embeddings(text):
    """Split ``text`` into sentence-like fragments and embed each fragment.

    The split is a plain ``". "`` separator split, so the fragments are only
    approximate sentences (abbreviations, "?" and "!" are not handled).

    Args:
        text: The text to split and embed.

    Returns:
        A tuple ``(embeddings, sentences)``: the result of ``model.encode`` on
        the fragments, and the list of fragments themselves.
    """
    logging.info("Creating embeddings.")
    fragments = text.split(". ")
    vectors = model.encode(fragments)
    logging.info("Embeddings created successfully.")
    return vectors, fragments
def calculate_distances(embeddings, query_embedding, metric):
    """Return the distance from each embedding to the query embedding.

    Args:
        embeddings: Embeddings to score, one row per item; passed as-is to the
            scikit-learn pairwise functions.
        query_embedding: A single embedding vector for the query.
        metric: One of ``"cosine"``, ``"euclidean"``, ``"manhattan"`` or
            ``"dot"`` (case-sensitive).

    Returns:
        A flattened array with one value per row of ``embeddings``. For every
        metric a smaller value means "closer": cosine similarity is converted
        to ``1 - similarity`` and the dot product is negated.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # The pairwise helpers compare two batches, so wrap the single query
    # vector in a one-element batch.
    query_batch = [query_embedding]

    if metric == "cosine":
        distances = 1 - cosine_similarity(embeddings, query_batch)
    elif metric == "euclidean":
        distances = euclidean_distances(embeddings, query_batch)
    elif metric == "manhattan":
        distances = manhattan_distances(embeddings, query_batch)
    elif metric == "dot":
        # Negated so that, like the other metrics, smaller means more similar.
        distances = -dot_similarity(embeddings, query_batch)
    else:
        # Without this branch an unknown metric (or None) would surface as an
        # UnboundLocalError on the return line below.
        raise ValueError(
            f"Unsupported distance metric: {metric!r}; "
            "expected one of 'cosine', 'euclidean', 'manhattan', 'dot'."
        )
    return distances.flatten()
def generate_plotly_figure(query, pdf_file, metric, top_k=5):
    """Build a 2-D UMAP scatter plot of a PDF's sentences and the query.

    The PDF text is split into sentence fragments, the fragments and the query
    are embedded with the module-level ``model``, and all embeddings are
    projected together to two dimensions with UMAP. The ``top_k`` fragments
    closest to the query under ``metric`` are drawn in green, the remaining
    fragments in blue, and the query itself in red as a separate trace.

    Args:
        query: The query text.
        pdf_file: Either a path string, or an object whose ``.name`` attribute
            is the path of the PDF.
        metric: Distance metric name forwarded to ``calculate_distances``.
        top_k: How many of the closest fragments to highlight (non-negative;
            defaults to 5).

    Returns:
        A ``plotly.graph_objects.Figure`` with a "Sentences" trace and a
        "Query" trace.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    logging.info("Generating plot with Plotly.")

    # Fail early with a clear message instead of an AttributeError on `.name`.
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    pdf_path = str(pdf_file) if isinstance(pdf_file, str) else pdf_file.name

    query_embedding = model.encode([query])[0]
    text = process_pdf(pdf_path)
    embeddings, sentences = create_embeddings(text)

    # Rank fragments before running UMAP so that a bad metric is reported
    # without first paying for the projection.
    distances = calculate_distances(embeddings, query_embedding, metric)
    colors = ['blue'] * len(sentences)
    for idx in np.argsort(distances)[:top_k].tolist():
        colors[idx] = 'green'

    # Project fragments and query together; the query is the last row.
    all_embeddings = np.vstack([embeddings, query_embedding])
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    sentence_points = umap_embeddings[:-1]
    query_point = umap_embeddings[-1]

    fig = go.Figure(data=go.Scatter(
        x=sentence_points[:, 0],
        y=sentence_points[:, 1],
        mode='markers',
        marker=dict(color=colors),
        text=sentences,
        name='Sentences',
    ))
    fig.add_trace(go.Scatter(
        x=[query_point[0]],
        y=[query_point[1]],
        mode='markers',
        marker=dict(color='red'),
        text=[query],
        name='Query',
    ))
    fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
    logging.info("Plotly figure created successfully.")
    return fig
def gradio_interface(pdf_file, query, metric):
    """Gradio callback: build the UMAP figure for an uploaded PDF and a query.

    Args:
        pdf_file: The uploaded file value supplied by the ``gr.File`` input.
        query: The text from the query textbox.
        metric: The selected distance metric name.

    Returns:
        The Plotly figure produced by ``generate_plotly_figure``.
    """
    # Use logging's lazy %-formatting rather than string concatenation:
    # `"..." + metric` raises TypeError whenever metric is not a str (e.g. None).
    logging.info("Gradio interface called with metric: %s", metric)
    fig = generate_plotly_figure(query, pdf_file, metric)
    logging.info("Returning Plotly figure.")
    return fig
# Gradio UI: a PDF upload, a free-text query and a metric selector feed
# `gradio_interface`, whose Plotly figure is rendered in a plot output.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Query"),
        # Pre-select a metric so a submission made without touching the radio
        # group still carries a valid metric instead of None.
        gr.Radio(
            choices=["cosine", "euclidean", "manhattan", "dot"],
            value="cosine",
            label="Distance Metric",
        ),
    ],
    outputs=gr.Plot(),
    title="PDF Content Visualizer",
    description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
to explore different aspects of textual similarity.""",
)

# Start the web app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()