"""Gradio app that visualizes how a query relates to the sentences of a PDF.

The PDF text is split into sentences, each sentence and the query are embedded
with a SentenceTransformer model, the embeddings are projected to 2-D with
UMAP, and the sentences closest to the query (under a selectable distance
metric) are highlighted in a Plotly scatter plot.
"""

import logging

import fitz  # PyMuPDF for reading PDFs
import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import umap
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import (
    cosine_similarity,
    euclidean_distances,
    manhattan_distances,
)
from sklearn.metrics.pairwise import linear_kernel as dot_similarity  # For dot product

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the model globally
model = SentenceTransformer('all-MiniLM-L6-v2')
logging.info("Model loaded successfully.")


def process_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, space-joined.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string containing the text of all pages separated by spaces.
    """
    logging.info("Processing PDF: %s", pdf_path)
    # Use the document as a context manager so the file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        texts = [page.get_text() for page in doc]
    logging.info("PDF processed successfully.")
    return " ".join(texts)


def create_embeddings(text):
    """Split *text* into sentences and embed each one with the global model.

    Args:
        text: The raw document text.

    Returns:
        A tuple ``(embeddings, sentences)`` where ``embeddings`` is the result
        of ``model.encode(sentences)`` and ``sentences`` is the list of
        non-empty, whitespace-stripped sentence strings.

    Raises:
        ValueError: If *text* contains no non-whitespace content to embed.
    """
    logging.info("Creating embeddings.")
    # A simple split; consider a more robust sentence splitter.
    # Fragments that are empty or whitespace-only carry no content and would
    # only add meaningless points to the plot, so they are dropped.
    sentences = [
        fragment.strip() for fragment in text.split(". ") if fragment.strip()
    ]
    if not sentences:
        raise ValueError("No text could be extracted from the document.")
    embeddings = model.encode(sentences)
    logging.info("Embeddings created successfully.")
    return embeddings, sentences


def calculate_distances(embeddings, query_embedding, metric):
    """Compute the distance from every sentence embedding to the query.

    Smaller values mean "closer" for every metric; the dot product is negated
    so that it sorts the same way as the true distances.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        query_embedding: 1-D embedding of the query.
        metric: One of ``"cosine"``, ``"euclidean"``, ``"manhattan"`` or
            ``"dot"``.

    Returns:
        A flat array with one distance per row of *embeddings*.

    Raises:
        ValueError: If *metric* is not one of the supported names.
    """
    query_row = [query_embedding]  # the pairwise functions expect 2-D input
    if metric == "cosine":
        distances = 1 - cosine_similarity(embeddings, query_row)
    elif metric == "euclidean":
        distances = euclidean_distances(embeddings, query_row)
    elif metric == "manhattan":
        distances = manhattan_distances(embeddings, query_row)
    elif metric == "dot":
        # Negated for consistency with other metrics
        distances = -dot_similarity(embeddings, query_row)
    else:
        # Previously an unknown metric fell through and failed with an
        # UnboundLocalError on the return statement.
        raise ValueError(
            f"Unsupported metric {metric!r}; expected one of "
            "'cosine', 'euclidean', 'manhattan', 'dot'."
        )
    return distances.flatten()


def generate_plotly_figure(query, pdf_file, metric, top_k=5):
    """Build the UMAP scatter plot of document sentences and the query.

    Args:
        query: The query string to embed and highlight.
        pdf_file: The uploaded PDF, either an object exposing the file path as
            ``.name`` or the path itself.
        metric: Distance metric name accepted by ``calculate_distances``.
        top_k: Number of closest sentences to highlight in green.

    Returns:
        A ``plotly.graph_objects.Figure`` with one trace for the sentences
        (closest ones green, the rest blue) and one trace for the query (red).

    Raises:
        ValueError: If the PDF yields no text or *metric* is unsupported.
    """
    logging.info("Generating plot with Plotly.")
    query_embedding = model.encode([query])[0]

    # Accept both a file-like upload object (path in ``.name``) and a plain
    # path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = process_pdf(pdf_path)
    embeddings, sentences = create_embeddings(text)

    # Compute distances before the UMAP fit so an invalid metric fails fast
    # instead of after the expensive projection.
    distances = calculate_distances(embeddings, query_embedding, metric)
    # A set gives O(1) membership tests when colouring the points.
    closest_indices = set(np.argsort(distances)[:top_k].tolist())
    colors = [
        'green' if index in closest_indices else 'blue'
        for index in range(len(sentences))
    ]

    # The query is appended as the last row so it is projected together with
    # the sentences and can be split off again by position.
    all_embeddings = np.vstack([embeddings, query_embedding])
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)

    fig = go.Figure(
        data=go.Scatter(
            x=umap_embeddings[:-1, 0],
            y=umap_embeddings[:-1, 1],
            mode='markers',
            marker=dict(color=colors),
            text=sentences,
            name='Sentences',
        )
    )
    fig.add_trace(
        go.Scatter(
            x=[umap_embeddings[-1, 0]],
            y=[umap_embeddings[-1, 1]],
            mode='markers',
            marker=dict(color='red'),
            text=[query],
            name='Query',
        )
    )
    fig.update_layout(
        title="UMAP Projection of Sentences with Query Highlight",
        xaxis_title="UMAP 1",
        yaxis_title="UMAP 2",
    )
    logging.info("Plotly figure created successfully.")
    return fig


def gradio_interface(pdf_file, query, metric):
    """Gradio callback: validate the inputs and return the Plotly figure.

    Args:
        pdf_file: The uploaded PDF from the ``gr.File`` input, or ``None``.
        query: The query text from the ``gr.Textbox`` input.
        metric: The selected distance metric from the ``gr.Radio`` input, or
            ``None`` when nothing is selected.

    Returns:
        The figure produced by ``generate_plotly_figure``.

    Raises:
        gr.Error: If no PDF was uploaded, no metric was selected, or the
            figure could not be generated from the given inputs.
    """
    # %s formatting tolerates metric=None, unlike string concatenation.
    logging.info("Gradio interface called with metric: %s", metric)
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")
    if metric is None:
        raise gr.Error("Please select a distance metric.")
    try:
        fig = generate_plotly_figure(query, pdf_file, metric)
    except ValueError as err:
        # Surface input problems (empty PDF, bad metric) in the UI instead of
        # as an opaque server-side traceback.
        raise gr.Error(str(err)) from err
    logging.info("Returning Plotly figure.")
    return fig


iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Query"),
        gr.Radio(
            choices=["cosine", "euclidean", "manhattan", "dot"],
            value="cosine",  # pre-select a metric so the callback never receives None from the UI
            label="Distance Metric",
        ),
    ],
    outputs=gr.Plot(),
    title="PDF Content Visualizer",
    description=(
        "This tool allows you to upload a PDF document, input a query, and "
        "visualize the context of the document as it relates to your query. "
        "It uses UMAP for dimensionality reduction and highlights the query "
        "and its closest contexts within the document based on the selected "
        "distance metric. Choose from cosine, Euclidean, Manhattan, or dot "
        "product metrics to explore different aspects of textual similarity."
    ),
)

if __name__ == "__main__":
    iface.launch()