# Spaces: Sleeping
import gradio as gr | |
import fitz # PyMuPDF for reading PDFs | |
import numpy as np | |
from bokeh.plotting import figure, output_file, save | |
from bokeh.models import HoverTool, ColumnDataSource | |
import umap | |
import pandas as pd | |
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances | |
from sentence_transformers import SentenceTransformer | |
import tempfile | |
import logging | |
# Configure root logging once, at import time: timestamp, level, message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the sentence-embedding model a single time at module level so that
# every function below shares the same instance instead of reloading it.
model = SentenceTransformer('all-MiniLM-L6-v2')
logging.info("Model loaded successfully.")
def process_pdf(pdf_path):
    """Extract the text of every page of a PDF and join it into one string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages, in page order, joined by single spaces.
    """
    logging.info("Processing PDF: %s", pdf_path)
    # The context manager closes the document even if text extraction
    # raises, so the underlying file handle is not leaked between calls.
    with fitz.open(pdf_path) as doc:
        texts = [page.get_text() for page in doc]
    # Use logging (not print) so this message follows the configured format
    # like every other status message in this file.
    logging.info("PDF processed successfully.")
    return " ".join(texts)
def create_embeddings(text):
    """Split text into sentences and embed them with the module-level model.

    Args:
        text: Raw text to split and embed.

    Returns:
        A ``(embeddings, sentences)`` tuple: the result of ``model.encode``
        on the sentence list, and the list of sentence strings in the same
        order.

    Raises:
        ValueError: If ``text`` contains no non-blank sentence.
    """
    logging.info("Creating embeddings.")
    # Collapse newlines and runs of whitespace first, so a sentence that
    # ends at a line break ("end.\nNext") is still separated by ". ".
    normalized = " ".join(text.split())
    # A simple split; consider a more robust sentence splitter.
    # Blank fragments are dropped so they are not embedded and plotted.
    sentences = [
        fragment.strip()
        for fragment in normalized.split(". ")
        if fragment.strip()
    ]
    if not sentences:
        # Fail here with a clear message instead of embedding an empty
        # string and failing later with an unrelated error.
        raise ValueError("No text to embed: the input contains no sentences.")
    embeddings = model.encode(sentences)
    logging.info("Embeddings created successfully.")
    return embeddings, sentences
import plotly.express as px | |
import plotly.graph_objects as go | |
def generate_plotly_figure(query, pdf_file, top_k=5):
    """Build a 2-D UMAP scatter plot of a PDF's sentences and a query.

    Args:
        query: Query text; embedded with the same model as the sentences.
        pdf_file: The uploaded PDF, either an object exposing the file path
            as ``.name`` or a plain path string.
        top_k: Number of sentences most similar to the query (by cosine
            similarity) to highlight in green. Defaults to 5.

    Returns:
        A ``plotly.graph_objects.Figure`` with one trace for the sentences
        (the ``top_k`` most similar in green, the rest in blue) and one
        trace for the query point (red).

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    logging.info("Generating plot with Plotly.")
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Accept both an upload wrapper carrying the path in ``.name`` and a
    # bare path string, so the function works with either calling style.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Generate embeddings for the query
    query_embedding = model.encode([query])[0]

    # Process the PDF and create embeddings
    text = process_pdf(pdf_path)
    embeddings, sentences = create_embeddings(text)
    logging.info("Data prepared for UMAP.")

    # The query is stacked as the LAST row so it is projected into the same
    # 2-D space as the sentences and can be sliced off again below.
    all_embeddings = np.vstack([embeddings, query_embedding])

    # UMAP transformation (fixed random_state for a reproducible layout)
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    logging.info("UMAP transformation completed.")

    # Cosine similarity: larger means closer to the query. Sort descending
    # and take the first ``top_k``; unlike ``[-top_k:]`` this is also
    # correct for ``top_k == 0`` (which would otherwise select everything).
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    closest_indices = set(similarities.argsort()[::-1][:top_k].tolist())

    # Target points in green, all other sentences in blue
    colors = [
        'green' if index in closest_indices else 'blue'
        for index in range(len(sentences))
    ]

    # Add the scatter plot for the sentences (every row except the last)
    fig = go.Figure(data=go.Scatter(
        x=umap_embeddings[:-1, 0],
        y=umap_embeddings[:-1, 1],
        mode='markers',
        marker=dict(color=colors),
        text=sentences,
        name='Sentences',
    ))

    # Add the scatter plot for the query point (the last row), in red
    fig.add_trace(go.Scatter(
        x=[umap_embeddings[-1, 0]],
        y=[umap_embeddings[-1, 1]],
        mode='markers',
        marker=dict(color='red'),
        text=[query],
        name='Query',
    ))

    fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
    logging.info("Plotly figure created successfully.")
    return fig
def gradio_interface(pdf_file, query):
    """Gradio callback: turn an uploaded PDF and a query into a Plotly figure.

    Args:
        pdf_file: The uploaded PDF as supplied by the file input.
        query: The query text from the textbox.

    Returns:
        The figure produced by ``generate_plotly_figure``.
    """
    logging.info("Gradio interface called.")
    # The UI supplies (pdf_file, query); the plotting function takes them
    # in the opposite order, so pass them by keyword.
    plot = generate_plotly_figure(query=query, pdf_file=pdf_file)
    logging.info("Returning Plotly figure.")
    return plot
# Gradio UI definition: a file upload and a text query go in, and the
# Plotly figure returned by gradio_interface is rendered.
iface = gr.Interface(
    title="PDF Content Visualizer",
    description="Upload a PDF and enter a query to visualize the content.",
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Query"),
    ],
    outputs=gr.Plot(),  # gr.Plot() is the output component for Plotly figures
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()