import gradio as gr
import fitz  # PyMuPDF for reading PDFs
import numpy as np
from bokeh.plotting import figure, output_file, save
from bokeh.models import HoverTool, ColumnDataSource
import umap
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sentence_transformers import SentenceTransformer
import tempfile
import logging
import plotly.express as px
import plotly.graph_objects as go

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the model globally
model = SentenceTransformer('all-MiniLM-L6-v2')
logging.info("Model loaded successfully.")

def process_pdf(pdf_path):
    logging.info(f"Processing PDF: {pdf_path}")
    # Open the PDF and extract the text of every page
    doc = fitz.open(pdf_path)
    texts = [page.get_text() for page in doc]
    logging.info("PDF processed successfully.")
    return " ".join(texts)

def create_embeddings(text):
    logging.info("Creating embeddings.")
    sentences = text.split(". ")  # A simple split; consider a more robust sentence splitter
    embeddings = model.encode(sentences)
    logging.info("Embeddings created successfully.")
    return embeddings, sentences

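
# The ". " split in create_embeddings() is deliberately naive. The helper below is a
# hypothetical sketch of the "more robust sentence splitter" its comment alludes to;
# it is an illustration, not part of the original app. It splits on '.', '!' or '?'
# followed by whitespace and drops empty fragments, and could be swapped in for
# text.split(". ") if the simple split proves too coarse.
def split_sentences(text):
    import re  # local import keeps this optional sketch self-contained
    parts = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in parts if s.strip()]
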
# Bokeh-based variant of the visualization; not wired into the Gradio interface below,
# which uses the Plotly version instead.
def generate_plot_bokeh(query, pdf_file):
    logging.info("Generating plot.")
    # Generate an embedding for the query
    query_embedding = model.encode([query])[0]

    # Process the PDF and create embeddings for its sentences
    text = process_pdf(pdf_file.name)
    embeddings, sentences = create_embeddings(text)
    logging.info("Data prepared for UMAP.")

    # Stack sentence and query embeddings so they share one UMAP projection
    all_embeddings = np.vstack([embeddings, query_embedding])
    all_sentences = sentences + [query]

    # UMAP transformation down to 2D
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    logging.info("UMAP transformation completed.")

    # Find the sentences closest to the query (highest cosine similarity)
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    closest_indices = similarities.argsort()[-5:][::-1]  # Top 5; adjust the number as needed

    # Prepare data for plotting
    data = {
        'x': umap_embeddings[:-1, 0],  # Exclude the query point itself
        'y': umap_embeddings[:-1, 1],  # Exclude the query point itself
        'content': all_sentences[:-1],  # Exclude the query sentence itself
        'color': ['red' if i in closest_indices else 'blue' for i in range(len(sentences))],
    }
    source = ColumnDataSource(data)

    # Create the Bokeh plot
    p = figure(title="UMAP Projection of Sentences", width=700, height=700)
    p.scatter('x', 'y', color='color', source=source)
    hover = HoverTool(tooltips=[("Content", "@content")])
    p.add_tools(hover)
    logging.info("Plot created successfully.")

    # Save the plot to an HTML file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".html")
    logging.info(f"temp file is {temp_file.name}")
    output_file(temp_file.name)
    save(p)
    logging.info("Plot saved to file.")
    return temp_file.name

def generate_plotly_figure(query, pdf_file):
    logging.info("Generating plot with Plotly.")
    # Generate an embedding for the query
    query_embedding = model.encode([query])[0]

    # Process the PDF and create embeddings for its sentences
    text = process_pdf(pdf_file.name)
    embeddings, sentences = create_embeddings(text)
    logging.info("Data prepared for UMAP.")

    # Stack sentence and query embeddings so they share one UMAP projection
    all_embeddings = np.vstack([embeddings, query_embedding])
    all_sentences = sentences + [query]

    # UMAP transformation down to 2D
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)
    logging.info("UMAP transformation completed.")

    # Find the sentences closest to the query (highest cosine similarity)
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    closest_indices = similarities.argsort()[-5:][::-1]  # Top 5; adjust the number as needed

    # Prepare data for plotting: highlight the closest sentences in red
    colors = ['red' if i in closest_indices else 'blue' for i in range(len(sentences))]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
                             marker=dict(color=colors), text=all_sentences[:-1]))
    fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
    logging.info("Plotly figure created successfully.")
    return fig

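
# A hypothetical way to exercise generate_plotly_figure() locally without launching the
# Gradio UI. The app only relies on the uploaded file object exposing a .name path, so a
# SimpleNamespace works as a stand-in; "sample.pdf" and the query string are assumed
# example values, not files or data shipped with this Space.
#
#   from types import SimpleNamespace
#   fig = generate_plotly_figure("neural networks", SimpleNamespace(name="sample.pdf"))
#   fig.show()
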
def gradio_interface(pdf_file, query):
    logging.info("Gradio interface called.")
    fig = generate_plotly_figure(query, pdf_file)
    logging.info("Returning Plotly figure.")
    return fig


iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")],
    outputs=gr.Plot(),  # Updated to use gr.Plot() for Plotly figures
    title="PDF Content Visualizer",
    description="Upload a PDF and enter a query to visualize the content."
)

if __name__ == "__main__":
    iface.launch()