Spaces:

umarigan
/

SemanticSearch

Sleeping

File size: 2,193 Bytes

e932fdf

import gradio as gr
from sentence_transformers import SentenceTransformer
import fitz  # PyMuPDF
import numpy as np
from bokeh.plotting import figure, output_file, save
from bokeh.io import export_png
from bokeh.embed import file_html
from bokeh.resources import CDN
import tempfile
import os

# Load the sentence-embedding model once, at import time, so that every call
# to create_embeddings() reuses this single module-level instance.
model = SentenceTransformer('all-MiniLM-L6-v2')

def process_pdf(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Filesystem path of the PDF document to read.

    Returns:
        The text of all pages, in page order, joined with single spaces.
    """
    # Opening the document as a context manager guarantees it is closed (and
    # its file handle released) even if text extraction fails part-way.
    with fitz.open(pdf_path) as doc:
        return " ".join(page.get_text() for page in doc)

def create_embeddings(text):
    """Split text into sentence-like chunks and embed each chunk.

    The split is deliberately simple (on "."); swap in a proper sentence
    splitter if abbreviations or decimals matter for your documents.

    Args:
        text: The full document text.

    Returns:
        A tuple ``(embeddings, sentences)`` where ``sentences`` is the list of
        non-empty, whitespace-stripped chunks and ``embeddings`` is whatever
        ``model.encode`` returns for that list (one entry per chunk).
    """
    # Splitting on "." always leaves an empty/whitespace-only tail after the
    # final period (and between consecutive periods); embedding those would
    # add meaningless points, so strip each chunk and drop the blank ones.
    sentences = [chunk for chunk in map(str.strip, text.split(".")) if chunk]
    embeddings = model.encode(sentences)
    return embeddings, sentences

def generate_plot(query, pdf_file):
    """Build an interactive scatter plot of a PDF's sentence embeddings.

    The sentences of the PDF are embedded, projected to 2-D with a PCA
    computed via NumPy's SVD (no extra dependency; a UMAP projection can be
    substituted here if that package is available), and drawn with Bokeh.
    The sentences most similar to ``query`` are highlighted.

    Args:
        query: Free-text query used to highlight the most similar sentences.
            An empty/blank query disables highlighting.
        pdf_file: Filesystem path of the PDF to visualize.

    Returns:
        Path of the standalone HTML file containing the Bokeh plot.

    Raises:
        ValueError: If no embeddable text could be extracted from the PDF.
    """
    # Process the PDF and create embeddings
    text = process_pdf(pdf_file)
    embeddings, sentences = create_embeddings(text)

    vectors = np.asarray(embeddings, dtype=float)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError("No text could be extracted from the PDF.")

    # Cosine similarity of every sentence to the query (all zeros if no query).
    query = (query or "").strip()
    if query:
        query_vec = np.asarray(model.encode([query]), dtype=float)[0]
        denom = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec)
        # Guard against zero-norm vectors to avoid division by zero.
        scores = (vectors @ query_vec) / np.where(denom == 0, 1.0, denom)
        top_k = min(5, len(sentences))
        highlighted = set(np.argsort(scores)[::-1][:top_k].tolist())
    else:
        scores = np.zeros(vectors.shape[0])
        highlighted = set()

    # 2-D PCA projection: centre the data and project onto the first two
    # right-singular vectors.
    centered = vectors - vectors.mean(axis=0)
    _, _, components = np.linalg.svd(centered, full_matrices=False)
    coords = centered @ components[:2].T
    if coords.shape[1] < 2:
        # A single sentence yields only one component; pad so x and y exist.
        coords = np.pad(coords, ((0, 0), (0, 2 - coords.shape[1])))

    data = {
        "x": coords[:, 0].tolist(),
        "y": coords[:, 1].tolist(),
        # Collapse whitespace and cap the length so hover tooltips stay legible.
        "sentence": [" ".join(s.split())[:200] for s in sentences],
        "score": scores.tolist(),
        "point_color": [
            "crimson" if i in highlighted else "steelblue"
            for i in range(len(sentences))
        ],
    }

    p = figure(
        title="Sentence embeddings (top query matches in red)",
        tools="pan,wheel_zoom,box_zoom,reset,save",
        tooltips=[("sentence", "@sentence"), ("similarity", "@score{0.00}")],
    )
    p.scatter("x", "y", size=8, color="point_color", alpha=0.7, source=data)
    p.xaxis.axis_label = "Component 1"
    p.yaxis.axis_label = "Component 2"

    # Write to a unique temporary file instead of a fixed "plot.html" so that
    # concurrent requests cannot overwrite each other's output, and pass the
    # filename explicitly rather than relying on Bokeh's global output state.
    fd, plot_path = tempfile.mkstemp(prefix="plot_", suffix=".html")
    os.close(fd)
    save(p, filename=plot_path, resources=CDN, title="PDF Content Visualizer")

    # Alternatively, you can save as PNG
    # export_png(p, filename="plot.png")

    # Return the path to the saved file
    return plot_path

def gradio_interface(pdf_file, query):
    """Gradio callback: render the visualization for an uploaded PDF.

    Args:
        pdf_file: The uploaded file as handed over by Gradio. This may be a
            temp-file wrapper exposing ``.name`` or a plain path string,
            depending on the Gradio version and component configuration.
        query: The user's query text.

    Returns:
        The HTML content of the generated plot file, as a string.

    Raises:
        ValueError: If no file was uploaded.
    """
    if pdf_file is None:
        raise ValueError("Please upload a PDF file.")

    # Accept both the file-wrapper object (has .name) and a plain path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    plot_path = generate_plot(query, pdf_path)

    # Read with an explicit encoding so the result does not depend on the
    # platform's default locale encoding.
    # NOTE(review): the HTML output component may not execute <script> tags;
    # if the plot renders blank, consider wrapping the document in an
    # <iframe srcdoc=...> — confirm against the Gradio version in use.
    with open(plot_path, "r", encoding="utf-8") as f:
        return f.read()

    # If returning an image instead, return plot_path directly.

# Set up the Gradio app.
# Components are taken from the top-level namespace (gr.File, gr.Textbox,
# gr.HTML): the gr.inputs / gr.outputs namespaces are deprecated in Gradio 3
# and removed in Gradio 4, where referencing them raises AttributeError.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[gr.File(label="Upload PDF"), gr.Textbox(label="Query")],
    outputs=gr.HTML(label="Visualization"),  # Use gr.Image for image output
    title="PDF Content Visualizer",
    description="Upload a PDF and enter a query to visualize the content."
)

# Run the app: launch the Gradio interface when this module is executed.
iface.launch()