import streamlit as st
import os
from langdetect import detect
from PyPDF2 import PdfReader
import requests
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the API key from Streamlit secrets
API_KEY = st.secrets["Key2"]
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"

# Load the embedding model for semantic search
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to query the LLM via Hugging Face Inference API
def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        # The Inference API returns a list of generations; take the first one
        return response.json()[0]["generated_text"]
    else:
        st.error(f"Error querying the API: {response.status_code}, {response.text}")
        return None

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except Exception:
        return "en"  # Default to English if detection fails

# Function to extract text from PDF with line and page numbers
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages):
        # Extract once per page to avoid parsing the page twice
        page_text = page.extract_text()
        if page_text:
            lines = page_text.split('\n')
            for line_num, line in enumerate(lines):
                text_data.append({
                    "page": page_num + 1,
                    "line": line_num + 1,
                    "content": line
                })
    return text_data

# Function to create embeddings for the PDF text
def get_embeddings(text_data):
    texts = [entry['content'] for entry in text_data]
    return embedding_model.encode(texts, convert_to_tensor=False)

# Function to perform KNN or cosine similarity search
def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
    query_embedding = embedding_model.encode([query])[0]
    pdf_embeddings = np.asarray(get_embeddings(pdf_text_data))
    # Don't request more results than there are lines in the PDF
    k = min(k, len(pdf_text_data))
    if search_type == "knn":
        index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
        index.add(pdf_embeddings.astype('float32'))
        distances, indices = index.search(np.array([query_embedding], dtype='float32'), k)
        return [pdf_text_data[i] for i in indices[0]]
    elif search_type == "cosine":
        pdf_embeddings_norm = pdf_embeddings / np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
        query_embedding_norm = query_embedding / np.linalg.norm(query_embedding)
        similarities = np.dot(pdf_embeddings_norm, query_embedding_norm)
        top_indices = np.argsort(similarities)[-k:][::-1]
        return [pdf_text_data[i] for i in top_indices]

# Streamlit UI
st.title("PDF Search with LLM and Semantic Search")
pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
search_query = st.text_input("Enter your search query")
search_method = st.radio("Select Search Method", ("knn", "cosine"))
k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)

if pdf_file and search_query:
    pdf_text_data = extract_text_from_pdf(pdf_file)
    results = search_pdf_content(pdf_text_data, search_query, search_type=search_method, k=k_value)
    st.write("### Search Results")
    for res in results:
        st.write(f"**Page {res['page']}, Line {res['line']}:** {res['content']}")