import streamlit as st
import os
from langdetect import detect
from PyPDF2 import PdfReader
import requests
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the API key from Streamlit secrets
API_KEY = st.secrets["Key2"]
API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-alpha"

# Load the embedding model for semantic search
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to query the LLM via the Hugging Face Inference API
def query_llm_api(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_k": top_k,
        },
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        # Text-generation endpoints return a list of generations
        return response.json()[0]["generated_text"]
    else:
        st.error(f"Error querying the API: {response.status_code}, {response.text}")
        return None

# Function to detect the language of a text snippet
def detect_language(text):
    try:
        return detect(text)
    except Exception:
        return "en"  # Default to English if detection fails

# Function to extract text from a PDF with page and line numbers
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text_data = []
    for page_num, page in enumerate(pdf_reader.pages):
        page_text = page.extract_text()  # extract each page's text once
        if page_text:
            for line_num, line in enumerate(page_text.split('\n')):
                if not line.strip():
                    continue  # skip blank lines so they don't yield zero-norm embeddings
                text_data.append({
                    "page": page_num + 1,
                    "line": line_num + 1,
                    "content": line
                })
    return text_data

# Function to create embeddings for the extracted PDF lines
def get_embeddings(text_data):
    texts = [entry['content'] for entry in text_data]
    return embedding_model.encode(texts, convert_to_tensor=False)

# Function to perform KNN (FAISS) or cosine-similarity search over the PDF lines
def search_pdf_content(pdf_text_data, query, search_type="knn", k=5):
    query_embedding = embedding_model.encode([query])[0]
    pdf_embeddings = get_embeddings(pdf_text_data)
    if search_type == "knn":
        # Exact nearest-neighbour search on L2 distance with FAISS
        index = faiss.IndexFlatL2(pdf_embeddings.shape[1])
        index.add(pdf_embeddings.astype('float32'))
        distances, indices = index.search(np.array([query_embedding], dtype='float32'), k)
        return [pdf_text_data[i] for i in indices[0]]
    elif search_type == "cosine":
        # Normalize embeddings and rank by dot product (cosine similarity)
        pdf_embeddings_norm = pdf_embeddings / np.linalg.norm(pdf_embeddings, axis=1, keepdims=True)
        query_embedding_norm = query_embedding / np.linalg.norm(query_embedding)
        similarities = np.dot(pdf_embeddings_norm, query_embedding_norm)
        top_indices = np.argsort(similarities)[-k:][::-1]
        return [pdf_text_data[i] for i in top_indices]

# Streamlit UI
st.title("PDF Search with LLM and Semantic Search")

pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
search_query = st.text_input("Enter your search query")
search_method = st.radio("Select Search Method", ("knn", "cosine"))
k_value = st.slider("Number of Results (K)", min_value=1, max_value=20, value=5)

if pdf_file and search_query:
    pdf_text_data = extract_text_from_pdf(pdf_file)
    results = search_pdf_content(pdf_text_data, search_query, search_type=search_method, k=k_value)
    st.write("### Search Results")
    for res in results:
        st.write(f"**Page {res['page']}, Line {res['line']}:** {res['content']}")