# NOTE(review): the six lines that were here ("Spaces:", "Runtime error",
# "File size: 4,768 Bytes", a row of commit hashes, and a gutter of line
# numbers 1..127) were Hugging Face Space web-page scrape residue, not source
# code. They are replaced by this comment so the file parses as Python.
import gradio as gr
import os
import requests
from io import BytesIO
from PyPDF2 import PdfReader
from tempfile import NamedTemporaryFile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from huggingface_hub import InferenceClient
from gradio.exceptions import Error
from transformers import AutoModel
import streamlit as st
# --- Configuration ---
# HF_API_TOKEN: Hugging Face API token, read from the environment; None if unset.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")  # read the token from the environment variable
# Model served through the HF Inference API (used by query_huggingface_inference_endpoints).
MODEL_NAME = "dannyk97/mistral-screenplay-model"
# Cache directory for downloaded embedding models; defaults to /app/cache when unset.
HF_CACHE_DIR = os.environ.get("HF_CACHE_DIR", "/app/cache")  # falls back to /app/cache if the variable is missing
# --- Hilfsfunktionen ---
def query_huggingface_inference_endpoints(prompt):
    """Send *prompt* to the Hugging Face Inference API and return the generated text.

    Any failure is converted into a (German) error-message string instead of
    being raised, so callers always receive displayable text.
    """
    try:
        client = InferenceClient(token=HF_API_TOKEN)
        return client.text_generation(prompt, model=MODEL_NAME)
    except Exception as exc:
        # Errors are surfaced as text rather than raised; the runtime string
        # is kept byte-identical to the original.
        return f"Fehler bei der Anfrage an Hugging Face API: {exc}"
# Function to download PDF from Google Drive
def download_pdf_from_drive(drive_link):
    """Download a PDF shared via a Google Drive link into an in-memory stream.

    Parameters
    ----------
    drive_link : str
        A Google Drive URL of the form ``.../file/d/<FILE_ID>/...``.

    Returns
    -------
    io.BytesIO
        The raw PDF bytes, ready to be handed to a PDF reader.

    Raises
    ------
    Exception
        If the link has no recognizable file id or the download fails.
        (Kept as plain ``Exception`` because callers catch ``Exception``.)
    """
    # Validate the link shape first: the original chained split raised an
    # opaque IndexError on links without a '/d/<id>' segment.
    if '/d/' not in drive_link:
        raise Exception("Failed to download the PDF file from Google Drive: link has no '/d/<file_id>' segment.")
    file_id = drive_link.split('/d/')[1].split('/')[0]
    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
    # A timeout prevents the Streamlit app from hanging forever on a stalled request.
    response = requests.get(download_url, timeout=30)
    if response.status_code == 200:
        return BytesIO(response.content)
    # Include the HTTP status so failures are diagnosable from the UI message.
    raise Exception(f"Failed to download the PDF file from Google Drive (HTTP {response.status_code}).")
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_stream):
    """Return the concatenated text of every page of the PDF in *pdf_stream*.

    Parameters
    ----------
    pdf_stream : file-like
        A binary stream containing a PDF (e.g. the BytesIO returned by
        ``download_pdf_from_drive``).

    Returns
    -------
    str
        All extractable page text joined together ("" for an empty PDF).
    """
    pdf_reader = PdfReader(pdf_stream)
    # extract_text() can yield None for pages without extractable text
    # (e.g. scanned images); the original `text += ...` raised TypeError
    # in that case. `or ""` treats such pages as empty, and join avoids
    # the quadratic string += loop.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to split text into chunks
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping chunks for embedding.

    Parameters
    ----------
    text : str
        The full document text.
    chunk_size : int
        Maximum chunk length in characters (default 500).
    chunk_overlap : int
        Number of characters shared between consecutive chunks (default 50).

    Returns
    -------
    list[str]
        The chunks produced by LangChain's RecursiveCharacterTextSplitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
# Function to create embeddings and store in FAISS
def create_embeddings_and_store(chunks, cache_folder=HF_CACHE_DIR):
    """Embed *chunks* with a sentence-transformers model and index them in FAISS.

    Parameters:
        chunks: list of text chunks to embed.
        cache_folder: directory where the embedding model is cached.

    Returns:
        A FAISS vector store. On failure, a degraded fallback store built
        from the single string "fallback text" is returned instead.
    """
    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", cache_folder=cache_folder)
        vector_db = FAISS.from_texts(chunks, embedding=embeddings)
        return vector_db
    except Exception as e:
        # Best-effort fallback: report the error (German runtime strings kept
        # as-is) and return a minimal store so the app keeps running.
        print(f"❌ Fehler beim Erstellen der Embeddings: {e}")
        print("Verwende Dummy Embeddings, um fortzufahren (Funktionen sind eingeschränkt).")
        # Use a simpler fallback solution.
        # NOTE(review): this fallback discards the actual chunks and indexes
        # only "fallback text", so later similarity searches cannot return
        # document content. It also loads a *different* model with the same
        # cache folder, which will presumably fail for the same reason as the
        # primary attempt — confirm whether this path ever succeeds.
        vector_db = FAISS.from_texts(["fallback text"], HuggingFaceEmbeddings(model_name="all-mpnet-base-v2", cache_folder=cache_folder))
        return vector_db
# Function to query the vector database and interact with Hugging Face Inference API
def query_vector_db(query, vector_db):
    """Answer *query* via RAG: retrieve context from *vector_db*, then ask the LLM.

    Parameters
    ----------
    query : str
        The user's question.
    vector_db :
        A vector store exposing ``similarity_search(query, k)`` whose results
        have a ``page_content`` attribute (e.g. the FAISS store built above).

    Returns
    -------
    str
        The model's answer, or an error-message string produced by
        ``query_huggingface_inference_endpoints`` on failure.
    """
    # Retrieve the three most relevant chunks as grounding context.
    docs = vector_db.similarity_search(query, k=3)
    context = "\n".join(doc.page_content for doc in docs)
    # Build the prompt (German instruction string kept byte-identical).
    prompt = f"Nutze diesen Kontext um die Frage zu beantworten: {context}\nFrage: {query}"
    # The original try/except around this call was unreachable dead code:
    # query_huggingface_inference_endpoints already catches every exception
    # and returns the error as a string, so nothing here can raise from it.
    return query_huggingface_inference_endpoints(prompt)
# Streamlit app
# --- Streamlit app: ingest the predefined Google Drive PDFs ---
st.title("RAG-Based Application with Google Drive Support")
# Predefined list of Google Drive links - defined here!
drive_links = [
    "https://drive.google.com/file/d/1PW8PJQC1EqYpsk8AhqrE4OS5cy57sqJ4/view?usp=drive_link"
    # Add more links here as needed
]
st.write("Processing the predefined Google Drive links...")
# Accumulates the text chunks from every successfully processed PDF.
all_chunks = []
# Process each predefined Google Drive link: download -> extract -> chunk.
# Failures are reported per-link and do not abort the remaining links.
for link in drive_links:
    try:
        st.write(f"Processing link: {link}")
        # Download PDF
        pdf_stream = download_pdf_from_drive(link)
        st.write("PDF Downloaded Successfully!")
        # Extract text
        text = extract_text_from_pdf(pdf_stream)
        st.write("PDF Text Extracted Successfully!")
        # Chunk text
        chunks = chunk_text(text)
        st.write(f"Created {len(chunks)} text chunks.")
        all_chunks.extend(chunks)
    except Exception as e:
        # Best-effort: surface the error in the UI and continue with the next link.
        st.write(f"Error processing link {link}: {e}")
# --- Streamlit app: build the index and answer user queries ---
# Only build the index when at least one PDF yielded chunks; otherwise the
# query UI is not shown at all.
if all_chunks:
    # Generate embeddings and store them in FAISS.
    vector_db = create_embeddings_and_store(all_chunks, cache_folder=HF_CACHE_DIR)
    st.write("Embeddings Generated and Stored Successfully!")
    # User query input; Streamlit reruns the script on each input change.
    user_query = st.text_input("Enter your query:")
    if user_query:
        response = query_vector_db(user_query, vector_db)
        st.write("Response from LLM:")
        # The stray trailing " |" scrape artifact on the original last line was
        # removed — it was not Python and would have been a SyntaxError.
        st.write(response)