import os

import streamlit as st
import PyPDF2
import pptx
import cassio
from huggingface_hub import login
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
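# Note: these import paths assume a pre-split langchain release; newer releases
# moved OpenAI, OpenAIEmbeddings, and Cassandra into langchain_openai / langchain_community.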

# Secure API keys (ensure they are set)
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

if not ASTRA_DB_APPLICATION_TOKEN or not ASTRA_DB_ID:
    st.error("Astra DB credentials are missing. Set the ASTRA_DB_APPLICATION_TOKEN and ASTRA_DB_ID environment variables.")
    st.stop()
if not HUGGINGFACE_API_KEY:
    st.error("Hugging Face API key is missing. Set the HUGGINGFACE_API_KEY environment variable.")
    st.stop()
if not OPENAI_API_KEY:
    st.error("OpenAI API key is missing. Set the OPENAI_API_KEY environment variable.")
    st.stop()

# Initialize Astra DB connection
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

# Initialize LLM and embeddings (the keys are validated above)
login(token=HUGGINGFACE_API_KEY)
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize vector store backed by the Astra DB table "qa_mini_demo"
astra_vector_store = Cassandra(embedding=embedding, table_name="qa_mini_demo")
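# Assumption: with cassio.init() above, the Cassandra store resolves its session and
# keyspace from cassio's global defaults, so neither is passed explicitly here (this
# relies on a langchain version in which those constructor arguments are optional).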


def extract_text_from_pdf(uploaded_file):
    """Extract text from a PDF file."""
    text = ""
    try:
        pdf_reader = PyPDF2.PdfReader(uploaded_file)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:  # Avoid NoneType error on pages with no extractable text
                text += page_text + "\n"
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
    return text


def extract_text_from_ppt(uploaded_file):
    """Extract text from a PowerPoint file."""
    text = ""
    try:
        presentation = pptx.Presentation(uploaded_file)
        for slide in presentation.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):  # skip shapes without a text frame (e.g., pictures)
                    text += shape.text + "\n"
    except Exception as e:
        st.error(f"Error reading PPT: {e}")
    return text


def main():
    st.title("Chat with Documents")
    uploaded_file = st.file_uploader("Upload a PDF or PPT file", type=["pdf", "pptx"])
    extract_button = st.button("Extract Text")
    extracted_text = ""

    if extract_button and uploaded_file is not None:
        if uploaded_file.name.endswith(".pdf"):
            extracted_text = extract_text_from_pdf(uploaded_file)
        elif uploaded_file.name.endswith(".pptx"):
            extracted_text = extract_text_from_ppt(uploaded_file)

        if extracted_text:
            # Split the document into overlapping chunks before embedding
            text_splitter = CharacterTextSplitter(
                separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
            )
            texts = text_splitter.split_text(extracted_text)
            # Embed each chunk and store the vectors in Astra DB
            astra_vector_store.add_texts(texts)
            st.success("Text extracted and stored successfully!")

    # Wrap the vector store in an index so it can be queried with the LLM
    astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

    query = st.text_input("Enter your query")
    submit_query = st.button("Submit Query")
    if submit_query and query:
        response = astra_vector_index.query(query, llm=llm)
        st.write(f"Response: {response}")


if __name__ == "__main__":
    main()
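# Usage sketch (assumption: the script is saved as app.py; the filename is not given
# in the source). Export the four environment variables above, then launch with:
#   streamlit run app.py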