bainskarman committed (verified)
Commit 13f8dc4 · 1 Parent(s): f406221

Update app.py

Files changed (1): app.py (+56 -51)
app.py CHANGED
@@ -10,7 +10,7 @@ from transformers import pipeline
 from langdetect import detect
 
 # Load a smaller LLM with customizable parameters
-def load_llm(temperature, top_k, max_length):
+def load_llm(temperature, top_k, max_new_tokens):
     model_name = "HuggingFaceH4/zephyr-7b-alpha"  # Replace with your preferred model
     pipe = pipeline(
         "text-generation",
@@ -19,7 +19,7 @@ def load_llm(temperature, top_k, max_length):
         device_map="auto",
         temperature=temperature,
         top_k=top_k,
-        max_length=max_length,
+        max_new_tokens=max_new_tokens,  # Use max_new_tokens instead of max_length
     )
     llm = HuggingFacePipeline(pipeline=pipe)
     return llm
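The switch from `max_length` to `max_new_tokens` matters because `max_length` budgets prompt tokens plus generated tokens combined, so a long prompt can leave little or no room for the answer, while `max_new_tokens` budgets only the completion. A minimal sketch of the difference ("gpt2" is just a small stand-in model so the sketch runs quickly; the app itself loads zephyr-7b-alpha):

```python
from transformers import pipeline

# Small stand-in model; the app uses HuggingFaceH4/zephyr-7b-alpha.
pipe = pipeline("text-generation", model="gpt2")

prompt = "Retrieval-augmented generation is"

# max_length counts prompt tokens PLUS generated tokens, so a long
# prompt can exhaust the budget before any text is generated.
capped_total = pipe(prompt, max_length=30)

# max_new_tokens counts only the generated tokens, independent of
# how long the prompt is.
capped_new = pipe(prompt, max_new_tokens=30)

print(capped_total[0]["generated_text"])
print(capped_new[0]["generated_text"])
```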
@@ -39,7 +39,7 @@ def split_text(text, chunk_size=1000, chunk_overlap=200):
     return chunks
 
 # Create embeddings and vector store
-def create_vector_store(chunks, indexing_method="multi-representation"):
+def create_vector_store(chunks, indexing_method="multi-representation", **kwargs):
     embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
     if indexing_method == "multi-representation":
         vector_store = FAISS.from_texts(chunks, embeddings)
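For reference, the `multi-representation` branch boils down to embedding the chunks and indexing them in FAISS. A minimal self-contained sketch; the import paths assume the `langchain_community` package (older LangChain releases expose the same classes from `langchain.embeddings` and `langchain.vectorstores`):

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Two toy chunks standing in for the output of split_text().
chunks = [
    "The contract renews annually on March 1.",
    "Either party may cancel with 30 days notice.",
]

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = FAISS.from_texts(chunks, embeddings)

# Nearest-neighbor lookup over the embedded chunks.
docs = vector_store.similarity_search("When does the contract renew?", k=1)
print(docs[0].page_content)
```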
@@ -52,7 +52,7 @@ def create_vector_store(chunks, indexing_method="multi-representation"):
     return vector_store
 
 # Query the PDF
-def query_pdf(vector_store, query, llm, query_method="multi-query"):
+def query_pdf(vector_store, query, llm, query_method="multi-query", **kwargs):
     if query_method == "multi-query":
         # Implement Multi-Query logic here
         qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever())
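Note that the `multi-query` branch still carries its placeholder comment and falls through to a plain `RetrievalQA` chain. If real multi-query behavior is wanted, LangChain's `MultiQueryRetriever` is one option: it has the LLM generate several rephrasings of the question and unions the documents retrieved for each variant. A sketch under that assumption, reusing the app's `llm` and `vector_store` objects (the exact chain-invocation style varies across LangChain versions):

```python
from langchain.chains import RetrievalQA
from langchain.retrievers.multi_query import MultiQueryRetriever

# The LLM proposes alternative phrasings of the user's question;
# documents retrieved for every variant are merged and de-duplicated.
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=vector_store.as_retriever(),
    llm=llm,
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=multi_query_retriever,
    return_source_documents=True,  # lets the app surface "source_text"
)
result = qa.invoke({"query": "What does the PDF say about cancellation?"})
print(result["result"])
```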
@@ -89,53 +89,58 @@ def main():
         st.info("Using default PDF.")
         uploaded_file = "default.pdf"  # Add a default PDF
 
-    # Extract text
-    text = extract_text_from_pdf(uploaded_file)
-
-    # Detect language
-    language = detect_language(text)
-    st.write(f"Detected Language: {language}")
-
-    # Split text into chunks
-    chunk_size = st.slider("Chunk Size", 500, 2000, 1000)
-    chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200)
-    chunks = split_text(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-
-    # Indexing options
-    indexing_method = st.selectbox(
-        "Indexing Method",
-        ["multi-representation", "raptors", "colbert"],
-        help="Choose how to index the PDF text."
-    )
-    st.write(f"**Indexing Method:** {indexing_method}")
-
-    # Create vector store
-    vector_store = create_vector_store(chunks, indexing_method=indexing_method)
-
-    # LLM Parameters
-    st.sidebar.header("LLM Parameters")
-    temperature = st.sidebar.slider("Temperature", 0.1, 1.0, 0.7, help="Controls randomness in the output.")
-    top_k = st.sidebar.slider("Top-k", 1, 100, 50, help="Limits sampling to the top-k tokens.")
-    max_length = st.sidebar.slider("Max Length", 50, 500, 200, help="Maximum length of the generated response.")
-
-    # Load LLM with user-defined parameters
-    llm = load_llm(temperature=temperature, top_k=top_k, max_length=max_length)
-
-    # Query translation options
-    query_method = st.selectbox(
-        "Query Translation Method",
-        ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
-        help="Choose a method to improve query retrieval."
-    )
-    st.write(f"**Query Translation Method:** {query_method}")
-
-    # User input
-    query = st.text_input("Ask a question about the PDF:")
-    if query:
-        # Query the PDF
-        result = query_pdf(vector_store, query, llm, query_method=query_method)
-        st.write("**Answer:**", result["answer"])
-        st.write("**Source Text:**", result["source_text"])
+    # Step 1: Extract text and split into chunks
+    if "text" not in st.session_state:
+        st.session_state.text = None
+    if "chunks" not in st.session_state:
+        st.session_state.chunks = None
+
+    if st.button("Extract Text and Split into Chunks"):
+        st.session_state.text = extract_text_from_pdf(uploaded_file)
+        st.session_state.chunks = split_text(st.session_state.text)
+        st.success("Text extracted and split into chunks!")
+
+    # Step 2: Create vector store
+    if "vector_store" not in st.session_state:
+        st.session_state.vector_store = None
+
+    if st.session_state.chunks:
+        st.subheader("Indexing Options")
+        indexing_method = st.selectbox(
+            "Indexing Method",
+            ["multi-representation", "raptors", "colbert"],
+            help="Choose how to index the PDF text."
+        )
+        if st.button("Create Vector Store"):
+            st.session_state.vector_store = create_vector_store(st.session_state.chunks, indexing_method=indexing_method)
+            st.success("Vector store created!")
+
+    # Step 3: Load LLM with user-defined parameters
+    if "llm" not in st.session_state:
+        st.session_state.llm = None
+
+    if st.session_state.vector_store:
+        st.subheader("LLM Parameters")
+        temperature = st.slider("Temperature", 0.1, 1.0, 0.7, help="Controls randomness in the output.")
+        top_k = st.slider("Top-k", 1, 100, 50, help="Limits sampling to the top-k tokens.")
+        max_new_tokens = st.slider("Max New Tokens", 50, 500, 200, help="Maximum number of tokens to generate.")
+        if st.button("Load LLM"):
+            st.session_state.llm = load_llm(temperature=temperature, top_k=top_k, max_new_tokens=max_new_tokens)
+            st.success("LLM loaded!")
+
+    # Step 4: Query the PDF
+    if st.session_state.llm:
+        st.subheader("Query Translation Options")
+        query_method = st.selectbox(
+            "Query Translation Method",
+            ["multi-query", "rag-fusion", "decomposition", "step-back", "hyde"],
+            help="Choose a method to improve query retrieval."
+        )
+        query = st.text_input("Ask a question about the PDF:")
+        if query:
+            result = query_pdf(st.session_state.vector_store, query, st.session_state.llm, query_method=query_method)
+            st.write("**Answer:**", result["answer"])
+            st.write("**Source Text:**", result["source_text"])
 
 if __name__ == "__main__":
     main()
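The restructured `main()` gates each stage behind a button and caches intermediate results in `st.session_state`, so Streamlit's rerun-on-every-interaction model does not re-extract the PDF, rebuild the index, or reload the model each time a widget changes. The pattern in isolation, with a hypothetical `expensive_step()` standing in for any of those stages:

```python
import streamlit as st
import time

def expensive_step():
    # Hypothetical stand-in for PDF extraction, indexing, or model loading.
    time.sleep(2)
    return "result"

# Initialize the slot once; later reruns keep whatever was stored.
if "result" not in st.session_state:
    st.session_state.result = None

# The expensive work runs only when the button is clicked,
# not on every script rerun.
if st.button("Run step"):
    st.session_state.result = expensive_step()
    st.success("Step complete!")

# Later stages unlock only once the cached result exists.
if st.session_state.result:
    st.write("Ready for the next step:", st.session_state.result)
```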
 