tdurzynski committed on
Commit
0bfd27d
·
verified ·
1 Parent(s): 9f8acf1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +204 -0
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import gradio as gr
4
+ from dotenv import load_dotenv
5
+ from langchain.document_loaders import ArxivLoader, PyPDFLoader
6
+ from langchain.text_splitter import TokenTextSplitter
7
+ from langchain.vectorstores import Chroma
8
+ from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
9
+ from langchain.chains import RetrievalQA
10
+ from langchain.chains.summarize import load_summarize_chain
11
+ from langchain_groq import ChatGroq
12
+ from transformers import pipeline
13
+ from PyPDF2 import PdfReader
14
+ from huggingface_hub import login
15
+ from groq import AsyncGroq, Groq
16
+
17
# Read credentials from a local .env file (when present) and the environment.
load_dotenv()
HUGGING_API_KEY = os.environ.get("HUGGING_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Fail fast: nothing below works unless both keys are present and non-empty.
if not (HUGGING_API_KEY and GROQ_API_KEY):
    raise ValueError(
        "API keys for HuggingFace or Groq are missing. Set them in your environment variables."
    )

# Module-wide logger at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Sign in to the Hugging Face Hub with the configured token.
login(HUGGING_API_KEY)

# Shared embedding model and chat LLM.
embedding_model = HuggingFaceHubEmbeddings(
    huggingfacehub_api_token=HUGGING_API_KEY,
)
llm = ChatGroq(
    temperature=0,
    model_name="llama3-70b-8192",
    api_key=GROQ_API_KEY,
)
36
+
37
def display_results(result):
    """Render a sequence of text fragments as one newline-separated string."""
    line_break = "\n"
    return line_break.join(result)
40
+
41
def summarize_text(text):
    """Summarize a piece of text with the Groq chat-completions API.

    Args:
        text: The passage to summarize.

    Returns:
        The summary produced by the model. An empty string is returned when
        ``text`` is empty or whitespace-only, or when the completion carries
        no content. The literal "Error in summarization." is returned when
        the request fails.
    """
    # Nothing to summarize: skip the network round-trip entirely.
    if not text or not text.strip():
        return ""

    try:
        sum_client = Groq(api_key=GROQ_API_KEY)
        messages = [
            {"role": "system", "content": "You are a summarizer. If I give you the whole text, you should summarize it."},
            {"role": "user", "content": f"Summarize the paper: {text}"},
        ]

        response = sum_client.chat.completions.create(
            messages=messages,
            model="llama3-70b-8192",
            temperature=0,
            max_tokens=8192,
            top_p=1,
        )
        # Callers concatenate this value, so always hand back a string even
        # if the completion has no content.
        return response.choices[0].message.content or ""

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Error summarizing text: %s", e)
        return "Error in summarization."
62
+
63
def summarize_pdf(pdf_file_path, max_length):
    """Extract the text of a PDF and summarize it chunk by chunk.

    Args:
        pdf_file_path: The uploaded PDF, passed straight to ``PdfReader``.
            Falsy when nothing was uploaded.
        max_length: Value of the "Max Length" slider. It is not read by this
            function; it is kept so the click handler's signature is stable.

    Returns:
        The per-chunk summaries separated by blank lines,
        "Please upload a PDF file." when no file was supplied, or
        "Failed to process the PDF." when extraction or summarization raises.
    """
    # A missing upload is a user omission, not an error worth a traceback.
    if not pdf_file_path:
        return "Please upload a PDF file."

    try:
        reader = PdfReader(pdf_file_path)
        # extract_text() can come back empty for a page; treat that as "".
        text = "\n".join(page.extract_text() or "" for page in reader.pages)

        text_splitter = TokenTextSplitter(chunk_size=8192, chunk_overlap=1000)
        chunks = text_splitter.split_text(text)

        # Keep chunk summaries visually separate instead of running them
        # together, and drop chunks that produced no summary.
        chunk_summaries = (summarize_text(chunk) for chunk in chunks)
        return "\n\n".join(part for part in chunk_summaries if part)

    except Exception as e:
        logger.exception("Error summarizing PDF: %s", e)
        return "Failed to process the PDF."
81
+
82
def summarize_arxiv_pdf(query):
    """Summarize the arXiv paper(s) matching a query.

    For each matching paper the result lists its title, authors, link, the
    abstract returned by arXiv, and an "enhanced" summary built by
    summarizing the paper's full text chunk by chunk.

    Args:
        query: arXiv identifier or search text entered by the user.

    Returns:
        A Markdown string with one section per paper,
        "Please enter an arXiv number." for an empty query, or
        "Failed to process arXiv paper." when loading or summarizing raises.
    """
    if not query or not str(query).strip():
        return "Please enter an arXiv number."

    try:
        loader = ArxivLoader(query=query, load_max_docs=10)
        documents = loader.load()
        text_splitter = TokenTextSplitter(chunk_size=5700, chunk_overlap=100)
        chunks = text_splitter.split_documents(documents)

        # Summarize every chunk once, remembering which paper it came from so
        # a multi-paper result does not attach every paper's text to each entry.
        all_parts = []
        parts_by_title = {}
        for chunk in chunks:
            part = summarize_text(chunk.page_content)
            if not part:
                continue
            all_parts.append(part)
            parts_by_title.setdefault(chunk.metadata.get("Title"), []).append(part)

        arxiv_summary = loader.get_summaries_as_docs()

        summaries = []
        for doc in arxiv_summary:
            title = doc.metadata.get("Title", "Unknown Title")
            authors = doc.metadata.get("Authors", "Unknown Authors")
            url = doc.metadata.get("Entry ID", "No URL")

            # NOTE(review): assumes full-text chunks carry the same "Title"
            # metadata key as the summary docs -- confirm. When no chunk
            # matches, fall back to all chunk summaries.
            ref_summary = "\n\n".join(parts_by_title.get(title) or all_parts)

            summaries.append(f"**{title}**\n")
            summaries.append(f"**Authors:** {authors}\n")
            summaries.append(f"**View full paper:** [Link to paper]({url})\n")
            summaries.append(f"**Summary:** {doc.page_content}\n")
            summaries.append(f"**Enhanced Summary:**\n {ref_summary}")

        return display_results(summaries)

    except Exception as e:
        logger.exception("Error summarizing arXiv paper: %s", e)
        return "Failed to process arXiv paper."
113
+
114
# Asynchronous Groq client created once at import time for the chat handlers.
client = AsyncGroq(
    api_key=GROQ_API_KEY,
)
115
+
116
async def chat_with_replit(message, history):
    """Stream a chat reply from the Groq API.

    Args:
        message: The user's newest message.
        history: Earlier turns as ``(user, assistant)`` pairs; may be empty
            or ``None``.

    Yields:
        The reply accumulated so far, re-yielded each time new text arrives.
        If the request fails, "Error in chat response." is yielded instead.
    """
    try:
        messages = [{"role": "system", "content": "You are an assistant answering user questions."}]

        for user, assistant in history or []:
            # Skip empty halves of a turn rather than forwarding empty or
            # None content to the API.
            if user:
                messages.append({"role": "user", "content": user})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})

        messages.append({"role": "user", "content": message})

        stream = await client.chat.completions.create(
            messages=messages,
            model="llama3-70b-8192",
            temperature=0,
            max_tokens=1024,
            top_p=1,
            stream=True,
        )

        response_content = ""
        async for chunk in stream:
            # Ignore stream events that carry no choice.
            if not chunk.choices:
                continue
            piece = chunk.choices[0].delta.content
            if piece:
                response_content += piece
                yield response_content

    except Exception as e:
        logger.exception("Chat error: %s", e)
        yield "Error in chat response."
146
+
147
async def chat_with_replit_pdf(message, history, doi_num):
    """Answer a question about an arXiv paper using retrieved passages.

    The paper is loaded and split, indexed in a Chroma vector store, and the
    three passages most similar to ``message`` are given to the model as
    context together with the first document's metadata.

    Args:
        message: The user's question.
        history: Previous chat turns. Accepted for interface compatibility;
            this function does not read it.
        doi_num: arXiv identifier (or query) handed to ``ArxivLoader``.

    Returns:
        The model's answer, "Please provide an arXiv number." when
        ``doi_num`` is empty, "No arXiv paper found for this query." when
        nothing was loaded, or "Error processing chat with PDF." when
        loading, retrieval, or the API call raises.
    """
    if doi_num is None or not str(doi_num).strip():
        return "Please provide an arXiv number."

    try:
        loader = ArxivLoader(query=str(doi_num), load_max_docs=10)
        documents = loader.load_and_split()
        # Checked explicitly so an empty result gets its own message instead
        # of an IndexError reported as a generic failure.
        if not documents:
            return "No arXiv paper found for this query."
        metadata = documents[0].metadata

        # NOTE(review): the vector store is rebuilt on every call; consider
        # caching it per paper if latency becomes a problem.
        vector_store = Chroma.from_documents(documents, embedding_model)
        results = vector_store.similarity_search(message, k=3)
        relevant_content = "\n\n".join(doc.page_content for doc in results)

        # System instructions go first so the user's question is the final
        # turn the model responds to.
        messages = [
            {"role": "system", "content": f"Answer based on this arXiv paper {doi_num}.\n"
                                          f"Metadata: {metadata}.\n"
                                          f"Relevant Content: {relevant_content}"},
            {"role": "user", "content": message},
        ]

        response = await client.chat.completions.create(
            messages=messages,
            model="llama3-70b-8192",
            temperature=0,
            max_tokens=1024,
            top_p=1,
            stream=False,
        )
        return response.choices[0].message.content

    except Exception as e:
        logger.exception("Error in chat with PDF: %s", e)
        return "Error processing chat with PDF."
182
+
183
# Gradio UI: two tabs, one per summarization workflow.
with gr.Blocks() as app:
    # Tab 1: fetch a paper from arXiv and summarize it.
    with gr.Tab("Arxiv Summarization"):
        with gr.Column():
            arxiv_number = gr.Textbox(label="Enter arXiv number")
            summarize_btn = gr.Button("Summarize arXiv Paper")
        with gr.Column():
            output_summary = gr.Markdown(label="Summary", height=1000)

        summarize_btn.click(
            fn=summarize_arxiv_pdf,
            inputs=arxiv_number,
            outputs=output_summary,
        )

    # Tab 2: summarize a PDF uploaded by the user.
    with gr.Tab("Local PDF Summarization"):
        with gr.Row():
            input_pdf = gr.File(label="Upload PDF file")
            max_length_slider = gr.Slider(
                minimum=512,
                maximum=4096,
                value=2048,
                step=512,
                label="Max Length",
            )
            summarize_pdf_btn = gr.Button("Summarize PDF")
        with gr.Row():
            output_pdf_summary = gr.Markdown(label="Summary", height=1000)

        summarize_pdf_btn.click(
            fn=summarize_pdf,
            inputs=[input_pdf, max_length_slider],
            outputs=output_pdf_summary,
        )

# Start the web server for the interface.
app.launch()