import gradio as gr
import requests
import zipfile
import uuid
import bs4
import lxml
import os
import random
import json
import datetime
from pypdf import PdfReader
from huggingface_hub import InferenceClient, HfApi
from agent import (
    PREFIX,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    LOG_PROMPT,
    LOG_RESPONSE,
)
client = InferenceClient(
    "mistralai/Mixtral-8x7B-Instruct-v0.1"
)

reponame = "acecalisto3/tmp"
save_data = f'https://huggingface.co/datasets/{reponame}/raw/main/'
token_self = os.environ['HF_TOKEN']
api = HfApi(token=token_self)
def find_all(purpose, task, history, url, result, steps):
    """Breadth-first crawl starting at `url`, following links up to `steps` levels deep.

    Returns (True, list_of_page_texts)."""
    return_list = []
    visited_links = set()
    links_to_visit = [(url, 0)]
    while links_to_visit:
        current_url, current_depth = links_to_visit.pop(0)
        if current_depth < steps:
            try:
                if current_url not in visited_links:
                    visited_links.add(current_url)
                    source = requests.get(current_url)
                    if source.status_code == 200:
                        soup = bs4.BeautifulSoup(source.content, 'lxml')
                        return_list.append(f'RAW TEXT RETURNED: {soup.text}')
                        for link in soup.find_all("a"):
                            href = link.get('href')
                            if href and href.startswith('http'):
                                links_to_visit.append((href, current_depth + 1))
            except Exception as e:
                print(f"Error fetching {current_url}: {e}")
    return True, return_list
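# Usage sketch (illustrative only; assumes network access and a reachable URL):
#
#   ok, pages = find_all(
#       purpose="research", task="", history=[],
#       url="https://example.com", result="", steps=1,
#   )
#   # `pages` holds one "RAW TEXT RETURNED: ..." string per crawled page;
#   # steps=1 fetches only the starting page, steps=2 also follows its links.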
def read_txt(txt_path):
    with open(txt_path, "r") as f:
        text = f.read()
    print(text)
    return text
def read_pdf(pdf_path):
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text = f'{text}\n{page.extract_text()}'
    print(text)
    return text
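# Both readers return the extracted plain text. A minimal sketch, assuming
# local files "notes.txt" and "paper.pdf" exist (hypothetical paths):
#
#   body = read_txt("notes.txt")
#   pages = read_pdf("paper.pdf")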
error_box = []

def read_pdf_online(url):
    uid = uuid.uuid4()
    print(f"reading {url}")
    response = requests.get(url, stream=True)
    print(response.status_code)
    text = ""
    try:
        if response.status_code == 200:
            # Write to a per-request temp file so concurrent calls don't clobber each other.
            with open(f"test-{uid}.pdf", "wb") as f:
                f.write(response.content)
            reader = PdfReader(f"test-{uid}.pdf")
            for page in reader.pages:
                text = f'{text}\n{page.extract_text()}'
            print(f"PDF_TEXT:: {text}")
            return text
        else:
            text = response.status_code
            error_box.append(url)
            print(text)
            return text
    except Exception as e:
        print(e)
        return str(e)
VERBOSE = True
MAX_HISTORY = 100
MAX_DATA = 20000

def format_prompt(message, history):
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "
    prompt += f"[INST] {message} [/INST]"
    return prompt
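# format_prompt builds the Mixtral-instruct chat template. A sketch of the
# output for one prior exchange (strings are illustrative):
#
#   format_prompt("What next?", [("Summarize this", "Done.")])
#   # -> "<s>[INST] Summarize this [/INST] Done.</s> [INST] What next? [/INST]"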
def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    **prompt_kwargs,
):
    print(seed)
    timestamp = datetime.datetime.now()
    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )
    content = PREFIX.format(
        timestamp=timestamp,
        purpose="Compile the provided data and complete the user's task"
    ) + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))
    stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
    resp = ""
    for response in stream:
        resp += response.token.text
    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp
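# run_gpt streams a completion for PREFIX + the filled template and returns the
# accumulated text. A hedged sketch (the keyword args must match the template's
# placeholders; values here are illustrative):
#
#   resp = run_gpt(
#       COMPRESS_DATA_PROMPT_SMALL,
#       stop_tokens=["observation:"],  # accepted but not currently forwarded to the client
#       max_tokens=2048,
#       seed=42,
#       direction="tl;dr",
#       knowledge="",
#       history="...raw text...",
#   )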
def compress_data(c, instruct, history):
    """Split `history` into ~MAX_DATA-character chunks and summarize each one independently."""
    seed = random.randint(1, 1000000000)
    c = max(int(c), 1)  # guard against zero-division on tiny inputs
    print(c)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)  # works out to roughly MAX_DATA characters per call
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out = []
    s = 0
    e = chunk
    print(f'e:: {e}')
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=hist,
        )
        out.append(resp)
        print(resp)
        e = e + chunk
        s = s + chunk
    return out
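# The chunk arithmetic aims for ~MAX_DATA characters per model call. For
# example, with c=50000 and MAX_DATA=20000: divr=2.5, divi=3 passes,
# chunk=20000, so the text is summarized in slices [0:20000], [20000:40000],
# and [40000:60000].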
def compress_data_og(c, instruct, history):
    """Like compress_data, but each chunk's summary is fed back in as `knowledge`
    for the next chunk, producing one rolling summary instead of a list."""
    seed = random.randint(1, 1000000000)
    c = max(int(c), 1)  # guard against zero-division on tiny inputs
    print(c)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out = ""
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    for z in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge=new_history,
            history=hist,
        )
        new_history = resp
        print(resp)
        out += resp
        e = e + chunk
        s = s + chunk
    print("final" + resp)
    return resp
def summarize(
    inp: str,
    history: list,
    report_check: bool,
    sum_mem_check: str,
    data: str = None,
    files: list = None,
    url: str = None,
    pdf_url: str = None,
    pdf_batch: str = None
) -> str:
    """
    Summarizes the provided input data, processes files, URLs, and PDFs, and yields the results.

    Parameters:
    - inp (str): The input instruction. If empty, defaults to "Process this data".
    - history (list): A list tracking the conversation history.
    - report_check (bool): Whether to generate a final rolled-up report.
    - sum_mem_check (str): "Summarize" to compress the data, "Memory" to index and save it.
    - data (str, optional): Raw text to process. Defaults to None.
    - files (list, optional): File paths to process (.pdf or .txt). Defaults to None.
    - url (str, optional): A web page URL to crawl. Defaults to None.
    - pdf_url (str, optional): A URL pointing to a single PDF file. Defaults to None.
    - pdf_batch (str, optional): Comma-separated PDF URLs. Defaults to None.

    Yields a tuple of:
    - An empty string (clears the prompt box).
    - The updated history list.
    - The error box (URLs that failed to download).
    - A JSON-compatible object for structured output.
    """
    json_box = []
    rawp = ""
    json_out = None
    if inp == "":
        inp = "Process this data"
    history.clear()
    history = [(inp, "Working on it...")]
    yield "", history, error_box, json_box

    # Process PDF batch URLs
    if pdf_batch and pdf_batch.startswith("http"):
        data = ""
        try:
            for batch_url in pdf_batch.split(","):
                batch_url = batch_url.strip()
                bb = read_pdf_online(batch_url)
                data = f'{data}\nFile Name URL ({batch_url}):\n{bb}'
        except Exception as e:
            print(e)

    # Process single PDF URL
    if pdf_url and pdf_url.startswith("http"):
        print("PDF_URL")
        out = read_pdf_online(pdf_url)
        data = out

    # Process regular URL
    if url and url.startswith("http"):
        # find_all expects six arguments; crawl one level by default.
        val, out = find_all(inp, "", history, url, "", 1)
        if not val:
            data = "Error"
            rawp = str(out)
        else:
            data = out
    # Process uploaded files
    if files:
        if not data:
            data = ""
        for i, file in enumerate(files):
            try:
                print(file)
                if file.endswith(".pdf"):
                    zz = read_pdf(file)
                    print(zz)
                    data = f'{data}\nFile Name ({file}):\n{zz}'
                elif file.endswith(".txt"):
                    zz = read_txt(file)
                    print(zz)
                    data = f'{data}\nFile Name ({file}):\n{zz}'
            except Exception as e:
                data = f'{data}\nError opening File Name ({file})'
                print(e)

    # Process the collected data
    if data and data != "Error":
        print(inp)
        out = str(data)
        rl = len(out)
        print(f'rl:: {rl}')
        c = sum(1 for i in str(out) if i in [" ", ",", "\n"])  # rough size estimate via delimiter count
        print(f'c:: {c}')
        if sum_mem_check == "Memory":
            json_out = save_memory(inp, out)
            rawp = "Complete"
        if sum_mem_check == "Summarize":
            json_out = compress_data(c, inp, out)
            out = str(json_out)
            if report_check:
                rl = len(out)
                print(f'rl:: {rl}')
                c = sum(1 for i in str(out) if i in [" ", ",", "\n"])
                print(f'c2:: {c}')
                rawp = compress_data_og(c, inp, out)
            else:
                rawp = out
    else:
        rawp = "Provide a valid data source"

    history.clear()
    history.append((inp, rawp))
    yield "", history, error_box, json_out
SAVE_MEMORY = """ | |
You are attempting to complete the task | |
task: {task} | |
Data: | |
{history} | |
Instructions: | |
Compile and categorize the data above into a JSON dictionary string | |
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format | |
Your final response should be only the final formatted JSON string enclosed in brackets, and nothing else. | |
Required keys: | |
"keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"] | |
"title":"title of entry" | |
"description":"A sentence summarizing the topic of this entry" | |
"content":"A brief paragraph summarizing the important datapoints found in this entry" | |
"url":"https://url.source" | |
""" | |
def save_memory(purpose, history):
    uid = uuid.uuid4()
    inp = str(history)
    rl = len(inp)
    print(f'rl:: {rl}')
    c = 1
    for i in inp:
        if i in (" ", ",", "\n", "/", "\\", ".", "<"):
            c += 1
    print(f'c:: {c}')
    seed = random.randint(1, 1000000000)
    print(c)
    divr = int(c) / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(int(c) / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out_box = []
    s = 0
    ee = chunk
    print(f'e:: {ee}')
    task = 'Index this Data\n'
    for z in range(divi):
        print(f's:e :: {s}:{ee}')
        hist = inp[s:ee]
        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=hist,
        ).strip('\n')
        try:
            # Trim any leading/trailing chatter so only the JSON array body remains.
            resp = '[{' + resp.split('[{')[1].split('</s>')[0]
        except Exception as e:
            print(e)
        timestamp = str(datetime.datetime.now())
        timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
        json_object = resp
        with open(f"tmp-{uid}.json", "w") as outfile:
            outfile.write(json_object)
        api.upload_file(
            path_or_fileobj=f"tmp-{uid}.json",
            path_in_repo=f"/mem-test2/{timename}---{s}-{ee}.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
        lines = resp.strip().strip("\n").split("\n")
        r = requests.get(f'{save_data}mem-test2/main.json')
        print(f'status code main:: {r.status_code}')
        if r.status_code == 200:
            lod = json.loads(r.text)
            print(f'lod:: {lod}')
        else:
            lod = []
        for i, line in enumerate(lines):
            key_box = []
            print(f'LINE:: {line}')
            if ":" in line:
                print(f'line:: {line}')
                if "keywords" in line:
                    print(f'trying:: {line}')
                    keyw = line.split(":")[1]
                    print(keyw)
                    keyw = keyw.split("[")[1].split("]")[0]
                    for ea in keyw.split(","):
                        s1 = ""
                        ea = ea.strip().strip("\n")
                        for ev in ea:
                            # Keep only alphanumerics and spaces.
                            if ev.isalnum() or ev == " ":
                                s1 += ev
                        print(s1)
                        key_box.append(s1)
                    lod.append({"file_name": f"{timename}---{s}-{ee}", "keywords": key_box, "index": f"{s}:{ee}"})
        json_object = json.dumps(lod, indent=4)
        with open(f"tmp2-{uid}.json", "w") as outfile2:
            outfile2.write(json_object)
        api.upload_file(
            path_or_fileobj=f"tmp2-{uid}.json",
            path_in_repo="/mem-test2/main.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
        ee = ee + chunk
        s = s + chunk
        out_box.append(resp)
    return out_box
def create_zip_file(output_data, zip_name):
    with zipfile.ZipFile(zip_name, 'w') as zipf:
        for i, data in enumerate(output_data):
            zipf.writestr(f'data_{i}.txt', data)
    return zip_name
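# Usage sketch (hypothetical data; writes output.zip to the working directory):
#
#   create_zip_file(["first chunk", "second chunk"], "output.zip")
#   # -> "output.zip" containing data_0.txt and data_1.txt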
with gr.Blocks() as app:
    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3></center>""")

    # Main chat interface
    chatbot = gr.Chatbot(
        label="Mixtral 8x7B Chatbot",
        show_copy_button=True,
        type='messages',
        height=400,
    )

    # Crawler inputs (wired to the Search button below)
    purpose_input = gr.Textbox(label="Purpose")
    task_input = gr.Textbox(label="Task")
    history_input = gr.Textbox(label="History")
    url_input = gr.Textbox(label="URL")
    result_input = gr.Textbox(label="Result")
    steps_input = gr.Number(label="Steps", value=3)  # Default of 3 crawl steps
    output_component = gr.Textbox(label="Output")
    search_btn = gr.Button("Search")
    # Control Panel
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(
                label="Instructions (optional)",
                placeholder="Enter processing instructions here..."
            )
            steps = gr.Slider(
                label="Crawl Steps",
                minimum=1,
                maximum=5,
                value=1,
                info="Number of levels to crawl for web content"
            )
        with gr.Column(scale=1):
            report_check = gr.Checkbox(
                label="Return Report",
                value=True,
                info="Generate detailed analysis report"
            )
            # Choices must match the strings summarize() checks for ("Summarize"/"Memory").
            sum_mem_check = gr.Radio(
                label="Output Type",
                choices=["Summarize", "Memory"],
                value="Summarize",
                info="Choose between summarized or memory-based output"
            )
            button = gr.Button("Process", variant="primary")
    # Clear button
    with gr.Row():
        clear_btn = gr.Button("Clear", variant="secondary")

    # Input Tabs
    with gr.Tabs() as input_tabs:
        with gr.Tab("π Text"):
            data = gr.Textbox(
                label="Input Data",
                lines=6,
                placeholder="Paste your text here..."
            )
        with gr.Tab("π File"):
            files = gr.File(
                label="Upload Files",
                file_types=[".pdf", ".txt"],
                file_count="multiple"
            )
        with gr.Tab("π Web URL"):
            url = gr.Textbox(
                label="Website URL",
                placeholder="https://example.com"
            )
        with gr.Tab("π PDF URL"):
            pdf_url = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
        with gr.Tab("π PDF Batch"):
            pdf_batch = gr.Textbox(
                label="PDF URLs (comma separated)",
                placeholder="url1.pdf, url2.pdf, url3.pdf"
            )

    # Output Section
    with gr.Row():
        with gr.Column():
            json_out = gr.JSON(
                label="Structured Output",
                show_label=True
            )
        with gr.Column():
            e_box = gr.Textbox(
                label="Status & Errors",
                interactive=False
            )
    def process_and_format_response(instructions, chat_history, report, summary_memory,
                                    input_data, uploaded_files, input_url, pdf_input_url):
        try:
            # Drain the summarize() generator and keep its final yield.
            result = None
            for step in summarize(
                instructions,
                chat_history if chat_history else [],
                report,
                summary_memory,
                input_data,
                uploaded_files,
                input_url,
                pdf_input_url
            ):
                result = step
            if result:
                _, history, errors, json_data = result
                # Convert history to ChatMessage format
                formatted_messages = []
                if isinstance(history, list):
                    for msg in history:
                        if isinstance(msg, tuple) and len(msg) == 2:
                            formatted_messages.extend([
                                gr.ChatMessage(content=str(msg[0]), role="user"),
                                gr.ChatMessage(content=str(msg[1]), role="assistant")
                            ])
                else:
                    formatted_messages.extend([
                        gr.ChatMessage(content=str(instructions), role="user"),
                        gr.ChatMessage(content=str(history), role="assistant")
                    ])
                # Format error messages
                error_message = "\n".join(errors) if errors else "Processing completed successfully"
                return (
                    "",  # Clear the prompt box
                    formatted_messages,
                    error_message,
                    json_data
                )
            return "", [], "No output produced", None
        except Exception as e:
            error_msg = f"Error: {str(e)}"
            return (
                "",
                [
                    gr.ChatMessage(content=str(instructions), role="user"),
                    gr.ChatMessage(content=error_msg, role="assistant")
                ],
                error_msg,
                None
            )
    def clear_fn():
        return "", []

    # Wire the crawler button; find_all returns (ok, list), so unwrap it for the single Textbox output.
    def run_find_all(purpose, task, history, url, result, steps):
        ok, results = find_all(purpose, task, history, url, result, int(steps))
        return "\n".join(results)

    search_btn.click(
        run_find_all,
        inputs=[
            purpose_input,
            task_input,
            history_input,
            url_input,
            result_input,
            steps_input,
        ],
        outputs=[output_component],
    )

    # Wire the main processing pipeline to the Process button.
    button.click(
        process_and_format_response,
        inputs=[prompt, chatbot, report_check, sum_mem_check, data, files, url, pdf_url],
        outputs=[prompt, chatbot, e_box, json_out],
    )

    clear_btn.click(clear_fn, outputs=[prompt, chatbot])
# Launch the app
app.queue(default_concurrency_limit=20).launch(
    show_api=False,
    share=True,
    server_name="0.0.0.0",
    server_port=7860
)