Spaces:
Runtime error
Runtime error
| from AssistantService import GPTAssistant | |
| from openai.error import AuthenticationError | |
| import streamlit as st | |
| from langsmith.run_helpers import traceable | |
| import configparser | |
| import os | |
| config = configparser.ConfigParser() | |
| config.read('config.ini') | |
| if 'DEFAULT' in config: | |
| assistant_api_key = config['DEFAULT'].get('API-KEY', '') | |
| os.environ["LANGCHAIN_TRACING_V2"]="true" | |
| os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com" | |
| os.environ["LANGCHAIN_API_KEY"]=st.secrets["LANGCHAIN_API_KEY"] | |
| os.environ["LANGCHAIN_PROJECT"]=st.secrets["LANGCHAIN_PROJECT"] | |
| def start(): | |
| st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*") | |
| with st.expander(label="Check out the video demo"): | |
| yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc") | |
| info_text = """ | |
| **Quick start** \n | |
| Fill the input with <HTML code>. | |
| * Choose a repeating element on the page, like a product on a list. | |
| * Inspect the HTML code and copy the element. | |
| After generating the "output format" and the code, paste the complete HTML code of the page in the last input to test it | |
| """ | |
| st.write(info_text) | |
| st.image("https://j.gifs.com/gpqvPl.gif") | |
| start() | |
| if assistant_api_key == '': | |
| assistant_api_key = st.secrets["API_KEY"] | |
| if assistant_api_key: | |
| gpt_assistant = GPTAssistant(assistant_api_key) | |
| else: | |
| gpt_assistant = GPTAssistant(assistant_api_key) | |
| def invalid_input(html): | |
| # TODO: more checks | |
| if html.startswith("http"): | |
| return True | |
| html_content = None | |
| # check if html_content is an url, and show error if it is | |
| def html_content_input(): | |
| html_content = st.text_input("Paste the HTML tags of the item you want to extract:", max_chars=10000, help="example: <li>Product 1 </li>, watch the video above") | |
| if html_content: | |
| if html_content.startswith("http"): | |
| st.write("Please paste the HTML piece code, not the URL") | |
| invalid_input(html) | |
| return st.button("Generate output format & code") | |
| extract_button = html_content_input() | |
| if html_content and extract_button: | |
| try: | |
| st.write("1/2: Generating the output format...") | |
| output = gpt_assistant.chain_response_format(html_content) | |
| st.session_state['output_format'] = output | |
| except NameError: | |
| st.write("Complete the API key field") | |
| except AuthenticationError: | |
| st.write("Invalid API key") | |
| if 'output_format' in st.session_state: | |
| output_format = st.code(st.session_state['output_format'], language="json") | |
| try: | |
| st.write("2/2: Generating the code...") | |
| python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content) | |
| st.session_state['code_generated'] = python_code | |
| st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)" | |
| except NameError: | |
| st.write("Complete the API key field") | |
| except AuthenticationError: | |
| st.write("Invalid API key") | |
| def test_the_code(code, full_content): | |
| exec(code, globals()) | |
| if result: | |
| st.write("data extracted successfully") | |
| # show data in table | |
| st.table(result) | |
| else: | |
| st.write("error extracting data") | |
| return result or "error" | |
| if 'code_generated' in st.session_state: | |
| python_function_label = st.write("Here is your python function:") | |
| code_generated = st.code(st.session_state['code_generated'],language="python") | |
| full_content = st.text_input("Paste your complete HTML here:") | |
| test_code = st.button("Test the code") | |
| if full_content and test_code: | |
| html_data = full_content | |
| result = None | |
| test_the_code(st.session_state['code_generated_exec'], full_content=full_content) | |