# Spaces: Runtime error / Runtime error
# (hosting-page status text captured together with the source; not part of the program)
import logging
import os

import requests
import streamlit as st
from streamlit_lottie import st_lottie
def main() -> None:
    """Render the Article Scraper Streamlit page and handle URL submissions.

    The page shows an introduction, a "How it works?" section next to a
    Lottie animation, and a form. On submit, the entered article URL is
    POSTed as ``{"url": ...}`` to the endpoint named by the
    ``scraper-api-endpoint`` environment variable, and the fields of the
    ``scraped_content`` object in the JSON reply are written to the page.
    Any failure along the way is logged and shown to the user as an error
    instead of raising out of the page script.
    """
    logger = logging.getLogger(__name__)

    # Seconds to wait on the network before giving up, so an unresponsive
    # host cannot block the page indefinitely. The values are a judgement
    # call: the animation is a small static asset, a scrape may be slow.
    asset_timeout = 10
    scrape_timeout = 60

    # (label, key in ``scraped_content``) pairs, in display order.
    detail_fields = (
        ("Author", "author"),
        ("Published Date", "publish_date"),
        ("Description", "description"),
        ("Content", "article_content"),
        ("Article URL", "article_url"),
        ("Canonical URL", "canonical_url"),
        ("Publisher Name", "publisher_name"),
        ("Article Image", "image"),
        ("Article Keywords", "keywords"),
        ("Video URL", "video_url"),
        ("Audio URL", "audio_url"),
    )

    # ----- Loading Assets ----
    def load_lottieurl(lottie_url: str):
        """Return the decoded JSON found at *lottie_url*, or None on failure.

        None is returned for a non-200 response, a network error, or a body
        that is not valid JSON.
        """
        try:
            r = requests.get(url=lottie_url, timeout=asset_timeout)
            return r.json() if r.status_code == 200 else None
        except (requests.RequestException, ValueError):
            # The animation is decorative; losing it must not take the page down.
            logger.exception("Could not load Lottie animation from %s", lottie_url)
            return None

    def fetch(url):
        """POST *url* to the scraper API and return the JSON object it sends back.

        Returns an empty dict when the endpoint is not configured, the
        request fails, or the reply is not a JSON object.
        """
        endpoint = os.environ.get('scraper-api-endpoint')
        if not endpoint:
            logger.error("Environment variable 'scraper-api-endpoint' is not set")
            return {}
        try:
            result = requests.post(url=endpoint, json={'url': url}, timeout=scrape_timeout)
            payload = result.json()
        except Exception:
            # Best-effort by design: the caller turns {} into an on-page error.
            logger.exception("Scraper API request failed for %s", url)
            return {}
        # Callers use .get() on the result, so anything but a dict is a failure.
        return payload if isinstance(payload, dict) else {}

    def show_article(content) -> None:
        """Write the extracted article fields held in the dict *content*."""
        st.write("---")
        st.title('Extracted Article Information')
        st.write('**Article Title:**')
        st.header(f"{content.get('article_title')}")
        for label, key in detail_fields:
            st.write(f"**{label}:** {content.get(key)}")

    st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")
    lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")

    # ----- Introduction --------
    with st.container():
        st.subheader("Article Scraper")
        st.title("A Digital News / Article Information Extraction Application")
        st.write("A portfolio project developed to showcase my ability in developing Information Extraction Services")
        st.write("This service can be utilised in the data collection / curation process of data science workflow")
        st.write("[My Website >](https://www.rahulnenavath.co.in/)")

    with st.container():
        st.write("---")
        left_col, right_col = st.columns(2)

        with left_col:
            st.header("How it works?")
            st.write("##")
            st.write('**Input**: Article URL')
            st.write('**Output**: Extracted Article Information')
            st.write(
                "\n"
                "**Working**:\n"
                "- Download the HTML content from the given Article URL\n"
                "- Makes use of BeautifulSoup and extracts content from different HTML tags and ClassNames\n"
                "- Arrange Information appropriately\n"
                "- Regex based text cleaning to remove characters like additional spaces, unicodes, tabs, and newline characters\n"
            )

        with right_col:
            # Skip the animation when the asset could not be downloaded
            # rather than handing None to the component.
            if lottie_animation:
                st_lottie(lottie_animation, height=500)

    with st.container():
        with st.form("my_form"):
            article_url = st.text_input("Article URL", value="", key="article_url")
            submitted = st.form_submit_button("Submit")

            if submitted:
                target_url = article_url.strip()
                if not target_url:
                    # Nothing to scrape; do not spend an API call on a blank field.
                    st.warning("Please enter an Article URL")
                else:
                    with st.spinner('Scraping Information ...'):
                        data = fetch(url=target_url)

                    # An empty reply and a reply without a usable
                    # ``scraped_content`` object are both reported as errors.
                    content = data.get("scraped_content")
                    if isinstance(content, dict):
                        with st.container():
                            show_article(content)
                    else:
                        st.error("Error")
# Build the page only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()