# article-scraper / app.py
# Author: rahulNenavath305 — "Added streamlit app.py file" (commit 94ec2b7)
import os
import streamlit as st
import requests
from streamlit_lottie import st_lottie
def main() -> None:
    """Render the Article Scraper Streamlit page.

    Layout: an introduction section, a two-column "how it works" explainer
    with a Lottie animation, and a submission form that POSTs the article
    URL to the scraper API (endpoint read from the ``scraper-api-endpoint``
    environment variable) and renders the extracted fields.
    """
    # ----- Loading Assets ----
    def load_lottieurl(lottie_url: str):
        """Fetch a Lottie animation JSON; return None on a non-200 response."""
        r = requests.get(url=lottie_url, timeout=10)
        return r.json() if r.status_code == 200 else None

    def fetch(url: str) -> dict:
        """POST the article URL to the scraper API and return its JSON payload.

        Returns {} on misconfiguration or any network/JSON failure so the UI
        shows a friendly error instead of a traceback.
        """
        endpoint = os.environ.get('scraper-api-endpoint')
        if not endpoint:
            # Misconfigured deployment: nothing to call.
            return {}
        try:
            result = requests.post(url=endpoint, json={'url': url}, timeout=30)
            return result.json()
        except (requests.RequestException, ValueError):
            # ValueError covers a non-JSON response body.
            return {}

    st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")
    lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")

    # ----- Introduction --------
    with st.container():
        st.subheader("Article Scraper")
        st.title("A Digital News / Article Information Extraction Application")
        st.write("A portfolio project developed to showcase my ability in developing Information Extraction Services")
        st.write("This service can be utilised in the data collection / curation process of data science workflow")
        st.write("[My Website >](https://www.rahulnenavath.co.in/)")

    with st.container():
        st.write("---")
        left_col, right_col = st.columns(2)
        with left_col:
            st.header("How it works?")
            st.write("##")
            st.write('**Input**: Article URL')
            st.write('**Output**: Extracted Article Information')
            st.write(
                """
                **Working**:
                - Download the HTML content from the given Article URL
                - Makes use of BeautifulSoup and extracts content from different HTML tags and ClassNames
                - Arrange Information appropriately
                - Regex based text cleaning to remove characters like additional spaces, unicodes, tabs, and newline characters
                """
            )
        with right_col:
            # Animation fetch is best-effort; skip rendering if it failed.
            if lottie_animation:
                st_lottie(lottie_animation, height=500)

    with st.container():
        with st.form("my_form"):
            article_url = st.text_input("Article URL", value="", key="article_url")
            submitted = st.form_submit_button("Submit")
        if submitted:
            with st.spinner('Scraping Information ...'):
                data = fetch(url=article_url)
            # Guard both a failed fetch and a payload missing the key,
            # either of which previously crashed on content.get(...).
            content = data.get("scraped_content") if data else None
            if content:
                with st.container():
                    st.write("---")
                    st.title('Extracted Article Information')
                    st.write('**Article Title:**')
                    st.header(f"{content.get('article_title')}")
                    # (label, payload-key) pairs rendered uniformly below.
                    fields = [
                        ("Author", "author"),
                        ("Published Date", "publish_date"),
                        ("Description", "description"),
                        ("Content", "article_content"),
                        ("Article URL", "article_url"),
                        ("Canonical URL", "canonical_url"),
                        ("Publisher Name", "publisher_name"),
                        ("Article Image", "image"),
                        ("Article Keywords", "keywords"),
                        ("Video URL", "video_url"),
                        ("Audio URL", "audio_url"),
                    ]
                    for label, key in fields:
                        st.write(f"**{label}:** {content.get(key)}")
            else:
                st.error("Error")
# Script entry point: run the Streamlit page when executed directly.
if __name__ == "__main__":
    main()