web_scrape / app.py
jonathanjordan21's picture
Update app.py
13e03cb verified
raw
history blame
4.27 kB
import asyncio
import re
from typing import Annotated, Optional

import html2text
import httpx
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, Header, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
# Application instance that every route in this module is registered on.
app = FastAPI()

# CORS is fully open: every origin, method and header is accepted and
# credentialed requests are allowed.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# very permissive -- confirm this is intended for a public scraping API.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)
def _parse_linkedin_post(markdown: str) -> dict:
    """Pull author, body and engagement figures out of a rendered post page.

    *markdown* is the html2text rendering of a LinkedIn post page. The
    parser is purely positional: it splits the text on headings and blank
    lines and reads fixed indices, so it depends on the page layout staying
    the same.

    Raises:
        IndexError: if the text has fewer sections or paragraphs than the
            positional lookups below require.
    """
    sections = markdown.split("\n\n#")

    # Second heading section: paragraphs 1-3 are read as name, bio and
    # timestamp; paragraph 4 is skipped; the rest is the post body.
    header = sections[1].split("\n\n", 5)
    full_name, bio, timestamp = header[1], header[2], header[3]

    # Exactly two space-separated tokens is treated as "<date> <edited
    # marker>"; any other shape is taken to be the date alone.
    try:
        date, _marker = timestamp.split(" ")
    except ValueError:
        date, edited = timestamp.strip(), False
    else:
        edited = True

    content = "\n\n".join(header[5:])

    # Fourth heading section, third paragraph holds the counters.
    # Presumably shaped like "<likes> <comments> Comments" -- TODO confirm.
    insights = sections[3].split("\n\n")[2]
    likes = insights.split(" ", 1)[0].strip()
    comments = insights.rsplit(" ", 2)[1].strip()

    return {
        "user": {"name": full_name, "bio": bio},
        "content": content,
        "date": date,
        "is_edited": edited,
        "insights": {"likeCount": likes, "commentCount": comments, "shareCount": None, "viewCount":None},
    }


@app.get("/linkedin_post_details")
async def linkedin_post_details(post_id: str):
    """Scrape a public LinkedIn post and return its author, text and counters.

    Args:
        post_id: Path segment appended to ``https://www.linkedin.com/posts/``.

    Returns:
        A dict with ``user`` (name, bio), ``content``, ``date``,
        ``is_edited`` and ``insights`` (like and comment counts as strings;
        share and view counts are always ``None``).

    Raises:
        HTTPException: 502 when LinkedIn cannot be reached or the page does
            not have the layout the parser expects.
    """
    url = "https://www.linkedin.com/posts/" + post_id

    # requests is blocking; run it in a worker thread so the event loop can
    # keep serving other requests, and bound the wait with a timeout.
    try:
        res = await asyncio.to_thread(requests.get, url, timeout=10)
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=502, detail="Could not fetch the LinkedIn post"
        ) from exc

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    text_maker.bypass_tables = False
    docs = text_maker.handle(res.content.decode("utf-8"))

    try:
        return _parse_linkedin_post(docs)
    except IndexError as exc:
        raise HTTPException(
            status_code=502, detail="Unexpected LinkedIn page layout"
        ) from exc
def _google_query(q: str, sites: list[str] | None) -> str:
    """Return the search text, restricted to *sites* when any are given."""
    if not sites:
        return q
    return q + " " + " OR ".join(f"site:{site}" for site in sites)


def _collect_google_results(soup: BeautifulSoup, delimiter: str) -> str:
    """Render result divs as text, each entry followed by *delimiter*.

    Only ``div`` elements nested inside exactly eight ancestor ``div``s are
    considered, and the first 24 ``div``s of the page are skipped.
    NOTE(review): both numbers are tied to Google's current markup --
    presumably they skip page chrome and select the result blocks; confirm.
    """
    entries = []
    for div in soup.find_all("div")[24:]:
        if len(div.find_parents("div")) != 8:
            continue
        link = div.find(href=True, recursive=True)
        text = div.find(text=True, recursive=False)
        if link and text:
            # Turn the entry into a Markdown link, dropping Google's
            # "/url?q=" redirect prefix from the target.
            text = f'[{text}]({link["href"].split("/url?q=")[-1]})'
        if text is not None and text.strip():
            entries.append(text)
    return "".join(entry + delimiter for entry in entries)


@app.get("/google_search")
async def google_search(
    q: str,
    delimiter: str = "\n---\n",
    sites: Annotated[list[str] | None, Query()] = None,
):
    """Run a Google search and return the scraped results as one string.

    Args:
        q: Search text.
        delimiter: String appended after every result entry.
        sites: Optional hosts; when given, the query is restricted to them
            with ``site:a OR site:b``.

    Returns:
        ``{"results": <text>}`` where the text is the concatenation of the
        scraped entries, each followed by *delimiter*.

    Raises:
        HTTPException: 502 when Google cannot be reached.
    """
    # Pass the query through ``params`` so characters such as "&", "#" and
    # "+" are percent-encoded instead of corrupting the URL. The blocking
    # request runs in a worker thread, bounded by a timeout.
    try:
        res = await asyncio.to_thread(
            requests.get,
            "https://www.google.com/search",
            params={"q": _google_query(q, sites)},
            timeout=10,
        )
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=502, detail="Could not reach Google"
        ) from exc

    soup = BeautifulSoup(res.content, "html.parser")
    return {"results": _collect_google_results(soup, delimiter)}
def _parse_tiktok_video(markdown: str) -> dict:
    """Pull counters, author, date and description out of a video page.

    *markdown* is the html2text rendering of a TikTok video page. The parser
    is positional and depends on the page layout staying the same.

    Raises:
        IndexError: if an expected section or line is missing.
        ValueError: if the section does not contain exactly four bold
            counter values.
    """
    # Sixth "###"-delimited section carries the video details.
    detail = markdown.split("###")[5]

    # The four bold tokens are read, in order, as likes, comments,
    # bookmarks and shares.
    likes, comments, bookmarks, shares = re.findall(r'\*\*([\w.]+)\*\*', detail)

    # Everything after the "Speed" paragraph is read as the author block:
    # first non-empty line is the username, the second ends with
    # " · <date>", and the last one is the description.
    after_speed = detail.split("\n\nSpeed\n\n", 1)[1]
    profile = [line.strip() for line in after_speed.split("\n", 6) if line.strip()]

    return {
        "insights":{
            "likeCount":likes,
            "commentCount":comments,
            "bookmarkCount":bookmarks,
            "shareCount":shares
        },
        "username":profile[0],
        "date":profile[1].rsplit(" · ", 1)[-1],
        "description":profile[-1].replace("**", "")
    }


@app.get("/tiktok_video_details")
async def tiktok_video_details(username: str, video_id:str):
    """Scrape a TikTok video page and return its counters and metadata.

    Args:
        username: Path segment placed directly after ``tiktok.com/``; it is
            inserted as-is. NOTE(review): presumably callers include the
            leading "@" -- confirm.
        video_id: Identifier placed after ``/video/``.

    Returns:
        A dict with ``insights`` (like, comment, bookmark and share counts
        as strings), ``username`` (as shown on the page, not the request
        argument), ``date`` and ``description``.

    Raises:
        HTTPException: 502 when TikTok cannot be reached or the page does
            not have the layout the parser expects.
    """
    url = f"https://www.tiktok.com/{username}/video/{video_id}"
    # The page is requested with a crawler user agent.
    user_agent = "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"

    # requests is blocking; run it in a worker thread so the event loop can
    # keep serving other requests, and bound the wait with a timeout.
    try:
        res = await asyncio.to_thread(
            requests.get, url, headers={"user-agent": user_agent}, timeout=10
        )
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=502, detail="Could not fetch the TikTok video"
        ) from exc

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    text_maker.bypass_tables = False
    docs = text_maker.handle(res.content.decode("utf-8"))

    try:
        return _parse_tiktok_video(docs)
    except (IndexError, ValueError) as exc:
        raise HTTPException(
            status_code=502, detail="Unexpected TikTok page layout"
        ) from exc