Spaces:

lintasmediadanawa
/

web_scrape

Sleeping

App Files Files Community

web_scrape / app.py

jonathanjordan21

Update app.py

13e03cb verified 12 months ago

raw

history blame

4.27 kB

	import asyncio
	import logging
	import re
	from typing import Annotated, Optional

	import html2text
	import httpx
	import requests
	from bs4 import BeautifulSoup
	from fastapi import FastAPI, Header, HTTPException, Query
	from fastapi.middleware.cors import CORSMiddleware


	# Application object; the route decorators further down register on it.
	app = FastAPI()

	# CORS policy applied to every route: any origin, method and header is
	# accepted, and credentialed requests are allowed.
	# NOTE(review): wildcard origins together with allow_credentials=True is
	# very permissive -- confirm this is intended for a public scraping API.
	_cors_options = {
	    "allow_origins": ["*"],
	    "allow_credentials": True,
	    "allow_methods": ["*"],
	    "allow_headers": ["*"],
	}
	app.add_middleware(CORSMiddleware, **_cors_options)

	def _parse_linkedin_post(docs: str) -> dict:
	    """Extract the post fields from the Markdown text of a LinkedIn post page.

	    Parsing is purely positional: the text is cut at every blank line that
	    is followed by a heading marker, and fixed chunk / paragraph indexes
	    are read from the pieces. Which page element sits at each index is
	    taken from the variable names only.
	    NOTE(review): confirm the indexes against a live page; the layout can
	    change without notice.

	    Args:
	        docs: html2text output for the post page.

	    Returns:
	        The response payload of ``linkedin_post_details``.

	    Raises:
	        IndexError: if the text has fewer chunks or paragraphs than the
	            indexes used here.
	    """
	    chunks = docs.split("\n\n#")

	    # Chunk 1 is read as the author block followed by the post body.
	    user = chunks[1].split("\n\n", 5)
	    full_name = user[1]
	    bio = user[2]

	    # Paragraph 3 is treated as "<date> <marker>" when it splits into
	    # exactly two space-separated tokens; the second token is discarded
	    # and only signals that the post was edited. Any other token count
	    # makes the unpacking raise ValueError.
	    try:
	        date, _marker = user[3].split(" ")
	    except ValueError:
	        date = user[3].strip()
	        edited = False
	    else:
	        edited = True

	    # maxsplit=5 leaves everything after the fifth separator in user[5].
	    content = user[5] if len(user) > 5 else ""

	    # First token of the insights line is used as the like count, the
	    # second-to-last token as the comment count.
	    insights = chunks[3].split("\n\n")[2]
	    likes = insights.split(" ", 1)[0].strip()
	    comments = insights.rsplit(" ", 2)[1].strip()

	    return {
	        "user": {"name": full_name, "bio": bio},
	        "content": content,
	        "date": date,
	        "is_edited": edited,
	        "insights": {
	            "likeCount": likes,
	            "commentCount": comments,
	            "shareCount": None,
	            "viewCount": None,
	        },
	    }


	@app.get("/linkedin_post_details")
	async def linkedin_post_details(post_id: str):
	    """Scrape a LinkedIn post page and return its author, text and counters.

	    Args:
	        post_id: Appended verbatim to ``https://www.linkedin.com/posts/``.

	    Returns:
	        A dict with ``user`` (``name``, ``bio``), ``content``, ``date``,
	        ``is_edited`` and ``insights``. ``likeCount`` and ``commentCount``
	        are the raw strings found on the page; ``shareCount`` and
	        ``viewCount`` are always ``None``.

	    Raises:
	        HTTPException: 502 when the page cannot be fetched or its text does
	            not have the expected layout.
	    """
	    request_timeout = 30  # seconds; without it a stalled peer hangs the request
	    url = "https://www.linkedin.com/posts/" + post_id

	    # requests is synchronous: run it in a worker thread so the event loop
	    # keeps serving other requests while this one waits on the network.
	    try:
	        res = await asyncio.to_thread(requests.get, url, timeout=request_timeout)
	    except requests.RequestException as err:
	        raise HTTPException(
	            status_code=502, detail="Could not fetch the LinkedIn post page"
	        ) from err

	    text_maker = html2text.HTML2Text()
	    text_maker.ignore_links = True
	    text_maker.ignore_images = True
	    text_maker.bypass_tables = False

	    docs = text_maker.handle(res.content.decode("utf-8"))

	    try:
	        return _parse_linkedin_post(docs)
	    except IndexError as err:
	        raise HTTPException(
	            status_code=502,
	            detail="LinkedIn page did not have the expected layout",
	        ) from err


	@app.get("/google_search")
	async def google_search(
	    q: str,
	    delimiter: str = "\n---\n",
	    sites: Annotated[list[str] | None, Query()] = None,
	):
	    """Run a Google search and return the scraped hits as one string.

	    Args:
	        q: Search terms.
	        delimiter: Text appended after every hit (including the last one).
	        sites: Optional site filters; each is added to the query as
	            ``site:<value>`` and the filters are joined with ``OR``.

	    Returns:
	        ``{"results": <str>}`` where each hit is either a Markdown link
	        ``[text](href)`` or, when the div has no link, its bare text.

	    Raises:
	        HTTPException: 502 when the search page cannot be fetched.
	    """
	    log = logging.getLogger(__name__)
	    log.debug("google_search sites=%r (%s)", sites, type(sites).__name__)

	    request_timeout = 30  # seconds; without it a stalled peer hangs the request

	    query = q
	    if sites:
	        query += " " + " OR ".join(f"site:{site}" for site in sites)

	    # Pass the query through `params` so it is URL-encoded; pasting it into
	    # the URL lets characters such as "&", "#" or "+" corrupt the search.
	    # requests is synchronous, so it runs in a worker thread to keep the
	    # event loop free.
	    try:
	        res = await asyncio.to_thread(
	            requests.get,
	            "https://www.google.com/search",
	            params={"q": query},
	            timeout=request_timeout,
	        )
	    except requests.RequestException as err:
	        raise HTTPException(
	            status_code=502, detail="Could not fetch the Google results page"
	        ) from err

	    soup = BeautifulSoup(res.content, "html.parser")

	    hits = []
	    # The first 24 <div>s are skipped and only <div>s with exactly eight
	    # <div> ancestors are kept.
	    # NOTE(review): both numbers are tied to Google's current markup --
	    # presumably they isolate the result blocks; confirm when results
	    # come back empty.
	    for div in soup.find_all("div")[24:]:
	        if len(div.find_parents("div")) != 8:
	            continue
	        href = div.find(href=True, recursive=True)
	        # Only text that is a direct child of this div counts as its label.
	        text = div.find(string=True, recursive=False)
	        if href and text:
	            log.debug("google_search hit: %s", text)
	            text = f'[{text}]({href["href"].split("/url?q=")[-1]})'
	        if text is not None and text.strip():
	            hits.append(text + delimiter)
	    return {"results": "".join(hits)}


	def _parse_tiktok_video(docs: str) -> dict:
	    """Extract counters and profile fields from the Markdown of a TikTok page.

	    Parsing is positional: the sixth "###"-separated section is used, four
	    counter values are taken from it by regex, and the non-empty lines
	    after the "Speed" paragraph are read as username, date line and
	    description.
	    NOTE(review): the meaning of each position comes from the variable
	    names only; confirm against a live page.

	    Args:
	        docs: html2text output for the video page.

	    Returns:
	        The response payload of ``tiktok_video_details``.

	    Raises:
	        IndexError: if an expected section or line is missing.
	        ValueError: if the regex does not yield exactly four values.
	    """
	    content_detail = docs.split("###")[5]

	    # The pattern matches a run of word characters / dots enclosed by
	    # literal backslashes; exactly four matches are required.
	    # NOTE(review): the description below is stripped of "**" markers, so
	    # double-check that this pattern matches what html2text really emits.
	    likes, comments, bookmarks, shares = re.findall(r'\\([\w.]+)\\', content_detail)

	    after_speed = content_detail.split("\n\nSpeed\n\n", 1)[1]
	    profile = [line.strip() for line in after_speed.split("\n", 6) if line.strip()]
	    username = profile[0]
	    # Keep only what follows the last " · " separator on the second line.
	    date = profile[1].rsplit(" · ", 1)[-1]
	    desc = profile[-1].replace("**", "")

	    return {
	        "insights": {
	            "likeCount": likes,
	            "commentCount": comments,
	            "bookmarkCount": bookmarks,
	            "shareCount": shares,
	        },
	        "username": username,
	        "date": date,
	        "description": desc,
	    }


	@app.get("/tiktok_video_details")
	async def tiktok_video_details(username: str, video_id:str):
	    """Scrape a TikTok video page and return its counters and description.

	    Args:
	        username: Inserted verbatim as the first path segment of the video
	            URL (presumably it must already carry the leading "@" --
	            TODO confirm with callers).
	        video_id: Inserted verbatim after ``/video/``.

	    Returns:
	        A dict with ``insights`` (``likeCount``, ``commentCount``,
	        ``bookmarkCount``, ``shareCount`` as raw strings from the page),
	        ``username`` (as found on the page, not the request parameter),
	        ``date`` and ``description``.

	    Raises:
	        HTTPException: 502 when the page cannot be fetched or its text does
	            not have the expected layout.
	    """
	    log = logging.getLogger(__name__)
	    request_timeout = 30  # seconds; without it a stalled peer hangs the request

	    url = f"https://www.tiktok.com/{username}/video/{video_id}"
	    # The request identifies itself as a crawler. NOTE: "LinkedInBot" was an
	    # earlier user-agent choice, and an earlier parser read the
	    # og:description / og:title meta tags with BeautifulSoup instead.
	    user_agent = "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"

	    # requests is synchronous: run it in a worker thread so the event loop
	    # keeps serving other requests while this one waits on the network.
	    try:
	        res = await asyncio.to_thread(
	            requests.get,
	            url,
	            headers={"user-agent": user_agent},
	            timeout=request_timeout,
	        )
	    except requests.RequestException as err:
	        raise HTTPException(
	            status_code=502, detail="Could not fetch the TikTok video page"
	        ) from err

	    text_maker = html2text.HTML2Text()
	    text_maker.ignore_links = True
	    text_maker.ignore_images = True
	    text_maker.bypass_tables = False

	    # Decode once and reuse for both logging and conversion.
	    html = res.content.decode("utf-8")
	    log.debug("TikTok response body: %s", html)

	    docs = text_maker.handle(html)
	    log.debug("TikTok page as Markdown: %s", docs)

	    try:
	        return _parse_tiktok_video(docs)
	    except (IndexError, ValueError) as err:
	        raise HTTPException(
	            status_code=502,
	            detail="TikTok page did not have the expected layout",
	        ) from err