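# Repository page metadata captured together with this file:
#   Sigrid De los Santos
#   Commit 9df4cc0: "Remove remaining binary file for Hugging Face"
#   Viewer links: raw / history / blame
#   File size: 2.62 kB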
import requests
from lxml import etree
from tqdm import tqdm
import pandas as pd
from finnlp.data_sources.news._base import News_Downloader
class Eastmoney_Streaming(News_Downloader):
    """Scrape post listings from guba.eastmoney.com for a single stock.

    Every scraped row is appended to ``self.dataframe`` with the columns
    ``read amount``, ``comments``, ``title``, ``content link``, ``author``
    and ``create time``.
    """

    def __init__(self, args=None):
        """Create the downloader with an empty result table.

        Args:
            args: Options forwarded to ``News_Downloader``. When omitted, a
                fresh empty dict is passed so that instances never share a
                single mutable default object.
        """
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_stock(self, stock="600519", rounds=3):
        """Fetch listing pages for ``stock`` and accumulate their rows.

        Args:
            stock: Stock code inserted into the listing URL.
            rounds: Number of pages to request, starting at page index 0.
                A value <= 0 switches to open-ended mode: pages are requested
                from index 1 upwards until a page yields no rows or more
                than ten requests have failed.

        Side effects:
            Prints progress to stdout and leaves ``self.dataframe`` with a
            fresh 0..n-1 index.
        """
        print( "Geting pages: ", end = "")
        requested = 0  # number of page requests issued, reported at the end
        if rounds > 0:
            for page_no in range(rounds):
                requested += 1
                if self._gather_pages(stock, page_no) == "break":
                    break
        else:
            page_no = 1
            error_count = 0
            while True:
                requested += 1
                status = self._gather_pages(stock, page_no)
                if status == "break":
                    break
                if status == "Error":
                    error_count += 1
                    if error_count > 10:
                        # Give up instead of looping forever on a dead link.
                        print("Connection Error")
                        break
                page_no += 1
        print(f"Get total {requested} pages.")
        self.dataframe = self.dataframe.reset_index(drop=True)

    def _gather_pages(self, stock, page):
        """Download one listing page and append its rows to ``self.dataframe``.

        Args:
            stock: Stock code inserted into the listing URL.
            page: Page index inserted into the listing URL.

        Returns:
            ``"Error"`` when no HTTP 200 response was obtained, ``"break"``
            when the page contains no table rows, otherwise ``None``.
        """
        print( page, end = " ")
        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        }

        # Intended to increase the number of connection retries.
        # NOTE(review): confirm that requests actually reads this attribute.
        requests.DEFAULT_RETRIES = 5

        response = self._request_get(url, headers=headers)
        # _request_get is defined on the base class (not visible here); the
        # None check is defensive in case it reports failure that way.
        if response is None or response.status_code != 200:
            return "Error"

        # Gather the content of the listing table on this page.
        tree = etree.HTML(response.text)
        table_rows = tree.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')
        if not table_rows:
            return "break"

        # Each lookup takes the first text node / href of its cell; a cell
        # without one raises IndexError, exactly as before.
        records = [
            [
                tr.xpath("./td[1]//text()")[0],       # read amount
                tr.xpath("./td[2]//text()")[0],       # comments
                tr.xpath("./td[3]/div/a//text()")[0],  # title
                tr.xpath("./td[3]/div/a/@href")[0],    # content link
                tr.xpath("./td[4]//text()")[0],       # author
                tr.xpath("./td[5]//text()")[0],       # create time
            ]
            for tr in table_rows
        ]
        columns = [ "read amount", "comments", "title", "content link", "author", "create time" ]

        # One concat per page instead of one per row avoids re-copying the
        # accumulated table for every scraped post.
        page_frame = pd.DataFrame(records, columns=columns)
        self.dataframe = pd.concat([self.dataframe, page_frame])
        return None