共计 741 个字符,预计需要花费 2 分钟才能阅读完成。
import requests
from bs4 import BeautifulSoup
# Send the request and fetch the data
def getHtml(page, timeout=10):
    """Fetch one page of the report listing and return its HTML text.

    Args:
        page: Page number, sent as the ``pageNum`` query parameter.
        timeout: Seconds to wait for the server before giving up
            (passed through to ``requests.get``).

    Returns:
        The response body as text (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
    """
    # NOTE(review): the URL and User-Agent are empty strings in this source,
    # presumably redacted -- fill them in before running.
    url = ""
    header = {"User-Agent": ""}
    params = {"reportTime": "2021-06-30", "pageNum": page}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    r = requests.get(url, params=params, headers=header, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the
    # table parsers.
    r.raise_for_status()
    return r.text
# Parse the table header
def parseTitle(soup):
    """Return the text of every ``<th>`` cell in the ``myTable04`` table.

    Args:
        soup: Parsed document exposing ``find(id=...)``, e.g. a
            ``BeautifulSoup`` object.

    Returns:
        A list with one string per header cell, in the order ``find_all``
        yields them.
    """
    # NOTE(review): if the page has no element with id "myTable04", ``find``
    # presumably returns None and the next line raises AttributeError.
    table = soup.find(id="myTable04")
    return [th.text for th in table.find_all("th")]
# Parse the table body
def parseData(soup):
    """Return the cell texts of every row in the ``myTable04`` table body.

    Args:
        soup: Parsed document exposing ``find(id=...)``, e.g. a
            ``BeautifulSoup`` object.

    Returns:
        A list of rows; each row is a list holding the text of its ``<td>``
        cells. A ``<tr>`` without any ``<td>`` yields an empty list.
    """
    # Only rows inside the table's <tbody> are read.
    tbody = soup.find(id="myTable04").find("tbody")
    return [
        [td.text for td in tr.find_all("td")]
        for tr in tbody.find_all("tr")
    ]
# Fetch and parse every page
PAGE_COUNT = 223  # pages 1..PAGE_COUNT are requested
tableData = []
for page in range(1, PAGE_COUNT + 1):
    html = getHtml(page)
    soup = BeautifulSoup(html, "lxml")
    if page == 1:
        # The header row is taken once, from the first page only.
        title = parseTitle(soup)
        tableData.append(title)
    # Data rows are collected from every page, including the first.
    pageData = parseData(soup)
    tableData.extend(pageData)
# These bare expressions only show a value in an interactive session;
# run as a script they print nothing.
tableData[:5]  # look at the first five records
len(tableData)  # look at the size of the data set
正文完