# (scraped-article boilerplate, not code) 共计 1203 个字符,预计需要花费 4 分钟才能阅读完成。
import requests
from bs4 import BeautifulSoup
import json
import os
# Base URL of the Douban Top-250 book list; paging is done via the "start" query parameter.
url = "https://book.douban.com/top250"
# Desktop Chrome User-Agent so the site does not reject the request as a bot.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
def getHTML(num):
    """Fetch one page of the Douban Top-250 book list.

    num: pagination offset, sent as the ``start`` query parameter
         (0, 25, 50, ... — 25 books per page).
    Returns the response body as text.
    """
    response = requests.get(url, headers=header, params={"start": num})
    return response.text
## Parse one result page into a list of per-book dicts.
def getListData(html):
    """Extract book records from one Top-250 page.

    html: HTML text of a https://book.douban.com/top250 page.
    Returns a list of dicts with keys " 书名 " (title), " 书籍详情 "
    (detail URL), " 封面 " (cover image URL), " 出版信息 " (publishing
    info), " 评分 " (rating), " 评论人数 " (review count) and, when
    present, " 备注 " (note / one-line quote).
    """
    booklist = []
    soup = BeautifulSoup(html, "lxml")
    books = soup.select("tr")
    for book in books:
        tds = book.select("td")
        # Skip header/filler rows that do not carry the two-cell book layout;
        # indexing tds[1] on such a row would raise IndexError.
        if len(tds) < 2:
            continue
        bookdic = {}
        # BUG FIX: split on a newline ("\n"), not on the letter "n" — the
        # original text lost the backslash and truncated titles at the
        # first "n". The title cell holds the name plus extra lines.
        bookdic[" 书名 "] = tds[1].div.a.text.strip().split("\n")[0]
        bookdic[" 书籍详情 "] = tds[0].a.get("href")
        bookdic[" 封面 "] = tds[0].img.get("src")
        bookdic[" 出版信息 "] = tds[1].p.text
        spans = tds[1].select("span[class]")
        bookdic[" 评分 "] = spans[1].text
        # Review count is wrapped in parentheses on the page, e.g. "(123456人评价)".
        bookdic[" 评论人数 "] = spans[2].text.replace("(", "").replace(")", "").strip()
        # A fourth classed span, when present, is the short note/quote line.
        if len(spans) == 4:
            bookdic[" 备注 "] = spans[3].text
        booklist.append(bookdic)
    return booklist
## Collect the records from all ten pages (offsets 0, 25, ..., 225).
allbooks = []
for offset in range(0, 250, 25):
    allbooks.extend(getListData(getHTML(offset)))
## Persist scraped data to disk as a JSON file.
def saveJson(dic, path, filename):
    """Serialize *dic* to UTF-8 JSON at ``path + filename``.

    dic: any json-serializable object (here, the list of book dicts).
    path: target directory; expected to end with a path separator since
          it is concatenated directly with *filename*.
    filename: name of the JSON file to create.
    Creates the directory if missing; overwrites an existing file.
    """
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    jData = json.dumps(dic, indent=2, ensure_ascii=False)
    # exist_ok=True avoids the check-then-create race of the former
    # os.path.exists() + os.makedirs() pair.
    os.makedirs(path, exist_ok=True)
    with open(path + filename, "w", encoding="utf-8") as f:
        f.write(jData)
## Save the collected data as douban250.json under the mdata/ directory.
saveJson(allbooks, "mdata/", "douban250.json")
# 正文完 (end of scraped article)