import requests
from bs4 import BeautifulSoup
import json
import os

url = "https://book.douban.com/top250"
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}

## Fetch one page of the list; num is the "start" offset (0, 25, 50, ...)
def getHTML(num):
    r = requests.get(url, headers=header, params={"start": num})
    return r.text

## Define a method that extracts the book data from one page of HTML
def getListData(html):
    booklist = []
    soup = BeautifulSoup(html, "lxml")
    books = soup.select("tr")
    for book in books:
        bookdic = {}
        tds = book.select("td")
        bookdic["书名"] = tds[1].div.a.text.strip().split("\n")[0]
        bookdic["书籍详情"] = tds[0].a.get("href")
        bookdic["封面"] = tds[0].img.get("src")
        bookdic["出版信息"] = tds[1].p.text
        spans = tds[1].select("span[class]")
        bookdic["评分"] = spans[1].text
        bookdic["评论人数"] = spans[2].text.replace("(", "").replace(")", "").strip()
        if len(spans) == 4:
            bookdic["备注"] = spans[3].text
        booklist.append(bookdic)
    return booklist

## Collect the data from all 10 pages (25 books per page)
allbooks = []
for i in range(10):
    html = getHTML(i * 25)
    page = getListData(html)
    allbooks.extend(page)

## Define a method that saves the data as a JSON file
def saveJson(dic, path, filename):
    jData = json.dumps(dic, indent=2, ensure_ascii=False)
    if not os.path.exists(path):
        os.makedirs(path)
    with open(path + filename, "w", encoding="utf-8") as f:
        f.write(jData)

## Call the method to save the data to douban250.json under the mdata directory
saveJson(allbooks, "mdata/", "douban250.json")
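Running the script requires the requests, beautifulsoup4, and lxml packages to be installed. As a quick sanity check, the saved file can be loaded back to confirm the scrape worked; this is a minimal sketch, assuming the script above has already written mdata/douban250.json:

import json

# Load the JSON file written by saveJson above and report how many books were captured.
with open("mdata/douban250.json", encoding="utf-8") as f:
    books = json.load(f)

print(len(books))        # expected to be roughly 250 (10 pages x 25 entries)
print(books[0]["书名"])   # title of the first entry, assuming at least one record was parsed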