编辑
2026-04-01
undefined
00

目录

httpx和parsel
多线程代码

Python网络爬虫领域两个最新的比较火的工具莫过于httpx和parsel了。

httpx和parsel

httpx号称新一代网络请求库,不仅支持requests库的所有操作,还能发送异步请求。

parsel最初集成在著名Python爬虫框架Scrapy中,后独立出来成为一个单独的模块,支持XPath选择器、CSS选择器和正则表达式等多种解析提取方式, 据说相比于BeautifulSoup解析效率更高。

多线程代码

from fake_useragent import UserAgent
import csv
import json
import re
import time
from parsel import Selector
import httpx
import threading


class HomeLinkSpider(object):
    """Multithreaded spider for Lianjia second-hand listings (Pudong, Shanghai).

    Scrapes every result page of the filtered search, collects one dict per
    listing into ``self.data``, and writes the rows to a CSV file.
    """

    # FIX: was `def init(self)` (the blog export swallowed the dunder
    # underscores), so __init__ never ran and self.data / self.path /
    # self.url were never created.
    def __init__(self):
        self.data = list()
        self.path = "浦东_70_90平_500_800万.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        """Fetch the first index page and return the total page count.

        Returns the integer ``totalPage`` from the pagination widget's
        ``page-data`` attribute, or None when the HTTP request fails.
        """
        response = httpx.get(self.url, headers={"User-Agent": UserAgent().random})
        if response.status_code == 200:
            # Build a Selector over the raw HTML
            selector = Selector(response.text)
            # CSS-select the pagination div that carries the page-data attribute
            a = selector.css('div[class="page-box house-lst-page-box"]')
            # FIX: the original used eval() on the page-data string pulled
            # from a remote page — that is arbitrary code execution on
            # untrusted input. The attribute is JSON, so parse it as JSON.
            max_page = json.loads(a[0].xpath("//@page-data").get())["totalPage"]
            print(f"最大页码数:{max_page}")
            return max_page
        else:
            print("请求失败 status:{}".format(response.status_code))
            return None

    def parse_single_page(self, url):
        """Scrape one listing page at *url*; append one dict per listing to self.data."""
        print(f"多线程开始爬取:{url}")
        response = httpx.get(url, headers={"User-Agent": UserAgent().random})
        selector = Selector(response.text)
        ul = selector.css("ul.sellListContent")[0]
        li_list = ul.css("li")
        for li in li_list:
            detail = dict()
            detail["title"] = li.css("div.title a::text").get()
            # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
            house_info = li.css("div.houseInfo::text").get()
            house_info_list = house_info.split(" | ")
            detail["bedroom"] = house_info_list[0]
            detail["area"] = house_info_list[1]
            detail["direction"] = house_info_list[2]
            # Floor number: 1-2 digits anywhere in e.g. "高楼层(共6层)"
            floor_pattern = re.compile(r"\d{1,2}")
            match1 = re.search(floor_pattern, house_info_list[4])
            if match1:
                detail["floor"] = match1.group()
            else:
                detail["floor"] = "未知"
            # Construction year: 4 digits in e.g. "1999年建"
            year_pattern = re.compile(r"\d{4}")
            match2 = re.search(year_pattern, house_info_list[5])
            if match2:
                detail["year"] = match2.group()
            else:
                detail["year"] = "未知"
            # e.g. "文兰小区 - 塘桥": first link is the estate name, second the district
            position_info = li.css("div.positionInfo a::text").getall()
            detail["house"] = position_info[0]
            detail["location"] = position_info[1]
            # "650万" -> "650"
            price_pattern = re.compile(r"\d+")
            total_price = li.css("div.totalPrice span::text").get()
            detail["total_price"] = re.search(price_pattern, total_price).group()
            # "单价64182元/平米" -> "64182"
            unit_price = li.css("div.unitPrice span::text").get()
            detail["unit_price"] = re.search(price_pattern, unit_price).group()
            # list.append is atomic under CPython's GIL, so concurrent
            # appends from the worker threads are safe here.
            self.data.append(detail)

    def parse_page(self):
        """Spawn one thread per result page and wait for all of them to finish."""
        max_page = self.get_max_page()
        # FIX: get_max_page() returns None on HTTP failure; the original
        # crashed with TypeError on range(1, None + 1).
        if max_page is None:
            return
        thread_list = []
        for i in range(1, max_page + 1):
            url = f"https://sh.lianjia.com/ershoufang/pudong/pg{i}a3p5/"
            t = threading.Thread(target=self.parse_single_page, args=(url,))
            thread_list.append(t)
        for t in thread_list:
            t.start()
        for t in thread_list:
            t.join()

    def write_csv_file(self):
        """Write the scraped rows in self.data to a UTF-8-BOM (Excel-friendly) CSV."""
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层", "年份", "位置", "总价(万)", "单价(元/平方米)"]
        keys = [
            "title",
            "house",
            "bedroom",
            "area",
            "direction",
            "floor",
            "year",
            "location",
            "total_price",
            "unit_price",
        ]
        try:
            with open(self.path, "w", newline="", encoding="utf_8_sig") as csv_file:
                writer = csv.writer(csv_file, dialect="excel")
                if head is not None:
                    writer.writerow(head)
                for item in self.data:
                    row_data = []
                    for k in keys:
                        row_data.append(item[k])
                    writer.writerow(row_data)
            print(f"Write a CSV file to path {self.path} Successful.")
        except Exception as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))


# FIX: was `if name == "main":` — same underscore loss as __init__; the
# script entry point never executed.
if __name__ == "__main__":
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print(f"耗时:{end - start:.2f}秒")

参考:https://pythondjango.cn/python/advanced/3-httpx-parsel-requests-comparision/

本文作者:a

本文链接:

版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!