A Multi-threaded Python Web Scraper with httpx and parsel

Two of the newest and most talked-about tools in the Python web-scraping world are httpx and parsel.

httpx and parsel

httpx bills itself as the next-generation HTTP request library: it supports everything the requests library does, and it can also send asynchronous requests.
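A minimal sketch of both call styles (https://httpbin.org/get is just a stand-in test endpoint, not part of the scraper below):

import asyncio
import httpx

# Synchronous request: the call style mirrors requests
resp = httpx.get("https://httpbin.org/get")
print(resp.status_code)

# Asynchronous request via AsyncClient
async def fetch():
    async with httpx.AsyncClient() as client:
        resp = await client.get("https://httpbin.org/get")
        print(resp.status_code)

asyncio.run(fetch())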

parsel was originally bundled with the well-known Python crawler framework Scrapy and was later split out into a standalone module. It supports XPath selectors, CSS selectors, and regular expressions as extraction methods, and is reportedly more efficient at parsing than BeautifulSoup.
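A minimal sketch of the three extraction styles, on a made-up HTML snippet:

from parsel import Selector

html = '<div class="price"><span>650万</span></div>'
sel = Selector(text=html)

print(sel.css("div.price span::text").get())                 # CSS selector: '650万'
print(sel.xpath('//div[@class="price"]/span/text()').get())  # XPath: '650万'
print(sel.css("span::text").re_first(r"\d+"))                # regex on top: '650'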

Multi-threaded Code

from fake_useragent import UserAgent
import csv
import json
import re
import time
from parsel import Selector
import httpx
import threading

class HomeLinkSpider(object):
    def __init__(self):
        self.data = list()
        self.path = "浦东_70_90平_500_800万.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        response = httpx.get(self.url, headers={"User-Agent": UserAgent().random})
        if response.status_code == 200:
            # Create a Selector instance
            selector = Selector(response.text)
            # Use a CSS selector to grab the pagination div
            a = selector.css('div[class="page-box house-lst-page-box"]')
            # Parse the page-data attribute (a JSON string) into a dict;
            # json.loads is safer than eval here
            max_page = json.loads(a[0].xpath("//@page-data").get())["totalPage"]
            print(f"Max page number: {max_page}")
            return max_page
        else:
            print("Request failed, status: {}".format(response.status_code))
            return None

    # Parse a single listing page; takes the page URL
    def parse_single_page(self, url):
        print(f" 多线程开始爬取:{url}")
        response = httpx.get(url, headers={"User-Agent": UserAgent().random})
        selector = Selector(response.text)
        ul = selector.css("ul.sellListContent")[0]
        li_list = ul.css("li")
        for li in li_list:
            detail = dict()
            detail["title"] = li.css("div.title a::text").get()

            # Sample houseInfo text: 2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼
            # (rooms | area | direction | decoration | floor | year built | building type)
            house_info = li.css("div.houseInfo::text").get()
            house_info_list = house_info.split(" | ")

            detail["bedroom"] = house_info_list[0]
            detail["area"] = house_info_list[1]
            detail["direction"] = house_info_list[2]

            floor_pattern = re.compile(r"\d{1,2}")
            match1 = re.search(floor_pattern, house_info_list[4])  # search anywhere in the string
            if match1:
                detail["floor"] = match1.group()
            else:
                detail["floor"] = "未知"  # unknown

            # Match the four-digit build year
            year_pattern = re.compile(r"\d{4}")
            match2 = re.search(year_pattern, house_info_list[5])
            if match2:
                detail["year"] = match2.group()
            else:
                detail["year"] = "未知"  # unknown

            # Sample positionInfo: 文兰小区 - 塘桥; extract the complex name and the neighborhood
            position_info = li.css("div.positionInfo a::text").getall()
            detail["house"] = position_info[0]
            detail["location"] = position_info[1]

            # e.g. total price 650万; match 650
            price_pattern = re.compile(r"\d+")
            total_price = li.css("div.totalPrice span::text").get()
            detail["total_price"] = re.search(price_pattern, total_price).group()

            # e.g. unit price 64182元/平米; match 64182
            unit_price = li.css("div.unitPrice span::text").get()
            detail["unit_price"] = re.search(price_pattern, unit_price).group()

            self.data.append(detail)

    def parse_page(self):
        max_page = self.get_max_page()
        if not max_page:  # stop if the first request failed
            return

        thread_list = []
        for i in range(1, max_page + 1):
            url = f"https://sh.lianjia.com/ershoufang/pudong/pg{i}a3p5/"
            t = threading.Thread(target=self.parse_single_page, args=(url,))
            thread_list.append(t)

        for t in thread_list:
            t.start()

        for t in thread_list:
            t.join()

    def write_csv_file(self):
        head = ["标题", "小区", "房厅", "面积", "朝向", "楼层", "年份", "位置", "总价(万)", "单价(元/平方米)"]
        keys = [
            "title",
            "house",
            "bedroom",
            "area",
            "direction",
            "floor",
            "year",
            "location",
            "total_price",
            "unit_price",
        ]

        try:
            with open(self.path, "w", newline="", encoding="utf_8_sig") as csv_file:
                writer = csv.writer(csv_file, dialect="excel")
                if head is not None:
                    writer.writerow(head)
                for item in self.data:
                    row_data = []
                    for k in keys:
                        row_data.append(item[k])
                    writer.writerow(row_data)
                print(f"Write a CSV file to path {self.path} Successful.")
        except Exception as e:
            print("Fail to write CSV to path: %s, Case: %s" % (self.path, e))

if __name__ == "__main__":
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print(f" 耗时:{end - start:.2f} 秒 ")

Reference: https://pythondjango.cn/python/advanced/3-httpx-parsel-requests-comparision/
