Two of the hottest recent tools in the Python web-scraping world are httpx and parsel.
httpx and parsel
httpx bills itself as the next-generation HTTP request library: it supports all of the operations of the requests library and can additionally send asynchronous requests.
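Here is a minimal sketch of both styles (https://example.com is just a placeholder URL):

import asyncio
import httpx

# Synchronous, requests-style call
resp = httpx.get("https://example.com")
print(resp.status_code)

# Asynchronous call via AsyncClient
async def fetch():
    async with httpx.AsyncClient() as client:
        resp = await client.get("https://example.com")
        return resp.status_code

print(asyncio.run(fetch()))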
parsel was originally bundled with Scrapy, the well-known Python crawling framework, and was later spun off as a standalone module. It supports multiple extraction methods, including XPath selectors, CSS selectors, and regular expressions, and reportedly parses more efficiently than BeautifulSoup.
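A quick sketch of the three extraction styles on a made-up HTML snippet:

from parsel import Selector

html = '<div class="title"><a href="/house/1">Sunny flat, built 1999</a></div>'
selector = Selector(text=html)

# CSS selector
print(selector.css("div.title a::text").get())                # Sunny flat, built 1999
# XPath selector
print(selector.xpath('//div[@class="title"]/a/@href').get())  # /house/1
# Regular expression chained onto a selector
print(selector.css("a::text").re_first(r"\d{4}"))             # 1999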
Multithreaded Code
from fake_useragent import UserAgent
import csv
import json
import re
import time
from parsel import Selector
import httpx
import threading


class HomeLinkSpider(object):
    def __init__(self):
        self.data = list()
        self.path = "pudong_70-90sqm_500-800w.csv"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"

    def get_max_page(self):
        response = httpx.get(self.url, headers={"User-Agent": UserAgent().random})
        if response.status_code == 200:
            # Create a Selector instance from the response body
            selector = Selector(response.text)
            # Use a CSS selector to locate the pagination div
            a = selector.css('div[class="page-box house-lst-page-box"]')
            # Parse the page-data JSON string into a dict
            # (json.loads is safer than eval for untrusted input)
            max_page = json.loads(a[0].xpath("./@page-data").get())["totalPage"]
            print(f"Max page number: {max_page}")
            return max_page
        else:
            print("Request failed, status: {}".format(response.status_code))
            return None

    # Parse a single listing page; takes the page URL
    def parse_single_page(self, url):
        print(f"Thread crawling: {url}")
        response = httpx.get(url, headers={"User-Agent": UserAgent().random})
        selector = Selector(response.text)
        ul = selector.css("ul.sellListContent")[0]
        li_list = ul.css("li")
        for li in li_list:
            detail = dict()
            detail["title"] = li.css("div.title a::text").get()

            # e.g. "2室1厅 | 74.14平米 | 南 | 精装 | 高楼层(共6层) | 1999年建 | 板楼"
            house_info = li.css("div.houseInfo::text").get()
            house_info_list = house_info.split(" | ")
            detail["bedroom"] = house_info_list[0]
            detail["area"] = house_info_list[1]
            detail["direction"] = house_info_list[2]

            # Extract the floor number (1-2 digits) from anywhere in the string
            floor_pattern = re.compile(r"\d{1,2}")
            match1 = re.search(floor_pattern, house_info_list[4])
            if match1:
                detail["floor"] = match1.group()
            else:
                detail["floor"] = "unknown"

            # Extract the four-digit construction year
            year_pattern = re.compile(r"\d{4}")
            match2 = re.search(year_pattern, house_info_list[5])
            if match2:
                detail["year"] = match2.group()
            else:
                detail["year"] = "unknown"

            # e.g. "文兰小区 - 塘桥": extract the estate name and the district
            position_info = li.css("div.positionInfo a::text").getall()
            detail["house"] = position_info[0]
            detail["location"] = position_info[1]

            # Total price, e.g. "650万" -> 650
            price_pattern = re.compile(r"\d+")
            total_price = li.css("div.totalPrice span::text").get()
            detail["total_price"] = re.search(price_pattern, total_price).group()

            # Unit price, e.g. "单价64182元/平米" -> 64182
            unit_price = li.css("div.unitPrice span::text").get()
            detail["unit_price"] = re.search(price_pattern, unit_price).group()

            self.data.append(detail)

    def parse_page(self):
        max_page = self.get_max_page()
        if max_page is None:
            return
        thread_list = []
        for i in range(1, max_page + 1):
            url = f"https://sh.lianjia.com/ershoufang/pg{i}a3p5/".replace("/pg", "/pudong/pg")
            t = threading.Thread(target=self.parse_single_page, args=(url,))
            thread_list.append(t)

        for t in thread_list:
            t.start()
        for t in thread_list:
            t.join()

    def write_csv_file(self):
        head = ["Title", "Estate", "Layout", "Area", "Direction", "Floor",
                "Year", "Location", "Total price (10k CNY)", "Unit price (CNY/sqm)"]
        keys = [
            "title",
            "house",
            "bedroom",
            "area",
            "direction",
            "floor",
            "year",
            "location",
            "total_price",
            "unit_price",
        ]
        try:
            with open(self.path, "w", newline="", encoding="utf_8_sig") as csv_file:
                writer = csv.writer(csv_file, dialect="excel")
                if head is not None:
                    writer.writerow(head)
                for item in self.data:
                    row_data = []
                    for k in keys:
                        row_data.append(item[k])
                    writer.writerow(row_data)
            print(f"Wrote CSV file to {self.path} successfully.")
        except Exception as e:
            print("Failed to write CSV to path: %s, cause: %s" % (self.path, e))


if __name__ == "__main__":
    start = time.time()
    home_link_spider = HomeLinkSpider()
    home_link_spider.parse_page()
    home_link_spider.write_csv_file()
    end = time.time()
    print(f"Elapsed: {end - start:.2f} seconds")
Reference: https://pythondjango.cn/python/advanced/3-httpx-parsel-requests-comparision/