共计 2580 个字符,预计需要花费 7 分钟才能阅读完成。
Requests 中文文档 很有趣,有兴趣的可以先看看。
数据提取
import requests
from lxml import etree  # XPath-capable HTML parser

url = "https://www.xinpianchang.com/discover/article?from=navigator"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
}

# Send the GET request; the timeout keeps a stalled server from hanging the script.
response = requests.get(url, headers=headers, timeout=10)
# print(response.status_code)  # HTTP status code of the reply

# response.text is the page source decoded with the encoding requests picked,
# so it may come out garbled.
tree = etree.HTML(response.text)
elements = tree.xpath("//h2[@class='truncate block']")
for element in elements:
    print(element.text)

# response.content is the raw bytes; decode it explicitly as UTF-8 instead
# when response.text is garbled.
# print(response.content.decode("utf-8"))
数据存储
下载图片
import requests

url = "https://img2.woyaogexing.com/2022/06/24/783b56bc70a892a9!400x400.jpg"

# Fetch the image; fail loudly on an HTTP error instead of saving an error
# page under a .jpg name.
response = requests.get(url, timeout=10)
response.raise_for_status()

# response.content is the raw bytes of the body, so the file is opened in binary mode.
content = response.content
with open("123.jpg", "wb") as f:
    f.write(content)
也可以使用 urlretrieve() 方法直接将远程数据下载到本地。
from urllib.request import urlretrieve

url = "https://img2.woyaogexing.com/2022/06/24/783b56bc70a892a9!400x400.jpg"

# urlretrieve downloads the URL and writes it straight to the given local path.
urlretrieve(url, "1.jpg")
下载视频
import requests

url = "https://video.pearvideo.com/mp4/adshort/20220622/cont-1765998-15899235_adpkg-ad_hd.mp4"

# stream=True defers downloading the body. Reading response.content afterwards
# would still pull the whole video into memory, so the body is written in
# chunks instead. The `with` block releases the connection when done.
with requests.get(url, stream=True, timeout=10) as response:
    response.raise_for_status()  # do not save an HTTP error page as 1.mp4
    with open("1.mp4", "wb") as f:
        for chunk in response.iter_content(1024 * 1024):  # 1 MiB per chunk
            f.write(chunk)
文本内容
import csv

# Each inner list becomes one row of the CSV file.
data = [[1, 2, 3], [4, 5, 6]]

# newline="" lets the csv module control line endings itself, which avoids
# blank lines between rows on Windows.
with open("1.csv", "w", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(data)
POST 请求
import requests

url = "http://httpbin.org/post"
data = {"username": "abc", "password": "123"}

# Open the upload inside a `with` block so the file handle is closed once the
# request has been sent.
with open("data/123.jpg", "rb") as img:
    # `data` carries the form fields and `files` the upload, keyed by field name.
    files = {"img": img}
    resp = requests.post(url, data=data, files=files, timeout=10)

print(resp.text)
cookie 登录
import requests

# One Session object is used for both requests below.
session = requests.session()

# Login form fields; fill in the real account values.
data = {'name': '', 'password': ''}

# 1. Log in (fill in the real login URL).
url = ''
session.post(url, data=data)

# 2. Fetch the data through the same session (fill in the real data URL).
res = session.get('')
print(res.json())
防盗链
import requests

url = 'https://www.pearvideo.com/video_1756378'
# The content id is the part of the page URL after the underscore.
contId = url.split('_')[1]
videoStatusUrl = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}&mrd=0.6270606489702433'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
    # Anti-hotlinking: Referer names the page this request came from.
    'Referer': url,
}

res = requests.get(videoStatusUrl, headers=headers, timeout=10)
dic = res.json()

# Swap the systemTime segment of srcUrl for "cont-<contId>" to build the
# download URL.
srcUrl = dic['videoInfo']['videos']['srcUrl']
systemTime = dic['systemTime']
srcUrl = srcUrl.replace(systemTime, f'cont-{contId}')

# Download the video.
with open('a.mp4', 'wb') as f:
    f.write(requests.get(srcUrl, timeout=10).content)
代理
import requests

url = 'https://www.baidu.com/'

# Maps the URL scheme of the target to the proxy that should carry it.
# NOTE(review): the proxy URL itself uses the https:// scheme, which presumably
# makes requests speak TLS to the proxy; many plain HTTP proxies expect
# http:// here. Confirm against the proxy actually in use.
proxies = {'https': 'https://36.6.57.27:40257'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
}

resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
# Decode the body as UTF-8 rather than relying on the encoding requests guessed.
resp.encoding = 'utf-8'
print(resp.text)
正文完