[Web Scraping] Part 6: requests
6. requests
6.1 Basic Usage
pip install requests
import requests
url = "http://www.xxx.com"
res = requests.get(url)
# Set the response encoding
res.encoding = 'utf-8'
# res.text returns the page source as a string
print(res.text)
# res.content returns the raw response bytes
print(res.content)
# The URL that was actually requested
print(res.url)
# The HTTP status code of the response
print(res.status_code)
# The response headers
print(res.headers)
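In practice it is worth adding a timeout and an explicit status check, so a slow or broken server shows up as an error instead of a silent hang. A minimal sketch with the same placeholder URL:

import requests

res = requests.get("http://www.xxx.com", timeout=5)  # give up after 5 seconds
res.raise_for_status()  # raise HTTPError for 4xx/5xx responses
res.encoding = res.apparent_encoding  # guess the encoding from the body if the header is wrong
print(res.text[:200])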
6.2 GET Requests
import requests
keyword = input("Enter a search term: ")
url = f"http://baike.xxx.com/item/{keyword}"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
res = requests.get(url, headers=headers)
res.encoding = 'utf-8'
print(res.text)
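When the search term goes into the query string rather than the URL path, requests can build and percent-encode the query for you through the params argument. A small sketch against a hypothetical search endpoint (the /search path and the wd parameter are assumptions, not part of the site above), reusing keyword and headers:

import requests

res = requests.get("http://baike.xxx.com/search",
                   params={"wd": keyword},  # hypothetical parameter name
                   headers=headers)
print(res.url)  # the fully encoded URL that requests built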
6.3 POST Requests
import requests
import json
keyword = input("Enter a search term: ")
url = "https://fanyi.xxx.com/sug"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
data = {
    "kw": keyword
}
res = requests.post(url, data=data, headers=headers)
# The response body is JSON, so parse it into a dict
res = json.loads(res.text)
print(res)
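requests also has this conversion built in: Response.json() parses the body as JSON and returns the resulting dict or list, raising a ValueError if the body is not valid JSON. The same request could be written as:

res = requests.post(url, data=data, headers=headers)
result = res.json()  # equivalent to json.loads(res.text)
print(result)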
6.4 Proxies
import requests
url = 'http://www.xxx.com/s?'
data = {
    "ie": "UTF-8",
    "wd": "ip"
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
# Route the request through a proxy
proxy = {
    "http": "http://121.13.252.58:41564"
}
res = requests.get(url=url, params=data, headers=headers, proxies=proxy).text
with open('ip.html', 'w', encoding='utf-8') as f:
    f.write(res)
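Free proxies go stale quickly, so it helps to declare both schemes explicitly and catch connection failures; the sample address above has likely been offline for a while. A hedged sketch reusing url, data, and headers from above:

import requests

proxies = {
    "http": "http://121.13.252.58:41564",
    "https": "http://121.13.252.58:41564",  # HTTPS traffic tunnels through the same proxy
}
try:
    res = requests.get(url, params=data, headers=headers, proxies=proxies, timeout=5)
    print(res.status_code)
except requests.exceptions.RequestException as e:  # covers ProxyError, ConnectTimeout, etc.
    print("Proxy request failed:", e)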
6.5 Case Study: Simulated Login
import requests
from lxml import etree

url = "https://so.xxx.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx"
data = {
    "__VIEWSTATE": '',
    "__VIEWSTATEGENERATOR": '',
    "from": 'http://so.gushiwen.cn/user/collect.aspx',
    "email": "15260696383",
    "pwd": "123456789",
    "code": '',
    "denglu": "登录"  # value of the submit button; the site expects this exact string
}
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "referer": "https://so.xxx.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx"
}
# 1. Fetch the __VIEWSTATE and __VIEWSTATEGENERATOR values from the login page
res_html = requests.get('https://so.xxx.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx',
                        headers=headers)
# Parse the page with XPath
tree = etree.HTML(res_html.text)
__VIEWSTATE = tree.xpath('//input[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')[0]
data['__VIEWSTATE'] = __VIEWSTATE
data['__VIEWSTATEGENERATOR'] = __VIEWSTATEGENERATOR
# Fetch the captcha image
code_url = 'https://so.xxx.cn/RandCode.ashx'
# Use a requests Session so the captcha request and the login request happen in the same conversation;
# two independent requests would get two different captchas, and the one you type would not match
session = requests.Session()
res_code = session.get(code_url, headers=headers).content
with open('code.png', 'wb') as cf:
    cf.write(res_code)
code = input("Enter the captcha: ")
data["code"] = code
# Log in, again through the session, so the captcha and the login request share the same cookies
res_login = session.post(url=url, headers=headers, data=data).text
with open('gushiwen.html', 'w', encoding='utf-8') as f:
    f.write(res_login)
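To confirm the login actually succeeded, fetch the protected collection page with the same session; the session now carries the login cookies, so (assuming the credentials and captcha were accepted) this should return your collection rather than the login form:

res_check = session.get('http://so.gushiwen.cn/user/collect.aspx', headers=headers)
print(res_check.status_code)  # 200 plus your collection's HTML means the session is live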
Summary
That's all for today's content. I hope it helps!