[Web Scraping] Part 6: requests
6. requests
6.1 Basic Usage
pip install requests
import requests
url = "http://www.xxx.com"
res = requests.get(url)
# Set the response encoding
res.encoding = 'utf-8'
# res.text returns the page source as a string
print(res.text)
# res.content returns the raw response bytes
print(res.content)
# The URL that was actually requested
print(res.url)
# The HTTP status code of the response
print(res.status_code)
# The response headers
print(res.headers)
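In practice it is worth adding a timeout and an explicit status check, so a slow or broken server shows up as an error instead of a silent hang. A minimal sketch with the same placeholder URL:

import requests

res = requests.get("http://www.xxx.com", timeout=5)  # give up after 5 seconds
res.raise_for_status()  # raise HTTPError for 4xx/5xx responses
res.encoding = res.apparent_encoding  # guess the encoding from the body if the header is wrong
print(res.text[:200])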
6.2 GET Requests
import requests
keyword = input("Enter a search term: ")
url = f"http://baike.xxx.com/item/{keyword}"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
res = requests.get(url, headers=headers)
res.encoding = 'utf-8'
print(res.text)
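When the search term goes into the query string rather than the URL path, requests can build and percent-encode the query for you through the params argument. A small sketch against a hypothetical search endpoint (the /search path and the wd parameter are assumptions, not part of the site above), reusing keyword and headers:

import requests

res = requests.get("http://baike.xxx.com/search",
                   params={"wd": keyword},  # hypothetical parameter name
                   headers=headers)
print(res.url)  # the fully encoded URL that requests built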
6.3 POST Requests
import requests
import json
keyword = input("Enter a search term: ")
url = "https://fanyi.xxx.com/sug"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
data = {
    "kw": keyword
}
res = requests.post(url, data=data, headers=headers)
# The response body is JSON, so parse it into a dict
res = json.loads(res.text)
print(res)
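requests also has this conversion built in: Response.json() parses the body as JSON and returns the resulting dict or list, raising a ValueError if the body is not valid JSON. The same request could be written as:

res = requests.post(url, data=data, headers=headers)
result = res.json()  # equivalent to json.loads(res.text)
print(result)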
6.4 Proxies
import requests
url = 'http://www.xxx.com/s?'
data = {
    "ie": "UTF-8",
    "wd": "ip"
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
# Route the request through a proxy
proxy = {
    "http": "http://121.13.252.58:41564"
}
res = requests.get(url=url, params=data, headers=headers, proxies=proxy).text
with open('ip.html', 'w', encoding='utf-8') as f:
    f.write(res)
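Free proxies go stale quickly, so it helps to declare both schemes explicitly and catch connection failures; the sample address above has likely been offline for a while. A hedged sketch reusing url, data, and headers from above:

import requests

proxies = {
    "http": "http://121.13.252.58:41564",
    "https": "http://121.13.252.58:41564",  # HTTPS traffic tunnels through the same proxy
}
try:
    res = requests.get(url, params=data, headers=headers, proxies=proxies, timeout=5)
    print(res.status_code)
except requests.exceptions.RequestException as e:  # covers ProxyError, ConnectTimeout, etc.
    print("Proxy request failed:", e)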
6.5 Case Study: Simulated Login
import requests
from lxml import etree

url = "https://so.xxx.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx"
data = {
    "__VIEWSTATE": '',
    "__VIEWSTATEGENERATOR": '',
    "from": 'http://so.gushiwen.cn/user/collect.aspx',
    "email": "15260696383",
    "pwd": "123456789",
    "code": '',
    "denglu": "登录"  # value of the submit button; the site expects this exact string
}
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "referer": "https://so.xxx.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx"
}
# 1. Fetch the __VIEWSTATE and __VIEWSTATEGENERATOR values from the login page
res_html = requests.get('https://so.xxx.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx',
                        headers=headers)
# Parse the page with XPath
tree = etree.HTML(res_html.text)
__VIEWSTATE = tree.xpath('//input[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')[0]
data['__VIEWSTATE'] = __VIEWSTATE
data['__VIEWSTATEGENERATOR'] = __VIEWSTATEGENERATOR
# Fetch the captcha image
code_url = 'https://so.xxx.cn/RandCode.ashx'
# Use a requests Session so the captcha request and the login request happen in the same conversation;
# two independent requests would get two different captchas, and the one you type would not match
session = requests.Session()
res_code = session.get(code_url, headers=headers).content
with open('code.png', 'wb') as cf:
    cf.write(res_code)
code = input("Enter the captcha: ")
data["code"] = code
# Log in, again through the session, so the captcha and the login request share the same cookies
res_login = session.post(url=url, headers=headers, data=data).text
with open('gushiwen.html', 'w', encoding='utf-8') as f:
    f.write(res_login)
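To confirm the login actually succeeded, fetch the protected collection page with the same session; the session now carries the login cookies, so (assuming the credentials and captcha were accepted) this should return your collection rather than the login form:

res_check = session.get('http://so.gushiwen.cn/user/collect.aspx', headers=headers)
print(res_check.status_code)  # 200 plus your collection's HTML means the session is live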
Summary
That's all for today's content. I hope it helps!