[Web Scraping] Part 1: urllib
1. urllib
1.1 Getting to know urllib
The urllib library is one of the most commonly used libraries for web scraping. The following simple example shows how it works.
import urllib.request as ur

url = 'https://xxxx.com'
# Simulate a browser sending a request to the server
res = ur.urlopen(url)
# Read the returned data and decode it; read() returns bytes
response = res.read().decode('utf-8')
print(response)
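urlopen() can also be used as a context manager, which closes the connection automatically when the block exits; a minimal sketch using the same placeholder URL:

import urllib.request as ur

url = 'https://xxxx.com'
# The with-block closes the underlying connection automatically
with ur.urlopen(url) as res:
    html = res.read().decode('utf-8')
print(html)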
1.2 The HTTPResponse type: six common read methods
The object returned by urllib.request.urlopen() is of type http.client.HTTPResponse. Six read methods are commonly used on this type, demonstrated below:
import urllib.request as ur

url = 'https://xxx.com'
# Simulate a browser opening the given page
res = ur.urlopen(url)
# read() reads the whole body as bytes
# (note: each read consumes the stream, so in practice you would use only one of these)
response1 = res.read()
# read(n) reads at most n bytes
response2 = res.read(1024)
# readline() reads a single line
response3 = res.readline()
# readlines() reads all remaining lines into a list
response4 = res.readlines()
# getcode() returns the HTTP status code
code = res.getcode()
# geturl() returns the URL that was retrieved
URL = res.geturl()
# getheaders() returns the response headers
headers = res.getheaders()
print(f'code={code} --- url={URL} --- headers={headers}')
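The same information is also exposed as attributes on the response object: status, url, and headers are the attribute counterparts of getcode(), geturl(), and getheaders(). A small sketch:

import urllib.request as ur

res = ur.urlopen('https://xxx.com')
# Attribute equivalents of getcode() / geturl() / getheaders()
print(res.status)                    # HTTP status code, e.g. 200
print(res.url)                       # the URL that was actually retrieved
print(res.headers['Content-Type'])   # headers behave like a case-insensitive dict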
1.3 Downloading images, pages, and videos with urllib
import urllib.request as ur

"""
urlretrieve(url, filename)
:param url: address of the resource to download
:param filename: name for the downloaded file
"""
# Download an image
url = 'https://xxxx.com/sdr/400__/t01a44596706ed343dd.jpg'
ur.urlretrieve(url=url, filename='image.jpeg')
# Download a video
url = 'https://xxxxxx.com/mda-kfhqtqzx15ym5s1y/hd/mda-kfhqtqzx15ym5s1y.mp4?v_from_s=hkapp-haokan-nanjing&auth_key=1666600337-0-0-2ac9232e4f81e06eba298fb1a2da7714&bcevod_channel=searchbox_feed&cd=0&pd=1&pt=3&logid=0137667328&vid=3709634490709730702&abtest=104959_1&klogid=0137667328'
ur.urlretrieve(url=url, filename='video.mp4')
# Download a page
url = "https://www.xxx.com/s?q=%E8%A7%86%E9%A2%91&src=srp&ssid=&fr=none&psid=a02c45cbbdd7e835fae610ce590208d2&eci=&nlpv=test_dt_46"
ur.urlretrieve(url=url, filename="page.html")
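urlretrieve() additionally accepts an optional reporthook callback, which is invoked with the block number, block size, and total size as the download proceeds, making a simple progress display possible. A sketch, reusing the placeholder image URL from above:

import urllib.request as ur

def progress(block_num, block_size, total_size):
    # total_size is -1 when the server does not send Content-Length
    if total_size > 0:
        percent = min(block_num * block_size * 100 / total_size, 100)
        print(f'\rDownloaded {percent:.1f}%', end='')

url = 'https://xxxx.com/sdr/400__/t01a44596706ed343dd.jpg'
ur.urlretrieve(url=url, filename='image.jpeg', reporthook=progress)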
1.4 Customizing the request object (browser disguise)
Customizing the request object is the first technique for getting around basic anti-scraping checks.
import urllib.request as ur

# Customizing the request object: the first technique against anti-scraping checks
url = 'https://www.xxx.com/'
# User agent: user-agent == UA
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
}
# urlopen() cannot take a headers dict directly,
# so we build a Request object instead.
# Note: Request(url, data, headers, ...) takes data as its second positional
# argument, so headers must be passed as a keyword argument here.
request = ur.Request(url=url, headers=headers)
response = ur.urlopen(request)
print(response.read())
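Headers can also be attached after the Request object has been built, using add_header(); a minimal sketch (the timeout argument is optional and simply avoids hanging indefinitely):

import urllib.request as ur

req = ur.Request('https://www.xxx.com/')
# Equivalent to passing headers= to the constructor
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36')
response = ur.urlopen(req, timeout=10)
print(response.status)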
1.5 GET requests with urllib
quote()
percent-encodes a single string value (for example, Chinese characters) so it can be placed in a URL
urlencode()
encodes a dict of multiple parameters into a single query string
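A quick demonstration of the two functions (the percent-encoded output is the UTF-8 encoding of the characters):

import urllib.parse

# quote() encodes a single value
print(urllib.parse.quote('你好'))             # %E4%BD%A0%E5%A5%BD
# urlencode() encodes a whole dict of parameters
print(urllib.parse.urlencode({'q': '你好'}))  # q=%E4%BD%A0%E5%A5%BD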
import urllib.request
import urllib.parse

# In a URL copied from the browser, Chinese characters appear percent-encoded;
# when we pass a Chinese parameter ourselves, we must encode it the same way.
baseurl = "https://www.xxx.com/s?ie=utf-8&fr=none&src=360sou_newhome&ssid=&q="
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}
# Percent-encode the Chinese keyword
keyword = urllib.parse.quote('你好')
url = baseurl + keyword
# Customize the request (attach headers)
request = urllib.request.Request(url=url, headers=headers)
# Simulate a browser sending the request
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
import urllib.request
import urllib.parse

# urlencode() builds the whole query string from a dict,
# percent-encoding every value (including Chinese characters).
# Note: the base URL must end at '?'; keeping the parameters in the
# base URL as well would duplicate them.
baseurl = "https://www.xxxx.com/s?"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}
# Encode all parameters at once
keyword = urllib.parse.urlencode({
    "ie": "utf-8",
    "fr": "none",
    "src": "xxxx_newhome",
    "ssid": '',
    "q": "你好"
})
print(keyword)
url = baseurl + keyword
# Customize the request (attach headers)
request = urllib.request.Request(url=url, headers=headers)
# Simulate a browser sending the request
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
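One subtlety worth knowing: the two functions treat spaces differently. urlencode() uses quote_plus() internally, so spaces become +, while quote() encodes them as %20:

import urllib.parse

print(urllib.parse.quote('hello world'))             # hello%20world
print(urllib.parse.urlencode({'q': 'hello world'}))  # q=hello+world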
1.6 POST requests with urllib
Scraping translation suggestions
import urllib.request
import urllib.parse
import json

# POST request
url = 'https://xxxx.com/sug'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}
data = {
    'kw': 'apple'
}
# POST parameters must first be URL-encoded and then converted to bytes
data = urllib.parse.urlencode(data).encode('utf-8')
# Disguise the request with browser-like headers
request = urllib.request.Request(url=url, data=data, headers=headers)
# Simulate a browser sending the request and fetch the data
response = urllib.request.urlopen(request).read().decode('utf-8')
# The data comes back as a JSON string; parsing it into a Python object
# makes further analysis straightforward.
print(json.loads(response))
print(type(json.loads(response)))  # <class 'dict'>
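Since json.loads() returns an ordinary dict, individual fields can be pulled out as usual. A sketch continuing from the response above; the 'data', 'k', and 'v' keys are an assumption about this particular endpoint's JSON layout:

import json

# Hypothetical structure: {'errno': 0, 'data': [{'k': 'apple', 'v': '...'}]}
result = json.loads(response)
for item in result.get('data', []):
    print(item.get('k'), '->', item.get('v'))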
Scraping detailed translation results
import urllib.request
import urllib.parse
import json

url = 'https://xxxx.xxxxdu.com/v2transapi?from=en&to=zh'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    'Cookie': 'BAIDUID=9C8DD10E571D38D02DE69FC335B20D0F:FG=1; PSTM=1666228296; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BIDUPSID=DB602297D1834EB54A01739BC53762D5; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1666232632,1666319661; H_BDCLCKID_SF=tbFeVC8ytIK3DJjR5-oKbPRH-UnLq-3e3gOZ0l8KtqjTqPJwM46rj-LPKl3ytnjU-H5P0b7mWIQHDIJFWxjY5b3D3loP2l3M36v4KKJxtPPWeIJoLUc82M4shUJiB5JLBan7_T6IXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtnLhbRO4-TFaD6jBefK; BDSFRCVID=dsDOJeC62ACspyjj_4eIrpo4rWrYUgoTH6ao0jZFMtt-VuUmTgEwEG0P0x8g0KCMjxOgogKK3mOTH4-F_2uxOjjg8UtVJeC6EG0Ptf8g0f5; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=9C8DD10E571D38D02DE69FC335B20D0F:FG=1; BDSFRCVID_BFESS=dsDOJeC62ACspyjj_4eIrpo4rWrYUgoTH6ao0jZFMtt-VuUmTgEwEG0P0x8g0KCMjxOgogKK3mOTH4-F_2uxOjjg8UtVJeC6EG0Ptf8g0f5; H_BDCLCKID_SF_BFESS=tbFeVC8ytIK3DJjR5-oKbPRH-UnLq-3e3gOZ0l8KtqjTqPJwM46rj-LPKl3ytnjU-H5P0b7mWIQHDIJFWxjY5b3D3loP2l3M36v4KKJxtPPWeIJoLUc82M4shUJiB5JLBan7_T6IXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtnLhbRO4-TFaD6jBefK; delPer=0; PSINO=6; BA_HECTOR=ah0kalak81a0042h85244ilb1hlc3o51a; ZFY=VKwJkdkdCO66cMfBbhphndxD4HpwKGhMYynlTR29zO4:C; H_PS_PSSID=36551_37354_36884_36803_36786_37534_26350_37455; BDRCVFR[S4-dAuiWMmn]=I67x6TjHwwYf0; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; RT="z=1&dm=baidu.com&si=prn0r611hei&ss=l9mhql0g&sl=4&tt=1qy&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=1zf&ul=5j5c&hd=5j6p"; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1666599821'
}
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'list',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '949041.694272',
    'token': '851a2ab0a2b0e9aa18d1751397d4c048',
    'domain': 'common'
}
# For POST requests, the data must be URL-encoded first and then converted to bytes
data = urllib.parse.urlencode(data).encode('utf-8')
# Disguise the request with browser-like headers
request = urllib.request.Request(url=url, data=data, headers=headers)
# Simulate a browser sending the request to the server
response = urllib.request.urlopen(request).read().decode('utf-8')
print(json.loads(response))
1.7 Case study: scraping movie data
import urllib.request
import urllib.parse

# Wrap the GET request in a function
def get_request(total, headers):
    """
    :param total: start offset for the recommendation API (each request returns 20 items)
    :param headers: request header information
    """
    url = f'https://m.xxxx.com/rexxar/api/v2/movie/recommend?refresh=0&start={total}&count=20&selected_categories=%7B%7D&uncollect=false&playable=true&tags='
    # Disguise the request with browser-like headers
    request = urllib.request.Request(url=url, headers=headers)
    # Simulate a browser sending the request to the server
    response = urllib.request.urlopen(request).read().decode('utf-8')
    # Save the result to a local file
    with open("./data/film.json", "w", encoding="utf-8") as wfile:
        wfile.write(response)

# Program entry point
if __name__ == "__main__":
    total_page = int(input('Enter the start offset to fetch from: '))
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
        'Cookie': 'll="118200"; bid=1CuXFQ257_g; __utma=30149280.1620981875.1666682677.1666682677.1666682677.1; __utmc=30149280; __utmz=30149280.1666682677.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.1.10.1666682677; ap_v=0,6.0; __gads=ID=cc817e7571b17890-2278973080d700ee:T=1666682691:RT=1666682691:S=ALNI_MZBOpjSY8fWAB6YORSpTLEsz_Ltlw; __gpi=UID=00000b6c3048fbef:T=1666682691:RT=1666682691:S=ALNI_Mb9UvbSUSVIfrCaaoIG75Rsced8ZQ',
        'Referer': 'https://movie.douban.com/explore'
    }
    get_request(total=total_page, headers=headers)
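Because start is an offset and each request returns at most 20 items, fetching more data means looping over offsets. A sketch under that assumption, writing each page to its own file (get_pages is a hypothetical helper, not part of the original code):

import urllib.request

# Hypothetical paginated variant (assumes 20 items per request, as in the URL above)
def get_pages(pages, headers):
    for page in range(pages):
        start = page * 20
        url = (f'https://m.xxxx.com/rexxar/api/v2/movie/recommend'
               f'?refresh=0&start={start}&count=20&selected_categories=%7B%7D'
               f'&uncollect=false&playable=true&tags=')
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request).read().decode('utf-8')
        # One file per page so earlier pages are not overwritten
        with open(f'./data/film_{page}.json', 'w', encoding='utf-8') as wfile:
            wfile.write(response)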
1.8 Case study: scraping restaurant locations
import urllib.request
import urllib.parse

# Wrap the POST request in a function
def post_request(page, city):
    url = 'http://www.xxxx.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    data = {
        'cname': city,
        'pid': '',
        'pageIndex': page,
        'pageSize': '10'
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'Cookie': 'route-cell=ksa; Hm_lvt_1039f1218e57655b6677f30913227148=1666687155; Hm_lpvt_1039f1218e57',
        'Referer': 'http://www.xxxx.com.cn/kfccda/storelist/index.aspx'
    }
    # Disguise the request with browser-like headers
    request = urllib.request.Request(url=url, data=data, headers=headers)
    # Simulate a browser sending the request to the server
    response = urllib.request.urlopen(request).read().decode('utf-8')
    with open(f'./data/KFC_Location_{page}.json', 'w', encoding='utf-8') as fs:
        fs.write(response)

if __name__ == '__main__':
    start_page = int(input("Enter the first page to scrape: "))
    end_page = int(input("Enter the last page to scrape: "))
    city = input('Enter the city to query: ')
    for page in range(start_page, end_page + 1):
        post_request(page, city)
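The saved responses are plain JSON files, so they can be loaded back for further analysis; a small sketch, with the top-level structure left unspecified since it depends on what the endpoint actually returns:

import json

# Hypothetical post-processing of a saved page
with open('./data/KFC_Location_1.json', encoding='utf-8') as fs:
    data = json.load(fs)
# Inspect the parsed object; the exact layout depends on the endpoint
print(type(data))
print(data)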
Summary
That wraps up today's content. I hope it helps!