Crawler Series: Collecting Book Data from Dangdang


🌵 Part of the Scrapy crawler series

🌴 Likes, comments, and discussion are welcome~

🌱 Please follow for more 😘~

Table of Contents

 🍉 Project Introduction

 🍉 Project Walkthrough

🌴 spider.py

🌴 items.py

🌴 pipelines.py

🍉 Wrap-up


 🍉 Project Introduction

   We want to collect book data from Dangdang (title, author, price, comment count, publisher) and store it in a database.
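Everything below assumes a standard Scrapy project layout. If you are following along from scratch, the usual scaffolding commands look like this (the project name dangdang is an assumption inferred from the import path and class names in the code below):

scrapy startproject dangdang
cd dangdang
scrapy genspider dangbook dangdang.com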


 🍉 Project Walkthrough

🌴 spider.py

'''
coding:utf-8
@Software:PyCharm
@Time:2022/6/25 18:21
@Author:小久
'''

import scrapy
from ..items import DangdangItem
import re


class DangbookSpider(scrapy.Spider):
    name = 'dangbook'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://category.dangdang.com/cp01.03.00.00.00.00.html']

    def start_requests(self):
        # The category listing is paginated as pg1-, pg2-, ..., so build the page URLs directly.
        for i in range(1, 10):
            url = 'http://category.dangdang.com/pg{}-cp01.03.00.00.00.00.html'.format(i)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        titles = response.xpath('//p[@class="name"]/a/@title').extract()
        prices = response.xpath('//p[@class="price"]/span[@class="search_now_price"]/text()').extract()
        authors = response.xpath('//p[@class="search_book_author"]/span[1]/a[1]/@title').extract()
        comments = response.xpath('//p[@class="search_star_line"]/a/text()').extract()
        publishes = response.xpath('//p[5]/span[3]/a/text()').extract()
        for title, price, author, comment, publish in zip(titles, prices, authors, comments, publishes):
            # Build a fresh item for every book so each yielded item is independent.
            items = DangdangItem()
            items['title'] = title
            # Drop the leading currency symbol, e.g. "¥59.00" -> "59.00".
            items['price'] = price.split('¥')[1]
            items['author'] = author
            # Keep only the number from strings like "1234条评论".
            items['comment'] = re.findall('(.*?)条评论', comment)[0]
            items['publish'] = publish
            yield items






We use XPath to locate the title, author, price, comment count, and publisher. The page links follow an obvious pattern, so we just write one base URL and loop over the page number to handle pagination. The selectors can be checked interactively before running the spider, as in the sketch below.
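The quickest way to sanity-check those XPath expressions is Scrapy's interactive shell. The snippet below is a minimal sketch; the page structure may have changed since this article was written, so treat the expressions as assumptions carried over from the spider above:

# Launch the shell against one category page:
#   scrapy shell "http://category.dangdang.com/pg1-cp01.03.00.00.00.00.html"
# Then, at the prompt, try the same selectors used in parse():
titles = response.xpath('//p[@class="name"]/a/@title').extract()
prices = response.xpath('//p[@class="price"]/span[@class="search_now_price"]/text()').extract()
print(titles[:3], prices[:3])  # print a few sample values to confirm the selectors match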

🌴 items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    picture = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    author = scrapy.Field()
    comment = scrapy.Field()
    publish = scrapy.Field()

 The scraped fields are stored in the Item container, which makes writing them to the database later much easier; an Item supports dict-style access, as the small example below shows.
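For reference, this is all the pipeline in the next section relies on. A minimal sketch with made-up sample values:

item = DangdangItem()
item['title'] = '示例书名'       # hypothetical sample value
item['price'] = '59.00'
print(item['title'], dict(item))  # dict(item) returns a plain dict of the filled fields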

🌴 pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html



# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.utils.project import get_project_settings
import pymysql


class DangdangMysql:

    def open_spider(self, spider):
        # Read the MySQL connection parameters from settings.py when the spider starts.
        setting = get_project_settings()
        self.host = setting['HOST']
        self.user = setting['USER']
        self.password = setting['PASSWORD']
        self.charset = setting['CHARSET']
        self.db = setting['DATABASE']
        self.port = setting['PORT']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(host=self.host, user=self.user,
                                    password=self.password, charset=self.charset,
                                    db=self.db, port=self.port)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query: quotes inside titles or author names cannot break the SQL.
        # id is passed as 0 so an AUTO_INCREMENT primary key assigns the real value.
        sql = ('insert into dangbooks(id, title, author, price, publish, comment) '
               'values(%s, %s, %s, %s, %s, %s)')
        try:
            self.cursor.execute(sql, (0, item['title'], item['author'],
                                      item['price'], item['publish'], item['comment']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider (not close_process) when the spider finishes.
        self.cursor.close()
        self.conn.close()

 The data is written into the MySQL database. The pipeline reads its connection parameters from the project settings, so those keys, the pipeline registration, and the target table all have to exist; a sketch of each follows.
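The pipeline reads HOST, USER, PASSWORD, CHARSET, DATABASE, and PORT through get_project_settings(), and it also has to be registered in ITEM_PIPELINES. The settings.py fragment below is a minimal sketch: the credentials, database name, and the dangdang package path are placeholders, not the author's original configuration.

# settings.py (fragment) -- keys read by DangdangMysql in pipelines.py
HOST = '127.0.0.1'        # placeholder MySQL server address
PORT = 3306
USER = 'root'             # placeholder credentials
PASSWORD = '123456'
CHARSET = 'utf8mb4'
DATABASE = 'spider'       # database that holds the dangbooks table

ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangMysql': 300,   # assumes the project package is named "dangdang"
}

The target table itself can be created once with a short helper script (again a sketch; adjust column names and sizes to taste):

# create_table.py -- one-off helper to create the dangbooks table
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', charset='utf8mb4', db='spider')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS dangbooks (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            author VARCHAR(255),
            price VARCHAR(32),
            publish VARCHAR(255),
            comment VARCHAR(32)
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()

With the settings, table, and pipeline registration in place, the crawl is started from the project root with scrapy crawl dangbook.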

🍉 Wrap-up

 That wraps up this installment of the crawler series: collecting book data from Dangdang.

   
