Using selenium + webdriver in Scrapy to fetch rendered page source and crawl the Jianshu site
Some of the data on Jianshu is rendered by JavaScript, so the response returned by a normal request does not contain it.
For that reason, selenium + webdriver is used here to obtain the fully rendered page source.
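As a quick check of that claim, here is a minimal sketch (not part of the project) comparing what a plain HTTP request returns with what a Selenium-driven browser sees after the JavaScript has run. The class name "H7E3vT" is taken from the code further down and may change whenever Jianshu updates its frontend; chromedriver is assumed to be on PATH here.

import time

import requests
from selenium import webdriver

url = "https://www.jianshu.com/"

# A plain HTTP request returns the HTML before any JavaScript has run,
# so the dynamically rendered parts of the page are missing
plain_html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
print("H7E3vT" in plain_html)            # expected: False

# A Selenium-driven browser executes the JavaScript, so page_source
# contains the rendered DOM
driver = webdriver.Chrome()
driver.get(url)
time.sleep(3)                            # crude wait for the page to finish rendering
print("H7E3vT" in driver.page_source)    # expected: True
driver.quit()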
1. Define the data to be scraped
import scrapy
class JianshuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    author_img = scrapy.Field()
    time = scrapy.Field()
    read_count = scrapy.Field()
    subjects = scrapy.Field()
2. Use selenium + webdriver in the downloader middleware
from scrapy import signals
from scrapy.http.response.html import HtmlResponse
from selenium import webdriver
# Explicit wait
from selenium.webdriver.support.ui import WebDriverWait
class SeleniumDownloaderMiddleware:
    def __init__(self):
        # Load the Chrome driver. If chromedriver.exe is in the same directory as python.exe (or on PATH),
        # executable_path can be omitted, i.e. self.driver = webdriver.Chrome() is enough
        self.driver = webdriver.Chrome(executable_path=r"D:\python\chromedriver.exe")
    def process_request(self, request, spider):
        print("-"*40)
        print(id(self))
        print("-"*40)
        self.driver.get(request.url)
        try:
            while True:
                # Wait up to 3 seconds for the "show more" button to appear
                WebDriverWait(self.driver, 3).until(lambda x: x.find_element_by_class_name("H7E3vT"))
                # Locate and click the "show more" button to load additional content
                # show_more = self.driver.find_element_by_xpath("//div[@class='H7E3vT']")
                show_more = self.driver.find_element_by_class_name("H7E3vT")
                show_more.click()
        except Exception:
            # The wait timed out, so there is no "show more" button left to click
            print("No 'show more' button found")
        # Get the rendered page source
        html = self.driver.page_source
        # Use url=self.driver.current_url rather than url=request.url, because a redirect may have changed the URL
        response = HtmlResponse(url=self.driver.current_url, body=html, request=request, encoding="utf-8")
        # Returning a response here hands it straight back to the Scrapy engine; the request is never sent on to the downloader
        return response
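One thing the middleware never does is quit the browser, so a Chrome window is left running after the crawl ends. Below is a minimal sketch of how that could be handled through Scrapy's spider_closed signal; the from_crawler wiring is an addition, not part of the original code.

from scrapy import signals
from selenium import webdriver

class SeleniumDownloaderMiddleware:
    def __init__(self):
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to build the middleware; hook up spider_closed here
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # Quit Chrome when the crawl finishes so no browser process is left behind
        self.driver.quit()

    # process_request stays exactly as shown above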
3. Write the spider that parses the data
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapylearn.jianshu.jianshu.items import JianshuItem
class JianshuspiderSpider(CrawlSpider):
    name = 'jianshuspider'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )
    def parse_detail(self, response):
        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        author = response.xpath("//span[@class='FxYr8x']/a/text()").get()
        author_img = response.xpath("//img[@class='_13D2Eh']/@src").get()
        time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
        read_count = response.xpath("//div[@class='s-dsoj']/span[2]/text()").get().split()[1].replace(",", "")
        subjects = ",".join(response.xpath("//div[@class='_2Nttfz']/a/span/text()").getall())
        yield JianshuItem(title=title, author=author, author_img=author_img, time=time, read_count=read_count,
                    subjects=subjects)
    def parse_item(self, response):
        item = {}
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        # item['name'] = response.xpath('//div[@id="name"]').get()
        # item['description'] = response.xpath('//div[@id="description"]').get()
        return item
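The read_count line in parse_detail packs a little string cleanup into one expression. A worked example with a sample string (the exact text shown on the page is an assumption here):

# The second <span> under div.s-dsoj holds text along the lines of "阅读 12,345"
text = "阅读 12,345"

value = text.split()[1]               # split on whitespace and keep the number: "12,345"
read_count = value.replace(",", "")   # strip the thousands separator: "12345"
print(read_count)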
4. Save the data to MySQL
import pymysql
class JianshuPipeline:
    def __init__(self):
        self.conn = pymysql.connect(
            host='localhost',
            port=3307,
            user='root',
            password='1612480331',
            database='houses',
            charset='utf8'
        )
    def process_item(self, item, spider):
        # Debug output: the same pipeline instance handles every item
        print("=" * 40)
        print(id(self))
        print("=" * 40)
        # Open the database connection
        # conn = pymysql.connect("localhost", "root", "1612480331", "houses", 3307)
        # Create a cursor object
        cursor = self.conn.cursor()
        sql = "insert into jianshu values (%s,%s,%s,%s,%s,%s)"
        cursor.execute(sql, (
            item["title"], item["author"], item["author_img"], item["time"], item["read_count"], item["subjects"]))
        self.conn.commit()
        # print(values)
        # for v in values:
        #     print(v)
        cursor.close()
        return item
    # Called when the spider is closed
    def close_spider(self, spider):
        self.conn.close()
        print("Spider finished")
5. Configure settings.py
DOWNLOADER_MIDDLEWARES = {
    # 'jianshu.middlewares.JianshuDownloaderMiddleware': 543,
    'jianshu.middlewares.SeleniumDownloaderMiddleware': 1
}
ITEM_PIPELINES = {
   'jianshu.pipelines.JianshuPipeline': 300,
}
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
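With the settings in place, the crawl can be started from the project directory with "scrapy crawl jianshuspider", or from a small launcher script like the sketch below (where the script lives is up to you).

from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl jianshuspider" from the project root
execute(["scrapy", "crawl", "jianshuspider"])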