Web Scraping Without Asking for Help, Follow Along with Me (6): Crawling Novels with Thread Scheduling

Crawling Novels with Thread Scheduling

Using 88dush.com (八八读书网) as the example site

Page Navigation

We crawl along the recommended-category lists on the home page.
Each category links to a paginated index (many pages of books).
Each index page lists multiple books, and a book's link leads to that book's complete chapter list (not paginated).
Each chapter link opens that chapter's content.

XPath Analysis

XPath for each book category: xpath="//body/div[@class='tuijian']/ul/li/h2/a/@href"
XPath for each book's link: xpath="//div[@class='booklist']/ul/li/span[@class='sm']/a/@href"
XPath for the total page count of a book list: xpath="//div[@class='booklist']/div[@class='pagelink']/a[@class='last']/text()"
Pagination URLs follow the pattern https://www.88dush.com/sort1/4/, https://www.88dush.com/sort1/10/
XPath for each chapter of a book: xpath="//div[@class='mulu']/ul/li/a/@href"
XPath for the chapter title: xpath="//div[@class='novel']/h1/text()"
XPath for the chapter content: xpath="//div[@class='novel']/div[@class='yd_text2']"
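
Before wiring these expressions into the crawler, it helps to sanity-check them against sample markup. A minimal sketch, assuming the JsoupXpath 2.x API (org.seimicrawler.xpath) that SeimiCrawler uses; the HTML fragment is a hypothetical sample mirroring the category markup analyzed above:

import org.seimicrawler.xpath.JXDocument;

import java.util.List;

public class XpathCheck {
    public static void main(String[] args) {
        // Hypothetical fragment of the recommendation block analyzed above.
        String html = "<body><div class='tuijian'><ul><li><h2>"
                + "<a href='/sort1/'>fantasy</a></h2></li></ul></div></body>";
        JXDocument doc = JXDocument.create(html);
        List<Object> links = doc.sel("//body/div[@class='tuijian']/ul/li/h2/a/@href");
        System.out.println(links); // expected: [/sort1/]
    }
}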

Implementation

package cn.qiankunpingtai.spider.crawlers;


import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.service.CrawlerService;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.seimicrawler.xpath.JXDocument;
import org.springframework.beans.factory.annotation.Autowired;

import java.util.List;
import java.util.Random;

/**
 * <pre>
 * Class: DuShu88NovelCrawler
 * Purpose: crawls novels from 88dush.com with three coordinated thread pools
 * Created: 2019-04-15 09:07:36
 * @author qiankunpingtai
 * @version
 * @since JDK 1.8
 * </pre>
 */

@Crawler(name = "88dush")
public class DuShu88NovelCrawler extends BaseSeimiCrawler {

    private static final long serialVersionUID = 3993378973651481714L;
    @Autowired
    private CrawlerService crawlerService;

    private static final String DOMAIN_URL = "https://www.88dush.com";

    @Override
    public String[] startUrls() {
        return new String[]{DOMAIN_URL};
    }

    @Override
    public void start(Response response){
        try {
            JXDocument document = response.document();
            List<Object> urlList = document.sel("//body/div[@class='tuijian']/ul/li/h2/a/@href");
            if (urlList.isEmpty()) {
                return;
            }
            logger.info("start thread: {}", Thread.currentThread());
            BusinessConstants.threadPoolStart.execute(()->{
                        BusinessConstants.lock.lock();
                        for (Object url:urlList) {
                            String urlStr = url.toString();
                            if (!urlStr.startsWith(DOMAIN_URL) ) {
                                BusinessConstants.CURRENT_START_URL = new StringBuffer(DOMAIN_URL).append(urlStr).toString();
                            } else {
                                BusinessConstants.CURRENT_START_URL = urlStr;
                            }
                            logger.info("start current start url={}", BusinessConstants.CURRENT_START_URL);
                            push(Request.build(BusinessConstants.CURRENT_START_URL, DuShu88NovelCrawler::getEachPage));
                            /*
                             * Block this thread until the page-level worker signals
                             * that the whole category has been crawled.
                             */
                            try {
                                BusinessConstants.conditionPoolStart.await();
                            } catch (InterruptedException e) {
                                e.printStackTrace();
                            }
                        }
                    BusinessConstants.lock.unlock();
                 }
            );

        } catch (Exception e) {
            logger.error("", e);
        } finally {
        }
    }

    public void getEachPage(Response response) {
        try {
            JXDocument document = response.document();
            List<Object> urlList = document.sel("//div[@class='booklist']/ul/li/span[@class='sm']/a/@href");
            /*
             * Read the total page count of the current category from the "last page" link.
             */
            List<Object> pageTotal = document.sel("//div[@class='booklist']/div[@class='pagelink']/a[@class='last']/text()");
            BusinessConstants.CURRENT_TOTAL_PAGE_NUMBER = pageTotal.isEmpty() ? 1 : Integer.valueOf(pageTotal.get(0).toString());
            BusinessConstants.CURRENT_PAGE_NUMBER++;
            BusinessConstants.threadPoolPage.execute(()->{
                BusinessConstants.lock.lock();
                for (Object url:urlList) {
                    String urlStr = url.toString();
                    if (!urlStr.startsWith(DOMAIN_URL)) {
                        BusinessConstants.CURRENT_GET_DATA_URL = new StringBuffer(DOMAIN_URL).append(urlStr).toString();
                    } else {
                        BusinessConstants.CURRENT_GET_DATA_URL = urlStr;
                    }
                    push(Request.build(BusinessConstants.CURRENT_GET_DATA_URL, DuShu88NovelCrawler::getEachBook));
                    try {
                        Thread.sleep(5000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    /*
                     * Block this thread until the book-level worker signals
                     * that the current book has been fully crawled.
                     */
                    try {
                        BusinessConstants.conditionPoolPage.await();
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
                /*
                 * All pages of this category are done once the page counter
                 * reaches the total read above.
                 */
                if (BusinessConstants.CURRENT_PAGE_NUMBER >= BusinessConstants.CURRENT_TOTAL_PAGE_NUMBER) {
                    /*
                     * Reset the counter and wake the category-level (start) thread.
                     */
                    BusinessConstants.CURRENT_PAGE_NUMBER=0;
                    BusinessConstants.conditionPoolStart.signal();
                    BusinessConstants.lock.unlock();// signal() does not release the lock; unlock manually
                    return;
                }
                /*
                 * Derive the category prefix, e.g. "https://www.88dush.com/sort1/",
                 * then append the next page number. Note that indexOf("sort") + 6
                 * assumes a single-digit sort id.
                 */
                String nextPageEndPrefix = BusinessConstants.CURRENT_START_URL.substring(0, BusinessConstants.CURRENT_START_URL.indexOf("sort") + 6);
                logger.info("next page prefix={}", nextPageEndPrefix);
                StringBuffer bf = new StringBuffer(nextPageEndPrefix).append(BusinessConstants.CURRENT_PAGE_NUMBER).append("/");
                if (logger.isDebugEnabled()) {
                    logger.debug("url={}", bf.toString());
                }
                BusinessConstants.CURRENT_START_URL = bf.toString();
                push(Request.build(BusinessConstants.CURRENT_START_URL, DuShu88NovelCrawler::getEachPage));
                BusinessConstants.lock.unlock();
            });

        } catch (Exception e) {
            logger.error("", e);
            /*
             * On failure, wake the category-level thread. signal() requires
             * holding the lock, which this thread does not own at this point.
             */
            BusinessConstants.lock.lock();
            try {
                BusinessConstants.conditionPoolStart.signal();
            } finally {
                BusinessConstants.lock.unlock();
            }
        }
    }
    public void getEachBook(Response response) {
        try {
            JXDocument document = response.document();
            List<Object> urlList = document.sel("//div[@class='mulu']/ul/li/a/@href");
            BusinessConstants.threadPoolBook.execute(()-> {
                BusinessConstants.lock.lock();
                for (Object url : urlList) {
                    String urlStr = url.toString();
                    if (!urlStr.startsWith(BusinessConstants.CURRENT_GET_DATA_URL)) {
                        BusinessConstants.CURRENT_GET_BOOK_DATA_URL = new StringBuffer(BusinessConstants.CURRENT_GET_DATA_URL).append(urlStr).toString();
                    } else {
                        BusinessConstants.CURRENT_GET_BOOK_DATA_URL = urlStr;
                    }
                    push(Request.build(BusinessConstants.CURRENT_GET_BOOK_DATA_URL, DuShu88NovelCrawler::renderBean));
                    try {
                        /*
                         * To avoid being blocked, pause 1-2 seconds between requests.
                         */
                        Thread.sleep(new Random().nextInt(1000)+1000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    /*
                     * Block this thread until the chapter has been persisted.
                     */
                    try {
                        BusinessConstants.conditionPoolBook.await();
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
                /*
                 * All chapters of this book are done; wake the page-level thread.
                 */
                BusinessConstants.conditionPoolPage.signal();
                BusinessConstants.lock.unlock();
            });
        } catch (Exception e) {
            logger.error("", e);
            /*
             * On failure, wake the page-level thread. signal() requires
             * holding the lock, which this thread does not own at this point.
             */
            BusinessConstants.lock.lock();
            try {
                BusinessConstants.conditionPoolPage.signal();
            } finally {
                BusinessConstants.lock.unlock();
            }
        }
    }
    public void renderBean(Response response) {
        BusinessConstants.lock.lock();
        try {
            crawlerService.saveForDuShu88(response);
        } catch (Exception e) {
            logger.error("", e);
        } finally {
            /*
             * Success or failure, wake the book-level worker, then release the lock.
             */
            BusinessConstants.conditionPoolBook.signal();
            BusinessConstants.lock.unlock();
        }
    }
}
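
The scheduling backbone above is a Lock/Condition handshake repeated at three levels (category -> page -> book): each level pushes work, then awaits its condition, and the level below signals when its batch is done. A minimal standalone sketch of one such handshake, with hypothetical names that are not part of the project:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

public class HandshakeSketch {
    private static final Lock LOCK = new ReentrantLock();
    private static final Condition DONE = LOCK.newCondition();

    public static void main(String[] args) throws InterruptedException {
        ExecutorService worker = Executors.newFixedThreadPool(1);
        LOCK.lock();
        try {
            worker.execute(() -> {
                // Child level: finish the batch, then signal the parent.
                LOCK.lock();
                try {
                    System.out.println("worker: batch done, signalling parent");
                    DONE.signal(); // parent resumes only after unlock() below
                } finally {
                    LOCK.unlock();
                }
            });
            System.out.println("parent: work pushed, awaiting child");
            DONE.await(); // atomically releases LOCK while waiting, reacquires on wake-up
            System.out.println("parent: resumed");
        } finally {
            LOCK.unlock();
            worker.shutdown();
        }
    }
}

Because await() releases the lock, the worker can acquire it even though the parent locked first; that is what lets the nested pools in DuShu88NovelCrawler take turns without deadlocking.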




package cn.qiankunpingtai.spider.service;

import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.entities.DuShu88Blog;
import cn.qiankunpingtai.spider.mappers.BlogMapper;
import cn.wanghaomiao.seimi.struct.Response;
import com.vladsch.flexmark.convert.html.FlexmarkHtmlParser;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

import javax.annotation.Resource;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/13 20:36
 */
@Service
public class CrawlerService {
    protected Logger logger = LoggerFactory.getLogger(getClass());
    @Resource
    private BlogMapper blogMapper;

    public void saveForDuShu88(Response response) {
        try {
            DuShu88Blog blog = response.render(DuShu88Blog.class);
            logger.info("bean resolve res={},url={}", blog, response.getUrl());
            if(blog!=null){
                /*
                 * Convert the chapter HTML to Markdown before storing.
                 */
                String htmlContent = null;
                if (StringUtils.isNotBlank(blog.getContent())) {
                    blog.setStarturl(BusinessConstants.CURRENT_GET_DATA_URL);
                    blog.setUrl(BusinessConstants.CURRENT_GET_BOOK_DATA_URL);
                    htmlContent = blog.getContent();
                    // convert only when the content is non-blank
                    blog.setContent(FlexmarkHtmlParser.parse(htmlContent));
                }
                int changeNum=blogMapper.insert(blog);
                logger.info("store success,blogId = {},changeNum={}", blog.getId(), changeNum);
            }
        } catch (Exception e) {
            logger.error("", e);
        }
    }


}
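
FlexmarkHtmlParser.parse is the only conversion step here: it takes an HTML string and returns its Markdown rendering. A quick standalone check, a minimal sketch assuming the flexmark html-parser module that matches the import above:

import com.vladsch.flexmark.convert.html.FlexmarkHtmlParser;

public class HtmlToMarkdownCheck {
    public static void main(String[] args) {
        // A chapter-like fragment; parse() returns the Markdown rendering,
        // roughly a heading line followed by a paragraph with bold markers.
        String html = "<h1>Chapter One</h1><p>Some <b>bold</b> text.</p>";
        System.out.println(FlexmarkHtmlParser.parse(html));
    }
}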

package cn.qiankunpingtai.spider.constants;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

/**
 * @ClassName BusinessConstants
 * @Description Shared scheduling state: page counters, current URLs, thread pools, and the lock/conditions that coordinate them
 * @Author qiankunpingtai
 * @Date 2019-3-6 17:58
 * @Version 1.0
 **/
public class BusinessConstants {

    /**
     * Global page counter used to coordinate pagination;
     * incremented before each page is processed.
     * All mutable fields here are guarded by the lock below.
     * */
    public static Integer CURRENT_PAGE_NUMBER = 0;
    /**
     * Total page count of the current category
     * */
    public static Integer CURRENT_TOTAL_PAGE_NUMBER = 1;
    /**
     * The current category (start) URL
     * */
    public static String CURRENT_START_URL = null;
    /**
     * The current book URL (its chapter-list page)
     * */
    public static String CURRENT_GET_DATA_URL = null;
    /**
     * The current chapter URL of the book being crawled
     * */
    public static String CURRENT_GET_BOOK_DATA_URL = null;
    


    /**
     * Single-thread pool driving the category-level loop
     * */
    public static ExecutorService threadPoolStart = Executors.newFixedThreadPool(1);
    /**
     * Single-thread pool driving the page-level loop
     * */
    public static ExecutorService threadPoolPage = Executors.newFixedThreadPool(1);
    /**
     * Pool for the book-level (chapter) workers
     * */
    public static ExecutorService threadPoolBook = Executors.newFixedThreadPool(10);
    /**
     * Registry of running threads
     * */
    public static Map<String,Thread> threadMap = new ConcurrentHashMap<String,Thread>();
    /**
     * The single lock shared by all levels
     * */
    public static Lock lock = new ReentrantLock();
    public static Condition conditionPoolStart = lock.newCondition();// must be used while holding lock
    public static Condition conditionPoolPage = lock.newCondition();// must be used while holding lock
    public static Condition conditionPoolBook = lock.newCondition();// must be used while holding lock



}

package cn.qiankunpingtai.spider.entities;

import cn.wanghaomiao.seimi.annotation.Xpath;

import java.util.Date;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/4/18 22:26
 */
public class DuShu88Blog implements BlogWithBLOBs {
    private Integer id;
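    /*
     * The @Xpath-annotated fields below are filled automatically by
     * response.render(DuShu88Blog.class) in CrawlerService; url and
     * starturl are set manually after rendering.
     */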
    @Xpath("//div[@class='novel']/h1/text()")
    private String title;

    private Date updateTime;
    @Xpath("//div[@class='novel']/div[@class='yd_text2']")
    private String content;

    private String url;

    private String starturl;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Date getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Date updateTime) {
        this.updateTime = updateTime;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getStarturl() {
        return starturl;
    }

    public void setStarturl(String starturl) {
        this.starturl = starturl;
    }
}

SeimiCrawler Framework Configuration

# SeimiCrawler: enable crawling and name the crawler instances to run
seimi.crawler.enabled=true
seimi.crawler.names=88dush
seimi.crawler.enable-redisson-queue=false

The value of seimi.crawler.names must match the @Crawler(name = "88dush") annotation on DuShu88NovelCrawler.

Execution

Running the crawler logs each category, page, and chapter URL as the three worker levels take turns.

Scraped Data

Each chapter is stored as a row (title, Markdown content, source URLs) via blogMapper.insert.

The complete project is attached.
