Web Scraping Without Asking for Help, Follow Along with Me (5): Recursively Crawling a Whole Site

Recursively Crawling a Whole Site

We use an open-source blog system as the example.

Page Navigation

We crawl the entire site along the blog-category (日志类别) dimension.
Each category links to a paginated list.
Every page of that list contains several articles, and following an article's link opens its full content.

XPath Analysis

XPath for the category links: xpath="//div[@class='col-md-3']/div[@class='data_list']/div[@class='datas']/ul/li/span/a/@href"
XPath for each article's link: xpath="//div[@class='col-md-9']/div[@class='data_list']/div[@class='datas']/ul/li/span[@class='title']/a/@href"
URL pattern when turning pages: http://blog.java1234.com/index.html?page=3&typeId=1&
XPath for the article title: xpath="//div[@class='blog_title']/h3/strong/text()"
XPath for the article content: xpath="//div[@class='blog_content']"
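
Before wiring these selectors into a crawler, it is worth checking them against a live page. The snippet below is only a verification sketch, not part of the project: the XpathCheck class name is made up here, and it assumes Jsoup plus the JsoupXpath library (the org.seimicrawler.xpath package that SeimiCrawler already uses) are on the classpath.

package cn.qiankunpingtai.spider.tools;

import org.jsoup.Jsoup;
import org.seimicrawler.xpath.JXDocument;

import java.util.List;

/**
 * Standalone check of the selectors used in this article.
 * Hypothetical helper; not part of the original project.
 */
public class XpathCheck {
    public static void main(String[] args) throws Exception {
        // Fetch a category list page and wrap it in a JXDocument, the same
        // document type that SeimiCrawler's Response.document() returns.
        String html = Jsoup.connect("http://blog.java1234.com/index.html?typeId=1").get().html();
        JXDocument doc = JXDocument.create(html);

        // Category links (left-hand column).
        List<Object> categories = doc.sel("//div[@class='col-md-3']/div[@class='data_list']/div[@class='datas']/ul/li/span/a/@href");
        System.out.println("categories: " + categories);

        // Article links on the current list page (right-hand column).
        List<Object> articles = doc.sel("//div[@class='col-md-9']/div[@class='data_list']/div[@class='datas']/ul/li/span[@class='title']/a/@href");
        System.out.println("articles: " + articles);
    }
}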

Implementation

package cn.qiankunpingtai.spider.crawlers;

import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.service.CrawlerService;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.seimicrawler.xpath.JXDocument;
import org.springframework.beans.factory.annotation.Autowired;

import java.util.List;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/28 9:26
 */
@Crawler(name = "java1234WithScheduler")
public class Java1234WithScheduler extends BaseSeimiCrawler {
    @Autowired
    private CrawlerService crawlerService;
    @Override
    public String[] startUrls() {
        return new String[]{"http://blog.java1234.com/index.html"};
    }

    @Override
    public void start(Response response) {
        JXDocument doc = response.document();
        try {
            List<Object> urls = doc.sel("//div[@class='col-md-3']/div[@class='data_list']/div[@class='datas']/ul/li/span/a/@href");
            if(urls.size()<1){
                return;
            }
            //example href value: /index.html?typeId=1
            logger.info("{}", urls.size());
            for (Object s:urls){
                String str=s.toString();
                if(str.indexOf("typeId")==-1){
                    //skip links that do not carry a typeId parameter (not category links)
                    continue;
                }
                System.out.println(str);
                Thread.sleep(1000);
                if(!str.startsWith("http://blog.java1234.com")){
                    BusinessConstants.CURRENT_START_URL=new StringBuffer("http://blog.java1234.com").append(str).toString();
                }else{
                    BusinessConstants.CURRENT_START_URL=str;
                }
                System.out.println(BusinessConstants.CURRENT_START_URL);
                push(Request.build(BusinessConstants.CURRENT_START_URL,Java1234WithScheduler::getEachPage));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    public void getEachPage(Response response){
        JXDocument doc = response.document();
        try {
            List<Object> urls = doc.sel("//div[@class='col-md-9']/div[@class='data_list']/div[@class='datas']/ul/li/span[@class='title']/a/@href");
            if(urls.size()<1){
                //no articles on this page: the current category is exhausted, reset the shared page counter
                BusinessConstants.CURRENT_PAGE_NUMBER=1;
                return;
            }
            //advance the shared page counter so the request pushed below fetches the next page
            BusinessConstants.CURRENT_PAGE_NUMBER +=1;
            logger.info("{}", urls.size());
            //example href value: /blog/articles/312.html
            for (Object s:urls){
                String str=s.toString();
                System.out.println(str);
                Thread.sleep(1000);
                if(!str.startsWith("http://blog.java1234.com")){
                    BusinessConstants.CURRENT_GET_DATA_URL=new StringBuffer("http://blog.java1234.com").append(str).toString();
                }else{
                    BusinessConstants.CURRENT_GET_DATA_URL=str;
                }
                System.out.println(BusinessConstants.CURRENT_GET_DATA_URL);
                push(Request.build(BusinessConstants.CURRENT_GET_DATA_URL,Java1234WithScheduler::renderBean));
            }
            //given:  http://blog.java1234.com/index.html?typeId=1
            //target: http://blog.java1234.com/index.html?page=2&typeId=1&
            String nextPageEndPrefix=BusinessConstants.CURRENT_START_URL.substring(BusinessConstants.CURRENT_START_URL.indexOf("typeId"));
            StringBuffer bf =new StringBuffer(startUrls()[0]).append("?page=").
                    append(BusinessConstants.CURRENT_PAGE_NUMBER).append("&").append(nextPageEndPrefix).append("&");
            /**
             * create by: qiankunpingtai
             * create time: 2019/4/15 12:01
             * website:https://qiankunpingtai.cn
             * description:
             * recursively push a request for the next page of the current category
             */
            push(Request.build(bf.toString(),Java1234WithScheduler::getEachPage));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void renderBean(Response response) {
        try {
            crawlerService.saveJava1234(response);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }




}
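
The crawler keeps its crawl state in a BusinessConstants class that the post does not include. A minimal version consistent with how the fields are used above (the field names come from the code; everything else is an assumption) could look like this. Note that these are mutable static fields shared by asynchronous callbacks, which works for a small crawl like this one but is something to keep in mind.

package cn.qiankunpingtai.spider.constants;

/**
 * Shared crawl state referenced by the crawler and the service layer.
 * Not shown in the original post; the fields are inferred from their usage above.
 */
public class BusinessConstants {
    /** Category list URL currently being paged through. */
    public static String CURRENT_START_URL;
    /** Article URL currently being fetched and stored. */
    public static String CURRENT_GET_DATA_URL;
    /** Page number to request next within the current category. */
    public static int CURRENT_PAGE_NUMBER = 1;
}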

package cn.qiankunpingtai.spider.service;

import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.entities.Java1234Blog;
import cn.qiankunpingtai.spider.mappers.BlogMapper;
import cn.wanghaomiao.seimi.struct.Response;
import com.vladsch.flexmark.convert.html.FlexmarkHtmlParser;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

import javax.annotation.Resource;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/13 20:36
 */
@Service
public class CrawlerService {
    protected Logger logger = LoggerFactory.getLogger(getClass());
    @Resource
    private BlogMapper blogMapper;

    public void saveJava1234(Response response) {
        try {
            Java1234Blog blog = response.render(Java1234Blog.class);
            logger.info("bean resolve res={},url={}", blog, response.getUrl());
            if(blog!=null){
                /**
                 * create by: qiankunpingtai
                 * create time: 2019/4/15 11:48
                 * website:https://qiankunpingtai.cn
                 * description:
                 * convert the HTML content to Markdown
                 */
                String htmlContent=null;
                if(StringUtils.isNotBlank(blog.getContent())){
                    blog.setStarturl(BusinessConstants.CURRENT_START_URL);
                    blog.setUrl(BusinessConstants.CURRENT_GET_DATA_URL);
                    htmlContent=blog.getContent();
                    //convert only when the content is not blank
                    blog.setContent(FlexmarkHtmlParser.parse(htmlContent));
                    int changeNum=blogMapper.insert(blog);
                    logger.info("store success,blogId = {},changeNum={}", blog.getId(), changeNum);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

    }


}
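
CrawlerService persists each article through a MyBatis BlogMapper that is also omitted from the post. Assuming a conventional MyBatis setup, a minimal mapper compatible with the blogMapper.insert(blog) call might look like the sketch below; the table and column names are guesses.

package cn.qiankunpingtai.spider.mappers;

import cn.qiankunpingtai.spider.entities.Java1234Blog;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Options;

/**
 * Hypothetical sketch of the mapper used by CrawlerService; the real one is
 * not shown in the original post, and the SQL here is an assumption.
 */
@Mapper
public interface BlogMapper {
    @Insert("insert into blog (title, content, url, starturl) " +
            "values (#{title}, #{content}, #{url}, #{starturl})")
    @Options(useGeneratedKeys = true, keyProperty = "id")
    int insert(Java1234Blog record);
}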

package cn.qiankunpingtai.spider.entities;

import cn.wanghaomiao.seimi.annotation.Xpath;

import java.util.Date;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/28 9:31
 */
public class Java1234Blog implements BlogWithBLOBs{
    private Integer id;
    @Xpath("//div[@class='blog_title']/h3/strong/text()")
    private String title;

    private Date updateTime;
    @Xpath("//div[@class='blog_content']")
    private String content;

    private String url;

    private String starturl;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Date getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Date updateTime) {
        this.updateTime = updateTime;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getStarturl() {
        return starturl;
    }

    public void setStarturl(String starturl) {
        this.starturl = starturl;
    }

}
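
Java1234Blog implements BlogWithBLOBs, another type the post leaves out. In MyBatis-Generator style projects this usually just exposes the large-text column; an assumed minimal interface that compiles against the entity above:

package cn.qiankunpingtai.spider.entities;

/**
 * Assumed minimal contract for entities carrying the large-text content column.
 * The real interface is not shown in the original post.
 */
public interface BlogWithBLOBs {
    String getContent();
    void setContent(String content);
}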

SeimiCrawler Framework Configuration

# SeimiCrawler: which crawler instances to run
seimi.crawler.enabled=true
seimi.crawler.names=java1234WithScheduler
seimi.crawler.enable-redisson-queue=false
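
These properties belong in the application.properties of a Spring Boot project that uses the SeimiCrawler spring-boot starter; with seimi.crawler.enabled=true and the crawler name listed, the crawler starts together with the application. A standard entry point (assumed here, since the post does not show one) is all that is needed:

package cn.qiankunpingtai.spider;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * Assumed application entry point. With the SeimiCrawler spring-boot starter
 * on the classpath, the crawlers named in seimi.crawler.names launch on startup.
 */
@SpringBootApplication
public class SpiderApplication {
    public static void main(String[] args) {
        SpringApplication.run(SpiderApplication.class, args);
    }
}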

Execution

(Screenshot of the execution process.)

Retrieved Data

(Screenshot of the retrieved data.)
Previous: Web Scraping Without Asking for Help, Follow Along with Me (4): Fetching Data from Multiple Pages
Web Scraping Without Asking for Help, Follow Along with Me (Table of Contents)
Next: Web Scraping Without Asking for Help, Follow Along with Me (6): Crawling Novels with Thread Scheduling
The full project is available in the attachment.
