爬虫不求人,上手跟我做(二)单页面数据获取

单页面数据获取

以《Spring Data Jpa 实体类自动创建数据库表失败解决》一文的页面为例
图片.png

创建对应的爬虫类

package cn.qiankunpingtai.spider.crawlers;
import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.service.CrawlerService;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Response;
import org.springframework.beans.factory.annotation.Autowired;
/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/25 14:14
 */
@Crawler(name = "forOnlyOnceWebCrawler")
public class ForOnlyOnceWebCrawler extends BaseSeimiCrawler {
    @Autowired
    private CrawlerService crawlerService;
    private String startUrl="http://blog.csdn.net/u010429286/article/details/52777046";
    @Override
    public String[] startUrls() {
        String urls [] =new String[]{startUrl} ;
        BusinessConstants.CURRENT_START_URL=startUrl;
        BusinessConstants.CURRENT_GET_DATA_URL=startUrl;
        return urls;
    }

    @Override
    public void start(Response response) {
        try {
            crawlerService.saveForOnlyOnceWeb(response);
        } catch (Exception e) {
            e.printStackTrace();
        }

    }
}

对应的常量类

package cn.qiankunpingtai.spider.constants;
/**
 * @ClassName:BusinessConstants
 * @Description 业务字典类
 * @Author qiankunpingtai
 * @Date 2019-3-6 17:58
 * @Version 1.0
 **/
/**
 * Holder of cross-cutting crawler state shared between the crawler classes and
 * the persistence service. Fields are deliberately mutable static variables so
 * different components can coordinate through them.
 *
 * <p>NOTE(review): these fields are not thread-safe; this is acceptable only
 * while a single crawler instance runs at a time — confirm before adding
 * concurrent crawlers.</p>
 */
public class BusinessConstants {

    /** Non-instantiable holder class. */
    private BusinessConstants() {
    }

    /**
     * Global page counter used to coordinate pagination across a crawl.
     * Starts at the first page by default.
     */
    public static Integer CURRENT_PAGE_NUMBER = 1;
    /**
     * Records the start URL of the current crawl run.
     */
    public static String CURRENT_START_URL = null;
    /**
     * Records the URL the data is currently being fetched from.
     */
    public static String CURRENT_GET_DATA_URL = null;
}

处理数据逻辑的 service

package cn.qiankunpingtai.spider.service;

import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.entities.ForOnlyOnceWebBlog;
import cn.qiankunpingtai.spider.mappers.BlogMapper;
import cn.wanghaomiao.seimi.struct.Response;
import com.vladsch.flexmark.convert.html.FlexmarkHtmlParser;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

import javax.annotation.Resource;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/13 20:36
 */
@Service
public class CrawlerService {
    protected Logger logger = LoggerFactory.getLogger(getClass());
    @Resource
    private BlogMapper blogMapper;

    /**
     * Renders the crawled page into a {@link ForOnlyOnceWebBlog}, converts its
     * HTML content to Markdown and inserts it via {@link BlogMapper}.
     *
     * <p>Pages that render to {@code null} or have blank content are skipped.
     * Exceptions are logged and swallowed so one bad page does not abort the
     * crawl (deliberate best-effort behavior).</p>
     *
     * @param response the fetched page supplied by the SeimiCrawler framework
     */
    public void saveForOnlyOnceWeb(Response response) {
        try {
            ForOnlyOnceWebBlog blog = response.render(ForOnlyOnceWebBlog.class);
            logger.info("bean resolve res={},url={}", blog, response.getUrl());
            if (blog == null) {
                return;
            }
            // Persist only when some content was actually extracted.
            if (StringUtils.isNotBlank(blog.getContent())) {
                blog.setStarturl(BusinessConstants.CURRENT_START_URL);
                blog.setUrl(BusinessConstants.CURRENT_GET_DATA_URL);
                // Convert the extracted HTML to Markdown before storing.
                blog.setContent(FlexmarkHtmlParser.parse(blog.getContent()));
                int changeNum = blogMapper.insert(blog);
                logger.info("store success,blogId = {},changeNum={}", blog.getId(), changeNum);
            }
        } catch (Exception e) {
            // Log with stack trace and URL context instead of printStackTrace(),
            // which bypasses the configured logging system.
            logger.error("failed to save crawled page, url={}", response.getUrl(), e);
        }
    }
}

实体定义

package cn.qiankunpingtai.spider.entities;

import cn.wanghaomiao.seimi.annotation.Xpath;

import java.util.Date;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/25 14:19
 */
/**
 * Blog entity populated by SeimiCrawler's XPath-based bean rendering
 * ({@code Response.render}). Fields annotated with {@code @Xpath} are filled
 * from the fetched page; the remaining fields are set programmatically.
 */
public class ForOnlyOnceWebBlog implements BlogWithBLOBs {

    /** Database primary key. */
    private Integer id;

    /** Article title, extracted from the page heading. */
    @Xpath("//h1[@class='title-article']/text()")
    private String title;

    /** Update timestamp; not populated by XPath extraction. */
    private Date updateTime;

    /** Article body extracted from the content container (HTML at extraction time). */
    @Xpath("//article[@class='baidu_pl']/div[@class='article_content clearfix csdn-tracking-statistics']/div[@id='content_views']")
    private String content;

    /** URL this record's data was fetched from. */
    private String url;

    /** Start URL of the crawl run that produced this record. */
    private String starturl;

    public Integer getId() {
        return this.id;
    }

    public void setId(Integer value) {
        this.id = value;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String value) {
        this.title = value;
    }

    public Date getUpdateTime() {
        return this.updateTime;
    }

    public void setUpdateTime(Date value) {
        this.updateTime = value;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String value) {
        this.content = value;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String value) {
        this.url = value;
    }

    public String getStarturl() {
        return this.starturl;
    }

    public void setStarturl(String value) {
        this.starturl = value;
    }
}

添加 seimi 框架指定调用实例的配置

#seimi框架执行爬虫实例配置
seimi.crawler.enabled=true
seimi.crawler.names=forOnlyOnceWebCrawler
seimi.crawler.enable-redisson-queue=false

图片.png

运行结果

图片.png

数据库获得的数据

图片.png
上一篇 爬虫不求人,上手跟我做(一)springboot 集成爬虫框架 SeimiCrawler
爬虫不求人,上手跟我做(目录)
下一篇 爬虫不求人,上手跟我做(三)xpath

项目详见附件

0 打赏
打赏 10 积分后可见