爬虫不求人,上手跟我做(二)单页面数据获取
单页面数据获取
以Spring Data Jpa 实体类自动创建数据库表失败解决为例
创建对应的爬虫类
package cn.qiankunpingtai.spider.crawlers;
import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.service.CrawlerService;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Response;
import org.springframework.beans.factory.annotation.Autowired;
/**
* Description
*
* @Author: qiankunpingtai
* @Date: 2019/3/25 14:14
*/
@Crawler(name = "forOnlyOnceWebCrawler")
public class ForOnlyOnceWebCrawler extends BaseSeimiCrawler {

    private static final Logger log = LoggerFactory.getLogger(ForOnlyOnceWebCrawler.class);

    @Autowired
    private CrawlerService crawlerService;

    /** The single page this one-shot crawler fetches. */
    private String startUrl = "http://blog.csdn.net/u010429286/article/details/52777046";

    /**
     * Supplies the seed URL(s) for the framework and records the current
     * start/data URL in the shared {@link BusinessConstants} state so the
     * persistence layer can stamp them onto the saved entity.
     *
     * @return the array of start URLs (exactly one for this crawler)
     */
    @Override
    public String[] startUrls() {
        BusinessConstants.CURRENT_START_URL = startUrl;
        BusinessConstants.CURRENT_GET_DATA_URL = startUrl;
        return new String[]{startUrl};
    }

    /**
     * Callback invoked by the framework with the fetched page; delegates
     * persistence to {@link CrawlerService#saveForOnlyOnceWeb}.
     *
     * @param response the fetched page
     */
    @Override
    public void start(Response response) {
        try {
            crawlerService.saveForOnlyOnceWeb(response);
        } catch (Exception e) {
            // Log with the full stack trace instead of printStackTrace(),
            // so failures show up in the application log with context.
            log.error("failed to process page, url={}", response.getUrl(), e);
        }
    }
}
对应的常量类
package cn.qiankunpingtai.spider.constants;
/**
* @ClassName:BusinessConstants
* @Description 业务字典类
* @Author qiankunpingtai
* @Date 2019-3-6 17:58
* @Version 1.0
**/
public class BusinessConstants {

    /** Static holder of shared crawler state; not meant to be instantiated. */
    private BusinessConstants() {
    }

    /**
     * Globally coordinated page counter for paginated crawls.
     * Starts at page 1.
     */
    public static Integer CURRENT_PAGE_NUMBER = 1;

    /** Records the start URL of the current crawl run. */
    public static String CURRENT_START_URL = null;

    /** Records the URL the data is currently being fetched from. */
    public static String CURRENT_GET_DATA_URL = null;
}
处理数据逻辑的 service
package cn.qiankunpingtai.spider.service;
import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.entities.ForOnlyOnceWebBlog;
import cn.qiankunpingtai.spider.mappers.BlogMapper;
import cn.wanghaomiao.seimi.struct.Response;
import com.vladsch.flexmark.convert.html.FlexmarkHtmlParser;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
/**
* Description
*
* @Author: qiankunpingtai
* @Date: 2019/3/13 20:36
*/
@Service
@Service
public class CrawlerService {

    protected Logger logger = LoggerFactory.getLogger(getClass());

    @Resource
    private BlogMapper blogMapper;

    /**
     * Renders the crawler response into a {@link ForOnlyOnceWebBlog} entity,
     * converts its HTML content to markdown, and persists it via the mapper.
     * Pages whose extracted content is blank are skipped silently.
     *
     * @param response the fetched page to render and store
     */
    public void saveForOnlyOnceWeb(Response response) {
        try {
            ForOnlyOnceWebBlog blog = response.render(ForOnlyOnceWebBlog.class);
            logger.info("bean resolve res={},url={}", blog, response.getUrl());
            // Only persist when the page actually yielded content.
            if (blog != null && StringUtils.isNotBlank(blog.getContent())) {
                blog.setStarturl(BusinessConstants.CURRENT_START_URL);
                blog.setUrl(BusinessConstants.CURRENT_GET_DATA_URL);
                // Convert the extracted HTML to markdown before storing.
                blog.setContent(FlexmarkHtmlParser.parse(blog.getContent()));
                int changeNum = blogMapper.insert(blog);
                logger.info("store success,blogId = {},changeNum={}", blog.getId(), changeNum);
            }
        } catch (Exception e) {
            // Preserve the exception (with stack trace) in the application log
            // instead of printStackTrace(), which bypasses the logging system.
            logger.error("failed to save blog, url={}", response.getUrl(), e);
        }
    }
}
实体定义
package cn.qiankunpingtai.spider.entities;
import cn.wanghaomiao.seimi.annotation.Xpath;
import java.util.Date;
/**
* Description
*
* @Author: qiankunpingtai
* @Date: 2019/3/25 14:19
*/
public class ForOnlyOnceWebBlog implements BlogWithBLOBs {

    private Integer id;

    /** Page title, extracted from the CSDN article header. */
    @Xpath("//h1[@class='title-article']/text()")
    private String title;

    private Date updateTime;

    /** Raw article HTML; converted to markdown before persisting. */
    @Xpath("//article[@class='baidu_pl']/div[@class='article_content clearfix csdn-tracking-statistics']/div[@id='content_views']")
    private String content;

    /** URL the data was fetched from. */
    private String url;

    /** Start URL of the crawl run that produced this record. */
    private String starturl;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Date getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Date updateTime) {
        this.updateTime = updateTime;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getStarturl() {
        return starturl;
    }

    public void setStarturl(String starturl) {
        this.starturl = starturl;
    }

    /**
     * Concise representation for logging (the service logs this entity after
     * rendering); deliberately omits {@code content}, which can be very large.
     */
    @Override
    public String toString() {
        return "ForOnlyOnceWebBlog{id=" + id
                + ", title=" + title
                + ", url=" + url
                + ", starturl=" + starturl
                + ", updateTime=" + updateTime
                + "}";
    }
}
添加 seimi 框架指定调用实例的配置
# SeimiCrawler configuration: enable the crawler subsystem and select which
# crawler instance(s) to run (matches @Crawler(name = "forOnlyOnceWebCrawler")).
seimi.crawler.enabled=true
seimi.crawler.names=forOnlyOnceWebCrawler
# NOTE(review): disables the redisson-backed distributed queue; presumably the
# crawler then runs with the framework's local queue — confirm against SeimiCrawler docs.
seimi.crawler.enable-redisson-queue=false
运行结果
数据库获得的数据
上一篇 爬虫不求人,上手跟我做(一)springboot 集成爬虫框架 SeimiCrawler
爬虫不求人,上手跟我做(目录)
下一篇 爬虫不求人,上手跟我做(三)xpath
项目详见附件
0 打赏
打赏 10 积分后可见