Web Scraping Without Asking for Help, Follow Along with Me (4): Multi-Page Data Retrieval
Multi-page data retrieval
Taking the cnblogs blogger zhuiluoyu (坠落鱼) as the example site.
Page URL analysis:
Page 1:
url: https://www.cnblogs.com/zhuiluoyu/
Page 2:
url: https://www.cnblogs.com/zhuiluoyu/default.html?page=2
Page 3:
url: https://www.cnblogs.com/zhuiluoyu/default.html?page=3
Conclusion:
The crawl starts at https://www.cnblogs.com/zhuiluoyu/; paging keeps that base URL and appends default.html?page=N to move to page N.
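To make the pagination rule concrete, here is a minimal sketch that assembles the first three page URLs. The PageUrlDemo class and the buildPageUrl helper are illustrative names only, not part of the project code.

public class PageUrlDemo {
    private static final String START_URL = "https://www.cnblogs.com/zhuiluoyu/";

    // Hypothetical helper: page 1 is the bare start URL,
    // later pages append default.html?page=N to the same base.
    static String buildPageUrl(int page) {
        return page <= 1 ? START_URL : START_URL + "default.html?page=" + page;
    }

    public static void main(String[] args) {
        for (int page = 1; page <= 3; page++) {
            System.out.println(buildPageUrl(page));
        }
    }
}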
Analysis of the jump from each list page to the content pages:
The key XPath for the link URLs
Looking at the page structure, the path is div[@class='forFlow'], then its child div[@class='day'], then its child div[@class='postTitle'], and finally the href attribute of the a tag.
Every other branch can be ignored; all we need is the href link path.
So xpath="//div[@class='forFlow']/div[@class='day']/div[@class='postTitle']/a/@href"
Of course, the XPath can be written in many other ways; any expression that locates the href attribute will do (a small verification sketch follows the content-page analysis below).
Content page XPath analysis:
Title
The text() of the a tag that is a child of h1[@class='postTitle'].
So xpath="//h1[@class='postTitle']/a/text()"
Content
Everything inside div[@class='blogpost-body'] under div[@class='postBody'] is the post content.
So xpath="//div[@class='postBody']/div[@class='blogpost-body']"
Data crawling
ForSomeWebCrawler
package cn.qiankunpingtai.spider.crawlers;

import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.service.CrawlerService;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.seimicrawler.xpath.JXDocument;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;

import java.util.List;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/25 14:05
 */
@Crawler(name = "forSomeWebCrawler")
public class ForSomeWebCrawler extends BaseSeimiCrawler {
    @Autowired
    private CrawlerService crawlerService;
    private String startUrl = "https://www.cnblogs.com/zhuiluoyu/";

    /**
     * create by: qiankunpingtai
     * create time: 2019/4/14 21:55
     * website: https://qiankunpingtai.cn
     * description:
     * The start URL, executed once when the crawler is initialized.
     */
    @Override
    public String[] startUrls() {
        String[] urls = new String[]{startUrl};
        BusinessConstants.CURRENT_START_URL = urls[0];
        return urls;
    }

    @Override
    public void start(Response response) {
        JXDocument doc = response.document();
        try {
            List<Object> urls = doc.sel("//div[@class='forFlow']/div[@class='day']/div[@class='postTitle']/a/@href");
            if (urls.size() < 1) {
                return;
            }
            logger.info("{}", urls.size());
            for (Object s : urls) {
                /**
                 * create by: qiankunpingtai
                 * create time: 2019/4/14 22:01
                 * website: https://qiankunpingtai.cn
                 * description:
                 * Sleep for one second between requests to avoid being blocked
                 * by the firewall for visiting too frequently.
                 */
                Thread.sleep(1000);
                BusinessConstants.CURRENT_GET_DATA_URL = s.toString();
                push(Request.build(s.toString(), ForSomeWebCrawler::renderBean));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void renderBean(Response response) {
        try {
            crawlerService.saveForSomeWeb(response);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * create by: qiankunpingtai
     * create time: 2019/4/14 21:56
     * website: https://qiankunpingtai.cn
     * description:
     * After the start URL has been visited once,
     * this method runs every 60 seconds.
     */
    @Scheduled(cron = "0/60 * * * * ?")
    public void callByCron() {
        logger.info("I am a scheduler driven by a cron expression, firing every 60 seconds");
        StringBuffer bf = new StringBuffer(startUrl);
        bf.append("default.html");
        bf.append("?page=");
        bf.append(BusinessConstants.CURRENT_PAGE_NUMBER);
        System.out.println("Current page number: " + BusinessConstants.CURRENT_PAGE_NUMBER);
        BusinessConstants.CURRENT_START_URL = bf.toString();
        System.out.println("Current start url: " + BusinessConstants.CURRENT_START_URL);
        BusinessConstants.CURRENT_PAGE_NUMBER += 1;
        // A Request can also be pushed on a schedule.
        /**
         * create by: qiankunpingtai
         * create time: 2019/4/14 21:57
         * website: https://qiankunpingtai.cn
         * description:
         * setSkipDuplicateFilter(true) tells the framework to skip URL de-duplication,
         * i.e. a URL that was already visited will be visited again. If revisiting
         * duplicates is not wanted, omit this call: SeimiCrawler's default rule is to
         * drop duplicate URLs. The flag is required whenever data must be re-crawled
         * from the same page. Note that it only applies to the current step; if the
         * next step should also skip de-duplication, the same setting must be added
         * there as well.
         */
        push(Request.build(BusinessConstants.CURRENT_START_URL, "start").setSkipDuplicateFilter(true));
    }
}
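The crawler keeps its cross-request state in BusinessConstants, which this post does not list. A minimal sketch of what such a holder might look like, based only on the three fields referenced above (the initial page number of 2 is an assumption, since page 1 is already covered by the start URL):

package cn.qiankunpingtai.spider.constants;

/**
 * Hypothetical sketch: shared mutable state used by the crawler and the service.
 * Only the fields referenced in ForSomeWebCrawler and CrawlerService are included.
 */
public class BusinessConstants {
    // The list-page URL currently being crawled
    public static String CURRENT_START_URL;
    // The detail-page URL currently being crawled
    public static String CURRENT_GET_DATA_URL;
    // The next page number to request; starting at 2 is an assumption
    public static int CURRENT_PAGE_NUMBER = 2;
}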
CrawlerService
package cn.qiankunpingtai.spider.service;

import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.entities.ForSomeWebBlog;
import cn.qiankunpingtai.spider.mappers.BlogMapper;
import cn.wanghaomiao.seimi.struct.Response;
import com.vladsch.flexmark.convert.html.FlexmarkHtmlParser;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

import javax.annotation.Resource;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/13 20:36
 */
@Service
public class CrawlerService {
    protected Logger logger = LoggerFactory.getLogger(getClass());
    @Resource
    private BlogMapper blogMapper;

    public void saveForSomeWeb(Response response) {
        try {
            ForSomeWebBlog blog = response.render(ForSomeWebBlog.class);
            logger.info("bean resolve res={},url={}", blog, response.getUrl());
            if (blog != null) {
                /**
                 * create by: qiankunpingtai
                 * create time: 2019/3/22 17:10
                 * website: https://qiankunpingtai.cn
                 * description:
                 * Convert the HTML to Markdown.
                 */
                String htmlContent = null;
                if (StringUtils.isNotBlank(blog.getContent())) {
                    blog.setStarturl(BusinessConstants.CURRENT_START_URL);
                    blog.setUrl(BusinessConstants.CURRENT_GET_DATA_URL);
                    htmlContent = blog.getContent();
                    // Convert only when the content is not blank
                    blog.setContent(FlexmarkHtmlParser.parse(htmlContent));
                    int changeNum = blogMapper.insert(blog);
                    logger.info("store success,blogId = {},changeNum={}", blog.getId(), changeNum);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
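The record is persisted through BlogMapper, whose source is also not shown in this post. A minimal sketch of the interface, assuming a single insert method matching the call above; the real SQL lives in the XML files under mapper_xml, and the @Mapper annotation is likewise an assumption about how the project registers its mappers:

package cn.qiankunpingtai.spider.mappers;

import cn.qiankunpingtai.spider.entities.ForSomeWebBlog;
import org.apache.ibatis.annotations.Mapper;

/**
 * Hypothetical sketch of the MyBatis mapper used by CrawlerService.
 * The actual statements are defined in classpath:./mapper_xml/*.xml.
 */
@Mapper
public interface BlogMapper {
    // Inserts one blog record and returns the number of affected rows
    int insert(ForSomeWebBlog blog);
}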
ForSomeWebBlog
package cn.qiankunpingtai.spider.entities;

import cn.wanghaomiao.seimi.annotation.Xpath;

import java.util.Date;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/25 14:09
 */
public class ForSomeWebBlog implements BlogWithBLOBs {
    private Integer id;
    @Xpath("//h1[@class='postTitle']/a/text()")
    private String title;
    private Date updateTime;
    @Xpath("//div[@class='postBody']/div[@class='blogpost-body']")
    private String content;
    private String url;
    private String starturl;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Date getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Date updateTime) {
        this.updateTime = updateTime;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getStarturl() {
        return starturl;
    }

    public void setStarturl(String starturl) {
        this.starturl = starturl;
    }
}
SeimiCrawler framework configuration
# Data source configuration
spring.datasource.url=jdbc:mysql://192.168.1.103/spider?useUnicode=yes&characterEncoding=UTF-8&useInformationSchema=true&useSSL=false&serverTimezone=UTC
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
spring.datasource.username=root
spring.datasource.password=123456
# Alibaba Druid database connection pool
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
# MyBatis configuration
mybatis.mapper-locations=classpath:./mapper_xml/*.xml
# SeimiCrawler crawler instance configuration
seimi.crawler.enabled=true
seimi.crawler.names=forSomeWebCrawler
seimi.crawler.enable-redisson-queue=false
Execution results
Data retrieved
Previous: Web Scraping Without Asking for Help, Follow Along with Me (3) XPath
Web Scraping Without Asking for Help, Follow Along with Me (Table of Contents)
Next: Web Scraping Without Asking for Help, Follow Along with Me (5) Recursively Crawling an Entire Site
See the attachment for the full project implementation: