Web Crawling Without Asking for Help, Follow Along with Me (5): Recursively Crawling an Entire Site
Recursively crawling an entire site
We use an open-source blog system as the example.
Page navigation
We crawl the whole site along the blog-category dimension.
Clicking into each category leads to a paginated list.
Each page of that list contains several articles, and each article link leads to the full content.
XPath analysis
Category links: xpath="//div[@class='col-md-3']/div[@class='data_list']/div[@class='datas']/ul/li/span/a/@href"
Article links on each list page: xpath="//div[@class='col-md-9']/div[@class='data_list']/div[@class='datas']/ul/li/span[@class='title']/a/@href"
Pagination URL pattern: http://blog.java1234.com/index.html?page=3&typeId=1&
Article title: xpath="//div[@class='blog_title']/h3/strong/text()"
Article content: xpath="//div[@class='blog_content']"
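Before wiring these expressions into the crawler, it can help to check them against a live page. Below is a minimal sketch (not part of the original project) that assumes JsoupXpath, the XPath engine behind SeimiCrawler's response.document(), and Jsoup are on the classpath; the class name XpathSmokeTest is just an illustration.

import org.jsoup.Jsoup;
import org.seimicrawler.xpath.JXDocument;

import java.util.List;

public class XpathSmokeTest {
    public static void main(String[] args) throws Exception {
        // fetch the home page and evaluate the two list XPaths against it
        String html = Jsoup.connect("http://blog.java1234.com/index.html")
                .userAgent("Mozilla/5.0")
                .get()
                .html();
        JXDocument doc = JXDocument.create(html);

        // category links in the left column, e.g. /index.html?typeId=1
        List<Object> categories = doc.sel(
                "//div[@class='col-md-3']/div[@class='data_list']/div[@class='datas']/ul/li/span/a/@href");
        categories.forEach(System.out::println);

        // article links in the right column, e.g. /blog/articles/312.html
        List<Object> articles = doc.sel(
                "//div[@class='col-md-9']/div[@class='data_list']/div[@class='datas']/ul/li/span[@class='title']/a/@href");
        articles.forEach(System.out::println);
    }
}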
Implementation
package cn.qiankunpingtai.spider.crawlers;

import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.service.CrawlerService;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.seimicrawler.xpath.JXDocument;
import org.springframework.beans.factory.annotation.Autowired;

import java.util.List;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/28 9:26
 */
@Crawler(name = "java1234WithScheduler")
public class Java1234WithScheduler extends BaseSeimiCrawler {
    @Autowired
    private CrawlerService crawlerService;

    @Override
    public String[] startUrls() {
        return new String[]{"http://blog.java1234.com/index.html"};
    }

    @Override
    public void start(Response response) {
        JXDocument doc = response.document();
        try {
            // category links in the left column, e.g. /index.html?typeId=1
            List<Object> urls = doc.sel("//div[@class='col-md-3']/div[@class='data_list']/div[@class='datas']/ul/li/span/a/@href");
            if (urls.size() < 1) {
                return;
            }
            logger.info("{}", urls.size());
            for (Object s : urls) {
                String str = s.toString();
                // stop once a link without a typeId parameter shows up: it is not a category link
                if (str.indexOf("typeId") == -1) {
                    return;
                }
                System.out.println(str);
                Thread.sleep(1000);
                // relative links need the host prepended
                if (!str.startsWith("http://blog.java1234.com")) {
                    BusinessConstants.CURRENT_START_URL = new StringBuffer("http://blog.java1234.com").append(str).toString();
                } else {
                    BusinessConstants.CURRENT_START_URL = str;
                }
                System.out.println(BusinessConstants.CURRENT_START_URL);
                push(Request.build(BusinessConstants.CURRENT_START_URL, Java1234WithScheduler::getEachPage));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void getEachPage(Response response) {
        JXDocument doc = response.document();
        try {
            // article links on the current list page, e.g. /blog/articles/312.html
            List<Object> urls = doc.sel("//div[@class='col-md-9']/div[@class='data_list']/div[@class='datas']/ul/li/span[@class='title']/a/@href");
            if (urls.size() < 1) {
                // no articles on this page: the category is exhausted, reset the page counter
                BusinessConstants.CURRENT_PAGE_NUMBER = 1;
                return;
            }
            BusinessConstants.CURRENT_PAGE_NUMBER += 1;
            logger.info("{}", urls.size());
            for (Object s : urls) {
                String str = s.toString();
                System.out.println(str);
                Thread.sleep(1000);
                if (!str.startsWith("http://blog.java1234.com")) {
                    BusinessConstants.CURRENT_GET_DATA_URL = new StringBuffer("http://blog.java1234.com").append(str).toString();
                } else {
                    BusinessConstants.CURRENT_GET_DATA_URL = str;
                }
                System.out.println(BusinessConstants.CURRENT_GET_DATA_URL);
                push(Request.build(BusinessConstants.CURRENT_GET_DATA_URL, Java1234WithScheduler::renderBean));
            }
            // given URL:  http://blog.java1234.com/index.html?typeId=1
            // target URL: http://blog.java1234.com/index.html?page=2&typeId=1&
            String nextPageEndPrefix = BusinessConstants.CURRENT_START_URL.substring(BusinessConstants.CURRENT_START_URL.indexOf("typeId"));
            StringBuffer bf = new StringBuffer(startUrls()[0]).append("?page=")
                    .append(BusinessConstants.CURRENT_PAGE_NUMBER).append("&").append(nextPageEndPrefix).append("&");
            /**
             * create by: qiankunpingtai
             * create time: 2019/4/15 12:01
             * website: https://qiankunpingtai.cn
             * description:
             * recursively push the next page of the current category
             */
            push(Request.build(bf.toString(), Java1234WithScheduler::getEachPage));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void renderBean(Response response) {
        try {
            crawlerService.saveJava1234(response);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
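The crawler keeps its cross-request state in BusinessConstants, which is not listed in this post. A minimal sketch of what it needs to contain, inferred from the references above (field types and initial values are assumptions):

package cn.qiankunpingtai.spider.constants;

/**
 * Shared mutable state used by the crawler callbacks above.
 * Not shown in the original article; fields are inferred from usage.
 */
public class BusinessConstants {
    // list-page URL of the category currently being crawled
    public static String CURRENT_START_URL;
    // detail-page URL of the article currently being fetched
    public static String CURRENT_GET_DATA_URL;
    // page counter for the category currently being paged through
    public static int CURRENT_PAGE_NUMBER = 1;
}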
package cn.qiankunpingtai.spider.service;

import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.entities.Java1234Blog;
import cn.qiankunpingtai.spider.mappers.BlogMapper;
import cn.wanghaomiao.seimi.struct.Response;
import com.vladsch.flexmark.convert.html.FlexmarkHtmlParser;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

import javax.annotation.Resource;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/13 20:36
 */
@Service
public class CrawlerService {
    protected Logger logger = LoggerFactory.getLogger(getClass());

    @Resource
    private BlogMapper blogMapper;

    public void saveJava1234(Response response) {
        try {
            Java1234Blog blog = response.render(Java1234Blog.class);
            logger.info("bean resolve res={},url={}", blog, response.getUrl());
            if (blog != null) {
                /**
                 * create by: qiankunpingtai
                 * create time: 2019/4/15 11:48
                 * website: https://qiankunpingtai.cn
                 * description:
                 * convert the HTML content to Markdown
                 */
                String htmlContent = null;
                if (StringUtils.isNotBlank(blog.getContent())) {
                    blog.setStarturl(BusinessConstants.CURRENT_START_URL);
                    blog.setUrl(BusinessConstants.CURRENT_GET_DATA_URL);
                    htmlContent = blog.getContent();
                    // only convert when the content is not blank
                    blog.setContent(FlexmarkHtmlParser.parse(htmlContent));
                    int changeNum = blogMapper.insert(blog);
                    logger.info("store success,blogId = {},changeNum={}", blog.getId(), changeNum);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
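BlogMapper is likewise not listed here. If you are rebuilding the project without the attachment, a minimal MyBatis-style mapper such as the sketch below would satisfy the single insert call above; the annotation-based mapping, table name, and column names are assumptions, and the original project may well use an XML mapper instead.

package cn.qiankunpingtai.spider.mappers;

import cn.qiankunpingtai.spider.entities.Java1234Blog;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Options;

/**
 * Sketch of the mapper used by CrawlerService; not the original source.
 * Table and column names are assumptions.
 */
@Mapper
public interface BlogMapper {
    @Insert("insert into blog (title, content, url, starturl) " +
            "values (#{title}, #{content}, #{url}, #{starturl})")
    @Options(useGeneratedKeys = true, keyProperty = "id")
    int insert(Java1234Blog blog);
}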
package cn.qiankunpingtai.spider.entities;

import cn.wanghaomiao.seimi.annotation.Xpath;

import java.util.Date;

/**
 * Description
 *
 * @Author: qiankunpingtai
 * @Date: 2019/3/28 9:31
 */
public class Java1234Blog implements BlogWithBLOBs {
    private Integer id;

    // bound by response.render(): article title
    @Xpath("//div[@class='blog_title']/h3/strong/text()")
    private String title;

    private Date updateTime;

    // bound by response.render(): article body as HTML (converted to Markdown before storing)
    @Xpath("//div[@class='blog_content']")
    private String content;

    private String url;
    private String starturl;

    public Integer getId() {
        return id;
    }
    public void setId(Integer id) {
        this.id = id;
    }
    public String getTitle() {
        return title;
    }
    public void setTitle(String title) {
        this.title = title;
    }
    public Date getUpdateTime() {
        return updateTime;
    }
    public void setUpdateTime(Date updateTime) {
        this.updateTime = updateTime;
    }
    public String getContent() {
        return content;
    }
    public void setContent(String content) {
        this.content = content;
    }
    public String getUrl() {
        return url;
    }
    public void setUrl(String url) {
        this.url = url;
    }
    public String getStarturl() {
        return starturl;
    }
    public void setStarturl(String starturl) {
        this.starturl = starturl;
    }
}
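The BlogWithBLOBs interface the entity implements is not shown in this post either. Its name suggests a MyBatis-Generator-style base type; if you only need the listing above to compile without the attached project, an empty marker interface like the following sketch is enough. This is an assumption, not the original definition.

package cn.qiankunpingtai.spider.entities;

/**
 * Placeholder sketch; the real interface ships with the attached project.
 */
public interface BlogWithBLOBs {
}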
SeimiCrawler framework configuration
# which crawler instances the SeimiCrawler framework should run
seimi.crawler.enabled=true
seimi.crawler.names=java1234WithScheduler
seimi.crawler.enable-redisson-queue=false
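Note that the value of seimi.crawler.names must match the name declared in @Crawler(name = "java1234WithScheduler"). The crawler is started through an ordinary Spring Boot entry point; below is a minimal sketch, assuming the SeimiCrawler Spring Boot starter is on the classpath (the class name SpiderApplication is an assumption).

package cn.qiankunpingtai.spider;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

// With the SeimiCrawler spring-boot starter, the crawlers listed in
// seimi.crawler.names are started when the application boots.
@SpringBootApplication
public class SpiderApplication {
    public static void main(String[] args) {
        SpringApplication.run(SpiderApplication.class, args);
    }
}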
Execution process
Retrieved data
Previous: Web Crawling Without Asking for Help, Follow Along with Me (4) Fetching Data Across Multiple Pages
Web Crawling Without Asking for Help, Follow Along with Me (table of contents)
Next: Web Crawling Without Asking for Help, Follow Along with Me (6) Crawling Novels with Thread Scheduling
The full project is provided as an attachment.