Web Crawlers Without Asking for Help: Follow Along (6) Thread-Scheduled Novel Crawling
Crawling Novels with Thread Scheduling
Using 88dush (八八读书网, https://www.88dush.com) as the example site.
Page navigation
We crawl along the recommendation-list dimension:
Each recommendation entry leads to a paginated list spanning many books.
Each page of that list contains multiple books; a book's link leads to that book's complete chapter list (not paginated).
Each chapter link shows that chapter's content.
XPath analysis
Category links: xpath="//body/div[@class='tuijian']/ul/li/h2/a/@href"
Book links: xpath="//div[@class='booklist']/ul/li/span[@class='sm']/a/@href"
Total page count of the book list: xpath="//div[@class='booklist']/div[@class='pagelink']/a[@class='last']/text()"
Pagination URL pattern: https://www.88dush.com/sort1/4/, https://www.88dush.com/sort1/10/
Chapter links of a book: xpath="//div[@class='mulu']/ul/li/a/@href"
Chapter title: xpath="//div[@class='novel']/h1/text()"
Chapter body: xpath="//div[@class='novel']/div[@class='yd_text2']"
You can sanity-check these selectors offline with the snippet below.
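Before wiring the selectors into the crawler, they can be tested with JsoupXpath, the org.seimicrawler.xpath engine behind SeimiCrawler's Response.document(). A minimal sketch, assuming JsoupXpath 2.x (where the static JXDocument.create(...) factory is available); the HTML fragment is made up for illustration:

import org.seimicrawler.xpath.JXDocument;
import java.util.List;

public class XpathSanityCheck {
    public static void main(String[] args) {
        // Illustrative fragment mimicking the recommendation-list markup
        String html = "<body><div class='tuijian'><ul>"
                + "<li><h2><a href='/sort1/'>玄幻小说</a></h2></li>"
                + "<li><h2><a href='/sort2/'>武侠小说</a></h2></li>"
                + "</ul></div></body>";
        JXDocument doc = JXDocument.create(html);
        // Same selector the crawler uses for category links
        List<Object> categories = doc.sel("//body/div[@class='tuijian']/ul/li/h2/a/@href");
        categories.forEach(System.out::println); // prints /sort1/ and /sort2/
    }
}

Since the page URLs follow the pattern https://www.88dush.com/sortN/P/, the crawler below rebuilds the next-page URL from the prefix up to and including sortN/ plus the page counter.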
Implementation
package cn.qiankunpingtai.spider.crawlers;
import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.service.CrawlerService;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import org.seimicrawler.xpath.JXDocument;
import org.springframework.beans.factory.annotation.Autowired;
import java.util.List;
import java.util.Random;
/**
* <pre>
 * Class name: DuShu88NovelCrawler
 * Purpose:
 * Reason created:
 * Created: 2019-04-15 09:07:36
 * Description:
* @author qiankunpingtai
* @version
* @since JDK 1.8
* </pre>
*/
@Crawler(name = "88dush")
public class DuShu88NovelCrawler extends BaseSeimiCrawler {
    private static final long serialVersionUID = 3993378973651481714L;
    @Autowired
    private CrawlerService crawlerService;
    private static final String DOMAIN_URL = "https://www.88dush.com";
    @Override
    public String[] startUrls() {
        return new String[]{DOMAIN_URL};
    }
    @Override
    public void start(Response response) {
        try {
            JXDocument document = response.document();
            List<Object> urlList = document.sel("//body/div[@class='tuijian']/ul/li/h2/a/@href");
            if (urlList.isEmpty()) {
                return;
            }
            logger.info("start thread={}", Thread.currentThread());
            BusinessConstants.threadPoolStart.execute(() -> {
                BusinessConstants.lock.lock();
                try {
                    for (Object url : urlList) {
                        String urlStr = url.toString();
                        if (!urlStr.startsWith(DOMAIN_URL)) {
                            BusinessConstants.CURRENT_START_URL = new StringBuffer(DOMAIN_URL).append(urlStr).toString();
                        } else {
                            BusinessConstants.CURRENT_START_URL = urlStr;
                        }
                        logger.info("start current start url={}", BusinessConstants.CURRENT_START_URL);
                        push(Request.build(BusinessConstants.CURRENT_START_URL, DuShu88NovelCrawler::getEachPage));
                        /**
                         * Block the current thread until it is signalled
                         */
                        try {
                            BusinessConstants.conditionPoolStart.await();
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                    }
                } finally {
                    // Always release the lock, even if an exception escapes the loop
                    BusinessConstants.lock.unlock();
                }
            });
        } catch (Exception e) {
            logger.error("", e);
        }
    }
    public void getEachPage(Response response) {
        try {
            JXDocument document = response.document();
            List<Object> urlList = document.sel("//div[@class='booklist']/ul/li/span[@class='sm']/a/@href");
            /**
             * Fetch the total page count for this category up front;
             * the "last page" link is absent on single-page lists
             */
            List<Object> pageTotal = document.sel("//div[@class='booklist']/div[@class='pagelink']/a[@class='last']/text()");
            if (!pageTotal.isEmpty()) {
                BusinessConstants.CURRENT_TOTAL_PAGE_NUMBER = Integer.valueOf(pageTotal.get(0).toString());
            }
            BusinessConstants.CURRENT_PAGE_NUMBER++;
            BusinessConstants.threadPoolPage.execute(() -> {
                BusinessConstants.lock.lock();
                try {
                    for (Object url : urlList) {
                        String urlStr = url.toString();
                        if (!urlStr.startsWith(DOMAIN_URL)) {
                            BusinessConstants.CURRENT_GET_DATA_URL = new StringBuffer(DOMAIN_URL).append(urlStr).toString();
                        } else {
                            BusinessConstants.CURRENT_GET_DATA_URL = urlStr;
                        }
                        push(Request.build(BusinessConstants.CURRENT_GET_DATA_URL, DuShu88NovelCrawler::getEachBook));
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                        /**
                         * Block the current thread until it is signalled
                         */
                        try {
                            BusinessConstants.conditionPoolPage.await();
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                    }
                    if (BusinessConstants.CURRENT_PAGE_NUMBER >= BusinessConstants.CURRENT_TOTAL_PAGE_NUMBER) {
                        /**
                         * All pages of this category are done: reset the page
                         * counter and wake the start-level thread
                         */
                        BusinessConstants.CURRENT_PAGE_NUMBER = 0;
                        BusinessConstants.conditionPoolStart.signal();
                        return;
                    }
                    String nextPageEndPrefix = BusinessConstants.CURRENT_START_URL.substring(0, BusinessConstants.CURRENT_START_URL.indexOf("sort") + 6);
                    logger.info(nextPageEndPrefix);
                    StringBuffer bf = new StringBuffer(nextPageEndPrefix).append(BusinessConstants.CURRENT_PAGE_NUMBER).append("/");
                    if (logger.isDebugEnabled()) {
                        logger.debug("url={}", bf.toString());
                    }
                    BusinessConstants.CURRENT_START_URL = bf.toString();
                    push(Request.build(BusinessConstants.CURRENT_START_URL, DuShu88NovelCrawler::getEachPage));
                } finally {
                    // Always release the lock, including on the early return above
                    BusinessConstants.lock.unlock();
                }
            });
        } catch (Exception e) {
            logger.error("", e);
            /**
             * On error, wake the parent level; signal() must be called while
             * holding the lock, otherwise it throws IllegalMonitorStateException
             */
            BusinessConstants.lock.lock();
            try {
                BusinessConstants.conditionPoolStart.signal();
            } finally {
                BusinessConstants.lock.unlock();
            }
        }
    }
    public void getEachBook(Response response) {
        try {
            JXDocument document = response.document();
            List<Object> urlList = document.sel("//div[@class='mulu']/ul/li/a/@href");
            BusinessConstants.threadPoolBook.execute(() -> {
                BusinessConstants.lock.lock();
                try {
                    for (Object url : urlList) {
                        String urlStr = url.toString();
                        if (!urlStr.startsWith(BusinessConstants.CURRENT_GET_DATA_URL)) {
                            BusinessConstants.CURRENT_GET_BOOK_DATA_URL = new StringBuffer(BusinessConstants.CURRENT_GET_DATA_URL).append(urlStr).toString();
                        } else {
                            BusinessConstants.CURRENT_GET_BOOK_DATA_URL = urlStr;
                        }
                        push(Request.build(BusinessConstants.CURRENT_GET_BOOK_DATA_URL, DuShu88NovelCrawler::renderBean));
                        try {
                            /**
                             * Sleep 1-2 seconds between requests to avoid being blocked
                             */
                            Thread.sleep(new Random().nextInt(1000) + 1000);
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                        /**
                         * Block the current thread until it is signalled
                         */
                        try {
                            BusinessConstants.conditionPoolBook.await();
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                    }
                    /**
                     * All chapters of this book are done: wake the page-level thread
                     */
                    BusinessConstants.conditionPoolPage.signal();
                } finally {
                    BusinessConstants.lock.unlock();
                }
            });
        } catch (Exception e) {
            logger.error("", e);
            /**
             * On error, wake the parent level; signal() must be called
             * while holding the lock
             */
            BusinessConstants.lock.lock();
            try {
                BusinessConstants.conditionPoolPage.signal();
            } finally {
                BusinessConstants.lock.unlock();
            }
        }
    }
    public void renderBean(Response response) {
        BusinessConstants.lock.lock();
        try {
            crawlerService.saveForDuShu88(response);
        } catch (Exception e) {
            logger.error("", e);
        } finally {
            /**
             * Wake the book-level thread whether or not saving succeeded,
             * then release the lock
             */
            BusinessConstants.conditionPoolBook.signal();
            BusinessConstants.lock.unlock();
        }
    }
}
package cn.qiankunpingtai.spider.service;
import cn.qiankunpingtai.spider.constants.BusinessConstants;
import cn.qiankunpingtai.spider.entities.DuShu88Blog;
import cn.qiankunpingtai.spider.mappers.BlogMapper;
import cn.wanghaomiao.seimi.struct.Response;
import com.vladsch.flexmark.convert.html.FlexmarkHtmlParser;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
/**
* Description
*
* @Author: qiankunpingtai
* @Date: 2019/3/13 20:36
*/
@Service
public class CrawlerService {
    protected Logger logger = LoggerFactory.getLogger(getClass());
    @Resource
    private BlogMapper blogMapper;
    public void saveForDuShu88(Response response) {
        try {
            DuShu88Blog blog = response.render(DuShu88Blog.class);
            logger.info("bean resolve res={},url={}", blog, response.getUrl());
            if (blog != null) {
                /**
                 * create by: qiankunpingtai
                 * create time: 2019/3/22 17:10
                 * website: http://39.105.146.63/symphony/
                 * description:
                 * Convert the HTML content to Markdown
                 */
                String htmlContent = null;
                if (StringUtils.isNotBlank(blog.getContent())) {
                    blog.setStarturl(BusinessConstants.CURRENT_GET_DATA_URL);
                    blog.setUrl(BusinessConstants.CURRENT_GET_BOOK_DATA_URL);
                    htmlContent = blog.getContent();
                    // convert only when the content is non-blank
                    blog.setContent(FlexmarkHtmlParser.parse(htmlContent));
                }
                int changeNum = blogMapper.insert(blog);
                logger.info("store success,blogId = {},changeNum={}", blog.getId(), changeNum);
            }
        } catch (Exception e) {
            logger.error("", e);
        }
    }
}
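The HTML-to-Markdown step leans on flexmark-java's FlexmarkHtmlParser, already imported above from com.vladsch.flexmark.convert.html. A quick standalone sketch of what the conversion does; the chapter fragment is made up:

import com.vladsch.flexmark.convert.html.FlexmarkHtmlParser;

public class Html2MarkdownCheck {
    public static void main(String[] args) {
        // Made-up chapter fragment
        String html = "<h1>Chapter 1</h1><p>First paragraph.</p><p>Second paragraph.</p>";
        // parse(...) returns the Markdown rendering of the HTML input
        String markdown = FlexmarkHtmlParser.parse(html);
        System.out.println(markdown); // prints the Markdown equivalent
    }
}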
package cn.qiankunpingtai.spider.constants;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
* @ClassName:BusinessConstants
 * @Description Business constants: shared state for crawl coordination
* @Author qiankunpingtai
* @Date 2019-3-6 17:58
* @Version 1.0
**/
public class BusinessConstants {
    /**
     * Static variable used to coordinate the current page number globally
     * (initialized to 0)
     */
    public static Integer CURRENT_PAGE_NUMBER = 0;
    /**
     * Static variable recording the total page count
     */
    public static Integer CURRENT_TOTAL_PAGE_NUMBER = 1;
    /**
     * Static variable recording the start url of each round
     */
    public static String CURRENT_START_URL = null;
    /**
     * Static variable recording the url currently being fetched
     */
    public static String CURRENT_GET_DATA_URL = null;
    /**
     * Static variable recording the chapter url currently being fetched for each book
     */
    public static String CURRENT_GET_BOOK_DATA_URL = null;
    /**
     * Static thread pool for the start (category) level
     */
    public static ExecutorService threadPoolStart = Executors.newFixedThreadPool(1);
    /**
     * Static thread pool for the page level
     */
    public static ExecutorService threadPoolPage = Executors.newFixedThreadPool(1);
    /**
     * Static thread pool for the book (chapter) level
     */
    public static ExecutorService threadPoolBook = Executors.newFixedThreadPool(10);
    /**
     * Map holding the running threads
     */
    public static Map<String, Thread> threadMap = new ConcurrentHashMap<String, Thread>();
    /**
     * Shared lock for all levels
     */
    public static Lock lock = new ReentrantLock();
    public static Condition conditionPoolStart = lock.newCondition(); // must be used with lock.lock()
    public static Condition conditionPoolPage = lock.newCondition();  // must be used with lock.lock()
    public static Condition conditionPoolBook = lock.newCondition();  // must be used with lock.lock()
}
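The single ReentrantLock and the three Conditions are what chain the levels together: each level parks on its own condition after pushing work downward, and the level below signals it when done, exactly as the crawler methods above do with conditionPoolStart/Page/Book. A minimal, self-contained sketch of this hand-off pattern with one parent and one child level; all names here are illustrative, not part of the project:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

public class HandOffDemo {
    private static final Lock lock = new ReentrantLock();
    private static final Condition parentCondition = lock.newCondition();
    private static final ExecutorService pool = Executors.newFixedThreadPool(2);

    public static void main(String[] args) {
        // "Parent" level: dispatch one child at a time, then wait to be woken
        pool.execute(() -> {
            lock.lock();
            try {
                for (int i = 0; i < 3; i++) {
                    System.out.println("parent dispatched task " + i);
                    pool.execute(HandOffDemo::childTask);
                    // await() releases the lock while waiting, so the child can acquire it;
                    // production code should re-check a predicate to guard against spurious wakeups
                    parentCondition.await();
                    System.out.println("parent resumed after task " + i);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } finally {
                lock.unlock();
            }
            pool.shutdown();
        });
    }

    // "Child" level: do the work, then wake the parent
    private static void childTask() {
        lock.lock();
        try {
            System.out.println("child finished, signalling parent");
            // signal() requires holding the lock, otherwise IllegalMonitorStateException
            parentCondition.signal();
        } finally {
            lock.unlock();
        }
    }
}

Note that unlock() sits in a finally block and signal() is always called while holding the lock; the IllegalMonitorStateException reports in the comments further down come from violating exactly that rule.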
package cn.qiankunpingtai.spider.entities;
import cn.wanghaomiao.seimi.annotation.Xpath;
import java.util.Date;
/**
* Description
*
* @Author: qiankunpingtai
* @Date: 2019/4/18 22:26
*/
public class DuShu88Blog implements BlogWithBLOBs {
    private Integer id;
    @Xpath("//div[@class='novel']/h1/text()")
    private String title;
    private Date updateTime;
    @Xpath("//div[@class='novel']/div[@class='yd_text2']")
    private String content;
    private String url;
    private String starturl;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Date getUpdateTime() {
        return updateTime;
    }

    public void setUpdateTime(Date updateTime) {
        this.updateTime = updateTime;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getStarturl() {
        return starturl;
    }

    public void setStarturl(String starturl) {
        this.starturl = starturl;
    }
}
SeimiCrawler framework invocation configuration
# Seimi framework: which crawler instances to run
seimi.crawler.enabled=true
seimi.crawler.names=88dush
seimi.crawler.enable-redisson-queue=false
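With seimi.crawler.names pointing at the @Crawler(name = "88dush") instance above, the crawler starts together with the Spring context. A minimal Boot entry point, sketched under the assumption that the project uses SeimiCrawler's Spring Boot integration (the class name here is illustrative):

package cn.qiankunpingtai.spider;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

// Illustrative entry point: with seimi.crawler.enabled=true in
// application.properties, the enabled crawler instances are picked up
// and started when the application context comes up.
@SpringBootApplication
public class SpiderApplication {
    public static void main(String[] args) {
        SpringApplication.run(SpiderApplication.class, args);
    }
}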
Execution process
Data obtained
Previous: Web Crawlers Without Asking for Help: Follow Along (5) Recursively Crawling a Whole Site
Web Crawlers Without Asking for Help: Follow Along (Table of Contents)
Next: Web Crawlers Without Asking for Help: Follow Along (7) Crawling Data Behind an Image-CAPTCHA Login
The full project is attached.
This spot is missing a variable reset. Add the following line:
BusinessConstants.CURRENT_PAGE_NUMBER=0;
Modified accordingly.
The novel metadata under each category is not crawled in detail; how should the novel info be fetched? Only chapter data is collected, so how can you tell which novel a chapter belongs to?
The initial page number starts from 0.
You can implement that yourself; it is quite simple!
I am not quite clear on the logic. Could you explain it in more detail?
An error occurs: java.lang.IllegalMonitorStateException: null
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.signal(AbstractQueuedSynchronizer.java:1939)
at com.potato369.novel.crawler.crawlers.DuShu88NovelCrawler.renderChapterBean(DuShu88NovelCrawler.java:452)
at cn.wanghaomiao.seimi.core.SeimiProcessor.doLambdaCallback(SeimiProcessor.java:177)
at cn.wanghaomiao.seimi.core.SeimiProcessor.run(SeimiProcessor.java:116)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
2019-04-23 21:41:28,487 - 1652879 [pool-2-thread-20] ERROR c.w.seimi.core.SeimiProcessor - null
java.lang.IllegalMonitorStateException: null
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.signal(AbstractQueuedSynchronizer.java:1939)
at com.potato369.novel.crawler.crawlers.DuShu88NovelCrawler.renderChapterBean(DuShu88NovelCrawler.java:457)
at cn.wanghaomiao.seimi.core.SeimiProcessor.doLambdaCallback(SeimiProcessor.java:177)
at cn.wanghaomiao.seimi.core.SeimiProcessor.run(SeimiProcessor.java:116)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
This is the method that throws the exception:
public void renderChapterBean(Response response) {
    Chapter chapter = null;
    try {
        if (log.isDebugEnabled()) {
            log.debug("[crawler backend] start crawling the chapter data on this page");
        }
        BusinessConstants.lock.lock();
        try {
            chapter = response.render(Chapter.class);
            chapter.setIndex(BusinessConstants.CURRENT_CHAPTER_INDEX);
            if (log.isDebugEnabled()) {
                log.debug("bean resolve res={}, url={}", chapter, response.getUrl());
            }
            NovelChapter novelChapter = NovelChapter.builder().build();
            if (chapter != null) {
                chapter.setId(UUIDUtil.gen32UUID());
                String htmlContent = null;
                if (StringUtils.isNotBlank(chapter.getContent())) {
                    chapter.setStarturl(BusinessConstants.CURRENT_GET_DATA_URL);
                    chapter.setUrl(BusinessConstants.CURRENT_GET_BOOK_DATA_URL);
                    htmlContent = chapter.getContent();
                    // convert only when the content is non-blank
                    chapter.setContent(FlexmarkHtmlParser.parse(htmlContent));
                }
                BeanUtils.copyProperties(chapter, novelChapter);
                String title = chapter.getNovelName();
                String categoryText = chapter.getCategoryCNText();
                NovelInfo novelInfo = novelInfoServiceReader.findByTitleAndCategoryText(title, categoryText);
                if (novelInfo != null) {
                    novelChapter.setBookId(novelInfo.getId());
                    novelChapter.setNewestChapterTitle(novelInfo.getNewestChapterTitle());
                } else {
                    novelChapter.setBookId(UUIDUtil.gen32UUID());
                    novelChapter.setNewestChapterTitle("最后一章出错了");
                }
                NovelChapter novelChapterTemp = chapterServiceWriter.save(novelChapter);
                String lastChapterName = null;
                if (novelChapterTemp != null) {
                    lastChapterName = novelChapterTemp.getNewestChapterTitle();
                    List<NovelChapter> chapterList = chapterServiceReader.findByChaperTitle(lastChapterName);
                    if (chapterList != null && chapterList.size() > 0) {
                        NovelChapter lastChapter = chapterList.get(0);
                        String lastChapterId = lastChapter.getId();
                        String bookId = lastChapter.getBookId();
                        NovelInfo novelInfo2 = novelInfoServiceReader.find(bookId);
                        if (novelInfo2 != null) {
                            novelInfo2.setNewestChapterId(lastChapterId);
                            novelInfoServiceWriter.update(novelInfo2);
                            if (log.isDebugEnabled()) {
                                log.debug("[crawler backend] newest chapter id updated, data={}", lastChapterId);
                            }
                        }
                    }
                }
                if (log.isDebugEnabled()) {
                    log.debug("covert copy store success in db, novel chapter info={}", novelChapterTemp);
                }
            }
            if (BusinessConstants.CURRENT_CHAPTER_INDEX >= BusinessConstants.CURRENT_TOTAL_CHAPTERS) {
                // all chapters of the current book are done, wake the parent level
                BusinessConstants.CURRENT_CHAPTER_INDEX = BusinessConstants.CURRENT_TOTAL_CHAPTERS; // reset this variable to 0
                BusinessConstants.CURRENT_CHAPTER_INDEX = 0;
                BusinessConstants.conditionPoolStart.signal();
                BusinessConstants.lock.unlock(); // the lock must be released manually after signal()
            }
        } catch (Exception e) {
            log.error("[admin] failed to crawl novel chapter data", e);
        } finally {
            if (log.isDebugEnabled()) {
                log.debug("render success chapter info={}", chapter);
            }
        }
        // wake the parent-level thread
        BusinessConstants.conditionPoolBook.signal();
        BusinessConstants.lock.unlock();
    } catch (Exception e) {
        log.error("[crawler backend] error while crawling the chapter data on this page", e);
        // an exception occurred, wake the parent level
        BusinessConstants.conditionPoolBook.signal();
        BusinessConstants.lock.unlock();
    } finally {
        if (log.isDebugEnabled()) {
            log.debug("[crawler backend] finished crawling the chapter data on this page");
        }
    }
}
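Judging from the posted code, the likely cause of the IllegalMonitorStateException: when the last chapter is reached, the method calls lock.unlock() inside the if block and then still falls through to conditionPoolBook.signal() and a second lock.unlock(); Condition.signal() (and unlock()) throw IllegalMonitorStateException whenever the calling thread no longer owns the lock. One way to keep the original intent while holding the lock for every signal and unlocking exactly once, sketched with the original names:

// Sketch of a reworked locking skeleton for renderChapterBean
BusinessConstants.lock.lock();
try {
    // ... render the chapter and persist it, as in the original body ...
    if (BusinessConstants.CURRENT_CHAPTER_INDEX >= BusinessConstants.CURRENT_TOTAL_CHAPTERS) {
        BusinessConstants.CURRENT_CHAPTER_INDEX = 0;   // reset for the next book
        BusinessConstants.conditionPoolStart.signal(); // wake the start level (lock still held)
    } else {
        BusinessConstants.conditionPoolBook.signal();  // wake the book level (lock still held)
    }
} finally {
    BusinessConstants.lock.unlock(); // exactly one unlock per lock()
}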
Full source code is available on GitHub: https://github.com/Wangyanjunai/novel.git (crawler module: https://github.com/Wangyanjunai/novel/tree/master/novel_crawler)