爬虫辅助工具 SeimiAgent
(一)作用
简单点说,有些页面元素是靠 js 动态加载的,我们在使用爬虫时无法直接取到这些动态加载的 DOM 元素。而使用 SeimiAgent 工具以后,它会在我们取 DOM 元素之前,预先将 js 动态加载的部分加载出来,这样我们就可以像取普通 DOM 元素一样进行处理。
(二)安装
1、先去 SeimiAgent 官网下载最新的工具包
2、在 CentOS 上安装编译及运行所需的依赖
yum -y install gcc gcc-c++ make flex bison gperf ruby openssl-devel freetype-devel fontconfig-devel libicu-devel sqlite-devel libpng-devel libjpeg-devel
3、在防火墙中开放 8000 端口(供远程访问)
下面的演示基于 CentOS 7 系统(使用 firewalld 管理防火墙),与 CentOS 6.5(使用 iptables)的操作方式不同
//开启端口
firewall-cmd --zone=public --add-port=8000/tcp --permanent
//重启防火墙
firewall-cmd --reload
4、解压安装包并启动 SeimiAgent
[root@localhost opt]# tar -zxvf seimiagent_linux_v1.3.1_x86_64.tar.gz
seimiagent_v1.3.1/
seimiagent_v1.3.1/bin/
seimiagent_v1.3.1/bin/seimiagent
seimiagent_v1.3.1/README.md
seimiagent_v1.3.1/zh.md
seimiagent_v1.3.1/LICENSE.md
[root@localhost opt]# ls
seimiagent_linux_v1.3.1_x86_64.tar.gz seimiagent_v1.3.1
[root@localhost opt]# cd seimiagent_v1.3.1/
[root@localhost seimiagent_v1.3.1]# ls
bin LICENSE.md README.md zh.md
[root@localhost seimiagent_v1.3.1]# cd bin
[root@localhost bin]# ls
seimiagent
[root@localhost bin]# ./seimiagent -p 8000
./seimiagent: error while loading shared libraries: libfontconfig.so.1: cannot open shared object file: No such file or directory
[root@localhost bin]# yum install libfontconfig.so.1
Loaded plugins: fastestmirror
Determining fastest mirrors
* base: mirrors.cqu.edu.cn
* extras: mirrors.cqu.edu.cn
* updates: mirrors.njupt.edu.cn
base | 3.6 kB 00:00:00
extras | 3.4 kB 00:00:00
updates | 3.4 kB 00:00:00
extras/7/x86_64/primary_db FAILED ] 0.0 B/s | 0 B --:--:-- ETA
http://mirrors.cqu.edu.cn/CentOS/7.6.1810/extras/x86_64/repodata/43c71026fcdefd8e9770eeb304ad82573ce6c0364172a9e07ef088ead33394e6-primary.sqlite.bz2: [Errno 14] curl#56 - "Recv failure: Connection reset by peer"
Trying other mirror.
(1/4): base/7/x86_64/group_gz | 166 kB 00:00:00
(2/4): updates/7/x86_64/primary_db | 3.3 MB 00:00:04
(3/4): base/7/x86_64/primary_db | 6.0 MB 00:00:04
(4/4): extras/7/x86_64/primary_db
[root@localhost bin]# ./seimiagent -p 8000
./seimiagent: error while loading shared libraries: libfontconfig.so.1: cannot open shared object file: No such file or directory
[root@localhost lib]# cp /usr/lib/libfontconfig.so.1 /usr/local/lib/libfontconfig.so.1
[root@localhost lib]# vim /etc/ld.so.conf.d/usr-libs.conf
/usr/local/lib
[root@localhost lib]# ldconfig
[root@localhost bin]# ./seimiagent -p 8000
下载 libfontconfig 安装包
http://rpmfind.net/linux/rpm2html/search.php?query=libfontconfig.so.1()(64bit)
[root@localhost opt]# ls
fontconfig-2.13.0-4.3.el7.x86_64.rpm seimiagent_linux_v1.3.1_x86_64.tar.gz seimiagent_v1.3.1
[root@localhost opt]# rpm -ivh fontconfig-2.13.0-4.3.el7.x86_64.rpm
Preparing... ################################# [100%]
Updating / installing...
1:fontconfig-2.13.0-4.3.el7 ################################# [100%]
[root@localhost opt]# cd seimiagent_v1.3.1/bin/
[root@localhost bin]# ./seimiagent
[seimi] SeimiAgent started,listening on : 8000
[root@localhost bin]# nohup ./seimiagent &
[1] 4344
[root@localhost bin]# lsof -i:8000
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
seimiagen 4344 root 4u IPv4 42399 0t0 TCP *:irdmi (LISTEN)