摘要: 原创出处 www.jianshu.com/p/cfead4b3e34e 「jessehua」欢迎转载,保留摘要,谢谢!
WebMagic 是一个开源的 Java 爬虫框架。
WebMagic 框架的使用并不是本文的重点,具体用法请参考官方文档:http://webmagic.io/docs/ 。
本文整合了 Spring Boot、WebMagic 和 MyBatis:使用 WebMagic 爬取数据,再通过 MyBatis 将爬取到的数据持久化到 MySQL 数据库。
本文提供的源代码可以作为 Java 爬虫项目的脚手架。

1.添加maven依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>hyzx</groupId>
    <artifactId>qbasic-crawler</artifactId>
    <version>1.0.0</version>

    <!-- Inherit dependency and plugin management from Spring Boot. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.21.RELEASE</version>
        <relativePath/>
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.test.skip>true</maven.test.skip>
        <java.version>1.8</java.version>
        <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
        <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>

        <!-- Versions of dependencies declared below. -->
        <mysql.connector.version>5.1.47</mysql.connector.version>
        <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
        <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
        <fastjson.version>1.2.58</fastjson.version>
        <commons.lang3.version>3.9</commons.lang3.version>
        <joda.time.version>2.10.2</joda.time.version>
        <webmagic.core.version>0.7.3</webmagic.core.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>

        <!-- Persistence: MySQL driver, Druid connection pool, MyBatis. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>${druid.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>${mybatis.spring.boot.starter.version}</version>
        </dependency>

        <!-- Utilities. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>${commons.lang3.version}</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>${joda.time.version}</version>
        </dependency>

        <!-- Crawler framework; its slf4j-log4j12 binding is excluded from the classpath. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.core.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven.compiler.plugin.version}</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>${maven.resources.plugin.version}</version>
                <configuration>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <fork>true</fork>
                    <addResources>true</addResources>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>repackage</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <!--
        The mirror is addressed over HTTPS: Maven 3.8.1 and later block
        plain-HTTP external repositories by default, which would make
        dependency resolution fail with the original http:// URLs.
    -->
    <repositories>
        <repository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </pluginRepository>
    </pluginRepositories>
</project>
|
2.项目配置文件 application.properties
配置 MySQL 数据源、Druid 数据库连接池以及 MyBatis mapper 文件的位置。
# MySQL data source
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://192.168.0.63:3306/gjhzjl?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
spring.datasource.username=root
spring.datasource.password=root

# Druid connection pool
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
spring.datasource.druid.max-wait=60000
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
spring.datasource.druid.test-while-idle=true
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000

# MyBatis: where the mapper XML files are loaded from
mybatis.mapperLocations=classpath:mapper/**/*.xml
|
3.数据库表结构
-- Table that receives one row per crawled article (see the insert in CrawlerMapper.xml).
CREATE TABLE `cms_content` (
  `contentId`   varchar(40)  NOT NULL COMMENT '内容ID',
  `title`       varchar(150) NOT NULL COMMENT '标题',
  `content`     longtext              COMMENT '文章内容',
  `releaseDate` datetime     NOT NULL COMMENT '发布日期',
  PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS内容表';
|
4.实体类
import java.util.Date;
/**
 * Persistent object mirroring one row of the {@code cms_content} table.
 *
 * <p>A plain mutable bean: every property has a getter and a setter and
 * starts out as {@code null}.
 */
public class CmsContentPO {

    /** Primary key of the content row. */
    private String contentId;

    /** Article title. */
    private String title;

    /** Article body. */
    private String content;

    /** Release date of the article. */
    private Date releaseDate;

    /** @return the content id, or {@code null} if not set */
    public String getContentId() {
        return contentId;
    }

    /** @param contentId the content id to store */
    public void setContentId(String contentId) {
        this.contentId = contentId;
    }

    /** @return the title, or {@code null} if not set */
    public String getTitle() {
        return title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article body, or {@code null} if not set */
    public String getContent() {
        return content;
    }

    /** @param content the article body to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the release date (the same instance that was set), or {@code null} */
    public Date getReleaseDate() {
        return releaseDate;
    }

    /** @param releaseDate the release date to store (kept by reference) */
    public void setReleaseDate(Date releaseDate) {
        this.releaseDate = releaseDate;
    }
}
|
5.mapper接口
/**
 * MyBatis mapper for persisting crawled content.
 */
public interface CrawlerMapper {

    /**
     * Runs the {@code addCmsContent} insert statement for the given record.
     *
     * @param record the content to store
     * @return the value reported by the insert statement; callers treat a
     *         value greater than zero as success
     */
    int addCmsContent(CmsContentPO record);
}
|
6.CrawlerMapper.xml文件
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.hyzx.qbasic.dao.CrawlerMapper">

    <!-- Inserts one CmsContentPO as a row of cms_content. -->
    <insert id="addCmsContent" parameterType="com.hyzx.qbasic.model.CmsContentPO">
        insert into cms_content (contentId, title, releaseDate, content)
        values (
            #{contentId,jdbcType=VARCHAR},
            #{title,jdbcType=VARCHAR},
            #{releaseDate,jdbcType=TIMESTAMP},
            #{content,jdbcType=LONGVARCHAR}
        )
    </insert>

</mapper>
|
7.XXX页面内容处理类XXXPageProcessor
主要用于解析爬取到的XXX html页面。
/**
 * Extracts the question title and answer text from a crawled XXX page and
 * queues further question/answer links found on it.
 */
@Component
public class XXXPageProcessor implements PageProcessor {

    /** Crawl settings: 3 retries, 1000 ms sleep time. */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Queues every link matching the question/answer URL pattern, then stores
     * the extracted "title" and "answer" fields. A page on which no title was
     * found is marked as skipped.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Follow links to individual answers.
        page.addTargetRequests(
                page.getHtml()
                        .links()
                        .regex("https://www\\.xxx\\.com/question/\\d+/answer/\\d+.*")
                        .all());

        String title = page.getHtml()
                .xpath("//h1[@class='QuestionHeader-title']/text()")
                .toString();
        page.putField("title", title);

        String answer = page.getHtml()
                .xpath("//div[@class='QuestionAnswer-content']/tidyText()")
                .toString();
        page.putField("answer", answer);

        // No title extracted: mark the page as skipped.
        if (title == null) {
            page.setSkip(true);
        }
    }

    /** @return the crawl settings used for this processor */
    @Override
    public Site getSite() {
        return site;
    }
}
|
8.XXX数据处理类XXXPipeline
主要用于将XXX html页面解析出的数据存储到mysql数据库。
@Component public class XXXPipeline implements Pipeline { private static final Logger LOGGER = LoggerFactory.getLogger(XXXPipeline.class);
@Autowired private CrawlerMapper crawlerMapper;
public void process(ResultItems resultItems, Task task) { String title = resultItems.get("title"); String answer = resultItems.get("answer");
CmsContentPO contentPO = new CmsContentPO(); contentPO.setContentId(UUID.randomUUID().toString()); contentPO.setTitle(title); contentPO.setReleaseDate(new Date()); contentPO.setContent(answer);
try { boolean success = crawlerMapper.addCmsContent(contentPO) > 0; LOGGER.info("保存文章成功:{}", title); } catch (Exception ex) { LOGGER.error("保存文章失败", ex); } } }
|
9.爬虫任务类XXXTask
每十分钟启动一次爬虫。
/**
 * Schedules the XXX crawler: the first run starts immediately and each
 * following run is scheduled 10 minutes after the previous scheduled job
 * returned.
 */
@Component
public class XXXTask {

    // Fixed: the logger was previously created for XXXPipeline.class.
    private static final Logger LOGGER = LoggerFactory.getLogger(XXXTask.class);

    // Fixed: the field was declared as "XXXPipeline" while crawl() referenced
    // "xxxPipeline", which did not compile.
    @Autowired
    private XXXPipeline xxxPipeline;

    @Autowired
    private XXXPageProcessor xxxPageProcessor;

    /** Single scheduler thread that triggers the crawl jobs. */
    private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

    /**
     * Registers the recurring crawl job on the scheduler and returns
     * immediately. Each job creates a new spider seeded with the explore page,
     * using 2 threads and the injected processor and pipeline.
     *
     * <p>Exceptions thrown while starting a spider are logged and swallowed so
     * that the scheduler keeps issuing subsequent runs.
     */
    public void crawl() {
        timer.scheduleWithFixedDelay(() -> {
            Thread.currentThread().setName("xxxCrawlerThread");

            try {
                // NOTE(review): Spider.start() appears to launch the crawl
                // asynchronously, so a new spider may start while an earlier
                // one is still running - confirm against the WebMagic docs.
                Spider.create(xxxPageProcessor)
                        .addUrl("https://www.xxx.com/explore")
                        .addPipeline(xxxPipeline)
                        .thread(2)
                        .start();
            } catch (Exception ex) {
                LOGGER.error("定时抓取数据线程执行异常", ex);
            }
        }, 0, 10, TimeUnit.MINUTES);
    }
}
|
10.Spring Boot程序启动类
/**
 * Application entry point. Scans {@code com.hyzx.qbasic.dao} for MyBatis
 * mappers and kicks off the crawler task once the application has started.
 */
@SpringBootApplication
@MapperScan(basePackages = "com.hyzx.qbasic.dao")
public class Application implements CommandLineRunner {

    @Autowired
    private XXXTask xxxTask;

    /**
     * Boots the Spring application.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) throws IOException {
        SpringApplication.run(Application.class, args);
    }

    /**
     * Invoked through {@link CommandLineRunner}; starts the crawler schedule.
     *
     * @param args command-line arguments (unused)
     */
    @Override
    public void run(String... args) throws Exception {
        xxxTask.crawl();
    }
}
|