一般在需要选择居住信息类的系统中都要维护一个地名地址信息库,这个库一般不会手动录入维护,为了保持与国家统计的数据一致,一般会从第三方接口或者国家统计局发布的页面获取,下面介绍通过jsoup使用java爬虫技术从国家统计局爬取行政区划数据,并将数据存储到数据库中。
技术要点
jsoup
jsoup是一个Java库,它提供了用于URL获取、数据解析、提取和操作DOM的API方法。
目标网站
国家统计局-行政区划
https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html
分析页面-获取数据
省级数据
从页面信息看,整个页面有效信息部分是包裹在一个 <table class="provincetable" width="775">
table中的,其中 <tr class="provincetr">
的标签的数据是需要爬取的省级数据,而 tr中的<a href="11.html">北京市<br></a>
包含的正好是所需要的行政区划名称和行政区划代码。
对应的代码处理逻辑为
// Index page of the 2023 division codes; it lists the province-level entries.
String provinceUrl = "https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/index.html";
try {
// Fetch and parse the page with jsoup.
Document document = Jsoup.connect(provinceUrl).get();
// Select every <a href> inside the province rows (tr.provincetr).
Elements provinceElements = document.body().select("tr.provincetr").select("a[href]");
for (Element provinceElement : provinceElements) {
// Text of the <a> tag: the province name.
String text = provinceElement.text();
// href of the <a> tag, e.g. "11.html".
String href = provinceElement.attr("href");
District district = new District();
district.setName(text);
// The code is the href with its ".html" suffix removed.
district.setCode(href.replace(".html", ""));
// Provinces have no parent; "0" is used as the root marker.
district.setParentId("0");
district.setFullName(text);
// NOTE(review): the populated District is not stored or collected in this snippet.
}
} catch (IOException e) {
e.printStackTrace();
}
其他级别的数据
其他级别的数据获取方式与省级数据相同,定位到页面中对应的元素即可;不同的是,爬取的URL需要根据上级页面中<a>标签的href属性跳转得到。
pinyin4j
pinyin4j是一个将汉字转换成拼音的java库,官网为https://sourceforge.net/projects/pinyin4j/,感兴趣的可前往浏览。
通过线程池创建多线程提高处理效率
通过ThreadPoolExecutor
创建线程,加速处理速度
代码实现
以下示例采用的是 springboot3+mybatis-plus
引入依赖
gradle方式,如果采用的是maven管理依赖,可自行找到对应jar的maven依赖形式
// Web layer (REST controller)
implementation 'org.springframework.boot:spring-boot-starter-web'
// HTML fetching and parsing used by the crawler
implementation 'org.jsoup:jsoup:1.18.1'
// MySQL JDBC driver and Spring JDBC support
implementation 'com.mysql:mysql-connector-j:8.2.0'
implementation 'org.springframework.boot:spring-boot-starter-jdbc'
// NOTE(review): no Redis usage is visible in the code shown in this article
implementation 'org.springframework.boot:spring-boot-starter-data-redis'
// MyBatis-Plus (Spring Boot 3 starter) and its extension module
implementation 'com.baomidou:mybatis-plus-spring-boot3-starter:3.5.7'
implementation 'com.baomidou:mybatis-plus-extension:3.5.7'
// Druid connection pool, configured in application.yml
implementation 'com.alibaba:druid-spring-boot-3-starter:1.2.23'
implementation 'com.github.yulichang:mybatis-plus-join-boot-starter:1.4.11'
// Chinese character to pinyin conversion
implementation 'com.belerweb:pinyin4j:2.5.1'
// Lombok is needed only at compile time
compileOnly 'org.projectlombok:lombok'
annotationProcessor 'org.projectlombok:lombok'
testImplementation 'org.springframework.boot:spring-boot-starter-test'
testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
Java代码
数据库脚本
-- Administrative division table: one row per province / city / county / town / village.
CREATE TABLE IF NOT EXISTS `sys_region`
(
    `id`         VARCHAR(32)  NOT NULL COMMENT '主键ID' PRIMARY KEY,
    `name`       VARCHAR(32)  NOT NULL COMMENT '行政区划名称',
    `pinyin`     VARCHAR(128) NOT NULL COMMENT '拼音',
    `code`       VARCHAR(32)  NOT NULL COMMENT '行政区划代码',
    `level`      tinyint(1)   NOT NULL COMMENT '行政区划级别',
    `full_name`  VARCHAR(128) NOT NULL COMMENT '行政区划完整名称',
    `parent_id`  VARCHAR(32)  NOT NULL COMMENT '上级行政区划ID',
    `created_by` VARCHAR(32)  NOT NULL COMMENT '创建人',
    `created_at` DATETIME     NOT NULL COMMENT '创建时间',
    -- Column comment corrected: it previously read '最近一次更信任' (typo for 更新人).
    `updated_by` VARCHAR(32) DEFAULT NULL COMMENT '最近一次更新人',
    `updated_at` DATETIME    DEFAULT NULL COMMENT '最近一次更新时间',
    key `idx_name` (`name`) using btree,
    key `idx_level` (`level`) using btree
) ENGINE = INNODB
  DEFAULT CHARACTER SET UTF8MB4
  COLLATE UTF8MB4_0900_AI_CI COMMENT '行政区划表';
application.yml
server:
  port: 34434
  servlet:
    context-path: /api
# A single `spring` root: the original declared the key twice, which is a duplicate-key error in YAML.
spring:
  application:
    name: region-demo
  datasource:
    # Query parameters are separated by a single '&' (the original had '&&' after useSSL=false).
    url: jdbc:mysql://127.0.0.1:3306/region-demo?useSSL=false&serverTimezone=GMT%2B8&useUnicode=true&characterEncoding=utf8&allowPublicKeyRetrieval=true
    username: root
    password: root
    driver-class-name: com.mysql.cj.jdbc.Driver
    type: com.alibaba.druid.pool.DruidDataSource
    druid:
      # Number of connections created at start-up
      initial-size: 5
      # Minimum number of idle connections
      min-idle: 5
      # Maximum number of connections
      max-active: 5
      # Maximum time to wait for a connection, in milliseconds
      max-wait: 60000
      # Minimum time a connection may stay idle in the pool before eviction, in milliseconds
      min-evictable-idle-time-millis: 60000
      # Maximum time a connection may stay idle in the pool, in milliseconds
      max-evictable-idle-time-millis: 90000
      # Statement used to check whether a connection is still valid
      validation-query: select 'x'
      # Interval between eviction runs; also the basis on which testWhileIdle is applied
      time-between-eviction-runs-millis: 60000
      # Run validation-query when a connection is borrowed; enabling this costs performance
      test-on-borrow: false
      # Run validation-query when a connection is returned; enabling this costs performance
      test-on-return: false
      # Whether to cache PreparedStatements (PSCache). It helps databases with cursor support such as Oracle; recommended off for MySQL.
      pool-prepared-statements: false
      # Filters for monitoring; without them the monitoring page cannot collect SQL statistics. stat: statistics, slf4j: logging, wall: SQL-injection defence
      filters: stat,wall,slf4j
      # Must be greater than 0 to enable PSCache
      max-pool-prepared-statement-per-connection-size: -1
      # Merge the monitoring data of multiple DruidDataSource instances
      use-global-data-source-stat: true
      # connectProperties enables mergeSql and slow-SQL recording
      connect-properties: druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000
汉字转拼音工具类
package com.geekyous.core.util;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
/**
 * Utility for converting Chinese text to lower-case, tone-less pinyin.
 */
public class PinyinUtil {

    /** First code point of the range treated as Chinese characters. */
    private static final char CJK_FIRST = '\u4e00';

    /** Last code point of the range treated as Chinese characters. */
    private static final char CJK_LAST = '\u9fa5';

    private PinyinUtil() {
    }

    /**
     * Converts Chinese text to pinyin.
     *
     * <p>Whitespace is dropped. Characters in the range U+4E00..U+9FA5 are replaced by
     * their pinyin; for characters with several readings the first one reported by
     * pinyin4j is used. Every other character is copied through unchanged.
     *
     * @param chinese the text to convert; may be {@code null}
     * @return the pinyin form of the text, or an empty string for {@code null} or empty input
     */
    public static String toPinyin(String chinese) {
        if (chinese == null || chinese.isEmpty()) {
            return "";
        }
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

        StringBuilder sb = new StringBuilder();
        for (char ch : chinese.toCharArray()) {
            if (Character.isWhitespace(ch)) {
                continue;
            }
            // A plain range comparison replaces the per-character regex match.
            if (ch >= CJK_FIRST && ch <= CJK_LAST) {
                sb.append(firstReading(ch, format));
            } else {
                sb.append(ch);
            }
        }
        return sb.toString();
    }

    /**
     * Returns the first pinyin reading of a character, or the character itself when
     * pinyin4j reports no reading for it.
     */
    private static String firstReading(char ch, HanyuPinyinOutputFormat format) {
        try {
            String[] pinyinArray = PinyinHelper.toHanyuPinyinStringArray(ch, format);
            // Guard against a null or empty result, which would otherwise fail on [0].
            if (pinyinArray == null || pinyinArray.length == 0) {
                return String.valueOf(ch);
            }
            return pinyinArray[0];
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // The output format is fixed above, so this signals a programming error
            // rather than bad input; surface it instead of silently dropping characters.
            throw new IllegalStateException("Invalid pinyin output format", e);
        }
    }
}
数据库实体类
package com.geekyous.pojo.entity;
import com.baomidou.mybatisplus.annotation.FieldFill;
import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableId;
import com.fasterxml.jackson.annotation.JsonFormat;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.time.LocalDateTime;
/**
 * Base class for database entities: primary key plus audit columns.
 *
 * <p>NOTE(review): the {@code fill} attributes below only mark fields for automatic
 * filling; the values must be supplied by a MyBatis-Plus {@code MetaObjectHandler}.
 * None is visible in this file. Confirm one is registered, since {@code created_by}
 * and {@code created_at} are declared NOT NULL in the table.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class BaseEntity {

    /**
     * Primary key, generated as a UUID on insert.
     */
    @TableId(value = "id", type = IdType.ASSIGN_UUID)
    private String id;

    /**
     * Creator, filled on insert.
     */
    @TableField(value = "created_by", fill = FieldFill.INSERT)
    private String createdBy;

    /**
     * Creation time, filled on insert.
     */
    @TableField(value = "created_at", fill = FieldFill.INSERT)
    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
    private LocalDateTime createdAt;

    /**
     * User who last updated the row, filled on update.
     */
    @TableField(value = "updated_by", fill = FieldFill.UPDATE)
    private String updatedBy;

    /**
     * Time of the last update, filled on update.
     * Mapped to {@code updated_at}; it was previously mapped to {@code updated_by},
     * the same column as {@link #updatedBy}.
     */
    @TableField(value = "updated_at", fill = FieldFill.UPDATE)
    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
    private LocalDateTime updatedAt;
}
package com.geekyous.pojo.entity;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableName;
import com.geekyous.pojo.enums.RegionLevelEnum;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
/**
* Entity mapped to the `sys_region` table: one administrative division
* (province, city, county, town or village).
* Inherits the primary key and audit fields from BaseEntity; equals/hashCode
* deliberately ignore the inherited fields (callSuper = false).
*/
@Data
@EqualsAndHashCode(callSuper = false)
@AllArgsConstructor
@NoArgsConstructor
@TableName("sys_region")
public class SysRegion extends BaseEntity {
/**
* Name of the administrative division
*/
@TableField("name")
private String name;
/**
* Pinyin of the name
*/
@TableField("pinyin")
private String pinyin;
/**
* Administrative division code
*/
@TableField("code")
private String code;
/**
* Level of the administrative division
*/
@TableField("level")
private RegionLevelEnum level;
/**
* Full path of the division: the parent's full name followed by this name
*/
@TableField("full_name")
private String fullName;
/**
* ID of the parent division ("0" is used for provinces)
*/
@TableField("parent_id")
private String parentId;
}
RegionLevelEnum枚举
package com.geekyous.pojo.enums;
import com.baomidou.mybatisplus.annotation.EnumValue;
import com.fasterxml.jackson.annotation.JsonValue;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
* Levels of an administrative division, from province (1) down to village (5).
*/
@Getter
@AllArgsConstructor
public enum RegionLevelEnum {
// Province
PROVINCE(1, "省"),
// City
CITY(2, "市"),
// District / county
DISTRICT(3, "区县"),
// Town
TOWN(4, "镇"),
// Village
VILLAGE(5, "村"),
;
// Numeric code; annotated as the value used for both database storage (@EnumValue)
// and JSON serialization (@JsonValue).
@EnumValue
@JsonValue
private final int code;
// Chinese display label of the level.
private final String value;
}
mapper接口
package com.geekyous.mapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.geekyous.pojo.entity.SysRegion;
import org.apache.ibatis.annotations.Mapper;
/**
* MyBatis-Plus mapper for SysRegion; it declares no methods of its own and relies
* on the CRUD operations inherited from BaseMapper.
*/
@Mapper
public interface DistrictMapper extends BaseMapper<SysRegion> {
}
service接口
package com.geekyous.service;
import com.baomidou.mybatisplus.extension.service.IService;
import com.geekyous.pojo.entity.SysRegion;
/**
* Business interface for administrative divisions.
*/
public interface IRegionService extends IService<SysRegion> {
/**
* Pulls the administrative division data from its source.
*/
void fetchOrigin();
}
service接口实现
package com.geekyous.service;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.geekyous.core.util.PinyinUtil;
import com.geekyous.mapper.DistrictMapper;
import com.geekyous.pojo.entity.SysRegion;
import com.geekyous.pojo.enums.RegionLevelEnum;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
/**
 * Crawls the administrative-division pages published under {@code BASE_URL} with jsoup
 * and stores every division as a {@link SysRegion} row.
 *
 * <p>Provinces are processed in parallel on a thread pool; within one province the
 * city, county, town and village pages are walked depth-first on the same thread.
 * Each division is saved before its children so that the generated ID can be used as
 * the children's parent ID.
 */
@Service
@Slf4j
public class RegionServiceImpl extends ServiceImpl<DistrictMapper, SysRegion> implements IRegionService {

    /** Root of the 2023 division pages; province hrefs are relative to it. */
    private static final String BASE_URL = "https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/";

    /** Timeout handed to jsoup for each request, in milliseconds. */
    private static final int TIMEOUT = 30 * 60 * 1000;

    /** Parent ID stored on province rows, which have no parent. */
    private static final String ROOT_PARENT_ID = "0";

    /**
     * Builds a GET connection with browser-like headers.
     *
     * @param url absolute URL of the page to fetch
     * @return the configured, not yet executed, connection
     */
    private static Connection getConnection(String url) {
        Connection connection = Jsoup.connect(url)
                .timeout(TIMEOUT)
                .method(Connection.Method.GET)
                .followRedirects(false);
        connection.header("Accept", "*/*");
        // Advertise only gzip and deflate: a brotli ("br") response would be parsed as garbage.
        // The former "Content-Encoding: UTF-8" request header was dropped: a GET has no body
        // and UTF-8 is a charset, not a content coding.
        connection.header("Accept-Encoding", "gzip, deflate");
        connection.header("Accept-Language", "zh-CN,zh;q=0.9");
        connection.header("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
        return connection;
    }

    /**
     * Fetches a page and returns every {@code <a href>} found inside the given table.
     *
     * @param url           absolute URL of the page
     * @param tableSelector CSS selector of the table holding the data, e.g. {@code table.citytable}
     * @return the matching anchors, possibly empty
     * @throws IOException if the page cannot be fetched
     */
    private static Elements selectLinks(String url, String tableSelector) throws IOException {
        Document document = getConnection(url).get();
        return document.body().select(tableSelector).select("a[href]");
    }

    /**
     * Creates an unsaved region.
     *
     * @param name   division name
     * @param code   division code
     * @param level  division level
     * @param parent already saved parent division, or {@code null} for a province
     * @return the populated entity
     */
    private static SysRegion newRegion(String name, String code, RegionLevelEnum level, SysRegion parent) {
        SysRegion region = new SysRegion();
        region.setName(name);
        region.setPinyin(PinyinUtil.toPinyin(name));
        region.setCode(code);
        region.setLevel(level);
        region.setParentId(parent == null ? ROOT_PARENT_ID : parent.getId());
        region.setFullName(parent == null ? name : parent.getFullName() + name);
        return region;
    }

    /**
     * Pulls all administrative divisions, blocking until every province has been processed.
     *
     * @throws RuntimeException if the index page cannot be fetched or the wait is interrupted
     */
    @Override
    public void fetchOrigin() {
        try {
            Document document = getConnection(BASE_URL + "index.html").get();
            // Every province-level link on the index page.
            Elements provinceElements = document.body().select("tr.provincetr").select("a[href]");
            if (CollectionUtils.isEmpty(provinceElements)) {
                log.info("行政区划数据拉取-未获取到省级数据");
                return;
            }
            CountDownLatch countDownLatch = new CountDownLatch(provinceElements.size());
            new ElementParseExecutor(provinceElements, countDownLatch).execute();
            countDownLatch.await();
            log.info("行政区划数据拉取-所有行政区划数据处理完成");
        } catch (IOException e) {
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can observe it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Stores one province and descends into its city page.
     *
     * @param provinceElement the province's {@code <a>} element from the index page
     */
    private void handleProvince(Element provinceElement) {
        String provinceName = provinceElement.text();
        // e.g. "11.html"; the code is the file name without its suffix.
        String href = provinceElement.attr("href");
        SysRegion province = newRegion(provinceName, href.replace(".html", ""), RegionLevelEnum.PROVINCE, null);
        save(province);
        handleCity(BASE_URL + href, province);
    }

    /**
     * Stores the cities listed on a province page and descends into each county page.
     *
     * <p>NOTE(review): only rows that carry links are picked up; rows without an
     * {@code <a href>} are not stored. Confirm this is intended.
     *
     * @param cityUrl URL of the city-level page
     * @param parent  the saved province
     */
    private void handleCity(String cityUrl, SysRegion parent) {
        try {
            Elements cityElements = selectLinks(cityUrl, "table.citytable");
            if (CollectionUtils.isEmpty(cityElements)) {
                log.info("行政区划数据拉取-未获取到市级数据");
                return;
            }
            // Anchors come in pairs: code link, then name link. A trailing unpaired anchor is ignored.
            for (int i = 0; i + 1 < cityElements.size(); i += 2) {
                String cityCode = cityElements.get(i).text();
                Element cityNameElement = cityElements.get(i + 1);
                SysRegion city = newRegion(cityNameElement.text(), cityCode, RegionLevelEnum.CITY, parent);
                save(city);
                handleCounty(cityNameElement.absUrl("href"), city);
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to fetch " + cityUrl, e);
        }
    }

    /**
     * Stores the counties listed on a city page and descends into each town page.
     *
     * @param countyUrl URL of the county-level page
     * @param parent    the saved city
     */
    private void handleCounty(String countyUrl, SysRegion parent) {
        try {
            Elements countyElements = selectLinks(countyUrl, "table.countytable");
            if (CollectionUtils.isEmpty(countyElements)) {
                log.info("行政区划数据拉取-未获取到区县级数据");
                return;
            }
            for (int i = 0; i + 1 < countyElements.size(); i += 2) {
                String countyCode = countyElements.get(i).text();
                Element countyNameElement = countyElements.get(i + 1);
                // The enum constant for this level is DISTRICT; there is no COUNTY constant.
                SysRegion county = newRegion(countyNameElement.text(), countyCode, RegionLevelEnum.DISTRICT, parent);
                save(county);
                handleTown(countyNameElement.absUrl("href"), county);
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to fetch " + countyUrl, e);
        }
    }

    /**
     * Stores the towns/sub-districts listed on a county page and descends into each village page.
     *
     * @param townUrl URL of the town-level page
     * @param parent  the saved county
     */
    private void handleTown(String townUrl, SysRegion parent) {
        try {
            Elements townElements = selectLinks(townUrl, "table.towntable");
            if (CollectionUtils.isEmpty(townElements)) {
                log.info("行政区划数据拉取-未获取到镇级数据");
                return;
            }
            for (int i = 0; i + 1 < townElements.size(); i += 2) {
                String townCode = townElements.get(i).text();
                Element townNameElement = townElements.get(i + 1);
                SysRegion town = newRegion(townNameElement.text(), townCode, RegionLevelEnum.TOWN, parent);
                save(town);
                handleVillage(townNameElement.absUrl("href"), town);
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to fetch " + townUrl, e);
        }
    }

    /**
     * Stores the villages listed on a town page. Villages are leaves and carry no links.
     *
     * @param villageUrl URL of the village-level page
     * @param parent     the saved town
     */
    private void handleVillage(String villageUrl, SysRegion parent) {
        try {
            Document document = getConnection(villageUrl).get();
            Elements villageElements = document.body().select("table.villagetable").select("tr.villagetr");
            if (CollectionUtils.isEmpty(villageElements)) {
                log.info("行政区划数据拉取-未获取到村级数据");
                return;
            }
            for (Element villageElement : villageElements) {
                // Row text is three space-separated parts: the code first and the name third
                // (the middle part is presumably a classification code - verify against the page).
                // Limiting the split to 3 keeps a name that itself contains a space intact.
                String[] text = villageElement.text().split(" ", 3);
                if (text.length < 3) {
                    log.warn("行政区划数据拉取-村级数据格式异常: {}", villageElement.text());
                    continue;
                }
                save(newRegion(text[2], text[0], RegionLevelEnum.VILLAGE, parent));
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to fetch " + villageUrl, e);
        }
    }

    /**
     * Runs one province task per element on a dedicated thread pool and counts the
     * latch down once per finished task, whether it succeeded or failed.
     */
    class ElementParseExecutor {
        ThreadPoolExecutor poolExecutor = new ThreadPoolExecutor(30, 50, 5,
                TimeUnit.MINUTES, new LinkedBlockingDeque<>(50), new ThreadPoolExecutor.AbortPolicy());
        private final List<Element> elements;
        private final CountDownLatch countDownLatch;

        ElementParseExecutor(List<Element> elements, CountDownLatch countDownLatch) {
            this.elements = elements;
            this.countDownLatch = countDownLatch;
        }

        /**
         * Submits all province tasks, then initiates an orderly shutdown so the pool's
         * threads are released once the submitted tasks have finished.
         */
        public void execute() {
            try {
                if (!CollectionUtils.isEmpty(elements)) {
                    for (Element element : elements) {
                        poolExecutor.execute(() -> {
                            try {
                                handleProvince(element);
                            } catch (RuntimeException e) {
                                log.error("行政区划数据拉取-省级数据处理失败: {}", element.text(), e);
                            } finally {
                                // Always count down; otherwise one failed province would
                                // leave the caller waiting on the latch forever.
                                countDownLatch.countDown();
                            }
                        });
                    }
                }
            } finally {
                // shutdown() lets already-submitted tasks complete but stops the core threads lingering.
                poolExecutor.shutdown();
            }
        }
    }
}
controller
package com.geekyous.controller.sys;
import com.geekyous.service.IRegionService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST endpoint that triggers the administrative-division crawl.
 * The handler is mapped to {@code GET /sys/district/fetch} beneath the application's context path.
 */
@RestController
@RequestMapping("/sys/district")
public class RegionController {

    /** Service that performs the crawl. */
    private final IRegionService regionService;

    /**
     * @param districtService the region service injected by Spring
     */
    @Autowired
    public RegionController(IRegionService districtService) {
        regionService = districtService;
    }

    /**
     * Starts the crawl; the request returns when {@link IRegionService#fetchOrigin()} returns.
     */
    @GetMapping("/fetch")
    public void fetch() {
        regionService.fetchOrigin();
    }
}