es的聚合分析主要分成4类
- Bucket: 分桶类型,类似SQL中的group by 语法。
- Metric: 指标分析类型,如计算最大值、最小值、平均值等等。
- Pipeline: 管道分析类型,基于上一级的聚合分析结果进行再分析。
- Matrix: 矩阵分析类型。
Metric 聚合分析
主要分成如下两类:
- 单值分析,只能输出一个分析结果
- min、max、avg、sum
- cardinality
- 多值分析,输出多个分析结果
- stats、extended stats
- percentile、percentile rank
- top hits
min、max、avg、sum
GET book/_search
{
"size": 0, #不返回原数据
"aggs": {
"word_count_min": {
"min": {
"field": "wordCount"
}
},
"word_count_max": {
"max": {
"field": "wordCount"
}
},
"word_count_avg": {
"avg": {
"field": "wordCount"
}
},
"word_count_sum": {
"sum": {
"field": "wordCount"
}
}
}
}
返回结果:
{
"took" : 46,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"word_count_sum" : {
"value" : 15000.0
},
"word_count_avg" : {
"value" : 2500.0
},
"word_count_max" : {
"value" : 3500.0
},
"word_count_min" : {
"value" : 1000.0
}
}
}
cardinality
意为集合的势,或者基数,是指不同值的个数,类似SQL中的distinct count。
查作者有几位。
GET book/_search
{
"size": 0,
"aggs": {
"count_of_author": {
"cardinality": {
"field": "author"
}
}
}
}
返回结果:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"count_of_author" : {
"value" : 4
}
}
}
stats
返回一系列数值类型的统计值,包含:min、max、avg、sum、count
GET book/_search
{
"size": 0,
"aggs": {
"stats_of_word_count": {
"stats": {
"field": "wordCount"
}
}
}
}
返回结果:
{
"took" : 7,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"stats_of_word_count" : {
"count" : 6,
"min" : 1000.0,
"max" : 3500.0,
"avg" : 2500.0,
"sum" : 15000.0
}
}
}
extended stats
对stats的扩展,包含了更多的统计数据,如方差、标准差。
GET book/_search
{
"size": 0,
"aggs": {
"stats_of_word_count": {
"extended_stats": {
"field": "wordCount"
}
}
}
}
返回结果:
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"stats_of_word_count" : {
"count" : 6,
"min" : 1000.0,
"max" : 3500.0,
"avg" : 2500.0,
"sum" : 15000.0,
"sum_of_squares" : 4.2E7,
"variance" : 750000.0,
"std_deviation" : 866.0254037844386,
"std_deviation_bounds" : {
"upper" : 4232.050807568878,
"lower" : 767.9491924311228
}
}
}
}
percentiles
百分位数统计
GET book/_search
{
"size": 0,
"aggs": {
"per_word_count": {
"percentiles": {
"field": "wordCount"
}
}
}
}
返回结果:
{
"took" : 32,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"per_word_count" : {
"values" : {
"1.0" : 1000.0,
"5.0" : 1000.0,
"25.0" : 2000.0,
"50.0" : 2500.0,
"75.0" : 3500.0,
"95.0" : 3500.0,
"99.0" : 3500.0
}
}
}
}
这里的意思是 1%的数据在 1000以内。5%的数据在1000以内,以此类推。
percentile_ranks
传入值返回该值在数据中排名的位置。
GET book/_search
{
"size": 0,
"aggs": {
"per_word_count": {
"percentile_ranks": {
"field": "wordCount",
"values": [2000,3500]
}
}
}
}
返回结果:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"per_word_count" : {
"values" : {
"2000.0" : 27.777777777777775,
"3500.0" : 100.0
}
}
}
}
Top hits
一般用于分桶后获取该桶内最匹配的顶部文档列表,即详情数据。
例子: 首先按照author分组,再取出该作者下的书籍详情并按照字数排序。
POST book/_search
{
"size": 0,
"aggs": {
"word_count_terms": {
"terms": {
"field": "author",
"size": 10
},
"aggs": {
"top_word_count": {
"top_hits": {
"size": 10,
"sort": [
{
"wordCount": {
"order": "desc"
}
}
]
}
}
}
}
}
}
返回结果:
{
"took" : 65,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"word_count_terms" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "张三",
"doc_count" : 3,
"top_word_count" : {
"hits" : {
"total" : 3,
"max_score" : null,
"hits" : [
{
"_index" : "book",
"_type" : "book",
"_id" : "Is6aZmwB4Jr3cw6pi6VH",
"_score" : null,
"_source" : {
"title" : "es怎么学",
"author" : "张三",
"wordCount" : 3500,
"publishDate" : "2019-11-01T11:11:11"
},
"sort" : [
3500
]
},
{
"_index" : "book",
"_type" : "book",
"_id" : "9b821e2a-79dc-4288-b47a-817ad8496eda",
"_score" : null,
"_source" : {
"id" : "9b821e2a-79dc-4288-b47a-817ad8496eda",
"title" : "java大法好",
"author" : "张三",
"wordCount" : 2000,
"publishDate" : "2019-11-01T10:00:00"
},
"sort" : [
2000
]
},
{
"_index" : "book",
"_type" : "book",
"_id" : "6e028090-453d-4995-9b6f-c64fec701d57",
"_score" : null,
"_source" : {
"id" : "6e028090-453d-4995-9b6f-c64fec701d57",
"title" : "java基础",
"author" : "张三",
"wordCount" : 1000,
"publishDate" : "2019-09-01T11:11:11"
},
"sort" : [
1000
]
}
]
}
}
},
{
"key" : "李",
"doc_count" : 1,
"top_word_count" : {
"hits" : {
"total" : 1,
"max_score" : null,
"hits" : [
{
"_index" : "book",
"_type" : "book",
"_id" : "Ic7qYmwB4Jr3cw6pi6Wc",
"_score" : null,
"_source" : {
"title" : "es怎么学",
"author" : "李",
"wordCount" : 3500,
"publishDate" : "2019-11-01T11:11:11"
},
"sort" : [
3500
]
}
]
}
}
},
{
"key" : "李四",
"doc_count" : 1,
"top_word_count" : {
"hits" : {
"total" : 1,
"max_score" : null,
"hits" : [
{
"_index" : "book",
"_type" : "book",
"_id" : "c7738478-d1da-43d7-b839-29e894cbdf08",
"_score" : null,
"_source" : {
"id" : "c7738478-d1da-43d7-b839-29e894cbdf08",
"title" : "java怎么学",
"author" : "李四",
"wordCount" : 2500,
"publishDate" : "2019-10-01T11:11:11"
},
"sort" : [
2500
]
}
]
}
}
},
{
"key" : "王五",
"doc_count" : 1,
"top_word_count" : {
"hits" : {
"total" : 1,
"max_score" : null,
"hits" : [
{
"_index" : "book",
"_type" : "book",
"_id" : "a7c5b2d2-aff7-415a-8c24-3caebc938116",
"_score" : null,
"_source" : {
"id" : "a7c5b2d2-aff7-415a-8c24-3caebc938116",
"title" : "j菜谱",
"author" : "王五",
"wordCount" : 2500,
"publishDate" : "2019-10-01T11:11:11"
},
"sort" : [
2500
]
}
]
}
}
}
]
}
}
}
Bucket 聚合分析
Bucket,意为桶,即按照一定的规则将文档分配到不同的桶中,达到分类分析的目的。
Terms
分桶最简单策略,直接按照term来分桶,如果是text类型,则按照分词后的结果分桶。
例子:按照作者来分组。
POST book/_search
{
"size": 0,
"aggs": {
"word_count_terms": {
"terms": {
"field": "author",
"size": 10
}
}
}
}
返回结果:
{
"took" : 15,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"word_count_terms" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "张三",
"doc_count" : 3
},
{
"key" : "李",
"doc_count" : 1
},
{
"key" : "李四",
"doc_count" : 1
},
{
"key" : "王五",
"doc_count" : 1
}
]
}
}
}
range
通过指定数值的范围来设定分桶规则。
例子:按照wordCount数值分组。
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"word_count_range" : {
"buckets" : [
{
"key" : "级别一",
"to" : 1500.0,
"doc_count" : 1
},
{
"key" : "级别二",
"from" : 1500.0,
"to" : 3000.0,
"doc_count" : 3
},
{
"key" : "级别三",
"from" : 3100.0,
"doc_count" : 2
}
]
}
}
}
Date Range
通过指定日期的范围来指定分桶
例子:用publishDate分组。
POST book/_search
{
"size": 0,
"aggs": {
"publish_date_range":{
"date_range": {
"field": "publishDate",
"format": "yyyy",
"ranges": [
{
"from": "1900",
"to": "2000"
},
{
"from": "2000",
"to": "2010"
},
{
"from": "2010"
}
]
}
}
}
}
Histogram
直方图,以固定间隔的策略来分隔数据。
例子:以字数每隔1000进行分组,范围从0到4000。
POST book/_search
{
"size": 0,
"aggs": {
"word_count_hist":{
"histogram": {
"field": "wordCount",
"interval": 1000,
"extended_bounds": {
"min": 0,
"max": 4000
}
}
}
}
}
返回结果:
{
"took" : 13,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 6,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"word_count_hist" : {
"buckets" : [
{
"key" : 0.0,
"doc_count" : 0
},
{
"key" : 1000.0,
"doc_count" : 1
},
{
"key" : 2000.0,
"doc_count" : 3
},
{
"key" : 3000.0,
"doc_count" : 2
},
{
"key" : 4000.0,
"doc_count" : 0
}
]
}
}
}
Date Histogram
针对日期的直方图或者柱状图,是时序数据分析中常用的聚合分析类型。
例子:按年分组。
POST book/_search
{
"size": 0,
"aggs": {
"pulish_date_hist":{
"date_histogram": {
"field": "publishDate",
"format": "yyyy",
"interval": "year"
}
}
}
}
返回结果:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 7,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"pulish_date_hist" : {
"buckets" : [
{
"key_as_string" : "2018",
"key" : 1514764800000,
"doc_count" : 1
},
{
"key_as_string" : "2019",
"key" : 1546300800000,
"doc_count" : 6
}
]
}
}
}
Bucket + Metric 聚合分析
Bucket聚合分析允许通过添加子分析来进一步分析,该子分析可以是Bucket也可以是Metric。
分桶后再分桶
先按author分桶,再按wordCount的range分桶。
POST book/_search
{
"size": 0,
"aggs": {
"bucket_author": {
"terms": {
"field": "author",
"size": 10
},
"aggs": {
"range_word_count": {
"range": {
"field": "wordCount",
"ranges": [
{
"to": 1700
},
{
"from": 1700,
"to": 3000
},
{
"from": 3000
}
]
}
}
}
}
}
}
返回结果:
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 7,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"bucket_author" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "张三",
"doc_count" : 3,
"range_word_count" : {
"buckets" : [
{
"key" : "*-1700.0",
"to" : 1700.0,
"doc_count" : 1
},
{
"key" : "1700.0-3000.0",
"from" : 1700.0,
"to" : 3000.0,
"doc_count" : 1
},
{
"key" : "3000.0-*",
"from" : 3000.0,
"doc_count" : 1
}
]
}
},
{
"key" : "啊三",
"doc_count" : 1,
"range_word_count" : {
"buckets" : [
{
"key" : "*-1700.0",
"to" : 1700.0,
"doc_count" : 0
},
{
"key" : "1700.0-3000.0",
"from" : 1700.0,
"to" : 3000.0,
"doc_count" : 0
},
{
"key" : "3000.0-*",
"from" : 3000.0,
"doc_count" : 1
}
]
}
},
{
"key" : "李",
"doc_count" : 1,
"range_word_count" : {
"buckets" : [
{
"key" : "*-1700.0",
"to" : 1700.0,
"doc_count" : 0
},
{
"key" : "1700.0-3000.0",
"from" : 1700.0,
"to" : 3000.0,
"doc_count" : 0
},
{
"key" : "3000.0-*",
"from" : 3000.0,
"doc_count" : 1
}
]
}
},
{
"key" : "李四",
"doc_count" : 1,
"range_word_count" : {
"buckets" : [
{
"key" : "*-1700.0",
"to" : 1700.0,
"doc_count" : 0
},
{
"key" : "1700.0-3000.0",
"from" : 1700.0,
"to" : 3000.0,
"doc_count" : 1
},
{
"key" : "3000.0-*",
"from" : 3000.0,
"doc_count" : 0
}
]
}
},
{
"key" : "王五",
"doc_count" : 1,
"range_word_count" : {
"buckets" : [
{
"key" : "*-1700.0",
"to" : 1700.0,
"doc_count" : 0
},
{
"key" : "1700.0-3000.0",
"from" : 1700.0,
"to" : 3000.0,
"doc_count" : 1
},
{
"key" : "3000.0-*",
"from" : 3000.0,
"doc_count" : 0
}
]
}
}
]
}
}
}
先分桶再进行数据分析
POST book/_search
{
"size": 0,
"aggs": {
"bucket_author": {
"terms": {
"field": "author",
"size": 10
},
"aggs": {
"stats_word_count": {
"stats": {
"field": "wordCount"
}
}
}
}
}
}
返回结果:
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 7,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"bucket_author" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "张三",
"doc_count" : 3,
"stats_word_count" : {
"count" : 3,
"min" : 1000.0,
"max" : 3500.0,
"avg" : 2166.6666666666665,
"sum" : 6500.0
}
},
{
"key" : "啊三",
"doc_count" : 1,
"stats_word_count" : {
"count" : 1,
"min" : 3500.0,
"max" : 3500.0,
"avg" : 3500.0,
"sum" : 3500.0
}
},
{
"key" : "李",
"doc_count" : 1,
"stats_word_count" : {
"count" : 1,
"min" : 3500.0,
"max" : 3500.0,
"avg" : 3500.0,
"sum" : 3500.0
}
},
{
"key" : "李四",
"doc_count" : 1,
"stats_word_count" : {
"count" : 1,
"min" : 2500.0,
"max" : 2500.0,
"avg" : 2500.0,
"sum" : 2500.0
}
},
{
"key" : "王五",
"doc_count" : 1,
"stats_word_count" : {
"count" : 1,
"min" : 2500.0,
"max" : 2500.0,
"avg" : 2500.0,
"sum" : 2500.0
}
}
]
}
}
}
Pipeline 聚合分析
针对聚合分析的结果再次进行聚合分析,而且支持链式调用。
Pipeline的分析结果会输出到原结果中,根据输出位置不同,分为以下两类:
- Parent:结果内嵌到现有的聚合分析结果中
- Derivative
- Moving Average
    - Cumulative Sum
- Sibling:结果与现有聚合分析结果同级
- Max/Min/Avg/Sum Bucket
- Stats/Extended Stats Bucket
- Percentiles Bucket
Sibling
找出所有Bucket中值最小的Bucket名称和值。
例子:先按author分组,再按照wordCount计算平均值,找出平均值最小的Bucket。
POST book/_search
{
"size": 0,
"aggs": {
"bucket_author": {
"terms": {
"field": "author",
"size": 10
},
"aggs": {
"avg_word_count": {
"avg": {
"field": "wordCount"
}
}
}
},
"min_author_word_count": {
"min_bucket": {
"buckets_path": "bucket_author>avg_word_count"
}
}
}
}
返回结果:
{
"took" : 38,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 7,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"bucket_author" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "张三",
"doc_count" : 3,
"avg_word_count" : {
"value" : 2166.6666666666665
}
},
{
"key" : "啊三",
"doc_count" : 1,
"avg_word_count" : {
"value" : 3500.0
}
},
{
"key" : "李",
"doc_count" : 1,
"avg_word_count" : {
"value" : 3500.0
}
},
{
"key" : "李四",
"doc_count" : 1,
"avg_word_count" : {
"value" : 2500.0
}
},
{
"key" : "王五",
"doc_count" : 1,
"avg_word_count" : {
"value" : 2500.0
}
}
]
},
"min_author_word_count" : {
"value" : 2166.6666666666665,
"keys" : [
"张三"
]
}
}
}
其实Sibling的用法与Metric类似,其他函数就不做介绍了。
Parent
Derivative:求导数。
Moving Average:移动平均值,能看到趋势。
Cumulative Sum:累计加和。
例子:我们用求导做例子。注意的是查询时derivative_avg_salary在aggs内,返回结果会内嵌到现有的聚合分析结果中。
POST book/_search
{
"size": 0,
"aggs": {
"bucket_author": {
"date_histogram": {
"field": "publishDate",
"interval": "year"
},
"aggs": {
"avg_word_count": {
"avg": {
"field": "wordCount"
}
},
"derivative_avg_salary":{
"derivative": {
"buckets_path": "avg_word_count"
}
}
}
}
}
}
返回结果:
{
"took" : 37,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 7,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"bucket_author" : {
"buckets" : [
{
"key_as_string" : "2018-01-01T00:00:00",
"key" : 1514764800000,
"doc_count" : 1,
"avg_word_count" : {
"value" : 3500.0
}
},
{
"key_as_string" : "2019-01-01T00:00:00",
"key" : 1546300800000,
"doc_count" : 6,
"avg_word_count" : {
"value" : 2500.0
},
"derivative_avg_salary" : {
"value" : -1000.0
}
}
]
}
}
}
作用范围
es聚合分析默认作用范围是query的结果集。
例子:在wordCount在 3000-4000的数据内,再按照author分桶聚合。
POST book/_search
{
"size": 0,
"query": {
"range": {
"wordCount": {
"gte": 3000,
"lte": 4000
}
}
},
"aggs": {
"word_count_terms": {
"terms": {
"field": "author",
"size": 10
}
}
}
}
还可以通过如下的方式改变其作用范围
filter
为某个聚合分析设定过滤条件,从而在不改变整体query语句的情况下修改了作用范围
例子:group_author两个相同的分桶聚合,但是第一个增加filter缩小查询范围,不影响第二个group_author的聚合结果。
POST book/_search
{
"size": 0,
"aggs": {
"word_count_terms": {
"filter": {
"range": {
"wordCount": {
"gte": 3000,
"lte": 4000
}
}
},
"aggs": {
"group_author": {
"terms": {
"field": "author",
"size": 10
}
}
}
},
"group_author": {
"terms": {
"field": "author",
"size": 10
}
}
}
}
返回结果:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 7,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"word_count_terms" : {
"doc_count" : 3,
"group_author" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "啊三",
"doc_count" : 1
},
{
"key" : "张三",
"doc_count" : 1
},
{
"key" : "李",
"doc_count" : 1
}
]
}
},
"group_author" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "张三",
"doc_count" : 3
},
{
"key" : "啊三",
"doc_count" : 1
},
{
"key" : "李",
"doc_count" : 1
},
{
"key" : "李四",
"doc_count" : 1
},
{
"key" : "王五",
"doc_count" : 1
}
]
}
}
}
post-filter
作用于文档过滤,但在聚合分析后生效。
例子:先按照author分桶,再过滤author是张三和李四的数据。注意的是被过滤出的数据详细信息会在hits中展示。
POST book/_search
{
"aggs": {
"group_author": {
"terms": {
"field": "author",
"size": 10
}
}
},
"post_filter": {
"terms": {
"author": [
"张三",
"李四"
]
}
}
}
返回结果:
{
"took" : 9,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 4,
"max_score" : 1.0,
"hits" : [
{
"_index" : "book",
"_type" : "book",
"_id" : "6e028090-453d-4995-9b6f-c64fec701d57",
"_score" : 1.0,
"_source" : {
"id" : "6e028090-453d-4995-9b6f-c64fec701d57",
"title" : "java基础",
"author" : "张三",
"wordCount" : 1000,
"publishDate" : "2019-09-01T11:11:11"
}
},
{
"_index" : "book",
"_type" : "book",
"_id" : "c7738478-d1da-43d7-b839-29e894cbdf08",
"_score" : 1.0,
"_source" : {
"id" : "c7738478-d1da-43d7-b839-29e894cbdf08",
"title" : "java怎么学",
"author" : "李四",
"wordCount" : 2500,
"publishDate" : "2019-10-01T11:11:11"
}
},
{
"_index" : "book",
"_type" : "book",
"_id" : "9b821e2a-79dc-4288-b47a-817ad8496eda",
"_score" : 1.0,
"_source" : {
"id" : "9b821e2a-79dc-4288-b47a-817ad8496eda",
"title" : "java大法好",
"author" : "张三",
"wordCount" : 2000,
"publishDate" : "2019-11-01T10:00:00"
}
},
{
"_index" : "book",
"_type" : "book",
"_id" : "Is6aZmwB4Jr3cw6pi6VH",
"_score" : 1.0,
"_source" : {
"title" : "es怎么学",
"author" : "张三",
"wordCount" : 3500,
"publishDate" : "2019-11-01T11:11:11"
}
}
]
},
"aggregations" : {
"group_author" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "张三",
"doc_count" : 3
},
{
"key" : "啊三",
"doc_count" : 1
},
{
"key" : "李",
"doc_count" : 1
},
{
"key" : "李四",
"doc_count" : 1
},
{
"key" : "王五",
"doc_count" : 1
} ]
}
}
}
global
无视query过滤条件,基于全部文档进行分析。
例子:all中增加了global,avg_word_count的分析范围就不受query的影响了。
POST book/_search
{
"size": 0,
"query": {
"range": {
"wordCount": {
"gte": 3000,
"lte": 4000
}
}
},
"aggs": {
"word_count_avg": {
"avg": {
"field": "wordCount"
}
},
"all":{
"global": {},
"aggs": {
"avg_word_count": {
"avg": {
"field": "wordCount"
}
}
}
}
}
}
排序
可以使用自带的关键数据进行排序。
_count:文档数
_key:分桶的key,即聚合field字段的值。
POST book/_search
{
"size": 0,
"aggs": {
"term_word_count": {
"terms": {
"field": "author",
"size": 10,
"order": [
{
"_count": "asc"
},
{
"_key": "desc"
}
]
}
}
}
}
使用子聚合分析的结果进行排序。
POST book/_search
{
"size": 0,
"aggs": {
"term_word_count": {
"terms": {
"field": "author",
"size": 10,
"order": { #根据子聚合的结果做排序
"avg_word_count": "desc"
}
},
"aggs": {
"avg_word_count": {
"avg": {
"field": "wordCount"
}
}
}
}
}
}
返回结果:
{
"took" : 48,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 7,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"term_word_count" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "啊三",
"doc_count" : 1,
"avg_word_count" : {
"value" : 3500.0
}
},
{
"key" : "李",
"doc_count" : 1,
"avg_word_count" : {
"value" : 3500.0
}
},
{
"key" : "李四",
"doc_count" : 1,
"avg_word_count" : {
"value" : 2500.0
}
},
{
"key" : "王五",
"doc_count" : 1,
"avg_word_count" : {
"value" : 2500.0
}
},
{
"key" : "张三",
"doc_count" : 3,
"avg_word_count" : {
"value" : 2166.6666666666665
}
}
]
}
}
}
如果子聚合返回结果是多个,用 stats_word_count.sum 的形式。
POST book/_search
{
"size": 0,
"aggs": {
"term_word_count": {
"terms": {
"field": "author",
"size": 10,
"order": {
"stats_word_count.sum": "desc"
}
},
"aggs": {
"stats_word_count": {
"stats": {
"field": "wordCount"
}
}
}
}
}
}
如果用来排序的子聚合结果嵌套在更深一层的聚合(jsonObject格式)中,那么要使用 word_cont>avg_word_count 这样的路径形式。
POST book/_search
{
"size": 0,
"aggs": {
"word_count_hist": {
"histogram": {
"field": "wordCount",
"interval": 1000,
"extended_bounds": {
"min": 0,
"max": 4000
},
"order": {
"word_cont>avg_word_count": "asc"
}
},
"aggs": {
"word_cont": {
"filter": {
"range": {
"wordCount": {
"gte": 2000,
"lte": 4000
}
}
},
"aggs": {
"avg_word_count": {
"avg": {
"field": "wordCount"
}
}
}
}
}
}
}
}
原理和精准度问题
Min聚合
分别从各个分片中获取到Min值,汇总后再取出Min值。
Terms
这里设置了 terms的size是5,那么会从不同分片中获取到前5个数据,再进行汇总取出前5个数据。
问题
Terms并不永远准确。
下面的例子:需要返回top3的数据,最终结果是abd,但我们看原始数据,c应该替换d被查询出来。因为每个分片也是只取前三个数据,在node1上c (3)并没有被返回。
数据分散在多个shard上,coordinating node无法得悉数据全貌。
解决办法
Shard_size大小设定方法
doc_count_error_upper_bound:被遗漏的term的文档数可能的最大值,等于每个分片返回结果中最小doc_count的加和。
sum_other_doc_count:返回结果bucket的term外其他term的文档总数。
我们刚才的例子 doc_count_error_upper_bound: 4 + 2 = 6,sum_other_doc_count: 3 + 3 = 6
show_term_doc_count_error
这个值代表 bucket误差最大值,它等于每个分片没有返回数据的最小值加和,0的时候代表结果是准确的。