Elasticsearch Learning (8): Aggregation Analysis

Aggregations in Elasticsearch fall into four main categories:

  • Bucket: bucketing aggregations, similar to SQL's GROUP BY.
  • Metric: metric aggregations, such as computing the max, min, or average of a field.
  • Pipeline: pipeline aggregations, which re-analyze the results of other aggregations.
  • Matrix: matrix aggregations.

Metric Aggregations

These fall into two groups:

  • Single-value: outputs one result
    • min, max, avg, sum
    • cardinality
  • Multi-value: outputs several results
    • stats, extended stats
    • percentiles, percentile ranks
    • top hits

min, max, avg, sum

GET book/_search
{
  "size": 0, #不返回原数据
  "aggs": {
    "word_count_min": {
      "min": {
        "field": "wordCount"
      }
    },
    "word_count_max": {
      "max": {
        "field": "wordCount"
      }
    },
    "word_count_avg": {
      "avg": {
        "field": "wordCount"
      }
    },
    "word_count_sum": {
      "sum": {
        "field": "wordCount"
      }
    }
  }
}

Response:

{
  "took" : 46,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "word_count_sum" : {
      "value" : 15000.0
    },
    "word_count_avg" : {
      "value" : 2500.0
    },
    "word_count_max" : {
      "value" : 3500.0
    },
    "word_count_min" : {
      "value" : 1000.0
    }
  }
}

cardinality

Cardinality is the number of distinct values of a field, analogous to SQL's COUNT(DISTINCT ...). Elasticsearch computes it approximately (via HyperLogLog++), so the result may be inexact on high-cardinality data.

Example: count how many distinct authors there are.

GET book/_search
{
  "size": 0, 
  "aggs": {
    "count_of_author": {
      "cardinality": {
        "field": "author"
      }
    }
  }
}

Response:

{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "count_of_author" : {
      "value" : 4
    }
  }
}
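
Because the count is approximate, the cardinality aggregation accepts a precision_threshold parameter (a standard option; the value below is illustrative): counts up to roughly this threshold are near-exact, at the cost of extra memory.

GET book/_search
{
  "size": 0,
  "aggs": {
    "count_of_author": {
      "cardinality": {
        "field": "author",
        "precision_threshold": 100
      }
    }
  }
}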

stats

Returns a set of statistics over a numeric field in one response: count, min, max, avg, and sum.

GET book/_search
{
  "size": 0, 
  "aggs": {
    "stats_of_word_count": {
      "stats": {
        "field": "wordCount"
      }
    }
  }
}

Response:

{
  "took" : 7,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "stats_of_word_count" : {
      "count" : 6,
      "min" : 1000.0,
      "max" : 3500.0,
      "avg" : 2500.0,
      "sum" : 15000.0
    }
  }
}

extended stats

An extension of stats that adds further statistics, such as variance and standard deviation.

GET book/_search
{
  "size": 0, 
  "aggs": {
    "stats_of_word_count": {
      "extended_stats": {
        "field": "wordCount"
      }
    }
  }
}

Response:

{
  "took" : 4,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "stats_of_word_count" : {
      "count" : 6,
      "min" : 1000.0,
      "max" : 3500.0,
      "avg" : 2500.0,
      "sum" : 15000.0,
      "sum_of_squares" : 4.2E7,
      "variance" : 750000.0,
      "std_deviation" : 866.0254037844386,
      "std_deviation_bounds" : {
        "upper" : 4232.050807568878,
        "lower" : 767.9491924311228
      }
    }
  }
}

percentiles

Percentile statistics over a numeric field.

GET book/_search
{
  "size": 0, 
  "aggs": {
    "per_word_count": {
      "percentiles": {
        "field": "wordCount"
      }
    }
  }
}

Response:

{
  "took" : 32,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "per_word_count" : {
      "values" : {
        "1.0" : 1000.0,
        "5.0" : 1000.0,
        "25.0" : 2000.0,
        "50.0" : 2500.0,
        "75.0" : 3500.0,
        "95.0" : 3500.0,
        "99.0" : 3500.0
      }
    }
  }
}

This reads: 1% of the values are at or below 1000, 5% of the values are at or below 1000, and so on.
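
If only specific percentiles are needed, they can be requested explicitly via percents (a standard parameter of the percentiles aggregation; the values below are illustrative):

GET book/_search
{
  "size": 0,
  "aggs": {
    "per_word_count": {
      "percentiles": {
        "field": "wordCount",
        "percents": [50, 95, 99]
      }
    }
  }
}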

percentile_ranks

Given one or more values, returns each value's percentile rank within the data.

GET book/_search
{
  "size": 0, 
  "aggs": {
    "per_word_count": {
      "percentile_ranks": {
        "field": "wordCount",
        "values": [2000,3500]
      }
    }
  }
}

Response:

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "per_word_count" : {
      "values" : {
        "2000.0" : 27.777777777777775,
        "3500.0" : 100.0
      }
    }
  }
}

Top hits

Typically used as a sub-aggregation after bucketing, to fetch the top matching documents within each bucket, i.e. the detail records.

Example: group by author, then fetch each author's books and sort them by wordCount in descending order.

POST book/_search
{
  "size": 0,
  "aggs": {
    "word_count_terms": {
      "terms": {
        "field": "author",
        "size": 10
      },
      "aggs": {
        "top_word_count": {
          "top_hits": {
            "size": 10,
            "sort": [
              {
                "wordCount": {
                  "order": "desc"
                }
              }
            ]
          }
        }
      }
    }
  }
}

Response:

{
  "took" : 65,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "word_count_terms" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "张三",
          "doc_count" : 3,
          "top_word_count" : {
            "hits" : {
              "total" : 3,
              "max_score" : null,
              "hits" : [
                {
                  "_index" : "book",
                  "_type" : "book",
                  "_id" : "Is6aZmwB4Jr3cw6pi6VH",
                  "_score" : null,
                  "_source" : {
                    "title" : "es怎么学",
                    "author" : "张三",
                    "wordCount" : 3500,
                    "publishDate" : "2019-11-01T11:11:11"
                  },
                  "sort" : [
                    3500
                  ]
                },
                {
                  "_index" : "book",
                  "_type" : "book",
                  "_id" : "9b821e2a-79dc-4288-b47a-817ad8496eda",
                  "_score" : null,
                  "_source" : {
                    "id" : "9b821e2a-79dc-4288-b47a-817ad8496eda",
                    "title" : "java大法好",
                    "author" : "张三",
                    "wordCount" : 2000,
                    "publishDate" : "2019-11-01T10:00:00"
                  },
                  "sort" : [
                    2000
                  ]
                },
                {
                  "_index" : "book",
                  "_type" : "book",
                  "_id" : "6e028090-453d-4995-9b6f-c64fec701d57",
                  "_score" : null,
                  "_source" : {
                    "id" : "6e028090-453d-4995-9b6f-c64fec701d57",
                    "title" : "java基础",
                    "author" : "张三",
                    "wordCount" : 1000,
                    "publishDate" : "2019-09-01T11:11:11"
                  },
                  "sort" : [
                    1000
                  ]
                }
              ]
            }
          }
        },
        {
          "key" : "李",
          "doc_count" : 1,
          "top_word_count" : {
            "hits" : {
              "total" : 1,
              "max_score" : null,
              "hits" : [
                {
                  "_index" : "book",
                  "_type" : "book",
                  "_id" : "Ic7qYmwB4Jr3cw6pi6Wc",
                  "_score" : null,
                  "_source" : {
                    "title" : "es怎么学",
                    "author" : "李",
                    "wordCount" : 3500,
                    "publishDate" : "2019-11-01T11:11:11"
                  },
                  "sort" : [
                    3500
                  ]
                }
              ]
            }
          }
        },
        {
          "key" : "李四",
          "doc_count" : 1,
          "top_word_count" : {
            "hits" : {
              "total" : 1,
              "max_score" : null,
              "hits" : [
                {
                  "_index" : "book",
                  "_type" : "book",
                  "_id" : "c7738478-d1da-43d7-b839-29e894cbdf08",
                  "_score" : null,
                  "_source" : {
                    "id" : "c7738478-d1da-43d7-b839-29e894cbdf08",
                    "title" : "java怎么学",
                    "author" : "李四",
                    "wordCount" : 2500,
                    "publishDate" : "2019-10-01T11:11:11"
                  },
                  "sort" : [
                    2500
                  ]
                }
              ]
            }
          }
        },
        {
          "key" : "王五",
          "doc_count" : 1,
          "top_word_count" : {
            "hits" : {
              "total" : 1,
              "max_score" : null,
              "hits" : [
                {
                  "_index" : "book",
                  "_type" : "book",
                  "_id" : "a7c5b2d2-aff7-415a-8c24-3caebc938116",
                  "_score" : null,
                  "_source" : {
                    "id" : "a7c5b2d2-aff7-415a-8c24-3caebc938116",
                    "title" : "j菜谱",
                    "author" : "王五",
                    "wordCount" : 2500,
                    "publishDate" : "2019-10-01T11:11:11"
                  },
                  "sort" : [
                    2500
                  ]
                }
              ]
            }
          }
        }
      ]
    }
  }
}

Bucket Aggregations

A bucket aggregation assigns documents to buckets according to some rule, which enables group-by style analysis.

Terms

The simplest bucketing strategy: one bucket per term. For text fields, the buckets are the analyzed terms (in practice you would normally aggregate on a keyword field, since aggregating on text requires fielddata to be enabled).

Example: group by author.

POST book/_search
{
  "size": 0,
  "aggs": {
    "word_count_terms": {
      "terms": {
        "field": "author",
        "size": 10
      }
    }
  }
}

Response:

{
  "took" : 15,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "word_count_terms" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "张三",
          "doc_count" : 3
        },
        {
          "key" : "李",
          "doc_count" : 1
        },
        {
          "key" : "李四",
          "doc_count" : 1
        },
        {
          "key" : "王五",
          "doc_count" : 1
        }
      ]
    }
  }
}

range

Defines buckets by numeric ranges.

Example: bucket wordCount into ranges.
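
A request that would produce the response below (the bucket keys 级别一/级别二/级别三 and the boundaries are taken from that response; note these ranges leave a gap between 3000 and 3100):

POST book/_search
{
  "size": 0,
  "aggs": {
    "word_count_range": {
      "range": {
        "field": "wordCount",
        "ranges": [
          {
            "key": "级别一",
            "to": 1500
          },
          {
            "key": "级别二",
            "from": 1500,
            "to": 3000
          },
          {
            "key": "级别三",
            "from": 3100
          }
        ]
      }
    }
  }
}

Response: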

{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "word_count_range" : {
      "buckets" : [
        {
          "key" : "级别一",
          "to" : 1500.0,
          "doc_count" : 1
        },
        {
          "key" : "级别二",
          "from" : 1500.0,
          "to" : 3000.0,
          "doc_count" : 3
        },
        {
          "key" : "级别三",
          "from" : 3100.0,
          "doc_count" : 2
        }
      ]
    }
  }
}

Date Range

Defines buckets by date ranges.

Example: bucket by publishDate.

POST book/_search
{
  "size": 0,
  "aggs": {
    "publish_date_range":{
      "date_range": {
        "field": "publishDate",
        "format": "yyyy", 
        "ranges": [
          {
            "from": "1900",
            "to": "2000"
          },
          {
            "from": "2000",
            "to": "2010"
          },
          {
            "from": "2010"
          }
        ]
      }
    }
  }
}

Histogram

A histogram splits the data into buckets at a fixed interval.

Example: bucket wordCount at an interval of 1000 over the range 0 to 4000 (extended_bounds forces the buckets in that range to be returned even when empty).

POST book/_search
{
  "size": 0,
  "aggs": {
    "word_count_hist":{
      "histogram": {
        "field": "wordCount",
        "interval": 1000,
        "extended_bounds": {
          "min": 0,
          "max": 4000
        }
      }
    }
  }
}

Response:

{
  "took" : 13,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 6,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "word_count_hist" : {
      "buckets" : [
        {
          "key" : 0.0,
          "doc_count" : 0
        },
        {
          "key" : 1000.0,
          "doc_count" : 1
        },
        {
          "key" : 2000.0,
          "doc_count" : 3
        },
        {
          "key" : 3000.0,
          "doc_count" : 2
        },
        {
          "key" : 4000.0,
          "doc_count" : 0
        }
      ]
    }
  }
}

Date Histogram

A date-based histogram (a bar chart over time), one of the most common aggregations for time-series analysis.

Example: bucket by year.

POST book/_search
{
  "size": 0,
  "aggs": {
    "pulish_date_hist":{
      "date_histogram": {
        "field": "publishDate",
        "format": "yyyy", 
        "interval": "year"
      }
    }
  }
}

Response:

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 7,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "pulish_date_hist" : {
      "buckets" : [
        {
          "key_as_string" : "2018",
          "key" : 1514764800000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2019",
          "key" : 1546300800000,
          "doc_count" : 6
        }
      ]
    }
  }
}

Bucket + Metric Aggregations

A bucket aggregation can nest sub-aggregations for further analysis; a sub-aggregation may itself be a bucket or a metric aggregation.

Bucketing within buckets

First bucket by author, then bucket by wordCount range within each author.

POST book/_search
{
  "size": 0,
  "aggs": {
    "bucket_author": {
      "terms": {
        "field": "author",
        "size": 10
      },
      "aggs": {
        "range_word_count": {
          "range": {
            "field": "wordCount",
            "ranges": [
              {
                "to": 1700
              },
              {
                "from": 1700,
                "to": 3000
              },
              {
                "from": 3000
              }
            ]
          }
        }
      }
    }
  }
}

Response:

{
  "took" : 4,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 7,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "bucket_author" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "张三",
          "doc_count" : 3,
          "range_word_count" : {
            "buckets" : [
              {
                "key" : "*-1700.0",
                "to" : 1700.0,
                "doc_count" : 1
              },
              {
                "key" : "1700.0-3000.0",
                "from" : 1700.0,
                "to" : 3000.0,
                "doc_count" : 1
              },
              {
                "key" : "3000.0-*",
                "from" : 3000.0,
                "doc_count" : 1
              }
            ]
          }
        },
        {
          "key" : "啊三",
          "doc_count" : 1,
          "range_word_count" : {
            "buckets" : [
              {
                "key" : "*-1700.0",
                "to" : 1700.0,
                "doc_count" : 0
              },
              {
                "key" : "1700.0-3000.0",
                "from" : 1700.0,
                "to" : 3000.0,
                "doc_count" : 0
              },
              {
                "key" : "3000.0-*",
                "from" : 3000.0,
                "doc_count" : 1
              }
            ]
          }
        },
        {
          "key" : "李",
          "doc_count" : 1,
          "range_word_count" : {
            "buckets" : [
              {
                "key" : "*-1700.0",
                "to" : 1700.0,
                "doc_count" : 0
              },
              {
                "key" : "1700.0-3000.0",
                "from" : 1700.0,
                "to" : 3000.0,
                "doc_count" : 0
              },
              {
                "key" : "3000.0-*",
                "from" : 3000.0,
                "doc_count" : 1
              }
            ]
          }
        },
        {
          "key" : "李四",
          "doc_count" : 1,
          "range_word_count" : {
            "buckets" : [
              {
                "key" : "*-1700.0",
                "to" : 1700.0,
                "doc_count" : 0
              },
              {
                "key" : "1700.0-3000.0",
                "from" : 1700.0,
                "to" : 3000.0,
                "doc_count" : 1
              },
              {
                "key" : "3000.0-*",
                "from" : 3000.0,
                "doc_count" : 0
              }
            ]
          }
        },
        {
          "key" : "王五",
          "doc_count" : 1,
          "range_word_count" : {
            "buckets" : [
              {
                "key" : "*-1700.0",
                "to" : 1700.0,
                "doc_count" : 0
              },
              {
                "key" : "1700.0-3000.0",
                "from" : 1700.0,
                "to" : 3000.0,
                "doc_count" : 1
              },
              {
                "key" : "3000.0-*",
                "from" : 3000.0,
                "doc_count" : 0
              }
            ]
          }
        }
      ]
    }
  }
}

Bucketing, then metric analysis

First bucket by author, then compute stats over wordCount within each bucket.

POST book/_search
{
  "size": 0,
  "aggs": {
    "bucket_author": {
      "terms": {
        "field": "author",
        "size": 10
      },
      "aggs": {
        "stats_word_count": {
          "stats": {
            "field": "wordCount"
          }
        }
      }
    }
  }
}

Response:

{
  "took" : 16,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 7,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "bucket_author" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "张三",
          "doc_count" : 3,
          "stats_word_count" : {
            "count" : 3,
            "min" : 1000.0,
            "max" : 3500.0,
            "avg" : 2166.6666666666665,
            "sum" : 6500.0
          }
        },
        {
          "key" : "啊三",
          "doc_count" : 1,
          "stats_word_count" : {
            "count" : 1,
            "min" : 3500.0,
            "max" : 3500.0,
            "avg" : 3500.0,
            "sum" : 3500.0
          }
        },
        {
          "key" : "李",
          "doc_count" : 1,
          "stats_word_count" : {
            "count" : 1,
            "min" : 3500.0,
            "max" : 3500.0,
            "avg" : 3500.0,
            "sum" : 3500.0
          }
        },
        {
          "key" : "李四",
          "doc_count" : 1,
          "stats_word_count" : {
            "count" : 1,
            "min" : 2500.0,
            "max" : 2500.0,
            "avg" : 2500.0,
            "sum" : 2500.0
          }
        },
        {
          "key" : "王五",
          "doc_count" : 1,
          "stats_word_count" : {
            "count" : 1,
            "min" : 2500.0,
            "max" : 2500.0,
            "avg" : 2500.0,
            "sum" : 2500.0
          }
        }
      ]
    }
  }
}

Pipeline Aggregations

Pipeline aggregations aggregate over the results of other aggregations, and they can be chained.

A pipeline's output is written into the overall result; depending on where it lands, pipelines come in two kinds:

  • Parent: the result is embedded inside the existing aggregation's result
    • Derivative
    • Moving Average
    • Cumulative Sum
  • Sibling: the result appears alongside the existing aggregation's result
    • Max/Min/Avg/Sum Bucket
    • Stats/Extended Stats Bucket
    • Percentiles Bucket

Sibling

min_bucket finds the bucket with the smallest value and returns its key(s) and value.

Example: bucket by author, compute the average wordCount per bucket, then find the bucket with the smallest average.

POST book/_search
{
  "size": 0,
  "aggs": {
    "bucket_author": {
      "terms": {
        "field": "author",
        "size": 10
      },
      "aggs": {
        "avg_word_count": {
          "avg": {
            "field": "wordCount"
          }
        }
      }
    },
    "min_author_word_count": {
      "min_bucket": {
        "buckets_path": "bucket_author>avg_word_count"
      }
    }
  }
}

Response:

{
  "took" : 38,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 7,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "bucket_author" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "张三",
          "doc_count" : 3,
          "avg_word_count" : {
            "value" : 2166.6666666666665
          }
        },
        {
          "key" : "啊三",
          "doc_count" : 1,
          "avg_word_count" : {
            "value" : 3500.0
          }
        },
        {
          "key" : "李",
          "doc_count" : 1,
          "avg_word_count" : {
            "value" : 3500.0
          }
        },
        {
          "key" : "李四",
          "doc_count" : 1,
          "avg_word_count" : {
            "value" : 2500.0
          }
        },
        {
          "key" : "王五",
          "doc_count" : 1,
          "avg_word_count" : {
            "value" : 2500.0
          }
        }
      ]
    },
    "min_author_word_count" : {
      "value" : 2166.6666666666665,
      "keys" : [
        "张三"
      ]
    }
  }
}

The remaining sibling pipelines work just like their metric counterparts, so they are not covered one by one here.

Parent

Derivative: the derivative, i.e. the difference between consecutive buckets.

Moving Average: a moving average, useful for seeing trends.

Cumulative Sum: a running total.

Example: derivative. Note that derivative_avg_salary is declared inside the parent aggregation's aggs; in the response, its value appears inside each bucket alongside avg_word_count.

POST book/_search
{
  "size": 0,
  "aggs": {
    "bucket_author": {
      "date_histogram": {
        "field": "publishDate",
        "interval": "year"
      },
      "aggs": {
        "avg_word_count": {
          "avg": {
            "field": "wordCount"
          }
        },
        "derivative_avg_salary":{
          "derivative": {
            "buckets_path": "avg_word_count"
          }
        }
      }
    }
  }
}

Response:

{
  "took" : 37,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 7,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "bucket_author" : {
      "buckets" : [
        {
          "key_as_string" : "2018-01-01T00:00:00",
          "key" : 1514764800000,
          "doc_count" : 1,
          "avg_word_count" : {
            "value" : 3500.0
          }
        },
        {
          "key_as_string" : "2019-01-01T00:00:00",
          "key" : 1546300800000,
          "doc_count" : 6,
          "avg_word_count" : {
            "value" : 2500.0
          },
          "derivative_avg_salary" : {
            "value" : -1000.0
          }
        }
      ]
    }
  }
}
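
cumulative_sum follows the same pattern; a minimal sketch against the same book index (the aggregation names publish_year, sum_word_count, and total_word_count are made up for illustration):

POST book/_search
{
  "size": 0,
  "aggs": {
    "publish_year": {
      "date_histogram": {
        "field": "publishDate",
        "interval": "year"
      },
      "aggs": {
        "sum_word_count": {
          "sum": {
            "field": "wordCount"
          }
        },
        "total_word_count": {
          "cumulative_sum": {
            "buckets_path": "sum_word_count"
          }
        }
      }
    }
  }
}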

Scope

By default, an aggregation's scope is the query's result set.

Example: among documents whose wordCount is between 3000 and 4000, bucket by author.

POST book/_search
{
  "size": 0,
  "query": {
    "range": {
      "wordCount": {
        "gte": 3000,
        "lte": 4000
      }
    }
  }, 
  "aggs": {
    "word_count_terms": {
      "terms": {
        "field": "author",
        "size": 10
      }
    }
  }
}

The scope can also be changed in the following ways.

filter

Sets a filter condition on a single aggregation, changing its scope without modifying the overall query.

Example: two identical group_author terms aggregations, but the first is nested under a filter that narrows its scope; the second, top-level group_author is unaffected.

POST book/_search
{
  "size": 0,
  "aggs": {
    "word_count_terms": {
      "filter": {
        "range": {
          "wordCount": {
            "gte": 3000,
            "lte": 4000
          }
        }
      },
      "aggs": {
        "group_author": {
          "terms": {
            "field": "author",
            "size": 10
          }
        }
      }
    },
    "group_author": {
      "terms": {
        "field": "author",
        "size": 10
      }
    }
  }
}

Response:

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 7,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "word_count_terms" : {
      "doc_count" : 3,
      "group_author" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : "啊三",
            "doc_count" : 1
          },
          {
            "key" : "张三",
            "doc_count" : 1
          },
          {
            "key" : "李",
            "doc_count" : 1
          }
        ]
      }
    },
    "group_author" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "张三",
          "doc_count" : 3
        },
        {
          "key" : "啊三",
          "doc_count" : 1
        },
        {
          "key" : "李",
          "doc_count" : 1
        },
        {
          "key" : "李四",
          "doc_count" : 1
        },
        {
          "key" : "王五",
          "doc_count" : 1
        }
      ]
    }
  }
}

post_filter

Filters the returned documents, but only takes effect after the aggregations have run.

Example: bucket by author first, then filter the hits to authors 张三 and 李四. Note that the aggregation still covers all documents, while only the filtered documents appear in full under hits.

POST book/_search
{
  "aggs": {
    "group_author": {
      "terms": {
        "field": "author",
        "size": 10
      }
    }
  },
  "post_filter": {
    "terms": {
      "author": [
        "张三",
        "李四"
      ]
    }
  }
}

Response:

{
  "took" : 9,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 4,
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "book",
        "_type" : "book",
        "_id" : "6e028090-453d-4995-9b6f-c64fec701d57",
        "_score" : 1.0,
        "_source" : {
          "id" : "6e028090-453d-4995-9b6f-c64fec701d57",
          "title" : "java基础",
          "author" : "张三",
          "wordCount" : 1000,
          "publishDate" : "2019-09-01T11:11:11"
        }
      },
      {
        "_index" : "book",
        "_type" : "book",
        "_id" : "c7738478-d1da-43d7-b839-29e894cbdf08",
        "_score" : 1.0,
        "_source" : {
          "id" : "c7738478-d1da-43d7-b839-29e894cbdf08",
          "title" : "java怎么学",
          "author" : "李四",
          "wordCount" : 2500,
          "publishDate" : "2019-10-01T11:11:11"
        }
      },
      {
        "_index" : "book",
        "_type" : "book",
        "_id" : "9b821e2a-79dc-4288-b47a-817ad8496eda",
        "_score" : 1.0,
        "_source" : {
          "id" : "9b821e2a-79dc-4288-b47a-817ad8496eda",
          "title" : "java大法好",
          "author" : "张三",
          "wordCount" : 2000,
          "publishDate" : "2019-11-01T10:00:00"
        }
      },
      {
        "_index" : "book",
        "_type" : "book",
        "_id" : "Is6aZmwB4Jr3cw6pi6VH",
        "_score" : 1.0,
        "_source" : {
          "title" : "es怎么学",
          "author" : "张三",
          "wordCount" : 3500,
          "publishDate" : "2019-11-01T11:11:11"
        }
      }
    ]
  },
  "aggregations" : {
    "group_author" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "张三",
          "doc_count" : 3
        },
        {
          "key" : "啊三",
          "doc_count" : 1
        },
        {
          "key" : "李",
          "doc_count" : 1
        },
        {
          "key" : "李四",
          "doc_count" : 1
        },
        {
          "key" : "王五",
          "doc_count" : 1
        }
      ]
    }
  }
}

global

Ignores the query and runs over all documents.

Example: because the all aggregation declares global, its avg_word_count sub-aggregation is no longer limited by the query, while the sibling word_count_avg still is.

POST book/_search
{
  "size": 0,
  "query": {
    "range": {
      "wordCount": {
        "gte": 3000,
        "lte": 4000
      }
    }
  }, 
  "aggs": {
    "word_count_avg": {
      "avg": {
        "field": "wordCount"
      }
    },
    "all":{
      "global": {}, 
      "aggs": {
        "avg_word_count": {
          "avg": {
            "field": "wordCount"
          }
        }
      }
    }
  }
}

Sorting

Buckets can be sorted using built-in sort keys:

  • _count: the document count.
  • _key: the bucket key, i.e. the value of the field being aggregated.

POST book/_search
{
  "size": 0,
  "aggs": {
    "term_word_count": {
      "terms": {
        "field": "author",
        "size": 10,
        "order": [ 
          {
            "_count": "asc"
          },
          {
            "_key": "desc"
          }
        ]
      }
    }
  }
}

Sorting by the result of a sub-aggregation:

POST book/_search
{
  "size": 0,
  "aggs": {
    "term_word_count": {
      "terms": {
        "field": "author",
        "size": 10,
        "order": { #根据子聚合的结果做排序
          "avg_word_count": "desc"
        }
      },
      "aggs": {
        "avg_word_count": {
          "avg": {
            "field": "wordCount"
          }
        }
      }
    }
  }
}

Response:

{
  "took" : 48,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 7,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "term_word_count" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "啊三",
          "doc_count" : 1,
          "avg_word_count" : {
            "value" : 3500.0
          }
        },
        {
          "key" : "李",
          "doc_count" : 1,
          "avg_word_count" : {
            "value" : 3500.0
          }
        },
        {
          "key" : "李四",
          "doc_count" : 1,
          "avg_word_count" : {
            "value" : 2500.0
          }
        },
        {
          "key" : "王五",
          "doc_count" : 1,
          "avg_word_count" : {
            "value" : 2500.0
          }
        },
        {
          "key" : "张三",
          "doc_count" : 3,
          "avg_word_count" : {
            "value" : 2166.6666666666665
          }
        }
      ]
    }
  }
}

If the sub-aggregation returns multiple values, pick one with dot notation, e.g. stats_word_count.sum.

POST book/_search
{
  "size": 0,
  "aggs": {
    "term_word_count": {
      "terms": {
        "field": "author",
        "size": 10,
        "order": {
          "stats_word_count.sum": "desc"
        }
      },
      "aggs": {
        "stats_word_count": {
          "stats": {
            "field": "wordCount"
          }
        }
      }
    }
  }
}

If the sort target is nested inside another aggregation (its result is a JSON object), use the > path separator, e.g. word_cont>avg_word_count.

POST book/_search
{
  "size": 0,
  "aggs": {
    "word_count_hist": {
      "histogram": {
        "field": "wordCount",
        "interval": 1000,
        "extended_bounds": {
          "min": 0,
          "max": 4000
        },
        "order": {
          "word_cont>avg_word_count": "asc"
        }
      },
      "aggs": {
        "word_cont": {
          "filter": {
            "range": {
              "wordCount": {
                "gte": 2000,
                "lte": 4000
              }
            }
          },
          "aggs": {
            "avg_word_count": {
              "avg": {
                "field": "wordCount"
              }
            }
          }
        }
      }
    }
  }
}

Internals and Accuracy

Min aggregation

Each shard returns its local min; the coordinating node then takes the min of those per-shard results.
Terms

With the terms size set to 5, each shard returns its own top 5 terms; the coordinating node merges these and keeps the overall top 5.
The problem

Terms aggregations are not always accurate.

In the example below, we ask for the top 3 terms. The final result is a, b, d, but looking at the raw data, c should have made the cut instead of d: each shard returns only its own top 3, so c (count 3) on node1 is never reported.
(figure omitted: per-shard top-3 example)
Because the data is spread across multiple shards, the coordinating node cannot see the full picture.

Solutions

Two common fixes: keep all the data in a single primary shard (results are then exact, but this only works for small data sets), or have each shard return more candidate terms by increasing shard_size (see below).
Setting shard_size

doc_count_error_upper_bound: the largest doc count a missed term could possibly have; it equals the sum of the smallest doc count returned by each shard.
sum_other_doc_count: the total doc count of all terms outside the returned buckets.

For the example above: doc_count_error_upper_bound = 4 + 2 = 6, and sum_other_doc_count = 3 + 3 = 6.
(figure omitted: worked example)
show_term_doc_count_error

This option reports, per bucket, the worst-case doc count error: the sum of the smallest doc counts returned by the shards that did not report the term. A value of 0 means the count is exact.
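
A minimal sketch combining both knobs on the book index (shard_size and show_term_doc_count_error are standard parameters of the terms aggregation; the values here are illustrative):

POST book/_search
{
  "size": 0,
  "aggs": {
    "group_author": {
      "terms": {
        "field": "author",
        "size": 5,
        "shard_size": 20,
        "show_term_doc_count_error": true
      }
    }
  }
}

With show_term_doc_count_error enabled, each bucket also reports its own doc_count_error_upper_bound.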


Source: blog.csdn.net/hxyascx/article/details/98849678