ElasticSearch进阶语法

玩转TMDB电影数据分析

索引建立

// Create the "movie" index with one primary shard, one replica, and explicit field mappings.
PUT /movie
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "title": { "type": "text", "analyzer": "english" },
      "tagline": { "type": "text", "analyzer": "english" },
      // release_date accepts any of the four slash-separated layouts listed in "format".
      "release_date": {
        "type": "date",
        "format": "8yyyy/MM/dd||yyyy/M/dd||yyyy/MM/d||yyyy/M/d"
      },
      "popularity": { "type": "double" },
      "cast": {
        "type": "object",
        "properties": {
          "character": { "type": "text", "analyzer": "standard" },
          "name": { "type": "text", "analyzer": "standard" }
        }
      },
      "overview": { "type": "text", "analyzer": "english" }
    }
  }
}

数据导入

  1. 网络搜索TMDB下载电影数据资源,导入ES

Query DSL简单实验

  1. match查询,按照字段上定义的分词分析后去索引内查询
// match query on "title" for a single word.
GET /movie/_search
{
  "query": {
    "match": { "title": "steve" }
  }
}
  1. term查询,不进行词的分析,直接去索引查询,即搜索关键词和索引内词的精确匹配
// match query with two words, for comparison with a term query on the same text.
GET /movie/_search
{
  "query": {
    "match": { "title": "steve zissou" }
  }
}

// term query: the whole string "steve zissou" is looked up as one exact term.
GET /movie/_search
{
  "query": {
    "term": { "title": "steve zissou" }
  }
}
  1. match分词后的and和or
// match query on "title" without an explicit operator.
GET /movie/_search
{
  "query": {
    "match": {
      "title": "basketball with cartoom aliens" // OR is used by default
    }
  }
}
// match query on "title" with the operator set explicitly.
GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball with cartoom aliens",
        "operator": "and" // AND is used
      }
    }
  }
}
  1. 最小词项匹配
// match query on "title" with OR plus a minimum number of matching terms.
GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball with cartoom aliens",
        "operator": "or",
        "minimum_should_match": 2 // at least two terms must match
      }
    }
  }
}
  1. 短语查询
// Phrase query on "title".
GET /movie/_search
{
  "query": {
    "match_phrase": {
      "title": "steve zissou" // match the phrase
    }
  }
}

// Phrase query on "title" where the last word is treated as a prefix.
GET /movie/_search
{
  "query": {
    "match_phrase_prefix": {
      "title": "steve zis" // phrase prefix query
    }
  }
}
  1. 多字段查询
// multi_match: run the same query text against several fields.
// The parameter that lists the target fields is "fields" (plural).
GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title", "overview"] // search both fields at once
    }
  }
}

再次解释评分规则(tf/idf)*tfnorm

  • tf:词频,这个document文档包含了多少个这个词,包含越多表明越相关
  • idf:逆文档频率,与包含该词的文档数成反比;包含该词的文档越少,该词越有区分度,得分越高
  • tfnorm: 根据field长度做归一化,文档内出现频率越高,field越短越相关
// Whether the operator is AND or OR, the scores of the clauses that match are added together.
// "explain": true asks for the score breakdown of every hit.
GET /movie/_search
{
  "explain": true,
  "query": {
    "match": { "title": "steve" }
  }
}

// multi_match with a per-field boost and a tie breaker.
// The tie-breaking parameter is named "tie_breaker".
GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title^10", "overview"], // title is weighted x10, so title matches dominate
      "tie_breaker": 0.3
    }
  }
}

继续深入查询

  1. Bool查询
  • must:必须都是true
  • must_not:必须都是false
  • should:其中有一个为true即可,但true的越多得分越高
// bool query with two "should" clauses: the same text matched against title and overview.
GET /movie/_search
{
  "query": {
    "bool": {
      "should": [
        { "match": { "title": "basketball with cartoom aliens" } },
        { "match": { "overview": "basketball with cartoom aliens" } }
      ]
    }
  }
}
  1. multi_match的type不同,得分的计算方式也不一样
  • 因为multi_match有很多种type
  • 默认是best_fields,取得分最高的作为对应的分数,最匹配模式,等同于dismax模式
// dis_max over two match clauses: the same text matched against title and overview.
GET /movie/_search
{
  "query": {
    "dis_max": {
      "queries": [
        { "match": { "title": "basketball with cartoom aliens" } },
        { "match": { "overview": "basketball with cartoom aliens" } }
      ]
    }
  }
}
  • 使用explain看下打分规则:((title:steve title:job) | (overview:steve overview:job))
// Validate the query and return its explanation, without running a search.
GET /movie/_validate/query?explain
{
  // "explain": true,   (disabled: the ?explain URL parameter is used instead)
  "query": {
    "multi_match": {
      "query": "steve job",
      "fields": ["title", "overview"],
      "operator": "or",
      "type": "best_fields"
    }
  }
}
  • 以字段为单位分别计算分词的分数,然后取最好的一个,适用于最优字段匹配
// dis_max over title and overview, with the non-best clauses contributing through tie_breaker.
GET /movie/_search
{
  "query": {
    "dis_max": {
      "queries": [
        { "match": { "title": "basketball with cartoom aliens" } },
        { "match": { "overview": "basketball with cartoom aliens" } }
      ],
      "tie_breaker": 0.3 // the other clauses are taken into account at a factor of 0.3
    }
  }
}
  • most_fields:取命中的分值相加作为分数,同should match模式,加权共同影响模式
  • cross_fields:以分词为单位计算栏位总分
// cross_fields with operator "and": every term must be found in at least one of the fields.
// "Peter" must appear in author_first_name or author_last_name.
// "Smith" must appear in author_first_name or author_last_name.
// The typeless endpoint is used, matching the typeless mapping style of the index definition.
GET /forum/_search
{
  "query": {
    "multi_match": {
      "query": "Peter Smith",
      "type": "cross_fields",
      "operator": "and",
      "fields": ["author_first_name", "author_last_name"]
    }
  }
}
  1. query string
  • 方便地使用 AND(+)、OR(|)、NOT(-) 等逻辑运算符
// query_string on "title": both words are required through the AND keyword.
GET /movie/_search
{
  "query": {
    "query_string": {
      "fields": ["title"],
      "query": "steve AND jobs"
    }
  }
}

过滤查询

  1. 单条件过滤
// bool query with a single filter clause: a term lookup on "title".
GET /movie/_search
{
  "query": {
    "bool": {
      "filter": {
        "term": { "title": "steve" }
      }
    }
  }
}
  1. 多条件过滤
// bool query with four filter clauses; hits are sorted by popularity in descending order.
GET /movie/_search
{
  "query": {
    "bool": {
      "filter": [
        { "term": { "title": "steve" } },
        { "term": { "cast.name": "gaspard" } },
        { "range": { "release_date": { "lte": "2015/01/01" } } },
        { "range": { "popularity": { "gte": "25" } } }
      ]
    }
  },
  "sort": [
    { "popularity": { "order": "desc" } }
  ]
}
  1. 带match打分的filter
// bool query combining scored "must" match clauses with "filter" clauses.
GET /movie/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "title": "Search" } },
        { "match": { "tagline": "Elasticsearch" } }
      ],
      "filter": [
        { "term": { "title": "steve" } },
        { "term": { "cast.name": "gaspard" } },
        { "range": { "release_date": { "lte": "2015/01/01" } } },
        { "range": { "popularity": { "gte": "25" } } }
      ]
    }
  }
}

优秀的搜索引擎必备

  • 查全率:正确的结果有n个,查询出来正确的有m 则 m/n
  • 查准率:查出的n个文档有m个正确,则m/n
  • 两者都需要提高,但一般不可兼得,可以通过调整排序位置,将正确的结果排在上面以提高用户体验
// function_score: adjust the relevance score of a multi_match query using each
// document's "popularity" field, so popular movies rank higher.
GET /movie/_search
{
  "query": {
    "function_score": {
      // The original query; it produces the base score (the "old score").
      "query": {
        "multi_match": {
          "query": "steve job",
          "fields": ["title", "overview"],
          "operator": "or",
          "type": "most_fields"
        }
      },
      "functions": [
        {
          "field_value_factor": {
            "field": "popularity", // the field whose value feeds the function
            "modifier": "log2p",   // add 2 to the value, then take the logarithm
            "factor": 10           // the field value is multiplied by 10 before the modifier is applied
          }
        }
      ]
    }
  }
}

猜你喜欢

转载自blog.csdn.net/qq_36221788/article/details/109787157