ElasticSearch学习笔记一 elasticsearch入门教程

yuyutoo 2024-10-28 20:21 2 浏览 0 评论

环境搭建

ElasticSearch版本基于7+

后续所有操作基于kibana的dev tool执行，

模拟帖子数据

_bulk

POST _bulk
{ "index": { "_index":"forum" ,"_id": 1 }}
{ "articleID" : "XHDK-A-1293-#fJ3", "userID" : 1, "hidden": false, "postDate": "2017-01-01" }
{ "index": {"_index":"forum", "_id": 2 }}
{ "articleID" : "KDKE-B-9947-#kL5", "userID" : 1, "hidden": false, "postDate": "2017-01-02" }
{ "index": {"_index":"forum", "_id": 3 }}
{ "articleID" : "JODL-X-1937-#pV7", "userID" : 2, "hidden": false, "postDate": "2017-01-01" }
{ "index": { "_index":"forum" ,"_id": 4 }}
{ "articleID" : "QQPX-R-3956-#aD8", "userID" : 2, "hidden": true, "postDate": "2017-01-02" }

查看结构

GET /forum/_mapping

{
  "forum" : {
    "mappings" : {
      "properties" : {
        "articleID" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "hidden" : {
          "type" : "boolean"
        },
        "postDate" : {
          "type" : "date"
        },
        "userID" : {
          "type" : "long"
        }
      }
    }
  }
}

按ID检索信息

GET /forum/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "term": {
          "userID": 1
        }
      }    
      
    }
  }
}

# 展示信息

{
  "took" : 4,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "articleID" : "XHDK-A-1293-#fJ3",
          "userID" : 1,
          "hidden" : false,
          "postDate" : "2017-01-01"
        }
      },
      {
        "_index" : "forum",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 1.0,
        "_source" : {
          "articleID" : "KDKE-B-9947-#kL5",
          "userID" : 1,
          "hidden" : false,
          "postDate" : "2017-01-02"
        }
      }
    ]
  }
}

# 查看隐藏信息
GET /forum/_search
{
    "query" : {
        "constant_score" : { 
            "filter" : {
                "term" : { 
                    "hidden" : false
                }
            }
        }
    }
}


# 信息ID
GET /forum/_search
{
    "query" : {
        "constant_score" : { 
            "filter" : {
                "term" : { 
                    "articleID" : "XHDK-A-1293-#fJ3"
                }
            }
        }
    }
}
GET /forum/_search
{
    "query" : {
        "constant_score" : { 
            "filter" : {
                "term" : { 
                    "articleID.keyword" : "XHDK-A-1293-#fJ3"
                }
            }
        }
    }
}

# articleID.keyword，是es最新版本内置建立的field，就是不分词的。
# 一次是articleID过来的时候，会建立两次索引，一次是自己本身，是要分词的，分词后放入倒排索引；
# 另外一次是基于articleID.keyword，不分词，保留256个字符最多，直接一个字符串放入倒排索引中。

查看分词

默认是analyzed的text类型的field，建立倒排索引的时候，就会对所有的articleID分词，分词以后，原本的articleID就没有了，只有分词后的各个word存在于倒排索引中。 term，是不对搜索文本分词的，XHDK-A-1293-#fJ3 –> XHDK-A-1293-#fJ3；但是articleID建立索引的时候，XHDK-A-1293-#fJ3 –> xhdk，a，1293，fj3

GET /forum/_analyze
{
  "field": "articleID",
  "text": "XHDK-A-1293-#fJ3"
}

重建索引

DELETE /forum

PUT /forum
{
  "mappings": {
      "properties": {
        "articleID": {
          "type": "keyword"
        }
      
    }
  }
}

多条件组合查询

搜索日期为2017-01-01或ID为XHDK-A-1293-#fJ3的信息 GET /forum/_search { "query": { "constant_score": { "filter": { "bool": { "should": [ {"term": { "postDate": "2017-01-01" }}, {"term": {"articleID": "XHDK-A-1293-#fJ3"}} ], "must_not": { "term": { "postDate": "2017-01-02" } } } } } } }
搜索ID为XHDK-A-1293-#fJ3或ID为JODL-X-1937-#pV7并且日期为2017-01-01的信息GET /forum/_search { "query": { "constant_score": { "filter": { "bool": { "should": [ { "term": { "articleID": "XHDK-A-1293-#fJ3" } }, { "bool": { "must": [ { "term":{ "articleID": "JODL-X-1937-#pV7" } }, { "term": { "postDate": "2017-01-01" } } ] } } ] } } } } }

模拟标签功能

POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"tag" : ["java", "hadoop"]} }
{ "update": { "_id": "2"} }
{ "doc" : {"tag" : ["java"]} }
{ "update": { "_id": "3"} }
{ "doc" : {"tag" : ["hadoop"]} }
{ "update": { "_id": "4"} }
{ "doc" : {"tag" : ["java", "elasticsearch"]} }

# 验证
GET /forum/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "terms": {
          "articleID": [
            "KDKE-B-9947-#kL5",
            "QQPX-R-3956-#aD8"
          ]
        }
      }
    }
  }
}

检索只有java的tag标签

# 查询出为包含了java的tag的信息
GET /forum/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "terms": {
          "tag":["java"]
        }
      }
    }
  }
}

POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"tag_cnt" : 2} }
{ "update": { "_id": "2"} }
{ "doc" : {"tag_cnt" : 1} }
{ "update": { "_id": "3"} }
{ "doc" : {"tag_cnt" : 1} }
{ "update": { "_id": "4"} }
{ "doc" : {"tag_cnt" : 2} }

GET /forum/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "bool": {
          "must": [
            {
              "term": {
                "tag_cnt": 1
              }
            },
            {
              "terms": {
                "tag": [
                  "java"
                ]
              }
            }
          ]
        }
      }
    }
  }
}

模拟查看次数

POST /forum/_bulk
{"update":{"_id":"1"}}
{"doc":{"view_cnt":30}}
{"update":{"_id":"2"}}
{"doc":{"view_cnt":50}}
{"update":{"_id":"3"}}
{"doc":{"view_cnt":100}}
{"update":{"_id":"4"}}
{"doc":{"view_cnt":80}}


GET /forum/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "range": {
          "view_cnt": {
            "gt": 30,
            "lte": 60
          }
        }
      }
    }
  }
}

模拟最近1个月的日期


POST /forum/_bulk
{ "index": { "_id": 5 }}
{ "articleID" : "DHJK-B-1395-#Ky5", "userID" : 3, "hidden": false, "postDate": "2017-03-01", "tag": ["elasticsearch"], "tag_cnt": 1, "view_cnt": 10 }


GET /forum/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "range": {
          "postDate": {
            "gte": "2017-01-20||-30d"
          }
        }
      }
    }
  }
}

模拟标题

POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"title" : "this is java and elasticsearch blog"} }
{ "update": { "_id": "2"} }
{ "doc" : {"title" : "this is java blog"} }
{ "update": { "_id": "3"} }
{ "doc" : {"title" : "this is elasticsearch blog"} }
{ "update": { "_id": "4"} }
{ "doc" : {"title" : "this is java, elasticsearch, hadoop blog"} }
{ "update": { "_id": "5"} }
{ "doc" : {"title" : "this is spark blog"} }

# 标题中包含 java 或 spark 的帖子

GET /forum/_search
{
  "query": {
    "match": {
      "title": "java spark"
    }
  }
}

# 标题中包含 java 和 elasticsearch 的帖子
GET /forum/_search
{
  "query": {
    "match": {
      "title" : {
        "query": "java elasticsearch",
        "operator": "and"
      }
    }
  }
}

# 包含java，elasticsearch，spark，hadoop，4个关键字中，至少3个帖子
GET /forum/_search
{
  "query": {
    "match": {
      "title": {
        "query": "java elasticsearch spark hadoop",
        "minimum_should_match": "75%"
      }
    }
  }
}

term + bool实现的nultiword的底层分析


# 普通match如何转换为term+should

{
    "match": { "title": "java elasticsearch"}
}

使用诸如上面的match query进行多值搜索的时候，es会在底层自动将这个match query转换为bool的语法
bool should，指定多个搜索词，同时使用term query

{
  "bool": {
    "should": [
      { "term": { "title": "java" }},
      { "term": { "title": "elasticsearch"   }}
    ]
  }
}


# and match如何转换为term+must

{
    "match": {
        "title": {
            "query":    "java elasticsearch",
            "operator": "and"
        }
    }
}

{
  "bool": {
    "must": [
      { "term": { "title": "java" }},
      { "term": { "title": "elasticsearch"   }}
    ]
  }
}


# minimum_should_match
{
    "match": {
        "title": {
            "query": "java elasticsearch hadoop spark",
            "minimum_should_match": "75%"
        }
    }
}

{
  "bool": {
    "should": [
      { "term": { "title": "java" }},
      { "term": { "title": "elasticsearch"   }},
      { "term": { "title": "hadoop" }},
      { "term": { "title": "spark" }}
    ],
    "minimum_should_match": 3 
  }
}

基于bootst的细粒度条件权重

需求：搜索标题中包含java的帖子，同时呢，如果标题中包含hadoop或elasticsearch就优先搜索出来，同时呢，如果一个帖子包含java hadoop，一个帖子包含java elasticsearch，包含hadoop的帖子要比elasticsearch优先搜索出来

GET /forum/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "title": "blog"
          }
        }
      ],
      "should": [
        {
          "match": {
            "title": {
              "query": "java"
            }
          }
        },
        {
          "match": {
            "title": {
              "query": "hadoop"
            }
          }
        },{
          "match": {
            "title": {
              "query": "elasticsearch"
            }
          }
        },
        {
          "match": {
            "title": {
              "query": "spark",
              "boost": 2
            }
          }
        }
      ]
    }
  }
}

多shard场景下relevance score不准确

生产环境下，数据量大，尽可能实现均匀分配

数据量很大的话，其实一般情况下，在概率学的背景下，es都是在多个shard中均匀路由数据的，路由的时候根据_id，负载均衡比如说有10个document，title都包含java，一共有5个shard，那么在概率学的背景下，如果负载均衡的话，其实每个shard都应该有2个doc，title包含java 如果说数据分布均匀的话，其实就没有刚才说的那个问题了

测试环境下，将索引的primary shard设置为1个，number_of_shards=1，index settings

如果说只有一个shard，那么当然，所有的document都在这个shard里面，就没有这个问题了

测试环境下，搜索附带search_type=dfs_query_then_fetch参数，会将local IDF取出来计算global IDF

计算一个doc的相关度分数的时候，就会将所有shard对的local IDF计算一下，获取出来，在本地进行global IDF分数的计算，会将所有shard的doc作为上下文来进行计算，也能确保准确性。但是production生产环境下，不推荐这个参数，因为性能很差。

dis_max实现best fields策略的多字段搜索


 # 添加contetnt内容

POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"content" : "i like to write best elasticsearch article"} }
{ "update": { "_id": "2"} }
{ "doc" : {"content" : "i think java is the best programming language"} }
{ "update": { "_id": "3"} }
{ "doc" : {"content" : "i am only an elasticsearch beginner"} }
{ "update": { "_id": "4"} }
{ "doc" : {"content" : "elasticsearch and hadoop are all very good solution, i am a beginner"} }
{ "update": { "_id": "5"} }
{ "doc" : {"content" : "spark is best big data solution based on scala ,an programming language similar to java"} }


GET /forum/_search
{
  "query": {
    "bool":{
      "should": [
        {"match": {
          "title": "java elasticsearch"
        }},
        {
          "match": {
            "content": "java solution"
          }
        }
      ]
    }
  }
}

# 结果分析

期望的是doc5，结果是doc2,doc4排在了前面

计算每个document的relevance score：每个query的分数，乘以matched query数量，除以总query数量

算一下doc4的分数

{ "match": { "title": "java solution" }}，针对doc4，是有一个分数的
{ "match": { "content":  "java solution" }}，针对doc4，也是有一个分数的

所以是两个分数加起来，比如说，1.1 + 1.2 = 2.3
matched query数量 = 2
总query数量 = 2

2.3 * 2 / 2 = 2.3

算一下doc5的分数

{ "match": { "title": "java solution" }}，针对doc5，是没有分数的
{ "match": { "content":  "java solution" }}，针对doc5，是有一个分数的

所以说，只有一个query是有分数的，比如2.3
matched query数量 = 1
总query数量 = 2

2.3 * 1 / 2 = 1.15

doc5的分数 = 1.15 < doc4的分数 = 2.3

best fields策略，就是说，搜索到的结果，应该是某一个field中匹配到了尽可能多的关键词，被排在前面；而不是尽可能多的field匹配到了少数的关键词，排在了前面



GET /forum/_search
{
    "query": {
        "dis_max": {
            "queries": [
                { "match": { "title": "java solution" }},
                { "match": { "content":  "java solution" }}
            ]
        }
    }
}

tie_breaker优化dis_max

可能在实际场景中出现的一个情况是这样的：

（1）某个帖子，doc1，title中包含java，content不包含java beginner任何一个关键词（2）某个帖子，doc2，content中包含beginner，title中不包含任何一个关键词（3）某个帖子，doc3，title中包含java，content中包含beginner （4）最终搜索，可能出来的结果是，doc1和doc2排在doc3的前面，而不是我们期望的doc3排在最前面

dis_max，只是取分数最高的那个query的分数而已,tie_breaker参数的意义，在于说，将其他query的分数，乘以tie_breaker，然后综合与最高分数的那个query的分数，综合在一起进行计算(tie_breaker的值，在0~1之间，是个小数)

GET /forum/_search
{
  "query": {
    "dis_max": {
      
      "queries": [
        {
          "match": {
            "title": "java solution"
          }
        },
        {
          "match": {
            "content": "java solution"
          }
        }
        
        ],
        "tie_breaker": 0.1
    }
  }
}

multi_math & minimum_should_match

minimum_should_match 去长尾

GET /forum/_search
{
  "query": {
    "multi_match": {
      "query": "java solution",
      "fields": ["title","content"],
      "type": "best_fields",
      "tie_breaker": 0.3,
      "minimum_should_match": "50%"
    }
  }
}

most-fields策略

best-fields策略，主要是说将某一个field匹配尽可能多的关键词的doc优先返回回来
most-fields策略，主要是说尽可能返回更多field匹配到某个关键词的doc，优先返回回来PUT /forum/_mapping { "properties": { "sub_title": { "type": "text", "analyzer": "english", "fields": { "std": { "type": "text", "analyzer": "standard" } } } } } POST /forum/_bulk { "update": { "_id": "1"} } { "doc" : {"sub_title" : "learning more courses"} } { "update": { "_id": "2"} } { "doc" : {"sub_title" : "learned a lot of course"} } { "update": { "_id": "3"} } { "doc" : {"sub_title" : "we have a lot of fun"} } { "update": { "_id": "4"} } { "doc" : {"sub_title" : "both of them are good"} } { "update": { "_id": "5"} } { "doc" : {"sub_title" : "haha, hello world"} }

cross-fields

cross-fields搜索，一个唯一标识，跨了多个field

跨多个field搜索一个标识，比如搜索一个人名，或者一个地址，就是cross-fields搜索


POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"author_first_name" : "Peter", "author_last_name" : "Smith"} }
{ "update": { "_id": "2"} }
{ "doc" : {"author_first_name" : "Smith", "author_last_name" : "Williams"} }
{ "update": { "_id": "3"} }
{ "doc" : {"author_first_name" : "Jack", "author_last_name" : "Ma"} }
{ "update": { "_id": "4"} }
{ "doc" : {"author_first_name" : "Robbin", "author_last_name" : "Li"} }
{ "update": { "_id": "5"} }
{ "doc" : {"author_first_name" : "Tonny", "author_last_name" : "Peter Smith"} }


GET /forum/_search
{
  "query": {
    "multi_match": {
      "query": "Peter Smith",
      "fields": ["author_first_name", "author_last_name"],
      "type": "cross_fields"
    }
  }
}

GET /forum/_search
{
  "query": {
    "multi_match": {
      "query": "Peter Smith",
      "type": "cross_fields", 
      "operator": "and",
      "fields": ["author_first_name", "author_last_name"]
    }
  }
}

copy_to

将多个field组合成一个field


PUT /forum/_mapping
{
  "properties": {
    "new_author_full_name": {
          "type":     "text"
      },
      "new_author_first_name": {
          "type":     "text",
          "copy_to":  "new_author_full_name" 
      },
      "new_author_last_name": {
          "type":     "text",
          "copy_to":  "new_author_full_name" 
      }
      
  }
}

POST /forum/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"new_author_first_name" : "Peter", "new_author_last_name" : "Smith"} }	
{ "update": { "_id": "2"} }	
{ "doc" : {"new_author_first_name" : "Smith", "new_author_last_name" : "Williams"} }
{ "update": { "_id": "3"} }
{ "doc" : {"new_author_first_name" : "Jack", "new_author_last_name" : "Ma"} }
{ "update": { "_id": "4"} }
{ "doc" : {"new_author_first_name" : "Robbin", "new_author_last_name" : "Li"} }
{ "update": { "_id": "5"} }
{ "doc" : {"new_author_first_name" : "Tonny", "new_author_last_name" : "Peter Smith"} }

近似匹配

phrase match，proximity match：短语匹配，近似匹配


# match_phrase语法 

GET /forum/_search
{
    "query": {
        "match_phrase": {
            "content": "big data"
        }
    }
}
只有包含java spark这个短语的doc才返回了，只包含java的doc不会返回


# 测试分词
GET _analyze
{
  "text": "hello world, java spark",
  "analyzer": "standard"
}


# 中间隔一个短语
GET /forum/_search
{
  "query": {
    "match_phrase": {
      "content": {
        "query": "big solution",
        "slop": 1
      }
    }
  }
}

slop：最多可以移动几次，获取此短语

优先满足召回率

优先满足召回率，比如，java spark，包含java的也返回，包含spark的也返回，包含java和spark的也返回；同时兼顾精准度，就是包含java和spark，同时java和spark离的越近的doc排在最前面


GET /forum/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "content": "java spark"
          }
        }
      ],
      "should": [
        {
          "match_phrase": {
            "content": {
              "query": "java spark",
              "slop":50
            }
          }
        }
      ]
    }
  }
}

rescoring机制(重新打分)

默认情况下，match也许匹配了1000个doc，proximity match全都需要对每个doc进行一遍运算，判断能否slop移动匹配上，然后去贡献自己的分数

但是很多情况下，match出来也许1000个doc，其实用户大部分情况下是分页查询的，所以可能最多只会看前几页，比如一页是10条，最多也许就看5页，就是50条

proximity match只要对前50个doc进行slop移动去匹配，去贡献自己的分数即可，不需要对全部1000个doc都去进行计算和贡献分数


GET /forum/_search
{
  "query": {
    "match": {
      "content": "java spark"
    }
  },
  "rescore": {
    "query": {
      "rescore_query": {
        "match_phrase": {
          "content": {
            "query": "java spark",
            "slop": 50
          }
        }
      }
    },
    "window_size": 50
  }
}

bulkupdate

上一篇：这样优化Elasticsearch，显著提升写入速度
下一篇：JDBC Batch Updates(二) jdbc批量更新三种模式

ElasticSearch学习笔记一 elasticsearch入门教程

环境搭建

模拟帖子数据

_bulk

查看结构

按ID检索信息

查看分词

重建索引

多条件组合查询

模拟标签功能

模拟查看次数

模拟最近1个月的日期

模拟标题

相关度计算

term + bool实现的nultiword的底层分析

基于bootst的细粒度条件权重

多shard场景下relevance score不准确

dis_max实现best fields策略的多字段搜索

tie_breaker优化dis_max

multi_math & minimum_should_match

most-fields策略

cross-fields

copy_to

近似匹配

优先满足召回率

rescoring机制(重新打分)

相关推荐

取消回复欢迎你发表评论:

织梦(Dedecms)建站教程织梦建站详细步骤

【开源分享】2024在线客服系统PHP源码(安装教程+全新UI)

2024PHP在线客服系统源码+完全开源带详细搭建教程

2024php在线客服系统实时聊天系统源码

2024最新php在线客服系统源码(免费开源+全新UI+终身使用)

【开源分享】2024PHP在线客服系统源码(搭建教程+终身使用)

如何做一个搜索网站?如何做一个外贸网站?dedecms建站教程全集

网站建站基础第十二课(dedecms基础搭建教程)

dedecms网站如何logo设计和ico站标如何替换

SEO入门学习之安装Dede模板网站-耀途盛世

ElasticSearch学习笔记一 elasticsearch入门教程

环境搭建

模拟帖子数据

_bulk

查看结构

按ID检索信息

查看分词

重建索引

多条件组合查询

模拟标签功能

模拟查看次数

模拟最近1个月的日期

模拟标题

相关度计算

term + bool实现的nultiword的底层分析

基于bootst的细粒度条件权重

多shard场景下relevance score不准确

dis_max实现best fields策略的多字段搜索

tie_breaker优化dis_max

multi_math & minimum_should_match

most-fields策略

cross-fields

copy_to

近似匹配

优先满足召回率

rescoring机制(重新打分)

相关推荐

取消回复欢迎 你 发表评论:

织梦(Dedecms)建站教程 织梦建站详细步骤

【开源分享】2024在线客服系统PHP源码(安装教程+全新UI)

2024PHP在线客服系统源码+完全开源 带详细搭建教程

2024php在线客服系统实时聊天系统源码

2024最新php在线客服系统源码(免费开源+全新UI+终身使用)

【开源分享】2024PHP在线客服系统源码(搭建教程+终身使用)

如何做一个搜索网站?如何做一个外贸网站?dedecms建站教程全集

网站建站基础第十二课(dedecms基础搭建教程)

dedecms网站如何logo设计和ico站标如何替换

SEO入门学习之安装Dede模板网站-耀途盛世

取消回复欢迎你发表评论:

织梦(Dedecms)建站教程织梦建站详细步骤

2024PHP在线客服系统源码+完全开源带详细搭建教程