文章目录
- 通用建议
- 主动 disable 不使用的 field 特性
- 不要使用默认字符串 mappings
- Testing
- 精准搜索(exact)和词根搜索(stemming)混合
- 字段值参与 score 的计算
- _recovery
- _freeze
 
通用建议
 
- 不要返回过大的文档集(结果集);如果确实需要,使用 Scroll API。
- 避免单个大文档。ES 默认限制单个请求最大 100M(http.max_content_length),可以调整,但是 Lucene 仍然有大约 2GB 的限制。
- 多请求时,使用 _bulk。但是自己的集群一次 _bulk 操作多少个 document 最合适,需要在集群中做 benchmark,可通过二分查找的方式快速定位这个数字。
- 在集群启动的时候,为了加速加载过程,可以做两项设置:index.refresh_interval=-1、index.number_of_replicas=0;在集群启动之后,再调整这两个数值。
- 至少把运行 ES 的机器的一半内存留给 the filesystem cache,ES 的搜索速度很大程度上依赖于 the filesystem cache。
- 优先使用 auto-generated id。如果使用自定义 id,在索引文档时就必须判断该 ID 是否已存在,随着数据量变大,这个过程的开销将会越来越大。
- nested 会让查询速度慢几倍,parent-child 会让查询速度慢几百倍,能不用就不用。
- 把多个字段合并到一个字段上搜索,有助于提高搜索速度,利用 copy_to 可以做到:
PUT movies
{
  "mappings": {
    "properties": {
      "name_and_plot": {
        "type": "text"
      },
      "name": {
        "type": "text",
        "copy_to": "name_and_plot"
      },
      "plot": {
        "type": "text",
        "copy_to": "name_and_plot"
      }
    }
  }
}
 
pre-index data,提高搜索速度:如果聚合总是使用固定的区间,可以在索引时预先把区间写入一个 keyword 字段(如下面的 price_range),这样就可以用 terms 聚合代替 range 聚合。
PUT index/_doc/1
{
  "designation": "spoon",
  "price": 13
}
GET index/_search
{
  "aggs": {
    "price_ranges": {
      "range": {
        "field": "price",
        "ranges": [
          { "to": 10 },
          { "from": 10, "to": 100 },
          { "from": 100 }
        ]
      }
    }
  }
}
PUT index
{
  "mappings": {
    "properties": {
      "price_range": {
        "type": "keyword"
      }
    }
  }
}
PUT index/_doc/1
{
  "designation": "spoon",
  "price": 13,
  "price_range": "10-100"
}
GET index/_search
{
  "aggs": {
    "price_ranges": {
      "terms": {
        "field": "price_range"
      }
    }
  }
}
 
- 增加副本数一定会提高吞吐量吗?不一定。合理的副本数为:max(max_failures, ceil(num_nodes / num_primaries) - 1)。
- 通过 Profile API 分析查询的耗时情况。它只反映相对情况,绝对数值没有太多意义:
GET /twitter/_search
{
  "profile": true,
  "query" : {
    "match" : { "message" : "some number" }
  }
}
 
主动disable不使用的field特性
 
只需要 histograms(聚合),不需要在该字段上 filter 时,可以关闭 index:
PUT index
{
  "mappings": {
    "properties": {
      "foo": {
        "type": "integer",
        "index": false
      }
    }
  }
}
 
只关心是否 match,而不关心 score 时,可以关闭 norms:
PUT index
{
  "mappings": {
    "properties": {
      "foo": {
        "type": "text",
        "norms": false
      }
    }
  }
}
 
不需要 phrase queries, 告诉ES不需要索引位置信息 
PUT index
{
  "mappings": {
    "properties": {
      "foo": {
        "type": "text",
        "index_options": "freqs"
      }
    }
  }
}
 
不关心得分,不需要phrase queries 
PUT index
{
  "mappings": {
    "properties": {
      "foo": {
        "type": "text",
        "norms": false,
        "index_options": "freqs"
      }
    }
  }
}
 
不要使用默认字符串mappings
 
默认的字符串 mapping 会把 field 同时索引为 text 和 keyword。但是很多情况下我们只需要 keyword,这时可以自定义 dynamic_templates:
 
PUT index
{
  "mappings": {
    "dynamic_templates": [
      {
        "strings": {
          "match_mapping_type": "string",
          "mapping": {
            "type": "keyword"
          }
        }
      }
    ]
  }
}
 
Testing
 
<dependencies>
  <dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-test-framework</artifactId>
    <version>${lucene.version}</version>
    <scope>test</scope>
  </dependency>
  <dependency>
    <groupId>org.elasticsearch.test</groupId>
    <artifactId>framework</artifactId>
    <version>${elasticsearch.version}</version>
    <scope>test</scope>
  </dependency>
</dependencies>
 
精准搜索(exact)和词根搜索(stemming)混合
 
普通搜索是基于词根(stemming)的,但是如何让特定的词不进行词根搜索呢?在 simple_query_string 搜索中,query 里被引号(quote)括起来的词会在「字段名 + quote_field_suffix」对应的字段上进行搜索;把 quote_field_suffix 设置为 ".exact",使其指向 exact 子字段,即可实现精准搜索。
 
PUT index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "english_exact": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "body": {
        "type": "text",
        "analyzer": "english",
        "fields": {
          "exact": {
            "type": "text",
            "analyzer": "english_exact"
          }
        }
      }
    }
  }
}
PUT index/_doc/1
{
  "body": "Ski resort"
}
PUT index/_doc/2
{
  "body": "A pair of skis"
}
POST index/_refresh
GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "body" ],
      "query": "ski"
    }
  }
}
GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "body.exact" ],
      "query": "ski"
    }
  }
}
GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "body" ],
      "quote_field_suffix": ".exact",
      "query": "\"ski\""
    }
  }
}
 
字段值参与score的计算
 
script_score 
PUT script_score_index
{
  "mappings": {
    "properties": {
      "url":{
        "type": "text"
      },
      "pagerank": {
        "type": "long" 
      },
      "url_length": {
        "type": "rank_feature",
        "positive_score_impact": false 
      }
    }
  }
}
PUT script_score_index/_doc/1
{
  "content":"elasticsearch",
  "pagerank": 1,
  "url_length": 22
}
PUT script_score_index/_doc/2
{
  "content":"elasticsearch",
  "pagerank": 8,
  "url_length": 22
}
GET script_score_index/_search
{
  "query": {
    "script_score": {
      "query": {
        "match": {
          "content": "elasticsearch"
        }
      },
      "script": {
        "source": "_score*saturation(doc['pagerank'].value,10)"
      }
    }
  }
}
 
rank_feature 
PUT rank_feature_index
{
  "mappings": {
    "properties": {
      "url":{
        "type": "text"
      },
      "pagerank": {
        "type": "rank_feature" 
      },
      "url_length": {
        "type": "rank_feature",
        "positive_score_impact": false 
      }
    }
  }
}
PUT rank_feature_index/_doc/1
{
  "content":"elasticsearch",
  "pagerank": 1,
  "url_length": 22
}
PUT rank_feature_index/_doc/2
{
  "content":"elasticsearch",
  "pagerank": 8,
  "url_length": 22
}
GET rank_feature_index/_search
{
  "query": {
    "rank_feature": {
      "field": "pagerank"
    }
  }
}
GET rank_feature_index/_search
{
  "query": {
    "bool": {
      "must": {
        "match": {
          "content": "elasticsearch"
        }
      },
      "should": {
        "rank_feature": {
          "field": "pagerank",
          "saturation": {
            "pivot": 10
          }
        }
      }
    }
  }
}
 
_recovery
 
GET kibana_sample_data_ecommerce,kibana_sample_data_flights/_recovery?human
GET /_recovery?human
GET _recovery?human&detailed=true
 
_freeze
 
- 普通索引会把一部分数据缓存在内存中,使得索引数据的时候速度特别快。
- 但是,对于时间序列相关的搜索,每次搜索的结果可能都大不相同,为这类索引缓存相关数据没有意义,并且还会消耗大量内存。
- 不需要缓存的索引适合使用 _freeze;重新需要缓存时,使用 _unfreeze。
- freeze index 在未来发生变化的可能性一般是非常低的。
- 最佳实践:由于 freeze index 未来变化的可能性非常低,推荐在 freeze index 之前先进行 _forcemerge,以保证每个分片(shard)在磁盘上只有一个段(segment)。这样不仅可以提供更好的压缩,也可以简化查询时所需的数据结构。
POST /my_index/_freeze
POST /my_index/_unfreeze
POST /twitter/_forcemerge?max_num_segments=1
GET /twitter/_search?q=user:kimchy&ignore_throttled=false
# 
# sth: true if the index frozen
#
GET /_cat/indices/*?v&h=i,sth