1. 集群健康度
2. Health相关的API
| 解释 | API |
| --- | --- |
| 集群的状态(检查节点数量) | GET _cluster/health |
| 所有索引的健康状态(查看有问题的索引) | GET _cluster/health?level=indices |
| 单个索引的健康状态(查看具体的索引) | GET _cluster/health/my_index |
| 分片级的健康状态 | GET _cluster/health?level=shards |
| 返回第一个未分配 Shard 的原因 | GET _cluster/allocation/explain |
示例1:获取索引的健康值
http://IP:9200/_cat/health
1635313779 05:49:39 kubernetes-logging red 15 10 2128 1064 0 0 32 0 - 98.5%
1635328870 10:01:10 kubernetes-logging green 15 10 2160 1080 2 0 0 0 - 100.0%
Kibana查看
GET _cat/health
示例2:集群的状态(检查节点数量)
# 浏览器查看
http://IP:9200/_cluster/health
# 结果
{"cluster_name":"kubernetes-logging","status":"red","timed_out":false,"number_of_nodes":15,
"number_of_data_nodes":10,"active_primary_shards":1064,"active_shards":2128,"relocating_shards":0,
"initializing_shards":0,"unassigned_shards":32,"delayed_unassigned_shards":0,"number_of_pending_tasks":0,
"number_of_in_flight_fetch":0,"task_max_waiting_in_queue_millis":0,"active_shards_percent_as_number":98.51851851851852}
Kibana查看
GET _cluster/health
示例3:所有索引的健康状态
http://IP:9200/_cluster/health?level=indices
Kibana 查看
GET _cluster/health?level=indices
示例4:单个索引的健康状态(查看具体的索引)
http://IP:9200/_cluster/health/dev-tool-deployment-service
{"cluster_name":"kubernetes-logging","status":"red","timed_out":false,"number_of_nodes":15,
"number_of_data_nodes":10,"active_primary_shards":2,"active_shards":4,"relocating_shards":0,
"initializing_shards":0,"unassigned_shards":6,"delayed_unassigned_shards":0,"number_of_pending_tasks":0,
"number_of_in_flight_fetch":0,"task_max_waiting_in_queue_millis":0,"active_shards_percent_as_number":98.52534562211981}
kibana 查看
GET _cluster/health/my_index
3. 集群健康与问题排查
3.1 启动 Elasticsearch 集群
cat docker-compose.yaml
# Docker Compose stack for a 3-node hot/warm/cold Elasticsearch 7.1.0 cluster,
# plus Kibana and Cerebro (cluster admin UI), all on one bridge network.
version: '2.2'
services:
  cerebro:
    image: lmenezes/cerebro:0.8.3
    container_name: hwc_cerebro
    ports:
      # Quoted: YAML 1.1 parses some unquoted N:M port mappings as sexagesimal ints.
      - "9000:9000"
    command:
      # Point Cerebro at the hot node's HTTP endpoint via the compose service name.
      - -Dhosts.0.host=http://elasticsearch:9200
    networks:
      - hwc_es7net
  kibana:
    image: docker.elastic.co/kibana/kibana:7.1.0
    container_name: hwc_kibana7
    environment:
      # - I18N_LOCALE=zh-CN
      - XPACK_GRAPH_ENABLED=true
      - TIMELION_ENABLED=true
      # No quotes around the value: in list-form `environment`, quotes become
      # part of the env var value (Kibana would receive the string `"true"`).
      - XPACK_MONITORING_COLLECTION_ENABLED=true
    ports:
      - "5601:5601"
    networks:
      - hwc_es7net
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.1.0
    container_name: es7_hot
    environment:
      - cluster.name=geektime-hwc
      - node.name=es7_hot
      # Custom node attribute used by the shard-filtering demos below
      # (index.routing.allocation.require.box_type).
      - node.attr.box_type=hot
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
      - discovery.seed_hosts=es7_hot,es7_warm,es7_cold
      - cluster.initial_master_nodes=es7_hot,es7_warm,es7_cold
    ulimits:
      # Allow unlimited locked memory so bootstrap.memory_lock=true can succeed.
      memlock:
        soft: -1
        hard: -1
    volumes:
      - hwc_es7data_hot:/usr/share/elasticsearch/data
    ports:
      # Only the hot node exposes the HTTP API to the host.
      - "9200:9200"
    networks:
      - hwc_es7net
  elasticsearch2:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.1.0
    container_name: es7_warm
    environment:
      - cluster.name=geektime-hwc
      - node.name=es7_warm
      - node.attr.box_type=warm
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
      - discovery.seed_hosts=es7_hot,es7_warm,es7_cold
      - cluster.initial_master_nodes=es7_hot,es7_warm,es7_cold
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - hwc_es7data_warm:/usr/share/elasticsearch/data
    networks:
      - hwc_es7net
  elasticsearch3:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.1.0
    container_name: es7_cold
    environment:
      - cluster.name=geektime-hwc
      - node.name=es7_cold
      - node.attr.box_type=cold
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
      - discovery.seed_hosts=es7_hot,es7_warm,es7_cold
      - cluster.initial_master_nodes=es7_hot,es7_warm,es7_cold
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - hwc_es7data_cold:/usr/share/elasticsearch/data
    networks:
      - hwc_es7net
volumes:
  hwc_es7data_hot:
    driver: local
  hwc_es7data_warm:
    driver: local
  hwc_es7data_cold:
    driver: local
networks:
  hwc_es7net:
    driver: bridge
案例1
DELETE mytest
PUT mytest
{
"settings":{
"number_of_shards":3,
"number_of_replicas":0,
"index.routing.allocation.require.box_type":"hott"
}
}
GET /_cluster/health/
GET /_cluster/health?level=indices
GET _cluster/health?level=shards
GET /_cluster/allocation/explain
GET /_cat/shards/mytest
GET _cat/nodeattrs
DELETE mytest
GET /_cluster/health/
PUT mytest
{
"settings":{
"number_of_shards":3,
"number_of_replicas":0,
"index.routing.allocation.require.box_type":"hot"
}
}
GET /_cluster/health/
案例2:Explain 看 hot 上的 explain
DELETE mytest
PUT mytest
{
"settings":{
"number_of_shards":2,
"number_of_replicas":1,
"index.routing.allocation.require.box_type":"hot"
}
}
GET _cluster/health
GET _cat/shards/mytest
GET /_cluster/allocation/explain
PUT mytest/_settings
{
"number_of_replicas": 0
}
4. 分片没有被分配的一些原因
- INDEX_CREATED:创建索引导致。在索引的全部分片分配完成之前,会有短暂的 Red,不一定代表有问题
- CLUSTER_RECOVERED:集群重启阶段,会有这个问题
- INDEX_REOPENED:Open 一个之前 Close 的索引
- DANGLING_INDEX_IMPORTED:一个节点离开集群期间,有索引被删除,这个节点重新返回时,会导致 Dangling 的问题
5. 常见问题与解决办法
- 集群变红,需要检查是否有节点离线,如果有,通常通过重启离线的节点就可以解决问题
- 由于配置导致的问题,需要修复相关的配置(例如错误的 box_type,错误的副本数)
- 因为磁盘空间限制,分片规则(Shard Filtering)引发的,需要调整规则或者增加节点
- 对于节点返回集群,导致 dangling 变红,可直接删除 dangling 索引
6. 集群 Red & Yellow 问题的总结
- Red & Yellow 是集群运维中常见的问题
- 除了集群故障,一些创建,增加副本等操作,都会导致集群短暂的 Red 和 Yellow,所以监控和报警时需要设置一定的延时
- 通过检查节点数,使用 ES 提供的相关 API,找到真正的原因
- 可以指定 Move 或者 Reallocate 分片
POST _cluster/reroute
{
"commands": [
{
"move": {
"index": "index_name",
"shard": 0,
"from_node": "node_name_1",
"to_node": "node_name_2"
}
}
]
}
POST _cluster/reroute?explain
{
  "commands": [
    {
      "allocate_replica": {
        "index": "index_name",
        "shard": 0,
        "node": "nodename"
      }
    }
  ]
}
|