Environment: KubeSphere 3.0
I recently noticed that the system log volume of one KubeSphere cluster is abnormally high (roughly 400,000 entries every half hour). Tracking it down, the source is the elasticsearch-logging-discovery node in the kubesphere-logging-system namespace, which keeps flooding errors like the one below:
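(For reference, the trace was grabbed by tailing the discovery pod directly; assuming the Elasticsearch node name in the log maps to the pod of the same name, something like:)

kubectl -n kubesphere-logging-system logs --tail=200 elasticsearch-logging-discovery-2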
[2020-11-05T02:42:26,308][WARN ][o.e.c.r.a.AllocationService] [elasticsearch-logging-discovery-2] failing shard [failed shard, shard [ks-logstash-log-2020.11.05][1], node[p4qSYifaQy-bjnxM5zLV7Q], relocating [msZhxFHhShWjnNi-VEIjWQ], [P], recovery_source[peer recovery], s[INITIALIZING], a[id=3V_DF_SGRNC0dKniEjFYEA, rId=fGakFS9_RuOsnfv-rG9LYA], expected_shard_size[591323645], message [failed to create shard], failure [IllegalStateException[environment is not locked]; nested: IOException[Stale file handle]; ], markAsStale [true]]
java.lang.IllegalStateException: environment is not locked
at org.elasticsearch.env.NodeEnvironment.assertEnvIsLocked(NodeEnvironment.java:1025) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.env.NodeEnvironment.availableShardPaths(NodeEnvironment.java:840) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.index.shard.ShardPath.loadShardPath(ShardPath.java:120) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.index.IndexService.createShard(IndexService.java:340) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.IndicesService.createShard(IndicesService.java:623) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.IndicesService.createShard(IndicesService.java:158) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.cluster.IndicesClusterStateService.createShard(IndicesClusterStateService.java:597) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.cluster.IndicesClusterStateService.createOrUpdateShards(IndicesClusterStateService.java:573) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.cluster.IndicesClusterStateService.applyClusterState(IndicesClusterStateService.java:270) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.cluster.service.ClusterApplierService.lambda$callClusterStateAppliers$6(ClusterApplierService.java:484) ~[elasticsearch-6.7.0.jar:6.7.0]
at java.lang.Iterable.forEach(Iterable.java:75) ~[?:?]
at org.elasticsearch.cluster.service.ClusterApplierService.callClusterStateAppliers(ClusterApplierService.java:481) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.cluster.service.ClusterApplierService.applyChanges(ClusterApplierService.java:468) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.cluster.service.ClusterApplierService.runTask(ClusterApplierService.java:419) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.cluster.service.ClusterApplierService$UpdateTask.run(ClusterApplierService.java:163) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:681) [elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.runAndClean(PrioritizedEsThreadPoolExecutor.java:252) [elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.run(PrioritizedEsThreadPoolExecutor.java:215) [elasticsearch-6.7.0.jar:6.7.0]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) [?:?]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) [?:?]
at java.lang.Thread.run(Thread.java:835) [?:?]
Suppressed: java.lang.IllegalStateException: environment is not locked
at org.elasticsearch.env.NodeEnvironment.assertEnvIsLocked(NodeEnvironment.java:1025) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.env.NodeEnvironment.availableShardPaths(NodeEnvironment.java:840) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.index.shard.ShardPath.deleteLeftoverShardDirectory(ShardPath.java:176) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.index.IndexService.createShard(IndexService.java:344) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.IndicesService.createShard(IndicesService.java:623) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.IndicesService.createShard(IndicesService.java:158) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.cluster.IndicesClusterStateService.createShard(IndicesClusterStateService.java:597) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.cluster.IndicesClusterStateService.createOrUpdateShards(IndicesClusterStateService.java:573) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.indices.cluster.IndicesClusterStateService.applyClusterState(IndicesClusterStateService.java:270) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.cluster.service.ClusterApplierService.lambda$callClusterStateAppliers$6(ClusterApplierService.java:484) ~[elasticsearch-6.7.0.jar:6.7.0]
at java.lang.Iterable.forEach(Iterable.java:75) ~[?:?]
at org.elasticsearch.cluster.service.ClusterApplierService.callClusterStateAppliers(ClusterApplierService.java:481) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.cluster.service.ClusterApplierService.applyChanges(ClusterApplierService.java:468) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.cluster.service.ClusterApplierService.runTask(ClusterApplierService.java:419) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.cluster.service.ClusterApplierService$UpdateTask.run(ClusterApplierService.java:163) ~[elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:681) [elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.runAndClean(PrioritizedEsThreadPoolExecutor.java:252) [elasticsearch-6.7.0.jar:6.7.0]
at org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor$TieBreakingPrioritizedRunnable.run(PrioritizedEsThreadPoolExecutor.java:215) [elasticsearch-6.7.0.jar:6.7.0]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) [?:?]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) [?:?]
at java.lang.Thread.run(Thread.java:835) [?:?]
Caused by: java.io.IOException: Stale file handle
at sun.nio.ch.FileDispatcherImpl.size0(Native Method) ~[?:?]
at sun.nio.ch.FileDispatcherImpl.size(FileDispatcherImpl.java:90) ~[?:?]
at sun.nio.ch.FileChannelImpl.size(FileChannelImpl.java:383) ~[?:?]
at org.apache.lucene.store.NativeFSLockFactory$NativeFSLock.ensureValid(NativeFSLockFactory.java:182) ~[lucene-core-7.7.0.jar:7.7.0 8c831daf4eb41153c25ddb152501ab5bae3ea3d5 - jimczi - 2019-02-04 23:16:28]
at org.elasticsearch.env.NodeEnvironment.assertEnvIsLocked(NodeEnvironment.java:1022) ~[elasticsearch-6.7.0.jar:6.7.0]
... 20 more
Caused by: java.io.IOException: Stale file handle
at sun.nio.ch.FileDispatcherImpl.size0(Native Method) ~[?:?]
at sun.nio.ch.FileDispatcherImpl.size(FileDispatcherImpl.java:90) ~[?:?]
at sun.nio.ch.FileChannelImpl.size(FileChannelImpl.java:383) ~[?:?]
at org.apache.lucene.store.NativeFSLockFactory$NativeFSLock.ensureValid(NativeFSLockFactory.java:182) ~[lucene-core-7.7.0.jar:7.7.0 8c831daf4eb41153c25ddb152501ab5bae3ea3d5 - jimczi - 2019-02-04 23:16:28]
at org.elasticsearch.env.NodeEnvironment.assertEnvIsLocked(NodeEnvironment.java:1022) ~[elasticsearch-6.7.0.jar:6.7.0]
... 20 more
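From the trace, the nested cause is java.io.IOException: Stale file handle thrown while Lucene re-validates the node.lock file, and the failed-shard header references node[p4qSYifaQy-bjnxM5zLV7Q]. To map that node id to a concrete pod I listed the nodes with their full ids (I assume this is the right way to do it):

sh-4.2$ curl 'localhost:9200/_cat/nodes?v&full_id=true&h=id,ip,name'

In my case that id belongs to elasticsearch-logging-data-1, which is also the relocation target in the shard listing at the end of this post.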
A quick look at the ES cluster state did not show any obvious impact, and searching around did not turn up a suitable fix either. I am not very familiar with Elasticsearch, so how should I go about stopping this exception from flooding the logs?
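Since "Stale file handle" usually points at an NFS-backed volume whose handle went stale, the only idea I have so far (and I am not sure it is safe while shards are still relocating) is to bounce the affected data pod so it re-opens its lock file, roughly:

kubectl -n kubesphere-logging-system delete pod elasticsearch-logging-data-1

The StatefulSet controller should recreate the pod afterwards. Is that a reasonable approach here, or is there a cleaner fix?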
sh-4.2$ curl localhost:9200/_cluster/health?pretty
{
  "cluster_name" : "elasticsearch",
  "status" : "green",
  "timed_out" : false,
  "number_of_nodes" : 8,
  "number_of_data_nodes" : 5,
  "active_primary_shards" : 150,
  "active_shards" : 300,
  "relocating_shards" : 2,
  "initializing_shards" : 0,
  "unassigned_shards" : 0,
  "delayed_unassigned_shards" : 0,
  "number_of_pending_tasks" : 4,
  "number_of_in_flight_fetch" : 0,
  "task_max_waiting_in_queue_millis" : 77,
  "active_shards_percent_as_number" : 100.0
}
sh-4.2$ curl localhost:9200/_cat/indices?v
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
green open ks-logstash-log-2020.10.27 _YknKv4NQUS-lWpLXPi90Q 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-auditing-2020.10.30 pDCyC8oDStWSD-sJbVH_OQ 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-log-2020.11.04 VtK3ceLoRGqpoEGKoznbGw 5 1 63526638 0 17.7gb 8.8gb
green open ks-logstash-auditing-2020.10.31 Edxt-oNJSWKBScVce1paYg 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-log-2020.11.03 MwGAcaHJSh6r5Lor8MpZRg 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-events-2020.11.01 2Iul-sPTR9W7BSZBzbtDcA 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-log-2020.11.05 gXp13nYaSySTWyJejRFkmw 5 1 14919861 0 4.7gb 2.3gb
green open ks-logstash-auditing-2020.11.05 5-VGMR3bR8aFREtAQe80UQ 5 1 66 0 758.2kb 379.1kb
green open ks-logstash-events-2020.10.29 cJhZmSoBTzKv6yPce0e0XQ 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-events-2020.10.31 x7xrf0jSRy2NN0dEjbr1Tw 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-auditing-2020.11.04 K9QAgxeaSSOoRvhoaBSlfw 5 1 328 0 1.2mb 688.4kb
green open ks-logstash-events-2020.11.03 CdL1A2SpRuqA0D3eQYvm6w 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-log-2020.11.02 Dk17sceQSk6Gk3gQqlbEpg 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-events-2020.11.02 w7-HXDP_SgCV-i5GWY-RvA 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-events-2020.11.04 5ICGq9neQ1SqzwW8No8V-A 5 1 13573 0 14.2mb 7mb
green open ks-logstash-events-2020.10.30 tIqV8MbsTeagTgOat6VJQg 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-auditing-2020.10.28 QEvDduBvTA2iu4TDtWrP0Q 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-auditing-2020.11.03 u3VIPtIsTcqTBK23s2YR1Q 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-auditing-2020.10.29 g1FPdSJ9QUmlc0n7Da79sQ 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-log-2020.10.30 71GDd76KRf-VzUFLLaTWqg 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-auditing-2020.11.01 XZ5CGYNGTuSqA5fHeIwIEg 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-events-2020.10.28 c52UWfe7QuK00lziLPp-1A 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-log-2020.10.31 Artfha8JQwufr2H0ETbHYQ 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-log-2020.11.01 s3Oe-w0rTJeSo5I8XuFOSg 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-events-2020.11.05 u5EVxtwQTHegj0mM0FrlUw 5 1 2830 0 6mb 3.1mb
green open ks-logstash-log-2020.10.26 v4LR0eIxSSeK6P4rhf8CWQ 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-log-2020.10.29 hQRe-R2gQvuq6t4VLCdsqQ 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-auditing-2020.11.02 qzGPU3mVRhmsgMDht1bDOg 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-log-2020.10.28 FXsns61TRpanQtLgwmxqtw 5 1 0 0 2.5kb 1.2kb
green open ks-logstash-auditing-2020.10.27 6wiVDDYmSiixzPwjTEfRxQ 5 1 0 0 2.5kb 1.2kb
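The shard-level view below for the index that is still moving was pulled with _cat/shards (trimmed to the relevant lines), something like:

sh-4.2$ curl localhost:9200/_cat/shards/ks-logstash-auditing-2020.11.05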
# There are two shards here in RELOCATING state; how should these be handled? (see also the note after this listing)
ks-logstash-auditing-2020.11.05 1 p STARTED 13 52.7kb 10.234.92.59 elasticsearch-logging-data-2
ks-logstash-auditing-2020.11.05 1 r RELOCATING 10.234.90.56 elasticsearch-logging-data-4 -> 10.234.105.98 p4qSYifaQy-bjnxM5zLV7Q elasticsearch-logging-data-1
ks-logstash-auditing-2020.11.05 4 p STARTED 9 131.2kb 10.234.96.48 elasticsearch-logging-data-3
ks-logstash-auditing-2020.11.05 4 r STARTED 9 131.2kb 10.234.90.56 elasticsearch-logging-data-4
ks-logstash-auditing-2020.11.05 0 p STARTED 20 120.9kb 10.234.90.55 elasticsearch-logging-data-0
ks-logstash-auditing-2020.11.05 0 r RELOCATING 10.234.92.59 elasticsearch-logging-data-2 -> 10.234.105.98 p4qSYifaQy-bjnxM5zLV7Q elasticsearch-logging-data-1
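In case it matters: my understanding is that relocations are normally transient, and I have been watching their progress with the recovery API, e.g.:

sh-4.2$ curl 'localhost:9200/_cat/recovery?v&active_only=true'

I also saw that _cluster/reroute has a cancel command that can abort a stuck relocation, but I am not sure whether that is appropriate here. Should the two RELOCATING replicas simply be left alone until they finish?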