I am trying to deploy a 4-node Elasticsearch cluster using the following Docker Compose file in Portainer 2.9.0 and Docker 20.10.11:
version: "3.8"

x-master-opts: &master
  ES_JAVA_OPTS: "-Xmx2g -Xms2g"
  discovery.seed_resolver.timeout: 60s
  discovery.seed_hosts: master1,master2
  cluster.initial_master_nodes: master1,master2
  network.host: 0.0.0.0
  xpack.security.enabled: "false"

x-data-opts: &data
  ES_JAVA_OPTS: "-Xmx2g -Xms2g"
  discovery.seed_resolver.timeout: 60s
  discovery.seed_hosts: master1,master2
  cluster.initial_master_nodes: master1,master2
  network.host: 0.0.0.0
  xpack.security.enabled: "false"

networks:
  net:
    external: true

services:
  master1:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *master
      node.name: "master1"
      node.roles: "master"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - "9200:9200"
      - "9300:9300"
    networks:
      net:
        aliases:
          - master1
    volumes:
      - /var/volumes/esmaster1:/usr/share/elasticsearch/data_m1
    deploy:
      placement:
        constraints:
          - node.hostname == <node_dir>
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G
  master2:
    depends_on:
      - master1
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *master
      node.name: "master2"
      node.roles: "master, data"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    networks:
      net:
        aliases:
          - master2
    volumes:
      - /var/volumes/esmaster2:/usr/share/elasticsearch/data_m2
    deploy:
      placement:
        constraints:
          - node.hostname == <node_dir>
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G
  data1:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *data
      node.name: "data1"
      node.roles: "data"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test: curl -fs http://localhost:9200/_cat/health || exit 1
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 45s
    networks:
      net:
        aliases:
          - data1
    volumes:
      - /var/volumes/esdata1:/usr/share/elasticsearch/data1
    deploy:
      placement:
        constraints:
          - node.hostname == <node_dir>
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G
  data2:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *data
      node.name: "data2"
      node.roles: "data"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test: curl -fs http://localhost:9200/_cat/health || exit 1
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 45s
    networks:
      net:
        aliases:
          - data2
    volumes:
      - /var/volumes/esdata2:/usr/share/elasticsearch/data2
    deploy:
      placement:
        constraints:
          - node.hostname == <node_dir>
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G
  data3:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *data
      node.name: "data3"
      node.roles: "data"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test: curl -fs http://localhost:9200/_cat/health || exit 1
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 45s
    networks:
      net:
        aliases:
          - data3
    volumes:
      - /var/volumes/esdata3:/usr/share/elasticsearch/data3
    deploy:
      placement:
        constraints:
          - node.hostname == <node_dir>
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G
At first I was using the setting
endpoint_mode: dnsrr
and while using it I had no problems with host discovery. But when I removed it (due to problems with the network's internal load balancer), the data nodes keep printing this error in the logs:
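For reference, endpoint_mode is a per-service option under the deploy key; a minimal sketch of how it sat in the file before removal (the rest of the service body is elided):

```yaml
services:
  master1:
    # ... image, environment, networks, volumes as in the full file ...
    deploy:
      endpoint_mode: dnsrr   # DNS round-robin: the service name resolves directly to
                             # container IPs, bypassing the swarm's virtual-IP load balancer
      mode: "replicated"
      replicas: 1
```

With the default (vip) endpoint mode, the service name instead resolves to a virtual IP in front of the load balancer, which is not an address any Elasticsearch node actually binds to.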
{"@timestamp":"2022-03-21T08:35:08.716Z", "log.level": "WARN", "message":"[connectToRemoteMasterNode[10.0.211.117:9300]] completed handshake with [{master1}{VD6TmxjXQjaMj-4TqKpGZw}{93C9Bw6YT6K5vlUvHSk24Q}{10.0.0.53}{10.0.0.53:9300}{m}{xpack.installed=true}] but followup connection failed", "ecs.version": "1.2.0","service.name":"ES_ECS","event.dataset":"elasticsearch.server","process.thread.name":"elasticsearch[data3][generic][T#2]","log.logger":"org.elasticsearch.discovery.HandshakingTransportAddressConnector","elasticsearch.node.name":"data3","elasticsearch.cluster.name":"docker-cluster","error.type":"org.elasticsearch.transport.ConnectTransportException","error.message":"[master1][10.0.0.53:9300] connect_exception","error.stack_trace":"org.elasticsearch.transport.ConnectTransportException: [master1][10.0.0.53:9300] connect_exception\n\tat org.elasticsearch.transport.TcpTransport$ChannelsConnectedListener.onFailure(TcpTransport.java:1032)\n\tat org.elasticsearch.action.ActionListener.lambda$toBiConsumer$0(ActionListener.java:279)\n\tat org.elasticsearch.core.CompletableContext.lambda$addListener$0(CompletableContext.java:31)\n\tat java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:863)\n\tat java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:841)\n\tat java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510)\n\tat java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2162)\n\tat org.elasticsearch.core.CompletableContext.completeExceptionally(CompletableContext.java:46)\n\tat org.elasticsearch.transport.netty4.Netty4TcpChannel.lambda$addListener$0(Netty4TcpChannel.java:58)\n\tat io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:578)\n\tat io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:571)\n\tat io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:550)\n\tat io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:491)\n\tat io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:616)\n\tat io.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:609)\n\tat io.netty.util.concurrent.DefaultPromise.tryFailure(DefaultPromise.java:117)\n\tat io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe$1.run(AbstractNioChannel.java:262)\n\tat io.netty.util.concurrent.PromiseTask.runTask(PromiseTask.java:98)\n\tat io.netty.util.concurrent.ScheduledFutureTask.run(ScheduledFutureTask.java:170)\n\tat io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164)\n\tat io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:469)\n\tat io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:500)\n\tat io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)\n\tat io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)\n\tat java.base/java.lang.Thread.run(Thread.java:833)\nCaused by: io.netty.channel.ConnectTimeoutException: connection timed out: 10.0.0.53/10.0.0.53:9300\n\tat io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe$1.run(AbstractNioChannel.java:261)\n\t... 8 more\n"}
2 Answers
I think the problem was a conflict between the IPs of the service and the container, since both had the same name, master1. I don't truly know why this happens, because the official guide for installing ES in Docker also uses the same name for the node and the service. I solved it by changing the service name to "master1-elastic" and adding the tag
I did the same with the service previously named master2.
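A sketch of what the renamed service might look like under this fix (my reading of the answer, not the exact file; the network alias keeps the old name resolvable for seed-host discovery):

```yaml
services:
  master1-elastic:            # service name no longer collides with the node name
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *master
      node.name: "master1"
      node.roles: "master"
    networks:
      net:
        aliases:
          - master1           # discovery.seed_hosts can still resolve "master1"
```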
This problem is caused by an inconsistency between the IP of the ES service bound to the Docker container and the IP of the ES node.
Just add the
xxx.publish_host
setting in the Elasticsearch configuration on each node. For example:
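A sketch of such a setting in the compose environment (assumptions: the concrete setting names are network.publish_host / transport.publish_host, and "master1" stands in for whatever name or IP the other nodes can actually reach):

```yaml
services:
  master1:
    environment:
      <<: *master
      node.name: "master1"
      network.publish_host: "master1"   # advertise a reachable address to peers,
                                        # instead of the container-local bound IP
```

The node still binds to 0.0.0.0 via network.host, but advertises the publish address to the rest of the cluster, so followup connections target an address that is actually routable.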
For more details, see Networking | Elasticsearch Guide.