
I am trying to deploy a 4-node Elasticsearch cluster using the following Docker Compose file in Portainer 2.9.0 and Docker 20.10.11:

version: "3.8"


x-master-opts: &master
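  # shared environment for the master nodes, merged into each service below via <<: *master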
  ES_JAVA_OPTS: "-Xmx2g -Xms2g"     
  discovery.seed_resolver.timeout: 60s
  discovery.seed_hosts: master1,master2
  cluster.initial_master_nodes: master1,master2
  network.host: 0.0.0.0             
  xpack.security.enabled: "false"


x-data-opts: &data
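  # shared environment for the data nodes, merged into each service below via <<: *data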
  ES_JAVA_OPTS: "-Xmx2g -Xms2g"
  discovery.seed_resolver.timeout: 60s
  discovery.seed_hosts: master1,master2
  cluster.initial_master_nodes: master1,master2
  network.host: 0.0.0.0
  xpack.security.enabled: "false"

networks:
  net:
    external: true

services:
  master1:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *master                   
      node.name: "master1"
      node.roles: "master"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - "9200:9200"
      - "9300:9300"
    networks:
      net:
        aliases:
          - master1
    volumes:
      - /var/volumes/esmaster1:/usr/share/elasticsearch/data
    deploy:
      placement:
        constraints:      
           - node.hostname == <node_dir>       
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G

  master2:
    depends_on:
      - master1
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *master
      node.name: "master2"
      node.roles: "master,data"
    ulimits:
      memlock:
        soft: -1
        hard: -1       
    networks:
      net:
        aliases:
          - master2
    volumes:
      - /var/volumes/esmaster2:/usr/share/elasticsearch/data
    deploy:
      placement:
        constraints: 
           - node.hostname == <node_dir>
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G

  data1:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *data
      node.name: "data1"
      node.roles: "data"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test: curl -fs http://localhost:9200/_cat/health || exit 1
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 45s
    networks:
      net:
        aliases:
          - data1
    volumes:
      - /var/volumes/esdata1:/usr/share/elasticsearch/data
    deploy:
      placement:
        constraints:
           - node.hostname == <node_dir>        
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G

  data2:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *data
      node.name: "data2"
      node.roles: "data"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:                    
      test: curl -fs http://localhost:9200/_cat/health || exit 1
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 45s
    networks:
      net:
        aliases:
          - data2
    volumes:
      - /var/volumes/esdata2:/usr/share/elasticsearch/data
    deploy:
      placement:
        constraints:
           - node.hostname == <node_dir>        
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G

  data3:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.0.0
    environment:
      <<: *data
      node.name: "data3"
      node.roles: "data"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:                   
      test: curl -fs http://localhost:9200/_cat/health || exit 1
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 45s
    networks:
      net:
        aliases:
          - data3
    volumes:
      - /var/volumes/esdata3:/usr/share/elasticsearch/data
    deploy:
      placement:
        constraints:
           - node.hostname == <node_dir>        
      mode: "replicated"
      replicas: 1
      resources:
        limits:
          memory: 2G

At first I was using the setting

endpoint_mode: dnsrr
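
For reference, endpoint_mode is a deploy-level option in Compose v3, so it sat under each service's deploy block, roughly like this (service name shown for illustration):

services:
  master1:
    deploy:
      endpoint_mode: dnsrr   # DNS round-robin instead of the routing mesh's virtual IP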

While it was in place I had no problems with host discovery. But when I removed it (because of problems with the network's internal load balancer), the data nodes kept printing this error in their logs:

{"@timestamp":"2022-03-21T08:35:08.716Z", "log.level": "WARN", "message":"[connectToRemoteMasterNode[10.0.211.117:9300]] completed handshake with [{master1}{VD6TmxjXQjaMj-4TqKpGZw}{93C9Bw6YT6K5vlUvHSk24Q}{10.0.0.53}{10.0.0.53:9300}{m}{xpack.installed=true}] but followup connection failed", "ecs.version": "1.2.0","service.name":"ES_ECS","event.dataset":"elasticsearch.server","process.thread.name":"elasticsearch[data3][generic][T#2]","log.logger":"org.elasticsearch.discovery.HandshakingTransportAddressConnector","elasticsearch.node.name":"data3","elasticsearch.cluster.name":"docker-cluster","error.type":"org.elasticsearch.transport.ConnectTransportException","error.message":"[master1][10.0.0.53:9300] connect_exception","error.stack_trace":"org.elasticsearch.transport.ConnectTransportException: [master1][10.0.0.53:9300] connect_exceptionntat org.elasticsearch.transport.TcpTransport$ChannelsConnectedListener.onFailure(TcpTransport.java:1032)ntat org.elasticsearch.action.ActionListener.lambda$toBiConsumer$0(ActionListener.java:279)ntat org.elasticsearch.core.CompletableContext.lambda$addListener$0(CompletableContext.java:31)ntat java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:863)ntat java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:841)ntat java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510)ntat java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2162)ntat org.elasticsearch.core.CompletableContext.completeExceptionally(CompletableContext.java:46)ntat org.elasticsearch.transport.netty4.Netty4TcpChannel.lambda$addListener$0(Netty4TcpChannel.java:58)ntat io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:578)ntat io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:571)ntat io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:550)ntat io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:491)ntat io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:616)ntat io.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:609)ntat io.netty.util.concurrent.DefaultPromise.tryFailure(DefaultPromise.java:117)ntat io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe$1.run(AbstractNioChannel.java:262)ntat io.netty.util.concurrent.PromiseTask.runTask(PromiseTask.java:98)ntat io.netty.util.concurrent.ScheduledFutureTask.run(ScheduledFutureTask.java:170)ntat io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164)ntat io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:469)ntat io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:500)ntat io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)ntat io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)ntat java.base/java.lang.Thread.run(Thread.java:833)nCaused by: io.netty.channel.ConnectTimeoutException: connection timed out: 10.0.0.53/10.0.0.53:9300ntat io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe$1.run(AbstractNioChannel.java:261)nt... 8 moren"}

2 Answers


  1. Chosen as BEST ANSWER

    I think the problem was a conflict between the IPs of the service and the container, since both had the same name, master1. I don't truly know why this happens, because the official guide for installing Elasticsearch in Docker also uses the same name for the node and the service. I solved it by renaming the service to "master1-elastic" and adding the setting

    hostname: master1
    

    I did the same with the service previously named master2.
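
    A minimal sketch of the relevant part of the stack after the change (the rest of the service definition stays as in the question):

    master1-elastic:                # service name no longer identical to the node name
      hostname: master1             # keep master1 as the container hostname
      environment:
        <<: *master
        node.name: "master1"
        node.roles: "master"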


  2. This problem is caused by an inconsistency between the Elasticsearch service IP bound to the Docker container and the IP of the Elasticsearch node.

    Just add the xxx.publish_host settings to the Elasticsearch configuration on each node.

    For example:

    network.publish_host: 10.0.211.117
    http.publish_host: 10.0.211.117
    transport.publish_host: 10.0.211.117
    

    For more details, see the Networking page of the Elasticsearch Guide.
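
    In a Compose stack, one way to pass these is through each service's environment block; a minimal sketch for a single node, reusing the example address above:

    environment:
      network.publish_host: 10.0.211.117   # the address other nodes should use to reach this node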
