I have the following Terraform code that configures a gateway service on AWS ECS Fargate. Services that are not behind a load balancer and run in the private network work as expected; however, the gateway with the attached LB keeps failing its health check, and every 2-3 minutes the task is deprovisioned and a new one is provisioned. The Dockerfile exposes the service on port 3000.
Here is the Terraform configuration that is failing:
locals {
  gateway_version = "1.0.0"
  gateway_port    = 3000
}

## VPC
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "3.11.0"

  name = "${var.env}-vpc"
  cidr = "20.0.0.0/16"

  enable_ipv6 = true

  azs                 = ["eu-central-1a", "eu-central-1b"]
  public_subnets      = ["20.0.1.0/24", "20.0.2.0/24"]
  private_subnets     = ["20.0.86.0/24", "20.0.172.0/24"]
  elasticache_subnets = ["20.0.31.0/24", "20.0.32.0/24"]

  enable_nat_gateway = true
  single_nat_gateway = true

  tags = {
    Terraform = "true"
  }
}
## Security Groups
module "sg" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.env}-sg-default"
  description = "Default service security group"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = ["0.0.0.0/0"]
  ingress_rules = [
    "all-icmp",
    "http-80-tcp",
    "https-443-tcp",
    "mysql-tcp",
    "rabbitmq-4369-tcp",
    "rabbitmq-5671-tcp",
    "rabbitmq-5672-tcp",
    "rabbitmq-15672-tcp",
    "rabbitmq-25672-tcp",
    "redis-tcp"
  ]
  egress_rules = ["all-all"]
}

module "security_group" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.env}-sg-lb"
  description = "Security group for ALB"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = ["0.0.0.0/0"]
  ingress_rules       = ["http-80-tcp", "all-icmp"]
  egress_rules        = ["all-all"]
}

resource "aws_security_group" "service_security_group" {
  name = "${var.env}-lb-connection"

  ingress {
    from_port = 0
    to_port   = 0
    protocol  = "-1"
    # Only allowing traffic in from the load balancer security group
    security_groups = [module.security_group.security_group_id]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  vpc_id = module.vpc.vpc_id
}
## ECS Cluster
resource "aws_ecs_cluster" "default" {
  name = "${var.env}-cluster"
}

## ECR
data "aws_ecr_repository" "gateway_ecr" {
  name = "gateway-${var.env}"
}

## ECS Task Definition
resource "aws_ecs_task_definition" "gateway_task" {
  family                = "${var.env}-gateway-task"
  container_definitions = <<DEFINITION
  [
    {
      "name": "${var.env}-gateway-task",
      "image": "${data.aws_ecr_repository.gateway_ecr.repository_url}:${local.gateway_version}",
      "networkMode": "awsvpc",
      "essential": true,
      "logConfiguration": {
        "logDriver": "awslogs",
        "options": {
          "awslogs-group": "${aws_cloudwatch_log_group.gateway_logs.name}",
          "awslogs-stream-prefix": "ecs",
          "awslogs-region": "${var.aws-region}"
        }
      },
      "portMappings": [
        {
          "containerPort": ${local.gateway_port},
          "hostPort": ${local.gateway_port}
        }
      ],
      "environment": [
        {
          "name": "AWS_REGION",
          "value": "${var.aws-region}"
        },
        {
          "name": "PORT",
          "value": "${local.gateway_port}"
        },
        {
          "name": "STAGE",
          "value": "${var.env}"
        },
        {
          "name": "NODE_ENV",
          "value": "development"
        },
        {
          "name": "VERSION",
          "value": "${local.gateway_version}"
        }
      ],
      "memory": 512,
      "cpu": 256
    }
  ]
DEFINITION
  requires_compatibilities = ["FARGATE"]
  network_mode             = "awsvpc"
  memory                   = 512
  cpu                      = 256
  task_role_arn            = aws_iam_role.gateway_task_definition_role.arn
  execution_role_arn       = aws_iam_role.gateway_task_execution_role.arn
}
## ECS Service
resource "aws_ecs_service" "gateway_service" {
  name                 = "${var.env}-gateway-service"
  cluster              = aws_ecs_cluster.default.id
  task_definition      = aws_ecs_task_definition.gateway_task.arn
  launch_type          = "FARGATE"
  desired_count        = 1
  force_new_deployment = true

  network_configuration {
    subnets = concat(
      module.vpc.public_subnets,
      module.vpc.private_subnets,
    )
    security_groups = [
      module.sg.security_group_id,
      aws_security_group.service_security_group.id
    ]
    assign_public_ip = true
  }

  lifecycle {
    ignore_changes = [desired_count]
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.target_group.arn
    container_name   = aws_ecs_task_definition.gateway_task.family
    container_port   = local.gateway_port
  }
}
## Cloudwatch Log Group
resource "aws_cloudwatch_log_group" "gateway_logs" {
  name = "${var.env}-gateway-log-group"

  tags = {
    Name = "${var.env}-gateway-log-group"
  }
}

## IAM Roles
resource "aws_iam_role" "gateway_task_definition_role" {
  name               = "${var.env}-gateway-task-definition-role"
  assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json

  tags = {
    Name = "${var.env}-gateway-task-definition-role"
  }
}

resource "aws_iam_role" "gateway_task_execution_role" {
  name               = "${var.env}-gateway-task-execution-role"
  assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json

  tags = {
    Name = "${var.env}-gateway-task-execution-role"
  }
}

data "aws_iam_policy_document" "gateway_assume_role_policy" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]

    principals {
      type        = "Service"
      identifiers = ["ecs-tasks.amazonaws.com"]
    }
  }
}

resource "aws_iam_role_policy" "gateway_exec" {
  name   = "${var.env}-gateway-execution-role-policy"
  role   = aws_iam_role.gateway_task_execution_role.id
  policy = data.aws_iam_policy_document.gateway_exec_policy.json
}

data "aws_iam_policy_document" "gateway_exec_policy" {
  statement {
    effect    = "Allow"
    resources = ["*"]
    actions = [
      "ecr:GetAuthorizationToken",
      "ecr:BatchCheckLayerAvailability",
      "ecr:GetDownloadUrlForLayer",
      "ecr:BatchGetImage",
      "logs:CreateLogStream",
      "logs:PutLogEvents",
    ]
  }
}
## ALB
resource "aws_lb" "alb" {
  name               = "${var.env}-lb"
  load_balancer_type = "application"
  subnets            = module.vpc.public_subnets
  security_groups    = [module.security_group.security_group_id]
}

resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = 80
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = module.vpc.vpc_id

  health_check {
    matcher  = "200,301,302"
    path     = "/health"
    interval = 120
    timeout  = 30
  }
}

resource "aws_lb_listener" "listener" {
  load_balancer_arn = aws_lb.alb.arn
  port              = 80
  protocol          = "HTTP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.target_group.arn
  }
}
That’s the error
Task failed ELB health checks in (target-group arn:aws:elasticloadbalancing:eu-central-1:129228585726:targetgroup/target-group/5853904c0d3ad322)
After deployment I can see that the ECS service starts and the application is running, but I don't see any requests reaching it to check its health.
Answers
The target group was not the issue -> the actual problem was a wrong security_group, which didn't allow traffic to hit port 3000.
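With target_type = "ip" and the awsvpc network mode, the ALB health checks hit the task ENI directly, so whichever security group is attached to the tasks has to admit traffic on the container port from the ALB's security group. A minimal sketch of such a rule, reusing the names from the question (the resource label gateway_from_alb is made up for illustration, and a standalone rule like this should not be mixed with inline ingress blocks on the same group):

# Illustration only: allow the ALB security group to reach the gateway port
# on the security group attached to the ECS tasks.
resource "aws_security_group_rule" "gateway_from_alb" {
  type                     = "ingress"
  from_port                = local.gateway_port # 3000
  to_port                  = local.gateway_port
  protocol                 = "tcp"
  security_group_id        = aws_security_group.service_security_group.id
  source_security_group_id = module.security_group.security_group_id
}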
Your target group uses port = 80, but your ECS task definition specifies port 3000. This is likely the reason why your ALB can't connect to your containers. The load balancer tries to check whether it can reach the application on the specified target port; in your case that is 3000.
Update your target group resource to use the application port so that the LB health checks pass.
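A sketch of that change, reusing the resource from the question and pointing the target group at local.gateway_port:

resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = local.gateway_port # 3000 instead of 80
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = module.vpc.vpc_id

  health_check {
    matcher  = "200,301,302"
    path     = "/health"
    interval = 120
    timeout  = 30
  }
}

Note that because the ECS service's load_balancer block already sets container_port, ECS registers each task IP with that port and the target group's port only acts as a default for targets registered without one, which would be consistent with the accepted fix above being about the security group rather than the target group.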