After hosting an application on Fargate, we’re trying to switch to EC2 instances with a custom capacity provider. For some reason, the capacity provider we created won’t run more than one task per instance.
The configuration below works well if we only want 1 task, but as soon as we raise the desired task count to 10, 5 instances are created (the Auto Scaling group's maximum size) and each has only 1 task running.
{
"CapacityProvider": {
"Type": "AWS::ECS::CapacityProvider",
"DeletionPolicy": "Retain",
"Properties": {
"AutoScalingGroupProvider": {
"AutoScalingGroupArn": {
"Ref": "AutoScaleGroup"
},
"ManagedDraining": "ENABLED",
"ManagedScaling": {
"InstanceWarmupPeriod": 0,
"MaximumScalingStepSize": 1,
"MinimumScalingStepSize": 1,
"Status": "ENABLED",
"TargetCapacity": 100
}
},
"Name": {
"Fn::Sub": "${ServiceName}-${Env}-CapacityProvider"
}
}
},
"AutoScaleGroup": {
"Type": "AWS::AutoScaling::AutoScalingGroup",
"Properties": {
"ServiceLinkedRoleARN": {
"Fn::Sub": "arn:aws:iam::${AWS::AccountId}:role/aws-service-role/autoscaling.amazonaws.com/AWSServiceRoleForAutoScaling"
},
"CapacityRebalance": true,
"Cooldown": "300",
"AvailabilityZones": [
"eu-central-1a",
"eu-central-1b",
"eu-central-1c"
],
"DesiredCapacity": "2",
"HealthCheckGracePeriod": 300,
"MetricsCollection": [],
"InstanceMaintenancePolicy": {
"MinHealthyPercentage": 90,
"MaxHealthyPercentage": 100
},
"MaxSize": "5",
"NewInstancesProtectedFromScaleIn": false,
"MinSize": "2",
"TerminationPolicies": [
"Default"
],
"AutoScalingGroupName": {
"Fn::Sub": "${ServiceName}-auto-scaling-group"
},
"MixedInstancesPolicy": {
"LaunchTemplate": {
"LaunchTemplateSpecification": {
"Version": {
"Fn::GetAtt": [
"LaunchTemplate",
"LatestVersionNumber"
]
},
"LaunchTemplateId": {
"Ref": "LaunchTemplate"
}
},
"Overrides": [
{
"InstanceType": "m5.large",
"WeightedCapacity": "1"
},
{
"InstanceType": "m4.large",
"WeightedCapacity": "1"
},
{
"InstanceType": "m5.xlarge",
"WeightedCapacity": "1"
},
{
"InstanceType": "m4.xlarge",
"WeightedCapacity": "1"
},
{
"InstanceType": "m5.2xlarge",
"WeightedCapacity": "1"
}
]
},
"InstancesDistribution": {
"OnDemandAllocationStrategy": "prioritized",
"OnDemandBaseCapacity": 0,
"SpotAllocationStrategy": "price-capacity-optimized",
"OnDemandPercentageAboveBaseCapacity": 0
}
},
"VPCZoneIdentifier": [
{
"Ref": "Subnet1"
},
{
"Ref": "Subnet2"
},
{
"Ref": "Subnet3"
}
],
"DesiredCapacityType": "units",
"Tags": [],
"HealthCheckType": "ELB"
}
}
}
And the application configuration:
{
"AWSTemplateFormatVersion": "2010-09-09",
"Parameters": {
"Env": {
"Description": "An environment name",
"Type": "String",
"AllowedValues": [
"dev",
"test",
"prod"
],
"ConstraintDescription": "Allowed values is dev/test/prod."
},
"ServiceName": {
"Type": "String",
"Default": ""
},
"VPCID": {
"Type": "String",
"Default": ""
},
"Subnet1": {
"Type": "String",
"Default": ""
},
"Subnet2": {
"Type": "String",
"Default": ""
},
"Subnet3": {
"Type": "String",
"Default": ""
},
"Cluster": {
"Type": "String",
"Default": ""
},
"LoadBalancer": {
"Type": "String",
"Default": ""
},
"Listener": {
"Type": "String",
"Default": ""
},
"Version": {
"Type": "String",
"Default": "1.0"
},
"Registry": {
"Type": "String",
"Default": ""
},
"Domain": {
"Type": "String",
"Default": ""
}
},
"Resources": {
"Certificate": {
"Type": "AWS::CertificateManager::Certificate",
"Properties": {
"DomainName": {
"Ref": "Domain"
},
"ValidationMethod": "DNS"
}
},
"CertificateListener": {
"Type": "AWS::ElasticLoadBalancingV2::ListenerCertificate",
"Properties": {
"Certificates": [
{
"CertificateArn": {
"Ref": "Certificate"
}
}
],
"ListenerArn": {
"Ref": "Listener"
}
}
},
"LogGroup": {
"Type": "AWS::Logs::LogGroup",
"Properties": {
"RetentionInDays": 14
}
},
"Filesystem": {
"Type": "AWS::EFS::FileSystem",
"Properties": {
"Encrypted": true,
"KmsKeyId": {
"Ref": "KMS"
},
"FileSystemTags": [
{
"Key": "Name",
"Value": {
"Fn::Sub": "${ServiceName}-${Env}"
}
}
]
}
},
"MountTarget": {
"Type": "AWS::EFS::MountTarget",
"Properties": {
"FileSystemId": {
"Ref": "Filesystem"
},
"SubnetId": {
"Ref": "Subnet1"
},
"SecurityGroups": [
{
"Fn::GetAtt": [
"SecurityGroup",
"GroupId"
]
}
]
}
},
"MountTarget2": {
"Type": "AWS::EFS::MountTarget",
"Properties": {
"FileSystemId": {
"Ref": "Filesystem"
},
"SubnetId": {
"Ref": "Subnet2"
},
"SecurityGroups": [
{
"Fn::GetAtt": [
"SecurityGroup",
"GroupId"
]
}
]
}
},
"MountTarget3": {
"Type": "AWS::EFS::MountTarget",
"Properties": {
"FileSystemId": {
"Ref": "Filesystem"
},
"SubnetId": {
"Ref": "Subnet3"
},
"SecurityGroups": [
{
"Fn::GetAtt": [
"SecurityGroup",
"GroupId"
]
}
]
}
},
"KMS": {
"Type": "AWS::KMS::Key",
"Properties": {
"Enabled": true,
"EnableKeyRotation": false,
"KeyPolicy": {
"Version": "2012-10-17",
"Statement": [
{
"Sid": "Enable IAM User Permissions",
"Effect": "Allow",
"Principal": {
"AWS": {
"Fn::Join": [
"",
[
"arn:aws:iam::",
{
"Ref": "AWS::AccountId"
},
":root"
]
]
}
},
"Action": "kms:*",
"Resource": "*"
},
{
"Sid": "Allow use of the key",
"Effect": "Allow",
"Principal": {
"AWS": "*"
},
"Action": [
"kms:Sign",
"kms:Verify",
"kms:DescribeKey",
"kms:List*"
],
"Resource": "*"
}
]
}
}
},
"ShopECSService": {
"Type": "AWS::ECS::Service",
"Properties": {
"Cluster": {
"Ref": "Cluster"
},
"ServiceName": {
"Fn::Sub": "${ServiceName}-${Env}"
},
"DesiredCount": 1,
"CapacityProviderStrategy": [
{
"CapacityProvider": "shop-prod-CapacityProvider",
"Weight": 1
}
],
"DeploymentConfiguration": {
"MaximumPercent": 200,
"MinimumHealthyPercent": 100,
"DeploymentCircuitBreaker": {
"Enable": true,
"Rollback": true
}
},
"LoadBalancers": [
{
"ContainerName": "nginx",
"ContainerPort": 80,
"TargetGroupArn": {
"Ref": "TargetGroup"
}
}
],
"NetworkConfiguration": {
"AwsvpcConfiguration": {
"AssignPublicIp": "DISABLED",
"SecurityGroups": [
{
"Fn::GetAtt": [
"SecurityGroup",
"GroupId"
]
}
],
"Subnets": [
{
"Ref": "Subnet1"
},
{
"Ref": "Subnet2"
},
{
"Ref": "Subnet3"
}
]
}
},
"TaskDefinition": {
"Ref": "TaskDefinition"
}
}
},
"Role": {
"Type": "AWS::IAM::Role",
"Properties": {
"AssumeRolePolicyDocument": {
"Version": "2008-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
},
"ManagedPolicyArns": [
"arn:aws:iam::aws:policy/AmazonEC2FullAccess",
"arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy",
"arn:aws:iam::aws:policy/service-role/AWSLambdaSQSQueueExecutionRole",
"arn:aws:iam::aws:policy/SecretsManagerReadWrite"
]
}
},
"TaskDefinition": {
"Type": "AWS::ECS::TaskDefinition",
"Properties": {
"Family": {
"Fn::Sub": "task-${ServiceName}-${Env}"
},
"ExecutionRoleArn": {
"Ref": "Role"
},
"Volumes": [
{
"Name": "efs",
"EFSVolumeConfiguration": {
"FilesystemId": {
"Ref": "Filesystem"
}
}
}
],
"NetworkMode": "awsvpc",
"RequiresCompatibilities": [
"EC2"
],
"ContainerDefinitions": [
{
"Name": "app",
"Image": {
"Fn::Sub": "${Registry}:${Version}-app"
},
"MemoryReservation": "512",
"MountPoints": [
{
"ContainerPath": "/var/www/html/user",
"SourceVolume": "efs"
}
],
"Secrets": [],
"LogConfiguration": {
"LogDriver": "awslogs",
"Options": {
"awslogs-group": {
"Ref": "LogGroup"
},
"awslogs-region": {
"Ref": "AWS::Region"
},
"awslogs-stream-prefix": "ecs"
}
},
"Environment": [],
"Essential": true,
"PortMappings": [
{
"ContainerPort": 9000,
"Protocol": "tcp",
"AppProtocol": "http"
}
]
},
{
"Name": "nginx",
"Image": {
"Fn::Sub": "${Registry}:${Version}-nginx"
},
"MemoryReservation": "512",
"LogConfiguration": {
"LogDriver": "awslogs",
"Options": {
"awslogs-group": {
"Ref": "LogGroup"
},
"awslogs-region": {
"Ref": "AWS::Region"
},
"awslogs-stream-prefix": "ecs"
}
},
"Essential": true,
"PortMappings": [
{
"ContainerPort": 80,
"Protocol": "tcp",
"AppProtocol": "http"
}
]
}
]
}
},
"SecurityGroup": {
"Type": "AWS::EC2::SecurityGroup",
"Properties": {
"GroupDescription": "ECS security group",
"VpcId": {
"Ref": "VPCID"
},
"SecurityGroupIngress": [
{
"IpProtocol": "tcp",
"FromPort": 80,
"ToPort": 9000,
"CidrIp": "0.0.0.0/0"
},
{
"IpProtocol": "tcp",
"FromPort": 9000,
"ToPort": 9000,
"CidrIp": "0.0.0.0/0"
}
]
}
},
"TargetGroup": {
"Type": "AWS::ElasticLoadBalancingV2::TargetGroup",
"Properties": {
"Port": 80,
"Protocol": "HTTP",
"VpcId": {
"Ref": "VPCID"
},
"TargetType": "ip"
}
},
"ListenerRule": {
"Type": "AWS::ElasticLoadBalancingV2::ListenerRule",
"Properties": {
"Priority": 1,
"ListenerArn": {
"Ref": "Listener"
},
"Actions": [
{
"Type": "forward",
"TargetGroupArn": {
"Ref": "TargetGroup"
}
}
],
"Conditions": [
{
"Field": "host-header",
"HostHeaderConfig": {
"Values": [
{
"Ref": "Domain"
}
]
}
}
]
}
}
}
}
If more info is needed I’d be happy to supply it.
2 Answers
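The problem became apparent as soon as we had more than just m4 instances running: an m4 instance cannot run multiple tasks on the same port. (The tasks use the `awsvpc` network mode, so each task needs its own elastic network interface, and the number of network interfaces an instance can attach depends on its instance type.)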
At some point we had an m5 instance with multiple tasks running and an m4 instance with just a single task.
Somewhere deep in the documentation for the m4 and m5 instance types we found the difference, and removing the m4 types from the instance type overrides of the Auto Scaling group's launch template did the trick.
Are you starting these services manually in the console? If so, there is a section called "Task Placement" in the "Create" dialog under the Services tab of the cluster UI.
By default, I see that the "AZ balanced spread" template is selected. This template uses the "spread" placement strategy, which tries to spread the tasks evenly across Availability Zones.
You can try the "binpack"* strategy instead; this strategy tries to maximize the use of EC2 resources.
*Tasks are placed on container instances so as to leave the least amount of unused CPU or memory. This strategy minimizes the number of container instances in use.
More about placement strategy
AWS Placement Strategy docs
CloudFormation placement strategy doc