From 6f097304fc976a350ad6828852c832a529078620 Mon Sep 17 00:00:00 2001 From: Sergiy Kukunin Date: Fri, 5 Jun 2026 21:59:47 -0700 Subject: [PATCH] Release stale per-host reservation when VM starts on a different host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a VM is stopped via the API, postStateTransitionEvent moves its used capacity into reserved_capacity on its last host (so a quick restart can reclaim it cheaply). Until now, this reservation was only drained when the VM started again on the same lastHostId. Starting on any other host left an orphan reservation that lingered for up to one hour (capacity.skipcounting.hours) and was summed into the cluster used+reserved aggregate by FirstFitPlanner.removeClustersCrossingThreshold, spuriously tripping the disable-threshold and blocking later starts. Always drain the VM's reservation on its previous host before allocating on the target host — same host or not. This removes the fromLastHost branch (now dead, the only other caller already passes false), the matching boolean parameter on allocateVmCapacity, and the misspelt moveToReservered parameter everywhere it appears. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../com/cloud/capacity/CapacityManager.java | 4 +- .../cloud/vm/VirtualMachineManagerImpl.java | 2 +- .../cloud/capacity/CapacityManagerImpl.java | 65 +++++++------------ .../capacity/CapacityManagerImplTest.java | 55 ++++++++++++++++ 4 files changed, 81 insertions(+), 45 deletions(-) diff --git a/engine/components-api/src/main/java/com/cloud/capacity/CapacityManager.java b/engine/components-api/src/main/java/com/cloud/capacity/CapacityManager.java index abaf6ea967d0..93ef12330dc5 100644 --- a/engine/components-api/src/main/java/com/cloud/capacity/CapacityManager.java +++ b/engine/components-api/src/main/java/com/cloud/capacity/CapacityManager.java @@ -147,9 +147,9 @@ public interface CapacityManager { "'' element of domain XMLs. If it is set to a value less than or equal to '0', then the host's CPU cores capacity will be considered.", true, ConfigKey.Scope.Cluster); - public boolean releaseVmCapacity(VirtualMachine vm, boolean moveFromReserved, boolean moveToReservered, Long hostId); + public boolean releaseVmCapacity(VirtualMachine vm, boolean moveFromReserved, boolean moveToReserved, Long hostId); - void allocateVmCapacity(VirtualMachine vm, boolean fromLastHost); + void allocateVmCapacity(VirtualMachine vm); /** * @param hostId Id of the host to check capacity diff --git a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java index ead990b42b86..0ce323eeb0ea 100755 --- a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java @@ -5241,7 +5241,7 @@ private VMInstanceVO orchestrateReConfigureVm(String vmUuid, ServiceOffering old } if (reconfiguringOnExistingHost) { - _capacityMgr.allocateVmCapacity(vm, false); + _capacityMgr.allocateVmCapacity(vm); } } catch (final OperationTimedoutException e) { diff --git a/server/src/main/java/com/cloud/capacity/CapacityManagerImpl.java b/server/src/main/java/com/cloud/capacity/CapacityManagerImpl.java index 3be7384ffb71..223b34cee90f 100644 --- a/server/src/main/java/com/cloud/capacity/CapacityManagerImpl.java +++ b/server/src/main/java/com/cloud/capacity/CapacityManagerImpl.java @@ -167,7 +167,7 @@ public boolean start() { @DB @Override - public boolean releaseVmCapacity(VirtualMachine vm, final boolean moveFromReserved, final boolean moveToReservered, final Long hostId) { + public boolean releaseVmCapacity(VirtualMachine vm, final boolean moveFromReserved, final boolean moveToReserved, final Long hostId) { if (hostId == null) { return true; } @@ -175,11 +175,11 @@ public boolean releaseVmCapacity(VirtualMachine vm, final boolean moveFromReserv if (HypervisorType.External.equals(host.getHypervisorType())) { return true; } - return releaseVmCapacity(vm, moveFromReserved, moveToReservered, host); + return releaseVmCapacity(vm, moveFromReserved, moveToReserved, host); } @DB - public boolean releaseVmCapacity(VirtualMachine vm, final boolean moveFromReserved, final boolean moveToReservered, final Host host) { + public boolean releaseVmCapacity(VirtualMachine vm, final boolean moveFromReserved, final boolean moveToReserved, final Host host) { if (host == null) { return true; } @@ -241,7 +241,7 @@ public void doInTransactionWithoutResult(TransactionStatus status) { capacityCpuCore.setUsedCapacity(usedCpuCore - vmCPUCore); } - if (moveToReservered) { + if (moveToReserved) { if (reservedCpu + vmCPU <= totalCpu) { capacityCpu.setReservedCapacity(reservedCpu + vmCPU); } @@ -264,11 +264,11 @@ public void doInTransactionWithoutResult(TransactionStatus status) { logger.debug("release cpu from host: {}, old used: {}, " + "reserved: {}, actual total: {}, total with overprovisioning: {}; " + - "new used: {},reserved:{}; movedfromreserved: {},moveToReservered: {}", host, usedCpu, reservedCpu, actualTotalCpu, totalCpu, capacityCpu.getUsedCapacity(), capacityCpu.getReservedCapacity(), moveFromReserved, moveToReservered); + "new used: {},reserved:{}; movedfromreserved: {},moveToReserved: {}", host, usedCpu, reservedCpu, actualTotalCpu, totalCpu, capacityCpu.getUsedCapacity(), capacityCpu.getReservedCapacity(), moveFromReserved, moveToReserved); logger.debug("release mem from host: {}, old used: {}, " + "reserved: {}, total: {}; new used: {}, reserved: {}; " + - "movedfromreserved: {}, moveToReservered: {}", host, toHumanReadableSize(usedMem), toHumanReadableSize(reservedMem), toHumanReadableSize(totalMem), toHumanReadableSize(capacityMemory.getUsedCapacity()), toHumanReadableSize(capacityMemory.getReservedCapacity()), moveFromReserved, moveToReservered); + "movedfromreserved: {}, moveToReserved: {}", host, toHumanReadableSize(usedMem), toHumanReadableSize(reservedMem), toHumanReadableSize(totalMem), toHumanReadableSize(capacityMemory.getUsedCapacity()), toHumanReadableSize(capacityMemory.getReservedCapacity()), moveFromReserved, moveToReserved); _capacityDao.update(capacityCpu.getId(), capacityCpu); _capacityDao.update(capacityMemory.getId(), capacityMemory); @@ -285,7 +285,7 @@ public void doInTransactionWithoutResult(TransactionStatus status) { @DB @Override - public void allocateVmCapacity(VirtualMachine vm, final boolean fromLastHost) { + public void allocateVmCapacity(VirtualMachine vm) { final long hostId = vm.getHostId(); final HostVO host = _hostDao.findById(hostId); @@ -328,7 +328,6 @@ public void doInTransactionWithoutResult(TransactionStatus status) { long usedCpuCore = capacityCpuCore.getUsedCapacity(); long reservedCpu = capacityCpu.getReservedCapacity(); long reservedMem = capacityMem.getReservedCapacity(); - long reservedCpuCore = capacityCpuCore.getReservedCapacity(); long actualTotalCpu = capacityCpu.getTotalCapacity(); long actualTotalMem = capacityMem.getTotalCapacity(); long totalCpu = (long)(actualTotalCpu * cpuOvercommitRatio); @@ -349,41 +348,25 @@ public void doInTransactionWithoutResult(TransactionStatus status) { capacityMem.setUsedCapacity(usedMem + ram); capacityCpuCore.setUsedCapacity(usedCpuCore + cpucore); - if (fromLastHost) { - /* alloc from reserved */ - if (logger.isDebugEnabled()) { - logger.debug("We are allocating VM to the last host again, so adjusting the reserved capacity if it is not less than required"); - logger.debug("Reserved CPU: " + reservedCpu + " , Requested CPU: " + cpu); - logger.debug("Reserved RAM: " + toHumanReadableSize(reservedMem) + " , Requested RAM: " + toHumanReadableSize(ram)); - } - if (reservedCpu >= cpu && reservedMem >= ram) { - capacityCpu.setReservedCapacity(reservedCpu - cpu); - capacityMem.setReservedCapacity(reservedMem - ram); - capacityCpuCore.setReservedCapacity(reservedCpuCore - cpucore); - } - } else { - /* alloc from free resource */ - if (!((reservedCpu + usedCpu + cpu <= totalCpu) && (reservedMem + usedMem + ram <= totalMem))) { - if (logger.isDebugEnabled()) { - logger.debug("Host doesn't seem to have enough free capacity, but increasing the used capacity anyways, " + - "since the VM is already starting on this host "); - } - } + /* alloc from free resource */ + if (!((reservedCpu + usedCpu + cpu <= totalCpu) && (reservedMem + usedMem + ram <= totalMem))) { + logger.debug("Host doesn't seem to have enough free capacity, but increasing the used capacity anyways, " + + "since the VM is already starting on this host"); } - logger.debug(String.format("CPU STATS after allocation: for host: %s, " + - "old used: %d, old reserved: %d, actual total: %d, " + - "total with overprovisioning: %d; new used: %d, reserved: %d; " + - "requested cpu: %d, alloc_from_last: %s", + logger.debug("CPU STATS after allocation: for host: {}, " + + "old used: {}, old reserved: {}, actual total: {}, " + + "total with overprovisioning: {}; new used: {}, reserved: {}; " + + "requested cpu: {}", host, usedCpu, reservedCpu, actualTotalCpu, totalCpu, - capacityCpu.getUsedCapacity(), capacityCpu.getReservedCapacity(), cpu, fromLastHost)); + capacityCpu.getUsedCapacity(), capacityCpu.getReservedCapacity(), cpu); logger.debug("RAM STATS after allocation: for host: {}, " + "old used: {}, old reserved: {}, total: {}; new used: {}, reserved: {}; " + - "requested mem: {}, alloc_from_last: {}", + "requested mem: {}", host, toHumanReadableSize(usedMem), toHumanReadableSize(reservedMem), toHumanReadableSize(totalMem), toHumanReadableSize(capacityMem.getUsedCapacity()), - toHumanReadableSize(capacityMem.getReservedCapacity()), toHumanReadableSize(ram), fromLastHost); + toHumanReadableSize(capacityMem.getReservedCapacity()), toHumanReadableSize(ram)); long cluster_id = host.getClusterId(); ClusterDetailsVO cluster_detail_cpu = _clusterDetailsDao.findDetail(cluster_id, VmDetailConstants.CPU_OVER_COMMIT_RATIO); @@ -959,8 +942,8 @@ public boolean postStateTransitionEvent(StateMachine2.Transition t Host lastHost = _hostDao.findById(vm.getLastHostId()); Host oldHost = _hostDao.findById(oldHostId); Host newHost = _hostDao.findById(vm.getHostId()); - logger.debug(String.format("%s state transited from [%s] to [%s] with event [%s]. VM's original host: %s, new host: %s, host before state transition: %s", vm, oldState, - newState, event, lastHost, newHost, oldHost)); + logger.debug("{} state transited from [{}] to [{}] with event [{}]. VM's original host: {}, new host: {}, host before state transition: {}", + vm, oldState, newState, event, lastHost, newHost, oldHost); if (oldState == State.Starting) { if (newState != State.Running) { @@ -1000,12 +983,10 @@ public boolean postStateTransitionEvent(StateMachine2.Transition t } if ((newState == State.Starting || newState == State.Migrating || event == Event.AgentReportMigrated) && vm.getHostId() != null) { - boolean fromLastHost = false; - if (vm.getHostId().equals(vm.getLastHostId())) { - logger.debug("VM starting again on the last host it was stopped on"); - fromLastHost = true; + if (vm.getLastHostId() != null) { + releaseVmCapacity(vm, true, false, vm.getLastHostId()); } - allocateVmCapacity(vm, fromLastHost); + allocateVmCapacity(vm); } if (newState == State.Stopped && event != Event.RestoringFailed && event != Event.RestoringSuccess && vm.getType() == VirtualMachine.Type.User) { diff --git a/server/src/test/java/com/cloud/capacity/CapacityManagerImplTest.java b/server/src/test/java/com/cloud/capacity/CapacityManagerImplTest.java index a69830e2e0bb..ecb976468613 100644 --- a/server/src/test/java/com/cloud/capacity/CapacityManagerImplTest.java +++ b/server/src/test/java/com/cloud/capacity/CapacityManagerImplTest.java @@ -19,9 +19,12 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.anyBoolean; import static org.mockito.ArgumentMatchers.anyList; +import static org.mockito.ArgumentMatchers.anyLong; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.any; +import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; @@ -43,10 +46,15 @@ import com.cloud.dc.ClusterDetailsDao; import com.cloud.host.Host; +import com.cloud.host.dao.HostDao; import com.cloud.offering.ServiceOffering; import com.cloud.service.ServiceOfferingVO; import com.cloud.service.dao.ServiceOfferingDao; import com.cloud.utils.Pair; +import com.cloud.utils.fsm.StateMachine2; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.VirtualMachine.Event; +import com.cloud.vm.VirtualMachine.State; import com.cloud.vm.VmDetailConstants; @RunWith(MockitoJUnitRunner.class) @@ -55,6 +63,8 @@ public class CapacityManagerImplTest { ClusterDetailsDao clusterDetailsDao; @Mock ServiceOfferingDao serviceOfferingDao; + @Mock + HostDao hostDao; @Spy @InjectMocks @@ -160,6 +170,51 @@ public void testCheckIfHostHasNoCpuCapabilityButHasCapacity() { eq(false), eq(cpuOvercommit), eq(memoryOvercommit), eq(false)); } + @Test + public void testPostStateTransitionReleasesStaleReservationWhenStartingOnDifferentHost() { + final Long oldHostId = 1L; + final Long newHostId = 2L; + + VirtualMachine vm = mock(VirtualMachine.class); + when(vm.getHostId()).thenReturn(newHostId); + when(vm.getLastHostId()).thenReturn(oldHostId); + + doNothing().when(capacityManager).allocateVmCapacity(any(VirtualMachine.class)); + doReturn(true).when(capacityManager).releaseVmCapacity( + any(VirtualMachine.class), anyBoolean(), anyBoolean(), anyLong()); + + StateMachine2.Transition transition = new StateMachine2.Transition<>( + State.Stopped, Event.StartRequested, State.Starting, Collections.emptyList()); + Pair opaque = new Pair<>(null, 0L); + + capacityManager.postStateTransitionEvent(transition, vm, true, opaque); + + verify(capacityManager).releaseVmCapacity(vm, true, false, oldHostId); + verify(capacityManager).allocateVmCapacity(vm); + } + + @Test + public void testPostStateTransitionReleasesReservationWhenStartingOnSameHost() { + final Long hostId = 1L; + + VirtualMachine vm = mock(VirtualMachine.class); + when(vm.getHostId()).thenReturn(hostId); + when(vm.getLastHostId()).thenReturn(hostId); + + doNothing().when(capacityManager).allocateVmCapacity(any(VirtualMachine.class)); + doReturn(true).when(capacityManager).releaseVmCapacity( + any(VirtualMachine.class), anyBoolean(), anyBoolean(), anyLong()); + + StateMachine2.Transition transition = new StateMachine2.Transition<>( + State.Stopped, Event.StartRequested, State.Starting, Collections.emptyList()); + Pair opaque = new Pair<>(null, 0L); + + capacityManager.postStateTransitionEvent(transition, vm, true, opaque); + + verify(capacityManager).releaseVmCapacity(vm, true, false, hostId); + verify(capacityManager).allocateVmCapacity(vm); + } + @Test public void testCheckIfHostHasCpuCapabilityButNoCapacity() { Float cpuOvercommit = 2.0f;