Skip to content

Commit cc831d8

Browse files
lxd: update the instancesShutdown function to work with a usedResourcesMap
Now the `instancesShutdown` function is able to query the state of instances (busy or not). If the instance to be shut down in a worker is detected as busy, we send it back to the shutdown channel with a bounded exponential backoff. Backoff strategy: it starts with an initial retry delay of 5 seconds. Subsequent retries increase exponentially, multiplying the previous delay by 1.5, with a jitter factor of 0.2 to avoid synchronized retries. The delay is capped at a maximum of 60 seconds to prevent excessively long waits. The entire retry process continues for a maximum of 15 minutes (MaxElapsedTime), after which it terminates regardless of success, ensuring the task doesn't run indefinitely. Example: Retry 1: 5 seconds Retry 2: 7.5 seconds (5 * 1.5) Retry 3: 11.25 seconds (7.5 * 1.5) Retry 4: 16.875 seconds (11.25 * 1.5) Retry 5: 25.3125 seconds (16.875 * 1.5) Retry 6: 37.96875 seconds (25.3125 * 1.5) Retry 7: 56.953 seconds Retry 8: 60 seconds, because MaxInterval prevents any further increase, so every retry from here on waits 60s. Retry 9: 60s Retry 10: 60s Retry 11: 60s <Instance operation finishes before the 15min max elapsed time!> Signed-off-by: Gabriel Mougard <gabriel.mougard@canonical.com>
1 parent a458c0b commit cc831d8

File tree

2 files changed

+68
-4
lines changed

2 files changed

+68
-4
lines changed

lxd/daemon.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1527,7 +1527,7 @@ func (d *Daemon) init() error {
15271527
return fmt.Errorf("Failed loading containers to restart: %w", err)
15281528
}
15291529

1530-
instancesShutdown(instances)
1530+
instancesShutdown(instances, nil, nil)
15311531
instancesStart(s, instances)
15321532
}
15331533

@@ -2102,7 +2102,7 @@ func (d *Daemon) Stop(ctx context.Context, sig os.Signal) error {
21022102

21032103
// Full shutdown requested.
21042104
if sig == unix.SIGPWR {
2105-
instancesShutdown(instances)
2105+
instancesShutdown(instances, usedResourcesMap, &resourceMapMu)
21062106

21072107
logger.Info("Stopping networks")
21082108
networkShutdown(s)

lxd/instances.go

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/canonical/lxd/shared/api"
2525
"github.com/canonical/lxd/shared/entity"
2626
"github.com/canonical/lxd/shared/logger"
27+
"github.com/cenkalti/backoff/v4"
2728
)
2829

2930
var instancesCmd = APIEndpoint{
@@ -455,20 +456,31 @@ func instancesOnDisk(s *state.State) ([]instance.Instance, error) {
455456
return instances, nil
456457
}
457458

458-
func instancesShutdown(instances []instance.Instance) {
459+
func instancesShutdown(instances []instance.Instance, usedResourceMap map[*api.URL]bool, usedResourceMapMu *sync.Mutex) {
459460
sort.Sort(instanceStopList(instances))
460461

461462
// Limit shutdown concurrency to number of instances or number of CPU cores (which ever is less).
462463
var wg sync.WaitGroup
463464
instShutdownCh := make(chan instance.Instance)
465+
resourceConstrainedMode := usedResourceMap != nil && usedResourceMapMu != nil
466+
var instToBackoff map[*api.URL]*backoff.ExponentialBackOff
467+
var instToBackoffMu sync.Mutex
464468
maxConcurrent := runtime.NumCPU()
465469
instCount := len(instances)
466470
if instCount < maxConcurrent {
467471
maxConcurrent = instCount
468472
}
469473

474+
// If we are in resource constrained mode, we need to track the instance URL (the URLs here are for instances that are still busy, i.e have associated pending operations) to their backoff
475+
// so that we can re-send the instance back to the instShutdownCh channel to be shutdown after the backoff period has elapsed.
476+
// Each instance URL will have a backoff associated with it so that an instance that is newly detected as busy will have an shorter backoff period than an instance that has been busy for a while.
477+
if resourceConstrainedMode {
478+
instToBackoff = make(map[*api.URL]*backoff.ExponentialBackOff, len(instances))
479+
instToBackoffMu = sync.Mutex{}
480+
}
481+
470482
for i := 0; i < maxConcurrent; i++ {
471-
go func(instShutdownCh <-chan instance.Instance) {
483+
go func(instShutdownCh chan instance.Instance) {
472484
for inst := range instShutdownCh {
473485
// Determine how long to wait for the instance to shutdown cleanly.
474486
timeoutSeconds := 30
@@ -477,6 +489,51 @@ func instancesShutdown(instances []instance.Instance) {
477489
timeoutSeconds, _ = strconv.Atoi(value)
478490
}
479491

492+
instanceURL := entity.InstanceURL(inst.Project().Name, inst.Name())
493+
if resourceConstrainedMode {
494+
usedResourceMapMu.Lock()
495+
exponentialBackOff, exists := instToBackoff[instanceURL]
496+
if !exists {
497+
exponentialBackOff := backoff.NewExponentialBackOff()
498+
exponentialBackOff.InitialInterval = 5 * time.Second
499+
exponentialBackOff.Multiplier = 1.5
500+
exponentialBackOff.RandomizationFactor = 0.2
501+
exponentialBackOff.MaxInterval = 60 * time.Second
502+
exponentialBackOff.MaxElapsedTime = 15 * time.Minute
503+
instToBackoff[instanceURL] = exponentialBackOff
504+
}
505+
506+
code, err := backoff.RetryWithData(func() (int, error) {
507+
usedResourceMapMu.Lock()
508+
blocked, exists := usedResourceMap[instanceURL]
509+
usedResourceMapMu.Unlock()
510+
511+
if blocked && exists {
512+
instShutdownCh <- inst
513+
return 1, nil
514+
}
515+
516+
return 0, nil
517+
}, exponentialBackOff)
518+
if err != nil {
519+
if err != backoff.Permanent(err) {
520+
logger.Warn("Failed to retry backoff operation during busy instance shutdown attempt", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
521+
} else {
522+
logger.Warn("Unknown error returned from backoff operation during instance shutdown retry", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
523+
}
524+
525+
instToBackoffMu.Unlock()
526+
continue
527+
}
528+
529+
instToBackoffMu.Unlock()
530+
// This means the instance has been sent back to the channel to be retried.
531+
// We should not attempt to shutdown the instance for now.
532+
if code == 1 {
533+
continue
534+
}
535+
}
536+
480537
err := inst.Shutdown(time.Second * time.Duration(timeoutSeconds))
481538
if err != nil {
482539
logger.Warn("Failed shutting down instance, forcefully stopping", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
@@ -486,6 +543,13 @@ func instancesShutdown(instances []instance.Instance) {
486543
}
487544
}
488545

546+
if resourceConstrainedMode {
547+
instanceURL := entity.InstanceURL(inst.Project().Name, inst.Name())
548+
usedResourceMapMu.Lock()
549+
usedResourceMap[instanceURL] = false
550+
usedResourceMapMu.Unlock()
551+
}
552+
489553
if inst.ID() > 0 {
490554
// If DB was available then the instance shutdown process will have set
491555
// the last power state to STOPPED, so set that back to RUNNING so that

0 commit comments

Comments
 (0)