go-zero/core/load/adaptiveshedder.go

package load

import (
	"errors"
	"fmt"
	"math"
	"sync/atomic"
	"time"

	"github.com/zeromicro/go-zero/core/collection"
	"github.com/zeromicro/go-zero/core/logx"
	"github.com/zeromicro/go-zero/core/mathx"
	"github.com/zeromicro/go-zero/core/stat"
	"github.com/zeromicro/go-zero/core/syncx"
	"github.com/zeromicro/go-zero/core/timex"
)

const (
	defaultBuckets = 50
	defaultWindow  = time.Second * 5
	// using 1000m notation, 900m is like 90%, keep it as var for unit test
	defaultCpuThreshold = 900
	defaultMinRt        = float64(time.Second / time.Millisecond)
	// moving average hyperparameter beta for calculating requests on the fly
	flyingBeta               = 0.9
	coolOffDuration          = time.Second
	cpuMax                   = 1000 // millicpu
	millisecondsPerSecond    = 1000
	overloadFactorLowerBound = 0.1
)

var (
	// ErrServiceOverloaded is returned by Shedder.Allow when the service is overloaded.
	ErrServiceOverloaded = errors.New("service overloaded")

	// default to be enabled
	enabled = syncx.ForAtomicBool(true)
	// default to be enabled
	logEnabled = syncx.ForAtomicBool(true)
	// make it a variable for unit test
	systemOverloadChecker = func(cpuThreshold int64) bool {
		return stat.CpuUsage() >= cpuThreshold
	}
)

type (
	// A Promise interface is returned by Shedder.Allow to let callers tell
	// whether the processing request is successful or not.
	Promise interface {
		// Pass lets the caller tell that the call is successful.
		Pass()
		// Fail lets the caller tell that the call is failed.
		Fail()
	}

	// Shedder is the interface that wraps the Allow method.
	Shedder interface {
		// Allow returns the Promise if allowed, otherwise ErrServiceOverloaded.
		Allow() (Promise, error)
	}

	// ShedderOption lets caller customize the Shedder.
	ShedderOption func(opts *shedderOptions)

	shedderOptions struct {
		window       time.Duration
		buckets      int
		cpuThreshold int64
	}

	adaptiveShedder struct {
		cpuThreshold    int64
		windowScale     float64
		flying          int64
		avgFlying       float64
		avgFlyingLock   syncx.SpinLock
		overloadTime    *syncx.AtomicDuration
		droppedRecently *syncx.AtomicBool
		passCounter     *collection.RollingWindow
		rtCounter       *collection.RollingWindow
	}
)

// Disable lets callers disable load shedding.
func Disable() {
	enabled.Set(false)
}

// DisableLog disables the stat logs for load shedding.
func DisableLog() {
	logEnabled.Set(false)
}

// NewAdaptiveShedder returns an adaptive shedder.
// opts can be used to customize the Shedder.
func NewAdaptiveShedder(opts ...ShedderOption) Shedder {
	if !enabled.True() {
		return newNopShedder()
	}

	options := shedderOptions{
		window:       defaultWindow,
		buckets:      defaultBuckets,
		cpuThreshold: defaultCpuThreshold,
	}
	for _, opt := range opts {
		opt(&options)
	}
	bucketDuration := options.window / time.Duration(options.buckets)
	return &adaptiveShedder{
		cpuThreshold:    options.cpuThreshold,
		windowScale:     float64(time.Second) / float64(bucketDuration) / millisecondsPerSecond,
		overloadTime:    syncx.NewAtomicDuration(),
		droppedRecently: syncx.NewAtomicBool(),
		passCounter: collection.NewRollingWindow(options.buckets, bucketDuration,
			collection.IgnoreCurrentBucket()),
		rtCounter: collection.NewRollingWindow(options.buckets, bucketDuration,
			collection.IgnoreCurrentBucket()),
	}
}

// Allow implements Shedder.Allow.
func (as *adaptiveShedder) Allow() (Promise, error) {
	if as.shouldDrop() {
		as.droppedRecently.Set(true)

		return nil, ErrServiceOverloaded
	}

	as.addFlying(1)

	return &promise{
		start:   timex.Now(),
		shedder: as,
	}, nil
}

func (as *adaptiveShedder) addFlying(delta int64) {
	flying := atomic.AddInt64(&as.flying, delta)
	// update avgFlying when the request is finished.
	// this strategy makes avgFlying have a little bit of lag against flying, and smoother.
	// when the flying requests increase rapidly, avgFlying increase slower, accept more requests.
	// when the flying requests drop rapidly, avgFlying drop slower, accept fewer requests.
	// it makes the service to serve as many requests as possible.
	if delta < 0 {
		as.avgFlyingLock.Lock()
		as.avgFlying = as.avgFlying*flyingBeta + float64(flying)*(1-flyingBeta)
		as.avgFlyingLock.Unlock()
	}
}

func (as *adaptiveShedder) highThru() bool {
	as.avgFlyingLock.Lock()
	avgFlying := as.avgFlying
	as.avgFlyingLock.Unlock()
	maxFlight := as.maxFlight() * as.overloadFactor()
	return avgFlying > maxFlight && float64(atomic.LoadInt64(&as.flying)) > maxFlight
}

func (as *adaptiveShedder) maxFlight() float64 {
	// windows = buckets per second
	// maxQPS = maxPASS * windows
	// minRT = min average response time in milliseconds
	// allowedFlying = maxQPS * minRT / milliseconds_per_second
	maxFlight := float64(as.maxPass()) * as.minRt() * as.windowScale
	return mathx.AtLeast(maxFlight, 1)
}

func (as *adaptiveShedder) maxPass() int64 {
	var result float64 = 1

	as.passCounter.Reduce(func(b *collection.Bucket) {
		if b.Sum > result {
			result = b.Sum
		}
	})

	return int64(result)
}

func (as *adaptiveShedder) minRt() float64 {
	// if no requests in previous windows, return defaultMinRt,
	// its a reasonable large value to avoid dropping requests.
	result := defaultMinRt

	as.rtCounter.Reduce(func(b *collection.Bucket) {
		if b.Count <= 0 {
			return
		}

		avg := math.Round(b.Sum / float64(b.Count))
		if avg < result {
			result = avg
		}
	})

	return result
}

func (as *adaptiveShedder) overloadFactor() float64 {
	// as.cpuThreshold must be less than cpuMax
	factor := (cpuMax - float64(stat.CpuUsage())) / (cpuMax - float64(as.cpuThreshold))
	// at least accept 10% of acceptable requests, even cpu is highly overloaded.
	return mathx.Between(factor, overloadFactorLowerBound, 1)
}

func (as *adaptiveShedder) shouldDrop() bool {
	if as.systemOverloaded() || as.stillHot() {
		if as.highThru() {
			flying := atomic.LoadInt64(&as.flying)
			as.avgFlyingLock.Lock()
			avgFlying := as.avgFlying
			as.avgFlyingLock.Unlock()
			msg := fmt.Sprintf(
				"dropreq, cpu: %d, maxPass: %d, minRt: %.2f, hot: %t, flying: %d, avgFlying: %.2f",
				stat.CpuUsage(), as.maxPass(), as.minRt(), as.stillHot(), flying, avgFlying)
			logx.Error(msg)
			stat.Report(msg)
			return true
		}
	}

	return false
}

func (as *adaptiveShedder) stillHot() bool {
	if !as.droppedRecently.True() {
		return false
	}

	overloadTime := as.overloadTime.Load()
	if overloadTime == 0 {
		return false
	}

	if timex.Since(overloadTime) < coolOffDuration {
		return true
	}

	as.droppedRecently.Set(false)
	return false
}

func (as *adaptiveShedder) systemOverloaded() bool {
	if !systemOverloadChecker(as.cpuThreshold) {
		return false
	}

	as.overloadTime.Set(timex.Now())
	return true
}

// WithBuckets customizes the Shedder with the given number of buckets.
func WithBuckets(buckets int) ShedderOption {
	return func(opts *shedderOptions) {
		opts.buckets = buckets
	}
}

// WithCpuThreshold customizes the Shedder with the given cpu threshold.
func WithCpuThreshold(threshold int64) ShedderOption {
	return func(opts *shedderOptions) {
		opts.cpuThreshold = threshold
	}
}

// WithWindow customizes the Shedder with given
func WithWindow(window time.Duration) ShedderOption {
	return func(opts *shedderOptions) {
		opts.window = window
	}
}

type promise struct {
	start   time.Duration
	shedder *adaptiveShedder
}

func (p *promise) Fail() {
	p.shedder.addFlying(-1)
}

func (p *promise) Pass() {
	rt := float64(timex.Since(p.start)) / float64(time.Millisecond)
	p.shedder.addFlying(-1)
	p.shedder.rtCounter.Add(math.Ceil(rt))
	p.shedder.passCounter.Add(1)
}
initial import 4 years ago			`package load`

			`import (`
			`"errors"`
			`"fmt"`
			`"math"`
			`"sync/atomic"`
			`"time"`

feat: rename module from tal-tech to zeromicro (#1413) 3 years ago			`"github.com/zeromicro/go-zero/core/collection"`
			`"github.com/zeromicro/go-zero/core/logx"`
optimize: shedding algorithm performance (#3908) 9 months ago			`"github.com/zeromicro/go-zero/core/mathx"`
feat: rename module from tal-tech to zeromicro (#1413) 3 years ago			`"github.com/zeromicro/go-zero/core/stat"`
			`"github.com/zeromicro/go-zero/core/syncx"`
			`"github.com/zeromicro/go-zero/core/timex"`
initial import 4 years ago			`)`

			`const (`
			`defaultBuckets = 50`
			`defaultWindow = time.Second * 5`
chore: better shedding algorithm, make sure recover from shedding (#2476) * backup * chore: better shedding algorithm, make sure recover from shedding 2 years ago			`// using 1000m notation, 900m is like 90%, keep it as var for unit test`
initial import 4 years ago			`defaultCpuThreshold = 900`
			`defaultMinRt = float64(time.Second / time.Millisecond)`
			`// moving average hyperparameter beta for calculating requests on the fly`
optimize: shedding algorithm performance (#3908) 9 months ago			`flyingBeta = 0.9`
			`coolOffDuration = time.Second`
			`cpuMax = 1000 // millicpu`
			`millisecondsPerSecond = 1000`
			`overloadFactorLowerBound = 0.1`
initial import 4 years ago			`)`

			`var (`
add api doc (#449) 4 years ago			`// ErrServiceOverloaded is returned by Shedder.Allow when the service is overloaded.`
initial import 4 years ago			`ErrServiceOverloaded = errors.New("service overloaded")`

			`// default to be enabled`
			`enabled = syncx.ForAtomicBool(true)`
configurable for load and stat statistics logs (#980) 3 years ago			`// default to be enabled`
			`logEnabled = syncx.ForAtomicBool(true)`
initial import 4 years ago			`// make it a variable for unit test`
			`systemOverloadChecker = func(cpuThreshold int64) bool {`
			`return stat.CpuUsage() >= cpuThreshold`
			`}`
			`)`

			`type (`
add api doc (#449) 4 years ago			`// A Promise interface is returned by Shedder.Allow to let callers tell`
			`// whether the processing request is successful or not.`
initial import 4 years ago			`Promise interface {`
add api doc (#449) 4 years ago			`// Pass lets the caller tell that the call is successful.`
initial import 4 years ago			`Pass()`
add api doc (#449) 4 years ago			`// Fail lets the caller tell that the call is failed.`
initial import 4 years ago			`Fail()`
			`}`

add api doc (#449) 4 years ago			`// Shedder is the interface that wraps the Allow method.`
initial import 4 years ago			`Shedder interface {`
add api doc (#449) 4 years ago			`// Allow returns the Promise if allowed, otherwise ErrServiceOverloaded.`
initial import 4 years ago			`Allow() (Promise, error)`
			`}`

add api doc (#449) 4 years ago			`// ShedderOption lets caller customize the Shedder.`
initial import 4 years ago			`ShedderOption func(opts *shedderOptions)`

			`shedderOptions struct {`
			`window time.Duration`
			`buckets int`
			`cpuThreshold int64`
			`}`

			`adaptiveShedder struct {`
			`cpuThreshold int64`
optimize: shedding algorithm performance (#3908) 9 months ago			`windowScale float64`
initial import 4 years ago			`flying int64`
			`avgFlying float64`
			`avgFlyingLock syncx.SpinLock`
chore: better shedding algorithm, make sure recover from shedding (#2476) * backup * chore: better shedding algorithm, make sure recover from shedding 2 years ago			`overloadTime *syncx.AtomicDuration`
initial import 4 years ago			`droppedRecently *syncx.AtomicBool`
			`passCounter *collection.RollingWindow`
			`rtCounter *collection.RollingWindow`
			`}`
			`)`

add api doc (#449) 4 years ago			`// Disable lets callers disable load shedding.`
initial import 4 years ago			`func Disable() {`
			`enabled.Set(false)`
			`}`

configurable for load and stat statistics logs (#980) 3 years ago			`// DisableLog disables the stat logs for load shedding.`
			`func DisableLog() {`
			`logEnabled.Set(false)`
			`}`

add api doc (#449) 4 years ago			`// NewAdaptiveShedder returns an adaptive shedder.`
			`// opts can be used to customize the Shedder.`
initial import 4 years ago			`func NewAdaptiveShedder(opts ...ShedderOption) Shedder {`
			`if !enabled.True() {`
			`return newNopShedder()`
			`}`

			`options := shedderOptions{`
			`window: defaultWindow,`
			`buckets: defaultBuckets,`
			`cpuThreshold: defaultCpuThreshold,`
			`}`
			`for _, opt := range opts {`
			`opt(&options)`
			`}`
			`bucketDuration := options.window / time.Duration(options.buckets)`
			`return &adaptiveShedder{`
			`cpuThreshold: options.cpuThreshold,`
optimize: shedding algorithm performance (#3908) 9 months ago			`windowScale: float64(time.Second) / float64(bucketDuration) / millisecondsPerSecond,`
chore: better shedding algorithm, make sure recover from shedding (#2476) * backup * chore: better shedding algorithm, make sure recover from shedding 2 years ago			`overloadTime: syncx.NewAtomicDuration(),`
initial import 4 years ago			`droppedRecently: syncx.NewAtomicBool(),`
			`passCounter: collection.NewRollingWindow(options.buckets, bucketDuration,`
			`collection.IgnoreCurrentBucket()),`
			`rtCounter: collection.NewRollingWindow(options.buckets, bucketDuration,`
			`collection.IgnoreCurrentBucket()),`
			`}`
			`}`

add api doc (#449) 4 years ago			`// Allow implements Shedder.Allow.`
initial import 4 years ago			`func (as *adaptiveShedder) Allow() (Promise, error) {`
			`if as.shouldDrop() {`
			`as.droppedRecently.Set(true)`

			`return nil, ErrServiceOverloaded`
			`}`

			`as.addFlying(1)`

			`return &promise{`
			`start: timex.Now(),`
			`shedder: as,`
			`}, nil`
			`}`

			`func (as *adaptiveShedder) addFlying(delta int64) {`
			`flying := atomic.AddInt64(&as.flying, delta)`
			`// update avgFlying when the request is finished.`
chore: fix warnings (#3989) 9 months ago			`// this strategy makes avgFlying have a little bit of lag against flying, and smoother.`
initial import 4 years ago			`// when the flying requests increase rapidly, avgFlying increase slower, accept more requests.`
chore: make cpu usage more smooth (#3842) 10 months ago			`// when the flying requests drop rapidly, avgFlying drop slower, accept fewer requests.`
chore: fix warnings (#3989) 9 months ago			`// it makes the service to serve as many requests as possible.`
initial import 4 years ago			`if delta < 0 {`
			`as.avgFlyingLock.Lock()`
			`as.avgFlying = as.avgFlyingflyingBeta + float64(flying)(1-flyingBeta)`
			`as.avgFlyingLock.Unlock()`
			`}`
			`}`

			`func (as *adaptiveShedder) highThru() bool {`
			`as.avgFlyingLock.Lock()`
			`avgFlying := as.avgFlying`
			`as.avgFlyingLock.Unlock()`
optimize: shedding algorithm performance (#3908) 9 months ago			`maxFlight := as.maxFlight() * as.overloadFactor()`
			`return avgFlying > maxFlight && float64(atomic.LoadInt64(&as.flying)) > maxFlight`
initial import 4 years ago			`}`

optimize: shedding algorithm performance (#3908) 9 months ago			`func (as *adaptiveShedder) maxFlight() float64 {`
initial import 4 years ago			`// windows = buckets per second`
			`// maxQPS = maxPASS * windows`
			`// minRT = min average response time in milliseconds`
optimize: shedding algorithm performance (#3908) 9 months ago			`// allowedFlying = maxQPS * minRT / milliseconds_per_second`
			`maxFlight := float64(as.maxPass()) * as.minRt() * as.windowScale`
			`return mathx.AtLeast(maxFlight, 1)`
initial import 4 years ago			`}`

			`func (as *adaptiveShedder) maxPass() int64 {`
			`var result float64 = 1`

			`as.passCounter.Reduce(func(b *collection.Bucket) {`
			`if b.Sum > result {`
			`result = b.Sum`
			`}`
			`})`

			`return int64(result)`
			`}`

			`func (as *adaptiveShedder) minRt() float64 {`
optimize: shedding algorithm performance (#3908) 9 months ago			`// if no requests in previous windows, return defaultMinRt,`
			`// its a reasonable large value to avoid dropping requests.`
chore: update code format. (#628) 4 years ago			`result := defaultMinRt`
initial import 4 years ago
			`as.rtCounter.Reduce(func(b *collection.Bucket) {`
			`if b.Count <= 0 {`
			`return`
			`}`

			`avg := math.Round(b.Sum / float64(b.Count))`
			`if avg < result {`
			`result = avg`
			`}`
			`})`

			`return result`
			`}`

optimize: shedding algorithm performance (#3908) 9 months ago			`func (as *adaptiveShedder) overloadFactor() float64 {`
			`// as.cpuThreshold must be less than cpuMax`
			`factor := (cpuMax - float64(stat.CpuUsage())) / (cpuMax - float64(as.cpuThreshold))`
chore: fix warnings (#3989) 9 months ago			`// at least accept 10% of acceptable requests, even cpu is highly overloaded.`
optimize: shedding algorithm performance (#3908) 9 months ago			`return mathx.Between(factor, overloadFactorLowerBound, 1)`
			`}`

initial import 4 years ago			`func (as *adaptiveShedder) shouldDrop() bool {`
			`if as.systemOverloaded() \|\| as.stillHot() {`
			`if as.highThru() {`
			`flying := atomic.LoadInt64(&as.flying)`
			`as.avgFlyingLock.Lock()`
			`avgFlying := as.avgFlying`
			`as.avgFlyingLock.Unlock()`
			`msg := fmt.Sprintf(`
			`"dropreq, cpu: %d, maxPass: %d, minRt: %.2f, hot: %t, flying: %d, avgFlying: %.2f",`
			`stat.CpuUsage(), as.maxPass(), as.minRt(), as.stillHot(), flying, avgFlying)`
			`logx.Error(msg)`
			`stat.Report(msg)`
			`return true`
			`}`
			`}`

			`return false`
			`}`

			`func (as *adaptiveShedder) stillHot() bool {`
			`if !as.droppedRecently.True() {`
			`return false`
			`}`

chore: better shedding algorithm, make sure recover from shedding (#2476) * backup * chore: better shedding algorithm, make sure recover from shedding 2 years ago			`overloadTime := as.overloadTime.Load()`
			`if overloadTime == 0 {`
initial import 4 years ago			`return false`
			`}`

chore: better shedding algorithm, make sure recover from shedding (#2476) * backup * chore: better shedding algorithm, make sure recover from shedding 2 years ago			`if timex.Since(overloadTime) < coolOffDuration {`
			`return true`
initial import 4 years ago			`}`

chore: better shedding algorithm, make sure recover from shedding (#2476) * backup * chore: better shedding algorithm, make sure recover from shedding 2 years ago			`as.droppedRecently.Set(false)`
			`return false`
initial import 4 years ago			`}`

			`func (as *adaptiveShedder) systemOverloaded() bool {`
chore: better shedding algorithm, make sure recover from shedding (#2476) * backup * chore: better shedding algorithm, make sure recover from shedding 2 years ago			`if !systemOverloadChecker(as.cpuThreshold) {`
			`return false`
			`}`

			`as.overloadTime.Set(timex.Now())`
			`return true`
initial import 4 years ago			`}`

chore: fix warnings (#3989) 9 months ago			`// WithBuckets customizes the Shedder with the given number of buckets.`
initial import 4 years ago			`func WithBuckets(buckets int) ShedderOption {`
			`return func(opts *shedderOptions) {`
			`opts.buckets = buckets`
			`}`
			`}`

chore: fix warnings (#3989) 9 months ago			`// WithCpuThreshold customizes the Shedder with the given cpu threshold.`
initial import 4 years ago			`func WithCpuThreshold(threshold int64) ShedderOption {`
			`return func(opts *shedderOptions) {`
			`opts.cpuThreshold = threshold`
			`}`
			`}`

add api doc (#449) 4 years ago			`// WithWindow customizes the Shedder with given`
initial import 4 years ago			`func WithWindow(window time.Duration) ShedderOption {`
			`return func(opts *shedderOptions) {`
			`opts.window = window`
			`}`
			`}`

			`type promise struct {`
			`start time.Duration`
			`shedder *adaptiveShedder`
			`}`

			`func (p *promise) Fail() {`
			`p.shedder.addFlying(-1)`
			`}`

			`func (p *promise) Pass() {`
			`rt := float64(timex.Since(p.start)) / float64(time.Millisecond)`
			`p.shedder.addFlying(-1)`
			`p.shedder.rtCounter.Add(math.Ceil(rt))`
			`p.shedder.passCounter.Add(1)`
			`}`