深入理解 Go scheduler 调度器：GPM 源码分析

Linux爱好者 2021-09-08

↓推荐关注↓

【导读】GMP 模型是让 go 语言轻量快速高效的重要调度模型，本文从 GMP 源码出发直观地解析了这一模型。

这篇文章就来看看 golang 的调度模型-GPM 模型的源码结构。

Go 版本：go1.13.9

M 结构体

M 结构体是 OS 线程的一个抽象，主要负责结合 P 运行 G。它里面有很多字段，差不多有 60 个字段，我们看看里面主要的字段意思。/src/runtime/runtime2.go

Copytype m struct {
    // 系统管理的一个 g，执行调度代码时使用的。比如执行用户的 goroutine 时，就需要把把用户
    // 的栈信息换到内核线程的栈，以便能够执行用户 goroutine
 g0      *g     // goroutine with scheduling stack
 morebuf gobuf  // gobuf arg to morestack
 divmod  uint32 // div/mod denominator for arm - known to liblink

 // Fields not known to debuggers.
 procid        uint64       // for debuggers, but offset not hard-coded
    //处理 signal 的 g
 gsignal       *g           // signal-handling g
 goSigStack    gsignalStack // Go-allocated signal handling stack
 sigmask       sigset       // storage for saved signal mask
    //线程的本地存储 TLS，这里就是为什么 OS 线程能运行 M 关键地方
 tls           [6]uintptr   // thread-local storage (for x86 extern register)
 //go 关键字运行的函数
    mstartfn      func()
    //当前运行的用户 goroutine 的 g 结构体对象
 curg          *g       // current running goroutine
 caughtsig     guintptr // goroutine running during fatal signal
    
    //当前工作线程绑定的 P，如果没有就为 nil
 p             puintptr // attached p for executing go code (nil if not executing go code)
 //暂存与当前 M 潜在关联的 P
    nextp         puintptr
    //M 之前调用的 P
 oldp          puintptr // the p that was attached before executing a syscall
 id            int64
 mallocing     int32
 throwing      int32
    //当前 M 是否关闭抢占式调度
 preemptoff    string // if != "", keep curg running on this m
 locks         int32
 dying         int32
 profilehz     int32
    //M 的自旋状态，为 true 时 M 处于自旋状态，正在从其他线程偷 G; 为 false，休眠状态
 spinning      bool // m is out of work and is actively looking for work
 blocked       bool // m is blocked on a note
 newSigstack   bool // minit on C thread called sigaltstack
 printlock     int8
 incgo         bool   // m is executing a cgo call
 freeWait      uint32 // if == 0, safe to free g0 and delete m (atomic)
 fastrand      [2]uint32
 needextram    bool
 traceback     uint8
 ncgocall      uint64      // number of cgo calls in total
 ncgo          int32       // number of cgo calls currently in progress
 cgoCallersUse uint32      // if non-zero, cgoCallers in use temporarily
 cgoCallers    *cgoCallers // cgo traceback if crashing in cgo call
 //没有 goroutine 运行时，工作线程睡眠
    //通过这个来唤醒工作线程
    park          note // 休眠锁
    //记录所有工作线程的链表
 alllink       *m // on allm
 schedlink     muintptr
    //当前线程内存分配的本地缓存
 mcache        *mcache
    //当前 M 锁定的 G，
 lockedg       guintptr
 createstack   [32]uintptr // stack that created this thread.
 lockedExt     uint32      // tracking for external LockOSThread
 lockedInt     uint32      // tracking for internal lockOSThread
 nextwaitm     muintptr    // next m waiting for lock
 waitunlockf   func(*g, unsafe.Pointer) bool
 waitlock      unsafe.Pointer
 waittraceev   byte
 waittraceskip int
 startingtrace bool
 syscalltick   uint32
    //操作系统线程 id
 thread        uintptr // thread handle
 freelink      *m      // on sched.freem

 // these are here because they are too large to be on the stack
 // of low-level NOSPLIT functions.
 libcall   libcall
 libcallpc uintptr // for cpu profiler
 libcallsp uintptr
 libcallg  guintptr
 syscall   libcall // stores syscall parameters on windows

 vdsoSP uintptr // SP for traceback while in VDSO call (0 if not in call)
 vdsoPC uintptr // PC for traceback while in VDSO call

 dlogPerM

 mOS
}

看看几个比较重要的字段：g0：用于执行调度器的 g0gsignal：用于信号处理tls：线程本地存储的 tlsp：goroutine 绑定的本地资源

P 结构体

一个 M 要运行，必须绑定 P 才能运行 goroutine，M 阻塞时，P 会被传给其他 M。

/src/runtime/runtime2.go

Copytype p struct {
    //allp 中的索引
 id          int32
    //p 的状态
 status      uint32 // one of pidle/prunning/...
 link        puintptr
 schedtick   uint32     // incremented on every scheduler call->每次 scheduler 调用+1
 syscalltick uint32     // incremented on every system call->每次系统调用+1
 sysmontick  sysmontick // last tick observed by sysmon
    //指向绑定的 m，如果 p 是 idle 的话，那这个指针是 nil
 m           muintptr   // back-link to associated m (nil if idle)
 mcache      *mcache
 raceprocctx uintptr

    //不同大小可用 defer 结构池
 deferpool    [5][]*_defer // pool of available defer structs of different sizes (see panic.go)
 deferpoolbuf [5][32]*_defer

 // Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
 goidcache    uint64
 goidcacheend uint64

    //本地运行队列，可以无锁访问
 // Queue of runnable goroutines. Accessed without lock.
 runqhead uint32  //队列头
 runqtail uint32   //队列尾
    //数组实现的循环队列
 runq     [256]guintptr
    
 // runnext, if non-nil, is a runnable G that was ready'd by
 // the current G and should be run next instead of what's in
 // runq if there's time remaining in the running G's time
 // slice. It will inherit the time left in the current time
 // slice. If a set of goroutines is locked in a
 // communicate-and-wait pattern, this schedules that set as a
 // unit and eliminates the (potentially large) scheduling
 // latency that otherwise arises from adding the ready'd
 // goroutines to the end of the run queue.
    // runnext 非空时，代表的是一个 runnable 状态的 G，
    //这个 G 被 当前 G 修改为 ready 状态，相比 runq 中的 G 有更高的优先级。
    //如果当前 G 还有剩余的可用时间，那么就应该运行这个 G
    //运行之后，该 G 会继承当前 G 的剩余时间
 runnext guintptr

 // Available G's (status == Gdead)
    //空闲的 g
 gFree struct {
  gList
  n int32
 }

 sudogcache []*sudog
 sudogbuf   [128]*sudog

 tracebuf traceBufPtr

 // traceSweep indicates the sweep events should be traced.
 // This is used to defer the sweep start event until a span
 // has actually been swept.
 traceSweep bool
 // traceSwept and traceReclaimed track the number of bytes
 // swept and reclaimed by sweeping in the current sweep loop.
 traceSwept, traceReclaimed uintptr

 palloc persistentAlloc // per-P to avoid mutex

 _ uint32 // Alignment for atomic fields below

 // Per-P GC state
 gcAssistTime         int64    // Nanoseconds in assistAlloc
 gcFractionalMarkTime int64    // Nanoseconds in fractional mark worker (atomic)
 gcBgMarkWorker       guintptr // (atomic)
 gcMarkWorkerMode     gcMarkWorkerMode

 // gcMarkWorkerStartTime is the nanotime() at which this mark
 // worker started.
 gcMarkWorkerStartTime int64

 // gcw is this P's GC work buffer cache. The work buffer is
 // filled by write barriers, drained by mutator assists, and
 // disposed on certain GC state transitions.
 gcw gcWork

 // wbBuf is this P's GC write barrier buffer.
 //
 // TODO: Consider caching this in the running G.
 wbBuf wbBuf

 runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point

 pad cpu.CacheLinePad
}

其他的一些字段就是 gc，trace，debug 信息

G 结构体

G 就是 goroutine。主要保存 goroutine 的所有信息以及栈信息，gobuf 结构体：cpu 里的寄存器信息，以便在轮到本 goroutine 执行时，知道从哪里开始执行。

/src/runtime/runtime2.go

Copytype stack struct {
 lo uintptr   //栈顶，指向内存低地址
 hi uintptr   //栈底，指向内存搞地址
}

type g struct {
 // Stack parameters.
 // stack describes the actual stack memory: [stack.lo, stack.hi).
 // stackguard0 is the stack pointer compared in the Go stack growth prologue.
 // It is stack.lo+StackGuard normally, but can be StackPreempt to trigger a preemption.
 // stackguard1 is the stack pointer compared in the C stack growth prologue.
 // It is stack.lo+StackGuard on g0 and gsignal stacks.
 // It is ~0 on other goroutine stacks, to trigger a call to morestackc (and crash).
 // 记录该 goroutine 使用的栈
    stack       stack   // offset known to runtime/cgo
    
 //下面两个成员用于栈溢出检查，实现栈的自动伸缩，抢占调度也会用到 stackguard0
    stackguard0 uintptr // offset known to liblink
 stackguard1 uintptr // offset known to liblink

 _panic         *_panic // innermost panic - offset known to liblink
 _defer         *_defer // innermost defer
    
    // 此 goroutine 正在被哪个工作线程执行
 m              *m      // current m; offset known to arm liblink
    //这个字段跟调度切换有关，G 切换时用来保存上下文，保存什么，看下面 gobuf 结构体
 sched          gobuf
 syscallsp      uintptr        // if status==Gsyscall, syscallsp = sched.sp to use during gc
 syscallpc      uintptr        // if status==Gsyscall, syscallpc = sched.pc to use during gc
 stktopsp       uintptr        // expected sp at top of stack, to check in traceback
 param          unsafe.Pointer // passed parameter on wakeup，wakeup 唤醒时传递的参数
 // 状态 Gidle,Grunnable,Grunning,Gsyscall,Gwaiting,Gdead
    atomicstatus   uint32
 stackLock      uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
 goid           int64
    
    //schedlink 字段指向全局运行队列中的下一个 g，
    //所有位于全局运行队列中的 g 形成一个链表
 schedlink      guintptr
 waitsince      int64      // approx time when the g become blocked
 waitreason     waitReason // if status==Gwaiting，g 被阻塞的原因
    //抢占信号，stackguard0 = stackpreempt，如果需要抢占调度，设置 preempt 为 true
 preempt        bool       // preemption signal, duplicates stackguard0 = stackpreempt
 paniconfault   bool       // panic (instead of crash) on unexpected fault address
 preemptscan    bool       // preempted g does scan for gc
 gcscandone     bool       // g has scanned stack; protected by _Gscan bit in status
 gcscanvalid    bool       // false at start of gc cycle, true if G has not run since last scan; TODO: remove?
 throwsplit     bool       // must not split stack
 raceignore     int8       // ignore race detection events
 sysblocktraced bool       // StartTrace has emitted EvGoInSyscall about this goroutine
 sysexitticks   int64      // cputicks when syscall has returned (for tracing)
 traceseq       uint64     // trace event sequencer
 tracelastp     puintptr   // last P emitted an event for this goroutine
 // 如果调用了 LockOsThread，那么这个 g 会绑定到某个 m 上
    lockedm        muintptr
 sig            uint32
 writebuf       []byte
 sigcode0       uintptr
 sigcode1       uintptr
 sigpc          uintptr
    // 创建这个 goroutine 的 go 表达式的 pc
 gopc           uintptr         // pc of go statement that created this goroutine
 ancestors      *[]ancestorInfo // ancestor information goroutine(s) that created this goroutine (only used if debug.tracebackancestors)
 startpc        uintptr         // pc of goroutine function
 racectx        uintptr
 waiting        *sudog         // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
 cgoCtxt        []uintptr      // cgo traceback context
 labels         unsafe.Pointer // profiler labels
 timer          *timer         // cached timer for time.Sleep, 为 time.Sleep 缓存的计时器
 selectDone     uint32         // are we participating in a select and did someone win the race?

 // Per-G GC state

 // gcAssistBytes is this G's GC assist credit in terms of
 // bytes allocated. If this is positive, then the G has credit
 // to allocate gcAssistBytes bytes without assisting. If this
 // is negative, then the G must correct this by performing
 // scan work. We track this in bytes to make it fast to update
 // and check for debt in the malloc hot path. The assist ratio
 // determines how this corresponds to scan work debt.
 gcAssistBytes int64
}

gobuf

gobuf 结构体用于保存 goroutine 的调度信息，主要包括 CPU 的几个寄存器的值。

/src/runtime/runtime2.go

Copytype gobuf struct {
 // The offsets of sp, pc, and g are known to (hard-coded in) libmach.
 //
 // ctxt is unusual with respect to GC: it may be a
 // heap-allocated funcval, so GC needs to track it, but it
 // needs to be set and cleared from assembly, where it's
 // difficult to have write barriers. However, ctxt is really a
 // saved, live register, and we only ever exchange it between
 // the real register and the gobuf. Hence, we treat it as a
 // root during stack scanning, which means assembly that saves
 // and restores it doesn't need write barriers. It's still
 // typed as a pointer so that any other writes from Go get
 // write barriers.
 sp   uintptr      // 保存 CPU 的 rsp 寄存器的值
 pc   uintptr      // 保存 CPU 的 rip 寄存器的值
 g    guintptr     // 记录当前这个 gobuf 对象属于哪个 goroutine
 ctxt unsafe.Pointer
    
    //保存系统调用的返回值，因为从系统调用返回之后如果 p 被其它工作线程抢占，
    //则这个 goroutine 会被放入全局运行队列被其它工作线程调度，其它线程需要知道系统调用的返回值。
 ret  sys.Uintreg  // 保存系统调用的返回值
 lr   uintptr
    
    //保存 CPU 的 rip 寄存器的值
 bp   uintptr // for GOEXPERIMENT=framepointer
}

调度器 sched 结构

所有的 gorouteine 都是被调度器调度运行，调度器持有全局资源

sched

/src/runtime/runtime2.go

Copytype schedt struct {
 // accessed atomically. keep at top to ensure alignment on 32-bit systems.
    // 需以原子访问访问。
    // 保持在 struct 顶部，以使其在 32 位系统上可以对齐
 goidgen  uint64
 lastpoll uint64

 lock mutex

 // When increasing nmidle, nmidlelocked, nmsys, or nmfreed, be
 // sure to call checkdead().
 
    //由空闲的工作线程组成的链表
 midle        muintptr // idle m's waiting for work
    //空闲的工作线程的数量
 nmidle       int32    // number of idle m's waiting for work
    //空闲的且被 lock 的 m 计数
 nmidlelocked int32    // number of locked m's waiting for work
    //已经创建的多个 m，下一个 m id
 mnext        int64    // number of m's that have been created and next M ID
    //被允许创建的最大 m 线程数量
 maxmcount    int32    // maximum number of m's allowed (or die)
 nmsys        int32    // number of system m's not counted for deadlock
    //累积空闲的 m 数量
 nmfreed      int64    // cumulative number of freed m's

    //系统 goroutine 的数量，自动更新
 ngsys uint32 // number of system goroutines; updated atomically
 
    //由空闲的 p 结构体对象组成的链表
 pidle      puintptr // idle p's
    //空闲的 p 结构体对象的数量
 npidle     uint32
 nmspinning uint32 // See "Worker thread parking/unparking" comment in proc.go.

 // Global runnable queue.
    //全局运行队列 G 队列
 runq     gQueue //这个结构体在 proc.go 里
    //元素数量
 runqsize int32

 // disable controls selective disabling of the scheduler.
 //
 // Use schedEnableUser to control this.
 //
 // disable is protected by sched.lock.
 disable struct {
  // user disables scheduling of user goroutines.
  user     bool
  runnable gQueue // pending runnable Gs
  n        int32  // length of runnable
 }

 // Global cache of dead G's. 有效 dead G 全局缓存
 gFree struct {
  lock    mutex
  stack   gList // Gs with stacks
  noStack gList // Gs without stacks
  n       int32
 }

 // Central cache of sudog structs. dusog 结构的集中缓存
 sudoglock  mutex
 sudogcache *sudog

 // Central pool of available defer structs of different sizes. 不同大小有效的 defer 结构的池
 deferlock mutex
 deferpool [5]*_defer

 // freem is the list of m's waiting to be freed when their
 // m.exited is set. Linked through m.freelink.
 freem *m

 gcwaiting  uint32 // gc is waiting to run
 stopwait   int32
 stopnote   note
 sysmonwait uint32
 sysmonnote note

 // safepointFn should be called on each P at the next GC
 // safepoint if p.runSafePointFn is set.
 safePointFn   func(*p)
 safePointWait int32
 safePointNote note

 profilehz int32 // cpu profiling rate

 procresizetime int64 // nanotime() of last change to gomaxprocs
 totaltime      int64 // ∫gomaxprocs dt up to procresizetime
}

gQueue

/src/runtime/proc.go

Copytype gQueue struct {
 head guintptr //队列头
 tail guintptr //队列尾
}

一些重要全局变量

/src/runtime/proc.go

Copym0 m            //代表主线程
g0  g          //m0 绑定的 g0，也就是 M 结构体中 m0.g0=&g0

allgs  []*g  //保存所有的 g

/src/runtime/runtime2.go

Copyallm  *m             //所有的 m 构成的一个链表，包括上面的 m0
allp  []*p            //保存所有的 p， len(allp) == gomaxprocs

sched         schedt //调度器的结构体，保存了调度器的各种信息

ncpu       int32  //系统 cpu 核的数量，程序启动时由 runtime 初始化
gomaxprocs int32 //p 的最大数量，默认等于 ncpu，可以通过 GOMAXPROCS 修改

在程序初始化时，这些变量都会被初始化为 0 值，指针会被初始化为 nil 指针，切片初始化为 nil 切片，int 被初始化为数字 0，结构体的所有成员变量按其本类型初始化为其类型的 0 值。

调度器初始化

调度器初始化有一个主要的函数 schedinit()，这个函数在 /src/runtime/proc.go 文件中。函数开头还把初始化的顺序给列出来了：

// The bootstrap sequence is://// call osinit// call schedinit// make & queue new G// call runtime·mstart//// The new G calls runtime·main.

Copyfunc schedinit() {
 // raceinit must be the first call to race detector.
 // In particular, it must be done before mallocinit below calls racemapshadow.
 _g_ := getg() //getg() 在 src/runtime/stubs.go 中声明，真正的代码由编译器生成
 if raceenabled {
  _g_.racectx, raceprocctx0 = raceinit()
 }
 
    //设置最大 M 的数量
 sched.maxmcount = 10000

 tracebackinit()
 moduledataverify()
    //初始化栈空间常用管理链表
 stackinit()
 mallocinit()
    //初始化当前 m
 mcommoninit(_g_.m)
 cpuinit()       // must run before alginit
 alginit()       // maps must not be used before this call
 modulesinit()   // provides activeModules
 typelinksinit() // uses maps, activeModules
 itabsinit()     // uses activeModules

 msigsave(_g_.m)
 initSigmask = _g_.m.sigmask

 goargs()
 goenvs()
 parsedebugvars()
 gcinit()

 sched.lastpoll = uint64(nanotime())
    // 把 p 数量从 1 调整到默认的 CPU Core 数量
 procs := ncpu
 if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 {
  procs = n
 }
    //调整 P 数量
    //这里的 P 都是新建的，所以不返回有本地任务的 p
 if procresize(procs) != nil {
  throw("unknown runnable goroutine during bootstrap")
 }

 // For cgocheck > 1, we turn on the write barrier at all times
 // and check all pointer writes. We can't do this until after
 // procresize because the write barrier needs a P.
 if debug.cgocheck > 1 {
  writeBarrier.cgo = true
  writeBarrier.enabled = true
  for _, p := range allp {
   p.wbBuf.reset()
  }
 }

 if buildVersion == "" {
  // Condition should never trigger. This code just serves
  // to ensure runtime·buildVersion is kept in the resulting binary.
  buildVersion = "unknown"
 }
 if len(modinfo) == 1 {
  // Condition should never trigger. This code just serves
  // to ensure runtime·modinfo is kept in the resulting binary.
  modinfo = ""
 }
}

开头的这个函数 getg()，跳转到了 func getg() *g ，定义这么一个形式，什么意思？函数首先调用 getg() 函数获取当前正在运行的 g，getg() 在 src/runtime/stubs.go 中声明，真正的代码由编译器生成。

Copy// getg returns the pointer to the current g.
// The compiler rewrites calls to this function into instructions
// that fetch the g directly (from TLS or from the dedicated register).
func getg() *g

注释里也说了，getg 返回当前正在运行的 goroutine 的指针，它会从 tls 里取出 tls[0]，也就是当前运行的 goroutine 的地址。编译器插入类似下面的代码：

Copyget_tls(CX) 
MOVQ g(CX), BX; // BX 存器里面现在放的是当前 g 结构体对象的地址

原来是这么个意思。

调度器初始化大致过程：M 初始化 --> P 初始化 - -> G 初始化mcommoninit Procresize newproc-------------------------------------------------------allm 池 allp 池 g.sched 执行现场p.runq 调度队列

M/P/G 初始化：mcommoninit、procresize、newproc，他们负责 M 资源池（allm）、p 资源池（allp）、G 的运行现场（g.sched）以及调度队列（p.runq）

调度循环

所有的工作初始化完成后，就要启动运行器了。准备工作做好了，就要启动 mstart 了。这个工作在汇编语言中也可以看出来

/src/runtime/asm_amd64.s （在 linux 下）

CopyTEXT runtime·rt0_go(SB),NOSPLIT,$0

  ... ... ...

  MOVL 16(SP), AX  // copy argc
 MOVL AX, 0(SP)
 MOVQ 24(SP), AX  // copy argv
 MOVQ AX, 8(SP)
 CALL runtime·args(SB)  
 CALL runtime·osinit(SB)    //OS 初始化
 CALL runtime·schedinit(SB) //调度器初始化

 // create a new goroutine to start program
 MOVQ $runtime·mainPC(SB), AX  // entry
 PUSHQ AX
 PUSHQ $0   // arg size
 CALL runtime·newproc(SB)       // G 初始化
 POPQ AX

 // start this M , 启动 M
 CALL runtime·mstart(SB)

 CALL runtime·abort(SB) // mstart should never return
 RET

转自：九卷
cnblogs.com/jiujuan/p/12977832.html

- EOF -

推荐阅读点击标题可跳转

1、goroutine 调度器原理

2、Go 微服务全链路跟踪详解

3、Go 存储基础：内存结构体怎么写入文件？

看完本文有收获？请分享给更多人

推荐关注「Linux 爱好者」，提升Linux技能

点赞和在看就是最大的支持❤️

: ， . Video Mini Program Like ，轻点两下取消赞 Wow ，轻点两下取消在看

观察｜官方通报陕西蒲城一职校学生坠亡：事发前与舍友发生口角和肢体冲突认定该生系高空坠落死亡

桐城一派｜倒在“跨年夜”的龚书记，13个字换来免职调查冤不冤？

市管干部“龚书记”免职迷局

讣告！又一知名女星在家中去世，终年54岁，曾是无数人白月光…

近视的孩子有救了！国内最新近视防控矫正技术，不手术，扫码进群即可了解！

深入理解 Go scheduler 调度器：GPM 源码分析

M 结构体

P 结构体

G 结构体

gobuf

调度器 sched 结构

sched

gQueue

一些重要全局变量

调度器初始化

调度循环

您可能也对以下帖子感兴趣

观察｜官方通报陕西蒲城一职校学生坠亡：事发前与舍友发生口角和肢体冲突 认定该生系高空坠落死亡

桐城一派｜倒在“跨年夜”的龚书记，13个字换来免职调查冤不冤？

市管干部“龚书记”免职迷局

讣告！又一知名女星在家中去世，终年54岁，曾是无数人白月光…

近视的孩子有救了！国内最新近视防控矫正技术，不手术，扫码进群即可了解！

生成图片，分享到微信朋友圈

深入理解 Go scheduler 调度器：GPM 源码分析

M 结构体

P 结构体

G 结构体

gobuf

调度器 sched 结构

sched

gQueue

一些重要全局变量

调度器初始化

调度循环

您可能也对以下帖子感兴趣

观察｜官方通报陕西蒲城一职校学生坠亡：事发前与舍友发生口角和肢体冲突认定该生系高空坠落死亡