代码炼金工坊

go内存泄露
November 3, 2020

今天同事遇到一个问题,是关于goroutine的泄露。

问题和分析

代码简化后如下:

package main

import (
	"errors"
	"fmt"
	"github.com/gin-contrib/pprof"
	"github.com/gin-gonic/gin"
	"log"
	"net/http"
)

func leakGrs() error {
	s := []int{1, 2, 3, 4, 5, 6, 7, 8, 9}
	ch := make(chan error)

	for i := range s {
		go func(i int) {
			var err error
			if i == 3 {
				err = errors.New("something wrong")
			}
			ch <- err
		}(i)
	}

	for range s {
		err := <-ch
		if err != nil {
			return err
		}
	}

	return nil
}

func handlerLeakGrs(c *gin.Context) {
	err := leakGrs()
	c.JSON(http.StatusOK, gin.H{
		"err": fmt.Sprintf("%+v", err),
	})
}

func main() {
	engine := gin.Default()
	pprof.Register(engine)
	engine.GET("/", handlerLeakGrs)
	server := http.Server{
		Addr:    ":8080",
		Handler: engine,
	}
	if err := server.ListenAndServe(); err != nil {
		log.Fatalf("cannot start server: %+v", err)
	}
}

由于引入了pprof,所以我们可以通过http://localhost:8080/debug/pprof/goroutine?debug=1访问到goroutine状况——

在一开始,一共有三个goroutine,分别是一个主G和两个net/http的G:

goroutine profile: total 3
1 @ 0x1038170 0x103178a 0x1030d55 0x10ca245 0x10cb141 0x10cb123 0x11a389f 0x11b684e 0x12c9a88 0x10677b1
#	0x1030d54	internal/poll.runtime_pollWait+0x54		/Users/yuchanns/go/go1.14/src/runtime/netpoll.go:203
#	0x10ca244	internal/poll.(*pollDesc).wait+0x44		/Users/yuchanns/go/go1.14/src/internal/poll/fd_poll_runtime.go:87
#	0x10cb140	internal/poll.(*pollDesc).waitRead+0x200	/Users/yuchanns/go/go1.14/src/internal/poll/fd_poll_runtime.go:92
#	0x10cb122	internal/poll.(*FD).Read+0x1e2			/Users/yuchanns/go/go1.14/src/internal/poll/fd_unix.go:169
#	0x11a389e	net.(*netFD).Read+0x4e				/Users/yuchanns/go/go1.14/src/net/fd_unix.go:202
#	0x11b684d	net.(*conn).Read+0x8d				/Users/yuchanns/go/go1.14/src/net/net.go:184
#	0x12c9a87	net/http.(*connReader).backgroundRead+0x57	/Users/yuchanns/go/go1.14/src/net/http/server.go:678

1 @ 0x1038170 0x103178a 0x1030d55 0x10ca245 0x10ccaa4 0x10cca86 0x11a4152 0x11bf2a2 0x11be0e4 0x12d42dd 0x12d4027 0x15882d9 0x1037d92 0x10677b1
#	0x1030d54	internal/poll.runtime_pollWait+0x54		/Users/yuchanns/go/go1.14/src/runtime/netpoll.go:203
#	0x10ca244	internal/poll.(*pollDesc).wait+0x44		/Users/yuchanns/go/go1.14/src/internal/poll/fd_poll_runtime.go:87
#	0x10ccaa3	internal/poll.(*pollDesc).waitRead+0x1d3	/Users/yuchanns/go/go1.14/src/internal/poll/fd_poll_runtime.go:92
#	0x10cca85	internal/poll.(*FD).Accept+0x1b5		/Users/yuchanns/go/go1.14/src/internal/poll/fd_unix.go:384
#	0x11a4151	net.(*netFD).accept+0x41			/Users/yuchanns/go/go1.14/src/net/fd_unix.go:238
#	0x11bf2a1	net.(*TCPListener).accept+0x31			/Users/yuchanns/go/go1.14/src/net/tcpsock_posix.go:139
#	0x11be0e3	net.(*TCPListener).Accept+0x63			/Users/yuchanns/go/go1.14/src/net/tcpsock.go:261
#	0x12d42dc	net/http.(*Server).Serve+0x25c			/Users/yuchanns/go/go1.14/src/net/http/server.go:2901
#	0x12d4026	net/http.(*Server).ListenAndServe+0xb6		/Users/yuchanns/go/go1.14/src/net/http/server.go:2830
#	0x15882d8	main.main+0x138					/Users/yuchanns/Coding/golang/gobyexample/main.go:51
#	0x1037d91	runtime.main+0x211				/Users/yuchanns/go/go1.14/src/runtime/proc.go:203

1 @ 0x13603a5 0x13601c0 0x135cf8a 0x136a3aa 0x1587ef8 0x1587ea7 0x157245b 0x1585cb0 0x157245b 0x1584de1 0x157245b 0x157c346 0x157ba1e 0x12d3f33 0x12cf8ac 0x10677b1
#	0x13603a4	runtime/pprof.writeRuntimeProfile+0x94				/Users/yuchanns/go/go1.14/src/runtime/pprof/pprof.go:694
#	0x13601bf	runtime/pprof.writeGoroutine+0x9f				/Users/yuchanns/go/go1.14/src/runtime/pprof/pprof.go:656
#	0x135cf89	runtime/pprof.(*Profile).WriteTo+0x3d9				/Users/yuchanns/go/go1.14/src/runtime/pprof/pprof.go:329
#	0x136a3a9	net/http/pprof.handler.ServeHTTP+0x339				/Users/yuchanns/go/go1.14/src/net/http/pprof/pprof.go:248
#	0x1587ef7	net/http.HandlerFunc.ServeHTTP+0x77				/Users/yuchanns/go/go1.14/src/net/http/server.go:2012
#	0x1587ea6	github.com/gin-contrib/pprof.pprofHandler.func1+0x26		/Users/yuchanns/go/pkg/mod/github.com/gin-contrib/pprof@v1.3.0/pprof.go:56
#	0x157245a	github.com/gin-gonic/gin.(*Context).Next+0x3a			/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/context.go:156
#	0x1585caf	github.com/gin-gonic/gin.RecoveryWithWriter.func1+0x5f		/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/recovery.go:83
#	0x157245a	github.com/gin-gonic/gin.(*Context).Next+0x3a			/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/context.go:156
#	0x1584de0	github.com/gin-gonic/gin.LoggerWithConfig.func1+0xe0		/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/logger.go:241
#	0x157245a	github.com/gin-gonic/gin.(*Context).Next+0x3a			/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/context.go:156
#	0x157c345	github.com/gin-gonic/gin.(*Engine).handleHTTPRequest+0x665	/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/gin.go:409
#	0x157ba1d	github.com/gin-gonic/gin.(*Engine).ServeHTTP+0x17d		/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/gin.go:367
#	0x12d3f32	net/http.serverHandler.ServeHTTP+0xa2				/Users/yuchanns/go/go1.14/src/net/http/server.go:2807
#	0x12cf8ab	net/http.(*conn).serve+0x86b					/Users/yuchanns/go/go1.14/src/net/http/server.go:1895

然后对http://localhost:8080/debug/pprof/goroutine?debug=1进行多次访问,再次查看goroutine状态,发现G的数量多达153,其中150个全是main.leakGrs这个方法的:

goroutine profile: total 153
150 @ 0x1038170 0x1006dfd 0x1006bc5 0x15883ee 0x10677b1
#	0x15883ed	main.leakGrs.func1+0x4d	/Users/yuchanns/Coding/golang/gobyexample/main.go:22

1 @ 0x1038170 0x103178a 0x1030d55 0x10ca245 0x10ccaa4 0x10cca86 0x11a4152 0x11bf2a2 0x11be0e4 0x12d42dd 0x12d4027 0x15882d9 0x1037d92 0x10677b1
#	0x1030d54	internal/poll.runtime_pollWait+0x54		/Users/yuchanns/go/go1.14/src/runtime/netpoll.go:203
#	0x10ca244	internal/poll.(*pollDesc).wait+0x44		/Users/yuchanns/go/go1.14/src/internal/poll/fd_poll_runtime.go:87
#	0x10ccaa3	internal/poll.(*pollDesc).waitRead+0x1d3	/Users/yuchanns/go/go1.14/src/internal/poll/fd_poll_runtime.go:92
#	0x10cca85	internal/poll.(*FD).Accept+0x1b5		/Users/yuchanns/go/go1.14/src/internal/poll/fd_unix.go:384
#	0x11a4151	net.(*netFD).accept+0x41			/Users/yuchanns/go/go1.14/src/net/fd_unix.go:238
#	0x11bf2a1	net.(*TCPListener).accept+0x31			/Users/yuchanns/go/go1.14/src/net/tcpsock_posix.go:139
#	0x11be0e3	net.(*TCPListener).Accept+0x63			/Users/yuchanns/go/go1.14/src/net/tcpsock.go:261
#	0x12d42dc	net/http.(*Server).Serve+0x25c			/Users/yuchanns/go/go1.14/src/net/http/server.go:2901
#	0x12d4026	net/http.(*Server).ListenAndServe+0xb6		/Users/yuchanns/go/go1.14/src/net/http/server.go:2830
#	0x15882d8	main.main+0x138					/Users/yuchanns/Coding/golang/gobyexample/main.go:51
#	0x1037d91	runtime.main+0x211				/Users/yuchanns/go/go1.14/src/runtime/proc.go:203

1 @ 0x1054a5e 0x10b3ee6 0x10cb071 0x10cb03a 0x11a389f 0x11b684e 0x12c9a88 0x10677b1
#	0x1054a5d	syscall.syscall+0x2d				/Users/yuchanns/go/go1.14/src/runtime/sys_darwin.go:63
#	0x10b3ee5	syscall.read+0x65				/Users/yuchanns/go/go1.14/src/syscall/zsyscall_darwin_amd64.go:1242
#	0x10cb070	syscall.Read+0x130				/Users/yuchanns/go/go1.14/src/syscall/syscall_unix.go:189
#	0x10cb039	internal/poll.(*FD).Read+0xf9			/Users/yuchanns/go/go1.14/src/internal/poll/fd_unix.go:165
#	0x11a389e	net.(*netFD).Read+0x4e				/Users/yuchanns/go/go1.14/src/net/fd_unix.go:202
#	0x11b684d	net.(*conn).Read+0x8d				/Users/yuchanns/go/go1.14/src/net/net.go:184
#	0x12c9a87	net/http.(*connReader).backgroundRead+0x57	/Users/yuchanns/go/go1.14/src/net/http/server.go:678

1 @ 0x13603a5 0x13601c0 0x135cf8a 0x136a3aa 0x1587ef8 0x1587ea7 0x157245b 0x1585cb0 0x157245b 0x1584de1 0x157245b 0x157c346 0x157ba1e 0x12d3f33 0x12cf8ac 0x10677b1
#	0x13603a4	runtime/pprof.writeRuntimeProfile+0x94				/Users/yuchanns/go/go1.14/src/runtime/pprof/pprof.go:694
#	0x13601bf	runtime/pprof.writeGoroutine+0x9f				/Users/yuchanns/go/go1.14/src/runtime/pprof/pprof.go:656
#	0x135cf89	runtime/pprof.(*Profile).WriteTo+0x3d9				/Users/yuchanns/go/go1.14/src/runtime/pprof/pprof.go:329
#	0x136a3a9	net/http/pprof.handler.ServeHTTP+0x339				/Users/yuchanns/go/go1.14/src/net/http/pprof/pprof.go:248
#	0x1587ef7	net/http.HandlerFunc.ServeHTTP+0x77				/Users/yuchanns/go/go1.14/src/net/http/server.go:2012
#	0x1587ea6	github.com/gin-contrib/pprof.pprofHandler.func1+0x26		/Users/yuchanns/go/pkg/mod/github.com/gin-contrib/pprof@v1.3.0/pprof.go:56
#	0x157245a	github.com/gin-gonic/gin.(*Context).Next+0x3a			/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/context.go:156
#	0x1585caf	github.com/gin-gonic/gin.RecoveryWithWriter.func1+0x5f		/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/recovery.go:83
#	0x157245a	github.com/gin-gonic/gin.(*Context).Next+0x3a			/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/context.go:156
#	0x1584de0	github.com/gin-gonic/gin.LoggerWithConfig.func1+0xe0		/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/logger.go:241
#	0x157245a	github.com/gin-gonic/gin.(*Context).Next+0x3a			/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/context.go:156
#	0x157c345	github.com/gin-gonic/gin.(*Engine).handleHTTPRequest+0x665	/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/gin.go:409
#	0x157ba1d	github.com/gin-gonic/gin.(*Engine).ServeHTTP+0x17d		/Users/yuchanns/go/pkg/mod/github.com/gin-gonic/gin@v1.6.2/gin.go:367
#	0x12d3f32	net/http.serverHandler.ServeHTTP+0xa2				/Users/yuchanns/go/go1.14/src/net/http/server.go:2807
#	0x12cf8ab	net/http.(*conn).serve+0x86b					/Users/yuchanns/go/go1.14/src/net/http/server.go:1895

同事贴出他的代码,询问错误在哪里。实际代码比较复杂,特别是在循环返回错误那块,因此一时之间大家也没看出问题所在。

幸好因为事先知道问题在于这段代码造成的泄露,经过简化,很快大家就弄清楚了问题在于使用了go关键字的循环中,往无缓冲的通道写数据,而下面的通道读数据却并不总是全部读完,就会返回。这就造成了多个goroutine阻塞等待通道让位可以写入数据。

而goroutine都是平等而独立的,并且不会被gc所回收,一旦发生阻塞,就会一直存在,也就形成了事实上的“goroutine泄露”。更可怕的就是一次请求产生少量的泄露,一天下来的大量请求则会造成海量的goroutine泄露,数量飙升,后果就会变得严重,导致监控告警。

解决方法

代码中有两处错误,第一处是使用无缓冲通道。

使用无缓冲通道,通道只能容纳一个单位类型的数据;当存在多个写入成员时,需要等待原有的数据被读取后才能再次写入,造成阻塞排队。

所以提倡应该使用有缓冲的通道,并且把通道长度设置成写入成员的个数。

func leakGrs() error {
	s := []int{1, 2, 3, 4, 5, 6, 7, 8, 9}
	ch := make(chan error, 9)

ch := make(chan error)加上长度9,就不会发生泄露的问题了。所有的goroutine完成了通道写入,自动退出;而尽管读取通道并没有用到那么多数据,随着方法的结束,通道也将被gc回收。

第二处错误则是遇到错误直接return。

这种错误写法源于对常驻进程的不了解。这位同事和我一样是从php转语言而来的。习惯于php-fpm的程序员一般认为变量的生命周期仅在于一次请求当中;当请求结束后,代码内容全被清空,自然也就不存在链接忘记回收、goroutine阻塞等泄露问题。

所以即使发生了错误,正确的做法也应该是收集全部错误信息,或者及时通知goroutine退出。

总结

goroutine很好用,可以以极小的资源轻松创建成千上万,但这并不意味着可以被滥用。

防止goroutine泄露,一定要做好协程的退出准备和意外退出处理。

无缓冲通道会造成写入阻塞。