Drone Server Hangs (from event stream?)

Trying out “latest” drone/drone:0.8.0 and running into issues. I’ll start it and be able to access it fine. After a little bit of time though (~20min) it’ll become unresponsive. I see the following in my server logs:

time="2017-09-15T15:38:07Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T15:38:08Z" level=debug msg="user feed: connection opened" 
fatal error: runtime: out of memory

runtime stack:
runtime.throw(0xe7e0e2, 0x16)
	/usr/local/go/src/runtime/panic.go:596 +0x95
runtime.sysMap(0xc477600000, 0x100000, 0x43d900, 0x16ac118)
	/usr/local/go/src/runtime/mem_linux.go:216 +0x1d0
runtime.(*mheap).sysAlloc(0x168c080, 0x100000, 0x45ca42)
	/usr/local/go/src/runtime/malloc.go:428 +0x374
runtime.(*mheap).grow(0x168c080, 0x6, 0x0)
	/usr/local/go/src/runtime/mheap.go:774 +0x62
runtime.(*mheap).allocSpanLocked(0x168c080, 0x6, 0x7f18d7a3c728)
	/usr/local/go/src/runtime/mheap.go:678 +0x44f
runtime.(*mheap).alloc_m(0x168c080, 0x6, 0x34, 0x7f18e4c27000)
	/usr/local/go/src/runtime/mheap.go:562 +0xe2
runtime.(*mheap).alloc.func1()
	/usr/local/go/src/runtime/mheap.go:627 +0x4b
runtime.systemstack(0x7f18e73c5c18)
	/usr/local/go/src/runtime/asm_amd64.s:343 +0xab
runtime.(*mheap).alloc(0x168c080, 0x6, 0x10000000034, 0x7f18d7a3c728)
	/usr/local/go/src/runtime/mheap.go:628 +0xa0
runtime.(*mcentral).grow(0x168ea70, 0x0)
	/usr/local/go/src/runtime/mcentral.go:212 +0x8e
runtime.(*mcentral).cacheSpan(0x168ea70, 0x7f18d7a3c728)
	/usr/local/go/src/runtime/mcentral.go:93 +0x12a
runtime.(*mcache).refill(0x7f18e8d8a000, 0x34, 0x7f18d7a3c728)
	/usr/local/go/src/runtime/mcache.go:122 +0xa0
runtime.(*mcache).nextFree.func1()
	/usr/local/go/src/runtime/malloc.go:526 +0x32
runtime.systemstack(0xc420021300)
	/usr/local/go/src/runtime/asm_amd64.s:327 +0x79
runtime.mstart()
	/usr/local/go/src/runtime/proc.go:1132

goroutine 160052 [running]:
runtime.systemstack_switch()
	/usr/local/go/src/runtime/asm_amd64.s:281 fp=0xc4756133b0 sp=0xc4756133a8
runtime.(*mcache).nextFree(0x7f18e8d8a000, 0xc43d3f2834, 0x90, 0xc46800f3e8, 0xaead01)
	/usr/local/go/src/runtime/malloc.go:527 +0xb9 fp=0xc475613408 sp=0xc4756133b0
runtime.mallocgc(0x2600, 0xddb4a0, 0xc475613501, 0x40e29f)
	/usr/local/go/src/runtime/malloc.go:679 +0x827 fp=0xc4756134a8 sp=0xc475613408
runtime.newarray(0xddb4a0, 0x40, 0xc45a3aa300)
	/usr/local/go/src/runtime/malloc.go:821 +0x64 fp=0xc4756134e8 sp=0xc4756134a8
runtime.hashGrow(0xd6b4c0, 0xc47551e990)
	/usr/local/go/src/runtime/hashmap.go:880 +0xbe fp=0xc475613528 sp=0xc4756134e8
runtime.mapassign(0xd6b4c0, 0xc47551e990, 0xc45a3aa3f0, 0xc46800f849)
	/usr/local/go/src/runtime/hashmap.go:560 +0x375 fp=0xc4756135c8 sp=0xc475613528
github.com/drone/drone/server.EventStreamSSE(0xc46a274160)
	/go/src/github.com/drone/drone/server/stream.go:50 +0x329 fp=0xc4756137b8 sp=0xc4756135c8
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Context).Next(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/context.go:97 +0x5a fp=0xc4756137d8 sp=0xc4756137b8
github.com/drone/drone/router/middleware/token.Refresh(0xc46a274160)
	/go/src/github.com/drone/drone/router/middleware/token/token.go:26 +0x317 fp=0xc475613888 sp=0xc4756137d8
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Context).Next(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/context.go:97 +0x5a fp=0xc4756138a8 sp=0xc475613888
github.com/drone/drone/router/middleware/session.SetUser.func1(0xc46a274160)
	/go/src/github.com/drone/drone/router/middleware/session/user.go:68 +0xd2 fp=0xc475613918 sp=0xc4756138a8
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Context).Next(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/context.go:97 +0x5a fp=0xc475613938 sp=0xc475613918
github.com/drone/drone/router/middleware.Store.func1(0xc46a274160)
	/go/src/github.com/drone/drone/router/middleware/store.go:15 +0x57 fp=0xc475613968 sp=0xc475613938
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Context).Next(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/context.go:97 +0x5a fp=0xc475613988 sp=0xc475613968
github.com/drone/drone/vendor/github.com/gin-gonic/contrib/ginrus.Ginrus.func1(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/contrib/ginrus/ginrus.go:26 +0xbf fp=0xc475613ae8 sp=0xc475613988
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Context).Next(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/context.go:97 +0x5a fp=0xc475613b08 sp=0xc475613ae8
github.com/drone/drone/router/middleware/header.Options(0xc46a274160)
	/go/src/github.com/drone/drone/router/middleware/header/header.go:25 +0x40 fp=0xc475613b40 sp=0xc475613b08
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Context).Next(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/context.go:97 +0x5a fp=0xc475613b60 sp=0xc475613b40
github.com/drone/drone/router/middleware/header.NoCache(0xc46a274160)
	/go/src/github.com/drone/drone/router/middleware/header/header.go:17 +0x129 fp=0xc475613bc0 sp=0xc475613b60
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Context).Next(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/context.go:97 +0x5a fp=0xc475613be0 sp=0xc475613bc0
github.com/drone/drone/vendor/github.com/gin-gonic/gin.RecoveryWithWriter.func1(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/recovery.go:45 +0x5a fp=0xc475613c10 sp=0xc475613be0
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Context).Next(0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/context.go:97 +0x5a fp=0xc475613c30 sp=0xc475613c10
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Engine).handleHTTPRequest(0xc420113960, 0xc46a274160)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/gin.go:284 +0x5c0 fp=0xc475613d28 sp=0xc475613c30
github.com/drone/drone/vendor/github.com/gin-gonic/gin.(*Engine).ServeHTTP(0xc420113960, 0x14dc1e0, 0xc444e75e80, 0xc43998ac00)
	/go/src/github.com/drone/drone/vendor/github.com/gin-gonic/gin/gin.go:265 +0x16b fp=0xc475613d60 sp=0xc475613d28
net/http.serverHandler.ServeHTTP(0xc4202c0370, 0x14dc1e0, 0xc444e75e80, 0xc43998ac00)
	/usr/local/go/src/net/http/server.go:2568 +0x92 fp=0xc475613da8 sp=0xc475613d60
net/http.initNPNRequest.ServeHTTP(0xc4201de380, 0xc4202c0370, 0x14dc1e0, 0xc444e75e80, 0xc43998ac00)
	/usr/local/go/src/net/http/server.go:3088 +0x93 fp=0xc475613f10 sp=0xc475613da8
net/http.(*initNPNRequest).ServeHTTP(0xc420332190, 0x14dc1e0, 0xc444e75e80, 0xc43998ac00)
	<autogenerated>:312 +0x74 fp=0xc475613f50 sp=0xc475613f10
net/http.(Handler).ServeHTTP-fm(0x14dc1e0, 0xc444e75e80, 0xc43998ac00)
	/usr/local/go/src/net/http/h2_bundle.go:4319 +0x4d fp=0xc475613f80 sp=0xc475613f50
net/http.(*http2serverConn).runHandler(0xc4200641c0, 0xc444e75e80, 0xc43998ac00, 0xc4696682e0)
	/usr/local/go/src/net/http/h2_bundle.go:4599 +0x89 fp=0xc475613fc0 sp=0xc475613f80
runtime.goexit()
	/usr/local/go/src/runtime/asm_amd64.s:2197 +0x1 fp=0xc475613fc8 sp=0xc475613fc0
created by net/http.(*http2serverConn).processHeaders
	/usr/local/go/src/net/http/h2_bundle.go:4331 +0x4ac

goroutine 1 [semacquire, 51 minutes]:
sync.runtime_Semacquire(0xc420205114)
	/usr/local/go/src/runtime/sema.go:47 +0x34
sync.(*WaitGroup).Wait(0xc420205108)
	/usr/local/go/src/sync/waitgroup.go:131 +0x7a
github.com/drone/drone/vendor/golang.org/x/sync/errgroup.(*Group).Wait(0xc420205100, 0xc4202281a0, 0xb)
	/go/src/github.com/drone/drone/vendor/golang.org/x/sync/errgroup/errgroup.go:41 +0x31
main.server(0xc4203b5040, 0xc400000000, 0x1686e00)
	/go/src/github.com/drone/drone/extras/cmd/drone-server/server.go:539 +0x4e0
github.com/drone/drone/vendor/github.com/urfave/cli.HandleAction(0xd3b5c0, 0xeab9a0, 0xc4203b5040, 0xc4203b85a0, 0x0)
	/go/src/github.com/drone/drone/vendor/github.com/urfave/cli/app.go:485 +0xd4
github.com/drone/drone/vendor/github.com/urfave/cli.(*App).Run(0xc4203ec000, 0xc420062150, 0x1, 0x1, 0x0, 0x0)
	/go/src/github.com/drone/drone/vendor/github.com/urfave/cli/app.go:259 +0x70d
main.main()
	/go/src/github.com/drone/drone/extras/cmd/drone-server/main.go:22 +0x18c

goroutine 17 [syscall, 51 minutes, locked to thread]:
runtime.goexit()
	/usr/local/go/src/runtime/asm_amd64.s:2197 +0x1

goroutine 22 [chan receive, 51 minutes]:
database/sql.(*DB).connectionOpener(0xc4203ab7c0)
	/usr/local/go/src/database/sql/sql.go:837 +0x4a
created by database/sql.Open
	/usr/local/go/src/database/sql/sql.go:582 +0x212

goroutine 50 [IO wait]:
net.runtime_pollWait(0x7f18e753c840, 0x72, 0x3)
	/usr/local/go/src/runtime/netpoll.go:164 +0x59
net.(*pollDesc).wait(0xc42017a068, 0x72, 0x14d6760, 0x14d04e0)
	/usr/local/go/src/net/fd_poll_runtime.go:75 +0x38
net.(*pollDesc).waitRead(0xc42017a068, 0xc4202a6000, 0x1000)
	/usr/local/go/src/net/fd_poll_runtime.go:80 +0x34
net.(*netFD).Read(0xc42017a000, 0xc4202a6000, 0x1000, 0x1000, 0x0, 0x14d6760, 0x14d04e0)
	/usr/local/go/src/net/fd_unix.go:250 +0x1b7
net.(*conn).Read(0xc42000e008, 0xc4202a6000, 0x1000, 0x1000, 0x0, 0x0, 0x0)
	/usr/local/go/src/net/net.go:181 +0x70
crypto/tls.(*block).readFromUntil(0xc4203ae990, 0x7f18e753c9e8, 0xc42000e008, 0x5, 0xc42000e008, 0x6)
	/usr/local/go/src/crypto/tls/conn.go:488 +0x98
crypto/tls.(*Conn).readRecord(0xc4201de380, 0xeac217, 0xc4201de4a0, 0x2)
	/usr/local/go/src/crypto/tls/conn.go:590 +0xc4
crypto/tls.(*Conn).Read(0xc4201de380, 0xc42006c9f8, 0x9, 0x9, 0x0, 0x0, 0x0)
	/usr/local/go/src/crypto/tls/conn.go:1134 +0x11d
io.ReadAtLeast(0x7f18e753ca10, 0xc4201de380, 0xc42006c9f8, 0x9, 0x9, 0x9, 0xc426318b90, 0xc420068250, 0xc426318bd0)
	/usr/local/go/src/io/io.go:307 +0xa9
io.ReadFull(0x7f18e753ca10, 0xc4201de380, 0xc42006c9f8, 0x9, 0x9, 0xc4202cdf44, 0x2, 0x2)
	/usr/local/go/src/io/io.go:325 +0x58
net/http.http2readFrameHeader(0xc42006c9f8, 0x9, 0x9, 0x7f18e753ca10, 0xc4201de380, 0x0, 0xc400000000, 0x0, 0x0)
	/usr/local/go/src/net/http/h2_bundle.go:781 +0x7b
net/http.(*http2Framer).ReadFrame(0xc42006c9c0, 0xc4203b8a20, 0x0, 0x1, 0x0)
	/usr/local/go/src/net/http/h2_bundle.go:1008 +0xa4
net/http.(*http2serverConn).readFrames(0xc4200641c0)
	/usr/local/go/src/net/http/h2_bundle.go:3443 +0xab
created by net/http.(*http2serverConn).serve
	/usr/local/go/src/net/http/h2_bundle.go:3544 +0x305

goroutine 244 [select, 11 minutes]:
github.com/drone/drone/vendor/github.com/cncd/pubsub.(*publisher).Subscribe(0xc42028fb50, 0x14dd5a0, 0xc4203f4790, 0xe7335e, 0xc, 0xc42062df50, 0xae40be, 0xc420630680)
	/go/src/github.com/drone/drone/vendor/github.com/cncd/pubsub/pub.go:58 +0x209
github.com/drone/drone/server.EventStreamSSE.func2(0xc4203f4790, 0xc42052a390, 0x14dd2e0, 0xc420310f40, 0xc420206a80, 0xc42049f180)
	/go/src/github.com/drone/drone/server/stream.go:78 +0xe5
created by github.com/drone/drone/server.EventStreamSSE
	/go/src/github.com/drone/drone/server/stream.go:80 +0x45e

goroutine 28 [select, 51 minutes]:
github.com/drone/drone/vendor/github.com/cncd/queue.(*fifo).Poll(0xc42004fe30, 0x14dd3a0, 0xc42031e3c0, 0xc42031e510, 0xc4202d36c8, 0x416278, 0x30)
	/go/src/github.com/drone/drone/vendor/github.com/cncd/queue/fifo.go:65 +0x22f
github.com/drone/drone/model.(*persistentQueue).Poll(0xc4202b6c80, 0x14dd3a0, 0xc42031e3c0, 0xc42031e510, 0x0, 0x0, 0x413af8)
	/go/src/github.com/drone/drone/model/queue.go:59 +0x6f
github.com/drone/drone/server.(*RPC).Next(0xc4202d3800, 0x14dd3a0, 0xc42031e3c0, 0xc42031e4b0, 0x0, 0x0, 0x0, 0xc42031e330, 0xd730e0)
	/go/src/github.com/drone/drone/server/rpc.go:99 +0xe2
github.com/drone/drone/server.(*DroneServer).Next(0xc4202dcea0, 0x7f18e753dee0, 0xc42031e3c0, 0xc42000e0d8, 0xc4202dcea0, 0xc42031e2a0, 0x14dd3a0)
	/go/src/github.com/drone/drone/server/rpc.go:525 +0x186
github.com/drone/drone/vendor/github.com/cncd/pipeline/pipeline/rpc/proto._Drone_Next_Handler.func1(0x7f18e753dee0, 0xc42031e3c0, 0xdb8480, 0xc42000e0d8, 0x0, 0xc420401c00, 0x0, 0xc42002e800)
	/go/src/github.com/drone/drone/vendor/github.com/cncd/pipeline/pipeline/rpc/proto/drone.pb.go:606 +0x86
main.(*authorizer).unaryIntercaptor(0xc4204016a0, 0x7f18e753dee0, 0xc42031e3c0, 0xdb8480, 0xc42000e0d8, 0xc420401be0, 0xc420401c00, 0x20, 0xd9ce00, 0x1, ...)
	/go/src/github.com/drone/drone/extras/cmd/drone-server/server.go:667 +0xdc
main.(*authorizer).(main.unaryIntercaptor)-fm(0x7f18e753dee0, 0xc42031e3c0, 0xdb8480, 0xc42000e0d8, 0xc420401be0, 0xc420401c00, 0x0, 0xc4202cd9d8, 0x416278, 0x50)
	/go/src/github.com/drone/drone/extras/cmd/drone-server/server.go:507 +0x73
github.com/drone/drone/vendor/github.com/cncd/pipeline/pipeline/rpc/proto._Drone_Next_Handler(0xded0e0, 0xc4202dcea0, 0x7f18e753dee0, 0xc42031e3c0, 0xc420015770, 0xc42023b120, 0x0, 0x0, 0x0, 0x0)
	/go/src/github.com/drone/drone/vendor/github.com/cncd/pipeline/pipeline/rpc/proto/drone.pb.go:608 +0x177
github.com/drone/drone/vendor/google.golang.org/grpc.(*Server).processUnaryRPC(0xc4201c5400, 0x14e05a0, 0xc4201e9600, 0xc4201c9000, 0xc420017c80, 0x14c51a0, 0xc42031e420, 0x0, 0x0)
	/go/src/github.com/drone/drone/vendor/google.golang.org/grpc/server.go:781 +0xc41
github.com/drone/drone/vendor/google.golang.org/grpc.(*Server).handleStream(0xc4201c5400, 0x14e05a0, 0xc4201e9600, 0xc4201c9000, 0xc42031e420)
	/go/src/github.com/drone/drone/vendor/google.golang.org/grpc/server.go:981 +0x15a6
github.com/drone/drone/vendor/google.golang.org/grpc.(*Server).serveStreams.func1.1(0xc42023b290, 0xc4201c5400, 0x14e05a0, 0xc4201e9600, 0xc4201c9000)
	/go/src/github.com/drone/drone/vendor/google.golang.org/grpc/server.go:551 +0xa9
created by github.com/drone/drone/vendor/google.golang.org/grpc.(*Server).serveStreams.func1
	/go/src/github.com/drone/drone/vendor/google.golang.org/grpc/server.go:552 +0xa1

Entually just looping the following over and over:

goroutine 963 [select, 11 minutes]:
github.com/drone/drone/vendor/github.com/cncd/pubsub.(*publisher).Subscribe(0xc42028fb50, 0x14dd5a0, 0xc4202c0160, 0xe7335e, 0xc, 0xc420253830, 0xc420c00001, 0xc420c087c0)
	/go/src/github.com/drone/drone/vendor/github.com/cncd/pubsub/pub.go:58 +0x209
github.com/drone/drone/server.EventStreamSSE.func2(0xc4202c0160, 0xc42097a600, 0x14dd2e0, 0xc420624c00, 0xc420abcd80, 0xc42018cb30)
	/go/src/github.com/drone/drone/server/stream.go:78 +0xe5
created by github.com/drone/drone/server.EventStreamSSE
	/go/src/github.com/drone/drone/server/stream.go:80 +0x45e

goroutine 1182 [select, 11 minutes]:
github.com/drone/drone/vendor/github.com/cncd/pubsub.(*publisher).Subscribe(0xc42028fb50, 0x14dd5a0, 0xc4203f4210, 0xe7335e, 0xc, 0xc4210b9050, 0xae0001, 0xc420e9d7c0)
	/go/src/github.com/drone/drone/vendor/github.com/cncd/pubsub/pub.go:58 +0x209
github.com/drone/drone/server.EventStreamSSE.func2(0xc4203f4210, 0xc4210b8ff0, 0x14dd2e0, 0xc42099edc0, 0xc420225440, 0xc420716b10)
	/go/src/github.com/drone/drone/server/stream.go:78 +0xe5
created by github.com/drone/drone/server.EventStreamSSE
	/go/src/github.com/drone/drone/server/stream.go:80 +0x45e

Before this point the logs seem really chatty (about 20 requests/sec) with just 1 web browser open to it with logs such as:

time="2017-09-15T17:27:26Z" level=info ip=x.x.x.x latency=330.191305ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:26Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:26Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:26Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:26Z" level=info ip=x.x.x.x latency=476.008382ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:26Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:26Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:27Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:27Z" level=info ip=x.x.x.x latency=488.787609ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:27Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:27Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:27Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:27Z" level=info ip=x.x.x.x latency=497.800961ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:27Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:27Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:27Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:27Z" level=info ip=x.x.x.x latency=394.039755ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:27Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:28Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:28Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:28Z" level=info ip=x.x.x.x latency=424.490579ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:28Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:28Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:28Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:28Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:28Z" level=info ip=x.x.x.x latency=506.217952ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:28Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:28Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:29Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:29Z" level=info ip=x.x.x.x latency=718.627886ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:29Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:29Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:29Z" level=info ip=x.x.x.x latency=540.017108ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:29Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:29Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:29Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:29Z" level=info ip=x.x.x.x latency=576.751723ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:29Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:29Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:29Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:30Z" level=info ip=x.x.x.x latency=539.214457ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:29Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:30Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:30Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:30Z" level=info ip=x.x.x.x latency=543.4731ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:30Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:30Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:30Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:30Z" level=info ip=x.x.x.x latency=439.569993ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:30Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:30Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:30Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:30Z" level=info ip=x.x.x.x latency=243.54935ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:30Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:31Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:31Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:31Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:31Z" level=info ip=x.x.x.x latency=431.377976ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:31Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:31Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:31Z" level=info ip=x.x.x.x latency=400.128903ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:31Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:31Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:31Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:31Z" level=info ip=x.x.x.x latency=469.531503ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:31Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:31Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:32Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:32Z" level=debug msg="user feed: connection opened" 
time="2017-09-15T17:27:32Z" level=info ip=x.x.x.x latency=417.917335ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:32Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 
time="2017-09-15T17:27:32Z" level=debug msg="user feed: connection closed" 
time="2017-09-15T17:27:32Z" level=info ip=x.x.x.x latency=415.659295ms method=GET path="/stream/events" status=200 time="2017-09-15T17:27:32Z" user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 

It will also be running the server with 30-40% CPU utilization. I also can see memory utilization grow steadily until it “runs out”.

This is currently running on a 2CPU 2GB system. Previous version (rc3?) seems to run fine.

This would tell me the server sent event (sse) keeps reconnecting over and over. This would (generally) happen when the sse connection is being killed and the browser is automatically reconnecting. Can you see any evidence of this?

OK, that gives me something to look into. Any tips on how to TS such a thing?

do you see the above error before all the reconnects? If so it sounds like the server is running out of memory. Are you running builds or any other processes on the same server as drone that could be eating up memory? If not maybe need to run pprof to see if there is a leak?

Error comes eventually after all the reconnects (~20min) before hitting that … though maybe that still isn’t good?

Not running builds or anything else … just having drone UI open in a browser.

I can hit the pprof endpoint to get some more there. But would like to figure out this connect/disconnect/connect/etc thing as well. But not sure how to start TS’ing that as I’m not a web dev and have no experience with sse.

Error comes eventually after all the reconnects (~20min) before hitting that … though maybe that still isn’t good?

Probably not. It would be interesting to see if there are open file descriptors or something that start to build up.

The cool thing about server sent events is that they are just plain http requests, with a special content type set to text/event-stream. The server keeps the request open and continues to write to the response body using a simple text protocol, like this:

id: 1
data: hello

id: 2
data: world

The browser parses the response stream as it receives data from the server. The browser also automatically handles reconnects when the connection breaks. It is a really nice, simple protocol and api :slight_smile:

me too :slight_smile:

Is there any useful information in the browser console? Also which browser are you using? If not Chrome, can you repeat the issue in Chrome?

In the meantime, I will look at the frontend code to see if there is any additional information I can add that might help with debugging.

There are two differences I can think of.

The first difference is that we bundle a polyfill with rc.5 primarily for internet explorer. I remember reading that the polyfill will override the default Firefox implementation which is only partially compliant with the specification. Maybe there is an issue with the polyfill?

The second is that the browser does not automatically reconnect under some circumstances, so I added code to augment this behavior and to automatically reconnect. It is possible that I did not implement this correctly, or maybe it is only working properly in Chrome.

Let me know which browser and perhaps that will help me reproduce. Thanks and sorry about this :frowning:

Just determined that this seems to be an issue when using Safari (Version 10.1.2 (12603.3.8)) but not Chrome (Version 61.0.3163.79).

Chrome seems to be fine. :slight_smile:

But still need it to work in Safari too. :slight_smile:

EDIT: Firefox (55.0.3) seems OK also.

Now tracking with https://github.com/drone/drone-ui/issues/169

Good news. I can totally repeat this behavior with Safari.

I confirmed there is no build-up of file descriptors which is good news. I was unable to crash my digital ocean server (1gb ram, 1vcpu) which is also somewhat good news.

I do not have a root cause yet, but the fact that I can reproduce is a good start.

OK, this is definitely strange. I can repeat this issue with my deployed servers (ie beta.drone.io) but I cannot repeat this issue running drone locally, at localhost. The only difference between the two that I can think of are https (and http2)

:frowning:

You know I am desperate when I do this: https://stackoverflow.com/questions/46292945/safari-server-sent-event-sse-infinte-loop

1 Like

OK, so I have isolated the issue to http2. When I remove http2 as an accepted protocol, Safari is able to connect without issue.

		httpServer := &http.Server{
			Addr: ":443",
			TLSConfig: &tls.Config{
				GetCertificate: manager.GetCertificate,
-				NextProtos:     []string{"h2", "http/1.1"},
+				NextProtos:     []string{"http/1.1"},
			},
		}

I can reproduce with the following Go program (which uses lets encrypt)

package main

import (
	"crypto/tls"
	"flag"
	"io"
	"log"
	"net/http"
	"strings"
	"time"

	"golang.org/x/crypto/acme/autocert"
)

var (
	domain = flag.String("domain", "", "domain name")
	protos = flag.String("protos", "h2,http/1.1", "supported protocols")
)

func main() {
	flag.Parse()

	http.HandleFunc("/", handler)

	manager := autocert.Manager{
		Prompt:     autocert.AcceptTOS,
		HostPolicy: autocert.HostWhitelist(*domain),
	}
	httpServer := &http.Server{
		Addr: ":443",
		TLSConfig: &tls.Config{
			GetCertificate: manager.GetCertificate,
			NextProtos:     strings.Split(*protos, ","),
		},
	}
	httpServer.ListenAndServeTLS("", "")
}

func handler(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")
	w.Header().Set("X-Accel-Buffering", "no")

	flusher, ok := w.(http.Flusher)
	if !ok {
		w.WriteHeader(500)
		log.Println("streaming not supported")
		return
	}

	io.WriteString(w, ": ping\n\n")
	flusher.Flush()

	log.Println("connection opened")

	for {
		select {
		case <-w.(http.CloseNotifier).CloseNotify():
			log.Println("connection closed")
			return
		case <-time.After(time.Second * 5):
			io.WriteString(w, ": ping\n\n")
			flusher.Flush()
		}
	}
}

If fails when I enable http2

./demo --domain=my.domain.com --protos="h2,http/1.1"

It works when I disable http2

./demo --domain=my.domain.com --protos="http/1.1"

The challenge here is that Go enables http2 by default now and globally disabling http2 would definitely be a bummer. I will see if there is some magic we can use to disable http2 for Safari only …

for reference, I also posted to the gopher mailing list https://groups.google.com/forum/#!topic/golang-nuts/fBR6kBmZY3U