database/sql: rows.Next panic from concurrent map writes

問題描述

我們最近用golang重構了以前c++的系統,在最後上線前的壓測環節,我負責的模塊偶現如下錯誤:

[mysql] 2018/09/02 23:17:35 packets.go:72: read tcp xx:39151->xx:3306: i/o timeout
 [mysql] 2018/09/02 23:17:35 packets.go:408: busy buffer
 [mysql] 2018/09/02 23:17:35 packets.go:72: read tcp xx:39151->xx:3306: i/o timeout
 [mysql] 2018/09/02 23:17:35 packets.go:408: busy buffer
 [mysql] 2018/09/02 23:17:35 connection.go:372: invalid connection
 [mysql] 2018/09/02 23:17:35 connection.go:372: invalid connection
 [mysql] 2018/09/02 23:17:35 packets.go:72: read tcp xx:39151->xx:3306: i/o timeout
 [mysql] 2018/09/02 23:17:35 packets.go:408: busy buffer
 [mysql] 2018/09/02 23:17:35 connection.go:372: invalid connection
 fatal error: concurrent map writes

   goroutine 4932 [running]:
    runtime.throw(0xcadf03, 0x15)
         /home/doctorq/soft/sf-go/src/runtime/panic.go:605 +0x95 fp=0xc422162758 sp=0xc422162738 pc=0x42d415
     runtime.mapdelete_fast64(0xb7d760, 0xc420180ff0, 0x91d)
         /home/doctorq/soft/sf-go/src/runtime/hashmap_fast.go:755 +0x212 fp=0xc4221627a0 sp=0xc422162758 pc=0x40ed12
     database/sql.(*DB).putConnDBLocked(0xc4201beaa0, 0xc4201880e0, 0x0, 0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:1109 +0x258 fp=0xc422162888 sp=0xc4221627a0 pc=0x919798
     database/sql.(*DB).putConn(0xc4201beaa0, 0xc4201880e0, 0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:1079 +0x10d fp=0xc4221628f0 sp=0xc422162888 pc=0x91943d
     database/sql.(*driverConn).releaseConn(0xc4201880e0, 0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:380 +0x47 fp=0xc422162920 sp=0xc4221628f0 pc=0x915897
     database/sql.(*driverConn).(database/sql.releaseConn)-fm(0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:643 +0x3e fp=0xc422162948 sp=0xc422162920 pc=0x92437e
     database/sql.(*Rows).close(0xc4213c1080, 0x0, 0x0, 0x0, 0x0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:2767 +0x15c fp=0xc422162998 sp=0xc422162948 pc=0x92261c
     database/sql.(*Rows).Close(0xc4213c1080, 0xc4213c10b0, 0xc4221629f0)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:2738 +0x3d fp=0xc4221629d0 sp=0xc422162998 pc=0x92248d
     database/sql.(*Rows).Next(0xc4213c1080, 0xc42733f780)
         /home/doctorq/soft/sf-go/src/database/sql/sql.go:2445 +0xa4 fp=0xc422162a20 sp=0xc4221629d0 pc=0x920fa4
     (*MysqlAdapter).ParserRows(0xc4200115c0, 0x11ca6a0, 0xc4263eab10, 0xc4213c1080, 0x0, 0x0, 0x0, 0x0, 0x0)

報錯信息顯示,我不正確的使用了非安全的map導致的。但是錯誤信息指向了database/sql庫,我覺得不太可能是golang基礎庫的問題,所以我從自身代碼找問題。

解決過程

問題指向的代碼如下:


func (p *MysqlAdapter) ParserRows(ctx context.Context, rows *sql.Rows) ([]map[string]interface{}, error) {
    if rows == nil {
        errStr := "rows is nil"
        return nil, errors.New(errStr)
    }
    defer rows.Close()
    columns, err := rows.Columns()
    if err != nil {
        errStr := fmt.Sprintf("rows.Columns error:[%s]", err.Error())
        log.Error(ctx, errStr)
        return nil, errors.New(errStr)
    }
    count := len(columns)
    values := make([]interface{}, count)
    valuePtrs := make([]interface{}, count)
    var resultRows []map[string]interface{}
    for rows.Next() {//panic發生的點
        for i, _ := range columns {
            valuePtrs[i] = &values[i]
        }
        rows.Scan(valuePtrs...)

        resultMap := make(map[string]interface{}, count)
        for i, col := range columns {
            var v interface{}
            val := values[i]
            b, ok := val.([]byte)
            if ok {
                v = string(b)
            } else {
                v = val
            }
            resultMap[col] = v
        }
        resultRows = append(resultRows, resultMap)
    }
    log.Infof(ctx, "row size:[%d]", len(resultRows))
    return resultRows, nil
}

rows.Next()執行的時候報的錯,首先懷疑我自己的resultRows這個map使用不當,所以我把普通map更換成了sync.Map,替換後的代碼如下:

func (p *MysqlAdapter) ParserRows(ctx context.Context, rows *sql.Rows) ([]sync.Map, error) {
    if rows == nil {
        errStr := "rows is nil"
        return nil, errors.New(errStr)
    }
    defer rows.Close()
    columns, err := rows.Columns()
    if err != nil {
        errStr := fmt.Sprintf("rows.Columns error:[%s]", err.Error())
        log.Error(ctx, errStr)
        return nil, errors.New(errStr)
    }
    count := len(columns)
    values := make([][]byte, count)
    scans := make([]interface{}, count)
    for i, _ := range columns {
        scans[i] = &values[i]
    }

    var syncMaps []sync.Map
    for rows.Next() {
        if err := rows.Scan(scans...); err != nil { //query.Scan查詢出來的不定長值放到scans[i] = &values[i],也就是每行都放在values裏
            errStr := "mysql scan failed"
            log.Errorf(ctx, errStr)
            return nil, errors.New(errStr)
        }
        var syncMap sync.Map
        for k, v := range values {
            key := columns[k]
            syncMap.Store(key, string(v))
        }
        syncMaps = append(syncMaps, syncMap)
    }
    //  log.Infof(ctx, "row size:[%d]", len(syncMaps))
    return syncMaps, nil
}

然後繼續壓測,仍然偶現(有時運行1小時就掛了,有時候運行6個小時也沒事)。在經歷了3天都搞到晚上2點都失敗了,仍然沒解決這個問題後,我求助golang開發人員了。

過程中我還重構了代碼,任然沒解決。最後我放棄了,求助於golang/go和go-sql-driver開發人員

golang/go的開發人員很快回復了我。讓我用race和vet工具輔助定位問題,

繼續-戰

我用race工具確實查出了有同時讀寫map的問題。

==================
WARNING: DATA RACE
Write at 0x00c4202b8008 by goroutine 306:
  database/sql.(*DB).putConnDBLocked()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:1119 +0x214
  database/sql.(*DB).putConn()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:1079 +0x1a8
  database/sql.(*driverConn).releaseConn()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:380 +0x65
  database/sql.(*driverConn).(database/sql.releaseConn)-fm()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:643 +0x55
  database/sql.(*Rows).close()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2767 +0x287
  database/sql.(*Rows).Close()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2738 +0x4a
  database/sql.(*Rows).Next()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2445 +0x120
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlAdapter).ParserRows()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_adapter.go:87 +0x52e
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlAdapter).OrderQuery()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_adapter.go:65 +0x47c
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlProxy).GetOrderId()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_proxy.go:179 +0xe03
  gitlab.sftcwl.com/dispatch/data/handler/proxy.(*OrderProxy).GetUndistributedOrder()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/proxy/order.go:76 +0x112
  gitlab.sftcwl.com/dispatch/data/handler/service.(*GetOrder).Access()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/service/get_order.go:35 +0x32c
  gitlab.sftcwl.com/dispatch/data/handler.(*DataServiceHandler).GetOrderDetail()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/DataService_handler.go:45 +0xcb
  gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto._DataService_GetOrderDetail_Handler.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto/data.pb.go:2709 +0xa1
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:31 +0x2c7
  gitlab.sftcwl.com/dispatch/data/framework/interceptor/perfcounter.UnaryServerInterceptor.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/framework/interceptor/perfcounter/perfcounter.go:32 +0x142
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:34 +0x1ca
  gitlab.sftcwl.com/dispatch/data/framework/interceptor/header.UnaryServerInterceptor.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/framework/interceptor/header/header.go:59 +0x2ae
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:39 +0x367
  gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto._DataService_GetOrderDetail_Handler()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto/data.pb.go:2711 +0x1ec
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).processUnaryRPC()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:1026 +0xbb7
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleStream()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:1252 +0x14a6
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:699 +0xac

Previous read at 0x00c4202b8008 by goroutine 309:
  runtime.growslice()
      /home/doctorq/soft/sf-go/src/runtime/slice.go:82 +0x0
  database/sql.(*DB).putConnDBLocked()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:1119 +0x30c
  database/sql.(*DB).putConn()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:1079 +0x1a8
  database/sql.(*driverConn).releaseConn()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:380 +0x65
  database/sql.(*driverConn).(database/sql.releaseConn)-fm()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:643 +0x55
  database/sql.(*Rows).close()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2767 +0x287
  database/sql.(*Rows).Close()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2738 +0x4a
  database/sql.(*Rows).Next()
      /home/doctorq/soft/sf-go/src/database/sql/sql.go:2445 +0x120
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlAdapter).ParserRows()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_adapter.go:87 +0x52e
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlAdapter).OrderQuery()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_adapter.go:65 +0x47c
  gitlab.sftcwl.com/dispatch/data/handler/db.(*MysqlProxy).GetOrderId()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/db/mysql_proxy.go:179 +0xe03
  gitlab.sftcwl.com/dispatch/data/handler/proxy.(*OrderProxy).GetUndistributedOrder()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/proxy/order.go:76 +0x112
  gitlab.sftcwl.com/dispatch/data/handler/service.(*GetOrder).Access()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/service/get_order.go:35 +0x32c
  gitlab.sftcwl.com/dispatch/data/handler.(*DataServiceHandler).GetOrderDetail()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/handler/DataService_handler.go:45 +0xcb
  gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto._DataService_GetOrderDetail_Handler.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto/data.pb.go:2709 +0xa1
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:31 +0x2c7
  gitlab.sftcwl.com/dispatch/data/framework/interceptor/perfcounter.UnaryServerInterceptor.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/framework/interceptor/perfcounter/perfcounter.go:32 +0x142
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:34 +0x1ca
  gitlab.sftcwl.com/dispatch/data/framework/interceptor/header.UnaryServerInterceptor.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/framework/interceptor/header/header.go:59 +0x2ae
  gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/github.com/grpc-ecosystem/go-grpc-middleware/chain.go:39 +0x367
  gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto._DataService_GetOrderDetail_Handler()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/gitlab.sftcwl.com/dispatch/util/proto/data.pb.go:2711 +0x1ec
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).processUnaryRPC()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:1026 +0xbb7
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleStream()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:1252 +0x14a6
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams.func1.1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:699 +0xac

Goroutine 306 (running) created at:
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:697 +0xb8
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport.(*http2Server).operateHeaders()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport/http2_server.go:432 +0x14f9
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport.(*http2Server).HandleStreams()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport/http2_server.go:472 +0x8da
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:695 +0x183
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleRawConn.func2()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:649 +0x53
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleRawConn.func3()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:657 +0x34

Goroutine 309 (running) created at:
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams.func1()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:697 +0xb8
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport.(*http2Server).operateHeaders()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport/http2_server.go:432 +0x14f9
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport.(*http2Server).HandleStreams()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/internal/transport/http2_server.go:472 +0x8da
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).serveStreams()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:695 +0x183
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleRawConn.func2()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:649 +0x53
  gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc.(*Server).handleRawConn.func3()
      /home/doctorq/go-dev/src/gitlab.sftcwl.com/dispatch/data/vendor/google.golang.org/grpc/server.go:657 +0x34
==================
race: limit on 8192 simultaneously alive goroutines is exceeded, dying
exit status 66

但是很奇怪的是,這個問題仍然定位到database/sql庫。然後我嘗試用第二個工具vet,靜態代碼檢測工具來定位代碼問題,果然發現了問題所在。

handler/db/mysql_adapter.go:98: call of append copies lock value: sync.Map contains sync.Mutex
framework/app/mysql.go:130: literal copies lock value from *client: database/sql.DB contains sync.Mutex

這個地方是說這些對象,不能複製,應該用指針,所以我修改了我原先的方法改成指針傳遞。

func (p *MysqlAdapter) ParserRows(ctx context.Context, rows *sql.Rows) ([]*sync.Map, error) {
    if rows == nil {
        errStr := "rows is nil"
        return nil, errors.New(errStr)
    }
    defer rows.Close()
    columns, err := rows.Columns()
    if err != nil {
        errStr := fmt.Sprintf("rows.Columns error:[%s]", err.Error())
        log.Error(ctx, errStr)
        return nil, errors.New(errStr)
    }
    count := len(columns)
    values := make([][]byte, count)
    scans := make([]interface{}, count)
    for i, _ := range columns {
        scans[i] = &values[i]
    }

    var syncMaps []*sync.Map
    for rows.Next() {
        if err := rows.Scan(scans...); err != nil { //query.Scan查詢出來的不定長值放到scans[i] = &values[i],也就是每行都放在values裏
            errStr := "mysql scan failed"
            log.Errorf(ctx, errStr)
            return nil, errors.New(errStr)
        }
        var syncMap sync.Map
        for k, v := range values {
            key := columns[k]
            syncMap.Store(key, string(v))
        }
        syncMaps = append(syncMaps, &syncMap)
    }
    //  log.Infof(ctx, "row size:[%d]", len(syncMaps))
    return syncMaps, nil
}

根本原因

根本原因還是mysql client的創建過程,也用了複製:


type DpMysql struct {
    mysqlclient.Client
}


var client *mysqlclient.Client
    client, err = mysqlclient.NewClient(mysqlConfig)
    if err != nil {
        return nil, err
    }

    MysqlNS.config = mysqlConfig

    return &DpMysql{*client}, nil

因爲複製的過程,會引起sync.Mutex傳遞出錯,所以用指針來傳遞。修改後的代碼如下:

type DpMysql struct {
    *mysqlclient.Client
}
var client *mysqlclient.Client
    client, err = mysqlclient.NewClient(mysqlConfig)
    if err != nil {
        return nil, err
    }

    MysqlNS.config = mysqlConfig

    return &DpMysql{client}, nil

至此,問題解決。

總結

  • race工具能直接定位到進程之間同步鎖的問題。
  • vet工具能很準確的定位到語法問題,很有用。
  • 新人入行,踩坑了。
  • 多進程儘量使用sync.Map
  • 對象用指針向上傳遞
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章