bitxhub/pkg/peermgr/swarm.go

494 lines
12 KiB
Go
Raw Normal View History

2020-03-29 21:32:01 +08:00
package peermgr
import (
"context"
"fmt"
"sync"
2020-03-29 21:32:01 +08:00
"time"
"github.com/Rican7/retry"
"github.com/Rican7/retry/strategy"
"github.com/ethereum/go-ethereum/event"
"github.com/libp2p/go-libp2p-core/peer"
"github.com/libp2p/go-libp2p-core/protocol"
"github.com/meshplus/bitxhub-model/pb"
"github.com/meshplus/bitxhub/internal/ledger"
"github.com/meshplus/bitxhub/internal/model"
"github.com/meshplus/bitxhub/internal/model/events"
"github.com/meshplus/bitxhub/internal/repo"
"github.com/meshplus/bitxhub/pkg/cert"
2020-08-24 14:17:46 +08:00
network "github.com/meshplus/go-lightp2p"
2020-03-29 21:32:01 +08:00
"github.com/sirupsen/logrus"
ma "github.com/multiformats/go-multiaddr"
2020-03-29 21:32:01 +08:00
)
// protocolID is the protocol identifier handed to the p2p network layer when
// the swarm is created.
const protocolID protocol.ID = "/B1txHu6/1.0.0" // magic protocol
type Swarm struct {
2020-12-07 21:48:47 +08:00
repo *repo.Repo
localID uint64
p2p network.Network
logger logrus.FieldLogger
2020-12-14 22:16:17 +08:00
routers map[uint64]*pb.VpInfo // trace the vp nodes
2020-12-08 16:42:56 +08:00
multiAddrs map[uint64]*peer.AddrInfo
connectedPeers sync.Map
notifiee *notifiee
2020-12-07 21:48:47 +08:00
2020-11-05 11:04:53 +08:00
ledger ledger.Ledger
2020-03-29 21:32:01 +08:00
orderMessageFeed event.Feed
2020-11-05 11:04:53 +08:00
enablePing bool
pingTimeout time.Duration
2020-03-29 21:32:01 +08:00
ctx context.Context
cancel context.CancelFunc
}
func New(repoConfig *repo.Repo, logger logrus.FieldLogger, ledger ledger.Ledger) (*Swarm, error) {
2020-08-24 14:17:46 +08:00
var protocolIDs = []string{string(protocolID)}
2020-12-07 21:48:47 +08:00
// init peers with ips and hosts
2020-12-14 22:16:17 +08:00
routers := repoConfig.NetworkConfig.GetVpInfos()
bootstrap := make([]string, 0)
for _, p := range routers {
if p.Id == repoConfig.NetworkConfig.ID {
continue
2020-12-07 21:48:47 +08:00
}
2020-12-14 22:16:17 +08:00
addr := fmt.Sprintf("%s%s", p.Hosts[0], p.Pid)
bootstrap = append(bootstrap, addr)
2020-12-07 21:48:47 +08:00
}
multiAddrs := make(map[uint64]*peer.AddrInfo)
p2pPeers, _ := repoConfig.NetworkConfig.GetNetworkPeers()
2020-12-14 22:16:17 +08:00
for id, node := range p2pPeers {
if id == repoConfig.NetworkConfig.ID {
2020-12-07 21:48:47 +08:00
continue
}
2020-12-14 22:16:17 +08:00
multiAddrs[id] = node
2020-12-07 21:48:47 +08:00
}
2020-12-14 22:16:17 +08:00
notifiee := newNotifiee(routers, logger)
2020-04-21 22:56:19 +08:00
p2p, err := network.New(
network.WithLocalAddr(repoConfig.NetworkConfig.LocalAddr),
network.WithPrivateKey(repoConfig.Key.Libp2pPrivKey),
2020-08-24 14:17:46 +08:00
network.WithProtocolIDs(protocolIDs),
2020-04-21 22:56:19 +08:00
network.WithLogger(logger),
2020-12-07 21:48:47 +08:00
// enable discovery
2020-12-14 22:16:17 +08:00
network.WithBootstrap(bootstrap),
2020-12-07 21:48:47 +08:00
network.WithNotify(notifiee),
2020-03-29 21:32:01 +08:00
)
if err != nil {
return nil, fmt.Errorf("create p2p: %w", err)
}
ctx, cancel := context.WithCancel(context.Background())
return &Swarm{
repo: repoConfig,
2020-12-07 21:48:47 +08:00
localID: repoConfig.NetworkConfig.ID,
2020-03-29 21:32:01 +08:00
p2p: p2p,
logger: logger,
ledger: ledger,
2020-12-14 22:16:17 +08:00
enablePing: repoConfig.Config.Ping.Enable,
pingTimeout: repoConfig.Config.Ping.Duration,
routers: routers,
2020-12-08 16:42:56 +08:00
multiAddrs: multiAddrs,
connectedPeers: sync.Map{},
2020-12-07 21:48:47 +08:00
notifiee: notifiee,
2020-03-29 21:32:01 +08:00
ctx: ctx,
cancel: cancel,
}, nil
}
func (swarm *Swarm) Start() error {
2020-04-21 22:56:19 +08:00
swarm.p2p.SetMessageHandler(swarm.handleMessage)
2020-03-29 21:32:01 +08:00
if err := swarm.p2p.Start(); err != nil {
return err
}
2020-12-07 21:48:47 +08:00
for id, addr := range swarm.multiAddrs {
2020-03-29 21:32:01 +08:00
go func(id uint64, addr *peer.AddrInfo) {
if err := retry.Retry(func(attempt uint) error {
2020-12-14 22:16:17 +08:00
// for restart node, after updating the routing table, some nodes may not exist in routing table
routers := swarm.notifiee.getPeers()
if _, ok := routers[id]; !ok {
swarm.logger.Infof("Can't find node %d from routing table, stopping connect", id)
return nil
}
2020-08-24 14:17:46 +08:00
if err := swarm.p2p.Connect(*addr); err != nil {
2020-03-29 21:32:01 +08:00
swarm.logger.WithFields(logrus.Fields{
"node": id,
"error": err,
}).Error("Connect failed")
return err
}
if err := swarm.verifyCertOrDisconnect(id); err != nil {
2020-03-29 21:32:01 +08:00
if attempt != 0 && attempt%5 == 0 {
swarm.logger.WithFields(logrus.Fields{
"node": id,
"error": err,
}).Error("Verify cert")
}
return err
}
swarm.logger.WithFields(logrus.Fields{
"node": id,
}).Info("Connect successfully")
swarm.connectedPeers.Store(id, addr)
2020-03-29 21:32:01 +08:00
return nil
},
strategy.Wait(1*time.Second),
); err != nil {
swarm.logger.Error(err)
}
}(id, addr)
}
2020-11-05 11:04:53 +08:00
if swarm.enablePing {
go swarm.Ping()
}
2020-03-29 21:32:01 +08:00
return nil
}
// Stop cancels the swarm context; goroutines that select on it, such as the
// Ping loop, return afterwards. It does not stop the underlying p2p network
// and always returns nil.
func (swarm *Swarm) Stop() error {
swarm.cancel()
return nil
}
func (swarm *Swarm) verifyCertOrDisconnect(id uint64) error {
if err := swarm.verifyCert(id); err != nil {
2020-12-14 22:16:17 +08:00
if err = swarm.p2p.Disconnect(swarm.routers[id].Pid); err != nil {
return err
}
}
return nil
}
2020-11-05 11:04:53 +08:00
func (swarm *Swarm) Ping() {
ticker := time.NewTicker(swarm.pingTimeout)
for {
select {
case <-ticker.C:
fields := logrus.Fields{}
swarm.connectedPeers.Range(func(key, value interface{}) bool {
info := value.(*peer.AddrInfo)
pingCh, err := swarm.p2p.Ping(info.ID.String())
if err != nil {
return true
}
select {
case res := <-pingCh:
fields[fmt.Sprintf("%d", key.(uint64))] = res.RTT
case <-time.After(time.Second * 5):
swarm.logger.Errorf("ping to node %d timeout", key.(uint64))
}
return true
})
2020-12-07 21:48:47 +08:00
swarm.logger.WithFields(fields).Info("ping time")
2020-12-08 16:42:56 +08:00
case <-swarm.ctx.Done():
return
2020-11-05 11:04:53 +08:00
}
}
}
func (swarm *Swarm) AsyncSend(id uint64, msg *pb.Message) error {
2020-12-07 21:48:47 +08:00
var (
addr string
err error
)
if addr, err = swarm.findPeer(id); err != nil {
2020-03-29 21:32:01 +08:00
return fmt.Errorf("p2p send: %w", err)
}
data, err := msg.Marshal()
if err != nil {
return err
}
2020-08-24 14:17:46 +08:00
return swarm.p2p.AsyncSend(addr, data)
2020-03-29 21:32:01 +08:00
}
2020-08-24 14:17:46 +08:00
func (swarm *Swarm) SendWithStream(s network.Stream, msg *pb.Message) error {
2020-03-29 21:32:01 +08:00
data, err := msg.Marshal()
if err != nil {
return err
}
2020-08-24 14:17:46 +08:00
return s.AsyncSend(data)
2020-03-29 21:32:01 +08:00
}
func (swarm *Swarm) Send(id uint64, msg *pb.Message) (*pb.Message, error) {
2020-12-07 21:48:47 +08:00
var (
addr string
err error
)
if addr, err = swarm.findPeer(id); err != nil {
2020-03-29 21:32:01 +08:00
return nil, fmt.Errorf("check id: %w", err)
}
data, err := msg.Marshal()
if err != nil {
return nil, err
}
2020-08-24 14:17:46 +08:00
ret, err := swarm.p2p.Send(addr, data)
2020-03-29 21:32:01 +08:00
if err != nil {
return nil, fmt.Errorf("sync send: %w", err)
}
m := &pb.Message{}
2020-08-24 14:17:46 +08:00
if err := m.Unmarshal(ret); err != nil {
2020-03-29 21:32:01 +08:00
return nil, err
}
return m, nil
}
func (swarm *Swarm) Broadcast(msg *pb.Message) error {
2020-12-07 21:48:47 +08:00
addrs := make([]string, 0, len(swarm.routers))
2020-12-14 22:16:17 +08:00
for _, router := range swarm.routers {
if router.Id == swarm.localID {
2020-12-07 21:48:47 +08:00
continue
}
2020-12-14 22:16:17 +08:00
addrs = append(addrs, router.Pid)
2020-12-07 21:48:47 +08:00
}
// if we are in adding node but hasn't finished updateN, new node hash will be temporarily recorded
// in swarm.notifiee.newPeer.
if swarm.notifiee.newPeer != "" {
swarm.logger.Debugf("Broadcast to new peer %s", swarm.notifiee.newPeer)
addrs = append(addrs, swarm.notifiee.newPeer)
2020-03-29 21:32:01 +08:00
}
data, err := msg.Marshal()
if err != nil {
return err
}
2020-08-24 14:17:46 +08:00
return swarm.p2p.Broadcast(addrs, data)
2020-03-29 21:32:01 +08:00
}
2020-12-14 22:16:17 +08:00
func (swarm *Swarm) Peers() map[uint64]*pb.VpInfo {
2020-12-08 16:42:56 +08:00
return swarm.notifiee.getPeers()
2020-03-29 21:32:01 +08:00
}
func (swarm *Swarm) OtherPeers() map[uint64]*peer.AddrInfo {
2020-12-07 21:48:47 +08:00
addrInfos := make(map[uint64]*peer.AddrInfo)
2020-12-14 22:16:17 +08:00
for _, node := range swarm.notifiee.getPeers() {
if node.Id == swarm.localID {
2020-12-07 21:48:47 +08:00
continue
}
addrInfo := &peer.AddrInfo{
2020-12-14 22:16:17 +08:00
ID: peer.ID(node.Pid),
2020-12-07 21:48:47 +08:00
}
2020-12-14 22:16:17 +08:00
addrInfos[node.Id] = addrInfo
}
2020-12-07 21:48:47 +08:00
return addrInfos
2020-03-29 21:32:01 +08:00
}
// SubscribeOrderMessage subscribes ch to the swarm's order message feed and
// returns the resulting subscription.
func (swarm *Swarm) SubscribeOrderMessage(ch chan<- events.OrderMessageEvent) event.Subscription {
	sub := swarm.orderMessageFeed.Subscribe(ch)
	return sub
}
func (swarm *Swarm) verifyCert(id uint64) error {
2020-12-07 21:48:47 +08:00
if _, err := swarm.findPeer(id); err != nil {
return fmt.Errorf("check id: %w", err)
}
2020-03-29 21:32:01 +08:00
msg := &pb.Message{
Type: pb.Message_FETCH_CERT,
}
ret, err := swarm.Send(id, msg)
2020-03-29 21:32:01 +08:00
if err != nil {
return fmt.Errorf("sync send: %w", err)
}
certs := &model.CertsMessage{}
if err := certs.Unmarshal(ret.Data); err != nil {
return fmt.Errorf("unmarshal certs: %w", err)
}
nodeCert, err := cert.ParseCert(certs.NodeCert)
if err != nil {
return fmt.Errorf("parse node cert: %w", err)
}
agencyCert, err := cert.ParseCert(certs.AgencyCert)
if err != nil {
return fmt.Errorf("parse agency cert: %w", err)
}
if err := verifyCerts(nodeCert, agencyCert, swarm.repo.Certs.CACert); err != nil {
2020-12-21 15:08:04 +08:00
err = swarm.p2p.Disconnect(swarm.routers[id].Pid)
if err != nil {
return fmt.Errorf("disconnect peer: %w", err)
}
2020-03-29 21:32:01 +08:00
return fmt.Errorf("verify certs: %w", err)
}
return nil
}
2020-12-07 21:48:47 +08:00
func (swarm *Swarm) findPeer(id uint64) (string, error) {
if swarm.routers[id] != nil {
2020-12-14 22:16:17 +08:00
return swarm.routers[id].Pid, nil
2020-03-29 21:32:01 +08:00
}
2020-12-07 21:48:47 +08:00
newPeerAddr := swarm.notifiee.newPeer
2020-12-08 16:42:56 +08:00
// new node id should be len(swarm.peers)+1
if uint64(len(swarm.routers)+1) == id && swarm.notifiee.newPeer != "" {
2020-12-14 22:16:17 +08:00
swarm.logger.Debugf("Unicast to new peer %s", swarm.notifiee.newPeer)
2020-12-07 21:48:47 +08:00
return newPeerAddr, nil
}
return "", fmt.Errorf("wrong id: %d", id)
}
2020-03-29 21:32:01 +08:00
2020-12-14 22:16:17 +08:00
func (swarm *Swarm) AddNode(newNodeID uint64, vpInfo *pb.VpInfo) {
2020-12-07 21:48:47 +08:00
if _, ok := swarm.routers[newNodeID]; ok {
2020-12-14 22:16:17 +08:00
swarm.logger.Warningf("VP[ID: %d, Pid: %s] has already exist in routing table", newNodeID, vpInfo.Pid)
2020-12-07 21:48:47 +08:00
return
}
2020-12-14 22:16:17 +08:00
swarm.logger.Infof("Add vp[ID: %d, Pid: %s] into routing table", newNodeID, vpInfo.Pid)
// 1. update routers and connectedPeers
2020-12-07 21:48:47 +08:00
swarm.routers[newNodeID] = vpInfo
2020-12-14 22:16:17 +08:00
addInfo, err := constructMultiaddr(vpInfo)
if err != nil {
swarm.logger.Error("Construct AddrInfo failed")
return
}
swarm.connectedPeers.Store(newNodeID, addInfo)
// 2. persist routers
if err := repo.RewriteNetworkConfig(swarm.repo.Config.RepoRoot, swarm.routers, false); err != nil {
swarm.logger.Errorf("Persist routing table failed, err: %s", err.Error())
return
}
// 3. update notifiee info
swarm.notifiee.setPeers(swarm.routers)
2020-12-08 16:42:56 +08:00
for id, p := range swarm.routers {
swarm.logger.Debugf("=====ID: %d, Addr: %v=====", id, p)
2020-12-07 21:48:47 +08:00
}
2020-12-14 22:16:17 +08:00
if swarm.notifiee.newPeer == vpInfo.Pid {
2020-12-07 21:48:47 +08:00
swarm.logger.Info("Clear notifiee newPeer info")
swarm.notifiee.newPeer = ""
2020-12-14 22:16:17 +08:00
} else if swarm.notifiee.newPeer != "" {
2020-12-07 21:48:47 +08:00
swarm.logger.Warningf("Received vpInfo %v, but it doesn't equal to notifiee newPeer %s", vpInfo, swarm.notifiee.newPeer)
}
}
func (swarm *Swarm) DelNode(delID uint64) {
2020-12-08 16:42:56 +08:00
var (
2020-12-14 22:16:17 +08:00
delNode *pb.VpInfo
2020-12-08 16:42:56 +08:00
ok bool
)
if delNode, ok = swarm.routers[delID]; !ok {
swarm.logger.Warningf("Can't find vp node %d from routing table ", delID)
return
}
swarm.logger.Infof("Delete node [ID: %d, peerInfo: %v] ", delID, delNode)
2020-12-14 22:16:17 +08:00
// 1. update routing table, multiAddrs and connectedPeers
2020-12-08 16:42:56 +08:00
delete(swarm.routers, delID)
delete(swarm.multiAddrs, delID)
swarm.connectedPeers.Delete(delID)
2020-12-14 22:16:17 +08:00
// 2. persist routers
if err := repo.RewriteNetworkConfig(swarm.repo.Config.RepoRoot, swarm.routers, false); err != nil {
swarm.logger.Errorf("Persist routing table failed, err: %s", err.Error())
return
}
2020-12-08 16:42:56 +08:00
for id, p := range swarm.routers {
swarm.logger.Debugf("=====ID: %d, Addr: %v=====", id, p)
}
2020-12-14 22:16:17 +08:00
// 3. update notifiee info
2020-12-08 16:42:56 +08:00
swarm.notifiee.setPeers(swarm.routers)
2020-12-14 22:16:17 +08:00
// 4. deleted node itself will exit the cluster
if delID == swarm.localID {
swarm.reset()
_ = swarm.p2p.Stop()
_ = swarm.Stop()
return
}
2020-12-07 21:48:47 +08:00
}
2020-12-14 22:16:17 +08:00
func (swarm *Swarm) UpdateRouter(vpInfos map[uint64]*pb.VpInfo, isNew bool) bool {
2020-12-07 21:48:47 +08:00
swarm.logger.Infof("Update router: %+v", vpInfos)
2020-12-14 22:16:17 +08:00
// 1. update routing table, multiAddrs and connectedPeers
oldRouters := swarm.routers
2020-12-07 21:48:47 +08:00
swarm.routers = vpInfos
2020-12-14 22:16:17 +08:00
for id, _ := range oldRouters {
if _, ok := vpInfos[id]; !ok {
delete(swarm.multiAddrs, id)
swarm.connectedPeers.Delete(id)
}
}
// 2. persist routers
if err := repo.RewriteNetworkConfig(swarm.repo.Config.RepoRoot, swarm.routers, isNew); err != nil {
swarm.logger.Errorf("Persist routing table failed, err: %s", err.Error())
return false
}
// 3. update notifiee info
swarm.notifiee.setPeers(vpInfos)
// 4. check if a restart node is exist in the routing table, if not, then exit the cluster
2020-12-08 16:42:56 +08:00
var isExist bool
for id, _ := range vpInfos {
if id == swarm.localID {
isExist = true
break
}
}
2020-12-14 22:16:17 +08:00
// deleted node itself will exit the cluster
2020-12-08 16:42:56 +08:00
if !isExist && !isNew {
swarm.reset()
_ = swarm.p2p.Stop()
_ = swarm.Stop()
return true
}
return false
}
2020-12-14 22:16:17 +08:00
// Disconnect closes the p2p connection to every vp in vpInfos. A failure is
// logged and the remaining peers are still processed; the success message is
// logged only for peers that were actually disconnected. Nil entries are
// skipped.
func (swarm *Swarm) Disconnect(vpInfos map[uint64]*pb.VpInfo) {
	for id, info := range vpInfos {
		if info == nil {
			swarm.logger.Warningf("Skip disconnecting vp %d: vp info is nil", id)
			continue
		}
		if err := swarm.p2p.Disconnect(info.Pid); err != nil {
			// The format has two verbs: the peer id and the error.
			swarm.logger.Errorf("Disconnect peer %s failed, err: %s", info.Pid, err.Error())
			continue
		}
		swarm.logger.Infof("Disconnect peer [ID: %d, Pid: %s]", id, info.Pid)
	}
}
2020-12-08 16:42:56 +08:00
// reset drops all routing state: routers and multiAddrs become nil,
// connectedPeers is emptied and the notifiee's peer set is cleared.
func (swarm *Swarm) reset() {
	swarm.routers = nil
	swarm.multiAddrs = nil
	// Empty the map in place instead of overwriting it with a new sync.Map:
	// the Ping goroutine may be ranging over it concurrently, and replacing
	// the value bypasses the map's internal synchronisation.
	swarm.connectedPeers.Range(func(key, _ interface{}) bool {
		swarm.connectedPeers.Delete(key)
		return true
	})
	swarm.notifiee.setPeers(nil)
}
2020-12-14 22:16:17 +08:00
func constructMultiaddr(vpInfo *pb.VpInfo) (*peer.AddrInfo, error) {
addrs := make([]ma.Multiaddr,0)
for _, host := range vpInfo.Hosts {
addr, err := ma.NewMultiaddr(fmt.Sprintf("%s%s", host, vpInfo.Pid))
if err != nil {
return nil, fmt.Errorf("new Multiaddr error:%w", err)
2020-12-08 16:42:56 +08:00
}
2020-12-14 22:16:17 +08:00
addrs = append(addrs, addr)
}
addrInfo := &peer.AddrInfo{
ID: peer.ID(vpInfo.Pid),
Addrs: addrs,
2020-12-08 16:42:56 +08:00
}
2020-12-14 22:16:17 +08:00
return addrInfo, nil
2020-03-29 21:32:01 +08:00
}
2020-12-21 15:08:04 +08:00