wireguard-go/device/device.go

576 lines
12 KiB
Go
Raw Normal View History

2019-01-02 01:55:51 +01:00
/* SPDX-License-Identifier: MIT
*
* Copyright (C) 2017-2020 WireGuard LLC. All Rights Reserved.
*/
2019-03-03 04:04:41 +01:00
package device
import (
"runtime"
"sync"
"sync/atomic"
"time"
2019-05-14 09:09:52 +02:00
"golang.org/x/net/ipv4"
"golang.org/x/net/ipv6"
"golang.zx2c4.com/wireguard/conn"
2019-05-14 09:09:52 +02:00
"golang.zx2c4.com/wireguard/ratelimiter"
"golang.zx2c4.com/wireguard/rwcancel"
2019-05-14 09:09:52 +02:00
"golang.zx2c4.com/wireguard/tun"
)
type Device struct {
isUp AtomicBool // device is (going) up
isClosed AtomicBool // device is closed? (acting as guard)
log *Logger
ipcSetMu sync.Mutex // serializes UAPI set operations
// synchronized resources (locks acquired in order)
state struct {
2018-05-05 06:00:38 +02:00
stopping sync.WaitGroup
sync.Mutex
changing AtomicBool
current bool
}
net struct {
stopping sync.WaitGroup
sync.RWMutex
bind conn.Bind // bind interface
netlinkCancel *rwcancel.RWCancel
port uint16 // listening port
fwmark uint32 // mark value (0 = disabled)
}
2018-05-13 23:14:43 +02:00
staticIdentity struct {
sync.RWMutex
privateKey NoisePrivateKey
publicKey NoisePublicKey
}
peers struct {
empty AtomicBool // empty reports whether len(keyMap) == 0
sync.RWMutex // protects keyMap
keyMap map[NoisePublicKey]*Peer
}
// unprotected / "self-synchronising resources"
2018-05-13 23:14:43 +02:00
allowedips AllowedIPs
indexTable IndexTable
cookieChecker CookieChecker
rate struct {
underLoadUntil atomic.Value
limiter ratelimiter.Ratelimiter
}
pool struct {
2018-09-22 06:29:02 +02:00
messageBufferPool *sync.Pool
messageBufferReuseChan chan *[MaxMessageSize]byte
inboundElementPool *sync.Pool
inboundElementReuseChan chan *QueueInboundElement
outboundElementPool *sync.Pool
outboundElementReuseChan chan *QueueOutboundElement
}
queue struct {
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-15 00:07:23 +01:00
encryption *encryptionQueue
decryption *decryptionQueue
2017-07-01 23:29:22 +02:00
handshake chan QueueHandshakeElement
}
signals struct {
stop chan struct{}
}
tun struct {
device tun.Device
mtu int32
}
2017-06-01 21:31:30 +02:00
}
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-15 00:07:23 +01:00
// encryptionQueue carries QueueOutboundElements that are waiting to be
// encrypted.
//
// Its lifetime is governed by a reference count kept in wg:
//   - a queue returned by newEncryptionQueue already holds one reference;
//   - every additional writer takes a reference with wg.Add(1) and gives it
//     back with wg.Done() once it has finished writing;
//   - when no further writers will be added, the initial reference is
//     released with wg.Done().
//
// As soon as the count drops to zero the channel c is closed.
type encryptionQueue struct {
	c  chan *QueueOutboundElement
	wg sync.WaitGroup
}

// newEncryptionQueue allocates an encryptionQueue buffered to
// QueueOutboundSize elements, holding its initial reference, and starts the
// goroutine that closes the channel once all references are gone.
func newEncryptionQueue() *encryptionQueue {
	queue := new(encryptionQueue)
	queue.c = make(chan *QueueOutboundElement, QueueOutboundSize)

	// Initial reference; released by whoever owns the queue.
	queue.wg.Add(1)

	go func() {
		defer close(queue.c)
		queue.wg.Wait()
	}()

	return queue
}
// decryptionQueue carries QueueInboundElements that are waiting to be
// decrypted. It is reference-counted through wg in the same way as an
// encryptionQueue: the constructor holds one reference, each extra writer
// brackets its work with wg.Add(1) / wg.Done(), and the channel c is closed
// when the count reaches zero.
type decryptionQueue struct {
	c  chan *QueueInboundElement
	wg sync.WaitGroup
}

// newDecryptionQueue allocates a decryptionQueue buffered to QueueInboundSize
// elements, holding its initial reference, and starts the goroutine that
// closes the channel once all references are gone.
func newDecryptionQueue() *decryptionQueue {
	queue := new(decryptionQueue)
	queue.c = make(chan *QueueInboundElement, QueueInboundSize)

	// Initial reference; released by whoever owns the queue.
	queue.wg.Add(1)

	go func() {
		defer close(queue.c)
		queue.wg.Wait()
	}()

	return queue
}
/* Converts the peer into a "zombie", which remains in the peer map,
* but processes no packets and does not exists in the routing table.
*
* Must hold device.peers.Mutex
*/
func unsafeRemovePeer(device *Device, peer *Peer, key NoisePublicKey) {
2017-12-29 17:42:09 +01:00
// stop routing and processing of packets
2018-05-13 23:14:43 +02:00
device.allowedips.RemoveByPeer(peer)
peer.Stop()
// remove from peer map
delete(device.peers.keyMap, key)
device.peers.empty.Set(len(device.peers.keyMap) == 0)
}
func deviceUpdateState(device *Device) {
// check if state already being updated (guard)
if device.state.changing.Swap(true) {
return
}
2018-02-04 16:46:24 +01:00
// compare to current state of device
device.state.Lock()
2018-02-04 16:46:24 +01:00
newIsUp := device.isUp.Get()
2018-02-04 16:46:24 +01:00
if newIsUp == device.state.current {
device.state.changing.Set(false)
device.state.Unlock()
2018-02-04 16:46:24 +01:00
return
}
2018-02-04 16:46:24 +01:00
// change state of device
2018-02-04 16:46:24 +01:00
switch newIsUp {
case true:
if err := device.BindUpdate(); err != nil {
device.log.Errorf("Unable to update bind: %v", err)
2018-02-04 16:46:24 +01:00
device.isUp.Set(false)
break
}
device.peers.RLock()
2018-02-04 16:46:24 +01:00
for _, peer := range device.peers.keyMap {
peer.Start()
if atomic.LoadUint32(&peer.persistentKeepaliveInterval) > 0 {
peer.SendKeepalive()
}
2018-02-04 16:46:24 +01:00
}
device.peers.RUnlock()
2018-02-04 16:46:24 +01:00
case false:
device.BindClose()
device.peers.RLock()
2018-02-04 16:46:24 +01:00
for _, peer := range device.peers.keyMap {
peer.Stop()
}
device.peers.RUnlock()
2018-02-04 16:46:24 +01:00
}
2018-02-04 16:46:24 +01:00
// update state variables
2018-02-04 16:46:24 +01:00
device.state.current = newIsUp
device.state.changing.Set(false)
device.state.Unlock()
// check for state change in the mean time
deviceUpdateState(device)
2017-12-29 17:42:09 +01:00
}
func (device *Device) Up() {
// closed device cannot be brought up
2017-12-29 17:42:09 +01:00
if device.isClosed.Get() {
return
}
device.isUp.Set(true)
deviceUpdateState(device)
}
func (device *Device) Down() {
device.isUp.Set(false)
deviceUpdateState(device)
2017-12-29 17:42:09 +01:00
}
func (device *Device) IsUnderLoad() bool {
// check if currently under load
now := time.Now()
underLoad := len(device.queue.handshake) >= UnderLoadQueueSize
if underLoad {
2018-05-13 18:42:06 +02:00
device.rate.underLoadUntil.Store(now.Add(UnderLoadAfterTime))
return true
}
// check if recently under load
until := device.rate.underLoadUntil.Load().(time.Time)
return until.After(now)
}
2017-08-04 16:15:53 +02:00
func (device *Device) SetPrivateKey(sk NoisePrivateKey) error {
// lock required resources
device.staticIdentity.Lock()
defer device.staticIdentity.Unlock()
if sk.Equals(device.staticIdentity.privateKey) {
return nil
}
device.peers.Lock()
defer device.peers.Unlock()
2019-08-05 17:46:34 +02:00
lockedPeers := make([]*Peer, 0, len(device.peers.keyMap))
for _, peer := range device.peers.keyMap {
peer.handshake.mutex.RLock()
2019-08-05 17:46:34 +02:00
lockedPeers = append(lockedPeers, peer)
}
2017-06-24 15:34:17 +02:00
// remove peers with matching public keys
2017-08-04 16:15:53 +02:00
publicKey := sk.publicKey()
for key, peer := range device.peers.keyMap {
if peer.handshake.remoteStatic.Equals(publicKey) {
unsafeRemovePeer(device, peer, key)
2017-08-04 16:15:53 +02:00
}
}
2017-06-24 15:34:17 +02:00
// update key material
2018-05-13 23:14:43 +02:00
device.staticIdentity.privateKey = sk
device.staticIdentity.publicKey = publicKey
device.cookieChecker.Init(publicKey)
2017-06-24 15:34:17 +02:00
// do static-static DH pre-computations
2019-08-05 17:46:34 +02:00
expiredPeers := make([]*Peer, 0, len(device.peers.keyMap))
2020-02-04 18:08:51 +01:00
for _, peer := range device.peers.keyMap {
2018-05-14 12:27:29 +02:00
handshake := &peer.handshake
2020-02-04 18:08:51 +01:00
handshake.precomputedStaticStatic = device.staticIdentity.privateKey.sharedSecret(handshake.remoteStatic)
expiredPeers = append(expiredPeers, peer)
2017-06-23 13:41:59 +02:00
}
2017-08-04 16:15:53 +02:00
2019-08-05 17:46:34 +02:00
for _, peer := range lockedPeers {
peer.handshake.mutex.RUnlock()
}
for _, peer := range expiredPeers {
peer.ExpireCurrentKeypairs()
}
2017-08-04 16:15:53 +02:00
return nil
2017-06-23 13:41:59 +02:00
}
func NewDevice(tunDevice tun.Device, logger *Logger) *Device {
device := new(Device)
2017-11-14 18:26:28 +01:00
device.log = logger
2018-05-23 02:10:54 +02:00
device.tun.device = tunDevice
2018-04-18 16:39:14 +02:00
mtu, err := device.tun.device.MTU()
if err != nil {
device.log.Errorf("Trouble determining MTU, assuming default: %v", err)
2018-04-19 15:52:59 +02:00
mtu = DefaultMTU
2018-04-18 16:39:14 +02:00
}
device.tun.mtu = int32(mtu)
device.peers.keyMap = make(map[NoisePublicKey]*Peer)
device.rate.limiter.Init()
device.rate.underLoadUntil.Store(time.Time{})
2018-05-13 18:23:40 +02:00
device.indexTable.Init()
2018-09-22 06:29:02 +02:00
device.PopulatePools()
// create queues
2017-07-01 23:29:22 +02:00
device.queue.handshake = make(chan QueueHandshakeElement, QueueHandshakeSize)
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-15 00:07:23 +01:00
device.queue.encryption = newEncryptionQueue()
device.queue.decryption = newDecryptionQueue()
2017-07-01 23:29:22 +02:00
// prepare signals
2018-05-14 04:19:25 +02:00
device.signals.stop = make(chan struct{})
2017-11-11 15:43:55 +01:00
// prepare net
device.net.port = 0
device.net.bind = nil
// start workers
2018-05-05 06:00:38 +02:00
cpus := runtime.NumCPU()
device.state.stopping.Wait()
for i := 0; i < cpus; i++ {
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-15 00:07:23 +01:00
device.state.stopping.Add(2) // decryption and handshake
go device.RoutineEncryption()
2017-07-01 23:29:22 +02:00
go device.RoutineDecryption()
go device.RoutineHandshake()
}
2017-12-01 23:37:26 +01:00
device.state.stopping.Add(2)
go device.RoutineReadFromTUN()
go device.RoutineTUNEventReader()
2017-12-01 23:37:26 +01:00
return device
2017-06-24 15:34:17 +02:00
}
func (device *Device) LookupPeer(pk NoisePublicKey) *Peer {
device.peers.RLock()
defer device.peers.RUnlock()
return device.peers.keyMap[pk]
2017-06-24 15:34:17 +02:00
}
func (device *Device) RemovePeer(key NoisePublicKey) {
device.peers.Lock()
defer device.peers.Unlock()
// stop peer and remove from routing
peer, ok := device.peers.keyMap[key]
if ok {
unsafeRemovePeer(device, peer, key)
}
2017-06-01 21:31:30 +02:00
}
2017-06-24 15:34:17 +02:00
func (device *Device) RemoveAllPeers() {
device.peers.Lock()
defer device.peers.Unlock()
for key, peer := range device.peers.keyMap {
unsafeRemovePeer(device, peer, key)
2017-06-01 21:31:30 +02:00
}
device.peers.keyMap = make(map[NoisePublicKey]*Peer)
}
2018-05-05 06:00:38 +02:00
func (device *Device) FlushPacketQueues() {
for {
select {
case elem := <-device.queue.handshake:
device.PutMessageBuffer(elem.buffer)
2018-05-05 06:00:38 +02:00
default:
return
}
}
}
func (device *Device) Close() {
2017-12-29 17:42:09 +01:00
if device.isClosed.Swap(true) {
return
}
device.log.Verbosef("Device closing")
device.state.changing.Set(true)
device.state.Lock()
defer device.state.Unlock()
2017-11-11 23:26:44 +01:00
device.tun.device.Close()
device.BindClose()
device.isUp.Set(false)
// We kept a reference to the encryption and decryption queues,
// in case we started any new peers that might write to them.
// No new peers are coming; we are done with these queues.
device: use channel close to shut down and drain encryption channel The new test introduced in this commit used to deadlock about 1% of the time. I believe that the deadlock occurs as follows: * The test completes, calling device.Close. * device.Close closes device.signals.stop. * RoutineEncryption stops. * The deferred function in RoutineEncryption drains device.queue.encryption. * RoutineEncryption exits. * A peer's RoutineNonce processes an element queued in peer.queue.nonce. * RoutineNonce puts that element into the outbound and encryption queues. * RoutineSequentialSender reads that elements from the outbound queue. * It waits for that element to get Unlocked by RoutineEncryption. * RoutineEncryption has already exited, so RoutineSequentialSender blocks forever. * device.RemoveAllPeers calls peer.Stop on all peers. * peer.Stop waits for peer.routines.stopping, which blocks forever. Rather than attempt to add even more ordering to the already complex centralized shutdown orchestration, this commit moves towards a data-flow-oriented shutdown. The device.queue.encryption gets closed when there will be no more writes to it. All device.queue.encryption readers always read until the channel is closed and then exit. We thus guarantee that any element that enters the encryption queue also exits it. This removes the need for central control of the lifetime of RoutineEncryption, removes the need to drain the encryption queue on shutdown, and simplifies RoutineEncryption. This commit also fixes a data race. When RoutineSequentialSender drains its queue on shutdown, it needs to lock the elem before operating on it, just as the main body does. The new test in this commit passed 50k iterations with the race detector enabled and 150k iterations with the race detector disabled, with no failures. Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
2020-12-15 00:07:23 +01:00
device.queue.encryption.wg.Done()
device.queue.decryption.wg.Done()
close(device.signals.stop)
device.state.stopping.Wait()
2018-09-24 01:52:02 +02:00
device.RemoveAllPeers()
2018-05-05 06:00:38 +02:00
device.FlushPacketQueues()
2018-02-11 22:53:39 +01:00
device.rate.limiter.Close()
device.state.changing.Set(false)
device.log.Verbosef("Interface closed")
}
2017-12-01 23:37:26 +01:00
// Wait returns the device's stop channel, which Close closes. Receiving from
// it therefore blocks until the device has been closed.
func (device *Device) Wait() chan struct{} {
	stopped := device.signals.stop
	return stopped
}
// SendKeepalivesToPeersWithCurrentKeypair sends a keepalive to every peer
// that has a current keypair whose creation time plus RejectAfterTime is not
// yet in the past. It does nothing on a closed device.
func (device *Device) SendKeepalivesToPeersWithCurrentKeypair() {
	if device.isClosed.Get() {
		return
	}

	device.peers.RLock()
	for _, peer := range device.peers.keyMap {
		// Decide under the keypairs read lock, send after releasing it.
		usable := false
		peer.keypairs.RLock()
		if current := peer.keypairs.current; current != nil {
			deadline := current.created.Add(RejectAfterTime)
			usable = !deadline.Before(time.Now())
		}
		peer.keypairs.RUnlock()

		if !usable {
			continue
		}
		peer.SendKeepalive()
	}
	device.peers.RUnlock()
}
// unsafeCloseBind tears down the device's current bind: it cancels the
// netlink listener if one is set, closes and clears the bind if one is open,
// and then waits for device.net.stopping. The error from closing the bind
// (nil if there was no bind) is returned.
//
// The function does not lock device.net itself; the callers in this file
// hold that lock.
func unsafeCloseBind(device *Device) error {
	state := &device.net

	if cancel := state.netlinkCancel; cancel != nil {
		cancel.Cancel()
	}

	var closeErr error
	if state.bind != nil {
		closeErr = state.bind.Close()
		state.bind = nil
	}

	state.stopping.Wait()
	return closeErr
}
// Bind returns the device's current bind, read while holding the device.net
// lock. The result is nil when no bind is open.
func (device *Device) Bind() conn.Bind {
	device.net.Lock()
	current := device.net.bind
	device.net.Unlock()
	return current
}
// BindSetMark sets the firewall mark used for the device's sockets.
//
// If mark equals the stored value nothing happens. Otherwise the new value
// is stored and, when the device is up and a bind is open, applied to that
// bind; an error from the bind is returned as is (the stored value has
// already been updated at that point). Finally the cached source address of
// every peer endpoint is cleared.
func (device *Device) BindSetMark(mark uint32) error {
	device.net.Lock()
	defer device.net.Unlock()

	// check if modified
	if device.net.fwmark == mark {
		return nil
	}

	// update fwmark on existing bind
	device.net.fwmark = mark
	if device.isUp.Get() && device.net.bind != nil {
		if err := device.net.bind.SetMark(mark); err != nil {
			return err
		}
	}

	// clear cached source addresses
	device.peers.RLock()
	for _, peer := range device.peers.keyMap {
		// Unlock at the end of each iteration: a deferred unlock would keep
		// every peer locked until the whole function returns.
		peer.Lock()
		if peer.endpoint != nil {
			peer.endpoint.ClearSrc()
		}
		peer.Unlock()
	}
	device.peers.RUnlock()

	return nil
}
// BindUpdate closes the device's current bind and, if the device is up,
// opens a new one.
//
// Opening a new bind consists of: creating the bind on the stored port
// (the port field is updated with the value CreateBind returns), starting
// the route listener, applying the firewall mark when it is non-zero,
// clearing the cached source address of every peer endpoint, and starting
// one receive routine each for IPv4 and IPv6.
//
// An error from closing the old bind, creating the new one, starting the
// route listener or setting the mark is returned. When bind creation or the
// route listener fails, the bind and port are reset to nil and 0.
func (device *Device) BindUpdate() error {
	device.net.Lock()
	defer device.net.Unlock()

	// close existing sockets
	if err := unsafeCloseBind(device); err != nil {
		return err
	}

	// open new sockets
	if device.isUp.Get() {
		// bind to new port
		var err error
		netc := &device.net
		netc.bind, netc.port, err = conn.CreateBind(netc.port)
		if err != nil {
			netc.bind = nil
			netc.port = 0
			return err
		}
		netc.netlinkCancel, err = device.startRouteListener(netc.bind)
		if err != nil {
			netc.bind.Close()
			netc.bind = nil
			netc.port = 0
			return err
		}

		// set fwmark
		if netc.fwmark != 0 {
			err = netc.bind.SetMark(netc.fwmark)
			if err != nil {
				return err
			}
		}

		// clear cached source addresses
		device.peers.RLock()
		for _, peer := range device.peers.keyMap {
			// Unlock at the end of each iteration: a deferred unlock would
			// keep every peer locked until the whole function returns,
			// including while the receive routines below are started.
			peer.Lock()
			if peer.endpoint != nil {
				peer.endpoint.ClearSrc()
			}
			peer.Unlock()
		}
		device.peers.RUnlock()

		// start receiving routines
		device.net.stopping.Add(2)
		device.queue.decryption.wg.Add(2) // each RoutineReceiveIncoming goroutine writes to device.queue.decryption
		go device.RoutineReceiveIncoming(ipv4.Version, netc.bind)
		go device.RoutineReceiveIncoming(ipv6.Version, netc.bind)

		device.log.Verbosef("UDP bind has been updated")
	}

	return nil
}
// BindClose closes the device's current bind while holding the device.net
// lock and returns the error reported by unsafeCloseBind.
func (device *Device) BindClose() error {
	device.net.Lock()
	defer device.net.Unlock()

	return unsafeCloseBind(device)
}