731 lines
21 KiB
Go
731 lines
21 KiB
Go
//go:build linux
|
||
|
||
package transport
|
||
|
||
import (
|
||
"encoding/binary"
|
||
"errors"
|
||
"fmt"
|
||
"io"
|
||
"syscall"
|
||
"time"
|
||
|
||
"omnisocketgo/cmd/internal/latencylog"
|
||
"omnisocketgo/cmd/internal/protocol"
|
||
)
|
||
|
||
const (
|
||
linuxTimestampControlBufferSize = 256 // 控制消息缓冲区。
|
||
linuxTXTimestampWaitTimeout = 250 * time.Millisecond // 等待 TX 时间戳的上限。
|
||
linuxTXTimestampPollInterval = time.Millisecond // 轮询 errqueue 的间隔。
|
||
linuxDataPollInterval = time.Millisecond // 轮询普通收发的间隔。
|
||
|
||
linuxSOTimestampingNew = 0x41
|
||
linuxSCMTimestampingNew = linuxSOTimestampingNew
|
||
linuxSOEEOriginTimestamping = 4 // timestamping errqueue 事件。
|
||
linuxSCMTstampSnd = 0 // 对应 A_TX_SOFTWARE。
|
||
linuxSCMTstampSched = 1 // 对应 A_TX_SCHED。
|
||
|
||
linuxSOFTimestampingTXSoftware = 1 << 1 // 打开 TX software timestamp。
|
||
linuxSOFTimestampingRXSoftware = 1 << 3 // 打开 RX software timestamp。
|
||
linuxSOFTimestampingSoftware = 1 << 4 // software timestamp 总开关。
|
||
linuxSOFTimestampingOptID = 1 << 7 // 给时间戳关联 ID。
|
||
linuxSOFTimestampingTXSched = 1 << 8 // 打开 TX sched timestamp。
|
||
linuxSOFTimestampingOptTSONLY = 1 << 11 // 只回时间戳。
|
||
linuxSOFTimestampingOptIDTCP = 1 << 16 // 让 TCP 也带 timestamp ID。
|
||
|
||
linuxTXTimestampPhasePreSendDrain = "pre_send_drain"
|
||
linuxTXTimestampPhasePostSendCollect = "post_send_collect"
|
||
linuxTXTimestampPhasePostSelectDrain = "post_select_drain"
|
||
)
|
||
|
||
type txSendChunk struct {
|
||
SendCallIndex int
|
||
FrameOffsetStart int
|
||
FrameOffsetEnd int // 包含当前 sendmsg 成功写出的最后一个 frame 字节偏移。
|
||
BytesWritten int
|
||
ExpectedTXID uint32
|
||
}
|
||
|
||
type txTimestampEvent struct {
|
||
EventName string
|
||
TSUnixNano int64
|
||
EEInfo uint32
|
||
EEData uint32
|
||
}
|
||
|
||
type observedTXTimestampEvent struct {
|
||
Phase string
|
||
ReadIndex int
|
||
Event txTimestampEvent
|
||
}
|
||
|
||
type txTimestampSelection struct {
|
||
SelectedID uint32
|
||
Timestamps map[string]int64
|
||
HasEvent bool
|
||
}
|
||
|
||
type socketExtendedErrInfo struct {
|
||
Info uint32
|
||
Data uint32
|
||
}
|
||
|
||
// 拿到底层 fd,并打开 Linux timestamping。
|
||
func (c *TCPConn) initLinuxTimestamping() error {
|
||
sysConn, ok := c.conn.(interface {
|
||
SyscallConn() (syscall.RawConn, error)
|
||
})
|
||
if !ok {
|
||
return fmt.Errorf("transport: connection does not support SyscallConn")
|
||
}
|
||
|
||
rawConn, err := sysConn.SyscallConn()
|
||
if err != nil || rawConn == nil {
|
||
if err != nil {
|
||
return fmt.Errorf("transport: get syscall conn: %w", err)
|
||
}
|
||
return fmt.Errorf("transport: missing syscall conn")
|
||
}
|
||
|
||
//socket是否可以成功打开 timestamping 取决于内核版本和配置,尝试多个 flag 组合直到成功或遇到非 EINVAL 错误。
|
||
if err := enableLinuxTimestamping(rawConn); err != nil {
|
||
return fmt.Errorf("transport: enable linux timestamping: %w", err)
|
||
}
|
||
//成功打开 timestamping 后,rawConn 就可以用来收 TX/RX 时间戳了。
|
||
c.raw = rawConn
|
||
return nil
|
||
}
|
||
|
||
// 给 socket开权限打开TX software timestamping。
|
||
func enableLinuxTimestamping(rawConn syscall.RawConn) error {
|
||
flagCandidates := []int{ //不同linux版本可能支持不同的 flag 组合,尝试多个组合直到成功。
|
||
linuxSOFTimestampingTXSched |
|
||
linuxSOFTimestampingTXSoftware |
|
||
linuxSOFTimestampingRXSoftware |
|
||
linuxSOFTimestampingSoftware |
|
||
linuxSOFTimestampingOptID | //TCP 协议栈给每个时间戳生成一个序列号
|
||
linuxSOFTimestampingOptIDTCP |
|
||
linuxSOFTimestampingOptTSONLY,
|
||
linuxSOFTimestampingTXSched |
|
||
linuxSOFTimestampingTXSoftware |
|
||
linuxSOFTimestampingRXSoftware |
|
||
linuxSOFTimestampingSoftware |
|
||
linuxSOFTimestampingOptID |
|
||
linuxSOFTimestampingOptTSONLY,
|
||
linuxSOFTimestampingTXSched |
|
||
linuxSOFTimestampingTXSoftware |
|
||
linuxSOFTimestampingRXSoftware |
|
||
linuxSOFTimestampingSoftware |
|
||
linuxSOFTimestampingOptTSONLY,
|
||
}
|
||
|
||
var lastErr error
|
||
for _, flags := range flagCandidates { //尝试不同的 flag 组合,直到成功或遇到非 EINVAL 错误。
|
||
// 内核根据 fd 找到对应的内存结构体(Socket 缓冲区)
|
||
err := rawConn.Control(func(fd uintptr) { //Control 方法保证在回调里 fd 是有效的,可以安全地调用 syscall.SetsockoptInt。
|
||
lastErr = syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, linuxSOTimestampingNew, flags)
|
||
})
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if lastErr == nil {
|
||
return nil
|
||
}
|
||
if !errors.Is(lastErr, syscall.EINVAL) {
|
||
return lastErr
|
||
}
|
||
}
|
||
|
||
return lastErr
|
||
}
|
||
|
||
// sendMessageLinux 编码消息、写完整帧,再记录 TX 时间戳。
|
||
func (c *TCPConn) sendMessageLinux(msg protocol.Message) error {
|
||
payload, err := protocol.EncodeMessage(msg)
|
||
if err != nil {
|
||
return fmt.Errorf("protocol: encode message: %w", err)
|
||
}
|
||
|
||
//编码后的消息 payload 前面加 4 字节长度,构成完整帧。
|
||
frame := make([]byte, 4+len(payload))
|
||
binary.BigEndian.PutUint32(frame[:4], uint32(len(payload)))
|
||
copy(frame[4:], payload)
|
||
|
||
readIndex := 0
|
||
c.drainPendingTXTimestampEvents(msg, nil, linuxTXTimestampPhasePreSendDrain, &readIndex)
|
||
|
||
chunks, err := c.writeFrameLinux(frame, msg)
|
||
if err != nil {
|
||
return fmt.Errorf("protocol: write frame: %w", err)
|
||
}
|
||
//记录发送延时日志
|
||
c.logTXTimestampEvents(msg, chunks, &readIndex)
|
||
return nil
|
||
}
|
||
|
||
// writeFrameLinux 用 sendmsg 写完整帧。
|
||
func (c *TCPConn) writeFrameLinux(frame []byte, msg protocol.Message) ([]txSendChunk, error) {
|
||
written := 0
|
||
sendCallIndex := 0
|
||
chunks := make([]txSendChunk, 0, 1)
|
||
|
||
for written < len(frame) {
|
||
n, err := c.sendmsgDataOnce(frame[written:])
|
||
switch {
|
||
case err == nil:
|
||
if n <= 0 {
|
||
return nil, io.ErrShortWrite
|
||
}
|
||
chunk := txSendChunk{
|
||
SendCallIndex: sendCallIndex,
|
||
FrameOffsetStart: written,
|
||
FrameOffsetEnd: written + n - 1,
|
||
BytesWritten: n,
|
||
ExpectedTXID: c.txWriteSeq + uint32(n) - 1,
|
||
}
|
||
c.txWriteSeq += uint32(n)
|
||
chunks = append(chunks, chunk)
|
||
c.logTXSendChunkDebugRecord(msg, chunk)
|
||
sendCallIndex++
|
||
written += n
|
||
case isWouldBlock(err):
|
||
time.Sleep(linuxDataPollInterval)
|
||
default:
|
||
return nil, err
|
||
}
|
||
}
|
||
|
||
return chunks, nil
|
||
}
|
||
|
||
// 把 A_TX_SCHED / A_TX_SOFTWARE 写入日志。(发送过程中)
|
||
func (c *TCPConn) logTXTimestampEvents(msg protocol.Message, chunks []txSendChunk, readIndex *int) {
|
||
timestamps := c.collectTXTimestampEvents(msg, chunks, readIndex)
|
||
|
||
if ts, ok := timestamps[latencylog.EventATXSched]; ok {
|
||
latencylog.LogMessageEventAt(c.logger, c.nodeRole, c.nodeID, latencylog.EventATXSched, ts, msg)
|
||
}
|
||
if ts, ok := timestamps[latencylog.EventATXSoftware]; ok {
|
||
latencylog.LogMessageEventAt(c.logger, c.nodeRole, c.nodeID, latencylog.EventATXSoftware, ts, msg)
|
||
}
|
||
}
|
||
|
||
// 在 errqueue 里等两类 TX 时间戳。
|
||
func (c *TCPConn) collectTXTimestampEvents(msg protocol.Message, chunks []txSendChunk, readIndex *int) map[string]int64 {
|
||
deadline := time.Now().Add(linuxTXTimestampWaitTimeout)
|
||
observed := make([]observedTXTimestampEvent, 0, 4)
|
||
finalChunk, hasFinalChunk := finalTXSendChunk(chunks)
|
||
|
||
// 轮询 errqueue,优先等待本次消息最后一个 sendmsg chunk 的 sched/software 两类时间戳。
|
||
for time.Now().Before(deadline) {
|
||
event, err := c.recvTXTimestampOnce()
|
||
if err != nil {
|
||
if isWouldBlock(err) {
|
||
time.Sleep(linuxTXTimestampPollInterval)
|
||
continue
|
||
}
|
||
break
|
||
}
|
||
if event.EventName == "" || event.TSUnixNano <= 0 {
|
||
continue
|
||
}
|
||
observed = append(observed, observedTXTimestampEvent{
|
||
Phase: linuxTXTimestampPhasePostSendCollect,
|
||
ReadIndex: *readIndex,
|
||
Event: event,
|
||
})
|
||
*readIndex = *readIndex + 1
|
||
|
||
selection := selectTXTimestampEvents(observed, finalChunk.ExpectedTXID, hasFinalChunk)
|
||
if hasFinalChunk && selection.HasEvent && selection.SelectedID == finalChunk.ExpectedTXID && hasCompleteTXTimestampPair(selection.Timestamps) {
|
||
break
|
||
}
|
||
}
|
||
|
||
selection := selectTXTimestampEvents(observed, finalChunk.ExpectedTXID, hasFinalChunk)
|
||
c.logObservedTXTimestampEvents(msg, chunks, observed, selection)
|
||
c.drainPendingTXTimestampEvents(msg, chunks, linuxTXTimestampPhasePostSelectDrain, readIndex)
|
||
return selection.Timestamps
|
||
}
|
||
|
||
// recvTXTimestampOnce 从 errqueue 读一次时间戳事件。
|
||
func (c *TCPConn) recvTXTimestampOnce() (txTimestampEvent, error) {
|
||
var (
|
||
event txTimestampEvent
|
||
opErr error
|
||
)
|
||
|
||
err := c.raw.Control(func(fd uintptr) {
|
||
//设置足够大的 oob buffer 来接收控制消息,调用 recvmsg 从 errqueue 读一条消息。
|
||
oob := make([]byte, linuxTimestampControlBufferSize)
|
||
//recvmsg 的 flags 里必须带 MSG_ERRQUEUE,才能从 errqueue 里读消息,非阻塞模式下如果没有消息可读会返回 EAGAIN。
|
||
_, oobn, _, _, recvErr := syscall.Recvmsg(int(fd), nil, oob, syscall.MSG_ERRQUEUE|syscall.MSG_DONTWAIT)
|
||
if recvErr != nil {
|
||
opErr = recvErr
|
||
return
|
||
}
|
||
//解析控制消息,看看是不是我们关心的 TX 时间戳事件,如果是就拿到事件名和时间戳。
|
||
event, _ = parseTXTimestampControlMessages(oob[:oobn])
|
||
})
|
||
if err != nil {
|
||
return txTimestampEvent{}, err
|
||
}
|
||
if opErr != nil {
|
||
return txTimestampEvent{}, opErr
|
||
}
|
||
|
||
return event, nil
|
||
}
|
||
|
||
// 把底层时间戳映射成日志事件名。
|
||
func parseTXTimestampControlMessages(oob []byte) (txTimestampEvent, bool) {
|
||
if len(oob) == 0 {
|
||
return txTimestampEvent{}, false
|
||
}
|
||
//解析控制消息,看看是不是我们关心的 TX 时间戳事件,如果是就拿到事件名和时间戳。
|
||
controlMessages, err := syscall.ParseSocketControlMessage(oob)
|
||
if err != nil {
|
||
return txTimestampEvent{}, false
|
||
}
|
||
|
||
return parseTXTimestampSocketControlMessages(controlMessages)
|
||
}
|
||
|
||
func parseTXTimestampSocketControlMessages(controlMessages []syscall.SocketControlMessage) (txTimestampEvent, bool) {
|
||
var (
|
||
event txTimestampEvent
|
||
hasTS bool
|
||
hasKind bool
|
||
)
|
||
|
||
for _, controlMessage := range controlMessages {
|
||
switch {
|
||
case controlMessage.Header.Level == syscall.SOL_SOCKET && controlMessage.Header.Type == linuxSCMTimestampingNew:
|
||
if ts := parseSCMTimestampingData(controlMessage.Data); ts > 0 {
|
||
event.TSUnixNano = ts
|
||
hasTS = true
|
||
}
|
||
case isSocketExtendedErr(controlMessage): //判断时间戳是否进入了errqueue,
|
||
if info, ok := parseSocketExtendedErrInfo(controlMessage.Data); ok {
|
||
event.EEInfo = info.Info
|
||
event.EEData = info.Data
|
||
event.EventName = mapLinuxTXTimestampEventName(info.Info)
|
||
hasKind = true
|
||
}
|
||
}
|
||
}
|
||
|
||
if !hasTS || !hasKind || event.EventName == "" {
|
||
return txTimestampEvent{}, false
|
||
}
|
||
|
||
return event, true
|
||
}
|
||
|
||
// 判断控制消息是否来自 socket extended err。
|
||
// 内核产生的时间戳并不会混合在普通的数据流里,而是被包装成一种特殊的“错误消息”丢进 Error Queue。
|
||
func isSocketExtendedErr(controlMessage syscall.SocketControlMessage) bool {
|
||
switch {
|
||
case controlMessage.Header.Level == syscall.SOL_IP && controlMessage.Header.Type == syscall.IP_RECVERR:
|
||
return true
|
||
case controlMessage.Header.Level == syscall.SOL_IPV6 && controlMessage.Header.Type == syscall.IPV6_RECVERR:
|
||
return true
|
||
default:
|
||
return false
|
||
}
|
||
}
|
||
|
||
// 从 socket extended err 的数据里取 origin timestamping 信息。
|
||
func parseSocketExtendedErrInfo(data []byte) (socketExtendedErrInfo, bool) {
|
||
if len(data) < 16 {
|
||
return socketExtendedErrInfo{}, false
|
||
}
|
||
if data[4] != linuxSOEEOriginTimestamping {
|
||
return socketExtendedErrInfo{}, false
|
||
}
|
||
|
||
return socketExtendedErrInfo{
|
||
Info: binary.NativeEndian.Uint32(data[8:12]),
|
||
Data: binary.NativeEndian.Uint32(data[12:16]),
|
||
}, true
|
||
}
|
||
|
||
func mapLinuxTXTimestampEventName(tsKind uint32) string {
|
||
switch tsKind {
|
||
case linuxSCMTstampSched:
|
||
return latencylog.EventATXSched
|
||
case linuxSCMTstampSnd:
|
||
return latencylog.EventATXSoftware
|
||
default:
|
||
return fmt.Sprintf("TX_TIMESTAMP_KIND_%d", tsKind)
|
||
}
|
||
}
|
||
|
||
func finalTXSendChunk(chunks []txSendChunk) (txSendChunk, bool) {
|
||
if len(chunks) == 0 {
|
||
return txSendChunk{}, false
|
||
}
|
||
|
||
return chunks[len(chunks)-1], true
|
||
}
|
||
|
||
func isBusinessTXTimestampEventName(eventName string) bool {
|
||
return eventName == latencylog.EventATXSched || eventName == latencylog.EventATXSoftware
|
||
}
|
||
|
||
func hasCompleteTXTimestampPair(timestamps map[string]int64) bool {
|
||
if len(timestamps) == 0 {
|
||
return false
|
||
}
|
||
|
||
return timestamps[latencylog.EventATXSched] > 0 && timestamps[latencylog.EventATXSoftware] > 0
|
||
}
|
||
|
||
func selectTXTimestampEvents(events []observedTXTimestampEvent, expectedFinalTXID uint32, hasExpectedFinalTXID bool) txTimestampSelection {
|
||
byID := make(map[uint32]map[string]int64)
|
||
selection := txTimestampSelection{
|
||
Timestamps: make(map[string]int64, 2),
|
||
}
|
||
|
||
var (
|
||
highestID uint32
|
||
haveHighest bool
|
||
)
|
||
for _, observed := range events {
|
||
event := observed.Event
|
||
selection.HasEvent = true
|
||
if !haveHighest || event.EEData > highestID {
|
||
highestID = event.EEData
|
||
haveHighest = true
|
||
}
|
||
|
||
if !isBusinessTXTimestampEventName(event.EventName) {
|
||
continue
|
||
}
|
||
if _, ok := byID[event.EEData]; !ok {
|
||
byID[event.EEData] = make(map[string]int64, 2)
|
||
}
|
||
if existing, exists := byID[event.EEData][event.EventName]; !exists || event.TSUnixNano < existing {
|
||
byID[event.EEData][event.EventName] = event.TSUnixNano
|
||
}
|
||
}
|
||
|
||
if !selection.HasEvent {
|
||
return selection
|
||
}
|
||
|
||
switch {
|
||
case hasExpectedFinalTXID && len(byID[expectedFinalTXID]) > 0:
|
||
selection.SelectedID = expectedFinalTXID
|
||
selection.Timestamps = copyTXTimestampMap(byID[expectedFinalTXID])
|
||
case len(byID[highestID]) > 0:
|
||
selection.SelectedID = highestID
|
||
selection.Timestamps = copyTXTimestampMap(byID[highestID])
|
||
default:
|
||
selection.SelectedID = highestID
|
||
}
|
||
|
||
return selection
|
||
}
|
||
|
||
func copyTXTimestampMap(src map[string]int64) map[string]int64 {
|
||
dst := make(map[string]int64, len(src))
|
||
for key, value := range src {
|
||
dst[key] = value
|
||
}
|
||
return dst
|
||
}
|
||
|
||
func (c *TCPConn) drainPendingTXTimestampEvents(msg protocol.Message, chunks []txSendChunk, phase string, readIndex *int) {
|
||
for {
|
||
event, err := c.recvTXTimestampOnce()
|
||
if err != nil {
|
||
if isWouldBlock(err) {
|
||
return
|
||
}
|
||
return
|
||
}
|
||
if event.EventName == "" || event.TSUnixNano <= 0 {
|
||
continue
|
||
}
|
||
c.logTXErrqueueDebugRecord(msg, chunks, observedTXTimestampEvent{
|
||
Phase: phase,
|
||
ReadIndex: *readIndex,
|
||
Event: event,
|
||
}, false)
|
||
*readIndex = *readIndex + 1
|
||
}
|
||
}
|
||
|
||
func (c *TCPConn) logObservedTXTimestampEvents(msg protocol.Message, chunks []txSendChunk, observed []observedTXTimestampEvent, selection txTimestampSelection) {
|
||
if len(observed) == 0 {
|
||
return
|
||
}
|
||
|
||
for _, entry := range observed {
|
||
selected := selection.HasEvent &&
|
||
entry.Event.EEData == selection.SelectedID &&
|
||
isBusinessTXTimestampEventName(entry.Event.EventName) &&
|
||
selection.Timestamps[entry.Event.EventName] == entry.Event.TSUnixNano
|
||
c.logTXErrqueueDebugRecord(msg, chunks, entry, selected)
|
||
}
|
||
}
|
||
|
||
func (c *TCPConn) logTXSendChunkDebugRecord(msg protocol.Message, chunk txSendChunk) {
|
||
if c.txTimestampDebugLogger == nil {
|
||
return
|
||
}
|
||
|
||
sendCallIndex := chunk.SendCallIndex
|
||
frameOffsetStart := chunk.FrameOffsetStart
|
||
frameOffsetEnd := chunk.FrameOffsetEnd
|
||
bytesWritten := chunk.BytesWritten
|
||
expectedTXID := chunk.ExpectedTXID
|
||
|
||
record := c.newTXTimestampDebugRecord(msg)
|
||
record.RecordType = txTimestampDebugRecordTypeSendChunk
|
||
record.SendCallIndex = &sendCallIndex
|
||
record.FrameOffsetStart = &frameOffsetStart
|
||
record.FrameOffsetEnd = &frameOffsetEnd
|
||
record.BytesWritten = &bytesWritten
|
||
record.ExpectedTXID = &expectedTXID
|
||
c.logTXTimestampDebugRecord(record)
|
||
}
|
||
|
||
func (c *TCPConn) logTXErrqueueDebugRecord(msg protocol.Message, chunks []txSendChunk, observed observedTXTimestampEvent, selected bool) {
|
||
if c.txTimestampDebugLogger == nil {
|
||
return
|
||
}
|
||
|
||
readIndex := observed.ReadIndex
|
||
tsUnixNano := observed.Event.TSUnixNano
|
||
eeInfo := observed.Event.EEInfo
|
||
eeData := observed.Event.EEData
|
||
selectedForLatency := selected
|
||
|
||
record := c.newTXTimestampDebugRecord(msg)
|
||
record.RecordType = txTimestampDebugRecordTypeErrqueueEvent
|
||
record.Phase = observed.Phase
|
||
record.ReadIndex = &readIndex
|
||
record.EventName = observed.Event.EventName
|
||
record.TSUnixNano = &tsUnixNano
|
||
record.EEInfo = &eeInfo
|
||
record.EEData = &eeData
|
||
record.SelectedForLatency = &selectedForLatency
|
||
if matchedSendCallIndex, ok := matchTXTimestampEventToSendChunk(observed.Event.EEData, chunks); ok {
|
||
record.MatchedSendCallIndex = &matchedSendCallIndex
|
||
}
|
||
c.logTXTimestampDebugRecord(record)
|
||
}
|
||
|
||
func matchTXTimestampEventToSendChunk(txID uint32, chunks []txSendChunk) (int, bool) {
|
||
for _, chunk := range chunks {
|
||
if chunk.ExpectedTXID == txID {
|
||
return chunk.SendCallIndex, true
|
||
}
|
||
}
|
||
|
||
return 0, false
|
||
}
|
||
|
||
func (c *TCPConn) newTXTimestampDebugRecord(msg protocol.Message) TXTimestampDebugRecord {
|
||
return TXTimestampDebugRecord{
|
||
NodeRole: c.nodeRole,
|
||
NodeID: c.nodeID,
|
||
MessageType: msg.Type,
|
||
MessageID: msg.ID,
|
||
From: msg.From,
|
||
To: msg.To,
|
||
FileName: msg.FileName,
|
||
BodySize: len(msg.Body),
|
||
}
|
||
}
|
||
|
||
func (c *TCPConn) logTXTimestampDebugRecord(record TXTimestampDebugRecord) {
|
||
if c.txTimestampDebugLogger == nil {
|
||
return
|
||
}
|
||
|
||
_ = c.txTimestampDebugLogger.LogTXTimestampDebugRecord(record)
|
||
}
|
||
|
||
// 读一条完整消息,并记录 B_RX_SOFTWARE。
|
||
func (c *TCPConn) receiveMessageLinux() (protocol.Message, error) {
|
||
payload, rxTimestamp, err := c.readFrameLinux()
|
||
if err != nil {
|
||
return protocol.Message{}, fmt.Errorf("protocol: read frame: %w", err)
|
||
}
|
||
|
||
msg, err := protocol.DecodeMessage(payload)
|
||
if err != nil {
|
||
return protocol.Message{}, fmt.Errorf("protocol: decode message: %w", err)
|
||
}
|
||
|
||
if rxTimestamp > 0 {
|
||
latencylog.LogMessageEventAt(c.logger, c.nodeRole, c.nodeID, latencylog.EventBRXSoftware, rxTimestamp, msg)
|
||
}
|
||
|
||
return msg, nil
|
||
}
|
||
|
||
// readFrameLinux 先读 4 字节长度,再读整条 payload。
|
||
func (c *TCPConn) readFrameLinux() ([]byte, int64, error) {
|
||
var frameHeader [4]byte
|
||
rxTimestamp, err := c.readFullLinux(frameHeader[:])
|
||
if err != nil {
|
||
return nil, rxTimestamp, err
|
||
}
|
||
|
||
size := binary.BigEndian.Uint32(frameHeader[:])
|
||
switch {
|
||
case size == 0:
|
||
return nil, rxTimestamp, protocol.ErrInvalidFrameLength
|
||
case size > protocol.MaxFrameSize:
|
||
return nil, rxTimestamp, protocol.ErrFrameTooLarge
|
||
}
|
||
|
||
payload := make([]byte, int(size))
|
||
bodyTimestamp, err := c.readFullLinux(payload)
|
||
if rxTimestamp == 0 {
|
||
rxTimestamp = bodyTimestamp
|
||
}
|
||
if err != nil {
|
||
return nil, rxTimestamp, err
|
||
}
|
||
|
||
return payload, rxTimestamp, nil
|
||
}
|
||
|
||
// 读满 buf,并保留首个 RX_SOFTWARE(返回进入tcp协议栈的时间戳)。
|
||
func (c *TCPConn) readFullLinux(buf []byte) (int64, error) {
|
||
if len(buf) == 0 {
|
||
return 0, nil
|
||
}
|
||
|
||
var (
|
||
offset int
|
||
firstRXTime int64
|
||
)
|
||
|
||
for offset < len(buf) {
|
||
n, rxTimestamp, err := c.recvmsgLinux(buf[offset:])
|
||
if firstRXTime == 0 && rxTimestamp > 0 {
|
||
firstRXTime = rxTimestamp
|
||
}
|
||
if err != nil {
|
||
if errors.Is(err, io.EOF) && offset > 0 {
|
||
return firstRXTime, io.ErrUnexpectedEOF
|
||
}
|
||
return firstRXTime, err
|
||
}
|
||
|
||
offset += n
|
||
}
|
||
|
||
return firstRXTime, nil
|
||
}
|
||
|
||
// recvmsgLinux 用 recvmsg 同时读取数据和控制消息。
|
||
func (c *TCPConn) recvmsgLinux(buf []byte) (int, int64, error) {
|
||
for {
|
||
n, rxTimeNS, err := c.recvmsgDataOnce(buf)
|
||
switch {
|
||
case err == nil:
|
||
if n == 0 {
|
||
return 0, 0, io.EOF
|
||
}
|
||
return n, rxTimeNS, nil
|
||
case isWouldBlock(err):
|
||
time.Sleep(linuxDataPollInterval)
|
||
default:
|
||
return 0, 0, err
|
||
}
|
||
}
|
||
}
|
||
|
||
// 从控制消息里取 RX_SOFTWARE。
|
||
func parseRXTimestampControlMessages(oob []byte) int64 {
|
||
if len(oob) == 0 {
|
||
return 0
|
||
}
|
||
|
||
controlMessages, err := syscall.ParseSocketControlMessage(oob)
|
||
if err != nil {
|
||
return 0
|
||
}
|
||
|
||
for _, controlMessage := range controlMessages {
|
||
if controlMessage.Header.Level != syscall.SOL_SOCKET || controlMessage.Header.Type != linuxSCMTimestampingNew {
|
||
continue
|
||
}
|
||
|
||
if ts := parseSCMTimestampingData(controlMessage.Data); ts > 0 {
|
||
return ts
|
||
}
|
||
}
|
||
|
||
return 0
|
||
}
|
||
|
||
// 取第一个非零 timespec。
|
||
func parseSCMTimestampingData(data []byte) int64 {
|
||
const timespec64Size = 16
|
||
|
||
for offset := 0; offset+timespec64Size <= len(data); offset += timespec64Size {
|
||
sec := int64(binary.NativeEndian.Uint64(data[offset : offset+8]))
|
||
nsec := int64(binary.NativeEndian.Uint64(data[offset+8 : offset+16]))
|
||
if sec == 0 && nsec == 0 {
|
||
continue
|
||
}
|
||
return sec*int64(time.Second) + nsec
|
||
}
|
||
|
||
return 0
|
||
}
|
||
|
||
// 判断错误是否是 EAGAIN 或 EWOULDBLOCK。
|
||
func isWouldBlock(err error) bool {
|
||
return errors.Is(err, syscall.EAGAIN) || errors.Is(err, syscall.EWOULDBLOCK)
|
||
}
|
||
|
||
func (c *TCPConn) sendmsgDataOnce(buf []byte) (int, error) {
|
||
var (
|
||
n int
|
||
opErr error
|
||
)
|
||
|
||
err := c.raw.Control(func(fd uintptr) {
|
||
n, opErr = syscall.SendmsgN(int(fd), buf, nil, nil, 0)
|
||
})
|
||
if err != nil {
|
||
return 0, err
|
||
}
|
||
|
||
return n, opErr
|
||
}
|
||
|
||
func (c *TCPConn) recvmsgDataOnce(buf []byte) (int, int64, error) {
|
||
var (
|
||
n int
|
||
rxTimeNS int64
|
||
opErr error
|
||
)
|
||
|
||
err := c.raw.Control(func(fd uintptr) {
|
||
oob := make([]byte, linuxTimestampControlBufferSize)
|
||
readN, oobN, _, _, recvErr := syscall.Recvmsg(int(fd), buf, oob, 0)
|
||
if recvErr != nil {
|
||
opErr = recvErr
|
||
return
|
||
}
|
||
n = readN
|
||
rxTimeNS = parseRXTimestampControlMessages(oob[:oobN])
|
||
})
|
||
if err != nil {
|
||
return 0, 0, err
|
||
}
|
||
|
||
return n, rxTimeNS, opErr
|
||
}
|