[haiku-commits] haiku: hrev49502 - src/add-ons/kernel/network/protocols/tcp

  • From: mmlr@xxxxxxxx
  • To: haiku-commits@xxxxxxxxxxxxx
  • Date: Sun, 2 Aug 2015 23:30:02 +0200 (CEST)

hrev49502 adds 9 changesets to branch 'master'
old head: 86b58866404aa1dbc70b88d388f5fc13b4cc9656
new head: 8982293017b132c73aff734fa8baa7afd261e125
overview:
http://cgit.haiku-os.org/haiku/log/?qt=range&q=8982293017b1+%5E86b58866404a

----------------------------------------------------------------------------

2fdea65c3a0b: tcp: Fix 64 bit build with debugging features enabled.

da8fbe0e5974: tcp: Replace custom WaitList with ConditionVariable.

The WaitList implementation had a race condition between checking for
the condition and acquiring the semaphore. If a thread was rescheduled
at that point, the signal could be missed due to the use of
release_sem_etc() with the B_RELEASE_ALL flag while the thread was not
yet waiting for the semaphore. The transfer would subsequently stall.

5f7749078e1c: tcp: Fix retransmit logic to avoid lots of spurious retransmits.

The retransmit timer was only stopped when all in flight data was
acknowledged and never updated on individual acknowledgements. This
caused a lot of erroneous retransmits whenever the buffer was filled
fast enough so that the acknowledgements never caught up, i.e. whenever
uploading or streaming data.

Move setting of the initial retransmit timer inside the send loop so it
is closer to the actual time the segment is sent out and simplify the
logic a bit.

Limit the minimal retransmit timeout to 200 msecs to avoid spurious
retransmits in the face of delayed acknowledgements. This is lower than
the 1 second minimum the RFCs suggest. Other stacks use various other
sub-second timeouts; the 200 msecs follows what Linux does.

Also add the exponential back off of the retransmit timeout when
retransmits are triggered. This is bounded by a 60 seconds maximum
according to RFC6298.

01b0f935ec61: tcp: Move persist timeout value to a define in the header.

05220224ffc2: tcp: Split Timer trace entry into Timer{Set|Triggered}.

Trace whenever a timer is (re-)set as well as when it triggers. A value
of -1 denotes the cancellation of the timer.

bc49140bab90: tcp: Add APICall trace entry and move TRACEs into locked parts.

The APICall trace entry just records the function name but this is
enough to deduce where some of the state changes come from.

Also move the TRACE macros past the MutexLockers to ensure that their
output happens at the time when the methods actually run.

94fb06bfce1f: tcp: Fix early cancellation of timers on socket free.

TCPEndpoint::Free() uses _EnterTimeWait() to start the time-wait timer
for later cleanup. The latter did call _CancelConnectionTimers()
unconditionally however, also cancelling a retransmit timer that was
possibly still needed for the retransmission of the FIN packet. If the
FIN packet got lost, the connection would be left open on the other end.

bed94ebc49e2: tcp: Change timestamp factor from 1024 to 1000.

It is the conversion factor between the milliseconds tcp time and the
microseconds system time, so 1024 does not make much sense.

8982293017b1: tcp: Whitespace cleanup, move a define to header and fix a typo.

[ Michael Lotz <mmlr@xxxxxxxx> ]

----------------------------------------------------------------------------

3 files changed, 278 insertions(+), 219 deletions(-)
.../kernel/network/protocols/tcp/TCPEndpoint.cpp | 424 +++++++++++--------
.../kernel/network/protocols/tcp/TCPEndpoint.h | 27 +-
src/add-ons/kernel/network/protocols/tcp/tcp.h | 46 +-

############################################################################

Commit: 2fdea65c3a0b5f8680eee251c794c231ee507ce3
URL: http://cgit.haiku-os.org/haiku/commit/?id=2fdea65c3a0b
Author: Michael Lotz <mmlr@xxxxxxxx>
Date: Sat Aug 1 07:57:22 2015 UTC

tcp: Fix 64 bit build with debugging features enabled.

----------------------------------------------------------------------------

diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
index 5072e30..a6d8e64 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
@@ -60,16 +60,19 @@

#ifdef TRACE_TCP
// the space before ', ##args' is important in order for this to work with cpp
2.95
-# define TRACE(format, args...) dprintf("%ld: TCP [%llu] %p (%12s) " \
- format "\n", find_thread(NULL), system_time(), this, \
- name_for_state(fState) , ##args)
+# define TRACE(format, args...) dprintf("%" B_PRId32 ": TCP [%" \
+ B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \
+ system_time(), this, name_for_state(fState) , ##args)
#else
# define TRACE(args...) do { } while (0)
#endif

#ifdef PROBE_TCP
# define PROBE(buffer, window) \
- dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win
%lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \
+ dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %"
B_PRIu32 \
+ " suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \
+ B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %"
\
+ B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \
system_time(), PrintAddress(buffer->source), \
PrintAddress(buffer->destination), buffer->size,
fSendNext.Number(), \
fSendUnacknowledged.Number(), fCongestionWindow,
fSlowStartThreshold, \
@@ -101,9 +104,10 @@ public:

virtual void AddDump(TraceOutput& out)
{
- out.Print("tcp:%p (%12s) receive buffer %p (%lu bytes), flags
%x, "
- "seq %lu, ack %lu, wnd %lu", fEndpoint,
name_for_state(fState),
- fBuffer, fBufferSize, fFlags, fSequence, fAcknowledge,
fWindow);
+ out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 "
bytes), "
+ "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
+ ", wnd %" B_PRIu32, fEndpoint, name_for_state(fState),
fBuffer,
+ fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
}

protected:
@@ -137,10 +141,11 @@ public:

virtual void AddDump(TraceOutput& out)
{
- out.Print("tcp:%p (%12s) send buffer %p (%lu bytes), flags %x, "
- "seq %lu, ack %lu, first %lu, last %lu",
- fEndpoint, name_for_state(fState), fBuffer,
fBufferSize, fFlags,
- fSequence, fAcknowledge, fFirstSequence, fLastSequence);
+ out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
+ "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
+ ", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint,
+ name_for_state(fState), fBuffer, fBufferSize, fFlags,
fSequence,
+ fAcknowledge, fFirstSequence, fLastSequence);
}

protected:
@@ -210,7 +215,7 @@ public:

virtual void AddDump(TraceOutput& out)
{
- out.Print("tcp:%p (%12s) error at line %ld: %s", fEndpoint,
+ out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s",
fEndpoint,
name_for_state(fState), fLine, fError);
}

@@ -553,8 +558,8 @@ TCPEndpoint::Close()
return status;
}

- TRACE("Close(): after waiting, the SendQ was left with %lu
bytes.",
- fSendQueue.Used());
+ TRACE("Close(): after waiting, the SendQ was left with %"
B_PRIuSIZE
+ " bytes.", fSendQueue.Used());
}
return B_OK;
}
@@ -597,8 +602,8 @@ TCPEndpoint::Connect(const sockaddr* address)
if (gStackModule->is_restarted_syscall()) {
bigtime_t timeout =
gStackModule->restore_syscall_restart_timeout();
status_t status = _WaitForEstablished(locker, timeout);
- TRACE(" Connect(): Connection complete: %s (timeout was %llu)",
- strerror(status), timeout);
+ TRACE(" Connect(): Connection complete: %s (timeout was %"
+ B_PRIdBIGTIME ")", strerror(status), timeout);
return posix_error(status);
}

@@ -656,8 +661,8 @@ TCPEndpoint::Connect(const sockaddr* address)
gStackModule->store_syscall_restart_timeout(absoluteTimeout);

status = _WaitForEstablished(locker, absoluteTimeout);
- TRACE(" Connect(): Connection complete: %s (timeout was %llu)",
- strerror(status), timeout);
+ TRACE(" Connect(): Connection complete: %s (timeout was %"
B_PRIdBIGTIME
+ ")", strerror(status), timeout);
return posix_error(status);
}

@@ -784,9 +789,9 @@ TCPEndpoint::SendData(net_buffer *buffer)
{
MutexLocker lock(fLock);

- TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has
%lu]",
- buffer, buffer->size, buffer->flags, fSendQueue.Size(),
- fSendQueue.Free());
+ TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
+ ") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
+ buffer->size, buffer->flags, fSendQueue.Size(),
fSendQueue.Free());

uint32 flags = buffer->flags;

@@ -851,7 +856,7 @@ TCPEndpoint::SendData(net_buffer *buffer)
}
}

- TRACE(" SendData(): %lu bytes used.", fSendQueue.Used());
+ TRACE(" SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used());

bool force = false;
if ((flags & MSG_OOB) != 0) {
@@ -885,7 +890,7 @@ TCPEndpoint::SendAvailable()
else
available = EPIPE;

- TRACE("SendAvailable(): %li", available);
+ TRACE("SendAvailable(): %" B_PRIdSSIZE, available);
return available;
}

@@ -906,7 +911,8 @@ TCPEndpoint::FillStat(net_stat *stat)
status_t
TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
{
- TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);
+ TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes,
+ flags);

MutexLocker locker(fLock);

@@ -977,7 +983,8 @@ TCPEndpoint::ReadData(size_t numBytes, uint32 flags,
net_buffer** _buffer)
}
}

- TRACE(" ReadData(): %lu are available.", fReceiveQueue.Available());
+ TRACE(" ReadData(): %" B_PRIuSIZE " are available.",
+ fReceiveQueue.Available());

if (numBytes < fReceiveQueue.Available())
fReceiveList.Signal();
@@ -986,7 +993,8 @@ TCPEndpoint::ReadData(size_t numBytes, uint32 flags,
net_buffer** _buffer)

ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);

- TRACE(" ReadData(): %lu bytes kept.", fReceiveQueue.Available());
+ TRACE(" ReadData(): %" B_PRIuSIZE " bytes kept.",
+ fReceiveQueue.Available());

// if we are opening the window, check if we should send an ACK
if (!clone)
@@ -1001,7 +1009,7 @@ TCPEndpoint::ReadAvailable()
{
MutexLocker locker(fLock);

- TRACE("ReadAvailable(): %li", _AvailableData());
+ TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData());

return _AvailableData();
}
@@ -1328,8 +1336,8 @@ TCPEndpoint::_AddData(tcp_segment_header& segment,
net_buffer* buffer)
segment.flags |= TCP_FLAG_FINISH;
}

- TRACE(" _AddData(): adding data, receive next = %lu. Now have %lu
bytes.",
- fReceiveNext.Number(), fReceiveQueue.Available());
+ TRACE(" _AddData(): adding data, receive next = %" B_PRIu32 ". Now
have %"
+ B_PRIuSIZE " bytes.", fReceiveNext.Number(),
fReceiveQueue.Available());

if ((segment.flags & TCP_FLAG_PUSH) != 0)
fReceiveQueue.SetPushPointer();
@@ -1543,8 +1551,8 @@ TCPEndpoint::_Receive(tcp_segment_header& segment,
net_buffer* buffer)
// Check sequence number
if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
fReceiveWindow)) {
- TRACE(" Receive(): segment out of window, next: %lu
wnd: %lu",
- fReceiveNext.Number(), fReceiveWindow);
+ TRACE(" Receive(): segment out of window, next: %"
B_PRIu32
+ " wnd: %" B_PRIu32, fReceiveNext.Number(),
fReceiveWindow);
if ((segment.flags & TCP_FLAG_RESET) != 0) {
// TODO: this doesn't look right - review!
return DROP;
@@ -1602,7 +1610,7 @@ TCPEndpoint::_Receive(tcp_segment_header& segment,
net_buffer* buffer)
}

// remove duplicate data at the start
- TRACE("* remove %ld bytes from the start", drop);
+ TRACE("* remove %" B_PRId32 " bytes from the start", drop);
gBufferModule->remove_header(buffer, drop);
segment.sequence += drop;
}
@@ -1628,14 +1636,14 @@ TCPEndpoint::_Receive(tcp_segment_header& segment,
net_buffer* buffer)
}

segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
- TRACE("* remove %ld bytes from the end", drop);
+ TRACE("* remove %" B_PRId32 " bytes from the end", drop);
gBufferModule->remove_trailer(buffer, drop);
}

#ifdef TRACE_TCP
if (advertisedWindow > fSendWindow) {
- TRACE(" Receive(): Window update %lu -> %lu", fSendWindow,
- advertisedWindow);
+ TRACE(" Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32,
+ fSendWindow, advertisedWindow);
}
#endif

@@ -1777,7 +1785,7 @@ TCPEndpoint::_Receive(tcp_segment_header& segment,
net_buffer* buffer)

_UpdateTimestamps(segment, segmentLength);

- TRACE("Receive() Action %ld", action);
+ TRACE("Receive() Action %" B_PRId32, action);

return action;
}
@@ -1788,9 +1796,9 @@ TCPEndpoint::SegmentReceived(tcp_segment_header& segment,
net_buffer* buffer)
{
MutexLocker locker(fLock);

- TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n"
- "\tflags 0x%x, seq %lu, ack %lu, wnd %lu",
- buffer, buffer->size, PrintAddress(buffer->source),
+ TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s "
+ "to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
+ ", wnd %" B_PRIu32, buffer, buffer->size,
PrintAddress(buffer->source),
PrintAddress(buffer->destination), segment.flags,
segment.sequence,
segment.acknowledge,
(uint32)segment.advertised_window << fSendWindowShift);
@@ -2048,9 +2056,10 @@ TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
uint32 size = buffer->size;
segment.sequence = fSendNext.Number();

- TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n"
- "\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu,
ssthresh %lu\n"
- "\tlen %lu first %lu last %lu",
+ TRACE("SendQueued(): buffer %p (%" B_PRIu32 " bytes) address %s
to "
+ "%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %"
B_PRIu32
+ ", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %"
B_PRIu32
+ ", len %" B_PRIu32 ", first %" B_PRIu32 ", last %"
B_PRIu32,
buffer, buffer->size, PrintAddress(buffer->source),
PrintAddress(buffer->destination), segment.flags,
segment.sequence,
segment.acknowledge, segment.advertised_window,
@@ -2111,7 +2120,7 @@ TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
// start the retransmition timer
if (previousSendNext == fSendUnacknowledged
&& fSendNext > previousSendNext) {
- TRACE(" SendQueue(): set retransmit timer with rto %llu",
+ TRACE(" SendQueue(): set retransmit timer with rto %"
B_PRIdBIGTIME,
fRetransmitTimeout);

gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
@@ -2245,8 +2254,8 @@ TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)
fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
* kTimestampFactor;

- TRACE(" RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
- roundTripTime);
+ TRACE(" RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)",
+ fRetransmitTimeout, roundTripTime);
}


@@ -2348,7 +2357,7 @@ TCPEndpoint::Dump() const
kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
kprintf(" send\n");
- kprintf(" window shift: %u\n", fSendWindowShift);
+ kprintf(" window shift: %" B_PRIu8 "\n", fSendWindowShift);
kprintf(" unacknowledged: %" B_PRIu32 "\n",
fSendUnacknowledged.Number());
kprintf(" next: %" B_PRIu32 "\n", fSendNext.Number());
@@ -2357,7 +2366,8 @@ TCPEndpoint::Dump() const
kprintf(" window: %" B_PRIu32 "\n", fSendWindow);
kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow);
kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
- kprintf(" queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
+ kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n",
fSendQueue.Used(),
+ fSendQueue.Size());
#if DEBUG_BUFFER_QUEUE
fSendQueue.Dump();
#endif
@@ -2366,14 +2376,14 @@ TCPEndpoint::Dump() const
kprintf(" initial sequence: %" B_PRIu32 "\n",
fInitialSendSequence.Number());
kprintf(" receive\n");
- kprintf(" window shift: %u\n", fReceiveWindowShift);
+ kprintf(" window shift: %" B_PRIu8 "\n", fReceiveWindowShift);
kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number());
kprintf(" max advertised: %" B_PRIu32 "\n",
fReceiveMaxAdvertised.Number());
kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow);
kprintf(" max segment size: %" B_PRIu32 "\n",
fReceiveMaxSegmentSize);
- kprintf(" queue: %lu / %lu\n", fReceiveQueue.Available(),
- fReceiveQueue.Size());
+ kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n",
+ fReceiveQueue.Available(), fReceiveQueue.Size());
#if DEBUG_BUFFER_QUEUE
fReceiveQueue.Dump();
#endif

############################################################################

Commit: da8fbe0e5974bef73324b4297e7cc471b942b679
URL: http://cgit.haiku-os.org/haiku/commit/?id=da8fbe0e5974
Author: Michael Lotz <mmlr@xxxxxxxx>
Date: Sun Aug 2 13:10:06 2015 UTC

tcp: Replace custom WaitList with ConditionVariable.

The WaitList implementation had a race condition between checking for
the condition and acquiring the semaphore. If a thread was rescheduled
at that point, the signal could be missed due to the use of
release_sem_etc() with the B_RELEASE_ALL flag while the thread was not
yet waiting for the semaphore. The transfer would subsequently stall.

----------------------------------------------------------------------------

diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
index a6d8e64..8204458 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
@@ -364,68 +364,10 @@ state_needs_finish(int32 state)
// #pragma mark -


-WaitList::WaitList(const char* name)
-{
- fCondition = 0;
- fSem = create_sem(0, name);
-}
-
-
-WaitList::~WaitList()
-{
- delete_sem(fSem);
-}
-
-
-status_t
-WaitList::InitCheck() const
-{
- return fSem;
-}
-
-
-status_t
-WaitList::Wait(MutexLocker& locker, bigtime_t timeout)
-{
- locker.Unlock();
-
- status_t status = B_OK;
-
- while (!atomic_test_and_set(&fCondition, 0, 1)) {
- status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT |
B_CAN_INTERRUPT,
- timeout);
- if (status != B_OK)
- break;
- }
-
- locker.Lock();
- return status;
-}
-
-
-void
-WaitList::Signal()
-{
- atomic_or(&fCondition, 1);
-#ifdef __HAIKU__
- release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL);
-#else
- int32 count;
- if (get_sem_count(fSem, &count) == B_OK && count < 0)
- release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE);
-#endif
-}
-
-
-// #pragma mark -
-
-
TCPEndpoint::TCPEndpoint(net_socket* socket)
:
ProtocolSocket(socket),
fManager(NULL),
- fReceiveList("tcp receive"),
- fSendList("tcp send"),
fOptions(0),
fSendWindowShift(0),
fReceiveWindowShift(0),
@@ -457,6 +399,9 @@ TCPEndpoint::TCPEndpoint(net_socket* socket)
// TODO: to be replaced with a real read/write locking strategy!
mutex_init(&fLock, "tcp lock");

+ fReceiveCondition.Init(this, "tcp receive");
+ fSendCondition.Init(this, "tcp send");
+
gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer,
this);
gStackModule->init_timer(&fRetransmitTimer,
TCPEndpoint::_RetransmitTimer,
this);
@@ -494,12 +439,6 @@ TCPEndpoint::~TCPEndpoint()
status_t
TCPEndpoint::InitCheck() const
{
- if (fReceiveList.InitCheck() < B_OK)
- return fReceiveList.InitCheck();
-
- if (fSendList.InitCheck() < B_OK)
- return fSendList.InitCheck();
-
return B_OK;
}

@@ -551,7 +490,7 @@ TCPEndpoint::Close()
bigtime_t maximum = absolute_timeout(socket->linger *
1000000LL);

while (fSendQueue.Used() > 0) {
- status = fSendList.Wait(locker, maximum);
+ status = _WaitForCondition(fSendCondition, locker,
maximum);
if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
break;
else if (status < B_OK)
@@ -817,7 +756,7 @@ TCPEndpoint::SendData(net_buffer *buffer)
while (left > 0) {
while (fSendQueue.Free() < socket->send.low_water_mark) {
// wait until enough space is available
- status_t status = fSendList.Wait(lock, timeout);
+ status_t status = _WaitForCondition(fSendCondition,
lock, timeout);
if (status < B_OK) {
TRACE(" SendData() returning %s (%d)",
strerror(posix_error(status)),
(int)posix_error(status));
@@ -969,7 +908,7 @@ TCPEndpoint::ReadData(size_t numBytes, uint32 flags,
net_buffer** _buffer)
if ((fFlags & FLAG_NO_RECEIVE) != 0)
return B_OK;

- status_t status = fReceiveList.Wait(locker, timeout);
+ status_t status = _WaitForCondition(fReceiveCondition, locker,
timeout);
if (status < B_OK) {
// The Open Group base specification mentions that
EINTR should be
// returned if the recv() is interrupted before _any
data_ is
@@ -987,7 +926,7 @@ TCPEndpoint::ReadData(size_t numBytes, uint32 flags,
net_buffer** _buffer)
fReceiveQueue.Available());

if (numBytes < fReceiveQueue.Available())
- fReceiveList.Signal();
+ fReceiveCondition.NotifyAll();

bool clone = (flags & MSG_PEEK) != 0;

@@ -1198,7 +1137,7 @@ TCPEndpoint::_MarkEstablished()
release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
}

- fSendList.Signal();
+ fSendCondition.NotifyAll();
gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
}

@@ -1212,7 +1151,7 @@ TCPEndpoint::_WaitForEstablished(MutexLocker &locker,
bigtime_t timeout)
if (socket->error != B_OK)
return socket->error;

- status_t status = fSendList.Wait(locker, timeout);
+ status_t status = _WaitForCondition(fSendCondition, locker,
timeout);
if (status < B_OK)
return status;
}
@@ -1233,7 +1172,7 @@ TCPEndpoint::_Close()

fFlags |= FLAG_DELETE_ON_CLOSE;

- fSendList.Signal();
+ fSendCondition.NotifyAll();
_NotifyReader();

if (gSocketModule->has_parent(socket)) {
@@ -1311,7 +1250,7 @@ TCPEndpoint::_AvailableData() const
void
TCPEndpoint::_NotifyReader()
{
- fReceiveList.Signal();
+ fReceiveCondition.NotifyAll();
gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
}

@@ -2203,7 +2142,7 @@ TCPEndpoint::_Acknowledged(tcp_segment_header& segment)

if (is_writable(fState)) {
// notify threads waiting on the socket to become
writable again
- fSendList.Signal();
+ fSendCondition.NotifyAll();
gSocketModule->notify(socket, B_SELECT_WRITE,
fSendQueue.Free());
}

@@ -2342,6 +2281,21 @@ TCPEndpoint::_TimeWaitTimer(net_timer* timer, void*
_endpoint)
}


+/*static*/ status_t
+TCPEndpoint::_WaitForCondition(ConditionVariable& condition,
+ MutexLocker& locker, bigtime_t timeout)
+{
+ ConditionVariableEntry entry;
+ condition.Add(&entry);
+
+ locker.Unlock();
+ status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
timeout);
+ locker.Lock();
+
+ return result;
+}
+
+
// #pragma mark -


diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.h
b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.h
index 085e173..c49bf47 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.h
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.h
@@ -25,22 +25,6 @@
#include <stddef.h>


-class WaitList {
-public:
- WaitList(const char* name);
- ~WaitList();
-
- status_t InitCheck() const;
-
- status_t Wait(MutexLocker& locker, bigtime_t timeout =
B_INFINITE_TIMEOUT);
- void Signal();
-
-private:
- int32 fCondition;
- sem_id fSem;
-};
-
-
class TCPEndpoint : public net_protocol, public ProtocolSocket {
public:
TCPEndpoint(net_socket* socket);
@@ -132,6 +116,9 @@ private:
static void _DelayedAcknowledgeTimer(net_timer* timer,
void* _endpoint);

+ static status_t _WaitForCondition(ConditionVariable& condition,
+ MutexLocker& locker,
bigtime_t timeout);
+
private:
TCPEndpoint* fConnectionHashLink;
TCPEndpoint* fEndpointHashLink;
@@ -141,8 +128,10 @@ private:

mutex fLock;
EndpointManager* fManager;
- WaitList fReceiveList;
- WaitList fSendList;
+ ConditionVariable
+ fReceiveCondition;
+ ConditionVariable
+ fSendCondition;
sem_id fAcceptSemaphore;
uint8 fOptions;


############################################################################

Commit: 5f7749078e1cf1c7ea9fa091a1c1f1d94ec37ce2
URL: http://cgit.haiku-os.org/haiku/commit/?id=5f7749078e1c
Author: Michael Lotz <mmlr@xxxxxxxx>
Date: Sun Aug 2 16:29:29 2015 UTC

tcp: Fix retransmit logic to avoid lots of spurious retransmits.

The retransmit timer was only stopped when all in flight data was
acknowledged and never updated on individual acknowledgements. This
caused a lot of erroneous retransmits whenever the buffer was filled
fast enough so that the acknowledgements never caught up, i.e. whenever
uploading or streaming data.

Move setting of the initial retransmit timer inside the send loop so it
is closer to the actual time the segment is sent out and simplify the
logic a bit.

Limit the minimal retransmit timeout to 200 msecs to avoid spurious
retransmits in the face of delayed acknowledgements. This is lower than
the 1 second minimum the RFCs suggest. Other stacks use various other
sub-second timeouts; the 200 msecs follows what Linux does.

Also add the exponential back off of the retransmit timeout when
retransmits are triggered. This is bounded by a 60 seconds maximum
according to RFC6298.

----------------------------------------------------------------------------

diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
index 8204458..4fab616 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
@@ -1953,7 +1953,8 @@ TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
}

uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
- tcp_sequence previousSendNext = fSendNext;
+ bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged;
+ bool retransmit = fSendNext < fSendMax;

do {
uint32 segmentMaxSize = fSendMaxSegmentSize
@@ -1968,7 +1969,7 @@ TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
}

// Determine if we should really send this segment
- if (!force && !_ShouldSendSegment(segment, segmentLength,
+ if (!force && !retransmit && !_ShouldSendSegment(segment,
segmentLength,
segmentMaxSize, flightSize)) {
if (fSendQueue.Available()
&&
!gStackModule->is_timer_active(&fPersistTimer)
@@ -2047,23 +2048,24 @@ TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
return status;
}

+ if (shouldStartRetransmitTimer && size > 0) {
+ TRACE("starting initial retransmit timer of: %"
B_PRIdBIGTIME,
+ fRetransmitTimeout);
+ gStackModule->set_timer(&fRetransmitTimer,
fRetransmitTimeout);
+ shouldStartRetransmitTimer = false;
+ }
+
if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
fLastAcknowledgeSent = segment.acknowledge;

length -= segmentLength;
segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
| TCP_FLAG_FINISH);
- } while (length > 0);

- // if we sent data from the beggining of the send queue,
- // start the retransmition timer
- if (previousSendNext == fSendUnacknowledged
- && fSendNext > previousSendNext) {
- TRACE(" SendQueue(): set retransmit timer with rto %"
B_PRIdBIGTIME,
- fRetransmitTimeout);
+ if (retransmit)
+ break;

- gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
- }
+ } while (length > 0);

return B_OK;
}
@@ -2120,24 +2122,34 @@ TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
void
TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
{
- size_t previouslyUsed = fSendQueue.Used();
-
- fSendQueue.RemoveUntil(segment.acknowledge);
- fSendUnacknowledged = segment.acknowledge;
+ TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %"
+ B_PRIu32 "; max %" B_PRIu32, segment.acknowledge,
+ fSendUnacknowledged.Number(), fSendNext.Number(),
fSendMax.Number());

- if (fSendNext < fSendUnacknowledged)
- fSendNext = fSendUnacknowledged;
+ ASSERT(fSendUnacknowledged <= segment.acknowledge);

- if (fSendUnacknowledged == fSendMax)
- gStackModule->cancel_timer(&fRetransmitTimer);
-
- if (fSendQueue.Used() < previouslyUsed) {
- // this ACK acknowledged data
+ if (fSendUnacknowledged < segment.acknowledge) {
+ fSendQueue.RemoveUntil(segment.acknowledge);
+ fSendUnacknowledged = segment.acknowledge;
+ if (fSendNext < fSendUnacknowledged)
+ fSendNext = fSendUnacknowledged;

if (segment.options & TCP_HAS_TIMESTAMPS)

_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply));
else {
- // TODO: Fallback to RFC 793 type estimation
+ // TODO: Fallback to RFC 793 type estimation; This just
resets
+ // any potential exponential back off that happened due
to
+ // retransmits.
+ fRetransmitTimeout = TCP_INITIAL_RTT;
+ }
+
+ if (fSendUnacknowledged == fSendMax) {
+ TRACE("all acknowledged, cancelling retransmission
timer");
+ gStackModule->cancel_timer(&fRetransmitTimer);
+ } else {
+ TRACE("data acknowledged, resetting retransmission
timer to: %"
+ B_PRIdBIGTIME, fRetransmitTimeout);
+ gStackModule->set_timer(&fRetransmitTimer,
fRetransmitTimeout);
}

if (is_writable(fState)) {
@@ -2171,8 +2183,15 @@ void
TCPEndpoint::_Retransmit()
{
TRACE("Retransmit()");
+
_ResetSlowStart();
fSendNext = fSendUnacknowledged;
+
+ // Do exponential back off of the retransmit timeout
+ fRetransmitTimeout *= 2;
+ if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
+ fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
+
_SendQueued();
}

@@ -2192,6 +2211,8 @@ TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)

fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
* kTimestampFactor;
+ if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT)
+ fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT;

TRACE(" RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)",
fRetransmitTimeout, roundTripTime);
diff --git a/src/add-ons/kernel/network/protocols/tcp/tcp.h
b/src/add-ons/kernel/network/protocols/tcp/tcp.h
index 64e13ac..07d4423 100644
--- a/src/add-ons/kernel/network/protocols/tcp/tcp.h
+++ b/src/add-ons/kernel/network/protocols/tcp/tcp.h
@@ -186,6 +186,11 @@ operator==(tcp_sequence a, tcp_sequence b)
#define TCP_MAX_WINDOW 65535
#define TCP_MAX_SEGMENT_LIFETIME 60000000 // 60 secs

+// Minimum retransmit timeout (consider delayed ack)
+#define TCP_MIN_RETRANSMIT_TIMEOUT 200000 // 200 msecs
+// Maximum retransmit timeout (per RFC6298)
+#define TCP_MAX_RETRANSMIT_TIMEOUT 60000000 // 60 secs
+
struct tcp_sack {
uint32 left_edge;
uint32 right_edge;

############################################################################

Commit: 01b0f935ec6177b7f20eeaf0955df0600ced66c3
URL: http://cgit.haiku-os.org/haiku/commit/?id=01b0f935ec61
Author: Michael Lotz <mmlr@xxxxxxxx>
Date: Sun Aug 2 17:05:24 2015 UTC

tcp: Move persist timeout value to a define in the header.

----------------------------------------------------------------------------

diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
index 4fab616..74caa4e 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
@@ -1061,7 +1061,7 @@ TCPEndpoint::SendAcknowledge(bool force)
void
TCPEndpoint::_StartPersistTimer()
{
- gStackModule->set_timer(&fPersistTimer, 1000000LL);
+ gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT);
}


diff --git a/src/add-ons/kernel/network/protocols/tcp/tcp.h
b/src/add-ons/kernel/network/protocols/tcp/tcp.h
index 07d4423..47fe3f0 100644
--- a/src/add-ons/kernel/network/protocols/tcp/tcp.h
+++ b/src/add-ons/kernel/network/protocols/tcp/tcp.h
@@ -185,6 +185,7 @@ operator==(tcp_sequence a, tcp_sequence b)
#define TCP_DEFAULT_MAX_SEGMENT_SIZE 536
#define TCP_MAX_WINDOW 65535
#define TCP_MAX_SEGMENT_LIFETIME 60000000 // 60 secs
+#define TCP_PERSIST_TIMEOUT 1000000 // 1 sec

// Minimum retransmit timeout (consider delayed ack)
#define TCP_MIN_RETRANSMIT_TIMEOUT 200000 // 200 msecs

############################################################################

Commit: 05220224ffc203b62a13169288c178553e98ccb8
URL: http://cgit.haiku-os.org/haiku/commit/?id=05220224ffc2
Author: Michael Lotz <mmlr@xxxxxxxx>
Date: Sun Aug 2 17:10:27 2015 UTC

tcp: Split Timer trace entry into Timer{Set|Triggered}.

Trace whenever a timer is (re-)set as well as when it triggers. A value
of -1 denotes the cancellation of the timer.

----------------------------------------------------------------------------

diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
index 74caa4e..cfbb271 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
@@ -226,12 +226,13 @@ protected:
tcp_state fState;
};

-class Timer : public AbstractTraceEntry {
+class TimerSet : public AbstractTraceEntry {
public:
- Timer(TCPEndpoint* endpoint, const char* which)
+ TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout)
:
fEndpoint(endpoint),
fWhich(which),
+ fTimeout(timeout),
fState(endpoint->State())
{
Initialized();
@@ -239,7 +240,31 @@ public:

virtual void AddDump(TraceOutput& out)
{
- out.Print("tcp:%p (%12s) %s timer", fEndpoint,
+ out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint,
+ name_for_state(fState), fWhich, fTimeout);
+ }
+
+protected:
+ TCPEndpoint* fEndpoint;
+ const char* fWhich;
+ bigtime_t fTimeout;
+ tcp_state fState;
+};
+
+class TimerTriggered : public AbstractTraceEntry {
+public:
+ TimerTriggered(TCPEndpoint* endpoint, const char* which)
+ :
+ fEndpoint(endpoint),
+ fWhich(which),
+ fState(endpoint->State())
+ {
+ Initialized();
+ }
+
+ virtual void AddDump(TraceOutput& out)
+ {
+ out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint,
name_for_state(fState), fWhich);
}

@@ -418,6 +443,7 @@ TCPEndpoint::~TCPEndpoint()

_CancelConnectionTimers();
gStackModule->cancel_timer(&fTimeWaitTimer);
+ T(TimerSet(this, "time-wait", -1));

if (fManager != NULL) {
fManager->Unbind(this);
@@ -1042,11 +1068,13 @@ TCPEndpoint::DelayedAcknowledge()
if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
// timer was active, send an ACK now (with the exception above,
// we send every other ACK)
+ T(TimerSet(this, "delayed ack", -1));
return SendAcknowledge(true);
}

gStackModule->set_timer(&fDelayedAcknowledgeTimer,
TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
+ T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT));
return B_OK;
}

@@ -1062,6 +1090,7 @@ void
TCPEndpoint::_StartPersistTimer()
{
gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT);
+ T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT));
}


@@ -1086,6 +1115,7 @@ void
TCPEndpoint::_UpdateTimeWait()
{
gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
+ T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1));
}


@@ -1093,8 +1123,11 @@ void
TCPEndpoint::_CancelConnectionTimers()
{
gStackModule->cancel_timer(&fRetransmitTimer);
+ T(TimerSet(this, "retransmit", -1));
gStackModule->cancel_timer(&fPersistTimer);
+ T(TimerSet(this, "persist", -1));
gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
+ T(TimerSet(this, "delayed ack", -1));
}


@@ -2052,6 +2085,7 @@ TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
TRACE("starting initial retransmit timer of: %"
B_PRIdBIGTIME,
fRetransmitTimeout);
gStackModule->set_timer(&fRetransmitTimer,
fRetransmitTimeout);
+ T(TimerSet(this, "retransmit", fRetransmitTimeout));
shouldStartRetransmitTimer = false;
}

@@ -2146,10 +2180,12 @@ TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
if (fSendUnacknowledged == fSendMax) {
TRACE("all acknowledged, cancelling retransmission timer");
gStackModule->cancel_timer(&fRetransmitTimer);
+ T(TimerSet(this, "retransmit", -1));
} else {
TRACE("data acknowledged, resetting retransmission timer to: %"
B_PRIdBIGTIME, fRetransmitTimeout);
gStackModule->set_timer(&fRetransmitTimer,
fRetransmitTimeout);
+ T(TimerSet(this, "retransmit", fRetransmitTimeout));
}

if (is_writable(fState)) {
@@ -2235,7 +2271,7 @@ TCPEndpoint::_ResetSlowStart()
TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
{
TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
- T(Timer(endpoint, "retransmit"));
+ T(TimerTriggered(endpoint, "retransmit"));

MutexLocker locker(endpoint->fLock);
if (!locker.IsLocked())
@@ -2249,7 +2285,7 @@ TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
{
TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
- T(Timer(endpoint, "persist"));
+ T(TimerTriggered(endpoint, "persist"));

MutexLocker locker(endpoint->fLock);
if (!locker.IsLocked())
@@ -2267,7 +2303,7 @@ TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
{
TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
- T(Timer(endpoint, "delayed ack"));
+ T(TimerTriggered(endpoint, "delayed ack"));

MutexLocker locker(endpoint->fLock);
if (!locker.IsLocked())
@@ -2285,7 +2321,7 @@ TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
{
TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
- T(Timer(endpoint, "time-wait"));
+ T(TimerTriggered(endpoint, "time-wait"));

MutexLocker locker(endpoint->fLock);
if (!locker.IsLocked())

############################################################################

Commit: bc49140bab90e562dfff8a30f99e647c11779b30
URL: http://cgit.haiku-os.org/haiku/commit/?id=bc49140bab90
Author: Michael Lotz <mmlr@xxxxxxxx>
Date: Sun Aug 2 18:31:50 2015 UTC

tcp: Add APICall trace entry and move TRACEs into locked parts.

The APICall trace entry just records the function name but this is
enough to deduce where some of the state changes come from.

Also move the TRACE macros past the MutexLockers to ensure that their
output happens at the time when the methods actually run.

----------------------------------------------------------------------------

diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
index cfbb271..61f010a 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
@@ -274,6 +274,29 @@ protected:
tcp_state fState;
};

+class APICall : public AbstractTraceEntry {
+public:
+ APICall(TCPEndpoint* endpoint, const char* which)
+ :
+ fEndpoint(endpoint),
+ fWhich(which),
+ fState(endpoint->State())
+ {
+ Initialized();
+ }
+
+ virtual void AddDump(TraceOutput& out)
+ {
+ out.Print("tcp:%p (%12s) api call: %s", fEndpoint,
+ name_for_state(fState), fWhich);
+ }
+
+protected:
+ TCPEndpoint* fEndpoint;
+ const char* fWhich;
+ tcp_state fState;
+};
+
} // namespace TCPTracing

# define T(x) new(std::nothrow) TCPTracing::x
@@ -434,6 +457,8 @@ TCPEndpoint::TCPEndpoint(net_socket* socket)
TCPEndpoint::_DelayedAcknowledgeTimer, this);
gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
this);
+
+ T(APICall(this, "constructor"));
}


@@ -441,6 +466,8 @@ TCPEndpoint::~TCPEndpoint()
{
mutex_lock(&fLock);

+ T(APICall(this, "destructor"));
+
_CancelConnectionTimers();
gStackModule->cancel_timer(&fTimeWaitTimer);
T(TimerSet(this, "time-wait", -1));
@@ -476,6 +503,7 @@ status_t
TCPEndpoint::Open()
{
TRACE("Open()");
+ T(APICall(this, "open"));

status_t status = ProtocolSocket::Open();
if (status < B_OK)
@@ -492,10 +520,11 @@ TCPEndpoint::Open()
status_t
TCPEndpoint::Close()
{
- TRACE("Close()");
-
MutexLocker locker(fLock);

+ TRACE("Close()");
+ T(APICall(this, "close"));
+
if (fState == LISTEN)
delete_sem(fAcceptSemaphore);

@@ -533,10 +562,11 @@ TCPEndpoint::Close()
void
TCPEndpoint::Free()
{
- TRACE("Free()");
-
MutexLocker _(fLock);

+ TRACE("Free()");
+ T(APICall(this, "free"));
+
if (fState <= SYNCHRONIZE_SENT)
return;

@@ -557,13 +587,14 @@ TCPEndpoint::Free()
status_t
TCPEndpoint::Connect(const sockaddr* address)
{
- TRACE("Connect() on address %s", PrintAddress(address));
-
if (!AddressModule()->is_same_family(address))
return EAFNOSUPPORT;

MutexLocker locker(fLock);

+ TRACE("Connect() on address %s", PrintAddress(address));
+ T(APICall(this, "connect"));
+
if (gStackModule->is_restarted_syscall()) {
bigtime_t timeout =
gStackModule->restore_syscall_restart_timeout();
status_t status = _WaitForEstablished(locker, timeout);
@@ -635,10 +666,11 @@ TCPEndpoint::Connect(const sockaddr* address)
status_t
TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
{
- TRACE("Accept()");
-
MutexLocker locker(fLock);

+ TRACE("Accept()");
+ T(APICall(this, "accept"));
+
status_t status;
bigtime_t timeout = absolute_timeout(socket->receive.timeout);
if (gStackModule->is_restarted_syscall())
@@ -679,6 +711,7 @@ TCPEndpoint::Bind(const sockaddr *address)
MutexLocker lock(fLock);

TRACE("Bind() on address %s", PrintAddress(address));
+ T(APICall(this, "bind"));

if (fState != CLOSED)
return EISCONN;
@@ -690,9 +723,11 @@ TCPEndpoint::Bind(const sockaddr *address)
status_t
TCPEndpoint::Unbind(struct sockaddr *address)
{
+ MutexLocker _(fLock);
+
TRACE("Unbind()");
+ T(APICall(this, "unbind"));

- MutexLocker _(fLock);
return fManager->Unbind(this);
}

@@ -700,10 +735,11 @@ TCPEndpoint::Unbind(struct sockaddr *address)
status_t
TCPEndpoint::Listen(int count)
{
- TRACE("Listen()");
-
MutexLocker _(fLock);

+ TRACE("Listen()");
+ T(APICall(this, "listen"));
+
if (fState != CLOSED && fState != LISTEN)
return B_BAD_VALUE;

@@ -731,10 +767,11 @@ TCPEndpoint::Listen(int count)
status_t
TCPEndpoint::Shutdown(int direction)
{
- TRACE("Shutdown(%i)", direction);
-
MutexLocker lock(fLock);

+ TRACE("Shutdown(%i)", direction);
+ T(APICall(this, "shutdown"));
+
if (direction == SHUT_RD || direction == SHUT_RDWR)
fFlags |= FLAG_NO_RECEIVE;

@@ -757,6 +794,7 @@ TCPEndpoint::SendData(net_buffer *buffer)
TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
buffer->size, buffer->flags, fSendQueue.Size(),
fSendQueue.Free());
+ T(APICall(this, "senddata"));

uint32 flags = buffer->flags;

@@ -856,6 +894,7 @@ TCPEndpoint::SendAvailable()
available = EPIPE;

TRACE("SendAvailable(): %" B_PRIdSSIZE, available);
+ T(APICall(this, "sendavailable"));
return available;
}

@@ -876,10 +915,11 @@ TCPEndpoint::FillStat(net_stat *stat)
status_t
TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
{
+ MutexLocker locker(fLock);
+
TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes,
flags);
-
- MutexLocker locker(fLock);
+ T(APICall(this, "readdata"));

*_buffer = NULL;

@@ -975,6 +1015,7 @@ TCPEndpoint::ReadAvailable()
MutexLocker locker(fLock);

TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData());
+ T(APICall(this, "readavailable"));

return _AvailableData();
}

############################################################################

Commit: 94fb06bfce1f654a262e988fe224be42ffa4f8f1
URL: http://cgit.haiku-os.org/haiku/commit/?id=94fb06bfce1f
Author: Michael Lotz <mmlr@xxxxxxxx>
Date: Sun Aug 2 18:42:53 2015 UTC

tcp: Fix early cancellation of timers on socket free.

TCPEndpoint::Free() uses _EnterTimeWait() to start the time-wait timer
for later cleanup. However, the latter called _CancelConnectionTimers()
unconditionally, which also cancelled a retransmit timer that was
possibly still needed for the retransmission of the FIN packet. If the
FIN packet got lost, the connection would be left open on the other end.

----------------------------------------------------------------------------

diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
index 61f010a..4762006 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
@@ -1138,14 +1138,16 @@ TCPEndpoint::_StartPersistTimer()
void
TCPEndpoint::_EnterTimeWait()
{
- TRACE("_EnterTimeWait()\n");
+ TRACE("_EnterTimeWait()");

- _CancelConnectionTimers();
+ if (fState == TIME_WAIT) {
+ _CancelConnectionTimers();

- if (fState == TIME_WAIT && IsLocal()) {
- // we do not use TIME_WAIT state for local connections
- fFlags |= FLAG_DELETE_ON_CLOSE;
- return;
+ if (IsLocal()) {
+ // we do not use TIME_WAIT state for local connections
+ fFlags |= FLAG_DELETE_ON_CLOSE;
+ return;
+ }
}

_UpdateTimeWait();

############################################################################

Commit: bed94ebc49e2b99d39a4a1ef1880d4d7a47a0c6d
URL: http://cgit.haiku-os.org/haiku/commit/?id=bed94ebc49e2
Author: Michael Lotz <mmlr@xxxxxxxx>
Date: Sun Aug 2 19:20:39 2015 UTC

tcp: Change timestamp factor from 1024 to 1000.

It is the conversion factor between the millisecond TCP time and the
microsecond system time, so 1024 does not make much sense.

----------------------------------------------------------------------------

diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
index 4762006..b4f52dd 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
@@ -321,7 +321,8 @@ enum {
};


-static const int kTimestampFactor = 1024;
+static const int kTimestampFactor = 1000;
+ // conversion factor between usec system time and msec tcp time


static inline bigtime_t

############################################################################

Revision: hrev49502
Commit: 8982293017b132c73aff734fa8baa7afd261e125
URL: http://cgit.haiku-os.org/haiku/commit/?id=8982293017b1
Author: Michael Lotz <mmlr@xxxxxxxx>
Date: Sun Aug 2 19:18:02 2015 UTC

tcp: Whitespace cleanup, move a define to header and fix a typo.

----------------------------------------------------------------------------

diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
index b4f52dd..d941d6e 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp
@@ -37,8 +37,8 @@


// References:
-// - RFC 793 - Transmission Control Protocol
-// - RFC 813 - Window and Acknowledgement Strategy in TCP
+// - RFC 793 - Transmission Control Protocol
+// - RFC 813 - Window and Acknowledgement Strategy in TCP
// - RFC 1337 - TIME_WAIT Assassination Hazards in TCP
//
// Things this implementation currently doesn't implement:
@@ -304,8 +304,6 @@ protected:
# define T(x)
#endif // TCP_TRACING

-// Initial estimate for packet round trip time (RTT)
-#define TCP_INITIAL_RTT 2000000

// constants for the fFlags field
enum {
@@ -1280,8 +1278,7 @@ TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)

if (fDuplicateAcknowledgeCount == 3) {
_ResetSlowStart();
- fCongestionWindow = fSlowStartThreshold + 3
- * fSendMaxSegmentSize;
+ fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize;
fSendNext = segment.acknowledge;
} else if (fDuplicateAcknowledgeCount > 3)
fCongestionWindow += fSendMaxSegmentSize;
@@ -2253,7 +2250,7 @@ TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
fCongestionWindow += increment;
}

- // if there is data left to be send, send it now
+ // if there is data left to be sent, send it now
if (fSendQueue.Used() > 0)
_SendQueued();
}
diff --git a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.h b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.h
index c49bf47..1ff167b 100644
--- a/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.h
+++ b/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.h
@@ -150,7 +150,7 @@ private:
tcp_sequence fInitialSendSequence;
uint32 fDuplicateAcknowledgeCount;

- net_route *fRoute;
+ net_route *fRoute;
// TODO: don't use a net_route, but a net_route_info!!!
// (the latter will automatically adapt to routing changes)

diff --git a/src/add-ons/kernel/network/protocols/tcp/tcp.h b/src/add-ons/kernel/network/protocols/tcp/tcp.h
index 47fe3f0..6f30ec2 100644
--- a/src/add-ons/kernel/network/protocols/tcp/tcp.h
+++ b/src/add-ons/kernel/network/protocols/tcp/tcp.h
@@ -43,30 +43,30 @@ enum tcp_state {
};

struct tcp_header {
- uint16 source_port;
- uint16 destination_port;
- uint32 sequence;
- uint32 acknowledge;
+ uint16 source_port;
+ uint16 destination_port;
+ uint32 sequence;
+ uint32 acknowledge;
struct {
#if B_HOST_IS_LENDIAN == 1
- uint8 reserved : 4;
- uint8 header_length : 4;
+ uint8 reserved : 4;
+ uint8 header_length : 4;
#else
- uint8 header_length : 4;
- uint8 reserved : 4;
+ uint8 header_length : 4;
+ uint8 reserved : 4;
#endif
};
- uint8 flags;
- uint16 advertised_window;
- uint16 checksum;
- uint16 urgent_offset;
-
- uint32 HeaderLength() const { return (uint32)header_length << 2; }
- uint32 Sequence() const { return ntohl(sequence); }
- uint32 Acknowledge() const { return ntohl(acknowledge); }
- uint16 AdvertisedWindow() const { return ntohs(advertised_window); }
- uint16 Checksum() const { return ntohs(checksum); }
- uint16 UrgentOffset() const { return ntohs(urgent_offset); }
+ uint8 flags;
+ uint16 advertised_window;
+ uint16 checksum;
+ uint16 urgent_offset;
+
+ uint32 HeaderLength() const { return (uint32)header_length << 2; }
+ uint32 Sequence() const { return ntohl(sequence); }
+ uint32 Acknowledge() const { return ntohl(acknowledge); }
+ uint16 AdvertisedWindow() const { return ntohs(advertised_window); }
+ uint16 Checksum() const { return ntohs(checksum); }
+ uint16 UrgentOffset() const { return ntohs(urgent_offset); }
} _PACKED;

class tcp_sequence {
@@ -187,6 +187,8 @@ operator==(tcp_sequence a, tcp_sequence b)
#define TCP_MAX_SEGMENT_LIFETIME 60000000 // 60 secs
#define TCP_PERSIST_TIMEOUT 1000000 // 1 sec

+// Initial estimate for packet round trip time (RTT)
+#define TCP_INITIAL_RTT 2000000 // 2 secs
// Minimum retransmit timeout (consider delayed ack)
#define TCP_MIN_RETRANSMIT_TIMEOUT 200000 // 200 msecs
// Maximum retransmit timeout (per RFC6298)


Other related posts: