From: Guus Sliepen Date: Sun, 17 Aug 2014 19:54:20 +0000 (+0200) Subject: Start implementation of congestion avoidance. X-Git-Url: http://git.meshlink.io/?p=utcp;a=commitdiff_plain;h=f60a19752749bf1d28031e78c46f8924325ffcdd Start implementation of congestion avoidance. We try to implement RFC 2001. --- diff --git a/README b/README index 0f91f87..6e14a31 100644 --- a/README +++ b/README @@ -191,3 +191,44 @@ RECEIVE PACKET - Process state changes due to SYN - Send new data to application - Process state changes due to FIN + +CONGESTION AVOIDANCE +-------------------- + +We want to send as many packets as possible that won't cause any packets to be +dropped. So we should not send more than the available bandwidth, and not more +in one go than buffers along the path can handle. + +To start, we use "self-clocking". We send one packet, and wait for an ACK +before sending another packet. On a network with a finite bandwidth but zero +delay (latency), this will send packets as efficiently as possible. We don't +need any timers to control the outgoing packet rate, that's why we call this +self-clocked. However, latency is non-zero, and this means a number of packets +is always on the way between the sender and receiver. The number of packets +"inbetween" is in principle the bandwidth times the delay (bandwidth-delay +product, or BDP). + +Delay is fairly easy to measure (equal to half the round-trip time of a packet, +which in TCP is easily obtained from the SYN and SYNACK pair, or the ACK in +response to a segment), however bandwidth is more difficult and might change +more rapidly than the latency. + +Back to the "inbetween" packets: ideally we would like to fill the available +inbetween space completely. It should be easy to see that in that case, +self-clocking will still work as intended. Our estimate of the number of +packets in the inbetween space is called the congestion window (CWND). 
If we +know the BDP, we can set the CWND to it, however if we don't know it, we can +start with a small CWND and gradually increase it (for example, every time we +receive an ACK, send the next 2 segments). At some point, we will start sending +at a higher rate than the available bandwidth, in which case packets will +inevitably be lost. We detect that because we do not receive an ACK for our +data, and then we have to reduce the CWND (for example, by half). + +The trick is to choose an algorithm that best keeps the CWND to the effective +BDP. + +A nice introduction is RFC 2001. + +snd.cwnd: size of the congestion window. +snd.nxt - snd.una: number of unacknowledged bytes, = number of bytes in flight. +snd.cwnd - (snd.nxt - snd.una): unused size of congestion window diff --git a/utcp.c b/utcp.c index 8ff9553..454a497 100644 --- a/utcp.c +++ b/utcp.c @@ -181,6 +181,8 @@ static struct utcp_connection *allocate_connection(struct utcp *utcp, uint16_t s c->snd.una = c->snd.iss; c->snd.nxt = c->snd.iss + 1; c->rcv.wnd = utcp->mtu; + c->snd.last = c->snd.nxt; + c->snd.cwnd = utcp->mtu; c->utcp = utcp; // Add it to the sorted list of connections @@ -230,6 +232,46 @@ void utcp_accept(struct utcp_connection *c, utcp_recv_t recv, void *priv) { set_state(c, ESTABLISHED); } +static void ack(struct utcp_connection *c, bool sendatleastone) { + uint32_t left = seqdiff(c->snd.last, c->snd.nxt); + int32_t cwndleft = c->snd.cwnd - seqdiff(c->snd.nxt, c->snd.una); + char *data = c->sndbuf + seqdiff(c->snd.nxt, c->snd.una); + + if(cwndleft <= 0) + cwndleft = 0; + + if(cwndleft < left) + left = cwndleft; + + if(!left && !sendatleastone) + return; + + struct { + struct hdr hdr; + char data[c->utcp->mtu]; + } pkt; + + pkt.hdr.src = c->src; + pkt.hdr.dst = c->dst; + pkt.hdr.ack = c->rcv.nxt; + pkt.hdr.wnd = c->snd.wnd; + pkt.hdr.ctl = ACK; + + do { + uint32_t seglen = left > c->utcp->mtu ? 
c->utcp->mtu : left; + pkt.hdr.seq = c->snd.nxt; + + memcpy(pkt.data, data, seglen); + + c->snd.nxt += seglen; + data += seglen; + left -= seglen; + + print_packet(c->utcp, "send", &pkt, sizeof pkt.hdr + seglen); + c->utcp->send(c->utcp, &pkt, sizeof pkt.hdr + seglen); + } while(left); +} + ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) { if(c->reapable) { debug("Error: send() called on closed connection %p\n", c); @@ -277,11 +319,16 @@ ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) { */ if(len > c->sndbufsize - bufused && c->sndbufsize < c->maxsndbufsize) { + uint32_t newbufsize; if(c->sndbufsize > c->maxsndbufsize / 2) - c->sndbufsize = c->maxsndbufsize; + newbufsize = c->maxsndbufsize; else - c->sndbufsize *= 2; - c->sndbuf = realloc(c->sndbuf, c->sndbufsize); + newbufsize = c->sndbufsize * 2; + char *newbuf = realloc(c->sndbuf, newbufsize); + if(newbuf) { + c->sndbuf = newbuf; + c->sndbufsize = newbufsize; + } } if(len > c->sndbufsize - bufused) @@ -293,37 +340,9 @@ ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) { } memcpy(c->sndbuf + bufused, data, len); + c->snd.last += len; - // Send segments - - struct { - struct hdr hdr; - char data[c->utcp->mtu]; - } pkt; - - pkt.hdr.src = c->src; - pkt.hdr.dst = c->dst; - pkt.hdr.ack = c->rcv.nxt; - pkt.hdr.wnd = c->snd.wnd; - pkt.hdr.ctl = ACK; - - uint32_t left = len; - - while(left) { - uint32_t seglen = left > c->utcp->mtu ? 
c->utcp->mtu : left; - pkt.hdr.seq = c->snd.nxt; - - memcpy(pkt.data, data, seglen); - - c->snd.nxt += seglen; - data += seglen; - left -= seglen; - - print_packet(c->utcp, "send", &pkt, sizeof pkt.hdr + seglen); - c->utcp->send(c->utcp, &pkt, sizeof pkt.hdr + seglen); - } - - fprintf(stderr, "len=%zu\n", len); + ack(c, false); return len; } @@ -561,6 +580,18 @@ int utcp_recv(struct utcp *utcp, const void *data, size_t len) { uint32_t left = seqdiff(c->snd.nxt, hdr.ack); if(left) memmove(c->sndbuf, c->sndbuf + advanced, left); + c->dupack = 0; + c->snd.cwnd += utcp->mtu; + if(c->snd.cwnd > c->maxsndbufsize) + c->snd.cwnd = c->maxsndbufsize; + } else { + if(!len) { + c->dupack++; + if(c->dupack >= 3) { + debug("Triplicate ACK\n"); + abort(); + } + } } // 4. Update timers @@ -699,13 +730,7 @@ int utcp_recv(struct utcp *utcp, const void *data, size_t len) { return 0; ack: - hdr.src = c->src; - hdr.dst = c->dst; - hdr.seq = c->snd.nxt; - hdr.ack = c->rcv.nxt; - hdr.ctl = ACK; - print_packet(c->utcp, "send", &hdr, sizeof hdr); - utcp->send(utcp, &hdr, sizeof hdr); + ack(c, true); return 0; reset: diff --git a/utcp_priv.h b/utcp_priv.h index 24ae4cc..184e338 100644 --- a/utcp_priv.h +++ b/utcp_priv.h @@ -74,22 +74,27 @@ static const char *strstate[] __attribute__((unused)) = { struct utcp_connection { void *priv; struct utcp *utcp; + bool reapable; - bool nodelay; - bool keepalive; + // Callbacks + + utcp_recv_t recv; + + // TCP State uint16_t src; uint16_t dst; enum state state; - // The following two structures form the TCB - struct { uint32_t una; uint32_t nxt; uint32_t wnd; uint32_t iss; + + uint32_t last; + uint32_t cwnd; } snd; struct { @@ -98,26 +103,47 @@ struct utcp_connection { uint32_t irs; } rcv; - utcp_recv_t recv; + int dupack; + + // Timers struct timeval conn_timeout; struct timeval rtrx_timeout; + // Send buffer + char *sndbuf; + uint32_t sndbufused; uint32_t sndbufsize; uint32_t maxsndbufsize; + + // Per-socket options + + bool nodelay; + bool 
keepalive; + + // Congestion avoidance state + + struct timeval tlast; + uint64_t bandwidth; }; struct utcp { void *priv; + // Callbacks + utcp_accept_t accept; utcp_pre_accept_t pre_accept; utcp_send_t send; + // Global socket options + uint16_t mtu; int timeout; + // Connection management + struct utcp_connection **connections; int nconnections; int nallocated;