From f60a19752749bf1d28031e78c46f8924325ffcdd Mon Sep 17 00:00:00 2001
From: Guus Sliepen <guus@meshlink.io>
Date: Sun, 17 Aug 2014 21:54:20 +0200
Subject: [PATCH] Start implementation of congestion avoidance.

We try to implement RFC 2001.
---
 README      |  41 ++++++++++++++++++++
 utcp.c      | 105 ++++++++++++++++++++++++++++++++--------------------
 utcp_priv.h |  36 +++++++++++++++---
 3 files changed, 137 insertions(+), 45 deletions(-)

diff --git a/README b/README
index 0f91f87..6e14a31 100644
--- a/README
+++ b/README
@@ -191,3 +191,44 @@ RECEIVE PACKET
 - Process state changes due to SYN
 - Send new data to application
 - Process state changes due to FIN
+
+CONGESTION AVOIDANCE
+--------------------
+
+We want to send as much packets as possible that won't cause any packets to be
+dropped.  So we should not send more than the available bandwidth, and not more
+in one go than buffers along the path can handle.
+
+To start, we use "self-clocking". We send one packet, and wait for an ACK
+before sending another packet. On a network with a finite bandwidth but zero
+delay (latency), this will send packets as efficiently as possible. We don't
+need any timers to control the outgoing packet rate, that's why we call this
+self-clocked. However, latency is non-zero, and this means a number of packets
+is always on the way between the sender and receiver. The amount of packets
+"inbetween" is in principle the bandwidth times the delay (bandwidth-delay
+product, or BDP).
+
+Delay is fairly easy to measure (equal to half the round-trip time of a packet,
+which in TCP is easily obtained from the SYN and SYNACK pair, or the ACK in
+response of a segment), however bandwidth is more difficult and might change
+more rapidly than the latency.
+
+Back to the "inbetween" packets: ideally we would like to fill the available
+inbetween space completely. It should be easy to see that in that case,
+self-clocking will still work as intended. Our estimate of the amount of
+packets in the inbetween space is called the congestion window (CWND).  If we
+know the BDP, we can set the CWND to it, however if we don't know it, we can
+start with a small CWND and gradually increase it (for example, every time we
+receive an ACK, send the next 2 segments). At some point, we will start sending
+at a higher rate than the available bandwidth, in which case packets will
+inevitably be lost. We detect that because we do not receive an ACK for our
+data, and then we have to reduce the CWND (for example, by half).
+
+The trick is to choose an algorithm that best keeps the CWND to the effective
+BDP.
+
+A nice introduction is RFC 2001.
+
+snd.cwnd: size of the congestion window.
+snd.nxt - snd.una: number of unacknowledged bytes, = number of bytes in flight.
+snd.cwnd - (snd.nxt - snd.una): unused size of congestion window
diff --git a/utcp.c b/utcp.c
index 8ff9553..454a497 100644
--- a/utcp.c
+++ b/utcp.c
@@ -181,6 +181,8 @@ static struct utcp_connection *allocate_connection(struct utcp *utcp, uint16_t s
 	c->snd.una = c->snd.iss;
 	c->snd.nxt = c->snd.iss + 1;
 	c->rcv.wnd = utcp->mtu;
+	c->snd.last = c->snd.nxt;
+	c->snd.cwnd = utcp->mtu;
 	c->utcp = utcp;
 
 	// Add it to the sorted list of connections
@@ -230,6 +232,46 @@ void utcp_accept(struct utcp_connection *c, utcp_recv_t recv, void *priv) {
 	set_state(c, ESTABLISHED);
 }
 
+static void ack(struct utcp_connection *c, bool sendatleastone) {
+	uint32_t left = seqdiff(c->snd.last, c->snd.nxt);
+	int32_t cwndleft = c->snd.cwnd - seqdiff(c->snd.nxt, c->snd.una);
+	char *data = c->sndbuf + seqdiff(c->snd.nxt, c->snd.una);
+
+	if(cwndleft <= 0)
+		cwndleft = 0;
+
+	if(cwndleft < left)
+		left = cwndleft;
+
+	if(!left && !sendatleastone)
+		return;
+
+	struct {
+		struct hdr hdr;
+		char data[c->utcp->mtu];
+	} pkt;
+
+	pkt.hdr.src = c->src;
+	pkt.hdr.dst = c->dst;
+	pkt.hdr.ack = c->rcv.nxt;
+	pkt.hdr.wnd = c->snd.wnd;
+	pkt.hdr.ctl = ACK;
+
+	do {
+		uint32_t seglen = left > c->utcp->mtu ? c->utcp->mtu : left;
+		pkt.hdr.seq = c->snd.nxt;
+
+		memcpy(pkt.data, data, seglen);
+
+		c->snd.nxt += seglen;
+		data += seglen;
+		left -= seglen;
+
+		print_packet(c->utcp, "send", &pkt, sizeof pkt.hdr + seglen);
+		c->utcp->send(c->utcp, &pkt, sizeof pkt.hdr + seglen);
+	} while(left);
+}
+
 ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
 	if(c->reapable) {
 		debug("Error: send() called on closed connection %p\n", c);
@@ -277,11 +319,16 @@ ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
 	 */
 
 	if(len > c->sndbufsize - bufused && c->sndbufsize < c->maxsndbufsize) {
+		uint32_t newbufsize;
 		if(c->sndbufsize > c->maxsndbufsize / 2)
-			c->sndbufsize = c->maxsndbufsize;
+			newbufsize = c->maxsndbufsize;
 		else
-			c->sndbufsize *= 2;
-		c->sndbuf = realloc(c->sndbuf, c->sndbufsize);
+			newbufsize = c->sndbufsize * 2;
+		char *newbuf = realloc(c->sndbuf, newbufsize);
+		if(newbuf) {
+			c->sndbuf = newbuf;
+			c->sndbufsize = newbufsize;
+		}
 	}
 
 	if(len > c->sndbufsize - bufused)
@@ -293,37 +340,9 @@ ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
 	}
 
 	memcpy(c->sndbuf + bufused, data, len);
+	c->snd.last += len;
 
-	// Send segments
-
-	struct {
-		struct hdr hdr;
-		char data[c->utcp->mtu];
-	} pkt;
-
-	pkt.hdr.src = c->src;
-	pkt.hdr.dst = c->dst;
-	pkt.hdr.ack = c->rcv.nxt;
-	pkt.hdr.wnd = c->snd.wnd;
-	pkt.hdr.ctl = ACK;
-
-	uint32_t left = len;
-
-	while(left) {
-		uint32_t seglen = left > c->utcp->mtu ? c->utcp->mtu : left;
-		pkt.hdr.seq = c->snd.nxt;
-
-		memcpy(pkt.data, data, seglen);
-
-		c->snd.nxt += seglen;
-		data += seglen;
-		left -= seglen;
-
-		print_packet(c->utcp, "send", &pkt, sizeof pkt.hdr + seglen);
-		c->utcp->send(c->utcp, &pkt, sizeof pkt.hdr + seglen);
-	}
-
-	fprintf(stderr, "len=%zu\n", len);
+	ack(c, false);
 	return len;
 }
 
@@ -561,6 +580,18 @@ int utcp_recv(struct utcp *utcp, const void *data, size_t len) {
 		uint32_t left = seqdiff(c->snd.nxt, hdr.ack);
 		if(left)
 			memmove(c->sndbuf, c->sndbuf + advanced, left);
+		c->dupack = 0;
+		c->snd.cwnd += utcp->mtu;
+		if(c->snd.cwnd > c->maxsndbufsize)
+			c->snd.cwnd = c->maxsndbufsize;
+	} else {
+		if(!len) {
+			c->dupack++;
+			if(c->dupack >= 3) {
+				debug("Triplicate ACK\n");
+				abort();
+			}
+		}
 	}
 
 	// 4. Update timers
@@ -699,13 +730,7 @@ int utcp_recv(struct utcp *utcp, const void *data, size_t len) {
 		return 0;
 
 ack:
-	hdr.src = c->src;
-	hdr.dst = c->dst;
-	hdr.seq = c->snd.nxt;
-	hdr.ack = c->rcv.nxt;
-	hdr.ctl = ACK;
-	print_packet(c->utcp, "send", &hdr, sizeof hdr);
-	utcp->send(utcp, &hdr, sizeof hdr);
+	ack(c, true);
 	return 0;
 
 reset:
diff --git a/utcp_priv.h b/utcp_priv.h
index 24ae4cc..184e338 100644
--- a/utcp_priv.h
+++ b/utcp_priv.h
@@ -74,22 +74,27 @@ static const char *strstate[] __attribute__((unused)) = {
 struct utcp_connection {
 	void *priv;
 	struct utcp *utcp;
+
 	bool reapable;
 
-	bool nodelay;
-	bool keepalive;
+	// Callbacks
+
+	utcp_recv_t recv;
+
+	// TCP State
 
 	uint16_t src;
 	uint16_t dst;
 	enum state state;
 
-	// The following two structures form the TCB
-
 	struct {
 		uint32_t una;
 		uint32_t nxt;
 		uint32_t wnd;
 		uint32_t iss;
+
+		uint32_t last;
+		uint32_t cwnd;
 	} snd;
 
 	struct {
@@ -98,26 +103,47 @@ struct utcp_connection {
 		uint32_t irs;
 	} rcv;
 
-	utcp_recv_t recv;
+	int dupack;
+
+	// Timers
 
 	struct timeval conn_timeout;
 	struct timeval rtrx_timeout;
 
+	// Send buffer
+
 	char *sndbuf;
+	uint32_t sndbufused;
 	uint32_t sndbufsize;
 	uint32_t maxsndbufsize;
+
+	// Per-socket options
+
+	bool nodelay;
+	bool keepalive;
+
+	// Congestion avoidance state
+
+	struct timeval tlast;
+	uint64_t bandwidth;
 };
 
 struct utcp {
 	void *priv;
 
+	// Callbacks
+
 	utcp_accept_t accept;
 	utcp_pre_accept_t pre_accept;
 	utcp_send_t send;
 
+	// Global socket options
+
 	uint16_t mtu;
 	int timeout;
 
+	// Connection management
+
 	struct utcp_connection **connections;
 	int nconnections;
 	int nallocated;
-- 
2.47.3