]> git.meshlink.io Git - utcp/blob - utcp.c
Avoid sending packets smaller than the MTU if we don't need to.
[utcp] / utcp.c
1 /*
2     utcp.c -- Userspace TCP
3     Copyright (C) 2014-2017 Guus Sliepen <guus@tinc-vpn.org>
4
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14
15     You should have received a copy of the GNU General Public License along
16     with this program; if not, write to the Free Software Foundation, Inc.,
17     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20 #define _GNU_SOURCE
21
22 #include <assert.h>
23 #include <errno.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <sys/time.h>
31 #include <sys/socket.h>
32
33 #include "utcp_priv.h"
34
35 #ifndef EBADMSG
36 #define EBADMSG         104
37 #endif
38
39 #ifndef SHUT_RDWR
40 #define SHUT_RDWR 2
41 #endif
42
43 #ifdef poll
44 #undef poll
45 #endif
46
#ifndef timersub
/* Fallback for platforms without timersub(): store a - b in r, borrowing one
 * second whenever the microsecond difference goes negative. */
#define timersub(a, b, r)\
	do {\
		(r)->tv_sec = (a)->tv_sec - (b)->tv_sec;\
		(r)->tv_usec = (a)->tv_usec - (b)->tv_usec;\
		if((r)->tv_usec < 0) {\
			(r)->tv_sec--;\
			(r)->tv_usec += USEC_PER_SEC;\
		}\
	} while (0)
#endif
56
// Return the smaller of two sizes.
static inline size_t min(size_t a, size_t b) {
	if(b < a) {
		return b;
	}

	return a;
}
60
// Return the larger of two sizes.
static inline size_t max(size_t a, size_t b) {
	if(b > a) {
		return b;
	}

	return a;
}
64
65 #ifdef UTCP_DEBUG
66 #include <stdarg.h>
67
// printf()-style debug output, written to stderr.
static void debug(const char *format, ...) {
	va_list args;

	va_start(args, format);
	vfprintf(stderr, format, args);
	va_end(args);
}
74
75 static void print_packet(struct utcp *utcp, const char *dir, const void *pkt, size_t len) {
76         struct hdr hdr;
77
78         if(len < sizeof(hdr)) {
79                 debug("%p %s: short packet (%lu bytes)\n", utcp, dir, (unsigned long)len);
80                 return;
81         }
82
83         memcpy(&hdr, pkt, sizeof(hdr));
84         debug("%p %s: len=%lu, src=%u dst=%u seq=%u ack=%u wnd=%u aux=%x ctl=", utcp, dir, (unsigned long)len, hdr.src, hdr.dst, hdr.seq, hdr.ack, hdr.wnd, hdr.aux);
85
86         if(hdr.ctl & SYN) {
87                 debug("SYN");
88         }
89
90         if(hdr.ctl & RST) {
91                 debug("RST");
92         }
93
94         if(hdr.ctl & FIN) {
95                 debug("FIN");
96         }
97
98         if(hdr.ctl & ACK) {
99                 debug("ACK");
100         }
101
102         if(len > sizeof(hdr)) {
103                 uint32_t datalen = len - sizeof(hdr);
104                 const uint8_t *data = (uint8_t *)pkt + sizeof(hdr);
105                 char str[datalen * 2 + 1];
106                 char *p = str;
107
108                 for(uint32_t i = 0; i < datalen; i++) {
109                         *p++ = "0123456789ABCDEF"[data[i] >> 4];
110                         *p++ = "0123456789ABCDEF"[data[i] & 15];
111                 }
112
113                 *p = 0;
114
115                 debug(" data=%s", str);
116         }
117
118         debug("\n");
119 }
120
121 static void debug_cwnd(struct utcp_connection *c) {
122         debug("snd.cwnd = %u\n", c->snd.cwnd);
123 }
124 #else
125 #define debug(...) do {} while(0)
126 #define print_packet(...) do {} while(0)
127 #define debug_cwnd(...) do {} while(0)
128 #endif
129
130 static void set_state(struct utcp_connection *c, enum state state) {
131         c->state = state;
132
133         if(state == ESTABLISHED) {
134                 timerclear(&c->conn_timeout);
135         }
136
137         debug("%p new state: %s\n", c->utcp, strstate[state]);
138 }
139
140 static bool fin_wanted(struct utcp_connection *c, uint32_t seq) {
141         if(seq != c->snd.last) {
142                 return false;
143         }
144
145         switch(c->state) {
146         case FIN_WAIT_1:
147         case CLOSING:
148         case LAST_ACK:
149                 return true;
150
151         default:
152                 return false;
153         }
154 }
155
156 static bool is_reliable(struct utcp_connection *c) {
157         return c->flags & UTCP_RELIABLE;
158 }
159
// Signed distance from sequence number b to sequence number a.
// The subtraction is done modulo 2^32, so it keeps working across wraparound.
static int32_t seqdiff(uint32_t a, uint32_t b) {
	uint32_t delta = a - b;
	return delta;
}
163
164 // Buffer functions
165 // TODO: convert to ringbuffers to avoid memmove() operations.
166
167 // Store data into the buffer
168 static ssize_t buffer_put_at(struct buffer *buf, size_t offset, const void *data, size_t len) {
169         debug("buffer_put_at %lu %lu %lu\n", (unsigned long)buf->used, (unsigned long)offset, (unsigned long)len);
170
171         size_t required = offset + len;
172
173         if(required > buf->maxsize) {
174                 if(offset >= buf->maxsize) {
175                         return 0;
176                 }
177
178                 len = buf->maxsize - offset;
179                 required = buf->maxsize;
180         }
181
182         if(required > buf->size) {
183                 size_t newsize = buf->size;
184
185                 if(!newsize) {
186                         newsize = required;
187                 } else {
188                         do {
189                                 newsize *= 2;
190                         } while(newsize < required);
191                 }
192
193                 if(newsize > buf->maxsize) {
194                         newsize = buf->maxsize;
195                 }
196
197                 char *newdata = realloc(buf->data, newsize);
198
199                 if(!newdata) {
200                         return -1;
201                 }
202
203                 buf->data = newdata;
204                 buf->size = newsize;
205         }
206
207         memcpy(buf->data + offset, data, len);
208
209         if(required > buf->used) {
210                 buf->used = required;
211         }
212
213         return len;
214 }
215
216 static ssize_t buffer_put(struct buffer *buf, const void *data, size_t len) {
217         return buffer_put_at(buf, buf->used, data, len);
218 }
219
220 // Get data from the buffer. data can be NULL.
221 static ssize_t buffer_get(struct buffer *buf, void *data, size_t len) {
222         if(len > buf->used) {
223                 len = buf->used;
224         }
225
226         if(data) {
227                 memcpy(data, buf->data, len);
228         }
229
230         if(len < buf->used) {
231                 memmove(buf->data, buf->data + len, buf->used - len);
232         }
233
234         buf->used -= len;
235         return len;
236 }
237
238 // Copy data from the buffer without removing it.
239 static ssize_t buffer_copy(struct buffer *buf, void *data, size_t offset, size_t len) {
240         if(offset >= buf->used) {
241                 return 0;
242         }
243
244         if(offset + len > buf->used) {
245                 len = buf->used - offset;
246         }
247
248         memcpy(data, buf->data + offset, len);
249         return len;
250 }
251
252 static bool buffer_init(struct buffer *buf, uint32_t len, uint32_t maxlen) {
253         memset(buf, 0, sizeof(*buf));
254
255         if(len) {
256                 buf->data = malloc(len);
257
258                 if(!buf->data) {
259                         return false;
260                 }
261         }
262
263         buf->size = len;
264         buf->maxsize = maxlen;
265         return true;
266 }
267
268 static void buffer_exit(struct buffer *buf) {
269         free(buf->data);
270         memset(buf, 0, sizeof(*buf));
271 }
272
273 static uint32_t buffer_free(const struct buffer *buf) {
274         return buf->maxsize - buf->used;
275 }
276
277 // Connections are stored in a sorted list.
278 // This gives O(log(N)) lookup time, O(N log(N)) insertion time and O(N) deletion time.
279
280 static int compare(const void *va, const void *vb) {
281         assert(va && vb);
282
283         const struct utcp_connection *a = *(struct utcp_connection **)va;
284         const struct utcp_connection *b = *(struct utcp_connection **)vb;
285
286         assert(a && b);
287         assert(a->src && b->src);
288
289         int c = (int)a->src - (int)b->src;
290
291         if(c) {
292                 return c;
293         }
294
295         c = (int)a->dst - (int)b->dst;
296         return c;
297 }
298
299 static struct utcp_connection *find_connection(const struct utcp *utcp, uint16_t src, uint16_t dst) {
300         if(!utcp->nconnections) {
301                 return NULL;
302         }
303
304         struct utcp_connection key = {
305                 .src = src,
306                 .dst = dst,
307         }, *keyp = &key;
308         struct utcp_connection **match = bsearch(&keyp, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
309         return match ? *match : NULL;
310 }
311
312 static void free_connection(struct utcp_connection *c) {
313         struct utcp *utcp = c->utcp;
314         struct utcp_connection **cp = bsearch(&c, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
315
316         assert(cp);
317
318         int i = cp - utcp->connections;
319         memmove(cp, cp + 1, (utcp->nconnections - i - 1) * sizeof(*cp));
320         utcp->nconnections--;
321
322         buffer_exit(&c->rcvbuf);
323         buffer_exit(&c->sndbuf);
324         free(c);
325 }
326
327 static struct utcp_connection *allocate_connection(struct utcp *utcp, uint16_t src, uint16_t dst) {
328         // Check whether this combination of src and dst is free
329
330         if(src) {
331                 if(find_connection(utcp, src, dst)) {
332                         errno = EADDRINUSE;
333                         return NULL;
334                 }
335         } else { // If src == 0, generate a random port number with the high bit set
336                 if(utcp->nconnections >= 32767) {
337                         errno = ENOMEM;
338                         return NULL;
339                 }
340
341                 src = rand() | 0x8000;
342
343                 while(find_connection(utcp, src, dst)) {
344                         src++;
345                 }
346         }
347
348         // Allocate memory for the new connection
349
350         if(utcp->nconnections >= utcp->nallocated) {
351                 if(!utcp->nallocated) {
352                         utcp->nallocated = 4;
353                 } else {
354                         utcp->nallocated *= 2;
355                 }
356
357                 struct utcp_connection **new_array = realloc(utcp->connections, utcp->nallocated * sizeof(*utcp->connections));
358
359                 if(!new_array) {
360                         return NULL;
361                 }
362
363                 utcp->connections = new_array;
364         }
365
366         struct utcp_connection *c = calloc(1, sizeof(*c));
367
368         if(!c) {
369                 return NULL;
370         }
371
372         if(!buffer_init(&c->sndbuf, DEFAULT_SNDBUFSIZE, DEFAULT_MAXSNDBUFSIZE)) {
373                 free(c);
374                 return NULL;
375         }
376
377         if(!buffer_init(&c->rcvbuf, DEFAULT_RCVBUFSIZE, DEFAULT_MAXRCVBUFSIZE)) {
378                 buffer_exit(&c->sndbuf);
379                 free(c);
380                 return NULL;
381         }
382
383         // Fill in the details
384
385         c->src = src;
386         c->dst = dst;
387 #ifdef UTCP_DEBUG
388         c->snd.iss = 0;
389 #else
390         c->snd.iss = rand();
391 #endif
392         c->snd.una = c->snd.iss;
393         c->snd.nxt = c->snd.iss + 1;
394         c->snd.last = c->snd.nxt;
395         c->snd.cwnd = (utcp->mtu > 2190 ? 2 : utcp->mtu > 1095 ? 3 : 4) * utcp->mtu;
396         c->snd.ssthresh = ~0;
397         debug_cwnd(c);
398         c->utcp = utcp;
399
400         // Add it to the sorted list of connections
401
402         utcp->connections[utcp->nconnections++] = c;
403         qsort(utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
404
405         return c;
406 }
407
// Absolute difference between two unsigned 32-bit values.
static inline uint32_t absdiff(uint32_t a, uint32_t b) {
	return a > b ? a - b : b - a;
}
415
416 // Update RTT variables. See RFC 6298.
417 static void update_rtt(struct utcp_connection *c, uint32_t rtt) {
418         if(!rtt) {
419                 debug("invalid rtt\n");
420                 return;
421         }
422
423         struct utcp *utcp = c->utcp;
424
425         if(!utcp->srtt) {
426                 utcp->srtt = rtt;
427                 utcp->rttvar = rtt / 2;
428         } else {
429                 utcp->rttvar = (utcp->rttvar * 3 + absdiff(utcp->srtt, rtt)) / 4;
430                 utcp->srtt = (utcp->srtt * 7 + rtt) / 8;
431         }
432
433         utcp->rto = utcp->srtt + max(4 * utcp->rttvar, CLOCK_GRANULARITY);
434
435         if(utcp->rto > MAX_RTO) {
436                 utcp->rto = MAX_RTO;
437         }
438
439         debug("rtt %u srtt %u rttvar %u rto %u\n", rtt, utcp->srtt, utcp->rttvar, utcp->rto);
440 }
441
442 static void start_retransmit_timer(struct utcp_connection *c) {
443         gettimeofday(&c->rtrx_timeout, NULL);
444         c->rtrx_timeout.tv_usec += c->utcp->rto;
445
446         while(c->rtrx_timeout.tv_usec >= 1000000) {
447                 c->rtrx_timeout.tv_usec -= 1000000;
448                 c->rtrx_timeout.tv_sec++;
449         }
450
451         debug("timeout set to %lu.%06lu (%u)\n", c->rtrx_timeout.tv_sec, c->rtrx_timeout.tv_usec, c->utcp->rto);
452 }
453
454 static void stop_retransmit_timer(struct utcp_connection *c) {
455         timerclear(&c->rtrx_timeout);
456         debug("timeout cleared\n");
457 }
458
459 struct utcp_connection *utcp_connect_ex(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv, uint32_t flags) {
460         struct utcp_connection *c = allocate_connection(utcp, 0, dst);
461
462         if(!c) {
463                 return NULL;
464         }
465
466         assert((flags & ~0x1f) == 0);
467
468         c->flags = flags;
469         c->recv = recv;
470         c->priv = priv;
471
472         struct {
473                 struct hdr hdr;
474                 uint8_t init[4];
475         } pkt;
476
477         pkt.hdr.src = c->src;
478         pkt.hdr.dst = c->dst;
479         pkt.hdr.seq = c->snd.iss;
480         pkt.hdr.ack = 0;
481         pkt.hdr.wnd = c->rcvbuf.maxsize;
482         pkt.hdr.ctl = SYN;
483         pkt.hdr.aux = 0x0101;
484         pkt.init[0] = 1;
485         pkt.init[1] = 0;
486         pkt.init[2] = 0;
487         pkt.init[3] = flags & 0x7;
488
489         set_state(c, SYN_SENT);
490
491         print_packet(utcp, "send", &pkt, sizeof(pkt));
492         utcp->send(utcp, &pkt, sizeof(pkt));
493
494         gettimeofday(&c->conn_timeout, NULL);
495         c->conn_timeout.tv_sec += utcp->timeout;
496
497         start_retransmit_timer(c);
498
499         return c;
500 }
501
502 struct utcp_connection *utcp_connect(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv) {
503         return utcp_connect_ex(utcp, dst, recv, priv, UTCP_TCP);
504 }
505
506 void utcp_accept(struct utcp_connection *c, utcp_recv_t recv, void *priv) {
507         if(c->reapable || c->state != SYN_RECEIVED) {
508                 debug("Error: accept() called on invalid connection %p in state %s\n", c, strstate[c->state]);
509                 return;
510         }
511
512         debug("%p accepted, %p %p\n", c, recv, priv);
513         c->recv = recv;
514         c->priv = priv;
515         set_state(c, ESTABLISHED);
516 }
517
518 static void ack(struct utcp_connection *c, bool sendatleastone) {
519         int32_t left = seqdiff(c->snd.last, c->snd.nxt);
520         int32_t cwndleft = min(c->snd.cwnd, c->snd.wnd) - seqdiff(c->snd.nxt, c->snd.una);
521
522         assert(left >= 0);
523
524         if(cwndleft <= 0) {
525                 left = 0;
526         } else if(cwndleft < left) {
527                 left = cwndleft;
528
529                 if(!sendatleastone || cwndleft > c->utcp->mtu) {
530                         left -= left % c->utcp->mtu;
531                 }
532         }
533
534         debug("cwndleft = %d, left = %d\n", cwndleft, left);
535
536         if(!left && !sendatleastone) {
537                 return;
538         }
539
540         struct {
541                 struct hdr hdr;
542                 uint8_t data[];
543         } *pkt;
544
545         pkt = malloc(sizeof(pkt->hdr) + c->utcp->mtu);
546
547         if(!pkt) {
548                 return;
549         }
550
551         pkt->hdr.src = c->src;
552         pkt->hdr.dst = c->dst;
553         pkt->hdr.ack = c->rcv.nxt;
554         pkt->hdr.wnd = c->rcvbuf.maxsize;
555         pkt->hdr.ctl = ACK;
556         pkt->hdr.aux = 0;
557
558         do {
559                 uint32_t seglen = left > c->utcp->mtu ? c->utcp->mtu : left;
560                 pkt->hdr.seq = c->snd.nxt;
561
562                 buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);
563
564                 c->snd.nxt += seglen;
565                 left -= seglen;
566
567                 if(seglen && fin_wanted(c, c->snd.nxt)) {
568                         seglen--;
569                         pkt->hdr.ctl |= FIN;
570                 }
571
572                 if(!c->rtt_start.tv_sec) {
573                         // Start RTT measurement
574                         gettimeofday(&c->rtt_start, NULL);
575                         c->rtt_seq = pkt->hdr.seq + seglen;
576                         debug("Starting RTT measurement, expecting ack %u\n", c->rtt_seq);
577                 }
578
579                 print_packet(c->utcp, "send", pkt, sizeof(pkt->hdr) + seglen);
580                 c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
581         } while(left);
582
583         free(pkt);
584 }
585
586 ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
587         if(c->reapable) {
588                 debug("Error: send() called on closed connection %p\n", c);
589                 errno = EBADF;
590                 return -1;
591         }
592
593         switch(c->state) {
594         case CLOSED:
595         case LISTEN:
596                 debug("Error: send() called on unconnected connection %p\n", c);
597                 errno = ENOTCONN;
598                 return -1;
599
600         case SYN_SENT:
601         case SYN_RECEIVED:
602         case ESTABLISHED:
603         case CLOSE_WAIT:
604                 break;
605
606         case FIN_WAIT_1:
607         case FIN_WAIT_2:
608         case CLOSING:
609         case LAST_ACK:
610         case TIME_WAIT:
611                 debug("Error: send() called on closing connection %p\n", c);
612                 errno = EPIPE;
613                 return -1;
614         }
615
616         // Exit early if we have nothing to send.
617
618         if(!len) {
619                 return 0;
620         }
621
622         if(!data) {
623                 errno = EFAULT;
624                 return -1;
625         }
626
627         // Check if we need to be able to buffer all data
628
629         if(c->flags & UTCP_NO_PARTIAL) {
630                 if(len > buffer_free(&c->sndbuf)) {
631                         if(len > c->sndbuf.maxsize) {
632                                 errno = EMSGSIZE;
633                                 return -1;
634                         } else {
635                                 errno = EWOULDBLOCK;
636                                 return 0;
637                         }
638                 }
639         }
640
641         // Add data to send buffer.
642
643         if(is_reliable(c) || (c->state != SYN_SENT && c->state != SYN_RECEIVED)) {
644                 len = buffer_put(&c->sndbuf, data, len);
645         } else {
646                 return 0;
647         }
648
649         if(len <= 0) {
650                 if(is_reliable(c)) {
651                         errno = EWOULDBLOCK;
652                         return 0;
653                 } else {
654                         return len;
655                 }
656         }
657
658         c->snd.last += len;
659
660         // Don't send anything yet if the connection has not fully established yet
661
662         if(c->state == SYN_SENT || c->state == SYN_RECEIVED) {
663                 return len;
664         }
665
666         ack(c, false);
667
668         if(!is_reliable(c)) {
669                 c->snd.una = c->snd.nxt = c->snd.last;
670                 buffer_get(&c->sndbuf, NULL, c->sndbuf.used);
671         }
672
673         if(is_reliable(c) && !timerisset(&c->rtrx_timeout)) {
674                 start_retransmit_timer(c);
675         }
676
677         if(is_reliable(c) && !timerisset(&c->conn_timeout)) {
678                 gettimeofday(&c->conn_timeout, NULL);
679                 c->conn_timeout.tv_sec += c->utcp->timeout;
680         }
681
682         return len;
683 }
684
685 static void swap_ports(struct hdr *hdr) {
686         uint16_t tmp = hdr->src;
687         hdr->src = hdr->dst;
688         hdr->dst = tmp;
689 }
690
691 static void retransmit(struct utcp_connection *c) {
692         if(c->state == CLOSED || c->snd.last == c->snd.una) {
693                 debug("Retransmit() called but nothing to retransmit!\n");
694                 stop_retransmit_timer(c);
695                 return;
696         }
697
698         struct utcp *utcp = c->utcp;
699
700         struct {
701                 struct hdr hdr;
702                 uint8_t data[];
703         } *pkt;
704
705         pkt = malloc(sizeof(pkt->hdr) + c->utcp->mtu);
706
707         if(!pkt) {
708                 return;
709         }
710
711         pkt->hdr.src = c->src;
712         pkt->hdr.dst = c->dst;
713         pkt->hdr.wnd = c->rcvbuf.maxsize;
714         pkt->hdr.aux = 0;
715
716         switch(c->state) {
717         case SYN_SENT:
718                 // Send our SYN again
719                 pkt->hdr.seq = c->snd.iss;
720                 pkt->hdr.ack = 0;
721                 pkt->hdr.ctl = SYN;
722                 pkt->hdr.aux = 0x0101;
723                 pkt->data[0] = 1;
724                 pkt->data[1] = 0;
725                 pkt->data[2] = 0;
726                 pkt->data[3] = c->flags & 0x7;
727                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr) + 4);
728                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + 4);
729                 break;
730
731         case SYN_RECEIVED:
732                 // Send SYNACK again
733                 pkt->hdr.seq = c->snd.nxt;
734                 pkt->hdr.ack = c->rcv.nxt;
735                 pkt->hdr.ctl = SYN | ACK;
736                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr));
737                 utcp->send(utcp, pkt, sizeof(pkt->hdr));
738                 break;
739
740         case ESTABLISHED:
741         case FIN_WAIT_1:
742         case CLOSE_WAIT:
743         case CLOSING:
744         case LAST_ACK:
745                 // Send unacked data again.
746                 pkt->hdr.seq = c->snd.una;
747                 pkt->hdr.ack = c->rcv.nxt;
748                 pkt->hdr.ctl = ACK;
749                 uint32_t len = seqdiff(c->snd.last, c->snd.una);
750
751                 if(len > utcp->mtu) {
752                         len = utcp->mtu;
753                 }
754
755                 if(fin_wanted(c, c->snd.una + len)) {
756                         len--;
757                         pkt->hdr.ctl |= FIN;
758                 }
759
760                 c->snd.nxt = c->snd.una + len;
761
762                 // RFC 5681 slow start after timeout
763                 c->snd.ssthresh = max(c->snd.cwnd / 2, utcp->mtu * 2); // eq. 4
764                 c->snd.cwnd = utcp->mtu;
765                 debug_cwnd(c);
766
767                 buffer_copy(&c->sndbuf, pkt->data, 0, len);
768                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr) + len);
769                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
770                 break;
771
772         case CLOSED:
773         case LISTEN:
774         case TIME_WAIT:
775         case FIN_WAIT_2:
776                 // We shouldn't need to retransmit anything in this state.
777 #ifdef UTCP_DEBUG
778                 abort();
779 #endif
780                 stop_retransmit_timer(c);
781                 goto cleanup;
782         }
783
784         start_retransmit_timer(c);
785         utcp->rto *= 2;
786
787         if(utcp->rto > MAX_RTO) {
788                 utcp->rto = MAX_RTO;
789         }
790
791         c->rtt_start.tv_sec = 0; // invalidate RTT timer
792
793 cleanup:
794         free(pkt);
795 }
796
797 /* Update receive buffer and SACK entries after consuming data.
798  *
799  * Situation:
800  *
801  * |.....0000..1111111111.....22222......3333|
802  * |---------------^
803  *
804  * 0..3 represent the SACK entries. The ^ indicates up to which point we want
805  * to remove data from the receive buffer. The idea is to substract "len"
806  * from the offset of all the SACK entries, and then remove/cut down entries
807  * that are shifted to before the start of the receive buffer.
808  *
809  * There are three cases:
810  * - the SACK entry is after ^, in that case just change the offset.
811  * - the SACK entry starts before and ends after ^, so we have to
812  *   change both its offset and size.
813  * - the SACK entry is completely before ^, in that case delete it.
814  */
815 static void sack_consume(struct utcp_connection *c, size_t len) {
816         debug("sack_consume %lu\n", (unsigned long)len);
817
818         if(len > c->rcvbuf.used) {
819                 debug("All SACK entries consumed");
820                 c->sacks[0].len = 0;
821                 return;
822         }
823
824         buffer_get(&c->rcvbuf, NULL, len);
825
826         for(int i = 0; i < NSACKS && c->sacks[i].len;) {
827                 if(len < c->sacks[i].offset) {
828                         c->sacks[i].offset -= len;
829                         i++;
830                 } else if(len < c->sacks[i].offset + c->sacks[i].len) {
831                         c->sacks[i].len -= len - c->sacks[i].offset;
832                         c->sacks[i].offset = 0;
833                         i++;
834                 } else {
835                         if(i < NSACKS - 1) {
836                                 memmove(&c->sacks[i], &c->sacks[i + 1], (NSACKS - 1 - i) * sizeof(c->sacks)[i]);
837                                 c->sacks[NSACKS - 1].len = 0;
838                         } else {
839                                 c->sacks[i].len = 0;
840                                 break;
841                         }
842                 }
843         }
844
845         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
846                 debug("SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
847         }
848 }
849
850 static void handle_out_of_order(struct utcp_connection *c, uint32_t offset, const void *data, size_t len) {
851         debug("out of order packet, offset %u\n", offset);
852         // Packet loss or reordering occured. Store the data in the buffer.
853         ssize_t rxd = buffer_put_at(&c->rcvbuf, offset, data, len);
854
855         if(rxd < 0 || (size_t)rxd < len) {
856                 abort();
857         }
858
859         // Make note of where we put it.
860         for(int i = 0; i < NSACKS; i++) {
861                 if(!c->sacks[i].len) { // nothing to merge, add new entry
862                         debug("New SACK entry %d\n", i);
863                         c->sacks[i].offset = offset;
864                         c->sacks[i].len = rxd;
865                         break;
866                 } else if(offset < c->sacks[i].offset) {
867                         if(offset + rxd < c->sacks[i].offset) { // insert before
868                                 if(!c->sacks[NSACKS - 1].len) { // only if room left
869                                         debug("Insert SACK entry at %d\n", i);
870                                         memmove(&c->sacks[i + 1], &c->sacks[i], (NSACKS - i - 1) * sizeof(c->sacks)[i]);
871                                         c->sacks[i].offset = offset;
872                                         c->sacks[i].len = rxd;
873                                 } else {
874                                         debug("SACK entries full, dropping packet\n");
875                                 }
876
877                                 break;
878                         } else { // merge
879                                 debug("Merge with start of SACK entry at %d\n", i);
880                                 c->sacks[i].offset = offset;
881                                 break;
882                         }
883                 } else if(offset <= c->sacks[i].offset + c->sacks[i].len) {
884                         if(offset + rxd > c->sacks[i].offset + c->sacks[i].len) { // merge
885                                 debug("Merge with end of SACK entry at %d\n", i);
886                                 c->sacks[i].len = offset + rxd - c->sacks[i].offset;
887                                 // TODO: handle potential merge with next entry
888                         }
889
890                         break;
891                 }
892         }
893
894         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
895                 debug("SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
896         }
897 }
898
899 static void handle_in_order(struct utcp_connection *c, const void *data, size_t len) {
900         // Check if we can process out-of-order data now.
901         if(c->sacks[0].len && len >= c->sacks[0].offset) { // TODO: handle overlap with second SACK
902                 debug("incoming packet len %lu connected with SACK at %u\n", (unsigned long)len, c->sacks[0].offset);
903                 buffer_put_at(&c->rcvbuf, 0, data, len); // TODO: handle return value
904                 len = max(len, c->sacks[0].offset + c->sacks[0].len);
905                 data = c->rcvbuf.data;
906         }
907
908         if(c->recv) {
909                 ssize_t rxd = c->recv(c, data, len);
910
911                 if(rxd < 0 || (size_t)rxd != len) {
912                         // TODO: handle the application not accepting all data.
913                         abort();
914                 }
915         }
916
917         if(c->rcvbuf.used) {
918                 sack_consume(c, len);
919         }
920
921         c->rcv.nxt += len;
922 }
923
924
925 static void handle_incoming_data(struct utcp_connection *c, uint32_t seq, const void *data, size_t len) {
926         if(!is_reliable(c)) {
927                 c->recv(c, data, len);
928                 c->rcv.nxt = seq + len;
929                 return;
930         }
931
932         uint32_t offset = seqdiff(seq, c->rcv.nxt);
933
934         if(offset + len > c->rcvbuf.maxsize) {
935                 abort();
936         }
937
938         if(offset) {
939                 handle_out_of_order(c, offset, data, len);
940         } else {
941                 handle_in_order(c, data, len);
942         }
943 }
944
945
/* Process one incoming packet.
 *
 * utcp: the utcp context the packet was received for
 * data: pointer to the raw packet (header, auxiliary headers, payload)
 * len:  total length of the packet in bytes
 *
 * The packet is validated, matched to a connection with find_connection(),
 * and then run through the numbered steps below (drop invalid packets, handle
 * RST, advance snd.una, update timers, process SYN, data and FIN).
 *
 * Returns 0 when the packet was processed or deliberately ignored.
 * Returns -1 with errno set to EFAULT when utcp is NULL or when data is NULL
 * with a non-zero len, and -1 with errno set to EBADMSG when the packet is
 * shorter than a header, carries unknown CTL flags, or has invalid
 * auxiliary headers.
 */
ssize_t utcp_recv(struct utcp *utcp, const void *data, size_t len) {
        const uint8_t *ptr = data;

        if(!utcp) {
                errno = EFAULT;
                return -1;
        }

        // An empty packet is accepted silently, even when data is NULL.
        if(!len) {
                return 0;
        }

        if(!data) {
                errno = EFAULT;
                return -1;
        }

        print_packet(utcp, "recv", data, len);

        // Drop packets smaller than the header

        struct hdr hdr;

        if(len < sizeof(hdr)) {
                errno = EBADMSG;
                return -1;
        }

        // Make a copy from the potentially unaligned data to a struct hdr

        memcpy(&hdr, ptr, sizeof(hdr));
        ptr += sizeof(hdr);
        len -= sizeof(hdr);

        // Drop packets with an unknown CTL flag

        if(hdr.ctl & ~(SYN | ACK | RST | FIN)) {
                errno = EBADMSG;
                return -1;
        }

        // Check for auxiliary headers

        // Set to the AUX_INIT payload if one is present in this packet.
        const uint8_t *init = NULL;

        uint16_t aux = hdr.aux;

        while(aux) {
                // NOTE(review): `*` binds tighter than `&`, so this evaluates as
                // (4 * (aux >> 8)) & 0xf. Confirm whether 4 * ((aux >> 8) & 0xf)
                // (or another mask) was intended.
                size_t auxlen = 4 * (aux >> 8) & 0xf;
                uint8_t auxtype = aux & 0xff;

                if(len < auxlen) {
                        errno = EBADMSG;
                        return -1;
                }

                switch(auxtype) {
                case AUX_INIT:
                        // AUX_INIT is only accepted on SYN packets and must be exactly 4 bytes.
                        if(!(hdr.ctl & SYN) || auxlen != 4) {
                                errno = EBADMSG;
                                return -1;
                        }

                        init = ptr;
                        break;

                default:
                        errno = EBADMSG;
                        return -1;
                }

                len -= auxlen;
                ptr += auxlen;

                // Bit 0x800 signals that another 2-byte auxiliary header follows.
                if(!(aux & 0x800)) {
                        break;
                }

                if(len < 2) {
                        errno = EBADMSG;
                        return -1;
                }

                memcpy(&aux, ptr, 2);
                len -= 2;
                ptr += 2;
        }

        // Captured here, before the window checks below may set len to 0.
        // It is passed to ack() at the end of the function.
        bool has_data = len || (hdr.ctl & (SYN | FIN));

        // Try to match the packet to an existing connection

        struct utcp_connection *c = find_connection(utcp, hdr.dst, hdr.src);

        // Is it for a new connection?

        if(!c) {
                // Ignore RST packets

                if(hdr.ctl & RST) {
                        return 0;
                }

                // Is it a SYN packet and are we LISTENing?

                if(hdr.ctl & SYN && !(hdr.ctl & ACK) && utcp->accept) {
                        // If we don't want to accept it, send a RST back
                        if((utcp->pre_accept && !utcp->pre_accept(utcp, hdr.dst))) {
                                // len = 1 makes the reset path acknowledge hdr.seq + 1.
                                len = 1;
                                goto reset;
                        }

                        // Try to allocate memory, otherwise send a RST back
                        c = allocate_connection(utcp, hdr.dst, hdr.src);

                        if(!c) {
                                len = 1;
                                goto reset;
                        }

                        // Parse auxilliary information
                        if(init) {
                                // NOTE(review): c was already allocated above and is not released
                                // on this path -- confirm it is cleaned up elsewhere.
                                if(init[0] < 1) {
                                        len = 1;
                                        goto reset;
                                }

                                c->flags = init[3] & 0x7;
                        } else {
                                c->flags = UTCP_TCP;
                        }

// Also entered from step 5 below when a SYN is retransmitted to a connection
// in SYN_RECEIVED; control then falls through to the `return 0` after this branch.
synack:
                        // Return SYN+ACK, go to SYN_RECEIVED state
                        c->snd.wnd = hdr.wnd;
                        c->rcv.irs = hdr.seq;
                        c->rcv.nxt = c->rcv.irs + 1;
                        set_state(c, SYN_RECEIVED);

                        struct {
                                struct hdr hdr;
                                uint8_t data[4];
                        } pkt;

                        pkt.hdr.src = c->src;
                        pkt.hdr.dst = c->dst;
                        pkt.hdr.ack = c->rcv.irs + 1;
                        pkt.hdr.seq = c->snd.iss;
                        pkt.hdr.wnd = c->rcvbuf.maxsize;
                        pkt.hdr.ctl = SYN | ACK;

                        // Echo an AUX_INIT header only if the peer sent one in this packet.
                        if(init) {
                                pkt.hdr.aux = 0x0101;
                                pkt.data[0] = 1;
                                pkt.data[1] = 0;
                                pkt.data[2] = 0;
                                pkt.data[3] = c->flags & 0x7;
                                print_packet(c->utcp, "send", &pkt, sizeof(hdr) + 4);
                                utcp->send(utcp, &pkt, sizeof(hdr) + 4);
                        } else {
                                pkt.hdr.aux = 0;
                                print_packet(c->utcp, "send", &pkt, sizeof(hdr));
                                utcp->send(utcp, &pkt, sizeof(hdr));
                        }
                } else {
                        // No, we don't want your packets, send a RST back
                        len = 1;
                        goto reset;
                }

                return 0;
        }

        debug("%p state %s\n", c->utcp, strstate[c->state]);

        // In case this is for a CLOSED connection, ignore the packet.
        // TODO: make it so incoming packets can never match a CLOSED connection.

        if(c->state == CLOSED) {
                debug("Got packet for closed connection\n");
                return 0;
        }

        // It is for an existing connection.

        // 1. Drop invalid packets.

        // 1a. Drop packets that should not happen in our current state.

        switch(c->state) {
        case SYN_SENT:
        case SYN_RECEIVED:
        case ESTABLISHED:
        case FIN_WAIT_1:
        case FIN_WAIT_2:
        case CLOSE_WAIT:
        case CLOSING:
        case LAST_ACK:
        case TIME_WAIT:
                break;

        default:
                // Any other state is only treated as fatal in debug builds.
#ifdef UTCP_DEBUG
                abort();
#endif
                break;
        }

        // 1b. Discard data that is not in our receive window.

        if(is_reliable(c)) {
                bool acceptable;

                if(c->state == SYN_SENT) {
                        // rcv.nxt is only set once the SYNACK is processed in step 5.
                        acceptable = true;
                } else if(len == 0) {
                        acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0;
                } else {
                        int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);

                        // cut already accepted front overlapping
                        if(rcv_offset < 0) {
                                // The expression below is a cast of -rcv_offset to size_t:
                                // accept only if the packet extends past rcv.nxt.
                                acceptable = len > (size_t) - rcv_offset;

                                if(acceptable) {
                                        // rcv_offset is negative here, so these skip the
                                        // already-received prefix and shorten len accordingly.
                                        ptr -= rcv_offset;
                                        len += rcv_offset;
                                        hdr.seq -= rcv_offset;
                                }
                        } else {
                                acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0 && seqdiff(hdr.seq, c->rcv.nxt) + len <= c->rcvbuf.maxsize;
                        }
                }

                if(!acceptable) {
                        debug("Packet not acceptable, %u <= %u + %lu < %u\n", c->rcv.nxt, hdr.seq, (unsigned long)len, c->rcv.nxt + c->rcvbuf.maxsize);

                        // Ignore unacceptable RST packets.
                        if(hdr.ctl & RST) {
                                return 0;
                        }

                        // Otherwise, continue processing.
                        // The payload is dropped, but the header is still handled below.
                        len = 0;
                }
        }

        c->snd.wnd = hdr.wnd; // TODO: move below

        // 1c. Drop packets with an invalid ACK.
        // ackno should not roll back, and it should also not be bigger than what we ever could have sent
        // (= snd.una + c->sndbuf.used).

        // On unreliable connections, an ACK that does not match snd.last is
        // overwritten with snd.una once the connection is at least ESTABLISHED.
        if(!is_reliable(c)) {
                if(hdr.ack != c->snd.last && c->state >= ESTABLISHED) {
                        hdr.ack = c->snd.una;
                }
        }

        if(hdr.ctl & ACK && (seqdiff(hdr.ack, c->snd.last) > 0 || seqdiff(hdr.ack, c->snd.una) < 0)) {
                debug("Packet ack seqno out of range, %u <= %u < %u\n", c->snd.una, hdr.ack, c->snd.una + c->sndbuf.used);

                // Ignore unacceptable RST packets.
                if(hdr.ctl & RST) {
                        return 0;
                }

                goto reset;
        }

        // 2. Handle RST packets

        if(hdr.ctl & RST) {
                switch(c->state) {
                case SYN_SENT:
                        // In SYN_SENT only a RST that carries an ACK is honoured.
                        if(!(hdr.ctl & ACK)) {
                                return 0;
                        }

                        // The peer has refused our connection.
                        set_state(c, CLOSED);
                        errno = ECONNREFUSED;

                        // Both callbacks are invoked with a zero length to report the
                        // closed connection; errno carries the reason.
                        if(c->recv) {
                                c->recv(c, NULL, 0);
                        }

                        if(c->poll && !c->reapable) {
                                c->poll(c, 0);
                        }

                        return 0;

                case SYN_RECEIVED:
                        // In every state other than SYN_SENT, a RST that carries an ACK is ignored.
                        if(hdr.ctl & ACK) {
                                return 0;
                        }

                        // We haven't told the application about this connection yet. Silently delete.
                        free_connection(c);
                        return 0;

                case ESTABLISHED:
                case FIN_WAIT_1:
                case FIN_WAIT_2:
                case CLOSE_WAIT:
                        if(hdr.ctl & ACK) {
                                return 0;
                        }

                        // The peer has aborted our connection.
                        set_state(c, CLOSED);
                        errno = ECONNRESET;

                        if(c->recv) {
                                c->recv(c, NULL, 0);
                        }

                        if(c->poll && !c->reapable) {
                                c->poll(c, 0);
                        }

                        return 0;

                case CLOSING:
                case LAST_ACK:
                case TIME_WAIT:
                        if(hdr.ctl & ACK) {
                                return 0;
                        }

                        // As far as the application is concerned, the connection has already been closed.
                        // If it has called utcp_close() already, we can immediately free this connection.
                        if(c->reapable) {
                                free_connection(c);
                                return 0;
                        }

                        // Otherwise, immediately move to the CLOSED state.
                        set_state(c, CLOSED);
                        return 0;

                default:
#ifdef UTCP_DEBUG
                        abort();
#endif
                        break;
                }
        }

        // Result of seqdiff(hdr.ack, snd.una); stays 0 for packets without ACK.
        uint32_t advanced;

        if(!(hdr.ctl & ACK)) {
                advanced = 0;
                goto skip_ack;
        }

        // 3. Advance snd.una

        advanced = seqdiff(hdr.ack, c->snd.una);

        if(advanced) {
                // RTT measurement
                if(c->rtt_start.tv_sec) {
                        if(c->rtt_seq == hdr.ack) {
                                struct timeval now, diff;
                                gettimeofday(&now, NULL);
                                timersub(&now, &c->rtt_start, &diff);
                                update_rtt(c, diff.tv_sec * 1000000 + diff.tv_usec);
                                c->rtt_start.tv_sec = 0;
                        // NOTE(review): plain `<` rather than seqdiff() -- confirm
                        // this is correct across sequence number wraparound.
                        } else if(c->rtt_seq < hdr.ack) {
                                debug("Cancelling RTT measurement: %u < %u\n", c->rtt_seq, hdr.ack);
                                c->rtt_start.tv_sec = 0;
                        }
                }

                int32_t data_acked = advanced;

                switch(c->state) {
                case SYN_SENT:
                case SYN_RECEIVED:
                        // One of the acknowledged sequence numbers is the SYN,
                        // which does not occupy space in the send buffer.
                        data_acked--;
                        break;

                // TODO: handle FIN as well.
                default:
                        break;
                }

                assert(data_acked >= 0);

#ifndef NDEBUG
                int32_t bufused = seqdiff(c->snd.last, c->snd.una);
                assert(data_acked <= bufused);
#endif

                // Discard the acknowledged bytes from the send buffer.
                if(data_acked) {
                        buffer_get(&c->sndbuf, NULL, data_acked);
                }

                // Also advance snd.nxt if possible
                if(seqdiff(c->snd.nxt, hdr.ack) < 0) {
                        c->snd.nxt = hdr.ack;
                }

                c->snd.una = hdr.ack;

                c->dupack = 0;

                // Increase the congestion window according to RFC 5681
                if(c->snd.cwnd < c->snd.ssthresh) {
                        c->snd.cwnd += min(advanced, utcp->mtu); // eq. 2
                } else {
                        c->snd.cwnd += max(1, (utcp->mtu * utcp->mtu) / c->snd.cwnd); // eq. 3
                }

                // The congestion window is capped at the send buffer size.
                if(c->snd.cwnd > c->sndbuf.maxsize) {
                        c->snd.cwnd = c->sndbuf.maxsize;
                }

                debug_cwnd(c);

                // Check if we have sent a FIN that is now ACKed.
                switch(c->state) {
                case FIN_WAIT_1:
                        if(c->snd.una == c->snd.last) {
                                set_state(c, FIN_WAIT_2);
                        }

                        break;

                case CLOSING:
                        if(c->snd.una == c->snd.last) {
                                gettimeofday(&c->conn_timeout, NULL);
                                c->conn_timeout.tv_sec += utcp->timeout;
                                set_state(c, TIME_WAIT);
                        }

                        break;

                default:
                        break;
                }
        } else {
                // An ACK without payload that does not advance snd.una counts as a duplicate.
                if(!len && is_reliable(c)) {
                        c->dupack++;

                        if(c->dupack == 3) {
                                debug("Triplicate ACK\n");
                                //TODO: Resend one packet and go to fast recovery mode. See RFC 6582.
                                //We do a very simple variant here; reset the nxt pointer to the last acknowledged packet from the peer.
                                //Reset the congestion window so we wait for ACKs.
                                c->snd.nxt = c->snd.una;
                                c->snd.cwnd = utcp->mtu;
                                debug_cwnd(c);
                                start_retransmit_timer(c);
                        }
                }
        }

        // 4. Update timers

        if(advanced) {
                if(c->snd.una == c->snd.last) {
                        // Everything sent so far has been acknowledged.
                        stop_retransmit_timer(c);
                        timerclear(&c->conn_timeout);
                } else if(is_reliable(c)) {
                        start_retransmit_timer(c);
                        gettimeofday(&c->conn_timeout, NULL);
                        c->conn_timeout.tv_sec += utcp->timeout;
                }
        }

skip_ack:
        // 5. Process SYN stuff

        if(hdr.ctl & SYN) {
                switch(c->state) {
                case SYN_SENT:

                        // This is a SYNACK. It should always have ACKed the SYN.
                        if(!advanced) {
                                goto reset;
                        }

                        c->rcv.irs = hdr.seq;
                        c->rcv.nxt = hdr.seq;

                        // If the write side was already shut down while in SYN_SENT,
                        // account for the FIN now and go straight to FIN_WAIT_1.
                        if(c->shut_wr) {
                                c->snd.last++;
                                set_state(c, FIN_WAIT_1);
                        } else {
                                set_state(c, ESTABLISHED);
                        }

                        // TODO: notify application of this somehow.
                        break;

                case SYN_RECEIVED:
                        // This is a retransmit of a SYN, send back the SYNACK.
                        goto synack;

                case ESTABLISHED:
                case FIN_WAIT_1:
                case FIN_WAIT_2:
                case CLOSE_WAIT:
                case CLOSING:
                case LAST_ACK:
                case TIME_WAIT:
                        // Ehm, no. We should never receive a second SYN.
                        return 0;

                default:
#ifdef UTCP_DEBUG
                        abort();
#endif
                        return 0;
                }

                // SYN counts as one sequence number
                c->rcv.nxt++;
        }

        // 6. Process new data

        if(c->state == SYN_RECEIVED) {
                // This is the ACK after the SYNACK. It should always have ACKed the SYNACK.
                if(!advanced) {
                        goto reset;
                }

                // Are we still LISTENing?
                if(utcp->accept) {
                        utcp->accept(c, c->src);
                }

                // The connection must be in ESTABLISHED after the accept callback;
                // otherwise it is closed, marked reapable and reset.
                if(c->state != ESTABLISHED) {
                        set_state(c, CLOSED);
                        c->reapable = true;
                        goto reset;
                }
        }

        if(len) {
                switch(c->state) {
                case SYN_SENT:
                case SYN_RECEIVED:
                        // This should never happen.
#ifdef UTCP_DEBUG
                        abort();
#endif
                        return 0;

                case ESTABLISHED:
                case FIN_WAIT_1:
                case FIN_WAIT_2:
                        break;

                case CLOSE_WAIT:
                case CLOSING:
                case LAST_ACK:
                case TIME_WAIT:
                        // Ehm no, We should never receive more data after a FIN.
                        goto reset;

                default:
#ifdef UTCP_DEBUG
                        abort();
#endif
                        return 0;
                }

                handle_incoming_data(c, hdr.seq, ptr, len);
        }

        // 7. Process FIN stuff

        // On reliable connections the FIN is only processed when this packet
        // ends exactly at rcv.nxt.
        if((hdr.ctl & FIN) && (!is_reliable(c) || hdr.seq + len == c->rcv.nxt)) {
                switch(c->state) {
                case SYN_SENT:
                case SYN_RECEIVED:
                        // This should never happen.
#ifdef UTCP_DEBUG
                        abort();
#endif
                        break;

                case ESTABLISHED:
                        set_state(c, CLOSE_WAIT);
                        break;

                case FIN_WAIT_1:
                        set_state(c, CLOSING);
                        break;

                case FIN_WAIT_2:
                        gettimeofday(&c->conn_timeout, NULL);
                        c->conn_timeout.tv_sec += utcp->timeout;
                        set_state(c, TIME_WAIT);
                        break;

                case CLOSE_WAIT:
                case CLOSING:
                case LAST_ACK:
                case TIME_WAIT:
                        // Ehm, no. We should never receive a second FIN.
                        goto reset;

                default:
#ifdef UTCP_DEBUG
                        abort();
#endif
                        break;
                }

                // FIN counts as one sequence number
                c->rcv.nxt++;
                len++;

                // Inform the application that the peer closed its end of the connection.
                if(c->recv) {
                        errno = 0;
                        c->recv(c, NULL, 0);
                }
        }

        // Now we send something back if:
        // - we received data, so we have to send back an ACK
        //   -> sendatleastone = true
        // - or we got an ack, so we should maybe send a bit more data
        //   -> sendatleastone = false

        if(is_reliable(c) || hdr.ctl & SYN || hdr.ctl & FIN) {
                ack(c, has_data);
        }

        return 0;

reset:
        // Build a RST reply in place by reusing the received header.
        swap_ports(&hdr);
        hdr.wnd = 0;
        hdr.aux = 0;

        if(hdr.ctl & ACK) {
                // The offending packet carried an ACK: reply with seq = its ack number.
                hdr.seq = hdr.ack;
                hdr.ctl = RST;
        } else {
                // No ACK in the offending packet: acknowledge its sequence number plus len instead.
                hdr.ack = hdr.seq + len;
                hdr.seq = 0;
                hdr.ctl = RST | ACK;
        }

        print_packet(utcp, "send", &hdr, sizeof(hdr));
        utcp->send(utcp, &hdr, sizeof(hdr));
        return 0;

}
1603
1604 int utcp_shutdown(struct utcp_connection *c, int dir) {
1605         debug("%p shutdown %d at %u\n", c ? c->utcp : NULL, dir, c ? c->snd.last : 0);
1606
1607         if(!c) {
1608                 errno = EFAULT;
1609                 return -1;
1610         }
1611
1612         if(c->reapable) {
1613                 debug("Error: shutdown() called on closed connection %p\n", c);
1614                 errno = EBADF;
1615                 return -1;
1616         }
1617
1618         if(!(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_WR || dir == UTCP_SHUT_RDWR)) {
1619                 errno = EINVAL;
1620                 return -1;
1621         }
1622
1623         // TCP does not have a provision for stopping incoming packets.
1624         // The best we can do is to just ignore them.
1625         if(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_RDWR) {
1626                 c->recv = NULL;
1627         }
1628
1629         // The rest of the code deals with shutting down writes.
1630         if(dir == UTCP_SHUT_RD) {
1631                 return 0;
1632         }
1633
1634         // Only process shutting down writes once.
1635         if(c->shut_wr) {
1636                 return 0;
1637         }
1638
1639         c->shut_wr = true;
1640
1641         switch(c->state) {
1642         case CLOSED:
1643         case LISTEN:
1644                 errno = ENOTCONN;
1645                 return -1;
1646
1647         case SYN_SENT:
1648                 return 0;
1649
1650         case SYN_RECEIVED:
1651         case ESTABLISHED:
1652                 set_state(c, FIN_WAIT_1);
1653                 break;
1654
1655         case FIN_WAIT_1:
1656         case FIN_WAIT_2:
1657                 return 0;
1658
1659         case CLOSE_WAIT:
1660                 set_state(c, CLOSING);
1661                 break;
1662
1663         case CLOSING:
1664         case LAST_ACK:
1665         case TIME_WAIT:
1666                 return 0;
1667         }
1668
1669         c->snd.last++;
1670
1671         ack(c, false);
1672
1673         if(!timerisset(&c->rtrx_timeout)) {
1674                 start_retransmit_timer(c);
1675         }
1676
1677         return 0;
1678 }
1679
1680 static bool reset_connection(struct utcp_connection *c) {
1681         if(!c) {
1682                 errno = EFAULT;
1683                 return false;
1684         }
1685
1686         if(c->reapable) {
1687                 debug("Error: abort() called on closed connection %p\n", c);
1688                 errno = EBADF;
1689                 return false;
1690         }
1691
1692         c->recv = NULL;
1693         c->poll = NULL;
1694
1695         switch(c->state) {
1696         case CLOSED:
1697                 return true;
1698
1699         case LISTEN:
1700         case SYN_SENT:
1701         case CLOSING:
1702         case LAST_ACK:
1703         case TIME_WAIT:
1704                 set_state(c, CLOSED);
1705                 return true;
1706
1707         case SYN_RECEIVED:
1708         case ESTABLISHED:
1709         case FIN_WAIT_1:
1710         case FIN_WAIT_2:
1711         case CLOSE_WAIT:
1712                 set_state(c, CLOSED);
1713                 break;
1714         }
1715
1716         // Send RST
1717
1718         struct hdr hdr;
1719
1720         hdr.src = c->src;
1721         hdr.dst = c->dst;
1722         hdr.seq = c->snd.nxt;
1723         hdr.ack = 0;
1724         hdr.wnd = 0;
1725         hdr.ctl = RST;
1726
1727         print_packet(c->utcp, "send", &hdr, sizeof(hdr));
1728         c->utcp->send(c->utcp, &hdr, sizeof(hdr));
1729         return true;
1730 }
1731
1732 // Closes all the opened connections
1733 void utcp_abort_all_connections(struct utcp *utcp) {
1734         if(!utcp) {
1735                 errno = EINVAL;
1736                 return;
1737         }
1738
1739         for(int i = 0; i < utcp->nconnections; i++) {
1740                 struct utcp_connection *c = utcp->connections[i];
1741
1742                 if(c->reapable || c->state == CLOSED) {
1743                         continue;
1744                 }
1745
1746                 utcp_recv_t old_recv = c->recv;
1747                 utcp_poll_t old_poll = c->poll;
1748
1749                 reset_connection(c);
1750
1751                 if(old_recv) {
1752                         errno = 0;
1753                         old_recv(c, NULL, 0);
1754                 }
1755
1756                 if(old_poll && !c->reapable) {
1757                         errno = 0;
1758                         old_poll(c, 0);
1759                 }
1760         }
1761
1762         return;
1763 }
1764
1765 int utcp_close(struct utcp_connection *c) {
1766         if(utcp_shutdown(c, SHUT_RDWR) && errno != ENOTCONN) {
1767                 return -1;
1768         }
1769
1770         c->recv = NULL;
1771         c->poll = NULL;
1772         c->reapable = true;
1773         return 0;
1774 }
1775
1776 int utcp_abort(struct utcp_connection *c) {
1777         if(!reset_connection(c)) {
1778                 return -1;
1779         }
1780
1781         c->reapable = true;
1782         return 0;
1783 }
1784
1785 /* Handle timeouts.
1786  * One call to this function will loop through all connections,
1787  * checking if something needs to be resent or not.
1788  * The return value is the time to the next timeout in milliseconds,
1789  * or maybe a negative value if the timeout is infinite.
1790  */
1791 struct timeval utcp_timeout(struct utcp *utcp) {
1792         struct timeval now;
1793         gettimeofday(&now, NULL);
1794         struct timeval next = {now.tv_sec + 3600, now.tv_usec};
1795
1796         for(int i = 0; i < utcp->nconnections; i++) {
1797                 struct utcp_connection *c = utcp->connections[i];
1798
1799                 if(!c) {
1800                         continue;
1801                 }
1802
1803                 // delete connections that have been utcp_close()d.
1804                 if(c->state == CLOSED) {
1805                         if(c->reapable) {
1806                                 debug("Reaping %p\n", c);
1807                                 free_connection(c);
1808                                 i--;
1809                         }
1810
1811                         continue;
1812                 }
1813
1814                 if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &now, <)) {
1815                         errno = ETIMEDOUT;
1816                         c->state = CLOSED;
1817
1818                         if(c->recv) {
1819                                 c->recv(c, NULL, 0);
1820                         }
1821
1822                         if(c->poll && !c->reapable) {
1823                                 c->poll(c, 0);
1824                         }
1825
1826                         continue;
1827                 }
1828
1829                 if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &now, <)) {
1830                         debug("retransmit()\n");
1831                         retransmit(c);
1832                 }
1833
1834                 if(c->poll) {
1835                         if((c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
1836                                 uint32_t len =  buffer_free(&c->sndbuf);
1837
1838                                 if(len) {
1839                                         c->poll(c, len);
1840                                 }
1841                         } else if(c->state == CLOSED) {
1842                                 c->poll(c, 0);
1843                         }
1844                 }
1845
1846                 if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &next, <)) {
1847                         next = c->conn_timeout;
1848                 }
1849
1850                 if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &next, <)) {
1851                         next = c->rtrx_timeout;
1852                 }
1853         }
1854
1855         struct timeval diff;
1856
1857         timersub(&next, &now, &diff);
1858
1859         return diff;
1860 }
1861
1862 bool utcp_is_active(struct utcp *utcp) {
1863         if(!utcp) {
1864                 return false;
1865         }
1866
1867         for(int i = 0; i < utcp->nconnections; i++)
1868                 if(utcp->connections[i]->state != CLOSED && utcp->connections[i]->state != TIME_WAIT) {
1869                         return true;
1870                 }
1871
1872         return false;
1873 }
1874
1875 struct utcp *utcp_init(utcp_accept_t accept, utcp_pre_accept_t pre_accept, utcp_send_t send, void *priv) {
1876         if(!send) {
1877                 errno = EFAULT;
1878                 return NULL;
1879         }
1880
1881         struct utcp *utcp = calloc(1, sizeof(*utcp));
1882
1883         if(!utcp) {
1884                 return NULL;
1885         }
1886
1887         utcp->accept = accept;
1888         utcp->pre_accept = pre_accept;
1889         utcp->send = send;
1890         utcp->priv = priv;
1891         utcp->mtu = DEFAULT_MTU;
1892         utcp->timeout = DEFAULT_USER_TIMEOUT; // sec
1893         utcp->rto = START_RTO; // usec
1894
1895         return utcp;
1896 }
1897
1898 void utcp_exit(struct utcp *utcp) {
1899         if(!utcp) {
1900                 return;
1901         }
1902
1903         for(int i = 0; i < utcp->nconnections; i++) {
1904                 struct utcp_connection *c = utcp->connections[i];
1905
1906                 if(!c->reapable) {
1907                         if(c->recv) {
1908                                 c->recv(c, NULL, 0);
1909                         }
1910
1911                         if(c->poll && !c->reapable) {
1912                                 c->poll(c, 0);
1913                         }
1914                 }
1915
1916                 buffer_exit(&c->rcvbuf);
1917                 buffer_exit(&c->sndbuf);
1918                 free(c);
1919         }
1920
1921         free(utcp->connections);
1922         free(utcp);
1923 }
1924
1925 uint16_t utcp_get_mtu(struct utcp *utcp) {
1926         return utcp ? utcp->mtu : 0;
1927 }
1928
1929 void utcp_set_mtu(struct utcp *utcp, uint16_t mtu) {
1930         // TODO: handle overhead of the header
1931         if(utcp) {
1932                 utcp->mtu = mtu;
1933         }
1934 }
1935
1936 void utcp_reset_timers(struct utcp *utcp) {
1937         if(!utcp) {
1938                 return;
1939         }
1940
1941         struct timeval now, then;
1942
1943         gettimeofday(&now, NULL);
1944
1945         then = now;
1946
1947         then.tv_sec += utcp->timeout;
1948
1949         for(int i = 0; i < utcp->nconnections; i++) {
1950                 struct utcp_connection *c = utcp->connections[i];
1951
1952                 if(c->reapable) {
1953                         continue;
1954                 }
1955
1956                 if(timerisset(&c->rtrx_timeout)) {
1957                         c->rtrx_timeout = now;
1958                 }
1959
1960                 if(timerisset(&c->conn_timeout)) {
1961                         c->conn_timeout = then;
1962                 }
1963
1964                 c->rtt_start.tv_sec = 0;
1965         }
1966
1967         if(utcp->rto > START_RTO) {
1968                 utcp->rto = START_RTO;
1969         }
1970 }
1971
1972 int utcp_get_user_timeout(struct utcp *u) {
1973         return u ? u->timeout : 0;
1974 }
1975
1976 void utcp_set_user_timeout(struct utcp *u, int timeout) {
1977         if(u) {
1978                 u->timeout = timeout;
1979         }
1980 }
1981
1982 size_t utcp_get_sndbuf(struct utcp_connection *c) {
1983         return c ? c->sndbuf.maxsize : 0;
1984 }
1985
1986 size_t utcp_get_sndbuf_free(struct utcp_connection *c) {
1987         if(!c) {
1988                 return 0;
1989         }
1990
1991         switch(c->state) {
1992         case SYN_SENT:
1993         case SYN_RECEIVED:
1994         case ESTABLISHED:
1995         case CLOSE_WAIT:
1996                 return buffer_free(&c->sndbuf);
1997
1998         default:
1999                 return 0;
2000         }
2001 }
2002
2003 void utcp_set_sndbuf(struct utcp_connection *c, size_t size) {
2004         if(!c) {
2005                 return;
2006         }
2007
2008         c->sndbuf.maxsize = size;
2009
2010         if(c->sndbuf.maxsize != size) {
2011                 c->sndbuf.maxsize = -1;
2012         }
2013 }
2014
2015 size_t utcp_get_rcvbuf(struct utcp_connection *c) {
2016         return c ? c->rcvbuf.maxsize : 0;
2017 }
2018
2019 size_t utcp_get_rcvbuf_free(struct utcp_connection *c) {
2020         if(c && (c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
2021                 return buffer_free(&c->rcvbuf);
2022         } else {
2023                 return 0;
2024         }
2025 }
2026
2027 void utcp_set_rcvbuf(struct utcp_connection *c, size_t size) {
2028         if(!c) {
2029                 return;
2030         }
2031
2032         c->rcvbuf.maxsize = size;
2033
2034         if(c->rcvbuf.maxsize != size) {
2035                 c->rcvbuf.maxsize = -1;
2036         }
2037 }
2038
2039 size_t utcp_get_sendq(struct utcp_connection *c) {
2040         return c->sndbuf.used;
2041 }
2042
2043 size_t utcp_get_recvq(struct utcp_connection *c) {
2044         return c->rcvbuf.used;
2045 }
2046
2047 bool utcp_get_nodelay(struct utcp_connection *c) {
2048         return c ? c->nodelay : false;
2049 }
2050
2051 void utcp_set_nodelay(struct utcp_connection *c, bool nodelay) {
2052         if(c) {
2053                 c->nodelay = nodelay;
2054         }
2055 }
2056
2057 bool utcp_get_keepalive(struct utcp_connection *c) {
2058         return c ? c->keepalive : false;
2059 }
2060
2061 void utcp_set_keepalive(struct utcp_connection *c, bool keepalive) {
2062         if(c) {
2063                 c->keepalive = keepalive;
2064         }
2065 }
2066
2067 size_t utcp_get_outq(struct utcp_connection *c) {
2068         return c ? seqdiff(c->snd.nxt, c->snd.una) : 0;
2069 }
2070
2071 void utcp_set_recv_cb(struct utcp_connection *c, utcp_recv_t recv) {
2072         if(c) {
2073                 c->recv = recv;
2074         }
2075 }
2076
2077 void utcp_set_poll_cb(struct utcp_connection *c, utcp_poll_t poll) {
2078         if(c) {
2079                 c->poll = poll;
2080         }
2081 }
2082
2083 void utcp_set_accept_cb(struct utcp *utcp, utcp_accept_t accept, utcp_pre_accept_t pre_accept) {
2084         if(utcp) {
2085                 utcp->accept = accept;
2086                 utcp->pre_accept = pre_accept;
2087         }
2088 }
2089
2090 void utcp_expect_data(struct utcp_connection *c, bool expect) {
2091         if(!c || c->reapable) {
2092                 return;
2093         }
2094
2095         if(!(c->state == ESTABLISHED || c->state == FIN_WAIT_1 || c->state == FIN_WAIT_2)) {
2096                 return;
2097         }
2098
2099         if(expect) {
2100                 // If we expect data, start the connection timer.
2101                 if(!timerisset(&c->conn_timeout)) {
2102                         gettimeofday(&c->conn_timeout, NULL);
2103                         c->conn_timeout.tv_sec += c->utcp->timeout;
2104                 }
2105         } else {
2106                 // If we want to cancel expecting data, only clear the timer when there is no unACKed data.
2107                 if(c->snd.una == c->snd.last) {
2108                         timerclear(&c->conn_timeout);
2109                 }
2110         }
2111 }
2112
2113 void utcp_offline(struct utcp *utcp, bool offline) {
2114         struct timeval now;
2115         gettimeofday(&now, NULL);
2116
2117         for(int i = 0; i < utcp->nconnections; i++) {
2118                 struct utcp_connection *c = utcp->connections[i];
2119
2120                 if(c->reapable) {
2121                         continue;
2122                 }
2123
2124                 utcp_expect_data(c, offline);
2125
2126                 if(!offline) {
2127                         if(timerisset(&c->rtrx_timeout)) {
2128                                 c->rtrx_timeout = now;
2129                         }
2130
2131                         utcp->connections[i]->rtt_start.tv_sec = 0;
2132                 }
2133         }
2134
2135         if(!offline && utcp->rto > START_RTO) {
2136                 utcp->rto = START_RTO;
2137         }
2138 }