]> git.meshlink.io Git - utcp/blob - utcp.c
Fix congestion window size after a triplicate ACK
[utcp] / utcp.c
1 /*
2     utcp.c -- Userspace TCP
3     Copyright (C) 2014-2017 Guus Sliepen <guus@tinc-vpn.org>
4
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14
15     You should have received a copy of the GNU General Public License along
16     with this program; if not, write to the Free Software Foundation, Inc.,
17     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20 #define _GNU_SOURCE
21
22 #include <assert.h>
23 #include <errno.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <sys/time.h>
31 #include <sys/socket.h>
32
33 #include "utcp_priv.h"
34
35 #ifndef EBADMSG
36 #define EBADMSG         104
37 #endif
38
39 #ifndef SHUT_RDWR
40 #define SHUT_RDWR 2
41 #endif
42
43 #ifdef poll
44 #undef poll
45 #endif
46
#ifndef timersub
// Fallback for systems whose headers do not provide timersub():
// store a - b in r, borrowing one second when the microsecond part goes negative.
#define timersub(a, b, r)\
	do {\
		(r)->tv_sec = (a)->tv_sec - (b)->tv_sec;\
		(r)->tv_usec = (a)->tv_usec - (b)->tv_usec;\
		if((r)->tv_usec < 0) {\
			(r)->tv_sec--;\
			(r)->tv_usec += USEC_PER_SEC;\
		}\
	} while (0)
#endif
56
// Return the smaller of two sizes.
static inline size_t min(size_t a, size_t b) {
	if(b < a) {
		return b;
	}

	return a;
}
60
// Return the larger of two sizes.
static inline size_t max(size_t a, size_t b) {
	if(b > a) {
		return b;
	}

	return a;
}
64
65 #ifdef UTCP_DEBUG
66 #include <stdarg.h>
67
// Write a printf-style diagnostic message to stderr.
static void debug(const char *format, ...) {
	va_list args;

	va_start(args, format);
	(void)vfprintf(stderr, format, args);
	va_end(args);
}
74
75 static void print_packet(struct utcp *utcp, const char *dir, const void *pkt, size_t len) {
76         struct hdr hdr;
77
78         if(len < sizeof(hdr)) {
79                 debug("%p %s: short packet (%lu bytes)\n", utcp, dir, (unsigned long)len);
80                 return;
81         }
82
83         memcpy(&hdr, pkt, sizeof(hdr));
84         debug("%p %s: len=%lu, src=%u dst=%u seq=%u ack=%u wnd=%u aux=%x ctl=", utcp, dir, (unsigned long)len, hdr.src, hdr.dst, hdr.seq, hdr.ack, hdr.wnd, hdr.aux);
85
86         if(hdr.ctl & SYN) {
87                 debug("SYN");
88         }
89
90         if(hdr.ctl & RST) {
91                 debug("RST");
92         }
93
94         if(hdr.ctl & FIN) {
95                 debug("FIN");
96         }
97
98         if(hdr.ctl & ACK) {
99                 debug("ACK");
100         }
101
102         if(len > sizeof(hdr)) {
103                 uint32_t datalen = len - sizeof(hdr);
104                 const uint8_t *data = (uint8_t *)pkt + sizeof(hdr);
105                 char str[datalen * 2 + 1];
106                 char *p = str;
107
108                 for(uint32_t i = 0; i < datalen; i++) {
109                         *p++ = "0123456789ABCDEF"[data[i] >> 4];
110                         *p++ = "0123456789ABCDEF"[data[i] & 15];
111                 }
112
113                 *p = 0;
114
115                 debug(" data=%s", str);
116         }
117
118         debug("\n");
119 }
120
121 static void debug_cwnd(struct utcp_connection *c) {
122         debug("snd.cwnd = %u\n", c->snd.cwnd);
123 }
124 #else
125 #define debug(...) do {} while(0)
126 #define print_packet(...) do {} while(0)
127 #define debug_cwnd(...) do {} while(0)
128 #endif
129
130 static void set_state(struct utcp_connection *c, enum state state) {
131         c->state = state;
132
133         if(state == ESTABLISHED) {
134                 timerclear(&c->conn_timeout);
135         }
136
137         debug("%p new state: %s\n", c->utcp, strstate[state]);
138 }
139
140 static bool fin_wanted(struct utcp_connection *c, uint32_t seq) {
141         if(seq != c->snd.last) {
142                 return false;
143         }
144
145         switch(c->state) {
146         case FIN_WAIT_1:
147         case CLOSING:
148         case LAST_ACK:
149                 return true;
150
151         default:
152                 return false;
153         }
154 }
155
156 static bool is_reliable(struct utcp_connection *c) {
157         return c->flags & UTCP_RELIABLE;
158 }
159
// Signed distance from sequence number b to sequence number a.
// The subtraction is done modulo 2^32, so the result stays meaningful across wrap-around.
static int32_t seqdiff(uint32_t a, uint32_t b) {
	const uint32_t delta = a - b;

	return (int32_t)delta;
}
163
164 // Buffer functions
165 // TODO: convert to ringbuffers to avoid memmove() operations.
166
167 // Store data into the buffer
168 static ssize_t buffer_put_at(struct buffer *buf, size_t offset, const void *data, size_t len) {
169         debug("buffer_put_at %lu %lu %lu\n", (unsigned long)buf->used, (unsigned long)offset, (unsigned long)len);
170
171         size_t required = offset + len;
172
173         if(required > buf->maxsize) {
174                 if(offset >= buf->maxsize) {
175                         return 0;
176                 }
177
178                 len = buf->maxsize - offset;
179                 required = buf->maxsize;
180         }
181
182         if(required > buf->size) {
183                 size_t newsize = buf->size;
184
185                 if(!newsize) {
186                         newsize = required;
187                 } else {
188                         do {
189                                 newsize *= 2;
190                         } while(newsize < required);
191                 }
192
193                 if(newsize > buf->maxsize) {
194                         newsize = buf->maxsize;
195                 }
196
197                 char *newdata = realloc(buf->data, newsize);
198
199                 if(!newdata) {
200                         return -1;
201                 }
202
203                 buf->data = newdata;
204                 buf->size = newsize;
205         }
206
207         memcpy(buf->data + offset, data, len);
208
209         if(required > buf->used) {
210                 buf->used = required;
211         }
212
213         return len;
214 }
215
216 static ssize_t buffer_put(struct buffer *buf, const void *data, size_t len) {
217         return buffer_put_at(buf, buf->used, data, len);
218 }
219
220 // Get data from the buffer. data can be NULL.
221 static ssize_t buffer_get(struct buffer *buf, void *data, size_t len) {
222         if(len > buf->used) {
223                 len = buf->used;
224         }
225
226         if(data) {
227                 memcpy(data, buf->data, len);
228         }
229
230         if(len < buf->used) {
231                 memmove(buf->data, buf->data + len, buf->used - len);
232         }
233
234         buf->used -= len;
235         return len;
236 }
237
238 // Copy data from the buffer without removing it.
239 static ssize_t buffer_copy(struct buffer *buf, void *data, size_t offset, size_t len) {
240         if(offset >= buf->used) {
241                 return 0;
242         }
243
244         if(offset + len > buf->used) {
245                 len = buf->used - offset;
246         }
247
248         memcpy(data, buf->data + offset, len);
249         return len;
250 }
251
252 static bool buffer_init(struct buffer *buf, uint32_t len, uint32_t maxlen) {
253         memset(buf, 0, sizeof(*buf));
254
255         if(len) {
256                 buf->data = malloc(len);
257
258                 if(!buf->data) {
259                         return false;
260                 }
261         }
262
263         buf->size = len;
264         buf->maxsize = maxlen;
265         return true;
266 }
267
268 static void buffer_exit(struct buffer *buf) {
269         free(buf->data);
270         memset(buf, 0, sizeof(*buf));
271 }
272
273 static uint32_t buffer_free(const struct buffer *buf) {
274         return buf->maxsize - buf->used;
275 }
276
277 // Connections are stored in a sorted list.
278 // This gives O(log(N)) lookup time, O(N log(N)) insertion time and O(N) deletion time.
279
280 static int compare(const void *va, const void *vb) {
281         assert(va && vb);
282
283         const struct utcp_connection *a = *(struct utcp_connection **)va;
284         const struct utcp_connection *b = *(struct utcp_connection **)vb;
285
286         assert(a && b);
287         assert(a->src && b->src);
288
289         int c = (int)a->src - (int)b->src;
290
291         if(c) {
292                 return c;
293         }
294
295         c = (int)a->dst - (int)b->dst;
296         return c;
297 }
298
299 static struct utcp_connection *find_connection(const struct utcp *utcp, uint16_t src, uint16_t dst) {
300         if(!utcp->nconnections) {
301                 return NULL;
302         }
303
304         struct utcp_connection key = {
305                 .src = src,
306                 .dst = dst,
307         }, *keyp = &key;
308         struct utcp_connection **match = bsearch(&keyp, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
309         return match ? *match : NULL;
310 }
311
312 static void free_connection(struct utcp_connection *c) {
313         struct utcp *utcp = c->utcp;
314         struct utcp_connection **cp = bsearch(&c, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
315
316         assert(cp);
317
318         int i = cp - utcp->connections;
319         memmove(cp, cp + 1, (utcp->nconnections - i - 1) * sizeof(*cp));
320         utcp->nconnections--;
321
322         buffer_exit(&c->rcvbuf);
323         buffer_exit(&c->sndbuf);
324         free(c);
325 }
326
327 static struct utcp_connection *allocate_connection(struct utcp *utcp, uint16_t src, uint16_t dst) {
328         // Check whether this combination of src and dst is free
329
330         if(src) {
331                 if(find_connection(utcp, src, dst)) {
332                         errno = EADDRINUSE;
333                         return NULL;
334                 }
335         } else { // If src == 0, generate a random port number with the high bit set
336                 if(utcp->nconnections >= 32767) {
337                         errno = ENOMEM;
338                         return NULL;
339                 }
340
341                 src = rand() | 0x8000;
342
343                 while(find_connection(utcp, src, dst)) {
344                         src++;
345                 }
346         }
347
348         // Allocate memory for the new connection
349
350         if(utcp->nconnections >= utcp->nallocated) {
351                 if(!utcp->nallocated) {
352                         utcp->nallocated = 4;
353                 } else {
354                         utcp->nallocated *= 2;
355                 }
356
357                 struct utcp_connection **new_array = realloc(utcp->connections, utcp->nallocated * sizeof(*utcp->connections));
358
359                 if(!new_array) {
360                         return NULL;
361                 }
362
363                 utcp->connections = new_array;
364         }
365
366         struct utcp_connection *c = calloc(1, sizeof(*c));
367
368         if(!c) {
369                 return NULL;
370         }
371
372         if(!buffer_init(&c->sndbuf, DEFAULT_SNDBUFSIZE, DEFAULT_MAXSNDBUFSIZE)) {
373                 free(c);
374                 return NULL;
375         }
376
377         if(!buffer_init(&c->rcvbuf, DEFAULT_RCVBUFSIZE, DEFAULT_MAXRCVBUFSIZE)) {
378                 buffer_exit(&c->sndbuf);
379                 free(c);
380                 return NULL;
381         }
382
383         // Fill in the details
384
385         c->src = src;
386         c->dst = dst;
387 #ifdef UTCP_DEBUG
388         c->snd.iss = 0;
389 #else
390         c->snd.iss = rand();
391 #endif
392         c->snd.una = c->snd.iss;
393         c->snd.nxt = c->snd.iss + 1;
394         c->snd.last = c->snd.nxt;
395         c->snd.cwnd = (utcp->mtu > 2190 ? 2 : utcp->mtu > 1095 ? 3 : 4) * utcp->mtu;
396         c->snd.ssthresh = ~0;
397         debug_cwnd(c);
398         c->utcp = utcp;
399
400         // Add it to the sorted list of connections
401
402         utcp->connections[utcp->nconnections++] = c;
403         qsort(utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
404
405         return c;
406 }
407
// Return the absolute difference |a - b| of two unsigned values.
static inline uint32_t absdiff(uint32_t a, uint32_t b) {
	return a > b ? a - b : b - a;
}
415
416 // Update RTT variables. See RFC 6298.
417 static void update_rtt(struct utcp_connection *c, uint32_t rtt) {
418         if(!rtt) {
419                 debug("invalid rtt\n");
420                 return;
421         }
422
423         struct utcp *utcp = c->utcp;
424
425         if(!utcp->srtt) {
426                 utcp->srtt = rtt;
427                 utcp->rttvar = rtt / 2;
428         } else {
429                 utcp->rttvar = (utcp->rttvar * 3 + absdiff(utcp->srtt, rtt)) / 4;
430                 utcp->srtt = (utcp->srtt * 7 + rtt) / 8;
431         }
432
433         utcp->rto = utcp->srtt + max(4 * utcp->rttvar, CLOCK_GRANULARITY);
434
435         if(utcp->rto > MAX_RTO) {
436                 utcp->rto = MAX_RTO;
437         }
438
439         debug("rtt %u srtt %u rttvar %u rto %u\n", rtt, utcp->srtt, utcp->rttvar, utcp->rto);
440 }
441
442 static void start_retransmit_timer(struct utcp_connection *c) {
443         gettimeofday(&c->rtrx_timeout, NULL);
444         c->rtrx_timeout.tv_usec += c->utcp->rto;
445
446         while(c->rtrx_timeout.tv_usec >= 1000000) {
447                 c->rtrx_timeout.tv_usec -= 1000000;
448                 c->rtrx_timeout.tv_sec++;
449         }
450
451         debug("timeout set to %lu.%06lu (%u)\n", c->rtrx_timeout.tv_sec, c->rtrx_timeout.tv_usec, c->utcp->rto);
452 }
453
454 static void stop_retransmit_timer(struct utcp_connection *c) {
455         timerclear(&c->rtrx_timeout);
456         debug("timeout cleared\n");
457 }
458
459 struct utcp_connection *utcp_connect_ex(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv, uint32_t flags) {
460         struct utcp_connection *c = allocate_connection(utcp, 0, dst);
461
462         if(!c) {
463                 return NULL;
464         }
465
466         assert((flags & ~0x1f) == 0);
467
468         c->flags = flags;
469         c->recv = recv;
470         c->priv = priv;
471
472         struct {
473                 struct hdr hdr;
474                 uint8_t init[4];
475         } pkt;
476
477         pkt.hdr.src = c->src;
478         pkt.hdr.dst = c->dst;
479         pkt.hdr.seq = c->snd.iss;
480         pkt.hdr.ack = 0;
481         pkt.hdr.wnd = c->rcvbuf.maxsize;
482         pkt.hdr.ctl = SYN;
483         pkt.hdr.aux = 0x0101;
484         pkt.init[0] = 1;
485         pkt.init[1] = 0;
486         pkt.init[2] = 0;
487         pkt.init[3] = flags & 0x7;
488
489         set_state(c, SYN_SENT);
490
491         print_packet(utcp, "send", &pkt, sizeof(pkt));
492         utcp->send(utcp, &pkt, sizeof(pkt));
493
494         gettimeofday(&c->conn_timeout, NULL);
495         c->conn_timeout.tv_sec += utcp->timeout;
496
497         start_retransmit_timer(c);
498
499         return c;
500 }
501
502 struct utcp_connection *utcp_connect(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv) {
503         return utcp_connect_ex(utcp, dst, recv, priv, UTCP_TCP);
504 }
505
506 void utcp_accept(struct utcp_connection *c, utcp_recv_t recv, void *priv) {
507         if(c->reapable || c->state != SYN_RECEIVED) {
508                 debug("Error: accept() called on invalid connection %p in state %s\n", c, strstate[c->state]);
509                 return;
510         }
511
512         debug("%p accepted, %p %p\n", c, recv, priv);
513         c->recv = recv;
514         c->priv = priv;
515         set_state(c, ESTABLISHED);
516 }
517
518 static void ack(struct utcp_connection *c, bool sendatleastone) {
519         int32_t left = seqdiff(c->snd.last, c->snd.nxt);
520         int32_t cwndleft = min(c->snd.cwnd, c->snd.wnd) - seqdiff(c->snd.nxt, c->snd.una);
521
522         assert(left >= 0);
523
524         if(cwndleft <= 0) {
525                 left = 0;
526         } else if(cwndleft < left) {
527                 left = cwndleft;
528
529                 if(!sendatleastone || cwndleft > c->utcp->mtu) {
530                         left -= left % c->utcp->mtu;
531                 }
532         }
533
534         debug("cwndleft = %d, left = %d\n", cwndleft, left);
535
536         if(!left && !sendatleastone) {
537                 return;
538         }
539
540         struct {
541                 struct hdr hdr;
542                 uint8_t data[];
543         } *pkt;
544
545         pkt = malloc(sizeof(pkt->hdr) + c->utcp->mtu);
546
547         if(!pkt) {
548                 return;
549         }
550
551         pkt->hdr.src = c->src;
552         pkt->hdr.dst = c->dst;
553         pkt->hdr.ack = c->rcv.nxt;
554         pkt->hdr.wnd = c->rcvbuf.maxsize;
555         pkt->hdr.ctl = ACK;
556         pkt->hdr.aux = 0;
557
558         do {
559                 uint32_t seglen = left > c->utcp->mtu ? c->utcp->mtu : left;
560                 pkt->hdr.seq = c->snd.nxt;
561
562                 buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);
563
564                 c->snd.nxt += seglen;
565                 left -= seglen;
566
567                 if(seglen && fin_wanted(c, c->snd.nxt)) {
568                         seglen--;
569                         pkt->hdr.ctl |= FIN;
570                 }
571
572                 if(!c->rtt_start.tv_sec) {
573                         // Start RTT measurement
574                         gettimeofday(&c->rtt_start, NULL);
575                         c->rtt_seq = pkt->hdr.seq + seglen;
576                         debug("Starting RTT measurement, expecting ack %u\n", c->rtt_seq);
577                 }
578
579                 print_packet(c->utcp, "send", pkt, sizeof(pkt->hdr) + seglen);
580                 c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
581         } while(left);
582
583         free(pkt);
584 }
585
586 ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
587         if(c->reapable) {
588                 debug("Error: send() called on closed connection %p\n", c);
589                 errno = EBADF;
590                 return -1;
591         }
592
593         switch(c->state) {
594         case CLOSED:
595         case LISTEN:
596                 debug("Error: send() called on unconnected connection %p\n", c);
597                 errno = ENOTCONN;
598                 return -1;
599
600         case SYN_SENT:
601         case SYN_RECEIVED:
602         case ESTABLISHED:
603         case CLOSE_WAIT:
604                 break;
605
606         case FIN_WAIT_1:
607         case FIN_WAIT_2:
608         case CLOSING:
609         case LAST_ACK:
610         case TIME_WAIT:
611                 debug("Error: send() called on closing connection %p\n", c);
612                 errno = EPIPE;
613                 return -1;
614         }
615
616         // Exit early if we have nothing to send.
617
618         if(!len) {
619                 return 0;
620         }
621
622         if(!data) {
623                 errno = EFAULT;
624                 return -1;
625         }
626
627         // Check if we need to be able to buffer all data
628
629         if(c->flags & UTCP_NO_PARTIAL) {
630                 if(len > buffer_free(&c->sndbuf)) {
631                         if(len > c->sndbuf.maxsize) {
632                                 errno = EMSGSIZE;
633                                 return -1;
634                         } else {
635                                 errno = EWOULDBLOCK;
636                                 return 0;
637                         }
638                 }
639         }
640
641         // Add data to send buffer.
642
643         if(is_reliable(c) || (c->state != SYN_SENT && c->state != SYN_RECEIVED)) {
644                 len = buffer_put(&c->sndbuf, data, len);
645         } else {
646                 return 0;
647         }
648
649         if(len <= 0) {
650                 if(is_reliable(c)) {
651                         errno = EWOULDBLOCK;
652                         return 0;
653                 } else {
654                         return len;
655                 }
656         }
657
658         c->snd.last += len;
659
660         // Don't send anything yet if the connection has not fully established yet
661
662         if(c->state == SYN_SENT || c->state == SYN_RECEIVED) {
663                 return len;
664         }
665
666         ack(c, false);
667
668         if(!is_reliable(c)) {
669                 c->snd.una = c->snd.nxt = c->snd.last;
670                 buffer_get(&c->sndbuf, NULL, c->sndbuf.used);
671         }
672
673         if(is_reliable(c) && !timerisset(&c->rtrx_timeout)) {
674                 start_retransmit_timer(c);
675         }
676
677         if(is_reliable(c) && !timerisset(&c->conn_timeout)) {
678                 gettimeofday(&c->conn_timeout, NULL);
679                 c->conn_timeout.tv_sec += c->utcp->timeout;
680         }
681
682         return len;
683 }
684
685 static void swap_ports(struct hdr *hdr) {
686         uint16_t tmp = hdr->src;
687         hdr->src = hdr->dst;
688         hdr->dst = tmp;
689 }
690
691 static void fast_retransmit(struct utcp_connection *c) {
692         if(c->state == CLOSED || c->snd.last == c->snd.una) {
693                 debug("fast_retransmit() called but nothing to retransmit!\n");
694                 return;
695         }
696
697         struct utcp *utcp = c->utcp;
698
699         struct {
700                 struct hdr hdr;
701                 uint8_t data[];
702         } *pkt;
703
704         pkt = malloc(sizeof(pkt->hdr) + c->utcp->mtu);
705
706         if(!pkt) {
707                 return;
708         }
709
710         pkt->hdr.src = c->src;
711         pkt->hdr.dst = c->dst;
712         pkt->hdr.wnd = c->rcvbuf.maxsize;
713         pkt->hdr.aux = 0;
714
715         switch(c->state) {
716         case ESTABLISHED:
717         case FIN_WAIT_1:
718         case CLOSE_WAIT:
719         case CLOSING:
720         case LAST_ACK:
721                 // Send unacked data again.
722                 pkt->hdr.seq = c->snd.una;
723                 pkt->hdr.ack = c->rcv.nxt;
724                 pkt->hdr.ctl = ACK;
725                 uint32_t len = min(seqdiff(c->snd.last, c->snd.una), utcp->mtu);
726
727                 if(fin_wanted(c, c->snd.una + len)) {
728                         len--;
729                         pkt->hdr.ctl |= FIN;
730                 }
731
732                 buffer_copy(&c->sndbuf, pkt->data, 0, len);
733                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr) + len);
734                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
735                 break;
736
737         default:
738                 break;
739         }
740
741         free(pkt);
742 }
743
744 static void retransmit(struct utcp_connection *c) {
745         if(c->state == CLOSED || c->snd.last == c->snd.una) {
746                 debug("Retransmit() called but nothing to retransmit!\n");
747                 stop_retransmit_timer(c);
748                 return;
749         }
750
751         struct utcp *utcp = c->utcp;
752
753         struct {
754                 struct hdr hdr;
755                 uint8_t data[];
756         } *pkt;
757
758         pkt = malloc(sizeof(pkt->hdr) + c->utcp->mtu);
759
760         if(!pkt) {
761                 return;
762         }
763
764         pkt->hdr.src = c->src;
765         pkt->hdr.dst = c->dst;
766         pkt->hdr.wnd = c->rcvbuf.maxsize;
767         pkt->hdr.aux = 0;
768
769         switch(c->state) {
770         case SYN_SENT:
771                 // Send our SYN again
772                 pkt->hdr.seq = c->snd.iss;
773                 pkt->hdr.ack = 0;
774                 pkt->hdr.ctl = SYN;
775                 pkt->hdr.aux = 0x0101;
776                 pkt->data[0] = 1;
777                 pkt->data[1] = 0;
778                 pkt->data[2] = 0;
779                 pkt->data[3] = c->flags & 0x7;
780                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr) + 4);
781                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + 4);
782                 break;
783
784         case SYN_RECEIVED:
785                 // Send SYNACK again
786                 pkt->hdr.seq = c->snd.nxt;
787                 pkt->hdr.ack = c->rcv.nxt;
788                 pkt->hdr.ctl = SYN | ACK;
789                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr));
790                 utcp->send(utcp, pkt, sizeof(pkt->hdr));
791                 break;
792
793         case ESTABLISHED:
794         case FIN_WAIT_1:
795         case CLOSE_WAIT:
796         case CLOSING:
797         case LAST_ACK:
798                 // Send unacked data again.
799                 pkt->hdr.seq = c->snd.una;
800                 pkt->hdr.ack = c->rcv.nxt;
801                 pkt->hdr.ctl = ACK;
802                 uint32_t len = seqdiff(c->snd.last, c->snd.una);
803
804                 if(len > utcp->mtu) {
805                         len = utcp->mtu;
806                 }
807
808                 if(fin_wanted(c, c->snd.una + len)) {
809                         len--;
810                         pkt->hdr.ctl |= FIN;
811                 }
812
813                 c->snd.nxt = c->snd.una + len;
814
815                 // RFC 5681 slow start after timeout
816                 c->snd.ssthresh = max(c->snd.cwnd / 2, utcp->mtu * 2); // eq. 4
817                 c->snd.cwnd = utcp->mtu;
818                 debug_cwnd(c);
819
820                 buffer_copy(&c->sndbuf, pkt->data, 0, len);
821                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr) + len);
822                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
823                 break;
824
825         case CLOSED:
826         case LISTEN:
827         case TIME_WAIT:
828         case FIN_WAIT_2:
829                 // We shouldn't need to retransmit anything in this state.
830 #ifdef UTCP_DEBUG
831                 abort();
832 #endif
833                 stop_retransmit_timer(c);
834                 goto cleanup;
835         }
836
837         start_retransmit_timer(c);
838         utcp->rto *= 2;
839
840         if(utcp->rto > MAX_RTO) {
841                 utcp->rto = MAX_RTO;
842         }
843
844         c->rtt_start.tv_sec = 0; // invalidate RTT timer
845
846 cleanup:
847         free(pkt);
848 }
849
850 /* Update receive buffer and SACK entries after consuming data.
851  *
852  * Situation:
853  *
854  * |.....0000..1111111111.....22222......3333|
855  * |---------------^
856  *
857  * 0..3 represent the SACK entries. The ^ indicates up to which point we want
858  * to remove data from the receive buffer. The idea is to substract "len"
859  * from the offset of all the SACK entries, and then remove/cut down entries
860  * that are shifted to before the start of the receive buffer.
861  *
862  * There are three cases:
863  * - the SACK entry is after ^, in that case just change the offset.
864  * - the SACK entry starts before and ends after ^, so we have to
865  *   change both its offset and size.
866  * - the SACK entry is completely before ^, in that case delete it.
867  */
868 static void sack_consume(struct utcp_connection *c, size_t len) {
869         debug("sack_consume %lu\n", (unsigned long)len);
870
871         if(len > c->rcvbuf.used) {
872                 debug("All SACK entries consumed");
873                 c->sacks[0].len = 0;
874                 return;
875         }
876
877         buffer_get(&c->rcvbuf, NULL, len);
878
879         for(int i = 0; i < NSACKS && c->sacks[i].len;) {
880                 if(len < c->sacks[i].offset) {
881                         c->sacks[i].offset -= len;
882                         i++;
883                 } else if(len < c->sacks[i].offset + c->sacks[i].len) {
884                         c->sacks[i].len -= len - c->sacks[i].offset;
885                         c->sacks[i].offset = 0;
886                         i++;
887                 } else {
888                         if(i < NSACKS - 1) {
889                                 memmove(&c->sacks[i], &c->sacks[i + 1], (NSACKS - 1 - i) * sizeof(c->sacks)[i]);
890                                 c->sacks[NSACKS - 1].len = 0;
891                         } else {
892                                 c->sacks[i].len = 0;
893                                 break;
894                         }
895                 }
896         }
897
898         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
899                 debug("SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
900         }
901 }
902
903 static void handle_out_of_order(struct utcp_connection *c, uint32_t offset, const void *data, size_t len) {
904         debug("out of order packet, offset %u\n", offset);
905         // Packet loss or reordering occured. Store the data in the buffer.
906         ssize_t rxd = buffer_put_at(&c->rcvbuf, offset, data, len);
907
908         if(rxd < 0 || (size_t)rxd < len) {
909                 abort();
910         }
911
912         // Make note of where we put it.
913         for(int i = 0; i < NSACKS; i++) {
914                 if(!c->sacks[i].len) { // nothing to merge, add new entry
915                         debug("New SACK entry %d\n", i);
916                         c->sacks[i].offset = offset;
917                         c->sacks[i].len = rxd;
918                         break;
919                 } else if(offset < c->sacks[i].offset) {
920                         if(offset + rxd < c->sacks[i].offset) { // insert before
921                                 if(!c->sacks[NSACKS - 1].len) { // only if room left
922                                         debug("Insert SACK entry at %d\n", i);
923                                         memmove(&c->sacks[i + 1], &c->sacks[i], (NSACKS - i - 1) * sizeof(c->sacks)[i]);
924                                         c->sacks[i].offset = offset;
925                                         c->sacks[i].len = rxd;
926                                 } else {
927                                         debug("SACK entries full, dropping packet\n");
928                                 }
929
930                                 break;
931                         } else { // merge
932                                 debug("Merge with start of SACK entry at %d\n", i);
933                                 c->sacks[i].offset = offset;
934                                 break;
935                         }
936                 } else if(offset <= c->sacks[i].offset + c->sacks[i].len) {
937                         if(offset + rxd > c->sacks[i].offset + c->sacks[i].len) { // merge
938                                 debug("Merge with end of SACK entry at %d\n", i);
939                                 c->sacks[i].len = offset + rxd - c->sacks[i].offset;
940                                 // TODO: handle potential merge with next entry
941                         }
942
943                         break;
944                 }
945         }
946
947         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
948                 debug("SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
949         }
950 }
951
952 static void handle_in_order(struct utcp_connection *c, const void *data, size_t len) {
953         // Check if we can process out-of-order data now.
954         if(c->sacks[0].len && len >= c->sacks[0].offset) { // TODO: handle overlap with second SACK
955                 debug("incoming packet len %lu connected with SACK at %u\n", (unsigned long)len, c->sacks[0].offset);
956                 buffer_put_at(&c->rcvbuf, 0, data, len); // TODO: handle return value
957                 len = max(len, c->sacks[0].offset + c->sacks[0].len);
958                 data = c->rcvbuf.data;
959         }
960
961         if(c->recv) {
962                 ssize_t rxd = c->recv(c, data, len);
963
964                 if(rxd < 0 || (size_t)rxd != len) {
965                         // TODO: handle the application not accepting all data.
966                         abort();
967                 }
968         }
969
970         if(c->rcvbuf.used) {
971                 sack_consume(c, len);
972         }
973
974         c->rcv.nxt += len;
975 }
976
977
978 static void handle_incoming_data(struct utcp_connection *c, uint32_t seq, const void *data, size_t len) {
979         if(!is_reliable(c)) {
980                 c->recv(c, data, len);
981                 c->rcv.nxt = seq + len;
982                 return;
983         }
984
985         uint32_t offset = seqdiff(seq, c->rcv.nxt);
986
987         if(offset + len > c->rcvbuf.maxsize) {
988                 abort();
989         }
990
991         if(offset) {
992                 handle_out_of_order(c, offset, data, len);
993         } else {
994                 handle_in_order(c, data, len);
995         }
996 }
997
998
999 ssize_t utcp_recv(struct utcp *utcp, const void *data, size_t len) {
1000         const uint8_t *ptr = data;
1001
1002         if(!utcp) {
1003                 errno = EFAULT;
1004                 return -1;
1005         }
1006
1007         if(!len) {
1008                 return 0;
1009         }
1010
1011         if(!data) {
1012                 errno = EFAULT;
1013                 return -1;
1014         }
1015
1016         print_packet(utcp, "recv", data, len);
1017
1018         // Drop packets smaller than the header
1019
1020         struct hdr hdr;
1021
1022         if(len < sizeof(hdr)) {
1023                 errno = EBADMSG;
1024                 return -1;
1025         }
1026
1027         // Make a copy from the potentially unaligned data to a struct hdr
1028
1029         memcpy(&hdr, ptr, sizeof(hdr));
1030         ptr += sizeof(hdr);
1031         len -= sizeof(hdr);
1032
1033         // Drop packets with an unknown CTL flag
1034
1035         if(hdr.ctl & ~(SYN | ACK | RST | FIN)) {
1036                 errno = EBADMSG;
1037                 return -1;
1038         }
1039
1040         // Check for auxiliary headers
1041
1042         const uint8_t *init = NULL;
1043
1044         uint16_t aux = hdr.aux;
1045
1046         while(aux) {
1047                 size_t auxlen = 4 * (aux >> 8) & 0xf;
1048                 uint8_t auxtype = aux & 0xff;
1049
1050                 if(len < auxlen) {
1051                         errno = EBADMSG;
1052                         return -1;
1053                 }
1054
1055                 switch(auxtype) {
1056                 case AUX_INIT:
1057                         if(!(hdr.ctl & SYN) || auxlen != 4) {
1058                                 errno = EBADMSG;
1059                                 return -1;
1060                         }
1061
1062                         init = ptr;
1063                         break;
1064
1065                 default:
1066                         errno = EBADMSG;
1067                         return -1;
1068                 }
1069
1070                 len -= auxlen;
1071                 ptr += auxlen;
1072
1073                 if(!(aux & 0x800)) {
1074                         break;
1075                 }
1076
1077                 if(len < 2) {
1078                         errno = EBADMSG;
1079                         return -1;
1080                 }
1081
1082                 memcpy(&aux, ptr, 2);
1083                 len -= 2;
1084                 ptr += 2;
1085         }
1086
1087         bool has_data = len || (hdr.ctl & (SYN | FIN));
1088
1089         // Try to match the packet to an existing connection
1090
1091         struct utcp_connection *c = find_connection(utcp, hdr.dst, hdr.src);
1092
1093         // Is it for a new connection?
1094
1095         if(!c) {
1096                 // Ignore RST packets
1097
1098                 if(hdr.ctl & RST) {
1099                         return 0;
1100                 }
1101
1102                 // Is it a SYN packet and are we LISTENing?
1103
1104                 if(hdr.ctl & SYN && !(hdr.ctl & ACK) && utcp->accept) {
1105                         // If we don't want to accept it, send a RST back
1106                         if((utcp->pre_accept && !utcp->pre_accept(utcp, hdr.dst))) {
1107                                 len = 1;
1108                                 goto reset;
1109                         }
1110
1111                         // Try to allocate memory, otherwise send a RST back
1112                         c = allocate_connection(utcp, hdr.dst, hdr.src);
1113
1114                         if(!c) {
1115                                 len = 1;
1116                                 goto reset;
1117                         }
1118
1119                         // Parse auxilliary information
1120                         if(init) {
1121                                 if(init[0] < 1) {
1122                                         len = 1;
1123                                         goto reset;
1124                                 }
1125
1126                                 c->flags = init[3] & 0x7;
1127                         } else {
1128                                 c->flags = UTCP_TCP;
1129                         }
1130
1131 synack:
1132                         // Return SYN+ACK, go to SYN_RECEIVED state
1133                         c->snd.wnd = hdr.wnd;
1134                         c->rcv.irs = hdr.seq;
1135                         c->rcv.nxt = c->rcv.irs + 1;
1136                         set_state(c, SYN_RECEIVED);
1137
1138                         struct {
1139                                 struct hdr hdr;
1140                                 uint8_t data[4];
1141                         } pkt;
1142
1143                         pkt.hdr.src = c->src;
1144                         pkt.hdr.dst = c->dst;
1145                         pkt.hdr.ack = c->rcv.irs + 1;
1146                         pkt.hdr.seq = c->snd.iss;
1147                         pkt.hdr.wnd = c->rcvbuf.maxsize;
1148                         pkt.hdr.ctl = SYN | ACK;
1149
1150                         if(init) {
1151                                 pkt.hdr.aux = 0x0101;
1152                                 pkt.data[0] = 1;
1153                                 pkt.data[1] = 0;
1154                                 pkt.data[2] = 0;
1155                                 pkt.data[3] = c->flags & 0x7;
1156                                 print_packet(c->utcp, "send", &pkt, sizeof(hdr) + 4);
1157                                 utcp->send(utcp, &pkt, sizeof(hdr) + 4);
1158                         } else {
1159                                 pkt.hdr.aux = 0;
1160                                 print_packet(c->utcp, "send", &pkt, sizeof(hdr));
1161                                 utcp->send(utcp, &pkt, sizeof(hdr));
1162                         }
1163                 } else {
1164                         // No, we don't want your packets, send a RST back
1165                         len = 1;
1166                         goto reset;
1167                 }
1168
1169                 return 0;
1170         }
1171
1172         debug("%p state %s\n", c->utcp, strstate[c->state]);
1173
1174         // In case this is for a CLOSED connection, ignore the packet.
1175         // TODO: make it so incoming packets can never match a CLOSED connection.
1176
1177         if(c->state == CLOSED) {
1178                 debug("Got packet for closed connection\n");
1179                 return 0;
1180         }
1181
1182         // It is for an existing connection.
1183
1184         // 1. Drop invalid packets.
1185
1186         // 1a. Drop packets that should not happen in our current state.
1187
1188         switch(c->state) {
1189         case SYN_SENT:
1190         case SYN_RECEIVED:
1191         case ESTABLISHED:
1192         case FIN_WAIT_1:
1193         case FIN_WAIT_2:
1194         case CLOSE_WAIT:
1195         case CLOSING:
1196         case LAST_ACK:
1197         case TIME_WAIT:
1198                 break;
1199
1200         default:
1201 #ifdef UTCP_DEBUG
1202                 abort();
1203 #endif
1204                 break;
1205         }
1206
1207         // 1b. Discard data that is not in our receive window.
1208
1209         if(is_reliable(c)) {
1210                 bool acceptable;
1211
1212                 if(c->state == SYN_SENT) {
1213                         acceptable = true;
1214                 } else if(len == 0) {
1215                         acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0;
1216                 } else {
1217                         int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);
1218
1219                         // cut already accepted front overlapping
1220                         if(rcv_offset < 0) {
1221                                 acceptable = len > (size_t) - rcv_offset;
1222
1223                                 if(acceptable) {
1224                                         ptr -= rcv_offset;
1225                                         len += rcv_offset;
1226                                         hdr.seq -= rcv_offset;
1227                                 }
1228                         } else {
1229                                 acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0 && seqdiff(hdr.seq, c->rcv.nxt) + len <= c->rcvbuf.maxsize;
1230                         }
1231                 }
1232
1233                 if(!acceptable) {
1234                         debug("Packet not acceptable, %u <= %u + %lu < %u\n", c->rcv.nxt, hdr.seq, (unsigned long)len, c->rcv.nxt + c->rcvbuf.maxsize);
1235
1236                         // Ignore unacceptable RST packets.
1237                         if(hdr.ctl & RST) {
1238                                 return 0;
1239                         }
1240
1241                         // Otherwise, continue processing.
1242                         len = 0;
1243                 }
1244         }
1245
1246         c->snd.wnd = hdr.wnd; // TODO: move below
1247
1248         // 1c. Drop packets with an invalid ACK.
1249         // ackno should not roll back, and it should also not be bigger than what we ever could have sent
1250         // (= snd.una + c->sndbuf.used).
1251
1252         if(!is_reliable(c)) {
1253                 if(hdr.ack != c->snd.last && c->state >= ESTABLISHED) {
1254                         hdr.ack = c->snd.una;
1255                 }
1256         }
1257
1258         if(hdr.ctl & ACK && (seqdiff(hdr.ack, c->snd.last) > 0 || seqdiff(hdr.ack, c->snd.una) < 0)) {
1259                 debug("Packet ack seqno out of range, %u <= %u < %u\n", c->snd.una, hdr.ack, c->snd.una + c->sndbuf.used);
1260
1261                 // Ignore unacceptable RST packets.
1262                 if(hdr.ctl & RST) {
1263                         return 0;
1264                 }
1265
1266                 goto reset;
1267         }
1268
1269         // 2. Handle RST packets
1270
1271         if(hdr.ctl & RST) {
1272                 switch(c->state) {
1273                 case SYN_SENT:
1274                         if(!(hdr.ctl & ACK)) {
1275                                 return 0;
1276                         }
1277
1278                         // The peer has refused our connection.
1279                         set_state(c, CLOSED);
1280                         errno = ECONNREFUSED;
1281
1282                         if(c->recv) {
1283                                 c->recv(c, NULL, 0);
1284                         }
1285
1286                         if(c->poll && !c->reapable) {
1287                                 c->poll(c, 0);
1288                         }
1289
1290                         return 0;
1291
1292                 case SYN_RECEIVED:
1293                         if(hdr.ctl & ACK) {
1294                                 return 0;
1295                         }
1296
1297                         // We haven't told the application about this connection yet. Silently delete.
1298                         free_connection(c);
1299                         return 0;
1300
1301                 case ESTABLISHED:
1302                 case FIN_WAIT_1:
1303                 case FIN_WAIT_2:
1304                 case CLOSE_WAIT:
1305                         if(hdr.ctl & ACK) {
1306                                 return 0;
1307                         }
1308
1309                         // The peer has aborted our connection.
1310                         set_state(c, CLOSED);
1311                         errno = ECONNRESET;
1312
1313                         if(c->recv) {
1314                                 c->recv(c, NULL, 0);
1315                         }
1316
1317                         if(c->poll && !c->reapable) {
1318                                 c->poll(c, 0);
1319                         }
1320
1321                         return 0;
1322
1323                 case CLOSING:
1324                 case LAST_ACK:
1325                 case TIME_WAIT:
1326                         if(hdr.ctl & ACK) {
1327                                 return 0;
1328                         }
1329
1330                         // As far as the application is concerned, the connection has already been closed.
1331                         // If it has called utcp_close() already, we can immediately free this connection.
1332                         if(c->reapable) {
1333                                 free_connection(c);
1334                                 return 0;
1335                         }
1336
1337                         // Otherwise, immediately move to the CLOSED state.
1338                         set_state(c, CLOSED);
1339                         return 0;
1340
1341                 default:
1342 #ifdef UTCP_DEBUG
1343                         abort();
1344 #endif
1345                         break;
1346                 }
1347         }
1348
1349         uint32_t advanced;
1350
1351         if(!(hdr.ctl & ACK)) {
1352                 advanced = 0;
1353                 goto skip_ack;
1354         }
1355
1356         // 3. Advance snd.una
1357
1358         advanced = seqdiff(hdr.ack, c->snd.una);
1359
1360         if(advanced) {
1361                 // RTT measurement
1362                 if(c->rtt_start.tv_sec) {
1363                         if(c->rtt_seq == hdr.ack) {
1364                                 struct timeval now, diff;
1365                                 gettimeofday(&now, NULL);
1366                                 timersub(&now, &c->rtt_start, &diff);
1367                                 update_rtt(c, diff.tv_sec * 1000000 + diff.tv_usec);
1368                                 c->rtt_start.tv_sec = 0;
1369                         } else if(c->rtt_seq < hdr.ack) {
1370                                 debug("Cancelling RTT measurement: %u < %u\n", c->rtt_seq, hdr.ack);
1371                                 c->rtt_start.tv_sec = 0;
1372                         }
1373                 }
1374
1375                 int32_t data_acked = advanced;
1376
1377                 switch(c->state) {
1378                 case SYN_SENT:
1379                 case SYN_RECEIVED:
1380                         data_acked--;
1381                         break;
1382
1383                 // TODO: handle FIN as well.
1384                 default:
1385                         break;
1386                 }
1387
1388                 assert(data_acked >= 0);
1389
1390 #ifndef NDEBUG
1391                 int32_t bufused = seqdiff(c->snd.last, c->snd.una);
1392                 assert(data_acked <= bufused);
1393 #endif
1394
1395                 if(data_acked) {
1396                         buffer_get(&c->sndbuf, NULL, data_acked);
1397                 }
1398
1399                 // Also advance snd.nxt if possible
1400                 if(seqdiff(c->snd.nxt, hdr.ack) < 0) {
1401                         c->snd.nxt = hdr.ack;
1402                 }
1403
1404                 c->snd.una = hdr.ack;
1405
1406                 if(c->dupack) {
1407                         if(c->dupack >= 3) {
1408                                 c->snd.cwnd = c->snd.ssthresh;
1409                         }
1410
1411                         c->dupack = 0;
1412                 }
1413
1414                 // Increase the congestion window according to RFC 5681
1415                 if(c->snd.cwnd < c->snd.ssthresh) {
1416                         c->snd.cwnd += min(advanced, utcp->mtu); // eq. 2
1417                 } else {
1418                         c->snd.cwnd += max(1, (utcp->mtu * utcp->mtu) / c->snd.cwnd); // eq. 3
1419                 }
1420
1421                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1422                         c->snd.cwnd = c->sndbuf.maxsize;
1423                 }
1424
1425                 debug_cwnd(c);
1426
1427                 // Check if we have sent a FIN that is now ACKed.
1428                 switch(c->state) {
1429                 case FIN_WAIT_1:
1430                         if(c->snd.una == c->snd.last) {
1431                                 set_state(c, FIN_WAIT_2);
1432                         }
1433
1434                         break;
1435
1436                 case CLOSING:
1437                         if(c->snd.una == c->snd.last) {
1438                                 gettimeofday(&c->conn_timeout, NULL);
1439                                 c->conn_timeout.tv_sec += utcp->timeout;
1440                                 set_state(c, TIME_WAIT);
1441                         }
1442
1443                         break;
1444
1445                 default:
1446                         break;
1447                 }
1448         } else {
1449                 if(!len && is_reliable(c)) {
1450                         c->dupack++;
1451
1452                         if(c->dupack == 3) {
1453                                 debug("Triplicate ACK\n");
1454
1455                                 // RFC 5681 fast recovery
1456                                 c->snd.ssthresh = max(c->snd.cwnd / 2, utcp->mtu * 2); // eq. 4
1457                                 c->snd.cwnd = min(c->snd.ssthresh + 3 * utcp->mtu, c->sndbuf.maxsize);
1458
1459                                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1460                                         c->snd.cwnd = c->sndbuf.maxsize;
1461                                 }
1462
1463                                 debug_cwnd(c);
1464
1465                                 fast_retransmit(c);
1466                         } else if(c->dupack > 3) {
1467                                 c->snd.cwnd += utcp->mtu;
1468
1469                                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1470                                         c->snd.cwnd = c->sndbuf.maxsize;
1471                                 }
1472
1473                                 debug_cwnd(c);
1474                         }
1475                 }
1476         }
1477
1478         // 4. Update timers
1479
1480         if(advanced) {
1481                 if(c->snd.una == c->snd.last) {
1482                         stop_retransmit_timer(c);
1483                         timerclear(&c->conn_timeout);
1484                 } else if(is_reliable(c)) {
1485                         start_retransmit_timer(c);
1486                         gettimeofday(&c->conn_timeout, NULL);
1487                         c->conn_timeout.tv_sec += utcp->timeout;
1488                 }
1489         }
1490
1491 skip_ack:
1492         // 5. Process SYN stuff
1493
1494         if(hdr.ctl & SYN) {
1495                 switch(c->state) {
1496                 case SYN_SENT:
1497
1498                         // This is a SYNACK. It should always have ACKed the SYN.
1499                         if(!advanced) {
1500                                 goto reset;
1501                         }
1502
1503                         c->rcv.irs = hdr.seq;
1504                         c->rcv.nxt = hdr.seq;
1505
1506                         if(c->shut_wr) {
1507                                 c->snd.last++;
1508                                 set_state(c, FIN_WAIT_1);
1509                         } else {
1510                                 set_state(c, ESTABLISHED);
1511                         }
1512
1513                         // TODO: notify application of this somehow.
1514                         break;
1515
1516                 case SYN_RECEIVED:
1517                         // This is a retransmit of a SYN, send back the SYNACK.
1518                         goto synack;
1519
1520                 case ESTABLISHED:
1521                 case FIN_WAIT_1:
1522                 case FIN_WAIT_2:
1523                 case CLOSE_WAIT:
1524                 case CLOSING:
1525                 case LAST_ACK:
1526                 case TIME_WAIT:
1527                         // Ehm, no. We should never receive a second SYN.
1528                         return 0;
1529
1530                 default:
1531 #ifdef UTCP_DEBUG
1532                         abort();
1533 #endif
1534                         return 0;
1535                 }
1536
1537                 // SYN counts as one sequence number
1538                 c->rcv.nxt++;
1539         }
1540
1541         // 6. Process new data
1542
1543         if(c->state == SYN_RECEIVED) {
1544                 // This is the ACK after the SYNACK. It should always have ACKed the SYNACK.
1545                 if(!advanced) {
1546                         goto reset;
1547                 }
1548
1549                 // Are we still LISTENing?
1550                 if(utcp->accept) {
1551                         utcp->accept(c, c->src);
1552                 }
1553
1554                 if(c->state != ESTABLISHED) {
1555                         set_state(c, CLOSED);
1556                         c->reapable = true;
1557                         goto reset;
1558                 }
1559         }
1560
1561         if(len) {
1562                 switch(c->state) {
1563                 case SYN_SENT:
1564                 case SYN_RECEIVED:
1565                         // This should never happen.
1566 #ifdef UTCP_DEBUG
1567                         abort();
1568 #endif
1569                         return 0;
1570
1571                 case ESTABLISHED:
1572                 case FIN_WAIT_1:
1573                 case FIN_WAIT_2:
1574                         break;
1575
1576                 case CLOSE_WAIT:
1577                 case CLOSING:
1578                 case LAST_ACK:
1579                 case TIME_WAIT:
1580                         // Ehm no, We should never receive more data after a FIN.
1581                         goto reset;
1582
1583                 default:
1584 #ifdef UTCP_DEBUG
1585                         abort();
1586 #endif
1587                         return 0;
1588                 }
1589
1590                 handle_incoming_data(c, hdr.seq, ptr, len);
1591         }
1592
1593         // 7. Process FIN stuff
1594
1595         if((hdr.ctl & FIN) && (!is_reliable(c) || hdr.seq + len == c->rcv.nxt)) {
1596                 switch(c->state) {
1597                 case SYN_SENT:
1598                 case SYN_RECEIVED:
1599                         // This should never happen.
1600 #ifdef UTCP_DEBUG
1601                         abort();
1602 #endif
1603                         break;
1604
1605                 case ESTABLISHED:
1606                         set_state(c, CLOSE_WAIT);
1607                         break;
1608
1609                 case FIN_WAIT_1:
1610                         set_state(c, CLOSING);
1611                         break;
1612
1613                 case FIN_WAIT_2:
1614                         gettimeofday(&c->conn_timeout, NULL);
1615                         c->conn_timeout.tv_sec += utcp->timeout;
1616                         set_state(c, TIME_WAIT);
1617                         break;
1618
1619                 case CLOSE_WAIT:
1620                 case CLOSING:
1621                 case LAST_ACK:
1622                 case TIME_WAIT:
1623                         // Ehm, no. We should never receive a second FIN.
1624                         goto reset;
1625
1626                 default:
1627 #ifdef UTCP_DEBUG
1628                         abort();
1629 #endif
1630                         break;
1631                 }
1632
1633                 // FIN counts as one sequence number
1634                 c->rcv.nxt++;
1635                 len++;
1636
1637                 // Inform the application that the peer closed its end of the connection.
1638                 if(c->recv) {
1639                         errno = 0;
1640                         c->recv(c, NULL, 0);
1641                 }
1642         }
1643
1644         // Now we send something back if:
1645         // - we received data, so we have to send back an ACK
1646         //   -> sendatleastone = true
1647         // - or we got an ack, so we should maybe send a bit more data
1648         //   -> sendatleastone = false
1649
1650         if(is_reliable(c) || hdr.ctl & SYN || hdr.ctl & FIN) {
1651                 ack(c, has_data);
1652         }
1653
1654         return 0;
1655
1656 reset:
1657         swap_ports(&hdr);
1658         hdr.wnd = 0;
1659         hdr.aux = 0;
1660
1661         if(hdr.ctl & ACK) {
1662                 hdr.seq = hdr.ack;
1663                 hdr.ctl = RST;
1664         } else {
1665                 hdr.ack = hdr.seq + len;
1666                 hdr.seq = 0;
1667                 hdr.ctl = RST | ACK;
1668         }
1669
1670         print_packet(utcp, "send", &hdr, sizeof(hdr));
1671         utcp->send(utcp, &hdr, sizeof(hdr));
1672         return 0;
1673
1674 }
1675
1676 int utcp_shutdown(struct utcp_connection *c, int dir) {
1677         debug("%p shutdown %d at %u\n", c ? c->utcp : NULL, dir, c ? c->snd.last : 0);
1678
1679         if(!c) {
1680                 errno = EFAULT;
1681                 return -1;
1682         }
1683
1684         if(c->reapable) {
1685                 debug("Error: shutdown() called on closed connection %p\n", c);
1686                 errno = EBADF;
1687                 return -1;
1688         }
1689
1690         if(!(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_WR || dir == UTCP_SHUT_RDWR)) {
1691                 errno = EINVAL;
1692                 return -1;
1693         }
1694
1695         // TCP does not have a provision for stopping incoming packets.
1696         // The best we can do is to just ignore them.
1697         if(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_RDWR) {
1698                 c->recv = NULL;
1699         }
1700
1701         // The rest of the code deals with shutting down writes.
1702         if(dir == UTCP_SHUT_RD) {
1703                 return 0;
1704         }
1705
1706         // Only process shutting down writes once.
1707         if(c->shut_wr) {
1708                 return 0;
1709         }
1710
1711         c->shut_wr = true;
1712
1713         switch(c->state) {
1714         case CLOSED:
1715         case LISTEN:
1716                 errno = ENOTCONN;
1717                 return -1;
1718
1719         case SYN_SENT:
1720                 return 0;
1721
1722         case SYN_RECEIVED:
1723         case ESTABLISHED:
1724                 set_state(c, FIN_WAIT_1);
1725                 break;
1726
1727         case FIN_WAIT_1:
1728         case FIN_WAIT_2:
1729                 return 0;
1730
1731         case CLOSE_WAIT:
1732                 set_state(c, CLOSING);
1733                 break;
1734
1735         case CLOSING:
1736         case LAST_ACK:
1737         case TIME_WAIT:
1738                 return 0;
1739         }
1740
1741         c->snd.last++;
1742
1743         ack(c, false);
1744
1745         if(!timerisset(&c->rtrx_timeout)) {
1746                 start_retransmit_timer(c);
1747         }
1748
1749         return 0;
1750 }
1751
1752 static bool reset_connection(struct utcp_connection *c) {
1753         if(!c) {
1754                 errno = EFAULT;
1755                 return false;
1756         }
1757
1758         if(c->reapable) {
1759                 debug("Error: abort() called on closed connection %p\n", c);
1760                 errno = EBADF;
1761                 return false;
1762         }
1763
1764         c->recv = NULL;
1765         c->poll = NULL;
1766
1767         switch(c->state) {
1768         case CLOSED:
1769                 return true;
1770
1771         case LISTEN:
1772         case SYN_SENT:
1773         case CLOSING:
1774         case LAST_ACK:
1775         case TIME_WAIT:
1776                 set_state(c, CLOSED);
1777                 return true;
1778
1779         case SYN_RECEIVED:
1780         case ESTABLISHED:
1781         case FIN_WAIT_1:
1782         case FIN_WAIT_2:
1783         case CLOSE_WAIT:
1784                 set_state(c, CLOSED);
1785                 break;
1786         }
1787
1788         // Send RST
1789
1790         struct hdr hdr;
1791
1792         hdr.src = c->src;
1793         hdr.dst = c->dst;
1794         hdr.seq = c->snd.nxt;
1795         hdr.ack = 0;
1796         hdr.wnd = 0;
1797         hdr.ctl = RST;
1798
1799         print_packet(c->utcp, "send", &hdr, sizeof(hdr));
1800         c->utcp->send(c->utcp, &hdr, sizeof(hdr));
1801         return true;
1802 }
1803
1804 // Closes all the opened connections
1805 void utcp_abort_all_connections(struct utcp *utcp) {
1806         if(!utcp) {
1807                 errno = EINVAL;
1808                 return;
1809         }
1810
1811         for(int i = 0; i < utcp->nconnections; i++) {
1812                 struct utcp_connection *c = utcp->connections[i];
1813
1814                 if(c->reapable || c->state == CLOSED) {
1815                         continue;
1816                 }
1817
1818                 utcp_recv_t old_recv = c->recv;
1819                 utcp_poll_t old_poll = c->poll;
1820
1821                 reset_connection(c);
1822
1823                 if(old_recv) {
1824                         errno = 0;
1825                         old_recv(c, NULL, 0);
1826                 }
1827
1828                 if(old_poll && !c->reapable) {
1829                         errno = 0;
1830                         old_poll(c, 0);
1831                 }
1832         }
1833
1834         return;
1835 }
1836
1837 int utcp_close(struct utcp_connection *c) {
1838         if(utcp_shutdown(c, SHUT_RDWR) && errno != ENOTCONN) {
1839                 return -1;
1840         }
1841
1842         c->recv = NULL;
1843         c->poll = NULL;
1844         c->reapable = true;
1845         return 0;
1846 }
1847
1848 int utcp_abort(struct utcp_connection *c) {
1849         if(!reset_connection(c)) {
1850                 return -1;
1851         }
1852
1853         c->reapable = true;
1854         return 0;
1855 }
1856
/* Handle timeouts.
 * One call to this function will loop through all connections,
 * checking if something needs to be resent or not.
 * Connections that are CLOSED and marked reapable are freed along the way,
 * and connections whose connection timeout has expired are closed with
 * errno set to ETIMEDOUT.
 * The return value is the time from now until the earliest pending
 * connection or retransmit timeout, as a struct timeval. When no timer is
 * pending, one hour (3600 seconds) is returned.
 */
struct timeval utcp_timeout(struct utcp *utcp) {
        struct timeval now;
        gettimeofday(&now, NULL);
        // Upper bound for the next wakeup: one hour from now.
        struct timeval next = {now.tv_sec + 3600, now.tv_usec};

        for(int i = 0; i < utcp->nconnections; i++) {
                struct utcp_connection *c = utcp->connections[i];

                if(!c) {
                        continue;
                }

                // delete connections that have been utcp_close()d.
                if(c->state == CLOSED) {
                        if(c->reapable) {
                                debug("Reaping %p\n", c);
                                free_connection(c);
                                // Visit this index again on the next iteration.
                                // NOTE(review): presumably free_connection() removes the entry from
                                // utcp->connections and shifts the remaining ones down -- confirm.
                                i--;
                        }

                        continue;
                }

                // The connection timeout expired: close the connection and tell the application.
                if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &now, <)) {
                        errno = ETIMEDOUT;
                        c->state = CLOSED;

                        if(c->recv) {
                                c->recv(c, NULL, 0);
                        }

                        if(c->poll && !c->reapable) {
                                c->poll(c, 0);
                        }

                        continue;
                }

                if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &now, <)) {
                        debug("retransmit()\n");
                        retransmit(c);
                }

                if(c->poll) {
                        // Report free send buffer space to the application while it may still send.
                        if((c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
                                uint32_t len =  buffer_free(&c->sndbuf);

                                if(len) {
                                        c->poll(c, len);
                                }
                        } else if(c->state == CLOSED) {
                                // NOTE(review): only reachable if retransmit() above closed the connection -- confirm.
                                c->poll(c, 0);
                        }
                }

                // Track the earliest pending timer across all connections.
                if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &next, <)) {
                        next = c->conn_timeout;
                }

                if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &next, <)) {
                        next = c->rtrx_timeout;
                }
        }

        struct timeval diff;

        timersub(&next, &now, &diff);

        return diff;
}
1933
1934 bool utcp_is_active(struct utcp *utcp) {
1935         if(!utcp) {
1936                 return false;
1937         }
1938
1939         for(int i = 0; i < utcp->nconnections; i++)
1940                 if(utcp->connections[i]->state != CLOSED && utcp->connections[i]->state != TIME_WAIT) {
1941                         return true;
1942                 }
1943
1944         return false;
1945 }
1946
1947 struct utcp *utcp_init(utcp_accept_t accept, utcp_pre_accept_t pre_accept, utcp_send_t send, void *priv) {
1948         if(!send) {
1949                 errno = EFAULT;
1950                 return NULL;
1951         }
1952
1953         struct utcp *utcp = calloc(1, sizeof(*utcp));
1954
1955         if(!utcp) {
1956                 return NULL;
1957         }
1958
1959         utcp->accept = accept;
1960         utcp->pre_accept = pre_accept;
1961         utcp->send = send;
1962         utcp->priv = priv;
1963         utcp->mtu = DEFAULT_MTU;
1964         utcp->timeout = DEFAULT_USER_TIMEOUT; // sec
1965         utcp->rto = START_RTO; // usec
1966
1967         return utcp;
1968 }
1969
1970 void utcp_exit(struct utcp *utcp) {
1971         if(!utcp) {
1972                 return;
1973         }
1974
1975         for(int i = 0; i < utcp->nconnections; i++) {
1976                 struct utcp_connection *c = utcp->connections[i];
1977
1978                 if(!c->reapable) {
1979                         if(c->recv) {
1980                                 c->recv(c, NULL, 0);
1981                         }
1982
1983                         if(c->poll && !c->reapable) {
1984                                 c->poll(c, 0);
1985                         }
1986                 }
1987
1988                 buffer_exit(&c->rcvbuf);
1989                 buffer_exit(&c->sndbuf);
1990                 free(c);
1991         }
1992
1993         free(utcp->connections);
1994         free(utcp);
1995 }
1996
1997 uint16_t utcp_get_mtu(struct utcp *utcp) {
1998         return utcp ? utcp->mtu : 0;
1999 }
2000
2001 void utcp_set_mtu(struct utcp *utcp, uint16_t mtu) {
2002         // TODO: handle overhead of the header
2003         if(utcp) {
2004                 utcp->mtu = mtu;
2005         }
2006 }
2007
2008 void utcp_reset_timers(struct utcp *utcp) {
2009         if(!utcp) {
2010                 return;
2011         }
2012
2013         struct timeval now, then;
2014
2015         gettimeofday(&now, NULL);
2016
2017         then = now;
2018
2019         then.tv_sec += utcp->timeout;
2020
2021         for(int i = 0; i < utcp->nconnections; i++) {
2022                 struct utcp_connection *c = utcp->connections[i];
2023
2024                 if(c->reapable) {
2025                         continue;
2026                 }
2027
2028                 if(timerisset(&c->rtrx_timeout)) {
2029                         c->rtrx_timeout = now;
2030                 }
2031
2032                 if(timerisset(&c->conn_timeout)) {
2033                         c->conn_timeout = then;
2034                 }
2035
2036                 c->rtt_start.tv_sec = 0;
2037         }
2038
2039         if(utcp->rto > START_RTO) {
2040                 utcp->rto = START_RTO;
2041         }
2042 }
2043
2044 int utcp_get_user_timeout(struct utcp *u) {
2045         return u ? u->timeout : 0;
2046 }
2047
2048 void utcp_set_user_timeout(struct utcp *u, int timeout) {
2049         if(u) {
2050                 u->timeout = timeout;
2051         }
2052 }
2053
2054 size_t utcp_get_sndbuf(struct utcp_connection *c) {
2055         return c ? c->sndbuf.maxsize : 0;
2056 }
2057
2058 size_t utcp_get_sndbuf_free(struct utcp_connection *c) {
2059         if(!c) {
2060                 return 0;
2061         }
2062
2063         switch(c->state) {
2064         case SYN_SENT:
2065         case SYN_RECEIVED:
2066         case ESTABLISHED:
2067         case CLOSE_WAIT:
2068                 return buffer_free(&c->sndbuf);
2069
2070         default:
2071                 return 0;
2072         }
2073 }
2074
2075 void utcp_set_sndbuf(struct utcp_connection *c, size_t size) {
2076         if(!c) {
2077                 return;
2078         }
2079
2080         c->sndbuf.maxsize = size;
2081
2082         if(c->sndbuf.maxsize != size) {
2083                 c->sndbuf.maxsize = -1;
2084         }
2085 }
2086
2087 size_t utcp_get_rcvbuf(struct utcp_connection *c) {
2088         return c ? c->rcvbuf.maxsize : 0;
2089 }
2090
2091 size_t utcp_get_rcvbuf_free(struct utcp_connection *c) {
2092         if(c && (c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
2093                 return buffer_free(&c->rcvbuf);
2094         } else {
2095                 return 0;
2096         }
2097 }
2098
2099 void utcp_set_rcvbuf(struct utcp_connection *c, size_t size) {
2100         if(!c) {
2101                 return;
2102         }
2103
2104         c->rcvbuf.maxsize = size;
2105
2106         if(c->rcvbuf.maxsize != size) {
2107                 c->rcvbuf.maxsize = -1;
2108         }
2109 }
2110
2111 size_t utcp_get_sendq(struct utcp_connection *c) {
2112         return c->sndbuf.used;
2113 }
2114
2115 size_t utcp_get_recvq(struct utcp_connection *c) {
2116         return c->rcvbuf.used;
2117 }
2118
2119 bool utcp_get_nodelay(struct utcp_connection *c) {
2120         return c ? c->nodelay : false;
2121 }
2122
2123 void utcp_set_nodelay(struct utcp_connection *c, bool nodelay) {
2124         if(c) {
2125                 c->nodelay = nodelay;
2126         }
2127 }
2128
2129 bool utcp_get_keepalive(struct utcp_connection *c) {
2130         return c ? c->keepalive : false;
2131 }
2132
2133 void utcp_set_keepalive(struct utcp_connection *c, bool keepalive) {
2134         if(c) {
2135                 c->keepalive = keepalive;
2136         }
2137 }
2138
2139 size_t utcp_get_outq(struct utcp_connection *c) {
2140         return c ? seqdiff(c->snd.nxt, c->snd.una) : 0;
2141 }
2142
2143 void utcp_set_recv_cb(struct utcp_connection *c, utcp_recv_t recv) {
2144         if(c) {
2145                 c->recv = recv;
2146         }
2147 }
2148
2149 void utcp_set_poll_cb(struct utcp_connection *c, utcp_poll_t poll) {
2150         if(c) {
2151                 c->poll = poll;
2152         }
2153 }
2154
2155 void utcp_set_accept_cb(struct utcp *utcp, utcp_accept_t accept, utcp_pre_accept_t pre_accept) {
2156         if(utcp) {
2157                 utcp->accept = accept;
2158                 utcp->pre_accept = pre_accept;
2159         }
2160 }
2161
2162 void utcp_expect_data(struct utcp_connection *c, bool expect) {
2163         if(!c || c->reapable) {
2164                 return;
2165         }
2166
2167         if(!(c->state == ESTABLISHED || c->state == FIN_WAIT_1 || c->state == FIN_WAIT_2)) {
2168                 return;
2169         }
2170
2171         if(expect) {
2172                 // If we expect data, start the connection timer.
2173                 if(!timerisset(&c->conn_timeout)) {
2174                         gettimeofday(&c->conn_timeout, NULL);
2175                         c->conn_timeout.tv_sec += c->utcp->timeout;
2176                 }
2177         } else {
2178                 // If we want to cancel expecting data, only clear the timer when there is no unACKed data.
2179                 if(c->snd.una == c->snd.last) {
2180                         timerclear(&c->conn_timeout);
2181                 }
2182         }
2183 }
2184
2185 void utcp_offline(struct utcp *utcp, bool offline) {
2186         struct timeval now;
2187         gettimeofday(&now, NULL);
2188
2189         for(int i = 0; i < utcp->nconnections; i++) {
2190                 struct utcp_connection *c = utcp->connections[i];
2191
2192                 if(c->reapable) {
2193                         continue;
2194                 }
2195
2196                 utcp_expect_data(c, offline);
2197
2198                 if(!offline) {
2199                         if(timerisset(&c->rtrx_timeout)) {
2200                                 c->rtrx_timeout = now;
2201                         }
2202
2203                         utcp->connections[i]->rtt_start.tv_sec = 0;
2204                 }
2205         }
2206
2207         if(!offline && utcp->rto > START_RTO) {
2208                 utcp->rto = START_RTO;
2209         }
2210 }