]> git.meshlink.io Git - utcp/blob - utcp.c
Always announce the receive window size as the size of the receive buffer.
[utcp] / utcp.c
1 /*
2     utcp.c -- Userspace TCP
3     Copyright (C) 2014-2017 Guus Sliepen <guus@tinc-vpn.org>
4
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14
15     You should have received a copy of the GNU General Public License along
16     with this program; if not, write to the Free Software Foundation, Inc.,
17     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20 #define _GNU_SOURCE
21
22 #include <assert.h>
23 #include <errno.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <sys/time.h>
31 #include <sys/socket.h>
32
33 #include "utcp_priv.h"
34
35 #ifndef EBADMSG
36 #define EBADMSG         104
37 #endif
38
39 #ifndef SHUT_RDWR
40 #define SHUT_RDWR 2
41 #endif
42
43 #ifdef poll
44 #undef poll
45 #endif
46
// Fallback for platforms whose <sys/time.h> does not provide timersub():
// stores a - b in r, borrowing one second when the microsecond part goes negative.
#ifndef timersub
#define timersub(a, b, r)\
	do {\
		(r)->tv_sec = (a)->tv_sec - (b)->tv_sec;\
		(r)->tv_usec = (a)->tv_usec - (b)->tv_usec;\
		if((r)->tv_usec < 0) {\
			(r)->tv_sec--;\
			(r)->tv_usec += USEC_PER_SEC;\
		}\
	} while (0)
#endif
56
// Return the larger of two sizes.
static inline size_t max(size_t a, size_t b) {
	if(a > b) {
		return a;
	}

	return b;
}
60
#ifdef UTCP_DEBUG
#include <stdarg.h>

// Write a printf-style formatted message to stderr.
static void debug(const char *format, ...) {
	va_list args;
	va_start(args, format);
	vfprintf(stderr, format, args);
	va_end(args);
}

// Log one packet: the header fields, the names of the control flags that are
// set, and (when present) the bytes following the header as uppercase hex.
// "dir" is a caller-supplied label that is printed verbatim.
static void print_packet(struct utcp *utcp, const char *dir, const void *pkt, size_t len) {
	static const char hexdigits[] = "0123456789ABCDEF";
	struct hdr hdr;

	if(len < sizeof(hdr)) {
		debug("%p %s: short packet (%lu bytes)\n", utcp, dir, (unsigned long)len);
		return;
	}

	// Copy the header out first; pkt is not necessarily aligned for struct hdr.
	memcpy(&hdr, pkt, sizeof(hdr));
	debug("%p %s: len=%lu, src=%u dst=%u seq=%u ack=%u wnd=%u aux=%x ctl=", utcp, dir, (unsigned long)len, hdr.src, hdr.dst, hdr.seq, hdr.ack, hdr.wnd, hdr.aux);

	if(hdr.ctl & SYN) {
		debug("SYN");
	}

	if(hdr.ctl & RST) {
		debug("RST");
	}

	if(hdr.ctl & FIN) {
		debug("FIN");
	}

	if(hdr.ctl & ACK) {
		debug("ACK");
	}

	if(len > sizeof(hdr)) {
		uint32_t payload_len = len - sizeof(hdr);
		const uint8_t *payload = (uint8_t *)pkt + sizeof(hdr);
		char hex[payload_len * 2 + 1]; // two hex digits per byte plus the terminator

		for(uint32_t i = 0; i < payload_len; i++) {
			hex[2 * i] = hexdigits[payload[i] >> 4];
			hex[2 * i + 1] = hexdigits[payload[i] & 15];
		}

		hex[2 * payload_len] = 0;

		debug(" data=%s", hex);
	}

	debug("\n");
}
#else
#define debug(...) do {} while(0)
#define print_packet(...) do {} while(0)
#endif
120
121 static void set_state(struct utcp_connection *c, enum state state) {
122         c->state = state;
123
124         if(state == ESTABLISHED) {
125                 timerclear(&c->conn_timeout);
126         }
127
128         debug("%p new state: %s\n", c->utcp, strstate[state]);
129 }
130
131 static bool fin_wanted(struct utcp_connection *c, uint32_t seq) {
132         if(seq != c->snd.last) {
133                 return false;
134         }
135
136         switch(c->state) {
137         case FIN_WAIT_1:
138         case CLOSING:
139         case LAST_ACK:
140                 return true;
141
142         default:
143                 return false;
144         }
145 }
146
147 static bool is_reliable(struct utcp_connection *c) {
148         return c->flags & UTCP_RELIABLE;
149 }
150
// Signed distance from sequence number b to a, computed modulo 2^32 so that
// it stays meaningful across sequence number wrap-around.
static int32_t seqdiff(uint32_t a, uint32_t b) {
	uint32_t delta = a - b;
	return (int32_t)delta;
}
154
155 // Buffer functions
156 // TODO: convert to ringbuffers to avoid memmove() operations.
157
158 // Store data into the buffer
159 static ssize_t buffer_put_at(struct buffer *buf, size_t offset, const void *data, size_t len) {
160         debug("buffer_put_at %lu %lu %lu\n", (unsigned long)buf->used, (unsigned long)offset, (unsigned long)len);
161
162         size_t required = offset + len;
163
164         if(required > buf->maxsize) {
165                 if(offset >= buf->maxsize) {
166                         return 0;
167                 }
168
169                 len = buf->maxsize - offset;
170                 required = buf->maxsize;
171         }
172
173         if(required > buf->size) {
174                 size_t newsize = buf->size;
175
176                 if(!newsize) {
177                         newsize = required;
178                 } else {
179                         do {
180                                 newsize *= 2;
181                         } while(newsize < required);
182                 }
183
184                 if(newsize > buf->maxsize) {
185                         newsize = buf->maxsize;
186                 }
187
188                 char *newdata = realloc(buf->data, newsize);
189
190                 if(!newdata) {
191                         return -1;
192                 }
193
194                 buf->data = newdata;
195                 buf->size = newsize;
196         }
197
198         memcpy(buf->data + offset, data, len);
199
200         if(required > buf->used) {
201                 buf->used = required;
202         }
203
204         return len;
205 }
206
207 static ssize_t buffer_put(struct buffer *buf, const void *data, size_t len) {
208         return buffer_put_at(buf, buf->used, data, len);
209 }
210
211 // Get data from the buffer. data can be NULL.
212 static ssize_t buffer_get(struct buffer *buf, void *data, size_t len) {
213         if(len > buf->used) {
214                 len = buf->used;
215         }
216
217         if(data) {
218                 memcpy(data, buf->data, len);
219         }
220
221         if(len < buf->used) {
222                 memmove(buf->data, buf->data + len, buf->used - len);
223         }
224
225         buf->used -= len;
226         return len;
227 }
228
229 // Copy data from the buffer without removing it.
230 static ssize_t buffer_copy(struct buffer *buf, void *data, size_t offset, size_t len) {
231         if(offset >= buf->used) {
232                 return 0;
233         }
234
235         if(offset + len > buf->used) {
236                 len = buf->used - offset;
237         }
238
239         memcpy(data, buf->data + offset, len);
240         return len;
241 }
242
243 static bool buffer_init(struct buffer *buf, uint32_t len, uint32_t maxlen) {
244         memset(buf, 0, sizeof(*buf));
245
246         if(len) {
247                 buf->data = malloc(len);
248
249                 if(!buf->data) {
250                         return false;
251                 }
252         }
253
254         buf->size = len;
255         buf->maxsize = maxlen;
256         return true;
257 }
258
259 static void buffer_exit(struct buffer *buf) {
260         free(buf->data);
261         memset(buf, 0, sizeof(*buf));
262 }
263
264 static uint32_t buffer_free(const struct buffer *buf) {
265         return buf->maxsize - buf->used;
266 }
267
268 // Connections are stored in a sorted list.
269 // This gives O(log(N)) lookup time, O(N log(N)) insertion time and O(N) deletion time.
270
271 static int compare(const void *va, const void *vb) {
272         assert(va && vb);
273
274         const struct utcp_connection *a = *(struct utcp_connection **)va;
275         const struct utcp_connection *b = *(struct utcp_connection **)vb;
276
277         assert(a && b);
278         assert(a->src && b->src);
279
280         int c = (int)a->src - (int)b->src;
281
282         if(c) {
283                 return c;
284         }
285
286         c = (int)a->dst - (int)b->dst;
287         return c;
288 }
289
290 static struct utcp_connection *find_connection(const struct utcp *utcp, uint16_t src, uint16_t dst) {
291         if(!utcp->nconnections) {
292                 return NULL;
293         }
294
295         struct utcp_connection key = {
296                 .src = src,
297                 .dst = dst,
298         }, *keyp = &key;
299         struct utcp_connection **match = bsearch(&keyp, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
300         return match ? *match : NULL;
301 }
302
303 static void free_connection(struct utcp_connection *c) {
304         struct utcp *utcp = c->utcp;
305         struct utcp_connection **cp = bsearch(&c, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
306
307         assert(cp);
308
309         int i = cp - utcp->connections;
310         memmove(cp, cp + 1, (utcp->nconnections - i - 1) * sizeof(*cp));
311         utcp->nconnections--;
312
313         buffer_exit(&c->rcvbuf);
314         buffer_exit(&c->sndbuf);
315         free(c);
316 }
317
318 static struct utcp_connection *allocate_connection(struct utcp *utcp, uint16_t src, uint16_t dst) {
319         // Check whether this combination of src and dst is free
320
321         if(src) {
322                 if(find_connection(utcp, src, dst)) {
323                         errno = EADDRINUSE;
324                         return NULL;
325                 }
326         } else { // If src == 0, generate a random port number with the high bit set
327                 if(utcp->nconnections >= 32767) {
328                         errno = ENOMEM;
329                         return NULL;
330                 }
331
332                 src = rand() | 0x8000;
333
334                 while(find_connection(utcp, src, dst)) {
335                         src++;
336                 }
337         }
338
339         // Allocate memory for the new connection
340
341         if(utcp->nconnections >= utcp->nallocated) {
342                 if(!utcp->nallocated) {
343                         utcp->nallocated = 4;
344                 } else {
345                         utcp->nallocated *= 2;
346                 }
347
348                 struct utcp_connection **new_array = realloc(utcp->connections, utcp->nallocated * sizeof(*utcp->connections));
349
350                 if(!new_array) {
351                         return NULL;
352                 }
353
354                 utcp->connections = new_array;
355         }
356
357         struct utcp_connection *c = calloc(1, sizeof(*c));
358
359         if(!c) {
360                 return NULL;
361         }
362
363         if(!buffer_init(&c->sndbuf, DEFAULT_SNDBUFSIZE, DEFAULT_MAXSNDBUFSIZE)) {
364                 free(c);
365                 return NULL;
366         }
367
368         if(!buffer_init(&c->rcvbuf, DEFAULT_RCVBUFSIZE, DEFAULT_MAXRCVBUFSIZE)) {
369                 buffer_exit(&c->sndbuf);
370                 free(c);
371                 return NULL;
372         }
373
374         // Fill in the details
375
376         c->src = src;
377         c->dst = dst;
378 #ifdef UTCP_DEBUG
379         c->snd.iss = 0;
380 #else
381         c->snd.iss = rand();
382 #endif
383         c->snd.una = c->snd.iss;
384         c->snd.nxt = c->snd.iss + 1;
385         c->snd.last = c->snd.nxt;
386         c->snd.cwnd = utcp->mtu;
387         c->utcp = utcp;
388
389         // Add it to the sorted list of connections
390
391         utcp->connections[utcp->nconnections++] = c;
392         qsort(utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
393
394         return c;
395 }
396
// Absolute difference between two unsigned values.
static inline uint32_t absdiff(uint32_t a, uint32_t b) {
	return a > b ? a - b : b - a;
}
404
405 // Update RTT variables. See RFC 6298.
406 static void update_rtt(struct utcp_connection *c, uint32_t rtt) {
407         if(!rtt) {
408                 debug("invalid rtt\n");
409                 return;
410         }
411
412         struct utcp *utcp = c->utcp;
413
414         if(!utcp->srtt) {
415                 utcp->srtt = rtt;
416                 utcp->rttvar = rtt / 2;
417         } else {
418                 utcp->rttvar = (utcp->rttvar * 3 + absdiff(utcp->srtt, rtt)) / 4;
419                 utcp->srtt = (utcp->srtt * 7 + rtt) / 8;
420         }
421
422         utcp->rto = utcp->srtt + max(4 * utcp->rttvar, CLOCK_GRANULARITY);
423
424         if(utcp->rto > MAX_RTO) {
425                 utcp->rto = MAX_RTO;
426         }
427
428         debug("rtt %u srtt %u rttvar %u rto %u\n", rtt, utcp->srtt, utcp->rttvar, utcp->rto);
429 }
430
431 static void start_retransmit_timer(struct utcp_connection *c) {
432         gettimeofday(&c->rtrx_timeout, NULL);
433         c->rtrx_timeout.tv_usec += c->utcp->rto;
434
435         while(c->rtrx_timeout.tv_usec >= 1000000) {
436                 c->rtrx_timeout.tv_usec -= 1000000;
437                 c->rtrx_timeout.tv_sec++;
438         }
439
440         debug("timeout set to %lu.%06lu (%u)\n", c->rtrx_timeout.tv_sec, c->rtrx_timeout.tv_usec, c->utcp->rto);
441 }
442
443 static void stop_retransmit_timer(struct utcp_connection *c) {
444         timerclear(&c->rtrx_timeout);
445         debug("timeout cleared\n");
446 }
447
448 struct utcp_connection *utcp_connect_ex(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv, uint32_t flags) {
449         struct utcp_connection *c = allocate_connection(utcp, 0, dst);
450
451         if(!c) {
452                 return NULL;
453         }
454
455         assert((flags & ~0x1f) == 0);
456
457         c->flags = flags;
458         c->recv = recv;
459         c->priv = priv;
460
461         struct {
462                 struct hdr hdr;
463                 uint8_t init[4];
464         } pkt;
465
466         pkt.hdr.src = c->src;
467         pkt.hdr.dst = c->dst;
468         pkt.hdr.seq = c->snd.iss;
469         pkt.hdr.ack = 0;
470         pkt.hdr.wnd = c->rcvbuf.maxsize;
471         pkt.hdr.ctl = SYN;
472         pkt.hdr.aux = 0x0101;
473         pkt.init[0] = 1;
474         pkt.init[1] = 0;
475         pkt.init[2] = 0;
476         pkt.init[3] = flags & 0x7;
477
478         set_state(c, SYN_SENT);
479
480         print_packet(utcp, "send", &pkt, sizeof(pkt));
481         utcp->send(utcp, &pkt, sizeof(pkt));
482
483         gettimeofday(&c->conn_timeout, NULL);
484         c->conn_timeout.tv_sec += utcp->timeout;
485
486         start_retransmit_timer(c);
487
488         return c;
489 }
490
491 struct utcp_connection *utcp_connect(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv) {
492         return utcp_connect_ex(utcp, dst, recv, priv, UTCP_TCP);
493 }
494
495 void utcp_accept(struct utcp_connection *c, utcp_recv_t recv, void *priv) {
496         if(c->reapable || c->state != SYN_RECEIVED) {
497                 debug("Error: accept() called on invalid connection %p in state %s\n", c, strstate[c->state]);
498                 return;
499         }
500
501         debug("%p accepted, %p %p\n", c, recv, priv);
502         c->recv = recv;
503         c->priv = priv;
504         set_state(c, ESTABLISHED);
505 }
506
507 static void ack(struct utcp_connection *c, bool sendatleastone) {
508         int32_t left = seqdiff(c->snd.last, c->snd.nxt);
509         int32_t cwndleft = c->snd.cwnd - seqdiff(c->snd.nxt, c->snd.una);
510         debug("cwndleft = %d\n", cwndleft);
511
512         assert(left >= 0);
513
514         if(cwndleft <= 0) {
515                 cwndleft = 0;
516         }
517
518         if(cwndleft < left) {
519                 left = cwndleft;
520         }
521
522         if(!left && !sendatleastone) {
523                 return;
524         }
525
526         struct {
527                 struct hdr hdr;
528                 uint8_t data[];
529         } *pkt;
530
531         pkt = malloc(sizeof(pkt->hdr) + c->utcp->mtu);
532
533         if(!pkt) {
534                 return;
535         }
536
537         pkt->hdr.src = c->src;
538         pkt->hdr.dst = c->dst;
539         pkt->hdr.ack = c->rcv.nxt;
540         pkt->hdr.wnd = c->rcvbuf.maxsize;
541         pkt->hdr.ctl = ACK;
542         pkt->hdr.aux = 0;
543
544         do {
545                 uint32_t seglen = left > c->utcp->mtu ? c->utcp->mtu : left;
546                 pkt->hdr.seq = c->snd.nxt;
547
548                 buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);
549
550                 c->snd.nxt += seglen;
551                 left -= seglen;
552
553                 if(seglen && fin_wanted(c, c->snd.nxt)) {
554                         seglen--;
555                         pkt->hdr.ctl |= FIN;
556                 }
557
558                 if(!c->rtt_start.tv_sec) {
559                         // Start RTT measurement
560                         gettimeofday(&c->rtt_start, NULL);
561                         c->rtt_seq = pkt->hdr.seq + seglen;
562                         debug("Starting RTT measurement, expecting ack %u\n", c->rtt_seq);
563                 }
564
565                 print_packet(c->utcp, "send", pkt, sizeof(pkt->hdr) + seglen);
566                 c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
567         } while(left);
568
569         free(pkt);
570 }
571
572 ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
573         if(c->reapable) {
574                 debug("Error: send() called on closed connection %p\n", c);
575                 errno = EBADF;
576                 return -1;
577         }
578
579         switch(c->state) {
580         case CLOSED:
581         case LISTEN:
582                 debug("Error: send() called on unconnected connection %p\n", c);
583                 errno = ENOTCONN;
584                 return -1;
585
586         case SYN_SENT:
587         case SYN_RECEIVED:
588         case ESTABLISHED:
589         case CLOSE_WAIT:
590                 break;
591
592         case FIN_WAIT_1:
593         case FIN_WAIT_2:
594         case CLOSING:
595         case LAST_ACK:
596         case TIME_WAIT:
597                 debug("Error: send() called on closing connection %p\n", c);
598                 errno = EPIPE;
599                 return -1;
600         }
601
602         // Exit early if we have nothing to send.
603
604         if(!len) {
605                 return 0;
606         }
607
608         if(!data) {
609                 errno = EFAULT;
610                 return -1;
611         }
612
613         // Check if we need to be able to buffer all data
614
615         if(c->flags & UTCP_NO_PARTIAL) {
616                 if(len > buffer_free(&c->sndbuf)) {
617                         if(len > c->sndbuf.maxsize) {
618                                 errno = EMSGSIZE;
619                                 return -1;
620                         } else {
621                                 errno = EWOULDBLOCK;
622                                 return 0;
623                         }
624                 }
625         }
626
627         // Add data to send buffer.
628
629         if(is_reliable(c) || (c->state != SYN_SENT && c->state != SYN_RECEIVED)) {
630                 len = buffer_put(&c->sndbuf, data, len);
631         } else {
632                 return 0;
633         }
634
635         if(len <= 0) {
636                 if(is_reliable(c)) {
637                         errno = EWOULDBLOCK;
638                         return 0;
639                 } else {
640                         return len;
641                 }
642         }
643
644         c->snd.last += len;
645
646         // Don't send anything yet if the connection has not fully established yet
647
648         if(c->state == SYN_SENT || c->state == SYN_RECEIVED) {
649                 return len;
650         }
651
652         ack(c, false);
653
654         if(!is_reliable(c)) {
655                 c->snd.una = c->snd.nxt = c->snd.last;
656                 buffer_get(&c->sndbuf, NULL, c->sndbuf.used);
657         }
658
659         if(is_reliable(c) && !timerisset(&c->rtrx_timeout)) {
660                 start_retransmit_timer(c);
661         }
662
663         if(is_reliable(c) && !timerisset(&c->conn_timeout)) {
664                 gettimeofday(&c->conn_timeout, NULL);
665                 c->conn_timeout.tv_sec += c->utcp->timeout;
666         }
667
668         return len;
669 }
670
671 static void swap_ports(struct hdr *hdr) {
672         uint16_t tmp = hdr->src;
673         hdr->src = hdr->dst;
674         hdr->dst = tmp;
675 }
676
677 static void retransmit(struct utcp_connection *c) {
678         if(c->state == CLOSED || c->snd.last == c->snd.una) {
679                 debug("Retransmit() called but nothing to retransmit!\n");
680                 stop_retransmit_timer(c);
681                 return;
682         }
683
684         struct utcp *utcp = c->utcp;
685
686         struct {
687                 struct hdr hdr;
688                 uint8_t data[];
689         } *pkt;
690
691         pkt = malloc(sizeof(pkt->hdr) + c->utcp->mtu);
692
693         if(!pkt) {
694                 return;
695         }
696
697         pkt->hdr.src = c->src;
698         pkt->hdr.dst = c->dst;
699         pkt->hdr.wnd = c->rcvbuf.maxsize;
700         pkt->hdr.aux = 0;
701
702         switch(c->state) {
703         case SYN_SENT:
704                 // Send our SYN again
705                 pkt->hdr.seq = c->snd.iss;
706                 pkt->hdr.ack = 0;
707                 pkt->hdr.ctl = SYN;
708                 pkt->hdr.aux = 0x0101;
709                 pkt->data[0] = 1;
710                 pkt->data[1] = 0;
711                 pkt->data[2] = 0;
712                 pkt->data[3] = c->flags & 0x7;
713                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr) + 4);
714                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + 4);
715                 break;
716
717         case SYN_RECEIVED:
718                 // Send SYNACK again
719                 pkt->hdr.seq = c->snd.nxt;
720                 pkt->hdr.ack = c->rcv.nxt;
721                 pkt->hdr.ctl = SYN | ACK;
722                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr));
723                 utcp->send(utcp, pkt, sizeof(pkt->hdr));
724                 break;
725
726         case ESTABLISHED:
727         case FIN_WAIT_1:
728         case CLOSE_WAIT:
729         case CLOSING:
730         case LAST_ACK:
731                 // Send unacked data again.
732                 pkt->hdr.seq = c->snd.una;
733                 pkt->hdr.ack = c->rcv.nxt;
734                 pkt->hdr.ctl = ACK;
735                 uint32_t len = seqdiff(c->snd.last, c->snd.una);
736
737                 if(len > utcp->mtu) {
738                         len = utcp->mtu;
739                 }
740
741                 if(fin_wanted(c, c->snd.una + len)) {
742                         len--;
743                         pkt->hdr.ctl |= FIN;
744                 }
745
746                 c->snd.nxt = c->snd.una + len;
747                 c->snd.cwnd = utcp->mtu; // reduce cwnd on retransmit
748                 buffer_copy(&c->sndbuf, pkt->data, 0, len);
749                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr) + len);
750                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
751                 break;
752
753         case CLOSED:
754         case LISTEN:
755         case TIME_WAIT:
756         case FIN_WAIT_2:
757                 // We shouldn't need to retransmit anything in this state.
758 #ifdef UTCP_DEBUG
759                 abort();
760 #endif
761                 stop_retransmit_timer(c);
762                 goto cleanup;
763         }
764
765         start_retransmit_timer(c);
766         utcp->rto *= 2;
767
768         if(utcp->rto > MAX_RTO) {
769                 utcp->rto = MAX_RTO;
770         }
771
772         c->rtt_start.tv_sec = 0; // invalidate RTT timer
773
774 cleanup:
775         free(pkt);
776 }
777
778 /* Update receive buffer and SACK entries after consuming data.
779  *
780  * Situation:
781  *
782  * |.....0000..1111111111.....22222......3333|
783  * |---------------^
784  *
785  * 0..3 represent the SACK entries. The ^ indicates up to which point we want
786  * to remove data from the receive buffer. The idea is to substract "len"
787  * from the offset of all the SACK entries, and then remove/cut down entries
788  * that are shifted to before the start of the receive buffer.
789  *
790  * There are three cases:
791  * - the SACK entry is after ^, in that case just change the offset.
792  * - the SACK entry starts before and ends after ^, so we have to
793  *   change both its offset and size.
794  * - the SACK entry is completely before ^, in that case delete it.
795  */
796 static void sack_consume(struct utcp_connection *c, size_t len) {
797         debug("sack_consume %lu\n", (unsigned long)len);
798
799         if(len > c->rcvbuf.used) {
800                 debug("All SACK entries consumed");
801                 c->sacks[0].len = 0;
802                 return;
803         }
804
805         buffer_get(&c->rcvbuf, NULL, len);
806
807         for(int i = 0; i < NSACKS && c->sacks[i].len;) {
808                 if(len < c->sacks[i].offset) {
809                         c->sacks[i].offset -= len;
810                         i++;
811                 } else if(len < c->sacks[i].offset + c->sacks[i].len) {
812                         c->sacks[i].len -= len - c->sacks[i].offset;
813                         c->sacks[i].offset = 0;
814                         i++;
815                 } else {
816                         if(i < NSACKS - 1) {
817                                 memmove(&c->sacks[i], &c->sacks[i + 1], (NSACKS - 1 - i) * sizeof(c->sacks)[i]);
818                                 c->sacks[NSACKS - 1].len = 0;
819                         } else {
820                                 c->sacks[i].len = 0;
821                                 break;
822                         }
823                 }
824         }
825
826         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
827                 debug("SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
828         }
829 }
830
831 static void handle_out_of_order(struct utcp_connection *c, uint32_t offset, const void *data, size_t len) {
832         debug("out of order packet, offset %u\n", offset);
833         // Packet loss or reordering occured. Store the data in the buffer.
834         ssize_t rxd = buffer_put_at(&c->rcvbuf, offset, data, len);
835
836         if(rxd < 0 || (size_t)rxd < len) {
837                 abort();
838         }
839
840         // Make note of where we put it.
841         for(int i = 0; i < NSACKS; i++) {
842                 if(!c->sacks[i].len) { // nothing to merge, add new entry
843                         debug("New SACK entry %d\n", i);
844                         c->sacks[i].offset = offset;
845                         c->sacks[i].len = rxd;
846                         break;
847                 } else if(offset < c->sacks[i].offset) {
848                         if(offset + rxd < c->sacks[i].offset) { // insert before
849                                 if(!c->sacks[NSACKS - 1].len) { // only if room left
850                                         debug("Insert SACK entry at %d\n", i);
851                                         memmove(&c->sacks[i + 1], &c->sacks[i], (NSACKS - i - 1) * sizeof(c->sacks)[i]);
852                                         c->sacks[i].offset = offset;
853                                         c->sacks[i].len = rxd;
854                                 } else {
855                                         debug("SACK entries full, dropping packet\n");
856                                 }
857
858                                 break;
859                         } else { // merge
860                                 debug("Merge with start of SACK entry at %d\n", i);
861                                 c->sacks[i].offset = offset;
862                                 break;
863                         }
864                 } else if(offset <= c->sacks[i].offset + c->sacks[i].len) {
865                         if(offset + rxd > c->sacks[i].offset + c->sacks[i].len) { // merge
866                                 debug("Merge with end of SACK entry at %d\n", i);
867                                 c->sacks[i].len = offset + rxd - c->sacks[i].offset;
868                                 // TODO: handle potential merge with next entry
869                         }
870
871                         break;
872                 }
873         }
874
875         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
876                 debug("SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
877         }
878 }
879
880 static void handle_in_order(struct utcp_connection *c, const void *data, size_t len) {
881         // Check if we can process out-of-order data now.
882         if(c->sacks[0].len && len >= c->sacks[0].offset) { // TODO: handle overlap with second SACK
883                 debug("incoming packet len %lu connected with SACK at %u\n", (unsigned long)len, c->sacks[0].offset);
884                 buffer_put_at(&c->rcvbuf, 0, data, len); // TODO: handle return value
885                 len = max(len, c->sacks[0].offset + c->sacks[0].len);
886                 data = c->rcvbuf.data;
887         }
888
889         if(c->recv) {
890                 ssize_t rxd = c->recv(c, data, len);
891
892                 if(rxd < 0 || (size_t)rxd != len) {
893                         // TODO: handle the application not accepting all data.
894                         abort();
895                 }
896         }
897
898         if(c->rcvbuf.used) {
899                 sack_consume(c, len);
900         }
901
902         c->rcv.nxt += len;
903 }
904
905
906 static void handle_incoming_data(struct utcp_connection *c, uint32_t seq, const void *data, size_t len) {
907         if(!is_reliable(c)) {
908                 c->recv(c, data, len);
909                 c->rcv.nxt = seq + len;
910                 return;
911         }
912
913         uint32_t offset = seqdiff(seq, c->rcv.nxt);
914
915         if(offset + len > c->rcvbuf.maxsize) {
916                 abort();
917         }
918
919         if(offset) {
920                 handle_out_of_order(c, offset, data, len);
921         } else {
922                 handle_in_order(c, data, len);
923         }
924 }
925
926
927 ssize_t utcp_recv(struct utcp *utcp, const void *data, size_t len) {
928         const uint8_t *ptr = data;
929
930         if(!utcp) {
931                 errno = EFAULT;
932                 return -1;
933         }
934
935         if(!len) {
936                 return 0;
937         }
938
939         if(!data) {
940                 errno = EFAULT;
941                 return -1;
942         }
943
944         print_packet(utcp, "recv", data, len);
945
946         // Drop packets smaller than the header
947
948         struct hdr hdr;
949
950         if(len < sizeof(hdr)) {
951                 errno = EBADMSG;
952                 return -1;
953         }
954
955         // Make a copy from the potentially unaligned data to a struct hdr
956
957         memcpy(&hdr, ptr, sizeof(hdr));
958         ptr += sizeof(hdr);
959         len -= sizeof(hdr);
960
961         // Drop packets with an unknown CTL flag
962
963         if(hdr.ctl & ~(SYN | ACK | RST | FIN)) {
964                 errno = EBADMSG;
965                 return -1;
966         }
967
968         // Check for auxiliary headers
969
970         const uint8_t *init = NULL;
971
972         uint16_t aux = hdr.aux;
973
974         while(aux) {
975                 size_t auxlen = 4 * (aux >> 8) & 0xf;
976                 uint8_t auxtype = aux & 0xff;
977
978                 if(len < auxlen) {
979                         errno = EBADMSG;
980                         return -1;
981                 }
982
983                 switch(auxtype) {
984                 case AUX_INIT:
985                         if(!(hdr.ctl & SYN) || auxlen != 4) {
986                                 errno = EBADMSG;
987                                 return -1;
988                         }
989
990                         init = ptr;
991                         break;
992
993                 default:
994                         errno = EBADMSG;
995                         return -1;
996                 }
997
998                 len -= auxlen;
999                 ptr += auxlen;
1000
1001                 if(!(aux & 0x800)) {
1002                         break;
1003                 }
1004
1005                 if(len < 2) {
1006                         errno = EBADMSG;
1007                         return -1;
1008                 }
1009
1010                 memcpy(&aux, ptr, 2);
1011                 len -= 2;
1012                 ptr += 2;
1013         }
1014
1015         bool has_data = len || (hdr.ctl & (SYN | FIN));
1016
1017         // Try to match the packet to an existing connection
1018
1019         struct utcp_connection *c = find_connection(utcp, hdr.dst, hdr.src);
1020
1021         // Is it for a new connection?
1022
1023         if(!c) {
1024                 // Ignore RST packets
1025
1026                 if(hdr.ctl & RST) {
1027                         return 0;
1028                 }
1029
1030                 // Is it a SYN packet and are we LISTENing?
1031
1032                 if(hdr.ctl & SYN && !(hdr.ctl & ACK) && utcp->accept) {
1033                         // If we don't want to accept it, send a RST back
1034                         if((utcp->pre_accept && !utcp->pre_accept(utcp, hdr.dst))) {
1035                                 len = 1;
1036                                 goto reset;
1037                         }
1038
1039                         // Try to allocate memory, otherwise send a RST back
1040                         c = allocate_connection(utcp, hdr.dst, hdr.src);
1041
1042                         if(!c) {
1043                                 len = 1;
1044                                 goto reset;
1045                         }
1046
1047                         // Parse auxilliary information
1048                         if(init) {
1049                                 if(init[0] < 1) {
1050                                         len = 1;
1051                                         goto reset;
1052                                 }
1053
1054                                 c->flags = init[3] & 0x7;
1055                         } else {
1056                                 c->flags = UTCP_TCP;
1057                         }
1058
1059 synack:
1060                         // Return SYN+ACK, go to SYN_RECEIVED state
1061                         c->snd.wnd = hdr.wnd;
1062                         c->rcv.irs = hdr.seq;
1063                         c->rcv.nxt = c->rcv.irs + 1;
1064                         set_state(c, SYN_RECEIVED);
1065
1066                         struct {
1067                                 struct hdr hdr;
1068                                 uint8_t data[4];
1069                         } pkt;
1070
1071                         pkt.hdr.src = c->src;
1072                         pkt.hdr.dst = c->dst;
1073                         pkt.hdr.ack = c->rcv.irs + 1;
1074                         pkt.hdr.seq = c->snd.iss;
1075                         pkt.hdr.wnd = c->rcvbuf.maxsize;
1076                         pkt.hdr.ctl = SYN | ACK;
1077
1078                         if(init) {
1079                                 pkt.hdr.aux = 0x0101;
1080                                 pkt.data[0] = 1;
1081                                 pkt.data[1] = 0;
1082                                 pkt.data[2] = 0;
1083                                 pkt.data[3] = c->flags & 0x7;
1084                                 print_packet(c->utcp, "send", &pkt, sizeof(hdr) + 4);
1085                                 utcp->send(utcp, &pkt, sizeof(hdr) + 4);
1086                         } else {
1087                                 pkt.hdr.aux = 0;
1088                                 print_packet(c->utcp, "send", &pkt, sizeof(hdr));
1089                                 utcp->send(utcp, &pkt, sizeof(hdr));
1090                         }
1091                 } else {
1092                         // No, we don't want your packets, send a RST back
1093                         len = 1;
1094                         goto reset;
1095                 }
1096
1097                 return 0;
1098         }
1099
1100         debug("%p state %s\n", c->utcp, strstate[c->state]);
1101
1102         // In case this is for a CLOSED connection, ignore the packet.
1103         // TODO: make it so incoming packets can never match a CLOSED connection.
1104
1105         if(c->state == CLOSED) {
1106                 debug("Got packet for closed connection\n");
1107                 return 0;
1108         }
1109
1110         // It is for an existing connection.
1111
1112         // 1. Drop invalid packets.
1113
1114         // 1a. Drop packets that should not happen in our current state.
1115
1116         switch(c->state) {
1117         case SYN_SENT:
1118         case SYN_RECEIVED:
1119         case ESTABLISHED:
1120         case FIN_WAIT_1:
1121         case FIN_WAIT_2:
1122         case CLOSE_WAIT:
1123         case CLOSING:
1124         case LAST_ACK:
1125         case TIME_WAIT:
1126                 break;
1127
1128         default:
1129 #ifdef UTCP_DEBUG
1130                 abort();
1131 #endif
1132                 break;
1133         }
1134
1135         // 1b. Discard data that is not in our receive window.
1136
1137         if(is_reliable(c)) {
1138                 bool acceptable;
1139
1140                 if(c->state == SYN_SENT) {
1141                         acceptable = true;
1142                 } else if(len == 0) {
1143                         acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0;
1144                 } else {
1145                         int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);
1146
1147                         // cut already accepted front overlapping
1148                         if(rcv_offset < 0) {
1149                                 acceptable = len > (size_t) - rcv_offset;
1150
1151                                 if(acceptable) {
1152                                         ptr -= rcv_offset;
1153                                         len += rcv_offset;
1154                                         hdr.seq -= rcv_offset;
1155                                 }
1156                         } else {
1157                                 acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0 && seqdiff(hdr.seq, c->rcv.nxt) + len <= c->rcvbuf.maxsize;
1158                         }
1159                 }
1160
1161                 if(!acceptable) {
1162                         debug("Packet not acceptable, %u <= %u + %lu < %u\n", c->rcv.nxt, hdr.seq, (unsigned long)len, c->rcv.nxt + c->rcvbuf.maxsize);
1163
1164                         // Ignore unacceptable RST packets.
1165                         if(hdr.ctl & RST) {
1166                                 return 0;
1167                         }
1168
1169                         // Otherwise, continue processing.
1170                         len = 0;
1171                 }
1172         }
1173
1174         c->snd.wnd = hdr.wnd; // TODO: move below
1175
1176         // 1c. Drop packets with an invalid ACK.
1177         // ackno should not roll back, and it should also not be bigger than what we ever could have sent
1178         // (= snd.una + c->sndbuf.used).
1179
1180         if(!is_reliable(c)) {
1181                 if(hdr.ack != c->snd.last && c->state >= ESTABLISHED) {
1182                         hdr.ack = c->snd.una;
1183                 }
1184         }
1185
1186         if(hdr.ctl & ACK && (seqdiff(hdr.ack, c->snd.last) > 0 || seqdiff(hdr.ack, c->snd.una) < 0)) {
1187                 debug("Packet ack seqno out of range, %u <= %u < %u\n", c->snd.una, hdr.ack, c->snd.una + c->sndbuf.used);
1188
1189                 // Ignore unacceptable RST packets.
1190                 if(hdr.ctl & RST) {
1191                         return 0;
1192                 }
1193
1194                 goto reset;
1195         }
1196
1197         // 2. Handle RST packets
1198
1199         if(hdr.ctl & RST) {
1200                 switch(c->state) {
1201                 case SYN_SENT:
1202                         if(!(hdr.ctl & ACK)) {
1203                                 return 0;
1204                         }
1205
1206                         // The peer has refused our connection.
1207                         set_state(c, CLOSED);
1208                         errno = ECONNREFUSED;
1209
1210                         if(c->recv) {
1211                                 c->recv(c, NULL, 0);
1212                         }
1213
1214                         if(c->poll && !c->reapable) {
1215                                 c->poll(c, 0);
1216                         }
1217
1218                         return 0;
1219
1220                 case SYN_RECEIVED:
1221                         if(hdr.ctl & ACK) {
1222                                 return 0;
1223                         }
1224
1225                         // We haven't told the application about this connection yet. Silently delete.
1226                         free_connection(c);
1227                         return 0;
1228
1229                 case ESTABLISHED:
1230                 case FIN_WAIT_1:
1231                 case FIN_WAIT_2:
1232                 case CLOSE_WAIT:
1233                         if(hdr.ctl & ACK) {
1234                                 return 0;
1235                         }
1236
1237                         // The peer has aborted our connection.
1238                         set_state(c, CLOSED);
1239                         errno = ECONNRESET;
1240
1241                         if(c->recv) {
1242                                 c->recv(c, NULL, 0);
1243                         }
1244
1245                         if(c->poll && !c->reapable) {
1246                                 c->poll(c, 0);
1247                         }
1248
1249                         return 0;
1250
1251                 case CLOSING:
1252                 case LAST_ACK:
1253                 case TIME_WAIT:
1254                         if(hdr.ctl & ACK) {
1255                                 return 0;
1256                         }
1257
1258                         // As far as the application is concerned, the connection has already been closed.
1259                         // If it has called utcp_close() already, we can immediately free this connection.
1260                         if(c->reapable) {
1261                                 free_connection(c);
1262                                 return 0;
1263                         }
1264
1265                         // Otherwise, immediately move to the CLOSED state.
1266                         set_state(c, CLOSED);
1267                         return 0;
1268
1269                 default:
1270 #ifdef UTCP_DEBUG
1271                         abort();
1272 #endif
1273                         break;
1274                 }
1275         }
1276
1277         uint32_t advanced;
1278
1279         if(!(hdr.ctl & ACK)) {
1280                 advanced = 0;
1281                 goto skip_ack;
1282         }
1283
1284         // 3. Advance snd.una
1285
1286         advanced = seqdiff(hdr.ack, c->snd.una);
1287
1288         if(advanced) {
1289                 // RTT measurement
1290                 if(c->rtt_start.tv_sec) {
1291                         if(c->rtt_seq == hdr.ack) {
1292                                 struct timeval now, diff;
1293                                 gettimeofday(&now, NULL);
1294                                 timersub(&now, &c->rtt_start, &diff);
1295                                 update_rtt(c, diff.tv_sec * 1000000 + diff.tv_usec);
1296                                 c->rtt_start.tv_sec = 0;
1297                         } else if(c->rtt_seq < hdr.ack) {
1298                                 debug("Cancelling RTT measurement: %u < %u\n", c->rtt_seq, hdr.ack);
1299                                 c->rtt_start.tv_sec = 0;
1300                         }
1301                 }
1302
1303                 int32_t data_acked = advanced;
1304
1305                 switch(c->state) {
1306                 case SYN_SENT:
1307                 case SYN_RECEIVED:
1308                         data_acked--;
1309                         break;
1310
1311                 // TODO: handle FIN as well.
1312                 default:
1313                         break;
1314                 }
1315
1316                 assert(data_acked >= 0);
1317
1318 #ifndef NDEBUG
1319                 int32_t bufused = seqdiff(c->snd.last, c->snd.una);
1320                 assert(data_acked <= bufused);
1321 #endif
1322
1323                 if(data_acked) {
1324                         buffer_get(&c->sndbuf, NULL, data_acked);
1325                 }
1326
1327                 // Also advance snd.nxt if possible
1328                 if(seqdiff(c->snd.nxt, hdr.ack) < 0) {
1329                         c->snd.nxt = hdr.ack;
1330                 }
1331
1332                 c->snd.una = hdr.ack;
1333
1334                 c->dupack = 0;
1335                 c->snd.cwnd += utcp->mtu;
1336
1337                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1338                         c->snd.cwnd = c->sndbuf.maxsize;
1339                 }
1340
1341                 // Check if we have sent a FIN that is now ACKed.
1342                 switch(c->state) {
1343                 case FIN_WAIT_1:
1344                         if(c->snd.una == c->snd.last) {
1345                                 set_state(c, FIN_WAIT_2);
1346                         }
1347
1348                         break;
1349
1350                 case CLOSING:
1351                         if(c->snd.una == c->snd.last) {
1352                                 gettimeofday(&c->conn_timeout, NULL);
1353                                 c->conn_timeout.tv_sec += utcp->timeout;
1354                                 set_state(c, TIME_WAIT);
1355                         }
1356
1357                         break;
1358
1359                 default:
1360                         break;
1361                 }
1362         } else {
1363                 if(!len && is_reliable(c)) {
1364                         c->dupack++;
1365
1366                         if(c->dupack == 3) {
1367                                 debug("Triplicate ACK\n");
1368                                 //TODO: Resend one packet and go to fast recovery mode. See RFC 6582.
1369                                 //We do a very simple variant here; reset the nxt pointer to the last acknowledged packet from the peer.
1370                                 //Reset the congestion window so we wait for ACKs.
1371                                 c->snd.nxt = c->snd.una;
1372                                 c->snd.cwnd = utcp->mtu;
1373                                 start_retransmit_timer(c);
1374                         }
1375                 }
1376         }
1377
1378         // 4. Update timers
1379
1380         if(advanced) {
1381                 if(c->snd.una == c->snd.last) {
1382                         stop_retransmit_timer(c);
1383                         timerclear(&c->conn_timeout);
1384                 } else if(is_reliable(c)) {
1385                         start_retransmit_timer(c);
1386                         gettimeofday(&c->conn_timeout, NULL);
1387                         c->conn_timeout.tv_sec += utcp->timeout;
1388                 }
1389         }
1390
1391 skip_ack:
1392         // 5. Process SYN stuff
1393
1394         if(hdr.ctl & SYN) {
1395                 switch(c->state) {
1396                 case SYN_SENT:
1397
1398                         // This is a SYNACK. It should always have ACKed the SYN.
1399                         if(!advanced) {
1400                                 goto reset;
1401                         }
1402
1403                         c->rcv.irs = hdr.seq;
1404                         c->rcv.nxt = hdr.seq;
1405
1406                         if(c->shut_wr) {
1407                                 c->snd.last++;
1408                                 set_state(c, FIN_WAIT_1);
1409                         } else {
1410                                 set_state(c, ESTABLISHED);
1411                         }
1412
1413                         // TODO: notify application of this somehow.
1414                         break;
1415
1416                 case SYN_RECEIVED:
1417                         // This is a retransmit of a SYN, send back the SYNACK.
1418                         goto synack;
1419
1420                 case ESTABLISHED:
1421                 case FIN_WAIT_1:
1422                 case FIN_WAIT_2:
1423                 case CLOSE_WAIT:
1424                 case CLOSING:
1425                 case LAST_ACK:
1426                 case TIME_WAIT:
1427                         // Ehm, no. We should never receive a second SYN.
1428                         return 0;
1429
1430                 default:
1431 #ifdef UTCP_DEBUG
1432                         abort();
1433 #endif
1434                         return 0;
1435                 }
1436
1437                 // SYN counts as one sequence number
1438                 c->rcv.nxt++;
1439         }
1440
1441         // 6. Process new data
1442
1443         if(c->state == SYN_RECEIVED) {
1444                 // This is the ACK after the SYNACK. It should always have ACKed the SYNACK.
1445                 if(!advanced) {
1446                         goto reset;
1447                 }
1448
1449                 // Are we still LISTENing?
1450                 if(utcp->accept) {
1451                         utcp->accept(c, c->src);
1452                 }
1453
1454                 if(c->state != ESTABLISHED) {
1455                         set_state(c, CLOSED);
1456                         c->reapable = true;
1457                         goto reset;
1458                 }
1459         }
1460
1461         if(len) {
1462                 switch(c->state) {
1463                 case SYN_SENT:
1464                 case SYN_RECEIVED:
1465                         // This should never happen.
1466 #ifdef UTCP_DEBUG
1467                         abort();
1468 #endif
1469                         return 0;
1470
1471                 case ESTABLISHED:
1472                 case FIN_WAIT_1:
1473                 case FIN_WAIT_2:
1474                         break;
1475
1476                 case CLOSE_WAIT:
1477                 case CLOSING:
1478                 case LAST_ACK:
1479                 case TIME_WAIT:
1480                         // Ehm no, We should never receive more data after a FIN.
1481                         goto reset;
1482
1483                 default:
1484 #ifdef UTCP_DEBUG
1485                         abort();
1486 #endif
1487                         return 0;
1488                 }
1489
1490                 handle_incoming_data(c, hdr.seq, ptr, len);
1491         }
1492
1493         // 7. Process FIN stuff
1494
1495         if((hdr.ctl & FIN) && (!is_reliable(c) || hdr.seq + len == c->rcv.nxt)) {
1496                 switch(c->state) {
1497                 case SYN_SENT:
1498                 case SYN_RECEIVED:
1499                         // This should never happen.
1500 #ifdef UTCP_DEBUG
1501                         abort();
1502 #endif
1503                         break;
1504
1505                 case ESTABLISHED:
1506                         set_state(c, CLOSE_WAIT);
1507                         break;
1508
1509                 case FIN_WAIT_1:
1510                         set_state(c, CLOSING);
1511                         break;
1512
1513                 case FIN_WAIT_2:
1514                         gettimeofday(&c->conn_timeout, NULL);
1515                         c->conn_timeout.tv_sec += utcp->timeout;
1516                         set_state(c, TIME_WAIT);
1517                         break;
1518
1519                 case CLOSE_WAIT:
1520                 case CLOSING:
1521                 case LAST_ACK:
1522                 case TIME_WAIT:
1523                         // Ehm, no. We should never receive a second FIN.
1524                         goto reset;
1525
1526                 default:
1527 #ifdef UTCP_DEBUG
1528                         abort();
1529 #endif
1530                         break;
1531                 }
1532
1533                 // FIN counts as one sequence number
1534                 c->rcv.nxt++;
1535                 len++;
1536
1537                 // Inform the application that the peer closed its end of the connection.
1538                 if(c->recv) {
1539                         errno = 0;
1540                         c->recv(c, NULL, 0);
1541                 }
1542         }
1543
1544         // Now we send something back if:
1545         // - we received data, so we have to send back an ACK
1546         //   -> sendatleastone = true
1547         // - or we got an ack, so we should maybe send a bit more data
1548         //   -> sendatleastone = false
1549
1550         if(is_reliable(c) || hdr.ctl & SYN || hdr.ctl & FIN) {
1551                 ack(c, has_data);
1552         }
1553
1554         return 0;
1555
1556 reset:
1557         swap_ports(&hdr);
1558         hdr.wnd = 0;
1559         hdr.aux = 0;
1560
1561         if(hdr.ctl & ACK) {
1562                 hdr.seq = hdr.ack;
1563                 hdr.ctl = RST;
1564         } else {
1565                 hdr.ack = hdr.seq + len;
1566                 hdr.seq = 0;
1567                 hdr.ctl = RST | ACK;
1568         }
1569
1570         print_packet(utcp, "send", &hdr, sizeof(hdr));
1571         utcp->send(utcp, &hdr, sizeof(hdr));
1572         return 0;
1573
1574 }
1575
1576 int utcp_shutdown(struct utcp_connection *c, int dir) {
1577         debug("%p shutdown %d at %u\n", c ? c->utcp : NULL, dir, c ? c->snd.last : 0);
1578
1579         if(!c) {
1580                 errno = EFAULT;
1581                 return -1;
1582         }
1583
1584         if(c->reapable) {
1585                 debug("Error: shutdown() called on closed connection %p\n", c);
1586                 errno = EBADF;
1587                 return -1;
1588         }
1589
1590         if(!(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_WR || dir == UTCP_SHUT_RDWR)) {
1591                 errno = EINVAL;
1592                 return -1;
1593         }
1594
1595         // TCP does not have a provision for stopping incoming packets.
1596         // The best we can do is to just ignore them.
1597         if(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_RDWR) {
1598                 c->recv = NULL;
1599         }
1600
1601         // The rest of the code deals with shutting down writes.
1602         if(dir == UTCP_SHUT_RD) {
1603                 return 0;
1604         }
1605
1606         // Only process shutting down writes once.
1607         if(c->shut_wr) {
1608                 return 0;
1609         }
1610
1611         c->shut_wr = true;
1612
1613         switch(c->state) {
1614         case CLOSED:
1615         case LISTEN:
1616                 errno = ENOTCONN;
1617                 return -1;
1618
1619         case SYN_SENT:
1620                 return 0;
1621
1622         case SYN_RECEIVED:
1623         case ESTABLISHED:
1624                 set_state(c, FIN_WAIT_1);
1625                 break;
1626
1627         case FIN_WAIT_1:
1628         case FIN_WAIT_2:
1629                 return 0;
1630
1631         case CLOSE_WAIT:
1632                 set_state(c, CLOSING);
1633                 break;
1634
1635         case CLOSING:
1636         case LAST_ACK:
1637         case TIME_WAIT:
1638                 return 0;
1639         }
1640
1641         c->snd.last++;
1642
1643         ack(c, false);
1644
1645         if(!timerisset(&c->rtrx_timeout)) {
1646                 start_retransmit_timer(c);
1647         }
1648
1649         return 0;
1650 }
1651
1652 static bool reset_connection(struct utcp_connection *c) {
1653         if(!c) {
1654                 errno = EFAULT;
1655                 return false;
1656         }
1657
1658         if(c->reapable) {
1659                 debug("Error: abort() called on closed connection %p\n", c);
1660                 errno = EBADF;
1661                 return false;
1662         }
1663
1664         c->recv = NULL;
1665         c->poll = NULL;
1666
1667         switch(c->state) {
1668         case CLOSED:
1669                 return true;
1670
1671         case LISTEN:
1672         case SYN_SENT:
1673         case CLOSING:
1674         case LAST_ACK:
1675         case TIME_WAIT:
1676                 set_state(c, CLOSED);
1677                 return true;
1678
1679         case SYN_RECEIVED:
1680         case ESTABLISHED:
1681         case FIN_WAIT_1:
1682         case FIN_WAIT_2:
1683         case CLOSE_WAIT:
1684                 set_state(c, CLOSED);
1685                 break;
1686         }
1687
1688         // Send RST
1689
1690         struct hdr hdr;
1691
1692         hdr.src = c->src;
1693         hdr.dst = c->dst;
1694         hdr.seq = c->snd.nxt;
1695         hdr.ack = 0;
1696         hdr.wnd = 0;
1697         hdr.ctl = RST;
1698
1699         print_packet(c->utcp, "send", &hdr, sizeof(hdr));
1700         c->utcp->send(c->utcp, &hdr, sizeof(hdr));
1701         return true;
1702 }
1703
1704 // Closes all the opened connections
1705 void utcp_abort_all_connections(struct utcp *utcp) {
1706         if(!utcp) {
1707                 errno = EINVAL;
1708                 return;
1709         }
1710
1711         for(int i = 0; i < utcp->nconnections; i++) {
1712                 struct utcp_connection *c = utcp->connections[i];
1713
1714                 if(c->reapable || c->state == CLOSED) {
1715                         continue;
1716                 }
1717
1718                 utcp_recv_t old_recv = c->recv;
1719                 utcp_poll_t old_poll = c->poll;
1720
1721                 reset_connection(c);
1722
1723                 if(old_recv) {
1724                         errno = 0;
1725                         old_recv(c, NULL, 0);
1726                 }
1727
1728                 if(old_poll && !c->reapable) {
1729                         errno = 0;
1730                         old_poll(c, 0);
1731                 }
1732         }
1733
1734         return;
1735 }
1736
1737 int utcp_close(struct utcp_connection *c) {
1738         if(utcp_shutdown(c, SHUT_RDWR) && errno != ENOTCONN) {
1739                 return -1;
1740         }
1741
1742         c->recv = NULL;
1743         c->poll = NULL;
1744         c->reapable = true;
1745         return 0;
1746 }
1747
1748 int utcp_abort(struct utcp_connection *c) {
1749         if(!reset_connection(c)) {
1750                 return -1;
1751         }
1752
1753         c->reapable = true;
1754         return 0;
1755 }
1756
1757 /* Handle timeouts.
1758  * One call to this function will loop through all connections,
1759  * checking if something needs to be resent or not.
1760  * The return value is the time to the next timeout in milliseconds,
1761  * or maybe a negative value if the timeout is infinite.
1762  */
1763 struct timeval utcp_timeout(struct utcp *utcp) {
1764         struct timeval now;
1765         gettimeofday(&now, NULL);
1766         struct timeval next = {now.tv_sec + 3600, now.tv_usec};
1767
1768         for(int i = 0; i < utcp->nconnections; i++) {
1769                 struct utcp_connection *c = utcp->connections[i];
1770
1771                 if(!c) {
1772                         continue;
1773                 }
1774
1775                 // delete connections that have been utcp_close()d.
1776                 if(c->state == CLOSED) {
1777                         if(c->reapable) {
1778                                 debug("Reaping %p\n", c);
1779                                 free_connection(c);
1780                                 i--;
1781                         }
1782
1783                         continue;
1784                 }
1785
1786                 if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &now, <)) {
1787                         errno = ETIMEDOUT;
1788                         c->state = CLOSED;
1789
1790                         if(c->recv) {
1791                                 c->recv(c, NULL, 0);
1792                         }
1793
1794                         if(c->poll && !c->reapable) {
1795                                 c->poll(c, 0);
1796                         }
1797
1798                         continue;
1799                 }
1800
1801                 if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &now, <)) {
1802                         debug("retransmit()\n");
1803                         retransmit(c);
1804                 }
1805
1806                 if(c->poll) {
1807                         if((c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
1808                                 uint32_t len =  buffer_free(&c->sndbuf);
1809
1810                                 if(len) {
1811                                         c->poll(c, len);
1812                                 }
1813                         } else if(c->state == CLOSED) {
1814                                 c->poll(c, 0);
1815                         }
1816                 }
1817
1818                 if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &next, <)) {
1819                         next = c->conn_timeout;
1820                 }
1821
1822                 if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &next, <)) {
1823                         next = c->rtrx_timeout;
1824                 }
1825         }
1826
1827         struct timeval diff;
1828
1829         timersub(&next, &now, &diff);
1830
1831         return diff;
1832 }
1833
1834 bool utcp_is_active(struct utcp *utcp) {
1835         if(!utcp) {
1836                 return false;
1837         }
1838
1839         for(int i = 0; i < utcp->nconnections; i++)
1840                 if(utcp->connections[i]->state != CLOSED && utcp->connections[i]->state != TIME_WAIT) {
1841                         return true;
1842                 }
1843
1844         return false;
1845 }
1846
1847 struct utcp *utcp_init(utcp_accept_t accept, utcp_pre_accept_t pre_accept, utcp_send_t send, void *priv) {
1848         if(!send) {
1849                 errno = EFAULT;
1850                 return NULL;
1851         }
1852
1853         struct utcp *utcp = calloc(1, sizeof(*utcp));
1854
1855         if(!utcp) {
1856                 return NULL;
1857         }
1858
1859         utcp->accept = accept;
1860         utcp->pre_accept = pre_accept;
1861         utcp->send = send;
1862         utcp->priv = priv;
1863         utcp->mtu = DEFAULT_MTU;
1864         utcp->timeout = DEFAULT_USER_TIMEOUT; // sec
1865         utcp->rto = START_RTO; // usec
1866
1867         return utcp;
1868 }
1869
1870 void utcp_exit(struct utcp *utcp) {
1871         if(!utcp) {
1872                 return;
1873         }
1874
1875         for(int i = 0; i < utcp->nconnections; i++) {
1876                 struct utcp_connection *c = utcp->connections[i];
1877
1878                 if(!c->reapable) {
1879                         if(c->recv) {
1880                                 c->recv(c, NULL, 0);
1881                         }
1882
1883                         if(c->poll && !c->reapable) {
1884                                 c->poll(c, 0);
1885                         }
1886                 }
1887
1888                 buffer_exit(&c->rcvbuf);
1889                 buffer_exit(&c->sndbuf);
1890                 free(c);
1891         }
1892
1893         free(utcp->connections);
1894         free(utcp);
1895 }
1896
1897 uint16_t utcp_get_mtu(struct utcp *utcp) {
1898         return utcp ? utcp->mtu : 0;
1899 }
1900
1901 void utcp_set_mtu(struct utcp *utcp, uint16_t mtu) {
1902         // TODO: handle overhead of the header
1903         if(utcp) {
1904                 utcp->mtu = mtu;
1905         }
1906 }
1907
1908 void utcp_reset_timers(struct utcp *utcp) {
1909         if(!utcp) {
1910                 return;
1911         }
1912
1913         struct timeval now, then;
1914
1915         gettimeofday(&now, NULL);
1916
1917         then = now;
1918
1919         then.tv_sec += utcp->timeout;
1920
1921         for(int i = 0; i < utcp->nconnections; i++) {
1922                 struct utcp_connection *c = utcp->connections[i];
1923
1924                 if(c->reapable) {
1925                         continue;
1926                 }
1927
1928                 if(timerisset(&c->rtrx_timeout)) {
1929                         c->rtrx_timeout = now;
1930                 }
1931
1932                 if(timerisset(&c->conn_timeout)) {
1933                         c->conn_timeout = then;
1934                 }
1935
1936                 c->rtt_start.tv_sec = 0;
1937         }
1938
1939         if(utcp->rto > START_RTO) {
1940                 utcp->rto = START_RTO;
1941         }
1942 }
1943
1944 int utcp_get_user_timeout(struct utcp *u) {
1945         return u ? u->timeout : 0;
1946 }
1947
1948 void utcp_set_user_timeout(struct utcp *u, int timeout) {
1949         if(u) {
1950                 u->timeout = timeout;
1951         }
1952 }
1953
1954 size_t utcp_get_sndbuf(struct utcp_connection *c) {
1955         return c ? c->sndbuf.maxsize : 0;
1956 }
1957
1958 size_t utcp_get_sndbuf_free(struct utcp_connection *c) {
1959         if(!c) {
1960                 return 0;
1961         }
1962
1963         switch(c->state) {
1964         case SYN_SENT:
1965         case SYN_RECEIVED:
1966         case ESTABLISHED:
1967         case CLOSE_WAIT:
1968                 return buffer_free(&c->sndbuf);
1969
1970         default:
1971                 return 0;
1972         }
1973 }
1974
1975 void utcp_set_sndbuf(struct utcp_connection *c, size_t size) {
1976         if(!c) {
1977                 return;
1978         }
1979
1980         c->sndbuf.maxsize = size;
1981
1982         if(c->sndbuf.maxsize != size) {
1983                 c->sndbuf.maxsize = -1;
1984         }
1985 }
1986
1987 size_t utcp_get_rcvbuf(struct utcp_connection *c) {
1988         return c ? c->rcvbuf.maxsize : 0;
1989 }
1990
1991 size_t utcp_get_rcvbuf_free(struct utcp_connection *c) {
1992         if(c && (c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
1993                 return buffer_free(&c->rcvbuf);
1994         } else {
1995                 return 0;
1996         }
1997 }
1998
1999 void utcp_set_rcvbuf(struct utcp_connection *c, size_t size) {
2000         if(!c) {
2001                 return;
2002         }
2003
2004         c->rcvbuf.maxsize = size;
2005
2006         if(c->rcvbuf.maxsize != size) {
2007                 c->rcvbuf.maxsize = -1;
2008         }
2009 }
2010
2011 size_t utcp_get_sendq(struct utcp_connection *c) {
2012         return c->sndbuf.used;
2013 }
2014
2015 size_t utcp_get_recvq(struct utcp_connection *c) {
2016         return c->rcvbuf.used;
2017 }
2018
2019 bool utcp_get_nodelay(struct utcp_connection *c) {
2020         return c ? c->nodelay : false;
2021 }
2022
2023 void utcp_set_nodelay(struct utcp_connection *c, bool nodelay) {
2024         if(c) {
2025                 c->nodelay = nodelay;
2026         }
2027 }
2028
2029 bool utcp_get_keepalive(struct utcp_connection *c) {
2030         return c ? c->keepalive : false;
2031 }
2032
2033 void utcp_set_keepalive(struct utcp_connection *c, bool keepalive) {
2034         if(c) {
2035                 c->keepalive = keepalive;
2036         }
2037 }
2038
2039 size_t utcp_get_outq(struct utcp_connection *c) {
2040         return c ? seqdiff(c->snd.nxt, c->snd.una) : 0;
2041 }
2042
2043 void utcp_set_recv_cb(struct utcp_connection *c, utcp_recv_t recv) {
2044         if(c) {
2045                 c->recv = recv;
2046         }
2047 }
2048
2049 void utcp_set_poll_cb(struct utcp_connection *c, utcp_poll_t poll) {
2050         if(c) {
2051                 c->poll = poll;
2052         }
2053 }
2054
2055 void utcp_set_accept_cb(struct utcp *utcp, utcp_accept_t accept, utcp_pre_accept_t pre_accept) {
2056         if(utcp) {
2057                 utcp->accept = accept;
2058                 utcp->pre_accept = pre_accept;
2059         }
2060 }
2061
2062 void utcp_expect_data(struct utcp_connection *c, bool expect) {
2063         if(!c || c->reapable) {
2064                 return;
2065         }
2066
2067         if(!(c->state == ESTABLISHED || c->state == FIN_WAIT_1 || c->state == FIN_WAIT_2)) {
2068                 return;
2069         }
2070
2071         if(expect) {
2072                 // If we expect data, start the connection timer.
2073                 if(!timerisset(&c->conn_timeout)) {
2074                         gettimeofday(&c->conn_timeout, NULL);
2075                         c->conn_timeout.tv_sec += c->utcp->timeout;
2076                 }
2077         } else {
2078                 // If we want to cancel expecting data, only clear the timer when there is no unACKed data.
2079                 if(c->snd.una == c->snd.last) {
2080                         timerclear(&c->conn_timeout);
2081                 }
2082         }
2083 }
2084
2085 void utcp_offline(struct utcp *utcp, bool offline) {
2086         struct timeval now;
2087         gettimeofday(&now, NULL);
2088
2089         for(int i = 0; i < utcp->nconnections; i++) {
2090                 struct utcp_connection *c = utcp->connections[i];
2091
2092                 if(c->reapable) {
2093                         continue;
2094                 }
2095
2096                 utcp_expect_data(c, offline);
2097
2098                 if(!offline) {
2099                         if(timerisset(&c->rtrx_timeout)) {
2100                                 c->rtrx_timeout = now;
2101                         }
2102
2103                         utcp->connections[i]->rtt_start.tv_sec = 0;
2104                 }
2105         }
2106
2107         if(!offline && utcp->rto > START_RTO) {
2108                 utcp->rto = START_RTO;
2109         }
2110 }