]> git.meshlink.io Git - utcp/blob - utcp.c
Implement ringbuffers.
[utcp] / utcp.c
1 /*
2     utcp.c -- Userspace TCP
3     Copyright (C) 2014-2017 Guus Sliepen <guus@tinc-vpn.org>
4
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14
15     You should have received a copy of the GNU General Public License along
16     with this program; if not, write to the Free Software Foundation, Inc.,
17     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20 #define _GNU_SOURCE
21
22 #include <assert.h>
23 #include <errno.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <sys/time.h>
31 #include <sys/socket.h>
32
33 #include "utcp_priv.h"
34
35 #ifndef EBADMSG
36 #define EBADMSG         104
37 #endif
38
39 #ifndef SHUT_RDWR
40 #define SHUT_RDWR 2
41 #endif
42
43 #ifdef poll
44 #undef poll
45 #endif
46
#ifndef timersub
// Fallback for platforms without timersub(): stores a - b in r, borrowing
// one second when the microsecond difference becomes negative.
#define timersub(a, b, r)\
	do {\
		(r)->tv_sec = (a)->tv_sec - (b)->tv_sec;\
		(r)->tv_usec = (a)->tv_usec - (b)->tv_usec;\
		if((r)->tv_usec < 0) {\
			(r)->tv_sec--;\
			(r)->tv_usec += USEC_PER_SEC;\
		}\
	} while (0)
#endif
56
// Return the larger of two sizes.
static inline size_t max(size_t a, size_t b) {
	if(a > b) {
		return a;
	}

	return b;
}
60
61 #ifdef UTCP_DEBUG
62 #include <stdarg.h>
63
// printf-style diagnostic output; everything goes to stderr.
static void debug(const char *format, ...) {
	va_list args;

	va_start(args, format);
	vfprintf(stderr, format, args);
	va_end(args);
}
70
71 static void print_packet(struct utcp *utcp, const char *dir, const void *pkt, size_t len) {
72         struct hdr hdr;
73
74         if(len < sizeof(hdr)) {
75                 debug("%p %s: short packet (%lu bytes)\n", utcp, dir, (unsigned long)len);
76                 return;
77         }
78
79         memcpy(&hdr, pkt, sizeof(hdr));
80         debug("%p %s: len=%lu, src=%u dst=%u seq=%u ack=%u wnd=%u aux=%x ctl=", utcp, dir, (unsigned long)len, hdr.src, hdr.dst, hdr.seq, hdr.ack, hdr.wnd, hdr.aux);
81
82         if(hdr.ctl & SYN) {
83                 debug("SYN");
84         }
85
86         if(hdr.ctl & RST) {
87                 debug("RST");
88         }
89
90         if(hdr.ctl & FIN) {
91                 debug("FIN");
92         }
93
94         if(hdr.ctl & ACK) {
95                 debug("ACK");
96         }
97
98         if(len > sizeof(hdr)) {
99                 uint32_t datalen = len - sizeof(hdr);
100                 const uint8_t *data = (uint8_t *)pkt + sizeof(hdr);
101                 char str[datalen * 2 + 1];
102                 char *p = str;
103
104                 for(uint32_t i = 0; i < datalen; i++) {
105                         *p++ = "0123456789ABCDEF"[data[i] >> 4];
106                         *p++ = "0123456789ABCDEF"[data[i] & 15];
107                 }
108
109                 *p = 0;
110
111                 debug(" data=%s", str);
112         }
113
114         debug("\n");
115 }
116 #else
117 #define debug(...) do {} while(0)
118 #define print_packet(...) do {} while(0)
119 #endif
120
121 static void set_state(struct utcp_connection *c, enum state state) {
122         c->state = state;
123
124         if(state == ESTABLISHED) {
125                 timerclear(&c->conn_timeout);
126         }
127
128         debug("%p new state: %s\n", c->utcp, strstate[state]);
129 }
130
131 static bool fin_wanted(struct utcp_connection *c, uint32_t seq) {
132         if(seq != c->snd.last) {
133                 return false;
134         }
135
136         switch(c->state) {
137         case FIN_WAIT_1:
138         case CLOSING:
139         case LAST_ACK:
140                 return true;
141
142         default:
143                 return false;
144         }
145 }
146
147 static bool is_reliable(struct utcp_connection *c) {
148         return c->flags & UTCP_RELIABLE;
149 }
150
// Signed distance a - b between two 32-bit sequence numbers; the unsigned
// subtraction wraps, so the result is correct across sequence wrap-around.
static int32_t seqdiff(uint32_t a, uint32_t b) {
	uint32_t delta = a - b;
	return delta;
}
154
155 // Buffer functions
156 static bool buffer_wraps(struct buffer *buf) {
157         return buf->size - buf->offset < buf->used;
158 }
159
160 static bool buffer_resize(struct buffer *buf, uint32_t newsize) {
161         char *newdata = realloc(buf->data, newsize);
162
163         if(!newdata) {
164                 return false;
165         }
166
167         buf->data = newdata;
168
169         if(buffer_wraps(buf)) {
170                 // Shift the right part of the buffer until it hits the end of the new buffer.
171                 // Old situation:
172                 // [345......012]
173                 // New situation:
174                 // [345.........|........012]
175                 uint32_t tailsize = buf->size - buf->offset;
176                 uint32_t newoffset = newsize - tailsize;
177                 memmove(buf + newoffset, buf + buf->offset, tailsize);
178                 buf->offset = newoffset;
179         }
180
181         buf->size = newsize;
182         return true;
183 }
184
185 // Store data into the buffer
186 static ssize_t buffer_put_at(struct buffer *buf, size_t offset, const void *data, size_t len) {
187         debug("buffer_put_at %lu %lu %lu\n", (unsigned long)buf->used, (unsigned long)offset, (unsigned long)len);
188
189         // Ensure we don't store more than maxsize bytes in total
190         size_t required = offset + len;
191
192         if(required > buf->maxsize) {
193                 if(offset >= buf->maxsize) {
194                         return 0;
195                 }
196
197                 len = buf->maxsize - offset;
198                 required = buf->maxsize;
199         }
200
201         // Check if we need to resize the buffer
202         if(required > buf->size) {
203                 size_t newsize = buf->size;
204
205                 if(!newsize) {
206                         newsize = 4096;
207                 }
208
209                 do {
210                         newsize *= 2;
211                 } while(newsize < required);
212
213                 if(newsize > buf->maxsize) {
214                         newsize = buf->maxsize;
215                 }
216
217                 if(!buffer_resize(buf, newsize)) {
218                         return -1;
219                 }
220         }
221
222         uint32_t realoffset = buf->offset + offset;
223
224         if(buf->size - buf->offset < offset) {
225                 // The offset wrapped
226                 realoffset -= buf->size;
227         }
228
229         if(buf->size - realoffset < len) {
230                 // The new chunk of data must be wrapped
231                 memcpy(buf->data + realoffset, data, buf->size - realoffset);
232                 memcpy(buf->data, (char *)data + buf->size - realoffset, len - (buf->size - realoffset));
233         } else {
234                 memcpy(buf->data + realoffset, data, len);
235         }
236
237         if(required > buf->used) {
238                 buf->used = required;
239         }
240
241         return len;
242 }
243
244 static ssize_t buffer_put(struct buffer *buf, const void *data, size_t len) {
245         return buffer_put_at(buf, buf->used, data, len);
246 }
247
248 // Copy data from the buffer without removing it.
249 static ssize_t buffer_copy(struct buffer *buf, void *data, size_t offset, size_t len) {
250         // Ensure we don't copy more than is actually stored in the buffer
251         if(offset >= buf->used) {
252                 return 0;
253         }
254
255         if(buf->used - offset < len) {
256                 len = buf->used - offset;
257         }
258
259         uint32_t realoffset = buf->offset + offset;
260
261         if(buf->size - buf->offset < offset) {
262                 // The offset wrapped
263                 realoffset -= buf->size;
264         }
265
266         if(buf->size - realoffset < len) {
267                 // The data is wrapped
268                 memcpy(data, buf->data + realoffset, buf->size - realoffset);
269                 memcpy((char *)data + buf->size - realoffset, buf->data, len - (buf->size - realoffset));
270         } else {
271                 memcpy(data, buf->data + realoffset, len);
272         }
273
274         return len;
275 }
276
277 // Get data from the buffer.
278 static ssize_t buffer_get(struct buffer *buf, void *data, size_t len) {
279         len = buffer_copy(buf, data, 0, len);
280
281         if(buf->size - buf->offset < len) {
282                 buf->offset -= buf->size;
283         }
284
285         buf->offset += len;
286         buf->used -= len;
287         return len;
288 }
289
290 // Discard data from the buffer.
291 static ssize_t buffer_discard(struct buffer *buf, size_t len) {
292         if(buf->used < len) {
293                 len = buf->used;
294         }
295
296         if(buf->size - buf->offset < len) {
297                 buf->offset -= buf->size;
298         }
299
300         buf->offset += len;
301         buf->used -= len;
302
303         return len;
304 }
305
306 static bool buffer_set_size(struct buffer *buf, uint32_t minsize, uint32_t maxsize) {
307         if(maxsize < minsize) {
308                 maxsize = minsize;
309         }
310
311         buf->maxsize = maxsize;
312
313         return buf->size >= minsize || buffer_resize(buf, minsize);
314 }
315
316 static void buffer_exit(struct buffer *buf) {
317         free(buf->data);
318         memset(buf, 0, sizeof(*buf));
319 }
320
321 static uint32_t buffer_free(const struct buffer *buf) {
322         return buf->maxsize - buf->used;
323 }
324
325 // Connections are stored in a sorted list.
326 // This gives O(log(N)) lookup time, O(N log(N)) insertion time and O(N) deletion time.
327
328 static int compare(const void *va, const void *vb) {
329         assert(va && vb);
330
331         const struct utcp_connection *a = *(struct utcp_connection **)va;
332         const struct utcp_connection *b = *(struct utcp_connection **)vb;
333
334         assert(a && b);
335         assert(a->src && b->src);
336
337         int c = (int)a->src - (int)b->src;
338
339         if(c) {
340                 return c;
341         }
342
343         c = (int)a->dst - (int)b->dst;
344         return c;
345 }
346
347 static struct utcp_connection *find_connection(const struct utcp *utcp, uint16_t src, uint16_t dst) {
348         if(!utcp->nconnections) {
349                 return NULL;
350         }
351
352         struct utcp_connection key = {
353                 .src = src,
354                 .dst = dst,
355         }, *keyp = &key;
356         struct utcp_connection **match = bsearch(&keyp, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
357         return match ? *match : NULL;
358 }
359
360 static void free_connection(struct utcp_connection *c) {
361         struct utcp *utcp = c->utcp;
362         struct utcp_connection **cp = bsearch(&c, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
363
364         assert(cp);
365
366         int i = cp - utcp->connections;
367         memmove(cp, cp + 1, (utcp->nconnections - i - 1) * sizeof(*cp));
368         utcp->nconnections--;
369
370         buffer_exit(&c->rcvbuf);
371         buffer_exit(&c->sndbuf);
372         free(c);
373 }
374
375 static struct utcp_connection *allocate_connection(struct utcp *utcp, uint16_t src, uint16_t dst) {
376         // Check whether this combination of src and dst is free
377
378         if(src) {
379                 if(find_connection(utcp, src, dst)) {
380                         errno = EADDRINUSE;
381                         return NULL;
382                 }
383         } else { // If src == 0, generate a random port number with the high bit set
384                 if(utcp->nconnections >= 32767) {
385                         errno = ENOMEM;
386                         return NULL;
387                 }
388
389                 src = rand() | 0x8000;
390
391                 while(find_connection(utcp, src, dst)) {
392                         src++;
393                 }
394         }
395
396         // Allocate memory for the new connection
397
398         if(utcp->nconnections >= utcp->nallocated) {
399                 if(!utcp->nallocated) {
400                         utcp->nallocated = 4;
401                 } else {
402                         utcp->nallocated *= 2;
403                 }
404
405                 struct utcp_connection **new_array = realloc(utcp->connections, utcp->nallocated * sizeof(*utcp->connections));
406
407                 if(!new_array) {
408                         return NULL;
409                 }
410
411                 utcp->connections = new_array;
412         }
413
414         struct utcp_connection *c = calloc(1, sizeof(*c));
415
416         if(!c) {
417                 return NULL;
418         }
419
420         if(!buffer_set_size(&c->sndbuf, DEFAULT_SNDBUFSIZE, DEFAULT_MAXSNDBUFSIZE)) {
421                 free(c);
422                 return NULL;
423         }
424
425         if(!buffer_set_size(&c->rcvbuf, DEFAULT_RCVBUFSIZE, DEFAULT_MAXRCVBUFSIZE)) {
426                 buffer_exit(&c->sndbuf);
427                 free(c);
428                 return NULL;
429         }
430
431         // Fill in the details
432
433         c->src = src;
434         c->dst = dst;
435 #ifdef UTCP_DEBUG
436         c->snd.iss = 0;
437 #else
438         c->snd.iss = rand();
439 #endif
440         c->snd.una = c->snd.iss;
441         c->snd.nxt = c->snd.iss + 1;
442         c->rcv.wnd = utcp->mtu;
443         c->snd.last = c->snd.nxt;
444         c->snd.cwnd = utcp->mtu;
445         c->utcp = utcp;
446
447         // Add it to the sorted list of connections
448
449         utcp->connections[utcp->nconnections++] = c;
450         qsort(utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
451
452         return c;
453 }
454
// Absolute difference |a - b| of two unsigned 32-bit values.
static inline uint32_t absdiff(uint32_t a, uint32_t b) {
	return a > b ? a - b : b - a;
}
462
463 // Update RTT variables. See RFC 6298.
464 static void update_rtt(struct utcp_connection *c, uint32_t rtt) {
465         if(!rtt) {
466                 debug("invalid rtt\n");
467                 return;
468         }
469
470         struct utcp *utcp = c->utcp;
471
472         if(!utcp->srtt) {
473                 utcp->srtt = rtt;
474                 utcp->rttvar = rtt / 2;
475         } else {
476                 utcp->rttvar = (utcp->rttvar * 3 + absdiff(utcp->srtt, rtt)) / 4;
477                 utcp->srtt = (utcp->srtt * 7 + rtt) / 8;
478         }
479
480         utcp->rto = utcp->srtt + max(4 * utcp->rttvar, CLOCK_GRANULARITY);
481
482         if(utcp->rto > MAX_RTO) {
483                 utcp->rto = MAX_RTO;
484         }
485
486         debug("rtt %u srtt %u rttvar %u rto %u\n", rtt, utcp->srtt, utcp->rttvar, utcp->rto);
487 }
488
489 static void start_retransmit_timer(struct utcp_connection *c) {
490         gettimeofday(&c->rtrx_timeout, NULL);
491         c->rtrx_timeout.tv_usec += c->utcp->rto;
492
493         while(c->rtrx_timeout.tv_usec >= 1000000) {
494                 c->rtrx_timeout.tv_usec -= 1000000;
495                 c->rtrx_timeout.tv_sec++;
496         }
497
498         debug("timeout set to %lu.%06lu (%u)\n", c->rtrx_timeout.tv_sec, c->rtrx_timeout.tv_usec, c->utcp->rto);
499 }
500
501 static void stop_retransmit_timer(struct utcp_connection *c) {
502         timerclear(&c->rtrx_timeout);
503         debug("timeout cleared\n");
504 }
505
506 struct utcp_connection *utcp_connect_ex(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv, uint32_t flags) {
507         struct utcp_connection *c = allocate_connection(utcp, 0, dst);
508
509         if(!c) {
510                 return NULL;
511         }
512
513         assert((flags & ~0x1f) == 0);
514
515         c->flags = flags;
516         c->recv = recv;
517         c->priv = priv;
518
519         struct {
520                 struct hdr hdr;
521                 uint8_t init[4];
522         } pkt;
523
524         pkt.hdr.src = c->src;
525         pkt.hdr.dst = c->dst;
526         pkt.hdr.seq = c->snd.iss;
527         pkt.hdr.ack = 0;
528         pkt.hdr.wnd = c->rcv.wnd;
529         pkt.hdr.ctl = SYN;
530         pkt.hdr.aux = 0x0101;
531         pkt.init[0] = 1;
532         pkt.init[1] = 0;
533         pkt.init[2] = 0;
534         pkt.init[3] = flags & 0x7;
535
536         set_state(c, SYN_SENT);
537
538         print_packet(utcp, "send", &pkt, sizeof(pkt));
539         utcp->send(utcp, &pkt, sizeof(pkt));
540
541         gettimeofday(&c->conn_timeout, NULL);
542         c->conn_timeout.tv_sec += utcp->timeout;
543
544         start_retransmit_timer(c);
545
546         return c;
547 }
548
549 struct utcp_connection *utcp_connect(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv) {
550         return utcp_connect_ex(utcp, dst, recv, priv, UTCP_TCP);
551 }
552
553 void utcp_accept(struct utcp_connection *c, utcp_recv_t recv, void *priv) {
554         if(c->reapable || c->state != SYN_RECEIVED) {
555                 debug("Error: accept() called on invalid connection %p in state %s\n", c, strstate[c->state]);
556                 return;
557         }
558
559         debug("%p accepted, %p %p\n", c, recv, priv);
560         c->recv = recv;
561         c->priv = priv;
562         set_state(c, ESTABLISHED);
563 }
564
565 static void ack(struct utcp_connection *c, bool sendatleastone) {
566         int32_t left = seqdiff(c->snd.last, c->snd.nxt);
567         int32_t cwndleft = c->snd.cwnd - seqdiff(c->snd.nxt, c->snd.una);
568         debug("cwndleft = %d\n", cwndleft);
569
570         assert(left >= 0);
571
572         if(cwndleft <= 0) {
573                 cwndleft = 0;
574         }
575
576         if(cwndleft < left) {
577                 left = cwndleft;
578         }
579
580         if(!left && !sendatleastone) {
581                 return;
582         }
583
584         struct {
585                 struct hdr hdr;
586                 uint8_t data[];
587         } *pkt = c->utcp->pkt;
588
589         pkt->hdr.src = c->src;
590         pkt->hdr.dst = c->dst;
591         pkt->hdr.ack = c->rcv.nxt;
592         pkt->hdr.wnd = c->snd.wnd;
593         pkt->hdr.ctl = ACK;
594         pkt->hdr.aux = 0;
595
596         do {
597                 uint32_t seglen = left > c->utcp->mtu ? c->utcp->mtu : left;
598                 pkt->hdr.seq = c->snd.nxt;
599
600                 buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);
601
602                 c->snd.nxt += seglen;
603                 left -= seglen;
604
605                 if(seglen && fin_wanted(c, c->snd.nxt)) {
606                         seglen--;
607                         pkt->hdr.ctl |= FIN;
608                 }
609
610                 if(!c->rtt_start.tv_sec) {
611                         // Start RTT measurement
612                         gettimeofday(&c->rtt_start, NULL);
613                         c->rtt_seq = pkt->hdr.seq + seglen;
614                         debug("Starting RTT measurement, expecting ack %u\n", c->rtt_seq);
615                 }
616
617                 print_packet(c->utcp, "send", pkt, sizeof(pkt->hdr) + seglen);
618                 c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
619         } while(left);
620 }
621
622 ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
623         if(c->reapable) {
624                 debug("Error: send() called on closed connection %p\n", c);
625                 errno = EBADF;
626                 return -1;
627         }
628
629         switch(c->state) {
630         case CLOSED:
631         case LISTEN:
632                 debug("Error: send() called on unconnected connection %p\n", c);
633                 errno = ENOTCONN;
634                 return -1;
635
636         case SYN_SENT:
637         case SYN_RECEIVED:
638         case ESTABLISHED:
639         case CLOSE_WAIT:
640                 break;
641
642         case FIN_WAIT_1:
643         case FIN_WAIT_2:
644         case CLOSING:
645         case LAST_ACK:
646         case TIME_WAIT:
647                 debug("Error: send() called on closing connection %p\n", c);
648                 errno = EPIPE;
649                 return -1;
650         }
651
652         // Exit early if we have nothing to send.
653
654         if(!len) {
655                 return 0;
656         }
657
658         if(!data) {
659                 errno = EFAULT;
660                 return -1;
661         }
662
663         // Check if we need to be able to buffer all data
664
665         if(c->flags & UTCP_NO_PARTIAL) {
666                 if(len > buffer_free(&c->sndbuf)) {
667                         if(len > c->sndbuf.maxsize) {
668                                 errno = EMSGSIZE;
669                                 return -1;
670                         } else {
671                                 errno = EWOULDBLOCK;
672                                 return 0;
673                         }
674                 }
675         }
676
677         // Add data to send buffer.
678
679         if(is_reliable(c) || (c->state != SYN_SENT && c->state != SYN_RECEIVED)) {
680                 len = buffer_put(&c->sndbuf, data, len);
681         } else {
682                 return 0;
683         }
684
685         if(len <= 0) {
686                 if(is_reliable(c)) {
687                         errno = EWOULDBLOCK;
688                         return 0;
689                 } else {
690                         return len;
691                 }
692         }
693
694         c->snd.last += len;
695
696         // Don't send anything yet if the connection has not fully established yet
697
698         if(c->state == SYN_SENT || c->state == SYN_RECEIVED) {
699                 return len;
700         }
701
702         ack(c, false);
703
704         if(!is_reliable(c)) {
705                 c->snd.una = c->snd.nxt = c->snd.last;
706                 buffer_get(&c->sndbuf, NULL, c->sndbuf.used);
707         }
708
709         if(is_reliable(c) && !timerisset(&c->rtrx_timeout)) {
710                 start_retransmit_timer(c);
711         }
712
713         if(is_reliable(c) && !timerisset(&c->conn_timeout)) {
714                 gettimeofday(&c->conn_timeout, NULL);
715                 c->conn_timeout.tv_sec += c->utcp->timeout;
716         }
717
718         return len;
719 }
720
721 static void swap_ports(struct hdr *hdr) {
722         uint16_t tmp = hdr->src;
723         hdr->src = hdr->dst;
724         hdr->dst = tmp;
725 }
726
727 static void retransmit(struct utcp_connection *c) {
728         if(c->state == CLOSED || c->snd.last == c->snd.una) {
729                 debug("Retransmit() called but nothing to retransmit!\n");
730                 stop_retransmit_timer(c);
731                 return;
732         }
733
734         struct utcp *utcp = c->utcp;
735
736         struct {
737                 struct hdr hdr;
738                 uint8_t data[];
739         } *pkt = c->utcp->pkt;
740
741         pkt->hdr.src = c->src;
742         pkt->hdr.dst = c->dst;
743         pkt->hdr.wnd = c->rcv.wnd;
744         pkt->hdr.aux = 0;
745
746         switch(c->state) {
747         case SYN_SENT:
748                 // Send our SYN again
749                 pkt->hdr.seq = c->snd.iss;
750                 pkt->hdr.ack = 0;
751                 pkt->hdr.ctl = SYN;
752                 pkt->hdr.aux = 0x0101;
753                 pkt->data[0] = 1;
754                 pkt->data[1] = 0;
755                 pkt->data[2] = 0;
756                 pkt->data[3] = c->flags & 0x7;
757                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr) + 4);
758                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + 4);
759                 break;
760
761         case SYN_RECEIVED:
762                 // Send SYNACK again
763                 pkt->hdr.seq = c->snd.nxt;
764                 pkt->hdr.ack = c->rcv.nxt;
765                 pkt->hdr.ctl = SYN | ACK;
766                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr));
767                 utcp->send(utcp, pkt, sizeof(pkt->hdr));
768                 break;
769
770         case ESTABLISHED:
771         case FIN_WAIT_1:
772         case CLOSE_WAIT:
773         case CLOSING:
774         case LAST_ACK:
775                 // Send unacked data again.
776                 pkt->hdr.seq = c->snd.una;
777                 pkt->hdr.ack = c->rcv.nxt;
778                 pkt->hdr.ctl = ACK;
779                 uint32_t len = seqdiff(c->snd.last, c->snd.una);
780
781                 if(len > utcp->mtu) {
782                         len = utcp->mtu;
783                 }
784
785                 if(fin_wanted(c, c->snd.una + len)) {
786                         len--;
787                         pkt->hdr.ctl |= FIN;
788                 }
789
790                 c->snd.nxt = c->snd.una + len;
791                 c->snd.cwnd = utcp->mtu; // reduce cwnd on retransmit
792                 buffer_copy(&c->sndbuf, pkt->data, 0, len);
793                 print_packet(c->utcp, "rtrx", pkt, sizeof(pkt->hdr) + len);
794                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
795                 break;
796
797         case CLOSED:
798         case LISTEN:
799         case TIME_WAIT:
800         case FIN_WAIT_2:
801                 // We shouldn't need to retransmit anything in this state.
802 #ifdef UTCP_DEBUG
803                 abort();
804 #endif
805                 stop_retransmit_timer(c);
806                 goto cleanup;
807         }
808
809         start_retransmit_timer(c);
810         utcp->rto *= 2;
811
812         if(utcp->rto > MAX_RTO) {
813                 utcp->rto = MAX_RTO;
814         }
815
816         c->rtt_start.tv_sec = 0; // invalidate RTT timer
817
818 cleanup:
819         return;
820 }
821
822 /* Update receive buffer and SACK entries after consuming data.
823  *
824  * Situation:
825  *
826  * |.....0000..1111111111.....22222......3333|
827  * |---------------^
828  *
829  * 0..3 represent the SACK entries. The ^ indicates up to which point we want
830  * to remove data from the receive buffer. The idea is to substract "len"
831  * from the offset of all the SACK entries, and then remove/cut down entries
832  * that are shifted to before the start of the receive buffer.
833  *
834  * There are three cases:
835  * - the SACK entry is after ^, in that case just change the offset.
836  * - the SACK entry starts before and ends after ^, so we have to
837  *   change both its offset and size.
838  * - the SACK entry is completely before ^, in that case delete it.
839  */
840 static void sack_consume(struct utcp_connection *c, size_t len) {
841         debug("sack_consume %lu\n", (unsigned long)len);
842
843         if(len > c->rcvbuf.used) {
844                 debug("All SACK entries consumed");
845                 c->sacks[0].len = 0;
846                 return;
847         }
848
849         buffer_get(&c->rcvbuf, NULL, len);
850
851         for(int i = 0; i < NSACKS && c->sacks[i].len;) {
852                 if(len < c->sacks[i].offset) {
853                         c->sacks[i].offset -= len;
854                         i++;
855                 } else if(len < c->sacks[i].offset + c->sacks[i].len) {
856                         c->sacks[i].len -= len - c->sacks[i].offset;
857                         c->sacks[i].offset = 0;
858                         i++;
859                 } else {
860                         if(i < NSACKS - 1) {
861                                 memmove(&c->sacks[i], &c->sacks[i + 1], (NSACKS - 1 - i) * sizeof(c->sacks)[i]);
862                                 c->sacks[NSACKS - 1].len = 0;
863                         } else {
864                                 c->sacks[i].len = 0;
865                                 break;
866                         }
867                 }
868         }
869
870         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
871                 debug("SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
872         }
873 }
874
875 static void handle_out_of_order(struct utcp_connection *c, uint32_t offset, const void *data, size_t len) {
876         debug("out of order packet, offset %u\n", offset);
877         // Packet loss or reordering occured. Store the data in the buffer.
878         ssize_t rxd = buffer_put_at(&c->rcvbuf, offset, data, len);
879
880         if(rxd < 0 || (size_t)rxd < len) {
881                 abort();
882         }
883
884         // Make note of where we put it.
885         for(int i = 0; i < NSACKS; i++) {
886                 if(!c->sacks[i].len) { // nothing to merge, add new entry
887                         debug("New SACK entry %d\n", i);
888                         c->sacks[i].offset = offset;
889                         c->sacks[i].len = rxd;
890                         break;
891                 } else if(offset < c->sacks[i].offset) {
892                         if(offset + rxd < c->sacks[i].offset) { // insert before
893                                 if(!c->sacks[NSACKS - 1].len) { // only if room left
894                                         debug("Insert SACK entry at %d\n", i);
895                                         memmove(&c->sacks[i + 1], &c->sacks[i], (NSACKS - i - 1) * sizeof(c->sacks)[i]);
896                                         c->sacks[i].offset = offset;
897                                         c->sacks[i].len = rxd;
898                                 } else {
899                                         debug("SACK entries full, dropping packet\n");
900                                 }
901
902                                 break;
903                         } else { // merge
904                                 debug("Merge with start of SACK entry at %d\n", i);
905                                 c->sacks[i].offset = offset;
906                                 break;
907                         }
908                 } else if(offset <= c->sacks[i].offset + c->sacks[i].len) {
909                         if(offset + rxd > c->sacks[i].offset + c->sacks[i].len) { // merge
910                                 debug("Merge with end of SACK entry at %d\n", i);
911                                 c->sacks[i].len = offset + rxd - c->sacks[i].offset;
912                                 // TODO: handle potential merge with next entry
913                         }
914
915                         break;
916                 }
917         }
918
919         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
920                 debug("SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
921         }
922 }
923
924 static void handle_in_order(struct utcp_connection *c, const void *data, size_t len) {
925         // Check if we can process out-of-order data now.
926         if(c->sacks[0].len && len >= c->sacks[0].offset) { // TODO: handle overlap with second SACK
927                 debug("incoming packet len %lu connected with SACK at %u\n", (unsigned long)len, c->sacks[0].offset);
928                 buffer_put_at(&c->rcvbuf, 0, data, len); // TODO: handle return value
929                 len = max(len, c->sacks[0].offset + c->sacks[0].len);
930                 data = c->rcvbuf.data;
931         }
932
933         if(c->recv) {
934                 ssize_t rxd = c->recv(c, data, len);
935
936                 if(rxd < 0 || (size_t)rxd != len) {
937                         // TODO: handle the application not accepting all data.
938                         abort();
939                 }
940         }
941
942         if(c->rcvbuf.used) {
943                 sack_consume(c, len);
944         }
945
946         c->rcv.nxt += len;
947 }
948
949
950 static void handle_incoming_data(struct utcp_connection *c, uint32_t seq, const void *data, size_t len) {
951         if(!is_reliable(c)) {
952                 c->recv(c, data, len);
953                 c->rcv.nxt = seq + len;
954                 return;
955         }
956
957         uint32_t offset = seqdiff(seq, c->rcv.nxt);
958
959         if(offset + len > c->rcvbuf.maxsize) {
960                 abort();
961         }
962
963         if(offset) {
964                 handle_out_of_order(c, offset, data, len);
965         } else {
966                 handle_in_order(c, data, len);
967         }
968 }
969
970
971 ssize_t utcp_recv(struct utcp *utcp, const void *data, size_t len) {
972         const uint8_t *ptr = data;
973
974         if(!utcp) {
975                 errno = EFAULT;
976                 return -1;
977         }
978
979         if(!len) {
980                 return 0;
981         }
982
983         if(!data) {
984                 errno = EFAULT;
985                 return -1;
986         }
987
988         print_packet(utcp, "recv", data, len);
989
990         // Drop packets smaller than the header
991
992         struct hdr hdr;
993
994         if(len < sizeof(hdr)) {
995                 errno = EBADMSG;
996                 return -1;
997         }
998
999         // Make a copy from the potentially unaligned data to a struct hdr
1000
1001         memcpy(&hdr, ptr, sizeof(hdr));
1002         ptr += sizeof(hdr);
1003         len -= sizeof(hdr);
1004
1005         // Drop packets with an unknown CTL flag
1006
1007         if(hdr.ctl & ~(SYN | ACK | RST | FIN)) {
1008                 errno = EBADMSG;
1009                 return -1;
1010         }
1011
1012         // Check for auxiliary headers
1013
1014         const uint8_t *init = NULL;
1015
1016         uint16_t aux = hdr.aux;
1017
1018         while(aux) {
1019                 size_t auxlen = 4 * (aux >> 8) & 0xf;
1020                 uint8_t auxtype = aux & 0xff;
1021
1022                 if(len < auxlen) {
1023                         errno = EBADMSG;
1024                         return -1;
1025                 }
1026
1027                 switch(auxtype) {
1028                 case AUX_INIT:
1029                         if(!(hdr.ctl & SYN) || auxlen != 4) {
1030                                 errno = EBADMSG;
1031                                 return -1;
1032                         }
1033
1034                         init = ptr;
1035                         break;
1036
1037                 default:
1038                         errno = EBADMSG;
1039                         return -1;
1040                 }
1041
1042                 len -= auxlen;
1043                 ptr += auxlen;
1044
1045                 if(!(aux & 0x800)) {
1046                         break;
1047                 }
1048
1049                 if(len < 2) {
1050                         errno = EBADMSG;
1051                         return -1;
1052                 }
1053
1054                 memcpy(&aux, ptr, 2);
1055                 len -= 2;
1056                 ptr += 2;
1057         }
1058
1059         bool has_data = len || (hdr.ctl & (SYN | FIN));
1060
1061         // Try to match the packet to an existing connection
1062
1063         struct utcp_connection *c = find_connection(utcp, hdr.dst, hdr.src);
1064
1065         // Is it for a new connection?
1066
1067         if(!c) {
1068                 // Ignore RST packets
1069
1070                 if(hdr.ctl & RST) {
1071                         return 0;
1072                 }
1073
1074                 // Is it a SYN packet and are we LISTENing?
1075
1076                 if(hdr.ctl & SYN && !(hdr.ctl & ACK) && utcp->accept) {
1077                         // If we don't want to accept it, send a RST back
1078                         if((utcp->pre_accept && !utcp->pre_accept(utcp, hdr.dst))) {
1079                                 len = 1;
1080                                 goto reset;
1081                         }
1082
1083                         // Try to allocate memory, otherwise send a RST back
1084                         c = allocate_connection(utcp, hdr.dst, hdr.src);
1085
1086                         if(!c) {
1087                                 len = 1;
1088                                 goto reset;
1089                         }
1090
1091                         // Parse auxilliary information
1092                         if(init) {
1093                                 if(init[0] < 1) {
1094                                         len = 1;
1095                                         goto reset;
1096                                 }
1097
1098                                 c->flags = init[3] & 0x7;
1099                         } else {
1100                                 c->flags = UTCP_TCP;
1101                         }
1102
1103 synack:
1104                         // Return SYN+ACK, go to SYN_RECEIVED state
1105                         c->snd.wnd = hdr.wnd;
1106                         c->rcv.irs = hdr.seq;
1107                         c->rcv.nxt = c->rcv.irs + 1;
1108                         set_state(c, SYN_RECEIVED);
1109
1110                         struct {
1111                                 struct hdr hdr;
1112                                 uint8_t data[4];
1113                         } pkt;
1114
1115                         pkt.hdr.src = c->src;
1116                         pkt.hdr.dst = c->dst;
1117                         pkt.hdr.ack = c->rcv.irs + 1;
1118                         pkt.hdr.seq = c->snd.iss;
1119                         pkt.hdr.wnd = c->rcv.wnd;
1120                         pkt.hdr.ctl = SYN | ACK;
1121
1122                         if(init) {
1123                                 pkt.hdr.aux = 0x0101;
1124                                 pkt.data[0] = 1;
1125                                 pkt.data[1] = 0;
1126                                 pkt.data[2] = 0;
1127                                 pkt.data[3] = c->flags & 0x7;
1128                                 print_packet(c->utcp, "send", &pkt, sizeof(hdr) + 4);
1129                                 utcp->send(utcp, &pkt, sizeof(hdr) + 4);
1130                         } else {
1131                                 pkt.hdr.aux = 0;
1132                                 print_packet(c->utcp, "send", &pkt, sizeof(hdr));
1133                                 utcp->send(utcp, &pkt, sizeof(hdr));
1134                         }
1135                 } else {
1136                         // No, we don't want your packets, send a RST back
1137                         len = 1;
1138                         goto reset;
1139                 }
1140
1141                 return 0;
1142         }
1143
1144         debug("%p state %s\n", c->utcp, strstate[c->state]);
1145
1146         // In case this is for a CLOSED connection, ignore the packet.
1147         // TODO: make it so incoming packets can never match a CLOSED connection.
1148
1149         if(c->state == CLOSED) {
1150                 debug("Got packet for closed connection\n");
1151                 return 0;
1152         }
1153
1154         // It is for an existing connection.
1155
1156         // 1. Drop invalid packets.
1157
1158         // 1a. Drop packets that should not happen in our current state.
1159
1160         switch(c->state) {
1161         case SYN_SENT:
1162         case SYN_RECEIVED:
1163         case ESTABLISHED:
1164         case FIN_WAIT_1:
1165         case FIN_WAIT_2:
1166         case CLOSE_WAIT:
1167         case CLOSING:
1168         case LAST_ACK:
1169         case TIME_WAIT:
1170                 break;
1171
1172         default:
1173 #ifdef UTCP_DEBUG
1174                 abort();
1175 #endif
1176                 break;
1177         }
1178
1179         // 1b. Discard data that is not in our receive window.
1180
1181         if(is_reliable(c)) {
1182                 bool acceptable;
1183
1184                 if(c->state == SYN_SENT) {
1185                         acceptable = true;
1186                 } else if(len == 0) {
1187                         acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0;
1188                 } else {
1189                         int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);
1190
1191                         // cut already accepted front overlapping
1192                         if(rcv_offset < 0) {
1193                                 acceptable = len > (size_t) - rcv_offset;
1194
1195                                 if(acceptable) {
1196                                         ptr -= rcv_offset;
1197                                         len += rcv_offset;
1198                                         hdr.seq -= rcv_offset;
1199                                 }
1200                         } else {
1201                                 acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0 && seqdiff(hdr.seq, c->rcv.nxt) + len <= c->rcvbuf.maxsize;
1202                         }
1203                 }
1204
1205                 if(!acceptable) {
1206                         debug("Packet not acceptable, %u <= %u + %lu < %u\n", c->rcv.nxt, hdr.seq, (unsigned long)len, c->rcv.nxt + c->rcvbuf.maxsize);
1207
1208                         // Ignore unacceptable RST packets.
1209                         if(hdr.ctl & RST) {
1210                                 return 0;
1211                         }
1212
1213                         // Otherwise, continue processing.
1214                         len = 0;
1215                 }
1216         }
1217
1218         c->snd.wnd = hdr.wnd; // TODO: move below
1219
1220         // 1c. Drop packets with an invalid ACK.
1221         // ackno should not roll back, and it should also not be bigger than what we ever could have sent
1222         // (= snd.una + c->sndbuf.used).
1223
1224         if(!is_reliable(c)) {
1225                 if(hdr.ack != c->snd.last && c->state >= ESTABLISHED) {
1226                         hdr.ack = c->snd.una;
1227                 }
1228         }
1229
1230         if(hdr.ctl & ACK && (seqdiff(hdr.ack, c->snd.last) > 0 || seqdiff(hdr.ack, c->snd.una) < 0)) {
1231                 debug("Packet ack seqno out of range, %u <= %u < %u\n", c->snd.una, hdr.ack, c->snd.una + c->sndbuf.used);
1232
1233                 // Ignore unacceptable RST packets.
1234                 if(hdr.ctl & RST) {
1235                         return 0;
1236                 }
1237
1238                 goto reset;
1239         }
1240
1241         // 2. Handle RST packets
1242
1243         if(hdr.ctl & RST) {
1244                 switch(c->state) {
1245                 case SYN_SENT:
1246                         if(!(hdr.ctl & ACK)) {
1247                                 return 0;
1248                         }
1249
1250                         // The peer has refused our connection.
1251                         set_state(c, CLOSED);
1252                         errno = ECONNREFUSED;
1253
1254                         if(c->recv) {
1255                                 c->recv(c, NULL, 0);
1256                         }
1257
1258                         if(c->poll && !c->reapable) {
1259                                 c->poll(c, 0);
1260                         }
1261
1262                         return 0;
1263
1264                 case SYN_RECEIVED:
1265                         if(hdr.ctl & ACK) {
1266                                 return 0;
1267                         }
1268
1269                         // We haven't told the application about this connection yet. Silently delete.
1270                         free_connection(c);
1271                         return 0;
1272
1273                 case ESTABLISHED:
1274                 case FIN_WAIT_1:
1275                 case FIN_WAIT_2:
1276                 case CLOSE_WAIT:
1277                         if(hdr.ctl & ACK) {
1278                                 return 0;
1279                         }
1280
1281                         // The peer has aborted our connection.
1282                         set_state(c, CLOSED);
1283                         errno = ECONNRESET;
1284
1285                         if(c->recv) {
1286                                 c->recv(c, NULL, 0);
1287                         }
1288
1289                         if(c->poll && !c->reapable) {
1290                                 c->poll(c, 0);
1291                         }
1292
1293                         return 0;
1294
1295                 case CLOSING:
1296                 case LAST_ACK:
1297                 case TIME_WAIT:
1298                         if(hdr.ctl & ACK) {
1299                                 return 0;
1300                         }
1301
1302                         // As far as the application is concerned, the connection has already been closed.
1303                         // If it has called utcp_close() already, we can immediately free this connection.
1304                         if(c->reapable) {
1305                                 free_connection(c);
1306                                 return 0;
1307                         }
1308
1309                         // Otherwise, immediately move to the CLOSED state.
1310                         set_state(c, CLOSED);
1311                         return 0;
1312
1313                 default:
1314 #ifdef UTCP_DEBUG
1315                         abort();
1316 #endif
1317                         break;
1318                 }
1319         }
1320
1321         uint32_t advanced;
1322
1323         if(!(hdr.ctl & ACK)) {
1324                 advanced = 0;
1325                 goto skip_ack;
1326         }
1327
1328         // 3. Advance snd.una
1329
1330         advanced = seqdiff(hdr.ack, c->snd.una);
1331
1332         if(advanced) {
1333                 // RTT measurement
1334                 if(c->rtt_start.tv_sec) {
1335                         if(c->rtt_seq == hdr.ack) {
1336                                 struct timeval now, diff;
1337                                 gettimeofday(&now, NULL);
1338                                 timersub(&now, &c->rtt_start, &diff);
1339                                 update_rtt(c, diff.tv_sec * 1000000 + diff.tv_usec);
1340                                 c->rtt_start.tv_sec = 0;
1341                         } else if(c->rtt_seq < hdr.ack) {
1342                                 debug("Cancelling RTT measurement: %u < %u\n", c->rtt_seq, hdr.ack);
1343                                 c->rtt_start.tv_sec = 0;
1344                         }
1345                 }
1346
1347                 int32_t data_acked = advanced;
1348
1349                 switch(c->state) {
1350                 case SYN_SENT:
1351                 case SYN_RECEIVED:
1352                         data_acked--;
1353                         break;
1354
1355                 // TODO: handle FIN as well.
1356                 default:
1357                         break;
1358                 }
1359
1360                 assert(data_acked >= 0);
1361
1362 #ifndef NDEBUG
1363                 int32_t bufused = seqdiff(c->snd.last, c->snd.una);
1364                 assert(data_acked <= bufused);
1365 #endif
1366
1367                 if(data_acked) {
1368                         buffer_discard(&c->sndbuf, data_acked);
1369                 }
1370
1371                 // Also advance snd.nxt if possible
1372                 if(seqdiff(c->snd.nxt, hdr.ack) < 0) {
1373                         c->snd.nxt = hdr.ack;
1374                 }
1375
1376                 c->snd.una = hdr.ack;
1377
1378                 c->dupack = 0;
1379                 c->snd.cwnd += utcp->mtu;
1380
1381                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1382                         c->snd.cwnd = c->sndbuf.maxsize;
1383                 }
1384
1385                 // Check if we have sent a FIN that is now ACKed.
1386                 switch(c->state) {
1387                 case FIN_WAIT_1:
1388                         if(c->snd.una == c->snd.last) {
1389                                 set_state(c, FIN_WAIT_2);
1390                         }
1391
1392                         break;
1393
1394                 case CLOSING:
1395                         if(c->snd.una == c->snd.last) {
1396                                 gettimeofday(&c->conn_timeout, NULL);
1397                                 c->conn_timeout.tv_sec += utcp->timeout;
1398                                 set_state(c, TIME_WAIT);
1399                         }
1400
1401                         break;
1402
1403                 default:
1404                         break;
1405                 }
1406         } else {
1407                 if(!len && is_reliable(c)) {
1408                         c->dupack++;
1409
1410                         if(c->dupack == 3) {
1411                                 debug("Triplicate ACK\n");
1412                                 //TODO: Resend one packet and go to fast recovery mode. See RFC 6582.
1413                                 //We do a very simple variant here; reset the nxt pointer to the last acknowledged packet from the peer.
1414                                 //Reset the congestion window so we wait for ACKs.
1415                                 c->snd.nxt = c->snd.una;
1416                                 c->snd.cwnd = utcp->mtu;
1417                                 start_retransmit_timer(c);
1418                         }
1419                 }
1420         }
1421
1422         // 4. Update timers
1423
1424         if(advanced) {
1425                 if(c->snd.una == c->snd.last) {
1426                         stop_retransmit_timer(c);
1427                         timerclear(&c->conn_timeout);
1428                 } else if(is_reliable(c)) {
1429                         start_retransmit_timer(c);
1430                         gettimeofday(&c->conn_timeout, NULL);
1431                         c->conn_timeout.tv_sec += utcp->timeout;
1432                 }
1433         }
1434
1435 skip_ack:
1436         // 5. Process SYN stuff
1437
1438         if(hdr.ctl & SYN) {
1439                 switch(c->state) {
1440                 case SYN_SENT:
1441
1442                         // This is a SYNACK. It should always have ACKed the SYN.
1443                         if(!advanced) {
1444                                 goto reset;
1445                         }
1446
1447                         c->rcv.irs = hdr.seq;
1448                         c->rcv.nxt = hdr.seq;
1449
1450                         if(c->shut_wr) {
1451                                 c->snd.last++;
1452                                 set_state(c, FIN_WAIT_1);
1453                         } else {
1454                                 set_state(c, ESTABLISHED);
1455                         }
1456
1457                         // TODO: notify application of this somehow.
1458                         break;
1459
1460                 case SYN_RECEIVED:
1461                         // This is a retransmit of a SYN, send back the SYNACK.
1462                         goto synack;
1463
1464                 case ESTABLISHED:
1465                 case FIN_WAIT_1:
1466                 case FIN_WAIT_2:
1467                 case CLOSE_WAIT:
1468                 case CLOSING:
1469                 case LAST_ACK:
1470                 case TIME_WAIT:
1471                         // Ehm, no. We should never receive a second SYN.
1472                         return 0;
1473
1474                 default:
1475 #ifdef UTCP_DEBUG
1476                         abort();
1477 #endif
1478                         return 0;
1479                 }
1480
1481                 // SYN counts as one sequence number
1482                 c->rcv.nxt++;
1483         }
1484
1485         // 6. Process new data
1486
1487         if(c->state == SYN_RECEIVED) {
1488                 // This is the ACK after the SYNACK. It should always have ACKed the SYNACK.
1489                 if(!advanced) {
1490                         goto reset;
1491                 }
1492
1493                 // Are we still LISTENing?
1494                 if(utcp->accept) {
1495                         utcp->accept(c, c->src);
1496                 }
1497
1498                 if(c->state != ESTABLISHED) {
1499                         set_state(c, CLOSED);
1500                         c->reapable = true;
1501                         goto reset;
1502                 }
1503         }
1504
1505         if(len) {
1506                 switch(c->state) {
1507                 case SYN_SENT:
1508                 case SYN_RECEIVED:
1509                         // This should never happen.
1510 #ifdef UTCP_DEBUG
1511                         abort();
1512 #endif
1513                         return 0;
1514
1515                 case ESTABLISHED:
1516                 case FIN_WAIT_1:
1517                 case FIN_WAIT_2:
1518                         break;
1519
1520                 case CLOSE_WAIT:
1521                 case CLOSING:
1522                 case LAST_ACK:
1523                 case TIME_WAIT:
1524                         // Ehm no, We should never receive more data after a FIN.
1525                         goto reset;
1526
1527                 default:
1528 #ifdef UTCP_DEBUG
1529                         abort();
1530 #endif
1531                         return 0;
1532                 }
1533
1534                 handle_incoming_data(c, hdr.seq, ptr, len);
1535         }
1536
1537         // 7. Process FIN stuff
1538
1539         if((hdr.ctl & FIN) && (!is_reliable(c) || hdr.seq + len == c->rcv.nxt)) {
1540                 switch(c->state) {
1541                 case SYN_SENT:
1542                 case SYN_RECEIVED:
1543                         // This should never happen.
1544 #ifdef UTCP_DEBUG
1545                         abort();
1546 #endif
1547                         break;
1548
1549                 case ESTABLISHED:
1550                         set_state(c, CLOSE_WAIT);
1551                         break;
1552
1553                 case FIN_WAIT_1:
1554                         set_state(c, CLOSING);
1555                         break;
1556
1557                 case FIN_WAIT_2:
1558                         gettimeofday(&c->conn_timeout, NULL);
1559                         c->conn_timeout.tv_sec += utcp->timeout;
1560                         set_state(c, TIME_WAIT);
1561                         break;
1562
1563                 case CLOSE_WAIT:
1564                 case CLOSING:
1565                 case LAST_ACK:
1566                 case TIME_WAIT:
1567                         // Ehm, no. We should never receive a second FIN.
1568                         goto reset;
1569
1570                 default:
1571 #ifdef UTCP_DEBUG
1572                         abort();
1573 #endif
1574                         break;
1575                 }
1576
1577                 // FIN counts as one sequence number
1578                 c->rcv.nxt++;
1579                 len++;
1580
1581                 // Inform the application that the peer closed its end of the connection.
1582                 if(c->recv) {
1583                         errno = 0;
1584                         c->recv(c, NULL, 0);
1585                 }
1586         }
1587
1588         // Now we send something back if:
1589         // - we received data, so we have to send back an ACK
1590         //   -> sendatleastone = true
1591         // - or we got an ack, so we should maybe send a bit more data
1592         //   -> sendatleastone = false
1593
1594         if(is_reliable(c) || hdr.ctl & SYN || hdr.ctl & FIN) {
1595                 ack(c, has_data);
1596         }
1597
1598         return 0;
1599
1600 reset:
1601         swap_ports(&hdr);
1602         hdr.wnd = 0;
1603         hdr.aux = 0;
1604
1605         if(hdr.ctl & ACK) {
1606                 hdr.seq = hdr.ack;
1607                 hdr.ctl = RST;
1608         } else {
1609                 hdr.ack = hdr.seq + len;
1610                 hdr.seq = 0;
1611                 hdr.ctl = RST | ACK;
1612         }
1613
1614         print_packet(utcp, "send", &hdr, sizeof(hdr));
1615         utcp->send(utcp, &hdr, sizeof(hdr));
1616         return 0;
1617
1618 }
1619
1620 int utcp_shutdown(struct utcp_connection *c, int dir) {
1621         debug("%p shutdown %d at %u\n", c ? c->utcp : NULL, dir, c ? c->snd.last : 0);
1622
1623         if(!c) {
1624                 errno = EFAULT;
1625                 return -1;
1626         }
1627
1628         if(c->reapable) {
1629                 debug("Error: shutdown() called on closed connection %p\n", c);
1630                 errno = EBADF;
1631                 return -1;
1632         }
1633
1634         if(!(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_WR || dir == UTCP_SHUT_RDWR)) {
1635                 errno = EINVAL;
1636                 return -1;
1637         }
1638
1639         // TCP does not have a provision for stopping incoming packets.
1640         // The best we can do is to just ignore them.
1641         if(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_RDWR) {
1642                 c->recv = NULL;
1643         }
1644
1645         // The rest of the code deals with shutting down writes.
1646         if(dir == UTCP_SHUT_RD) {
1647                 return 0;
1648         }
1649
1650         // Only process shutting down writes once.
1651         if(c->shut_wr) {
1652                 return 0;
1653         }
1654
1655         c->shut_wr = true;
1656
1657         switch(c->state) {
1658         case CLOSED:
1659         case LISTEN:
1660                 errno = ENOTCONN;
1661                 return -1;
1662
1663         case SYN_SENT:
1664                 return 0;
1665
1666         case SYN_RECEIVED:
1667         case ESTABLISHED:
1668                 set_state(c, FIN_WAIT_1);
1669                 break;
1670
1671         case FIN_WAIT_1:
1672         case FIN_WAIT_2:
1673                 return 0;
1674
1675         case CLOSE_WAIT:
1676                 set_state(c, CLOSING);
1677                 break;
1678
1679         case CLOSING:
1680         case LAST_ACK:
1681         case TIME_WAIT:
1682                 return 0;
1683         }
1684
1685         c->snd.last++;
1686
1687         ack(c, false);
1688
1689         if(!timerisset(&c->rtrx_timeout)) {
1690                 start_retransmit_timer(c);
1691         }
1692
1693         return 0;
1694 }
1695
1696 static bool reset_connection(struct utcp_connection *c) {
1697         if(!c) {
1698                 errno = EFAULT;
1699                 return false;
1700         }
1701
1702         if(c->reapable) {
1703                 debug("Error: abort() called on closed connection %p\n", c);
1704                 errno = EBADF;
1705                 return false;
1706         }
1707
1708         c->recv = NULL;
1709         c->poll = NULL;
1710
1711         switch(c->state) {
1712         case CLOSED:
1713                 return true;
1714
1715         case LISTEN:
1716         case SYN_SENT:
1717         case CLOSING:
1718         case LAST_ACK:
1719         case TIME_WAIT:
1720                 set_state(c, CLOSED);
1721                 return true;
1722
1723         case SYN_RECEIVED:
1724         case ESTABLISHED:
1725         case FIN_WAIT_1:
1726         case FIN_WAIT_2:
1727         case CLOSE_WAIT:
1728                 set_state(c, CLOSED);
1729                 break;
1730         }
1731
1732         // Send RST
1733
1734         struct hdr hdr;
1735
1736         hdr.src = c->src;
1737         hdr.dst = c->dst;
1738         hdr.seq = c->snd.nxt;
1739         hdr.ack = 0;
1740         hdr.wnd = 0;
1741         hdr.ctl = RST;
1742
1743         print_packet(c->utcp, "send", &hdr, sizeof(hdr));
1744         c->utcp->send(c->utcp, &hdr, sizeof(hdr));
1745         return true;
1746 }
1747
1748 // Closes all the opened connections
1749 void utcp_abort_all_connections(struct utcp *utcp) {
1750         if(!utcp) {
1751                 errno = EINVAL;
1752                 return;
1753         }
1754
1755         for(int i = 0; i < utcp->nconnections; i++) {
1756                 struct utcp_connection *c = utcp->connections[i];
1757
1758                 if(c->reapable || c->state == CLOSED) {
1759                         continue;
1760                 }
1761
1762                 utcp_recv_t old_recv = c->recv;
1763                 utcp_poll_t old_poll = c->poll;
1764
1765                 reset_connection(c);
1766
1767                 if(old_recv) {
1768                         errno = 0;
1769                         old_recv(c, NULL, 0);
1770                 }
1771
1772                 if(old_poll && !c->reapable) {
1773                         errno = 0;
1774                         old_poll(c, 0);
1775                 }
1776         }
1777
1778         return;
1779 }
1780
1781 int utcp_close(struct utcp_connection *c) {
1782         if(utcp_shutdown(c, SHUT_RDWR) && errno != ENOTCONN) {
1783                 return -1;
1784         }
1785
1786         c->recv = NULL;
1787         c->poll = NULL;
1788         c->reapable = true;
1789         return 0;
1790 }
1791
1792 int utcp_abort(struct utcp_connection *c) {
1793         if(!reset_connection(c)) {
1794                 return -1;
1795         }
1796
1797         c->reapable = true;
1798         return 0;
1799 }
1800
1801 /* Handle timeouts.
1802  * One call to this function will loop through all connections,
1803  * checking if something needs to be resent or not.
1804  * The return value is the time to the next timeout in milliseconds,
1805  * or maybe a negative value if the timeout is infinite.
1806  */
1807 struct timeval utcp_timeout(struct utcp *utcp) {
1808         struct timeval now;
1809         gettimeofday(&now, NULL);
1810         struct timeval next = {now.tv_sec + 3600, now.tv_usec};
1811
1812         for(int i = 0; i < utcp->nconnections; i++) {
1813                 struct utcp_connection *c = utcp->connections[i];
1814
1815                 if(!c) {
1816                         continue;
1817                 }
1818
1819                 // delete connections that have been utcp_close()d.
1820                 if(c->state == CLOSED) {
1821                         if(c->reapable) {
1822                                 debug("Reaping %p\n", c);
1823                                 free_connection(c);
1824                                 i--;
1825                         }
1826
1827                         continue;
1828                 }
1829
1830                 if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &now, <)) {
1831                         errno = ETIMEDOUT;
1832                         c->state = CLOSED;
1833
1834                         if(c->recv) {
1835                                 c->recv(c, NULL, 0);
1836                         }
1837
1838                         if(c->poll && !c->reapable) {
1839                                 c->poll(c, 0);
1840                         }
1841
1842                         continue;
1843                 }
1844
1845                 if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &now, <)) {
1846                         debug("retransmit()\n");
1847                         retransmit(c);
1848                 }
1849
1850                 if(c->poll) {
1851                         if((c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
1852                                 uint32_t len =  buffer_free(&c->sndbuf);
1853
1854                                 if(len) {
1855                                         c->poll(c, len);
1856                                 }
1857                         } else if(c->state == CLOSED) {
1858                                 c->poll(c, 0);
1859                         }
1860                 }
1861
1862                 if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &next, <)) {
1863                         next = c->conn_timeout;
1864                 }
1865
1866                 if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &next, <)) {
1867                         next = c->rtrx_timeout;
1868                 }
1869         }
1870
1871         struct timeval diff;
1872
1873         timersub(&next, &now, &diff);
1874
1875         return diff;
1876 }
1877
1878 bool utcp_is_active(struct utcp *utcp) {
1879         if(!utcp) {
1880                 return false;
1881         }
1882
1883         for(int i = 0; i < utcp->nconnections; i++)
1884                 if(utcp->connections[i]->state != CLOSED && utcp->connections[i]->state != TIME_WAIT) {
1885                         return true;
1886                 }
1887
1888         return false;
1889 }
1890
1891 struct utcp *utcp_init(utcp_accept_t accept, utcp_pre_accept_t pre_accept, utcp_send_t send, void *priv) {
1892         if(!send) {
1893                 errno = EFAULT;
1894                 return NULL;
1895         }
1896
1897         struct utcp *utcp = calloc(1, sizeof(*utcp));
1898
1899         if(!utcp) {
1900                 return NULL;
1901         }
1902
1903         utcp->accept = accept;
1904         utcp->pre_accept = pre_accept;
1905         utcp->send = send;
1906         utcp->priv = priv;
1907         utcp_set_mtu(utcp, DEFAULT_MTU);
1908         utcp->timeout = DEFAULT_USER_TIMEOUT; // sec
1909         utcp->rto = START_RTO; // usec
1910
1911         return utcp;
1912 }
1913
1914 void utcp_exit(struct utcp *utcp) {
1915         if(!utcp) {
1916                 return;
1917         }
1918
1919         for(int i = 0; i < utcp->nconnections; i++) {
1920                 struct utcp_connection *c = utcp->connections[i];
1921
1922                 if(!c->reapable) {
1923                         if(c->recv) {
1924                                 c->recv(c, NULL, 0);
1925                         }
1926
1927                         if(c->poll && !c->reapable) {
1928                                 c->poll(c, 0);
1929                         }
1930                 }
1931
1932                 buffer_exit(&c->rcvbuf);
1933                 buffer_exit(&c->sndbuf);
1934                 free(c);
1935         }
1936
1937         free(utcp->connections);
1938         free(utcp);
1939 }
1940
1941 uint16_t utcp_get_mtu(struct utcp *utcp) {
1942         return utcp ? utcp->mtu : 0;
1943 }
1944
1945 void utcp_set_mtu(struct utcp *utcp, uint16_t mtu) {
1946         if(!utcp) {
1947                 return;
1948         }
1949
1950         if(mtu <= sizeof(struct hdr)) {
1951                 return;
1952         }
1953
1954         if(mtu > utcp->mtu) {
1955                 char *new = realloc(utcp->pkt, mtu + sizeof(struct hdr));
1956
1957                 if(!new) {
1958                         return;
1959                 }
1960
1961                 utcp->pkt = new;
1962         }
1963
1964         utcp->mtu = mtu;
1965 }
1966
1967 void utcp_reset_timers(struct utcp *utcp) {
1968         if(!utcp) {
1969                 return;
1970         }
1971
1972         struct timeval now, then;
1973
1974         gettimeofday(&now, NULL);
1975
1976         then = now;
1977
1978         then.tv_sec += utcp->timeout;
1979
1980         for(int i = 0; i < utcp->nconnections; i++) {
1981                 struct utcp_connection *c = utcp->connections[i];
1982
1983                 if(c->reapable) {
1984                         continue;
1985                 }
1986
1987                 if(timerisset(&c->rtrx_timeout)) {
1988                         c->rtrx_timeout = now;
1989                 }
1990
1991                 if(timerisset(&c->conn_timeout)) {
1992                         c->conn_timeout = then;
1993                 }
1994
1995                 c->rtt_start.tv_sec = 0;
1996         }
1997
1998         if(utcp->rto > START_RTO) {
1999                 utcp->rto = START_RTO;
2000         }
2001 }
2002
2003 int utcp_get_user_timeout(struct utcp *u) {
2004         return u ? u->timeout : 0;
2005 }
2006
2007 void utcp_set_user_timeout(struct utcp *u, int timeout) {
2008         if(u) {
2009                 u->timeout = timeout;
2010         }
2011 }
2012
2013 size_t utcp_get_sndbuf(struct utcp_connection *c) {
2014         return c ? c->sndbuf.maxsize : 0;
2015 }
2016
2017 size_t utcp_get_sndbuf_free(struct utcp_connection *c) {
2018         if(!c) {
2019                 return 0;
2020         }
2021
2022         switch(c->state) {
2023         case SYN_SENT:
2024         case SYN_RECEIVED:
2025         case ESTABLISHED:
2026         case CLOSE_WAIT:
2027                 return buffer_free(&c->sndbuf);
2028
2029         default:
2030                 return 0;
2031         }
2032 }
2033
2034 void utcp_set_sndbuf(struct utcp_connection *c, size_t size) {
2035         if(!c) {
2036                 return;
2037         }
2038
2039         c->sndbuf.maxsize = size;
2040
2041         if(c->sndbuf.maxsize != size) {
2042                 c->sndbuf.maxsize = -1;
2043         }
2044 }
2045
2046 size_t utcp_get_rcvbuf(struct utcp_connection *c) {
2047         return c ? c->rcvbuf.maxsize : 0;
2048 }
2049
2050 size_t utcp_get_rcvbuf_free(struct utcp_connection *c) {
2051         if(c && (c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
2052                 return buffer_free(&c->rcvbuf);
2053         } else {
2054                 return 0;
2055         }
2056 }
2057
2058 void utcp_set_rcvbuf(struct utcp_connection *c, size_t size) {
2059         if(!c) {
2060                 return;
2061         }
2062
2063         c->rcvbuf.maxsize = size;
2064
2065         if(c->rcvbuf.maxsize != size) {
2066                 c->rcvbuf.maxsize = -1;
2067         }
2068 }
2069
2070 size_t utcp_get_sendq(struct utcp_connection *c) {
2071         return c->sndbuf.used;
2072 }
2073
2074 size_t utcp_get_recvq(struct utcp_connection *c) {
2075         return c->rcvbuf.used;
2076 }
2077
2078 bool utcp_get_nodelay(struct utcp_connection *c) {
2079         return c ? c->nodelay : false;
2080 }
2081
2082 void utcp_set_nodelay(struct utcp_connection *c, bool nodelay) {
2083         if(c) {
2084                 c->nodelay = nodelay;
2085         }
2086 }
2087
2088 bool utcp_get_keepalive(struct utcp_connection *c) {
2089         return c ? c->keepalive : false;
2090 }
2091
2092 void utcp_set_keepalive(struct utcp_connection *c, bool keepalive) {
2093         if(c) {
2094                 c->keepalive = keepalive;
2095         }
2096 }
2097
2098 size_t utcp_get_outq(struct utcp_connection *c) {
2099         return c ? seqdiff(c->snd.nxt, c->snd.una) : 0;
2100 }
2101
2102 void utcp_set_recv_cb(struct utcp_connection *c, utcp_recv_t recv) {
2103         if(c) {
2104                 c->recv = recv;
2105         }
2106 }
2107
2108 void utcp_set_poll_cb(struct utcp_connection *c, utcp_poll_t poll) {
2109         if(c) {
2110                 c->poll = poll;
2111         }
2112 }
2113
2114 void utcp_set_accept_cb(struct utcp *utcp, utcp_accept_t accept, utcp_pre_accept_t pre_accept) {
2115         if(utcp) {
2116                 utcp->accept = accept;
2117                 utcp->pre_accept = pre_accept;
2118         }
2119 }
2120
2121 void utcp_expect_data(struct utcp_connection *c, bool expect) {
2122         if(!c || c->reapable) {
2123                 return;
2124         }
2125
2126         if(!(c->state == ESTABLISHED || c->state == FIN_WAIT_1 || c->state == FIN_WAIT_2)) {
2127                 return;
2128         }
2129
2130         if(expect) {
2131                 // If we expect data, start the connection timer.
2132                 if(!timerisset(&c->conn_timeout)) {
2133                         gettimeofday(&c->conn_timeout, NULL);
2134                         c->conn_timeout.tv_sec += c->utcp->timeout;
2135                 }
2136         } else {
2137                 // If we want to cancel expecting data, only clear the timer when there is no unACKed data.
2138                 if(c->snd.una == c->snd.last) {
2139                         timerclear(&c->conn_timeout);
2140                 }
2141         }
2142 }
2143
2144 void utcp_offline(struct utcp *utcp, bool offline) {
2145         struct timeval now;
2146         gettimeofday(&now, NULL);
2147
2148         for(int i = 0; i < utcp->nconnections; i++) {
2149                 struct utcp_connection *c = utcp->connections[i];
2150
2151                 if(c->reapable) {
2152                         continue;
2153                 }
2154
2155                 utcp_expect_data(c, offline);
2156
2157                 if(!offline) {
2158                         if(timerisset(&c->rtrx_timeout)) {
2159                                 c->rtrx_timeout = now;
2160                         }
2161
2162                         utcp->connections[i]->rtt_start.tv_sec = 0;
2163                 }
2164         }
2165
2166         if(!offline && utcp->rto > START_RTO) {
2167                 utcp->rto = START_RTO;
2168         }
2169 }