]> git.meshlink.io Git - meshlink/blob - src/utcp.c
Implement MESHLINK_CHANNEL_FRAMED.
[meshlink] / src / utcp.c
1 /*
2     utcp.c -- Userspace TCP
3     Copyright (C) 2014-2017 Guus Sliepen <guus@tinc-vpn.org>
4
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14
15     You should have received a copy of the GNU General Public License along
16     with this program; if not, write to the Free Software Foundation, Inc.,
17     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20 #define _GNU_SOURCE
21
22 #include <assert.h>
23 #include <errno.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <time.h>
31
32 #include "utcp_priv.h"
33
34 #ifndef EBADMSG
35 #define EBADMSG         104
36 #endif
37
38 #ifndef SHUT_RDWR
39 #define SHUT_RDWR 2
40 #endif
41
42 #ifdef poll
43 #undef poll
44 #endif
45
46 #ifndef UTCP_CLOCK
47 #if defined(CLOCK_MONOTONIC_RAW) && defined(__x86_64__)
48 #define UTCP_CLOCK CLOCK_MONOTONIC_RAW
49 #else
50 #define UTCP_CLOCK CLOCK_MONOTONIC
51 #endif
52 #endif
53
54 static void timespec_sub(const struct timespec *a, const struct timespec *b, struct timespec *r) {
55         r->tv_sec = a->tv_sec - b->tv_sec;
56         r->tv_nsec = a->tv_nsec - b->tv_nsec;
57
58         if(r->tv_nsec < 0) {
59                 r->tv_sec--, r->tv_nsec += NSEC_PER_SEC;
60         }
61 }
62
/* Return a - b expressed in microseconds (sub-microsecond part truncated). */
static int32_t timespec_diff_usec(const struct timespec *a, const struct timespec *b) {
        const time_t sec_delta = a->tv_sec - b->tv_sec;
        const long nsec_delta = a->tv_nsec - b->tv_nsec;

        return sec_delta * 1000000 + nsec_delta / 1000;
}
66
/* Return true when a is strictly earlier than b. */
static bool timespec_lt(const struct timespec *a, const struct timespec *b) {
        if(a->tv_sec != b->tv_sec) {
                return a->tv_sec < b->tv_sec;
        }

        // Same second: the nanosecond field decides.
        return a->tv_nsec < b->tv_nsec;
}
74
/* Reset both fields of *a to zero, which the rest of this file treats as "not set". */
static void timespec_clear(struct timespec *a) {
        a->tv_nsec = 0;
        a->tv_sec = 0;
}
79
/* Return true when the seconds field of *a is non-zero.
   Only tv_sec is inspected; tv_nsec is ignored. */
static bool timespec_isset(const struct timespec *a) {
        return a->tv_sec != 0;
}
83
84 static long CLOCK_GRANULARITY; // usec
85
/* Return the smaller of two sizes. */
static inline size_t min(size_t a, size_t b) {
        if(a < b) {
                return a;
        }

        return b;
}
89
/* Return the larger of two sizes. */
static inline size_t max(size_t a, size_t b) {
        if(a > b) {
                return a;
        }

        return b;
}
93
94 #ifdef UTCP_DEBUG
95 #include <stdarg.h>
96
97 #ifndef UTCP_DEBUG_DATALEN
98 #define UTCP_DEBUG_DATALEN 20
99 #endif
100
101 static void debug(struct utcp_connection *c, const char *format, ...) {
102         struct timespec tv;
103         char buf[1024];
104         int len;
105
106         clock_gettime(CLOCK_REALTIME, &tv);
107         len = snprintf(buf, sizeof(buf), "%ld.%06lu %u:%u ", (long)tv.tv_sec, tv.tv_nsec / 1000, c ? c->src : 0, c ? c->dst : 0);
108         va_list ap;
109         va_start(ap, format);
110         len += vsnprintf(buf + len, sizeof(buf) - len, format, ap);
111         va_end(ap);
112
113         if(len > 0 && (size_t)len < sizeof(buf)) {
114                 fwrite(buf, len, 1, stderr);
115         }
116 }
117
118 static void print_packet(struct utcp_connection *c, const char *dir, const void *pkt, size_t len) {
119         struct hdr hdr;
120
121         if(len < sizeof(hdr)) {
122                 debug(c, "%s: short packet (%lu bytes)\n", dir, (unsigned long)len);
123                 return;
124         }
125
126         memcpy(&hdr, pkt, sizeof(hdr));
127
128         uint32_t datalen;
129
130         if(len > sizeof(hdr)) {
131                 datalen = min(len - sizeof(hdr), UTCP_DEBUG_DATALEN);
132         } else {
133                 datalen = 0;
134         }
135
136
137         const uint8_t *data = (uint8_t *)pkt + sizeof(hdr);
138         char str[datalen * 2 + 1];
139         char *p = str;
140
141         for(uint32_t i = 0; i < datalen; i++) {
142                 *p++ = "0123456789ABCDEF"[data[i] >> 4];
143                 *p++ = "0123456789ABCDEF"[data[i] & 15];
144         }
145
146         *p = 0;
147
148         debug(c, "%s: len %lu src %u dst %u seq %u ack %u wnd %u aux %x ctl %s%s%s%s%s data %s\n",
149               dir, (unsigned long)len, hdr.src, hdr.dst, hdr.seq, hdr.ack, hdr.wnd, hdr.aux,
150               hdr.ctl & SYN ? "SYN" : "",
151               hdr.ctl & RST ? "RST" : "",
152               hdr.ctl & FIN ? "FIN" : "",
153               hdr.ctl & ACK ? "ACK" : "",
154               hdr.ctl & MF ? "MF" : "",
155               str
156              );
157 }
158
159 static void debug_cwnd(struct utcp_connection *c) {
160         debug(c, "snd.cwnd %u snd.ssthresh %u\n", c->snd.cwnd, ~c->snd.ssthresh ? c->snd.ssthresh : 0);
161 }
162 #else
163 #define debug(...) do {} while(0)
164 #define print_packet(...) do {} while(0)
165 #define debug_cwnd(...) do {} while(0)
166 #endif
167
168 static void set_state(struct utcp_connection *c, enum state state) {
169         c->state = state;
170
171         if(state == ESTABLISHED) {
172                 timespec_clear(&c->conn_timeout);
173         }
174
175         debug(c, "state %s\n", strstate[state]);
176 }
177
178 static bool fin_wanted(struct utcp_connection *c, uint32_t seq) {
179         if(seq != c->snd.last) {
180                 return false;
181         }
182
183         switch(c->state) {
184         case FIN_WAIT_1:
185         case CLOSING:
186         case LAST_ACK:
187                 return true;
188
189         default:
190                 return false;
191         }
192 }
193
194 static bool is_reliable(struct utcp_connection *c) {
195         return c->flags & UTCP_RELIABLE;
196 }
197
198 static bool is_framed(struct utcp_connection *c) {
199         return c->flags & UTCP_FRAMED;
200 }
201
/* Signed distance from sequence number b to a; the subtraction wraps
   modulo 2^32 before being reinterpreted as signed. */
static int32_t seqdiff(uint32_t a, uint32_t b) {
        const uint32_t delta = a - b;

        return delta;
}
205
206 // Buffer functions
207 static bool buffer_wraps(struct buffer *buf) {
208         return buf->size - buf->offset < buf->used;
209 }
210
211 static bool buffer_resize(struct buffer *buf, uint32_t newsize) {
212         char *newdata = realloc(buf->data, newsize);
213
214         if(!newdata) {
215                 return false;
216         }
217
218         buf->data = newdata;
219
220         if(buffer_wraps(buf)) {
221                 // Shift the right part of the buffer until it hits the end of the new buffer.
222                 // Old situation:
223                 // [345......012]
224                 // New situation:
225                 // [345.........|........012]
226                 uint32_t tailsize = buf->size - buf->offset;
227                 uint32_t newoffset = newsize - tailsize;
228                 memmove(buf->data + newoffset, buf->data + buf->offset, tailsize);
229                 buf->offset = newoffset;
230         }
231
232         buf->size = newsize;
233         return true;
234 }
235
236 // Store data into the buffer
237 static ssize_t buffer_put_at(struct buffer *buf, size_t offset, const void *data, size_t len) {
238         debug(NULL, "buffer_put_at %lu %lu %lu\n", (unsigned long)buf->used, (unsigned long)offset, (unsigned long)len);
239
240         // Ensure we don't store more than maxsize bytes in total
241         size_t required = offset + len;
242
243         if(required > buf->maxsize) {
244                 if(offset >= buf->maxsize) {
245                         return 0;
246                 }
247
248                 len = buf->maxsize - offset;
249                 required = buf->maxsize;
250         }
251
252         // Check if we need to resize the buffer
253         if(required > buf->size) {
254                 size_t newsize = buf->size;
255
256                 if(!newsize) {
257                         newsize = 4096;
258                 }
259
260                 do {
261                         newsize *= 2;
262                 } while(newsize < required);
263
264                 if(newsize > buf->maxsize) {
265                         newsize = buf->maxsize;
266                 }
267
268                 if(!buffer_resize(buf, newsize)) {
269                         return -1;
270                 }
271         }
272
273         uint32_t realoffset = buf->offset + offset;
274
275         if(buf->size - buf->offset <= offset) {
276                 // The offset wrapped
277                 realoffset -= buf->size;
278         }
279
280         if(buf->size - realoffset < len) {
281                 // The new chunk of data must be wrapped
282                 memcpy(buf->data + realoffset, data, buf->size - realoffset);
283                 memcpy(buf->data, (char *)data + buf->size - realoffset, len - (buf->size - realoffset));
284         } else {
285                 memcpy(buf->data + realoffset, data, len);
286         }
287
288         if(required > buf->used) {
289                 buf->used = required;
290         }
291
292         return len;
293 }
294
295 static ssize_t buffer_put(struct buffer *buf, const void *data, size_t len) {
296         return buffer_put_at(buf, buf->used, data, len);
297 }
298
299 // Copy data from the buffer without removing it.
300 static ssize_t buffer_copy(struct buffer *buf, void *data, size_t offset, size_t len) {
301         // Ensure we don't copy more than is actually stored in the buffer
302         if(offset >= buf->used) {
303                 return 0;
304         }
305
306         if(buf->used - offset < len) {
307                 len = buf->used - offset;
308         }
309
310         uint32_t realoffset = buf->offset + offset;
311
312         if(buf->size - buf->offset <= offset) {
313                 // The offset wrapped
314                 realoffset -= buf->size;
315         }
316
317         if(buf->size - realoffset < len) {
318                 // The data is wrapped
319                 memcpy(data, buf->data + realoffset, buf->size - realoffset);
320                 memcpy((char *)data + buf->size - realoffset, buf->data, len - (buf->size - realoffset));
321         } else {
322                 memcpy(data, buf->data + realoffset, len);
323         }
324
325         return len;
326 }
327
328 // Copy data from the buffer without removing it.
329 static ssize_t buffer_call(struct utcp_connection *c, struct buffer *buf, size_t offset, size_t len) {
330         if(!c->recv) {
331                 return len;
332         }
333
334         // Ensure we don't copy more than is actually stored in the buffer
335         if(offset >= buf->used) {
336                 return 0;
337         }
338
339         if(buf->used - offset < len) {
340                 len = buf->used - offset;
341         }
342
343         uint32_t realoffset = buf->offset + offset;
344
345         if(buf->size - buf->offset <= offset) {
346                 // The offset wrapped
347                 realoffset -= buf->size;
348         }
349
350         if(buf->size - realoffset < len) {
351                 // The data is wrapped
352                 ssize_t rx1 = c->recv(c, buf->data + realoffset, buf->size - realoffset);
353
354                 if(rx1 < buf->size - realoffset) {
355                         return rx1;
356                 }
357
358                 // The channel might have been closed by the previous callback
359                 if(!c->recv) {
360                         return len;
361                 }
362
363                 ssize_t rx2 = c->recv(c, buf->data, len - (buf->size - realoffset));
364
365                 if(rx2 < 0) {
366                         return rx2;
367                 } else {
368                         return rx1 + rx2;
369                 }
370         } else {
371                 return c->recv(c, buf->data + realoffset, len);
372         }
373 }
374
375 // Discard data from the buffer.
376 static ssize_t buffer_discard(struct buffer *buf, size_t len) {
377         if(buf->used < len) {
378                 len = buf->used;
379         }
380
381         if(buf->size - buf->offset <= len) {
382                 buf->offset -= buf->size;
383         }
384
385         if(buf->used == len) {
386                 buf->offset = 0;
387         } else {
388                 buf->offset += len;
389         }
390
391         buf->used -= len;
392
393         return len;
394 }
395
396 static void buffer_clear(struct buffer *buf) {
397         buf->used = 0;
398         buf->offset = 0;
399 }
400
401 static bool buffer_set_size(struct buffer *buf, uint32_t minsize, uint32_t maxsize) {
402         if(maxsize < minsize) {
403                 maxsize = minsize;
404         }
405
406         buf->maxsize = maxsize;
407
408         return buf->size >= minsize || buffer_resize(buf, minsize);
409 }
410
411 static void buffer_exit(struct buffer *buf) {
412         free(buf->data);
413         memset(buf, 0, sizeof(*buf));
414 }
415
416 static uint32_t buffer_free(const struct buffer *buf) {
417         return buf->maxsize > buf->used ? buf->maxsize - buf->used : 0;
418 }
419
420 // Connections are stored in a sorted list.
421 // This gives O(log(N)) lookup time, O(N log(N)) insertion time and O(N) deletion time.
422
423 static int compare(const void *va, const void *vb) {
424         assert(va && vb);
425
426         const struct utcp_connection *a = *(struct utcp_connection **)va;
427         const struct utcp_connection *b = *(struct utcp_connection **)vb;
428
429         assert(a && b);
430         assert(a->src && b->src);
431
432         int c = (int)a->src - (int)b->src;
433
434         if(c) {
435                 return c;
436         }
437
438         c = (int)a->dst - (int)b->dst;
439         return c;
440 }
441
442 static struct utcp_connection *find_connection(const struct utcp *utcp, uint16_t src, uint16_t dst) {
443         if(!utcp->nconnections) {
444                 return NULL;
445         }
446
447         struct utcp_connection key = {
448                 .src = src,
449                 .dst = dst,
450         }, *keyp = &key;
451         struct utcp_connection **match = bsearch(&keyp, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
452         return match ? *match : NULL;
453 }
454
455 static void free_connection(struct utcp_connection *c) {
456         struct utcp *utcp = c->utcp;
457         struct utcp_connection **cp = bsearch(&c, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
458
459         assert(cp);
460
461         int i = cp - utcp->connections;
462         memmove(cp, cp + 1, (utcp->nconnections - i - 1) * sizeof(*cp));
463         utcp->nconnections--;
464
465         buffer_exit(&c->rcvbuf);
466         buffer_exit(&c->sndbuf);
467         free(c);
468 }
469
470 static struct utcp_connection *allocate_connection(struct utcp *utcp, uint16_t src, uint16_t dst) {
471         // Check whether this combination of src and dst is free
472
473         if(src) {
474                 if(find_connection(utcp, src, dst)) {
475                         errno = EADDRINUSE;
476                         return NULL;
477                 }
478         } else { // If src == 0, generate a random port number with the high bit set
479                 if(utcp->nconnections >= 32767) {
480                         errno = ENOMEM;
481                         return NULL;
482                 }
483
484                 src = rand() | 0x8000;
485
486                 while(find_connection(utcp, src, dst)) {
487                         src++;
488                 }
489         }
490
491         // Allocate memory for the new connection
492
493         if(utcp->nconnections >= utcp->nallocated) {
494                 if(!utcp->nallocated) {
495                         utcp->nallocated = 4;
496                 } else {
497                         utcp->nallocated *= 2;
498                 }
499
500                 struct utcp_connection **new_array = realloc(utcp->connections, utcp->nallocated * sizeof(*utcp->connections));
501
502                 if(!new_array) {
503                         return NULL;
504                 }
505
506                 utcp->connections = new_array;
507         }
508
509         struct utcp_connection *c = calloc(1, sizeof(*c));
510
511         if(!c) {
512                 return NULL;
513         }
514
515         if(!buffer_set_size(&c->sndbuf, DEFAULT_SNDBUFSIZE, DEFAULT_MAXSNDBUFSIZE)) {
516                 free(c);
517                 return NULL;
518         }
519
520         if(!buffer_set_size(&c->rcvbuf, DEFAULT_RCVBUFSIZE, DEFAULT_MAXRCVBUFSIZE)) {
521                 buffer_exit(&c->sndbuf);
522                 free(c);
523                 return NULL;
524         }
525
526         // Fill in the details
527
528         c->src = src;
529         c->dst = dst;
530 #ifdef UTCP_DEBUG
531         c->snd.iss = 0;
532 #else
533         c->snd.iss = rand();
534 #endif
535         c->snd.una = c->snd.iss;
536         c->snd.nxt = c->snd.iss + 1;
537         c->snd.last = c->snd.nxt;
538         c->snd.cwnd = (utcp->mss > 2190 ? 2 : utcp->mss > 1095 ? 3 : 4) * utcp->mss;
539         c->snd.ssthresh = ~0;
540         debug_cwnd(c);
541         c->srtt = 0;
542         c->rttvar = 0;
543         c->rto = START_RTO;
544         c->utcp = utcp;
545
546         // Add it to the sorted list of connections
547
548         utcp->connections[utcp->nconnections++] = c;
549         qsort(utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
550
551         return c;
552 }
553
/* Return the absolute difference |a - b| of two unsigned values. */
static inline uint32_t absdiff(uint32_t a, uint32_t b) {
        return a > b ? a - b : b - a;
}
561
562 // Update RTT variables. See RFC 6298.
563 static void update_rtt(struct utcp_connection *c, uint32_t rtt) {
564         if(!rtt) {
565                 debug(c, "invalid rtt\n");
566                 return;
567         }
568
569         if(!c->srtt) {
570                 c->srtt = rtt;
571                 c->rttvar = rtt / 2;
572         } else {
573                 c->rttvar = (c->rttvar * 3 + absdiff(c->srtt, rtt)) / 4;
574                 c->srtt = (c->srtt * 7 + rtt) / 8;
575         }
576
577         c->rto = c->srtt + max(4 * c->rttvar, CLOCK_GRANULARITY);
578
579         if(c->rto > MAX_RTO) {
580                 c->rto = MAX_RTO;
581         }
582
583         debug(c, "rtt %u srtt %u rttvar %u rto %u\n", rtt, c->srtt, c->rttvar, c->rto);
584 }
585
586 static void start_retransmit_timer(struct utcp_connection *c) {
587         clock_gettime(UTCP_CLOCK, &c->rtrx_timeout);
588
589         uint32_t rto = c->rto;
590
591         while(rto > USEC_PER_SEC) {
592                 c->rtrx_timeout.tv_sec++;
593                 rto -= USEC_PER_SEC;
594         }
595
596         c->rtrx_timeout.tv_nsec += rto * 1000;
597
598         if(c->rtrx_timeout.tv_nsec >= NSEC_PER_SEC) {
599                 c->rtrx_timeout.tv_nsec -= NSEC_PER_SEC;
600                 c->rtrx_timeout.tv_sec++;
601         }
602
603         debug(c, "rtrx_timeout %ld.%06lu\n", c->rtrx_timeout.tv_sec, c->rtrx_timeout.tv_nsec);
604 }
605
606 static void start_flush_timer(struct utcp_connection *c) {
607         clock_gettime(UTCP_CLOCK, &c->rtrx_timeout);
608
609         c->rtrx_timeout.tv_nsec += c->utcp->flush_timeout * 1000000;
610
611         if(c->rtrx_timeout.tv_nsec >= NSEC_PER_SEC) {
612                 c->rtrx_timeout.tv_nsec -= NSEC_PER_SEC;
613                 c->rtrx_timeout.tv_sec++;
614         }
615
616         debug(c, "rtrx_timeout %ld.%06lu (flush)\n", c->rtrx_timeout.tv_sec, c->rtrx_timeout.tv_nsec);
617 }
618
619 static void stop_retransmit_timer(struct utcp_connection *c) {
620         timespec_clear(&c->rtrx_timeout);
621         debug(c, "rtrx_timeout cleared\n");
622 }
623
624 struct utcp_connection *utcp_connect_ex(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv, uint32_t flags) {
625         struct utcp_connection *c = allocate_connection(utcp, 0, dst);
626
627         if(!c) {
628                 return NULL;
629         }
630
631         assert((flags & ~0x1f) == 0);
632
633         c->flags = flags;
634         c->recv = recv;
635         c->priv = priv;
636
637         struct {
638                 struct hdr hdr;
639                 uint8_t init[4];
640         } pkt;
641
642         pkt.hdr.src = c->src;
643         pkt.hdr.dst = c->dst;
644         pkt.hdr.seq = c->snd.iss;
645         pkt.hdr.ack = 0;
646         pkt.hdr.wnd = c->rcvbuf.maxsize;
647         pkt.hdr.ctl = SYN;
648         pkt.hdr.aux = 0x0101;
649         pkt.init[0] = 1;
650         pkt.init[1] = 0;
651         pkt.init[2] = 0;
652         pkt.init[3] = flags & 0x7;
653
654         set_state(c, SYN_SENT);
655
656         print_packet(c, "send", &pkt, sizeof(pkt));
657         utcp->send(utcp, &pkt, sizeof(pkt));
658
659         clock_gettime(UTCP_CLOCK, &c->conn_timeout);
660         c->conn_timeout.tv_sec += utcp->timeout;
661
662         start_retransmit_timer(c);
663
664         return c;
665 }
666
667 struct utcp_connection *utcp_connect(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv) {
668         return utcp_connect_ex(utcp, dst, recv, priv, UTCP_TCP);
669 }
670
671 void utcp_accept(struct utcp_connection *c, utcp_recv_t recv, void *priv) {
672         if(c->reapable || c->state != SYN_RECEIVED) {
673                 debug(c, "accept() called on invalid connection in state %s\n", c, strstate[c->state]);
674                 return;
675         }
676
677         debug(c, "accepted %p %p\n", c, recv, priv);
678         c->recv = recv;
679         c->priv = priv;
680         set_state(c, ESTABLISHED);
681 }
682
683 static void ack(struct utcp_connection *c, bool sendatleastone) {
684         int32_t left = seqdiff(c->snd.last, c->snd.nxt);
685         int32_t cwndleft = is_reliable(c) ? min(c->snd.cwnd, c->snd.wnd) - seqdiff(c->snd.nxt, c->snd.una) : MAX_UNRELIABLE_SIZE;
686
687         assert(left >= 0);
688
689         if(cwndleft <= 0) {
690                 left = 0;
691         } else if(cwndleft < left) {
692                 left = cwndleft;
693
694                 if(!sendatleastone || cwndleft > c->utcp->mss) {
695                         left -= left % c->utcp->mss;
696                 }
697         }
698
699         debug(c, "cwndleft %d left %d\n", cwndleft, left);
700
701         if(!left && !sendatleastone) {
702                 return;
703         }
704
705         struct {
706                 struct hdr hdr;
707                 uint8_t data[];
708         } *pkt = c->utcp->pkt;
709
710         pkt->hdr.src = c->src;
711         pkt->hdr.dst = c->dst;
712         pkt->hdr.ack = c->rcv.nxt;
713         pkt->hdr.wnd = is_reliable(c) ? c->rcvbuf.maxsize : 0;
714         pkt->hdr.ctl = ACK;
715         pkt->hdr.aux = 0;
716
717         do {
718                 uint32_t seglen = left > c->utcp->mss ? c->utcp->mss : left;
719                 pkt->hdr.seq = c->snd.nxt;
720
721                 buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);
722
723                 c->snd.nxt += seglen;
724                 left -= seglen;
725
726                 if(!is_reliable(c)) {
727                         if(left) {
728                                 pkt->hdr.ctl |= MF;
729                         } else {
730                                 pkt->hdr.ctl &= ~MF;
731                         }
732                 }
733
734                 if(seglen && fin_wanted(c, c->snd.nxt)) {
735                         seglen--;
736                         pkt->hdr.ctl |= FIN;
737                 }
738
739                 if(!c->rtt_start.tv_sec && is_reliable(c)) {
740                         // Start RTT measurement
741                         clock_gettime(UTCP_CLOCK, &c->rtt_start);
742                         c->rtt_seq = pkt->hdr.seq + seglen;
743                         debug(c, "starting RTT measurement, expecting ack %u\n", c->rtt_seq);
744                 }
745
746                 print_packet(c, "send", pkt, sizeof(pkt->hdr) + seglen);
747                 c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
748
749                 if(left && !is_reliable(c)) {
750                         pkt->hdr.wnd += seglen;
751                 }
752         } while(left);
753 }
754
755 static ssize_t utcp_send_reliable(struct utcp_connection *c, const void *data, size_t len) {
756         size_t rlen = len + (is_framed(c) ? 2 : 0);
757
758         if(!rlen) {
759                 return 0;
760         }
761
762         // Check if we need to be able to buffer all data
763
764         if(c->flags & (UTCP_NO_PARTIAL | UTCP_FRAMED)) {
765                 if(rlen > c->sndbuf.maxsize) {
766                         errno = EMSGSIZE;
767                         return -1;
768                 }
769
770                 if((c->flags & UTCP_FRAMED) && len > MAX_UNRELIABLE_SIZE) {
771                         errno = EMSGSIZE;
772                         return -1;
773                 }
774
775                 if(rlen > buffer_free(&c->sndbuf)) {
776                         errno = EWOULDBLOCK;
777                         return 0;
778                 }
779         }
780
781         // Add data to the send buffer.
782
783         if(is_framed(c)) {
784                 uint16_t len16 = len;
785                 buffer_put(&c->sndbuf, &len16, sizeof(len16));
786                 assert(buffer_put(&c->sndbuf, data, len) == (ssize_t)len);
787         } else {
788                 len = buffer_put(&c->sndbuf, data, len);
789
790                 if(len <= 0) {
791                         errno = EWOULDBLOCK;
792                         return 0;
793                 }
794         }
795
796         c->snd.last += rlen;
797
798         // Don't send anything yet if the connection has not fully established yet
799
800         if(c->state == SYN_SENT || c->state == SYN_RECEIVED) {
801                 return len;
802         }
803
804         ack(c, false);
805
806         if(!timespec_isset(&c->rtrx_timeout)) {
807                 start_retransmit_timer(c);
808         }
809
810         if(!timespec_isset(&c->conn_timeout)) {
811                 clock_gettime(UTCP_CLOCK, &c->conn_timeout);
812                 c->conn_timeout.tv_sec += c->utcp->timeout;
813         }
814
815         return len;
816 }
817
818
819 /* In the send buffer we can have multiple frames, each prefixed with their
820    length. However, the first frame might already have been partially sent. The
821    variable c->frame_offset tracks how much of a partial frame is left at the
822    start. If it is 0, it means there is no partial frame, and the first two
823    bytes in the send buffer are the length of the first frame.
824
825    After sending an MSS sized packet, we need to calculate the new frame_offset
826    value, since it is likely that the next packet will also have a partial frame
827    at the start. We do this by scanning the previously sent packet for frame
828    headers, to find out how many bytes of the last frame are left to send.
829 */
830 static void ack_unreliable_framed(struct utcp_connection *c) {
831         int32_t left = seqdiff(c->snd.last, c->snd.nxt);
832         assert(left > 0);
833
834         struct {
835                 struct hdr hdr;
836                 uint8_t data[];
837         } *pkt = c->utcp->pkt;
838
839         pkt->hdr.src = c->src;
840         pkt->hdr.dst = c->dst;
841         pkt->hdr.ack = c->rcv.nxt;
842         pkt->hdr.ctl = ACK | MF;
843         pkt->hdr.aux = 0;
844
845         bool sent_packet = false;
846
847         while(left >= c->utcp->mss) {
848                 pkt->hdr.wnd = c->frame_offset;
849                 uint32_t seglen = c->utcp->mss;
850
851                 pkt->hdr.seq = c->snd.nxt;
852
853                 buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);
854
855                 c->snd.nxt += seglen;
856                 c->snd.una = c->snd.nxt;
857                 left -= seglen;
858
859                 print_packet(c, "send", pkt, sizeof(pkt->hdr) + seglen);
860                 c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
861                 sent_packet = true;
862
863                 // Calculate the new frame offset
864                 while(c->frame_offset < seglen) {
865                         uint16_t framelen;
866                         buffer_copy(&c->sndbuf, &framelen, c->frame_offset, sizeof(framelen));
867                         c->frame_offset += framelen + 2;
868                 }
869
870                 buffer_discard(&c->sndbuf, seglen);
871                 c->frame_offset -= seglen;
872         };
873
874         if(sent_packet) {
875                 if(left) {
876                         // We sent one packet but we have partial data left, (re)start the flush timer
877                         start_flush_timer(c);
878                 } else {
879                         // There is no partial data in the send buffer, so stop the flush timer
880                         stop_retransmit_timer(c);
881                 }
882         }
883 }
884
885 static void flush_unreliable_framed(struct utcp_connection *c) {
886         int32_t left = seqdiff(c->snd.last, c->snd.nxt);
887
888         /* If the MSS dropped since last time ack_unreliable_frame() was called,
889           we might now have more than one segment worth of data left.
890         */
891         if(left > c->utcp->mss) {
892                 ack_unreliable_framed(c);
893                 left = seqdiff(c->snd.last, c->snd.nxt);
894                 assert(left <= c->utcp->mss);
895         }
896
897         if(left) {
898                 struct {
899                         struct hdr hdr;
900                         uint8_t data[];
901                 } *pkt = c->utcp->pkt;
902
903                 pkt->hdr.src = c->src;
904                 pkt->hdr.dst = c->dst;
905                 pkt->hdr.seq = c->snd.nxt;
906                 pkt->hdr.ack = c->rcv.nxt;
907                 pkt->hdr.wnd = c->frame_offset;
908                 pkt->hdr.ctl = ACK | MF;
909                 pkt->hdr.aux = 0;
910
911                 uint32_t seglen = left;
912
913                 buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);
914                 buffer_discard(&c->sndbuf, seglen);
915
916                 c->snd.nxt += seglen;
917                 c->snd.una = c->snd.nxt;
918
919                 print_packet(c, "send", pkt, sizeof(pkt->hdr) + seglen);
920                 c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
921         }
922
923         c->frame_offset = 0;
924         stop_retransmit_timer(c);
925 }
926
927
928 static ssize_t utcp_send_unreliable(struct utcp_connection *c, const void *data, size_t len) {
929         if(len > MAX_UNRELIABLE_SIZE) {
930                 errno = EMSGSIZE;
931                 return -1;
932         }
933
934         size_t rlen = len + (is_framed(c) ? 2 : 0);
935
936         if(rlen > buffer_free(&c->sndbuf)) {
937                 if(rlen > c->sndbuf.maxsize) {
938                         errno = EMSGSIZE;
939                         return -1;
940                 } else {
941                         errno = EWOULDBLOCK;
942                         return 0;
943                 }
944         }
945
946         // Don't send anything yet if the connection has not fully established yet
947
948         if(c->state == SYN_SENT || c->state == SYN_RECEIVED) {
949                 return len;
950         }
951
952         if(is_framed(c)) {
953                 uint16_t framelen = len;
954                 buffer_put(&c->sndbuf, &framelen, sizeof(framelen));
955         }
956
957         buffer_put(&c->sndbuf, data, len);
958
959         c->snd.last += rlen;
960
961         if(is_framed(c)) {
962                 ack_unreliable_framed(c);
963         } else {
964                 ack(c, false);
965                 c->snd.una = c->snd.nxt = c->snd.last;
966                 buffer_discard(&c->sndbuf, c->sndbuf.used);
967         }
968
969         return len;
970 }
971
972 ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
973         if(c->reapable) {
974                 debug(c, "send() called on closed connection\n");
975                 errno = EBADF;
976                 return -1;
977         }
978
979         switch(c->state) {
980         case CLOSED:
981         case LISTEN:
982                 debug(c, "send() called on unconnected connection\n");
983                 errno = ENOTCONN;
984                 return -1;
985
986         case SYN_SENT:
987         case SYN_RECEIVED:
988         case ESTABLISHED:
989         case CLOSE_WAIT:
990                 break;
991
992         case FIN_WAIT_1:
993         case FIN_WAIT_2:
994         case CLOSING:
995         case LAST_ACK:
996         case TIME_WAIT:
997                 debug(c, "send() called on closed connection\n");
998                 errno = EPIPE;
999                 return -1;
1000         }
1001
1002         if(!data && len) {
1003                 errno = EFAULT;
1004                 return -1;
1005         }
1006
1007         if(is_reliable(c)) {
1008                 return utcp_send_reliable(c, data, len);
1009         } else {
1010                 return utcp_send_unreliable(c, data, len);
1011         }
1012 }
1013
1014 static void swap_ports(struct hdr *hdr) {
1015         uint16_t tmp = hdr->src;
1016         hdr->src = hdr->dst;
1017         hdr->dst = tmp;
1018 }
1019
1020 static void fast_retransmit(struct utcp_connection *c) {
1021         if(c->state == CLOSED || c->snd.last == c->snd.una) {
1022                 debug(c, "fast_retransmit() called but nothing to retransmit!\n");
1023                 return;
1024         }
1025
1026         struct utcp *utcp = c->utcp;
1027
1028         struct {
1029                 struct hdr hdr;
1030                 uint8_t data[];
1031         } *pkt = c->utcp->pkt;
1032
1033         pkt->hdr.src = c->src;
1034         pkt->hdr.dst = c->dst;
1035         pkt->hdr.wnd = c->rcvbuf.maxsize;
1036         pkt->hdr.aux = 0;
1037
1038         switch(c->state) {
1039         case ESTABLISHED:
1040         case FIN_WAIT_1:
1041         case CLOSE_WAIT:
1042         case CLOSING:
1043         case LAST_ACK:
1044                 // Send unacked data again.
1045                 pkt->hdr.seq = c->snd.una;
1046                 pkt->hdr.ack = c->rcv.nxt;
1047                 pkt->hdr.ctl = ACK;
1048                 uint32_t len = min(seqdiff(c->snd.last, c->snd.una), utcp->mss);
1049
1050                 if(fin_wanted(c, c->snd.una + len)) {
1051                         len--;
1052                         pkt->hdr.ctl |= FIN;
1053                 }
1054
1055                 buffer_copy(&c->sndbuf, pkt->data, 0, len);
1056                 print_packet(c, "rtrx", pkt, sizeof(pkt->hdr) + len);
1057                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
1058                 break;
1059
1060         default:
1061                 break;
1062         }
1063 }
1064
1065 static void retransmit(struct utcp_connection *c) {
1066         if(c->state == CLOSED || c->snd.last == c->snd.una) {
1067                 debug(c, "retransmit() called but nothing to retransmit!\n");
1068                 stop_retransmit_timer(c);
1069                 return;
1070         }
1071
1072         struct utcp *utcp = c->utcp;
1073
1074         if(utcp->retransmit && is_reliable(c)) {
1075                 utcp->retransmit(c);
1076         }
1077
1078         struct {
1079                 struct hdr hdr;
1080                 uint8_t data[];
1081         } *pkt = c->utcp->pkt;
1082
1083         pkt->hdr.src = c->src;
1084         pkt->hdr.dst = c->dst;
1085         pkt->hdr.wnd = c->rcvbuf.maxsize;
1086         pkt->hdr.aux = 0;
1087
1088         switch(c->state) {
1089         case SYN_SENT:
1090                 // Send our SYN again
1091                 pkt->hdr.seq = c->snd.iss;
1092                 pkt->hdr.ack = 0;
1093                 pkt->hdr.ctl = SYN;
1094                 pkt->hdr.aux = 0x0101;
1095                 pkt->data[0] = 1;
1096                 pkt->data[1] = 0;
1097                 pkt->data[2] = 0;
1098                 pkt->data[3] = c->flags & 0x7;
1099                 print_packet(c, "rtrx", pkt, sizeof(pkt->hdr) + 4);
1100                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + 4);
1101                 break;
1102
1103         case SYN_RECEIVED:
1104                 // Send SYNACK again
1105                 pkt->hdr.seq = c->snd.nxt;
1106                 pkt->hdr.ack = c->rcv.nxt;
1107                 pkt->hdr.ctl = SYN | ACK;
1108                 print_packet(c, "rtrx", pkt, sizeof(pkt->hdr));
1109                 utcp->send(utcp, pkt, sizeof(pkt->hdr));
1110                 break;
1111
1112         case ESTABLISHED:
1113         case FIN_WAIT_1:
1114         case CLOSE_WAIT:
1115         case CLOSING:
1116         case LAST_ACK:
1117                 if(!is_reliable(c) && is_framed(c) && c->sndbuf.used) {
1118                         flush_unreliable_framed(c);
1119                         return;
1120                 }
1121
1122                 // Send unacked data again.
1123                 pkt->hdr.seq = c->snd.una;
1124                 pkt->hdr.ack = c->rcv.nxt;
1125                 pkt->hdr.ctl = ACK;
1126                 uint32_t len = min(seqdiff(c->snd.last, c->snd.una), utcp->mss);
1127
1128                 if(fin_wanted(c, c->snd.una + len)) {
1129                         len--;
1130                         pkt->hdr.ctl |= FIN;
1131                 }
1132
1133                 // RFC 5681 slow start after timeout
1134                 uint32_t flightsize = seqdiff(c->snd.nxt, c->snd.una);
1135                 c->snd.ssthresh = max(flightsize / 2, utcp->mss * 2); // eq. 4
1136                 c->snd.cwnd = utcp->mss;
1137                 debug_cwnd(c);
1138
1139                 buffer_copy(&c->sndbuf, pkt->data, 0, len);
1140                 print_packet(c, "rtrx", pkt, sizeof(pkt->hdr) + len);
1141                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
1142
1143                 c->snd.nxt = c->snd.una + len;
1144                 break;
1145
1146         case CLOSED:
1147         case LISTEN:
1148         case TIME_WAIT:
1149         case FIN_WAIT_2:
1150                 // We shouldn't need to retransmit anything in this state.
1151 #ifdef UTCP_DEBUG
1152                 abort();
1153 #endif
1154                 stop_retransmit_timer(c);
1155                 goto cleanup;
1156         }
1157
1158         start_retransmit_timer(c);
1159         c->rto *= 2;
1160
1161         if(c->rto > MAX_RTO) {
1162                 c->rto = MAX_RTO;
1163         }
1164
1165         c->rtt_start.tv_sec = 0; // invalidate RTT timer
1166         c->dupack = 0; // cancel any ongoing fast recovery
1167
1168 cleanup:
1169         return;
1170 }
1171
1172 /* Update receive buffer and SACK entries after consuming data.
1173  *
1174  * Situation:
1175  *
1176  * |.....0000..1111111111.....22222......3333|
1177  * |---------------^
1178  *
1179  * 0..3 represent the SACK entries. The ^ indicates up to which point we want
1180  * to remove data from the receive buffer. The idea is to substract "len"
1181  * from the offset of all the SACK entries, and then remove/cut down entries
1182  * that are shifted to before the start of the receive buffer.
1183  *
1184  * There are three cases:
1185  * - the SACK entry is after ^, in that case just change the offset.
1186  * - the SACK entry starts before and ends after ^, so we have to
1187  *   change both its offset and size.
1188  * - the SACK entry is completely before ^, in that case delete it.
1189  */
1190 static void sack_consume(struct utcp_connection *c, size_t len) {
1191         debug(c, "sack_consume %lu\n", (unsigned long)len);
1192
1193         if(len > c->rcvbuf.used) {
1194                 debug(c, "all SACK entries consumed\n");
1195                 c->sacks[0].len = 0;
1196                 return;
1197         }
1198
1199         buffer_discard(&c->rcvbuf, len);
1200
1201         for(int i = 0; i < NSACKS && c->sacks[i].len;) {
1202                 if(len < c->sacks[i].offset) {
1203                         c->sacks[i].offset -= len;
1204                         i++;
1205                 } else if(len < c->sacks[i].offset + c->sacks[i].len) {
1206                         c->sacks[i].len -= len - c->sacks[i].offset;
1207                         c->sacks[i].offset = 0;
1208                         i++;
1209                 } else {
1210                         if(i < NSACKS - 1) {
1211                                 memmove(&c->sacks[i], &c->sacks[i + 1], (NSACKS - 1 - i) * sizeof(c->sacks)[i]);
1212                                 c->sacks[NSACKS - 1].len = 0;
1213                         } else {
1214                                 c->sacks[i].len = 0;
1215                                 break;
1216                         }
1217                 }
1218         }
1219
1220         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
1221                 debug(c, "SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
1222         }
1223 }
1224
1225 static void handle_out_of_order(struct utcp_connection *c, uint32_t offset, const void *data, size_t len) {
1226         debug(c, "out of order packet, offset %u\n", offset);
1227         // Packet loss or reordering occured. Store the data in the buffer.
1228         ssize_t rxd = buffer_put_at(&c->rcvbuf, offset, data, len);
1229
1230         if(rxd <= 0) {
1231                 debug(c, "packet outside receive buffer, dropping\n");
1232                 return;
1233         }
1234
1235         if((size_t)rxd < len) {
1236                 debug(c, "packet partially outside receive buffer\n");
1237                 len = rxd;
1238         }
1239
1240         // Make note of where we put it.
1241         for(int i = 0; i < NSACKS; i++) {
1242                 if(!c->sacks[i].len) { // nothing to merge, add new entry
1243                         debug(c, "new SACK entry %d\n", i);
1244                         c->sacks[i].offset = offset;
1245                         c->sacks[i].len = rxd;
1246                         break;
1247                 } else if(offset < c->sacks[i].offset) {
1248                         if(offset + rxd < c->sacks[i].offset) { // insert before
1249                                 if(!c->sacks[NSACKS - 1].len) { // only if room left
1250                                         debug(c, "insert SACK entry at %d\n", i);
1251                                         memmove(&c->sacks[i + 1], &c->sacks[i], (NSACKS - i - 1) * sizeof(c->sacks)[i]);
1252                                         c->sacks[i].offset = offset;
1253                                         c->sacks[i].len = rxd;
1254                                 } else {
1255                                         debug(c, "SACK entries full, dropping packet\n");
1256                                 }
1257
1258                                 break;
1259                         } else { // merge
1260                                 debug(c, "merge with start of SACK entry at %d\n", i);
1261                                 c->sacks[i].offset = offset;
1262                                 break;
1263                         }
1264                 } else if(offset <= c->sacks[i].offset + c->sacks[i].len) {
1265                         if(offset + rxd > c->sacks[i].offset + c->sacks[i].len) { // merge
1266                                 debug(c, "merge with end of SACK entry at %d\n", i);
1267                                 c->sacks[i].len = offset + rxd - c->sacks[i].offset;
1268                                 // TODO: handle potential merge with next entry
1269                         }
1270
1271                         break;
1272                 }
1273         }
1274
1275         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
1276                 debug(c, "SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
1277         }
1278 }
1279
1280 static void handle_out_of_order_framed(struct utcp_connection *c, uint32_t offset, const void *data, size_t len) {
1281         uint32_t in_order_offset = (c->sacks[0].len && c->sacks[0].offset == 0) ? c->sacks[0].len : 0;
1282
1283         // Put the data into the receive buffer
1284         handle_out_of_order(c, offset + in_order_offset, data, len);
1285 }
1286
1287 static void handle_in_order(struct utcp_connection *c, const void *data, size_t len) {
1288         if(c->recv) {
1289                 ssize_t rxd = c->recv(c, data, len);
1290
1291                 if(rxd != (ssize_t)len) {
1292                         // TODO: handle the application not accepting all data.
1293                         abort();
1294                 }
1295         }
1296
1297         // Check if we can process out-of-order data now.
1298         if(c->sacks[0].len && len >= c->sacks[0].offset) {
1299                 debug(c, "incoming packet len %lu connected with SACK at %u\n", (unsigned long)len, c->sacks[0].offset);
1300
1301                 if(len < c->sacks[0].offset + c->sacks[0].len) {
1302                         size_t offset = len;
1303                         len = c->sacks[0].offset + c->sacks[0].len;
1304                         size_t remainder = len - offset;
1305
1306                         ssize_t rxd = buffer_call(c, &c->rcvbuf, offset, remainder);
1307
1308                         if(rxd != (ssize_t)remainder) {
1309                                 // TODO: handle the application not accepting all data.
1310                                 abort();
1311                         }
1312                 }
1313         }
1314
1315         if(c->rcvbuf.used) {
1316                 sack_consume(c, len);
1317         }
1318
1319         c->rcv.nxt += len;
1320 }
1321
1322 static void handle_in_order_framed(struct utcp_connection *c, const void *data, size_t len) {
1323         // Treat it as out of order, since it is unlikely the start of this packet contains the start of a frame.
1324         uint32_t in_order_offset = (c->sacks[0].len && c->sacks[0].offset == 0) ? c->sacks[0].len : 0;
1325         handle_out_of_order(c, in_order_offset, data, len);
1326
1327         // While we have full frames at the start, give them to the application
1328         while(c->sacks[0].len >= 2 && c->sacks[0].offset == 0) {
1329                 uint16_t framelen;
1330                 buffer_copy(&c->rcvbuf, &framelen, 0, sizeof(&framelen));
1331
1332                 if(framelen > c->sacks[0].len - 2) {
1333                         break;
1334                 }
1335
1336                 if(c->recv) {
1337                         ssize_t rxd;
1338                         uint32_t realoffset = c->rcvbuf.offset + 2;
1339
1340                         if(c->rcvbuf.size - c->rcvbuf.offset <= 2) {
1341                                 realoffset -= c->rcvbuf.size;
1342                         }
1343
1344                         if(realoffset > c->rcvbuf.size - framelen) {
1345                                 // The buffer wraps, we need to copy
1346                                 uint8_t buf[framelen];
1347                                 buffer_copy(&c->rcvbuf, buf, 2, framelen);
1348                                 rxd = c->recv(c, buf, framelen);
1349                         } else {
1350                                 // The frame is contiguous in the receive buffer
1351                                 rxd = c->recv(c, c->rcvbuf.data + realoffset, framelen);
1352                         }
1353
1354                         if(rxd != (ssize_t)framelen) {
1355                                 // TODO: handle the application not accepting all data.
1356                                 abort();
1357                         }
1358                 }
1359
1360                 sack_consume(c, framelen + 2);
1361         }
1362
1363         c->rcv.nxt += len;
1364 }
1365
1366 static void handle_unreliable(struct utcp_connection *c, const struct hdr *hdr, const void *data, size_t len) {
1367         // Fast path for unfragmented packets
1368         if(!hdr->wnd && !(hdr->ctl & MF)) {
1369                 if(c->recv) {
1370                         c->recv(c, data, len);
1371                 }
1372
1373                 c->rcv.nxt = hdr->seq + len;
1374                 return;
1375         }
1376
1377         // Ensure reassembled packet are not larger than 64 kiB
1378         if(hdr->wnd > MAX_UNRELIABLE_SIZE || hdr->wnd + len > MAX_UNRELIABLE_SIZE) {
1379                 return;
1380         }
1381
1382         // Don't accept out of order fragments
1383         if(hdr->wnd && hdr->seq != c->rcv.nxt) {
1384                 return;
1385         }
1386
1387         // Reset the receive buffer for the first fragment
1388         if(!hdr->wnd) {
1389                 buffer_clear(&c->rcvbuf);
1390         }
1391
1392         ssize_t rxd = buffer_put_at(&c->rcvbuf, hdr->wnd, data, len);
1393
1394         if(rxd != (ssize_t)len) {
1395                 return;
1396         }
1397
1398         // Send the packet if it's the final fragment
1399         if(!(hdr->ctl & MF)) {
1400                 buffer_call(c, &c->rcvbuf, 0, hdr->wnd + len);
1401         }
1402
1403         c->rcv.nxt = hdr->seq + len;
1404 }
1405
1406 static void handle_unreliable_framed(struct utcp_connection *c, const struct hdr *hdr, const void *data, size_t len) {
1407         bool in_order = hdr->seq == c->rcv.nxt;
1408         c->rcv.nxt = hdr->seq + len;
1409
1410         const uint8_t *ptr = data;
1411         size_t left = len;
1412
1413         // Does it start with a partial frame?
1414         if(hdr->wnd) {
1415                 // Only accept the data if it is in order
1416                 if(in_order && c->rcvbuf.used) {
1417                         // In order, append it to the receive buffer
1418                         buffer_put(&c->rcvbuf, data, min(hdr->wnd, len));
1419
1420                         if(hdr->wnd <= len) {
1421                                 // We have a full frame
1422                                 c->recv(c, (uint8_t *)c->rcvbuf.data + 2, c->rcvbuf.used - 2);
1423                         }
1424                 }
1425
1426                 // Exit early if there is other data in this frame
1427                 if(hdr->wnd > len) {
1428                         if(!in_order) {
1429                                 buffer_clear(&c->rcvbuf);
1430                         }
1431
1432                         return;
1433                 }
1434
1435                 ptr += hdr->wnd;
1436                 left -= hdr->wnd;
1437         }
1438
1439         // We now start with new frames, so clear any data in the receive buffer
1440         buffer_clear(&c->rcvbuf);
1441
1442         // Handle whole frames
1443         while(left > 2) {
1444                 uint16_t framelen;
1445                 memcpy(&framelen, ptr, sizeof(framelen));
1446
1447                 if(left <= (size_t)framelen + 2) {
1448                         break;
1449                 }
1450
1451                 c->recv(c, ptr + 2, framelen);
1452                 ptr += framelen + 2;
1453                 left -= framelen + 2;
1454         }
1455
1456         // Handle partial last frame
1457         if(left) {
1458                 buffer_put(&c->rcvbuf, ptr, left);
1459         }
1460 }
1461
1462 static void handle_incoming_data(struct utcp_connection *c, const struct hdr *hdr, const void *data, size_t len) {
1463         if(!is_reliable(c)) {
1464                 if(is_framed(c)) {
1465                         handle_unreliable_framed(c, hdr, data, len);
1466                 } else {
1467                         handle_unreliable(c, hdr, data, len);
1468                 }
1469
1470                 return;
1471         }
1472
1473         uint32_t offset = seqdiff(hdr->seq, c->rcv.nxt);
1474
1475         if(is_framed(c)) {
1476                 if(offset) {
1477                         handle_out_of_order_framed(c, offset, data, len);
1478                 } else {
1479                         handle_in_order_framed(c, data, len);
1480                 }
1481         } else {
1482                 if(offset) {
1483                         handle_out_of_order(c, offset, data, len);
1484                 } else {
1485                         handle_in_order(c, data, len);
1486                 }
1487         }
1488 }
1489
1490
1491 ssize_t utcp_recv(struct utcp *utcp, const void *data, size_t len) {
1492         const uint8_t *ptr = data;
1493
1494         if(!utcp) {
1495                 errno = EFAULT;
1496                 return -1;
1497         }
1498
1499         if(!len) {
1500                 return 0;
1501         }
1502
1503         if(!data) {
1504                 errno = EFAULT;
1505                 return -1;
1506         }
1507
1508         // Drop packets smaller than the header
1509
1510         struct hdr hdr;
1511
1512         if(len < sizeof(hdr)) {
1513                 print_packet(NULL, "recv", data, len);
1514                 errno = EBADMSG;
1515                 return -1;
1516         }
1517
1518         // Make a copy from the potentially unaligned data to a struct hdr
1519
1520         memcpy(&hdr, ptr, sizeof(hdr));
1521
1522         // Try to match the packet to an existing connection
1523
1524         struct utcp_connection *c = find_connection(utcp, hdr.dst, hdr.src);
1525         print_packet(c, "recv", data, len);
1526
1527         // Process the header
1528
1529         ptr += sizeof(hdr);
1530         len -= sizeof(hdr);
1531
1532         // Drop packets with an unknown CTL flag
1533
1534         if(hdr.ctl & ~(SYN | ACK | RST | FIN | MF)) {
1535                 print_packet(NULL, "recv", data, len);
1536                 errno = EBADMSG;
1537                 return -1;
1538         }
1539
1540         // Check for auxiliary headers
1541
1542         const uint8_t *init = NULL;
1543
1544         uint16_t aux = hdr.aux;
1545
1546         while(aux) {
1547                 size_t auxlen = 4 * (aux >> 8) & 0xf;
1548                 uint8_t auxtype = aux & 0xff;
1549
1550                 if(len < auxlen) {
1551                         errno = EBADMSG;
1552                         return -1;
1553                 }
1554
1555                 switch(auxtype) {
1556                 case AUX_INIT:
1557                         if(!(hdr.ctl & SYN) || auxlen != 4) {
1558                                 errno = EBADMSG;
1559                                 return -1;
1560                         }
1561
1562                         init = ptr;
1563                         break;
1564
1565                 default:
1566                         errno = EBADMSG;
1567                         return -1;
1568                 }
1569
1570                 len -= auxlen;
1571                 ptr += auxlen;
1572
1573                 if(!(aux & 0x800)) {
1574                         break;
1575                 }
1576
1577                 if(len < 2) {
1578                         errno = EBADMSG;
1579                         return -1;
1580                 }
1581
1582                 memcpy(&aux, ptr, 2);
1583                 len -= 2;
1584                 ptr += 2;
1585         }
1586
1587         bool has_data = len || (hdr.ctl & (SYN | FIN));
1588
1589         // Is it for a new connection?
1590
1591         if(!c) {
1592                 // Ignore RST packets
1593
1594                 if(hdr.ctl & RST) {
1595                         return 0;
1596                 }
1597
1598                 // Is it a SYN packet and are we LISTENing?
1599
1600                 if(hdr.ctl & SYN && !(hdr.ctl & ACK) && utcp->accept) {
1601                         // If we don't want to accept it, send a RST back
1602                         if((utcp->pre_accept && !utcp->pre_accept(utcp, hdr.dst))) {
1603                                 len = 1;
1604                                 goto reset;
1605                         }
1606
1607                         // Try to allocate memory, otherwise send a RST back
1608                         c = allocate_connection(utcp, hdr.dst, hdr.src);
1609
1610                         if(!c) {
1611                                 len = 1;
1612                                 goto reset;
1613                         }
1614
1615                         // Parse auxilliary information
1616                         if(init) {
1617                                 if(init[0] < 1) {
1618                                         len = 1;
1619                                         goto reset;
1620                                 }
1621
1622                                 c->flags = init[3] & 0x7;
1623                         } else {
1624                                 c->flags = UTCP_TCP;
1625                         }
1626
1627 synack:
1628                         // Return SYN+ACK, go to SYN_RECEIVED state
1629                         c->snd.wnd = hdr.wnd;
1630                         c->rcv.irs = hdr.seq;
1631                         c->rcv.nxt = c->rcv.irs + 1;
1632                         set_state(c, SYN_RECEIVED);
1633
1634                         struct {
1635                                 struct hdr hdr;
1636                                 uint8_t data[4];
1637                         } pkt;
1638
1639                         pkt.hdr.src = c->src;
1640                         pkt.hdr.dst = c->dst;
1641                         pkt.hdr.ack = c->rcv.irs + 1;
1642                         pkt.hdr.seq = c->snd.iss;
1643                         pkt.hdr.wnd = c->rcvbuf.maxsize;
1644                         pkt.hdr.ctl = SYN | ACK;
1645
1646                         if(init) {
1647                                 pkt.hdr.aux = 0x0101;
1648                                 pkt.data[0] = 1;
1649                                 pkt.data[1] = 0;
1650                                 pkt.data[2] = 0;
1651                                 pkt.data[3] = c->flags & 0x7;
1652                                 print_packet(c, "send", &pkt, sizeof(hdr) + 4);
1653                                 utcp->send(utcp, &pkt, sizeof(hdr) + 4);
1654                         } else {
1655                                 pkt.hdr.aux = 0;
1656                                 print_packet(c, "send", &pkt, sizeof(hdr));
1657                                 utcp->send(utcp, &pkt, sizeof(hdr));
1658                         }
1659
1660                         start_retransmit_timer(c);
1661                 } else {
1662                         // No, we don't want your packets, send a RST back
1663                         len = 1;
1664                         goto reset;
1665                 }
1666
1667                 return 0;
1668         }
1669
1670         debug(c, "state %s\n", strstate[c->state]);
1671
1672         // In case this is for a CLOSED connection, ignore the packet.
1673         // TODO: make it so incoming packets can never match a CLOSED connection.
1674
1675         if(c->state == CLOSED) {
1676                 debug(c, "got packet for closed connection\n");
1677                 return 0;
1678         }
1679
1680         // It is for an existing connection.
1681
1682         // 1. Drop invalid packets.
1683
1684         // 1a. Drop packets that should not happen in our current state.
1685
1686         switch(c->state) {
1687         case SYN_SENT:
1688         case SYN_RECEIVED:
1689         case ESTABLISHED:
1690         case FIN_WAIT_1:
1691         case FIN_WAIT_2:
1692         case CLOSE_WAIT:
1693         case CLOSING:
1694         case LAST_ACK:
1695         case TIME_WAIT:
1696                 break;
1697
1698         default:
1699 #ifdef UTCP_DEBUG
1700                 abort();
1701 #endif
1702                 break;
1703         }
1704
1705         // 1b. Discard data that is not in our receive window.
1706
1707         if(is_reliable(c)) {
1708                 bool acceptable;
1709
1710                 if(c->state == SYN_SENT) {
1711                         acceptable = true;
1712                 } else if(len == 0) {
1713                         acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0;
1714                 } else {
1715                         int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);
1716
1717                         // cut already accepted front overlapping
1718                         if(rcv_offset < 0) {
1719                                 acceptable = len > (size_t) - rcv_offset;
1720
1721                                 if(acceptable) {
1722                                         ptr -= rcv_offset;
1723                                         len += rcv_offset;
1724                                         hdr.seq -= rcv_offset;
1725                                 }
1726                         } else {
1727                                 acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0 && seqdiff(hdr.seq, c->rcv.nxt) + len <= c->rcvbuf.maxsize;
1728                         }
1729                 }
1730
1731                 if(!acceptable) {
1732                         debug(c, "packet not acceptable, %u <= %u + %lu < %u\n", c->rcv.nxt, hdr.seq, (unsigned long)len, c->rcv.nxt + c->rcvbuf.maxsize);
1733
1734                         // Ignore unacceptable RST packets.
1735                         if(hdr.ctl & RST) {
1736                                 return 0;
1737                         }
1738
1739                         // Otherwise, continue processing.
1740                         len = 0;
1741                 }
1742         } else {
1743 #if UTCP_DEBUG
1744                 int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);
1745
1746                 if(rcv_offset) {
1747                         debug(c, "packet out of order, offset %u bytes\n", rcv_offset);
1748                 }
1749
1750 #endif
1751         }
1752
1753         c->snd.wnd = hdr.wnd; // TODO: move below
1754
1755         // 1c. Drop packets with an invalid ACK.
1756         // ackno should not roll back, and it should also not be bigger than what we ever could have sent
1757         // (= snd.una + c->sndbuf.used).
1758
1759         if(!is_reliable(c)) {
1760                 if(hdr.ack != c->snd.last && c->state >= ESTABLISHED) {
1761                         hdr.ack = c->snd.una;
1762                 }
1763         }
1764
1765         if(hdr.ctl & ACK && (seqdiff(hdr.ack, c->snd.last) > 0 || seqdiff(hdr.ack, c->snd.una) < 0)) {
1766                 debug(c, "packet ack seqno out of range, %u <= %u < %u\n", c->snd.una, hdr.ack, c->snd.una + c->sndbuf.used);
1767
1768                 // Ignore unacceptable RST packets.
1769                 if(hdr.ctl & RST) {
1770                         return 0;
1771                 }
1772
1773                 goto reset;
1774         }
1775
1776         // 2. Handle RST packets
1777
1778         if(hdr.ctl & RST) {
1779                 switch(c->state) {
1780                 case SYN_SENT:
1781                         if(!(hdr.ctl & ACK)) {
1782                                 return 0;
1783                         }
1784
1785                         // The peer has refused our connection.
1786                         set_state(c, CLOSED);
1787                         errno = ECONNREFUSED;
1788
1789                         if(c->recv) {
1790                                 c->recv(c, NULL, 0);
1791                         }
1792
1793                         if(c->poll && !c->reapable) {
1794                                 c->poll(c, 0);
1795                         }
1796
1797                         return 0;
1798
1799                 case SYN_RECEIVED:
1800                         if(hdr.ctl & ACK) {
1801                                 return 0;
1802                         }
1803
1804                         // We haven't told the application about this connection yet. Silently delete.
1805                         free_connection(c);
1806                         return 0;
1807
1808                 case ESTABLISHED:
1809                 case FIN_WAIT_1:
1810                 case FIN_WAIT_2:
1811                 case CLOSE_WAIT:
1812                         if(hdr.ctl & ACK) {
1813                                 return 0;
1814                         }
1815
1816                         // The peer has aborted our connection.
1817                         set_state(c, CLOSED);
1818                         errno = ECONNRESET;
1819
1820                         if(c->recv) {
1821                                 c->recv(c, NULL, 0);
1822                         }
1823
1824                         if(c->poll && !c->reapable) {
1825                                 c->poll(c, 0);
1826                         }
1827
1828                         return 0;
1829
1830                 case CLOSING:
1831                 case LAST_ACK:
1832                 case TIME_WAIT:
1833                         if(hdr.ctl & ACK) {
1834                                 return 0;
1835                         }
1836
1837                         // As far as the application is concerned, the connection has already been closed.
1838                         // If it has called utcp_close() already, we can immediately free this connection.
1839                         if(c->reapable) {
1840                                 free_connection(c);
1841                                 return 0;
1842                         }
1843
1844                         // Otherwise, immediately move to the CLOSED state.
1845                         set_state(c, CLOSED);
1846                         return 0;
1847
1848                 default:
1849 #ifdef UTCP_DEBUG
1850                         abort();
1851 #endif
1852                         break;
1853                 }
1854         }
1855
1856         uint32_t advanced;
1857
1858         if(!(hdr.ctl & ACK)) {
1859                 advanced = 0;
1860                 goto skip_ack;
1861         }
1862
1863         // 3. Advance snd.una
1864
1865         advanced = seqdiff(hdr.ack, c->snd.una);
1866
1867         if(advanced) {
1868                 // RTT measurement
1869                 if(c->rtt_start.tv_sec) {
1870                         if(c->rtt_seq == hdr.ack) {
1871                                 struct timespec now;
1872                                 clock_gettime(UTCP_CLOCK, &now);
1873                                 int32_t diff = timespec_diff_usec(&now, &c->rtt_start);
1874                                 update_rtt(c, diff);
1875                                 c->rtt_start.tv_sec = 0;
1876                         } else if(c->rtt_seq < hdr.ack) {
1877                                 debug(c, "cancelling RTT measurement: %u < %u\n", c->rtt_seq, hdr.ack);
1878                                 c->rtt_start.tv_sec = 0;
1879                         }
1880                 }
1881
1882                 int32_t data_acked = advanced;
1883
1884                 switch(c->state) {
1885                 case SYN_SENT:
1886                 case SYN_RECEIVED:
1887                         data_acked--;
1888                         break;
1889
1890                 // TODO: handle FIN as well.
1891                 default:
1892                         break;
1893                 }
1894
1895                 assert(data_acked >= 0);
1896
1897 #ifndef NDEBUG
1898                 int32_t bufused = seqdiff(c->snd.last, c->snd.una);
1899                 assert(data_acked <= bufused);
1900 #endif
1901
1902                 if(data_acked) {
1903                         buffer_discard(&c->sndbuf, data_acked);
1904
1905                         if(is_reliable(c)) {
1906                                 c->do_poll = true;
1907                         }
1908                 }
1909
1910                 // Also advance snd.nxt if possible
1911                 if(seqdiff(c->snd.nxt, hdr.ack) < 0) {
1912                         c->snd.nxt = hdr.ack;
1913                 }
1914
1915                 c->snd.una = hdr.ack;
1916
1917                 if(c->dupack) {
1918                         if(c->dupack >= 3) {
1919                                 debug(c, "fast recovery ended\n");
1920                                 c->snd.cwnd = c->snd.ssthresh;
1921                         }
1922
1923                         c->dupack = 0;
1924                 }
1925
1926                 // Increase the congestion window according to RFC 5681
1927                 if(c->snd.cwnd < c->snd.ssthresh) {
1928                         c->snd.cwnd += min(advanced, utcp->mss); // eq. 2
1929                 } else {
1930                         c->snd.cwnd += max(1, (utcp->mss * utcp->mss) / c->snd.cwnd); // eq. 3
1931                 }
1932
1933                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1934                         c->snd.cwnd = c->sndbuf.maxsize;
1935                 }
1936
1937                 debug_cwnd(c);
1938
1939                 // Check if we have sent a FIN that is now ACKed.
1940                 switch(c->state) {
1941                 case FIN_WAIT_1:
1942                         if(c->snd.una == c->snd.last) {
1943                                 set_state(c, FIN_WAIT_2);
1944                         }
1945
1946                         break;
1947
1948                 case CLOSING:
1949                         if(c->snd.una == c->snd.last) {
1950                                 clock_gettime(UTCP_CLOCK, &c->conn_timeout);
1951                                 c->conn_timeout.tv_sec += utcp->timeout;
1952                                 set_state(c, TIME_WAIT);
1953                         }
1954
1955                         break;
1956
1957                 default:
1958                         break;
1959                 }
1960         } else {
1961                 if(!len && is_reliable(c) && c->snd.una != c->snd.last) {
1962                         c->dupack++;
1963                         debug(c, "duplicate ACK %d\n", c->dupack);
1964
1965                         if(c->dupack == 3) {
1966                                 // RFC 5681 fast recovery
1967                                 debug(c, "fast recovery started\n", c->dupack);
1968                                 uint32_t flightsize = seqdiff(c->snd.nxt, c->snd.una);
1969                                 c->snd.ssthresh = max(flightsize / 2, utcp->mss * 2); // eq. 4
1970                                 c->snd.cwnd = min(c->snd.ssthresh + 3 * utcp->mss, c->sndbuf.maxsize);
1971
1972                                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1973                                         c->snd.cwnd = c->sndbuf.maxsize;
1974                                 }
1975
1976                                 debug_cwnd(c);
1977
1978                                 fast_retransmit(c);
1979                         } else if(c->dupack > 3) {
1980                                 c->snd.cwnd += utcp->mss;
1981
1982                                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1983                                         c->snd.cwnd = c->sndbuf.maxsize;
1984                                 }
1985
1986                                 debug_cwnd(c);
1987                         }
1988
1989                         // We got an ACK which indicates the other side did get one of our packets.
1990                         // Reset the retransmission timer to avoid going to slow start,
1991                         // but don't touch the connection timeout.
1992                         start_retransmit_timer(c);
1993                 }
1994         }
1995
1996         // 4. Update timers
1997
1998         if(advanced) {
1999                 if(c->snd.una == c->snd.last) {
2000                         stop_retransmit_timer(c);
2001                         timespec_clear(&c->conn_timeout);
2002                 } else if(is_reliable(c)) {
2003                         start_retransmit_timer(c);
2004                         clock_gettime(UTCP_CLOCK, &c->conn_timeout);
2005                         c->conn_timeout.tv_sec += utcp->timeout;
2006                 }
2007         }
2008
2009 skip_ack:
2010         // 5. Process SYN stuff
2011
2012         if(hdr.ctl & SYN) {
2013                 switch(c->state) {
2014                 case SYN_SENT:
2015
2016                         // This is a SYNACK. It should always have ACKed the SYN.
2017                         if(!advanced) {
2018                                 goto reset;
2019                         }
2020
2021                         c->rcv.irs = hdr.seq;
2022                         c->rcv.nxt = hdr.seq + 1;
2023
2024                         if(c->shut_wr) {
2025                                 c->snd.last++;
2026                                 set_state(c, FIN_WAIT_1);
2027                         } else {
2028                                 set_state(c, ESTABLISHED);
2029                         }
2030
2031                         break;
2032
2033                 case SYN_RECEIVED:
2034                         // This is a retransmit of a SYN, send back the SYNACK.
2035                         goto synack;
2036
2037                 case ESTABLISHED:
2038                 case FIN_WAIT_1:
2039                 case FIN_WAIT_2:
2040                 case CLOSE_WAIT:
2041                 case CLOSING:
2042                 case LAST_ACK:
2043                 case TIME_WAIT:
2044                         // This could be a retransmission. Ignore the SYN flag, but send an ACK back.
2045                         break;
2046
2047                 default:
2048 #ifdef UTCP_DEBUG
2049                         abort();
2050 #endif
2051                         return 0;
2052                 }
2053         }
2054
2055         // 6. Process new data
2056
2057         if(c->state == SYN_RECEIVED) {
2058                 // This is the ACK after the SYNACK. It should always have ACKed the SYNACK.
2059                 if(!advanced) {
2060                         goto reset;
2061                 }
2062
2063                 // Are we still LISTENing?
2064                 if(utcp->accept) {
2065                         utcp->accept(c, c->src);
2066                 }
2067
2068                 if(c->state != ESTABLISHED) {
2069                         set_state(c, CLOSED);
2070                         c->reapable = true;
2071                         goto reset;
2072                 }
2073         }
2074
2075         if(len) {
2076                 switch(c->state) {
2077                 case SYN_SENT:
2078                 case SYN_RECEIVED:
2079                         // This should never happen.
2080 #ifdef UTCP_DEBUG
2081                         abort();
2082 #endif
2083                         return 0;
2084
2085                 case ESTABLISHED:
2086                 case FIN_WAIT_1:
2087                 case FIN_WAIT_2:
2088                         break;
2089
2090                 case CLOSE_WAIT:
2091                 case CLOSING:
2092                 case LAST_ACK:
2093                 case TIME_WAIT:
2094                         // Ehm no, We should never receive more data after a FIN.
2095                         goto reset;
2096
2097                 default:
2098 #ifdef UTCP_DEBUG
2099                         abort();
2100 #endif
2101                         return 0;
2102                 }
2103
2104                 handle_incoming_data(c, &hdr, ptr, len);
2105         }
2106
2107         // 7. Process FIN stuff
2108
2109         if((hdr.ctl & FIN) && (!is_reliable(c) || hdr.seq + len == c->rcv.nxt)) {
2110                 switch(c->state) {
2111                 case SYN_SENT:
2112                 case SYN_RECEIVED:
2113                         // This should never happen.
2114 #ifdef UTCP_DEBUG
2115                         abort();
2116 #endif
2117                         break;
2118
2119                 case ESTABLISHED:
2120                         set_state(c, CLOSE_WAIT);
2121                         break;
2122
2123                 case FIN_WAIT_1:
2124                         set_state(c, CLOSING);
2125                         break;
2126
2127                 case FIN_WAIT_2:
2128                         clock_gettime(UTCP_CLOCK, &c->conn_timeout);
2129                         c->conn_timeout.tv_sec += utcp->timeout;
2130                         set_state(c, TIME_WAIT);
2131                         break;
2132
2133                 case CLOSE_WAIT:
2134                 case CLOSING:
2135                 case LAST_ACK:
2136                 case TIME_WAIT:
2137                         // Ehm, no. We should never receive a second FIN.
2138                         goto reset;
2139
2140                 default:
2141 #ifdef UTCP_DEBUG
2142                         abort();
2143 #endif
2144                         break;
2145                 }
2146
2147                 // FIN counts as one sequence number
2148                 c->rcv.nxt++;
2149                 len++;
2150
2151                 // Inform the application that the peer closed its end of the connection.
2152                 if(c->recv) {
2153                         errno = 0;
2154                         c->recv(c, NULL, 0);
2155                 }
2156         }
2157
2158         // Now we send something back if:
2159         // - we received data, so we have to send back an ACK
2160         //   -> sendatleastone = true
2161         // - or we got an ack, so we should maybe send a bit more data
2162         //   -> sendatleastone = false
2163
2164         if(is_reliable(c) || hdr.ctl & SYN || hdr.ctl & FIN) {
2165                 ack(c, has_data);
2166         }
2167
2168         return 0;
2169
2170 reset:
2171         swap_ports(&hdr);
2172         hdr.wnd = 0;
2173         hdr.aux = 0;
2174
2175         if(hdr.ctl & ACK) {
2176                 hdr.seq = hdr.ack;
2177                 hdr.ctl = RST;
2178         } else {
2179                 hdr.ack = hdr.seq + len;
2180                 hdr.seq = 0;
2181                 hdr.ctl = RST | ACK;
2182         }
2183
2184         print_packet(c, "send", &hdr, sizeof(hdr));
2185         utcp->send(utcp, &hdr, sizeof(hdr));
2186         return 0;
2187
2188 }
2189
2190 int utcp_shutdown(struct utcp_connection *c, int dir) {
2191         debug(c, "shutdown %d at %u\n", dir, c ? c->snd.last : 0);
2192
2193         if(!c) {
2194                 errno = EFAULT;
2195                 return -1;
2196         }
2197
2198         if(c->reapable) {
2199                 debug(c, "shutdown() called on closed connection\n");
2200                 errno = EBADF;
2201                 return -1;
2202         }
2203
2204         if(!(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_WR || dir == UTCP_SHUT_RDWR)) {
2205                 errno = EINVAL;
2206                 return -1;
2207         }
2208
2209         // TCP does not have a provision for stopping incoming packets.
2210         // The best we can do is to just ignore them.
2211         if(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_RDWR) {
2212                 c->recv = NULL;
2213         }
2214
2215         // The rest of the code deals with shutting down writes.
2216         if(dir == UTCP_SHUT_RD) {
2217                 return 0;
2218         }
2219
2220         // Only process shutting down writes once.
2221         if(c->shut_wr) {
2222                 return 0;
2223         }
2224
2225         c->shut_wr = true;
2226
2227         switch(c->state) {
2228         case CLOSED:
2229         case LISTEN:
2230                 errno = ENOTCONN;
2231                 return -1;
2232
2233         case SYN_SENT:
2234                 return 0;
2235
2236         case SYN_RECEIVED:
2237         case ESTABLISHED:
2238                 if(!is_reliable(c) && is_framed(c)) {
2239                         flush_unreliable_framed(c);
2240                 }
2241
2242                 set_state(c, FIN_WAIT_1);
2243                 break;
2244
2245         case FIN_WAIT_1:
2246         case FIN_WAIT_2:
2247                 return 0;
2248
2249         case CLOSE_WAIT:
2250                 set_state(c, CLOSING);
2251                 break;
2252
2253         case CLOSING:
2254         case LAST_ACK:
2255         case TIME_WAIT:
2256                 return 0;
2257         }
2258
2259         c->snd.last++;
2260
2261         ack(c, !is_reliable(c));
2262
2263         if(!timespec_isset(&c->rtrx_timeout)) {
2264                 start_retransmit_timer(c);
2265         }
2266
2267         return 0;
2268 }
2269
2270 static bool reset_connection(struct utcp_connection *c) {
2271         if(!c) {
2272                 errno = EFAULT;
2273                 return false;
2274         }
2275
2276         if(c->reapable) {
2277                 debug(c, "abort() called on closed connection\n");
2278                 errno = EBADF;
2279                 return false;
2280         }
2281
2282         c->recv = NULL;
2283         c->poll = NULL;
2284
2285         switch(c->state) {
2286         case CLOSED:
2287                 return true;
2288
2289         case LISTEN:
2290         case SYN_SENT:
2291         case CLOSING:
2292         case LAST_ACK:
2293         case TIME_WAIT:
2294                 set_state(c, CLOSED);
2295                 return true;
2296
2297         case SYN_RECEIVED:
2298         case ESTABLISHED:
2299         case FIN_WAIT_1:
2300         case FIN_WAIT_2:
2301         case CLOSE_WAIT:
2302                 set_state(c, CLOSED);
2303                 break;
2304         }
2305
2306         // Send RST
2307
2308         struct hdr hdr;
2309
2310         hdr.src = c->src;
2311         hdr.dst = c->dst;
2312         hdr.seq = c->snd.nxt;
2313         hdr.ack = 0;
2314         hdr.wnd = 0;
2315         hdr.ctl = RST;
2316
2317         print_packet(c, "send", &hdr, sizeof(hdr));
2318         c->utcp->send(c->utcp, &hdr, sizeof(hdr));
2319         return true;
2320 }
2321
2322 // Closes all the opened connections
2323 void utcp_abort_all_connections(struct utcp *utcp) {
2324         if(!utcp) {
2325                 errno = EINVAL;
2326                 return;
2327         }
2328
2329         for(int i = 0; i < utcp->nconnections; i++) {
2330                 struct utcp_connection *c = utcp->connections[i];
2331
2332                 if(c->reapable || c->state == CLOSED) {
2333                         continue;
2334                 }
2335
2336                 utcp_recv_t old_recv = c->recv;
2337                 utcp_poll_t old_poll = c->poll;
2338
2339                 reset_connection(c);
2340
2341                 if(old_recv) {
2342                         errno = 0;
2343                         old_recv(c, NULL, 0);
2344                 }
2345
2346                 if(old_poll && !c->reapable) {
2347                         errno = 0;
2348                         old_poll(c, 0);
2349                 }
2350         }
2351
2352         return;
2353 }
2354
2355 int utcp_close(struct utcp_connection *c) {
2356         debug(c, "closing\n");
2357
2358         if(c->rcvbuf.used) {
2359                 debug(c, "receive buffer not empty, resetting\n");
2360                 return reset_connection(c) ? 0 : -1;
2361         }
2362
2363         if(utcp_shutdown(c, SHUT_RDWR) && errno != ENOTCONN) {
2364                 return -1;
2365         }
2366
2367         c->recv = NULL;
2368         c->poll = NULL;
2369         c->reapable = true;
2370         return 0;
2371 }
2372
2373 int utcp_abort(struct utcp_connection *c) {
2374         if(!reset_connection(c)) {
2375                 return -1;
2376         }
2377
2378         c->reapable = true;
2379         return 0;
2380 }
2381
2382 /* Handle timeouts.
2383  * One call to this function will loop through all connections,
2384  * checking if something needs to be resent or not.
2385  * The return value is the time to the next timeout in milliseconds,
2386  * or maybe a negative value if the timeout is infinite.
2387  */
2388 struct timespec utcp_timeout(struct utcp *utcp) {
2389         struct timespec now;
2390         clock_gettime(UTCP_CLOCK, &now);
2391         struct timespec next = {now.tv_sec + 3600, now.tv_nsec};
2392
2393         for(int i = 0; i < utcp->nconnections; i++) {
2394                 struct utcp_connection *c = utcp->connections[i];
2395
2396                 if(!c) {
2397                         continue;
2398                 }
2399
2400                 // delete connections that have been utcp_close()d.
2401                 if(c->state == CLOSED) {
2402                         if(c->reapable) {
2403                                 debug(c, "reaping\n");
2404                                 free_connection(c);
2405                                 i--;
2406                         }
2407
2408                         continue;
2409                 }
2410
2411                 if(timespec_isset(&c->conn_timeout) && timespec_lt(&c->conn_timeout, &now)) {
2412                         errno = ETIMEDOUT;
2413                         c->state = CLOSED;
2414
2415                         if(c->recv) {
2416                                 c->recv(c, NULL, 0);
2417                         }
2418
2419                         if(c->poll && !c->reapable) {
2420                                 c->poll(c, 0);
2421                         }
2422
2423                         continue;
2424                 }
2425
2426                 if(timespec_isset(&c->rtrx_timeout) && timespec_lt(&c->rtrx_timeout, &now)) {
2427                         debug(c, "retransmitting after timeout\n");
2428                         retransmit(c);
2429                 }
2430
2431                 if(c->poll) {
2432                         if((c->state == ESTABLISHED || c->state == CLOSE_WAIT) && c->do_poll) {
2433                                 c->do_poll = false;
2434                                 uint32_t len = buffer_free(&c->sndbuf);
2435
2436                                 if(len) {
2437                                         c->poll(c, len);
2438                                 }
2439                         } else if(c->state == CLOSED) {
2440                                 c->poll(c, 0);
2441                         }
2442                 }
2443
2444                 if(timespec_isset(&c->conn_timeout) && timespec_lt(&c->conn_timeout, &next)) {
2445                         next = c->conn_timeout;
2446                 }
2447
2448                 if(timespec_isset(&c->rtrx_timeout) && timespec_lt(&c->rtrx_timeout, &next)) {
2449                         next = c->rtrx_timeout;
2450                 }
2451         }
2452
2453         struct timespec diff;
2454
2455         timespec_sub(&next, &now, &diff);
2456
2457         return diff;
2458 }
2459
2460 bool utcp_is_active(struct utcp *utcp) {
2461         if(!utcp) {
2462                 return false;
2463         }
2464
2465         for(int i = 0; i < utcp->nconnections; i++)
2466                 if(utcp->connections[i]->state != CLOSED && utcp->connections[i]->state != TIME_WAIT) {
2467                         return true;
2468                 }
2469
2470         return false;
2471 }
2472
2473 struct utcp *utcp_init(utcp_accept_t accept, utcp_pre_accept_t pre_accept, utcp_send_t send, void *priv) {
2474         if(!send) {
2475                 errno = EFAULT;
2476                 return NULL;
2477         }
2478
2479         struct utcp *utcp = calloc(1, sizeof(*utcp));
2480
2481         if(!utcp) {
2482                 return NULL;
2483         }
2484
2485         utcp_set_mtu(utcp, DEFAULT_MTU);
2486
2487         if(!utcp->pkt) {
2488                 free(utcp);
2489                 return NULL;
2490         }
2491
2492         if(!CLOCK_GRANULARITY) {
2493                 struct timespec res;
2494                 clock_getres(UTCP_CLOCK, &res);
2495                 CLOCK_GRANULARITY = res.tv_sec * USEC_PER_SEC + res.tv_nsec / 1000;
2496         }
2497
2498         utcp->accept = accept;
2499         utcp->pre_accept = pre_accept;
2500         utcp->send = send;
2501         utcp->priv = priv;
2502         utcp->timeout = DEFAULT_USER_TIMEOUT; // sec
2503
2504         return utcp;
2505 }
2506
2507 void utcp_exit(struct utcp *utcp) {
2508         if(!utcp) {
2509                 return;
2510         }
2511
2512         for(int i = 0; i < utcp->nconnections; i++) {
2513                 struct utcp_connection *c = utcp->connections[i];
2514
2515                 if(!c->reapable) {
2516                         if(c->recv) {
2517                                 c->recv(c, NULL, 0);
2518                         }
2519
2520                         if(c->poll && !c->reapable) {
2521                                 c->poll(c, 0);
2522                         }
2523                 }
2524
2525                 buffer_exit(&c->rcvbuf);
2526                 buffer_exit(&c->sndbuf);
2527                 free(c);
2528         }
2529
2530         free(utcp->connections);
2531         free(utcp->pkt);
2532         free(utcp);
2533 }
2534
2535 uint16_t utcp_get_mtu(struct utcp *utcp) {
2536         return utcp ? utcp->mtu : 0;
2537 }
2538
2539 uint16_t utcp_get_mss(struct utcp *utcp) {
2540         return utcp ? utcp->mss : 0;
2541 }
2542
2543 void utcp_set_mtu(struct utcp *utcp, uint16_t mtu) {
2544         if(!utcp) {
2545                 return;
2546         }
2547
2548         if(mtu <= sizeof(struct hdr)) {
2549                 return;
2550         }
2551
2552         if(mtu > utcp->mtu) {
2553                 char *new = realloc(utcp->pkt, mtu + sizeof(struct hdr));
2554
2555                 if(!new) {
2556                         return;
2557                 }
2558
2559                 utcp->pkt = new;
2560         }
2561
2562         utcp->mtu = mtu;
2563         utcp->mss = mtu - sizeof(struct hdr);
2564 }
2565
2566 void utcp_reset_timers(struct utcp *utcp) {
2567         if(!utcp) {
2568                 return;
2569         }
2570
2571         struct timespec now, then;
2572
2573         clock_gettime(UTCP_CLOCK, &now);
2574
2575         then = now;
2576
2577         then.tv_sec += utcp->timeout;
2578
2579         for(int i = 0; i < utcp->nconnections; i++) {
2580                 struct utcp_connection *c = utcp->connections[i];
2581
2582                 if(c->reapable) {
2583                         continue;
2584                 }
2585
2586                 if(timespec_isset(&c->rtrx_timeout)) {
2587                         c->rtrx_timeout = now;
2588                 }
2589
2590                 if(timespec_isset(&c->conn_timeout)) {
2591                         c->conn_timeout = then;
2592                 }
2593
2594                 c->rtt_start.tv_sec = 0;
2595
2596                 if(c->rto > START_RTO) {
2597                         c->rto = START_RTO;
2598                 }
2599         }
2600 }
2601
2602 int utcp_get_user_timeout(struct utcp *u) {
2603         return u ? u->timeout : 0;
2604 }
2605
2606 void utcp_set_user_timeout(struct utcp *u, int timeout) {
2607         if(u) {
2608                 u->timeout = timeout;
2609         }
2610 }
2611
2612 size_t utcp_get_sndbuf(struct utcp_connection *c) {
2613         return c ? c->sndbuf.maxsize : 0;
2614 }
2615
2616 size_t utcp_get_sndbuf_free(struct utcp_connection *c) {
2617         if(!c) {
2618                 return 0;
2619         }
2620
2621         switch(c->state) {
2622         case SYN_SENT:
2623         case SYN_RECEIVED:
2624         case ESTABLISHED:
2625         case CLOSE_WAIT:
2626                 return buffer_free(&c->sndbuf);
2627
2628         default:
2629                 return 0;
2630         }
2631 }
2632
2633 void utcp_set_sndbuf(struct utcp_connection *c, size_t size) {
2634         if(!c) {
2635                 return;
2636         }
2637
2638         c->sndbuf.maxsize = size;
2639
2640         if(c->sndbuf.maxsize != size) {
2641                 c->sndbuf.maxsize = -1;
2642         }
2643
2644         c->do_poll = is_reliable(c) && buffer_free(&c->sndbuf);
2645 }
2646
2647 size_t utcp_get_rcvbuf(struct utcp_connection *c) {
2648         return c ? c->rcvbuf.maxsize : 0;
2649 }
2650
2651 size_t utcp_get_rcvbuf_free(struct utcp_connection *c) {
2652         if(c && (c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
2653                 return buffer_free(&c->rcvbuf);
2654         } else {
2655                 return 0;
2656         }
2657 }
2658
2659 void utcp_set_rcvbuf(struct utcp_connection *c, size_t size) {
2660         if(!c) {
2661                 return;
2662         }
2663
2664         c->rcvbuf.maxsize = size;
2665
2666         if(c->rcvbuf.maxsize != size) {
2667                 c->rcvbuf.maxsize = -1;
2668         }
2669 }
2670
2671 size_t utcp_get_sendq(struct utcp_connection *c) {
2672         return c->sndbuf.used;
2673 }
2674
2675 size_t utcp_get_recvq(struct utcp_connection *c) {
2676         return c->rcvbuf.used;
2677 }
2678
2679 bool utcp_get_nodelay(struct utcp_connection *c) {
2680         return c ? c->nodelay : false;
2681 }
2682
2683 void utcp_set_nodelay(struct utcp_connection *c, bool nodelay) {
2684         if(c) {
2685                 c->nodelay = nodelay;
2686         }
2687 }
2688
2689 bool utcp_get_keepalive(struct utcp_connection *c) {
2690         return c ? c->keepalive : false;
2691 }
2692
2693 void utcp_set_keepalive(struct utcp_connection *c, bool keepalive) {
2694         if(c) {
2695                 c->keepalive = keepalive;
2696         }
2697 }
2698
2699 size_t utcp_get_outq(struct utcp_connection *c) {
2700         return c ? seqdiff(c->snd.nxt, c->snd.una) : 0;
2701 }
2702
2703 void utcp_set_recv_cb(struct utcp_connection *c, utcp_recv_t recv) {
2704         if(c) {
2705                 c->recv = recv;
2706         }
2707 }
2708
2709 void utcp_set_poll_cb(struct utcp_connection *c, utcp_poll_t poll) {
2710         if(c) {
2711                 c->poll = poll;
2712                 c->do_poll = is_reliable(c) && buffer_free(&c->sndbuf);
2713         }
2714 }
2715
2716 void utcp_set_accept_cb(struct utcp *utcp, utcp_accept_t accept, utcp_pre_accept_t pre_accept) {
2717         if(utcp) {
2718                 utcp->accept = accept;
2719                 utcp->pre_accept = pre_accept;
2720         }
2721 }
2722
2723 void utcp_expect_data(struct utcp_connection *c, bool expect) {
2724         if(!c || c->reapable) {
2725                 return;
2726         }
2727
2728         if(!(c->state == ESTABLISHED || c->state == FIN_WAIT_1 || c->state == FIN_WAIT_2)) {
2729                 return;
2730         }
2731
2732         if(expect) {
2733                 // If we expect data, start the connection timer.
2734                 if(!timespec_isset(&c->conn_timeout)) {
2735                         clock_gettime(UTCP_CLOCK, &c->conn_timeout);
2736                         c->conn_timeout.tv_sec += c->utcp->timeout;
2737                 }
2738         } else {
2739                 // If we want to cancel expecting data, only clear the timer when there is no unACKed data.
2740                 if(c->snd.una == c->snd.last) {
2741                         timespec_clear(&c->conn_timeout);
2742                 }
2743         }
2744 }
2745
2746 void utcp_offline(struct utcp *utcp, bool offline) {
2747         struct timespec now;
2748         clock_gettime(UTCP_CLOCK, &now);
2749
2750         for(int i = 0; i < utcp->nconnections; i++) {
2751                 struct utcp_connection *c = utcp->connections[i];
2752
2753                 if(c->reapable) {
2754                         continue;
2755                 }
2756
2757                 utcp_expect_data(c, offline);
2758
2759                 if(!offline) {
2760                         if(timespec_isset(&c->rtrx_timeout)) {
2761                                 c->rtrx_timeout = now;
2762                         }
2763
2764                         utcp->connections[i]->rtt_start.tv_sec = 0;
2765
2766                         if(c->rto > START_RTO) {
2767                                 c->rto = START_RTO;
2768                         }
2769                 }
2770         }
2771 }
2772
2773 void utcp_set_retransmit_cb(struct utcp *utcp, utcp_retransmit_t cb) {
2774         utcp->retransmit = cb;
2775 }
2776
2777 void utcp_set_clock_granularity(long granularity) {
2778         CLOCK_GRANULARITY = granularity;
2779 }
2780
2781 int utcp_get_flush_timeout(struct utcp *utcp) {
2782         return utcp->flush_timeout;
2783 }
2784
2785 void utcp_set_flush_timeout(struct utcp *utcp, int milliseconds) {
2786         utcp->flush_timeout = milliseconds;
2787 }