]> git.meshlink.io Git - meshlink/blob - src/utcp.c
Correctly report the maximum allowed send size in the poll callback for framed channels.
[meshlink] / src / utcp.c
1 /*
2     utcp.c -- Userspace TCP
3     Copyright (C) 2014-2017 Guus Sliepen <guus@tinc-vpn.org>
4
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14
15     You should have received a copy of the GNU General Public License along
16     with this program; if not, write to the Free Software Foundation, Inc.,
17     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20 #define _GNU_SOURCE
21
22 #include <assert.h>
23 #include <errno.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <time.h>
31
32 #include "utcp_priv.h"
33
34 #ifndef EBADMSG
35 #define EBADMSG         104
36 #endif
37
38 #ifndef SHUT_RDWR
39 #define SHUT_RDWR 2
40 #endif
41
42 #ifdef poll
43 #undef poll
44 #endif
45
46 #ifndef UTCP_CLOCK
47 #if defined(CLOCK_MONOTONIC_RAW) && defined(__x86_64__)
48 #define UTCP_CLOCK CLOCK_MONOTONIC_RAW
49 #else
50 #define UTCP_CLOCK CLOCK_MONOTONIC
51 #endif
52 #endif
53
54 static void timespec_sub(const struct timespec *a, const struct timespec *b, struct timespec *r) {
55         r->tv_sec = a->tv_sec - b->tv_sec;
56         r->tv_nsec = a->tv_nsec - b->tv_nsec;
57
58         if(r->tv_nsec < 0) {
59                 r->tv_sec--, r->tv_nsec += NSEC_PER_SEC;
60         }
61 }
62
// Return a - b expressed in microseconds, truncated to 32 bits.
static int32_t timespec_diff_usec(const struct timespec *a, const struct timespec *b) {
	const time_t sec_delta = a->tv_sec - b->tv_sec;
	const long nsec_delta = a->tv_nsec - b->tv_nsec;

	return sec_delta * 1000000 + nsec_delta / 1000;
}
66
// Return true if a is strictly earlier than b: seconds are compared first,
// nanoseconds only break a tie.
static bool timespec_lt(const struct timespec *a, const struct timespec *b) {
	if(a->tv_sec != b->tv_sec) {
		return a->tv_sec < b->tv_sec;
	}

	return a->tv_nsec < b->tv_nsec;
}
74
// Reset *a to the zero time, which timespec_isset() treats as "not set".
static void timespec_clear(struct timespec *a) {
	a->tv_nsec = 0;
	a->tv_sec = 0;
}
79
// A timestamp counts as set when its seconds field is non-zero; the
// nanosecond field is not looked at.
static bool timespec_isset(const struct timespec *a) {
	return a->tv_sec != 0;
}
83
// Clock resolution in microseconds. update_rtt() uses it as the lower bound
// for the variance term added to srtt when computing the RTO.
// NOTE(review): no assignment is visible in this part of the file; presumably
// it is set during initialisation -- confirm.
static long CLOCK_GRANULARITY; // usec
85
// Return the smaller of two sizes (b when they are equal).
static inline size_t min(size_t a, size_t b) {
	if(a < b) {
		return a;
	}

	return b;
}
89
// Return the larger of two sizes (b when they are equal).
static inline size_t max(size_t a, size_t b) {
	if(a > b) {
		return a;
	}

	return b;
}
93
94 #ifdef UTCP_DEBUG
95 #include <stdarg.h>
96
97 #ifndef UTCP_DEBUG_DATALEN
98 #define UTCP_DEBUG_DATALEN 20
99 #endif
100
101 static void debug(struct utcp_connection *c, const char *format, ...) {
102         struct timespec tv;
103         char buf[1024];
104         int len;
105
106         clock_gettime(CLOCK_REALTIME, &tv);
107         len = snprintf(buf, sizeof(buf), "%ld.%06lu %u:%u ", (long)tv.tv_sec, tv.tv_nsec / 1000, c ? c->src : 0, c ? c->dst : 0);
108         va_list ap;
109         va_start(ap, format);
110         len += vsnprintf(buf + len, sizeof(buf) - len, format, ap);
111         va_end(ap);
112
113         if(len > 0 && (size_t)len < sizeof(buf)) {
114                 fwrite(buf, len, 1, stderr);
115         }
116 }
117
118 static void print_packet(struct utcp_connection *c, const char *dir, const void *pkt, size_t len) {
119         struct hdr hdr;
120
121         if(len < sizeof(hdr)) {
122                 debug(c, "%s: short packet (%lu bytes)\n", dir, (unsigned long)len);
123                 return;
124         }
125
126         memcpy(&hdr, pkt, sizeof(hdr));
127
128         uint32_t datalen;
129
130         if(len > sizeof(hdr)) {
131                 datalen = min(len - sizeof(hdr), UTCP_DEBUG_DATALEN);
132         } else {
133                 datalen = 0;
134         }
135
136
137         const uint8_t *data = (uint8_t *)pkt + sizeof(hdr);
138         char str[datalen * 2 + 1];
139         char *p = str;
140
141         for(uint32_t i = 0; i < datalen; i++) {
142                 *p++ = "0123456789ABCDEF"[data[i] >> 4];
143                 *p++ = "0123456789ABCDEF"[data[i] & 15];
144         }
145
146         *p = 0;
147
148         debug(c, "%s: len %lu src %u dst %u seq %u ack %u wnd %u aux %x ctl %s%s%s%s%s data %s\n",
149               dir, (unsigned long)len, hdr.src, hdr.dst, hdr.seq, hdr.ack, hdr.wnd, hdr.aux,
150               hdr.ctl & SYN ? "SYN" : "",
151               hdr.ctl & RST ? "RST" : "",
152               hdr.ctl & FIN ? "FIN" : "",
153               hdr.ctl & ACK ? "ACK" : "",
154               hdr.ctl & MF ? "MF" : "",
155               str
156              );
157 }
158
159 static void debug_cwnd(struct utcp_connection *c) {
160         debug(c, "snd.cwnd %u snd.ssthresh %u\n", c->snd.cwnd, ~c->snd.ssthresh ? c->snd.ssthresh : 0);
161 }
162 #else
163 #define debug(...) do {} while(0)
164 #define print_packet(...) do {} while(0)
165 #define debug_cwnd(...) do {} while(0)
166 #endif
167
168 static void set_state(struct utcp_connection *c, enum state state) {
169         c->state = state;
170
171         if(state == ESTABLISHED) {
172                 timespec_clear(&c->conn_timeout);
173         }
174
175         debug(c, "state %s\n", strstate[state]);
176 }
177
178 static bool fin_wanted(struct utcp_connection *c, uint32_t seq) {
179         if(seq != c->snd.last) {
180                 return false;
181         }
182
183         switch(c->state) {
184         case FIN_WAIT_1:
185         case CLOSING:
186         case LAST_ACK:
187                 return true;
188
189         default:
190                 return false;
191         }
192 }
193
194 static bool is_reliable(struct utcp_connection *c) {
195         return c->flags & UTCP_RELIABLE;
196 }
197
198 static bool is_framed(struct utcp_connection *c) {
199         return c->flags & UTCP_FRAMED;
200 }
201
// Signed distance from sequence number b to a. The subtraction is done modulo
// 2^32 and then reinterpreted as signed, so it stays correct across wrap-around.
static int32_t seqdiff(uint32_t a, uint32_t b) {
	const uint32_t delta = a - b;
	return (int32_t)delta;
}
205
206 // Buffer functions
207 static bool buffer_wraps(struct buffer *buf) {
208         return buf->size - buf->offset < buf->used;
209 }
210
211 static bool buffer_resize(struct buffer *buf, uint32_t newsize) {
212         char *newdata = realloc(buf->data, newsize);
213
214         if(!newdata) {
215                 return false;
216         }
217
218         buf->data = newdata;
219
220         if(buffer_wraps(buf)) {
221                 // Shift the right part of the buffer until it hits the end of the new buffer.
222                 // Old situation:
223                 // [345......012]
224                 // New situation:
225                 // [345.........|........012]
226                 uint32_t tailsize = buf->size - buf->offset;
227                 uint32_t newoffset = newsize - tailsize;
228                 memmove(buf->data + newoffset, buf->data + buf->offset, tailsize);
229                 buf->offset = newoffset;
230         }
231
232         buf->size = newsize;
233         return true;
234 }
235
236 // Store data into the buffer
237 static ssize_t buffer_put_at(struct buffer *buf, size_t offset, const void *data, size_t len) {
238         debug(NULL, "buffer_put_at %lu %lu %lu\n", (unsigned long)buf->used, (unsigned long)offset, (unsigned long)len);
239
240         // Ensure we don't store more than maxsize bytes in total
241         size_t required = offset + len;
242
243         if(required > buf->maxsize) {
244                 if(offset >= buf->maxsize) {
245                         return 0;
246                 }
247
248                 len = buf->maxsize - offset;
249                 required = buf->maxsize;
250         }
251
252         // Check if we need to resize the buffer
253         if(required > buf->size) {
254                 size_t newsize = buf->size;
255
256                 if(!newsize) {
257                         newsize = 4096;
258                 }
259
260                 do {
261                         newsize *= 2;
262                 } while(newsize < required);
263
264                 if(newsize > buf->maxsize) {
265                         newsize = buf->maxsize;
266                 }
267
268                 if(!buffer_resize(buf, newsize)) {
269                         return -1;
270                 }
271         }
272
273         uint32_t realoffset = buf->offset + offset;
274
275         if(buf->size - buf->offset <= offset) {
276                 // The offset wrapped
277                 realoffset -= buf->size;
278         }
279
280         if(buf->size - realoffset < len) {
281                 // The new chunk of data must be wrapped
282                 memcpy(buf->data + realoffset, data, buf->size - realoffset);
283                 memcpy(buf->data, (char *)data + buf->size - realoffset, len - (buf->size - realoffset));
284         } else {
285                 memcpy(buf->data + realoffset, data, len);
286         }
287
288         if(required > buf->used) {
289                 buf->used = required;
290         }
291
292         return len;
293 }
294
295 static ssize_t buffer_put(struct buffer *buf, const void *data, size_t len) {
296         return buffer_put_at(buf, buf->used, data, len);
297 }
298
299 // Copy data from the buffer without removing it.
300 static ssize_t buffer_copy(struct buffer *buf, void *data, size_t offset, size_t len) {
301         // Ensure we don't copy more than is actually stored in the buffer
302         if(offset >= buf->used) {
303                 return 0;
304         }
305
306         if(buf->used - offset < len) {
307                 len = buf->used - offset;
308         }
309
310         uint32_t realoffset = buf->offset + offset;
311
312         if(buf->size - buf->offset <= offset) {
313                 // The offset wrapped
314                 realoffset -= buf->size;
315         }
316
317         if(buf->size - realoffset < len) {
318                 // The data is wrapped
319                 memcpy(data, buf->data + realoffset, buf->size - realoffset);
320                 memcpy((char *)data + buf->size - realoffset, buf->data, len - (buf->size - realoffset));
321         } else {
322                 memcpy(data, buf->data + realoffset, len);
323         }
324
325         return len;
326 }
327
328 // Copy data from the buffer without removing it.
329 static ssize_t buffer_call(struct utcp_connection *c, struct buffer *buf, size_t offset, size_t len) {
330         if(!c->recv) {
331                 return len;
332         }
333
334         // Ensure we don't copy more than is actually stored in the buffer
335         if(offset >= buf->used) {
336                 return 0;
337         }
338
339         if(buf->used - offset < len) {
340                 len = buf->used - offset;
341         }
342
343         uint32_t realoffset = buf->offset + offset;
344
345         if(buf->size - buf->offset <= offset) {
346                 // The offset wrapped
347                 realoffset -= buf->size;
348         }
349
350         if(buf->size - realoffset < len) {
351                 // The data is wrapped
352                 ssize_t rx1 = c->recv(c, buf->data + realoffset, buf->size - realoffset);
353
354                 if(rx1 < buf->size - realoffset) {
355                         return rx1;
356                 }
357
358                 // The channel might have been closed by the previous callback
359                 if(!c->recv) {
360                         return len;
361                 }
362
363                 ssize_t rx2 = c->recv(c, buf->data, len - (buf->size - realoffset));
364
365                 if(rx2 < 0) {
366                         return rx2;
367                 } else {
368                         return rx1 + rx2;
369                 }
370         } else {
371                 return c->recv(c, buf->data + realoffset, len);
372         }
373 }
374
375 // Discard data from the buffer.
376 static ssize_t buffer_discard(struct buffer *buf, size_t len) {
377         if(buf->used < len) {
378                 len = buf->used;
379         }
380
381         if(buf->size - buf->offset <= len) {
382                 buf->offset -= buf->size;
383         }
384
385         if(buf->used == len) {
386                 buf->offset = 0;
387         } else {
388                 buf->offset += len;
389         }
390
391         buf->used -= len;
392
393         return len;
394 }
395
396 static void buffer_clear(struct buffer *buf) {
397         buf->used = 0;
398         buf->offset = 0;
399 }
400
401 static bool buffer_set_size(struct buffer *buf, uint32_t minsize, uint32_t maxsize) {
402         if(maxsize < minsize) {
403                 maxsize = minsize;
404         }
405
406         buf->maxsize = maxsize;
407
408         return buf->size >= minsize || buffer_resize(buf, minsize);
409 }
410
411 static void buffer_exit(struct buffer *buf) {
412         free(buf->data);
413         memset(buf, 0, sizeof(*buf));
414 }
415
416 static uint32_t buffer_free(const struct buffer *buf) {
417         return buf->maxsize > buf->used ? buf->maxsize - buf->used : 0;
418 }
419
420 // Connections are stored in a sorted list.
421 // This gives O(log(N)) lookup time, O(N log(N)) insertion time and O(N) deletion time.
422
423 static int compare(const void *va, const void *vb) {
424         assert(va && vb);
425
426         const struct utcp_connection *a = *(struct utcp_connection **)va;
427         const struct utcp_connection *b = *(struct utcp_connection **)vb;
428
429         assert(a && b);
430         assert(a->src && b->src);
431
432         int c = (int)a->src - (int)b->src;
433
434         if(c) {
435                 return c;
436         }
437
438         c = (int)a->dst - (int)b->dst;
439         return c;
440 }
441
442 static struct utcp_connection *find_connection(const struct utcp *utcp, uint16_t src, uint16_t dst) {
443         if(!utcp->nconnections) {
444                 return NULL;
445         }
446
447         struct utcp_connection key = {
448                 .src = src,
449                 .dst = dst,
450         }, *keyp = &key;
451         struct utcp_connection **match = bsearch(&keyp, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
452         return match ? *match : NULL;
453 }
454
455 static void free_connection(struct utcp_connection *c) {
456         struct utcp *utcp = c->utcp;
457         struct utcp_connection **cp = bsearch(&c, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
458
459         assert(cp);
460
461         int i = cp - utcp->connections;
462         memmove(cp, cp + 1, (utcp->nconnections - i - 1) * sizeof(*cp));
463         utcp->nconnections--;
464
465         buffer_exit(&c->rcvbuf);
466         buffer_exit(&c->sndbuf);
467         free(c);
468 }
469
470 static struct utcp_connection *allocate_connection(struct utcp *utcp, uint16_t src, uint16_t dst) {
471         // Check whether this combination of src and dst is free
472
473         if(src) {
474                 if(find_connection(utcp, src, dst)) {
475                         errno = EADDRINUSE;
476                         return NULL;
477                 }
478         } else { // If src == 0, generate a random port number with the high bit set
479                 if(utcp->nconnections >= 32767) {
480                         errno = ENOMEM;
481                         return NULL;
482                 }
483
484                 src = rand() | 0x8000;
485
486                 while(find_connection(utcp, src, dst)) {
487                         src++;
488                 }
489         }
490
491         // Allocate memory for the new connection
492
493         if(utcp->nconnections >= utcp->nallocated) {
494                 if(!utcp->nallocated) {
495                         utcp->nallocated = 4;
496                 } else {
497                         utcp->nallocated *= 2;
498                 }
499
500                 struct utcp_connection **new_array = realloc(utcp->connections, utcp->nallocated * sizeof(*utcp->connections));
501
502                 if(!new_array) {
503                         return NULL;
504                 }
505
506                 utcp->connections = new_array;
507         }
508
509         struct utcp_connection *c = calloc(1, sizeof(*c));
510
511         if(!c) {
512                 return NULL;
513         }
514
515         if(!buffer_set_size(&c->sndbuf, DEFAULT_SNDBUFSIZE, DEFAULT_MAXSNDBUFSIZE)) {
516                 free(c);
517                 return NULL;
518         }
519
520         if(!buffer_set_size(&c->rcvbuf, DEFAULT_RCVBUFSIZE, DEFAULT_MAXRCVBUFSIZE)) {
521                 buffer_exit(&c->sndbuf);
522                 free(c);
523                 return NULL;
524         }
525
526         // Fill in the details
527
528         c->src = src;
529         c->dst = dst;
530 #ifdef UTCP_DEBUG
531         c->snd.iss = 0;
532 #else
533         c->snd.iss = rand();
534 #endif
535         c->snd.una = c->snd.iss;
536         c->snd.nxt = c->snd.iss + 1;
537         c->snd.last = c->snd.nxt;
538         c->snd.cwnd = (utcp->mss > 2190 ? 2 : utcp->mss > 1095 ? 3 : 4) * utcp->mss;
539         c->snd.ssthresh = ~0;
540         debug_cwnd(c);
541         c->srtt = 0;
542         c->rttvar = 0;
543         c->rto = START_RTO;
544         c->utcp = utcp;
545
546         // Add it to the sorted list of connections
547
548         utcp->connections[utcp->nconnections++] = c;
549         qsort(utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
550
551         return c;
552 }
553
// Absolute difference between two unsigned values, computed without ever
// subtracting the larger from the smaller.
static inline uint32_t absdiff(uint32_t a, uint32_t b) {
	return a > b ? a - b : b - a;
}
561
562 // Update RTT variables. See RFC 6298.
563 static void update_rtt(struct utcp_connection *c, uint32_t rtt) {
564         if(!rtt) {
565                 debug(c, "invalid rtt\n");
566                 return;
567         }
568
569         if(!c->srtt) {
570                 c->srtt = rtt;
571                 c->rttvar = rtt / 2;
572         } else {
573                 c->rttvar = (c->rttvar * 3 + absdiff(c->srtt, rtt)) / 4;
574                 c->srtt = (c->srtt * 7 + rtt) / 8;
575         }
576
577         c->rto = c->srtt + max(4 * c->rttvar, CLOCK_GRANULARITY);
578
579         if(c->rto > MAX_RTO) {
580                 c->rto = MAX_RTO;
581         }
582
583         debug(c, "rtt %u srtt %u rttvar %u rto %u\n", rtt, c->srtt, c->rttvar, c->rto);
584 }
585
586 static void start_retransmit_timer(struct utcp_connection *c) {
587         clock_gettime(UTCP_CLOCK, &c->rtrx_timeout);
588
589         uint32_t rto = c->rto;
590
591         while(rto > USEC_PER_SEC) {
592                 c->rtrx_timeout.tv_sec++;
593                 rto -= USEC_PER_SEC;
594         }
595
596         c->rtrx_timeout.tv_nsec += rto * 1000;
597
598         if(c->rtrx_timeout.tv_nsec >= NSEC_PER_SEC) {
599                 c->rtrx_timeout.tv_nsec -= NSEC_PER_SEC;
600                 c->rtrx_timeout.tv_sec++;
601         }
602
603         debug(c, "rtrx_timeout %ld.%06lu\n", c->rtrx_timeout.tv_sec, c->rtrx_timeout.tv_nsec);
604 }
605
606 static void start_flush_timer(struct utcp_connection *c) {
607         clock_gettime(UTCP_CLOCK, &c->rtrx_timeout);
608
609         c->rtrx_timeout.tv_nsec += c->utcp->flush_timeout * 1000000;
610
611         if(c->rtrx_timeout.tv_nsec >= NSEC_PER_SEC) {
612                 c->rtrx_timeout.tv_nsec -= NSEC_PER_SEC;
613                 c->rtrx_timeout.tv_sec++;
614         }
615
616         debug(c, "rtrx_timeout %ld.%06lu (flush)\n", c->rtrx_timeout.tv_sec, c->rtrx_timeout.tv_nsec);
617 }
618
619 static void stop_retransmit_timer(struct utcp_connection *c) {
620         timespec_clear(&c->rtrx_timeout);
621         debug(c, "rtrx_timeout cleared\n");
622 }
623
624 struct utcp_connection *utcp_connect_ex(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv, uint32_t flags) {
625         struct utcp_connection *c = allocate_connection(utcp, 0, dst);
626
627         if(!c) {
628                 return NULL;
629         }
630
631         assert((flags & ~0x1f) == 0);
632
633         c->flags = flags;
634         c->recv = recv;
635         c->priv = priv;
636
637         struct {
638                 struct hdr hdr;
639                 uint8_t init[4];
640         } pkt;
641
642         pkt.hdr.src = c->src;
643         pkt.hdr.dst = c->dst;
644         pkt.hdr.seq = c->snd.iss;
645         pkt.hdr.ack = 0;
646         pkt.hdr.wnd = c->rcvbuf.maxsize;
647         pkt.hdr.ctl = SYN;
648         pkt.hdr.aux = 0x0101;
649         pkt.init[0] = 1;
650         pkt.init[1] = 0;
651         pkt.init[2] = 0;
652         pkt.init[3] = flags & 0x7;
653
654         set_state(c, SYN_SENT);
655
656         print_packet(c, "send", &pkt, sizeof(pkt));
657         utcp->send(utcp, &pkt, sizeof(pkt));
658
659         clock_gettime(UTCP_CLOCK, &c->conn_timeout);
660         c->conn_timeout.tv_sec += utcp->timeout;
661
662         start_retransmit_timer(c);
663
664         return c;
665 }
666
667 struct utcp_connection *utcp_connect(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv) {
668         return utcp_connect_ex(utcp, dst, recv, priv, UTCP_TCP);
669 }
670
671 void utcp_accept(struct utcp_connection *c, utcp_recv_t recv, void *priv) {
672         if(c->reapable || c->state != SYN_RECEIVED) {
673                 debug(c, "accept() called on invalid connection in state %s\n", c, strstate[c->state]);
674                 return;
675         }
676
677         debug(c, "accepted %p %p\n", c, recv, priv);
678         c->recv = recv;
679         c->priv = priv;
680         c->do_poll = true;
681         set_state(c, ESTABLISHED);
682 }
683
// Send pending data from the send buffer, and/or a bare ACK.
//
// As much unsent data (between snd.nxt and snd.last) as the window allows is
// sent in segments of at most one MSS. If sendatleastone is true a packet is
// sent even when there is nothing to send or the window is closed.
static void ack(struct utcp_connection *c, bool sendatleastone) {
	// Bytes queued but not yet sent
	int32_t left = seqdiff(c->snd.last, c->snd.nxt);
	// Room left in the window: for reliable connections the smaller of the
	// congestion and peer windows minus the bytes in flight; otherwise a fixed
	// MAX_UNRELIABLE_SIZE.
	int32_t cwndleft = is_reliable(c) ? min(c->snd.cwnd, c->snd.wnd) - seqdiff(c->snd.nxt, c->snd.una) : MAX_UNRELIABLE_SIZE;

	assert(left >= 0);

	if(cwndleft <= 0) {
		left = 0;
	} else if(cwndleft < left) {
		left = cwndleft;

		// Round down to a whole number of segments, except when a packet must
		// be sent anyway and the window holds at most one MSS.
		if(!sendatleastone || cwndleft > c->utcp->mss) {
			left -= left % c->utcp->mss;
		}
	}

	debug(c, "cwndleft %d left %d\n", cwndleft, left);

	if(!left && !sendatleastone) {
		return;
	}

	// The packet is assembled in the context's shared packet buffer
	struct {
		struct hdr hdr;
		uint8_t data[];
	} *pkt = c->utcp->pkt;

	pkt->hdr.src = c->src;
	pkt->hdr.dst = c->dst;
	pkt->hdr.ack = c->rcv.nxt;
	pkt->hdr.wnd = is_reliable(c) ? c->rcvbuf.maxsize : 0;
	pkt->hdr.ctl = ACK;
	pkt->hdr.aux = 0;

	do {
		uint32_t seglen = left > c->utcp->mss ? c->utcp->mss : left;
		pkt->hdr.seq = c->snd.nxt;

		// Unacknowledged data starts at snd.una, so nxt - una is the offset of
		// the next unsent byte within the send buffer
		buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);

		c->snd.nxt += seglen;
		left -= seglen;

		// On unreliable connections MF marks every packet that is followed by
		// another one from this call
		if(!is_reliable(c)) {
			if(left) {
				pkt->hdr.ctl |= MF;
			} else {
				pkt->hdr.ctl &= ~MF;
			}
		}

		// If this segment reaches snd.last while closing, its final sequence
		// number is sent as a FIN flag rather than as a payload byte
		if(seglen && fin_wanted(c, c->snd.nxt)) {
			seglen--;
			pkt->hdr.ctl |= FIN;
		}

		if(!c->rtt_start.tv_sec && is_reliable(c)) {
			// Start RTT measurement
			clock_gettime(UTCP_CLOCK, &c->rtt_start);
			c->rtt_seq = pkt->hdr.seq + seglen;
			debug(c, "starting RTT measurement, expecting ack %u\n", c->rtt_seq);
		}

		print_packet(c, "send", pkt, sizeof(pkt->hdr) + seglen);
		c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);

		// For unreliable connections wnd accumulates the bytes already sent in
		// this call. NOTE(review): presumably the receiver uses it as the
		// fragment offset -- confirm against the receive path.
		if(left && !is_reliable(c)) {
			pkt->hdr.wnd += seglen;
		}
	} while(left);
}
755
756 static ssize_t utcp_send_reliable(struct utcp_connection *c, const void *data, size_t len) {
757         size_t rlen = len + (is_framed(c) ? 2 : 0);
758
759         if(!rlen) {
760                 return 0;
761         }
762
763         // Check if we need to be able to buffer all data
764
765         if(c->flags & (UTCP_NO_PARTIAL | UTCP_FRAMED)) {
766                 if(rlen > c->sndbuf.maxsize) {
767                         errno = EMSGSIZE;
768                         return -1;
769                 }
770
771                 if((c->flags & UTCP_FRAMED) && len > MAX_UNRELIABLE_SIZE) {
772                         errno = EMSGSIZE;
773                         return -1;
774                 }
775
776                 if(rlen > buffer_free(&c->sndbuf)) {
777                         errno = EWOULDBLOCK;
778                         return 0;
779                 }
780         }
781
782         // Add data to the send buffer.
783
784         if(is_framed(c)) {
785                 uint16_t len16 = len;
786                 buffer_put(&c->sndbuf, &len16, sizeof(len16));
787                 assert(buffer_put(&c->sndbuf, data, len) == (ssize_t)len);
788         } else {
789                 len = buffer_put(&c->sndbuf, data, len);
790
791                 if(len <= 0) {
792                         errno = EWOULDBLOCK;
793                         return 0;
794                 }
795         }
796
797         c->snd.last += rlen;
798
799         // Don't send anything yet if the connection has not fully established yet
800
801         if(c->state == SYN_SENT || c->state == SYN_RECEIVED) {
802                 return len;
803         }
804
805         ack(c, false);
806
807         if(!timespec_isset(&c->rtrx_timeout)) {
808                 start_retransmit_timer(c);
809         }
810
811         if(!timespec_isset(&c->conn_timeout)) {
812                 clock_gettime(UTCP_CLOCK, &c->conn_timeout);
813                 c->conn_timeout.tv_sec += c->utcp->timeout;
814         }
815
816         return len;
817 }
818
819
/* In the send buffer we can have multiple frames, each prefixed with their
   length. However, the first frame might already have been partially sent. The
   variable c->frame_offset tracks how much of a partial frame is left at the
   start. If it is 0, it means there is no partial frame, and the first two
   bytes in the send buffer are the length of the first frame.

   After sending an MSS sized packet, we need to calculate the new frame_offset
   value, since it is likely that the next packet will also have a partial frame
   at the start. We do this by scanning the previously sent packet for frame
   headers, to find out how many bytes of the last frame are left to send.
*/
// Send every complete MSS-sized packet that the send buffer currently holds on
// an unreliable framed connection. Data that does not fill a whole packet stays
// buffered; the flush timer is (re)started for it if at least one packet was
// sent. Requires that there is unsent data (left > 0).
static void ack_unreliable_framed(struct utcp_connection *c) {
	int32_t left = seqdiff(c->snd.last, c->snd.nxt);
	assert(left > 0);

	// The packet is assembled in the context's shared packet buffer
	struct {
		struct hdr hdr;
		uint8_t data[];
	} *pkt = c->utcp->pkt;

	pkt->hdr.src = c->src;
	pkt->hdr.dst = c->dst;
	pkt->hdr.ack = c->rcv.nxt;
	pkt->hdr.ctl = ACK | MF;
	pkt->hdr.aux = 0;

	bool sent_packet = false;

	while(left >= c->utcp->mss) {
		// The wnd field carries the offset of the first frame header in this packet
		pkt->hdr.wnd = c->frame_offset;
		uint32_t seglen = c->utcp->mss;

		pkt->hdr.seq = c->snd.nxt;

		buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);

		// Nothing is kept for retransmission, so una follows nxt immediately
		c->snd.nxt += seglen;
		c->snd.una = c->snd.nxt;
		left -= seglen;

		print_packet(c, "send", pkt, sizeof(pkt->hdr) + seglen);
		c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
		sent_packet = true;

		// Calculate the new frame offset
		// Hop from frame header to frame header until the first header that
		// lies beyond the segment just sent
		while(c->frame_offset < seglen) {
			uint16_t framelen;
			buffer_copy(&c->sndbuf, &framelen, c->frame_offset, sizeof(framelen));
			c->frame_offset += framelen + 2;
		}

		// Drop the sent bytes and make the offset relative to the new buffer start
		buffer_discard(&c->sndbuf, seglen);
		c->frame_offset -= seglen;
	};

	if(sent_packet) {
		if(left) {
			// We sent one packet but we have partial data left, (re)start the flush timer
			start_flush_timer(c);
		} else {
			// There is no partial data in the send buffer, so stop the flush timer
			stop_retransmit_timer(c);
		}
	}
}
885
// Send whatever is still buffered on an unreliable framed connection, even if
// it does not fill a whole packet, then reset frame_offset and stop the
// flush timer.
static void flush_unreliable_framed(struct utcp_connection *c) {
	int32_t left = seqdiff(c->snd.last, c->snd.nxt);

	/* If the MSS dropped since last time ack_unreliable_framed() was called,
	  we might now have more than one segment worth of data left.
	*/
	if(left > c->utcp->mss) {
		ack_unreliable_framed(c);
		left = seqdiff(c->snd.last, c->snd.nxt);
		assert(left <= c->utcp->mss);
	}

	if(left) {
		// The packet is assembled in the context's shared packet buffer
		struct {
			struct hdr hdr;
			uint8_t data[];
		} *pkt = c->utcp->pkt;

		pkt->hdr.src = c->src;
		pkt->hdr.dst = c->dst;
		pkt->hdr.seq = c->snd.nxt;
		pkt->hdr.ack = c->rcv.nxt;
		// The wnd field carries the offset of the first frame header in this packet
		pkt->hdr.wnd = c->frame_offset;
		pkt->hdr.ctl = ACK | MF;
		pkt->hdr.aux = 0;

		uint32_t seglen = left;

		buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);
		buffer_discard(&c->sndbuf, seglen);

		// Nothing is kept for retransmission, so una follows nxt immediately
		c->snd.nxt += seglen;
		c->snd.una = c->snd.nxt;

		print_packet(c, "send", pkt, sizeof(pkt->hdr) + seglen);
		c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
	}

	// Everything buffered has been sent, so no partial frame is left at the start
	c->frame_offset = 0;
	stop_retransmit_timer(c);
}
927
928
929 static ssize_t utcp_send_unreliable(struct utcp_connection *c, const void *data, size_t len) {
930         if(len > MAX_UNRELIABLE_SIZE) {
931                 errno = EMSGSIZE;
932                 return -1;
933         }
934
935         size_t rlen = len + (is_framed(c) ? 2 : 0);
936
937         if(rlen > buffer_free(&c->sndbuf)) {
938                 if(rlen > c->sndbuf.maxsize) {
939                         errno = EMSGSIZE;
940                         return -1;
941                 } else {
942                         errno = EWOULDBLOCK;
943                         return 0;
944                 }
945         }
946
947         // Don't send anything yet if the connection has not fully established yet
948
949         if(c->state == SYN_SENT || c->state == SYN_RECEIVED) {
950                 return len;
951         }
952
953         if(is_framed(c)) {
954                 uint16_t framelen = len;
955                 buffer_put(&c->sndbuf, &framelen, sizeof(framelen));
956         }
957
958         buffer_put(&c->sndbuf, data, len);
959
960         c->snd.last += rlen;
961
962         if(is_framed(c)) {
963                 ack_unreliable_framed(c);
964         } else {
965                 ack(c, false);
966                 c->snd.una = c->snd.nxt = c->snd.last;
967                 buffer_discard(&c->sndbuf, c->sndbuf.used);
968         }
969
970         return len;
971 }
972
973 ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
974         if(c->reapable) {
975                 debug(c, "send() called on closed connection\n");
976                 errno = EBADF;
977                 return -1;
978         }
979
980         switch(c->state) {
981         case CLOSED:
982         case LISTEN:
983                 debug(c, "send() called on unconnected connection\n");
984                 errno = ENOTCONN;
985                 return -1;
986
987         case SYN_SENT:
988         case SYN_RECEIVED:
989         case ESTABLISHED:
990         case CLOSE_WAIT:
991                 break;
992
993         case FIN_WAIT_1:
994         case FIN_WAIT_2:
995         case CLOSING:
996         case LAST_ACK:
997         case TIME_WAIT:
998                 debug(c, "send() called on closed connection\n");
999                 errno = EPIPE;
1000                 return -1;
1001         }
1002
1003         if(!data && len) {
1004                 errno = EFAULT;
1005                 return -1;
1006         }
1007
1008         if(is_reliable(c)) {
1009                 return utcp_send_reliable(c, data, len);
1010         } else {
1011                 return utcp_send_unreliable(c, data, len);
1012         }
1013 }
1014
1015 static void swap_ports(struct hdr *hdr) {
1016         uint16_t tmp = hdr->src;
1017         hdr->src = hdr->dst;
1018         hdr->dst = tmp;
1019 }
1020
1021 static void fast_retransmit(struct utcp_connection *c) {
1022         if(c->state == CLOSED || c->snd.last == c->snd.una) {
1023                 debug(c, "fast_retransmit() called but nothing to retransmit!\n");
1024                 return;
1025         }
1026
1027         struct utcp *utcp = c->utcp;
1028
1029         struct {
1030                 struct hdr hdr;
1031                 uint8_t data[];
1032         } *pkt = c->utcp->pkt;
1033
1034         pkt->hdr.src = c->src;
1035         pkt->hdr.dst = c->dst;
1036         pkt->hdr.wnd = c->rcvbuf.maxsize;
1037         pkt->hdr.aux = 0;
1038
1039         switch(c->state) {
1040         case ESTABLISHED:
1041         case FIN_WAIT_1:
1042         case CLOSE_WAIT:
1043         case CLOSING:
1044         case LAST_ACK:
1045                 // Send unacked data again.
1046                 pkt->hdr.seq = c->snd.una;
1047                 pkt->hdr.ack = c->rcv.nxt;
1048                 pkt->hdr.ctl = ACK;
1049                 uint32_t len = min(seqdiff(c->snd.last, c->snd.una), utcp->mss);
1050
1051                 if(fin_wanted(c, c->snd.una + len)) {
1052                         len--;
1053                         pkt->hdr.ctl |= FIN;
1054                 }
1055
1056                 buffer_copy(&c->sndbuf, pkt->data, 0, len);
1057                 print_packet(c, "rtrx", pkt, sizeof(pkt->hdr) + len);
1058                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
1059                 break;
1060
1061         default:
1062                 break;
1063         }
1064 }
1065
1066 static void retransmit(struct utcp_connection *c) {
1067         if(c->state == CLOSED || c->snd.last == c->snd.una) {
1068                 debug(c, "retransmit() called but nothing to retransmit!\n");
1069                 stop_retransmit_timer(c);
1070                 return;
1071         }
1072
1073         struct utcp *utcp = c->utcp;
1074
1075         if(utcp->retransmit && is_reliable(c)) {
1076                 utcp->retransmit(c);
1077         }
1078
1079         struct {
1080                 struct hdr hdr;
1081                 uint8_t data[];
1082         } *pkt = c->utcp->pkt;
1083
1084         pkt->hdr.src = c->src;
1085         pkt->hdr.dst = c->dst;
1086         pkt->hdr.wnd = c->rcvbuf.maxsize;
1087         pkt->hdr.aux = 0;
1088
1089         switch(c->state) {
1090         case SYN_SENT:
1091                 // Send our SYN again
1092                 pkt->hdr.seq = c->snd.iss;
1093                 pkt->hdr.ack = 0;
1094                 pkt->hdr.ctl = SYN;
1095                 pkt->hdr.aux = 0x0101;
1096                 pkt->data[0] = 1;
1097                 pkt->data[1] = 0;
1098                 pkt->data[2] = 0;
1099                 pkt->data[3] = c->flags & 0x7;
1100                 print_packet(c, "rtrx", pkt, sizeof(pkt->hdr) + 4);
1101                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + 4);
1102                 break;
1103
1104         case SYN_RECEIVED:
1105                 // Send SYNACK again
1106                 pkt->hdr.seq = c->snd.nxt;
1107                 pkt->hdr.ack = c->rcv.nxt;
1108                 pkt->hdr.ctl = SYN | ACK;
1109                 print_packet(c, "rtrx", pkt, sizeof(pkt->hdr));
1110                 utcp->send(utcp, pkt, sizeof(pkt->hdr));
1111                 break;
1112
1113         case ESTABLISHED:
1114         case FIN_WAIT_1:
1115         case CLOSE_WAIT:
1116         case CLOSING:
1117         case LAST_ACK:
1118                 if(!is_reliable(c) && is_framed(c) && c->sndbuf.used) {
1119                         flush_unreliable_framed(c);
1120                         return;
1121                 }
1122
1123                 // Send unacked data again.
1124                 pkt->hdr.seq = c->snd.una;
1125                 pkt->hdr.ack = c->rcv.nxt;
1126                 pkt->hdr.ctl = ACK;
1127                 uint32_t len = min(seqdiff(c->snd.last, c->snd.una), utcp->mss);
1128
1129                 if(fin_wanted(c, c->snd.una + len)) {
1130                         len--;
1131                         pkt->hdr.ctl |= FIN;
1132                 }
1133
1134                 // RFC 5681 slow start after timeout
1135                 uint32_t flightsize = seqdiff(c->snd.nxt, c->snd.una);
1136                 c->snd.ssthresh = max(flightsize / 2, utcp->mss * 2); // eq. 4
1137                 c->snd.cwnd = utcp->mss;
1138                 debug_cwnd(c);
1139
1140                 buffer_copy(&c->sndbuf, pkt->data, 0, len);
1141                 print_packet(c, "rtrx", pkt, sizeof(pkt->hdr) + len);
1142                 utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
1143
1144                 c->snd.nxt = c->snd.una + len;
1145                 break;
1146
1147         case CLOSED:
1148         case LISTEN:
1149         case TIME_WAIT:
1150         case FIN_WAIT_2:
1151                 // We shouldn't need to retransmit anything in this state.
1152 #ifdef UTCP_DEBUG
1153                 abort();
1154 #endif
1155                 stop_retransmit_timer(c);
1156                 goto cleanup;
1157         }
1158
1159         start_retransmit_timer(c);
1160         c->rto *= 2;
1161
1162         if(c->rto > MAX_RTO) {
1163                 c->rto = MAX_RTO;
1164         }
1165
1166         c->rtt_start.tv_sec = 0; // invalidate RTT timer
1167         c->dupack = 0; // cancel any ongoing fast recovery
1168
1169 cleanup:
1170         return;
1171 }
1172
1173 /* Update receive buffer and SACK entries after consuming data.
1174  *
1175  * Situation:
1176  *
1177  * |.....0000..1111111111.....22222......3333|
1178  * |---------------^
1179  *
1180  * 0..3 represent the SACK entries. The ^ indicates up to which point we want
1181  * to remove data from the receive buffer. The idea is to substract "len"
1182  * from the offset of all the SACK entries, and then remove/cut down entries
1183  * that are shifted to before the start of the receive buffer.
1184  *
1185  * There are three cases:
1186  * - the SACK entry is after ^, in that case just change the offset.
1187  * - the SACK entry starts before and ends after ^, so we have to
1188  *   change both its offset and size.
1189  * - the SACK entry is completely before ^, in that case delete it.
1190  */
1191 static void sack_consume(struct utcp_connection *c, size_t len) {
1192         debug(c, "sack_consume %lu\n", (unsigned long)len);
1193
1194         if(len > c->rcvbuf.used) {
1195                 debug(c, "all SACK entries consumed\n");
1196                 c->sacks[0].len = 0;
1197                 return;
1198         }
1199
1200         buffer_discard(&c->rcvbuf, len);
1201
1202         for(int i = 0; i < NSACKS && c->sacks[i].len;) {
1203                 if(len < c->sacks[i].offset) {
1204                         c->sacks[i].offset -= len;
1205                         i++;
1206                 } else if(len < c->sacks[i].offset + c->sacks[i].len) {
1207                         c->sacks[i].len -= len - c->sacks[i].offset;
1208                         c->sacks[i].offset = 0;
1209                         i++;
1210                 } else {
1211                         if(i < NSACKS - 1) {
1212                                 memmove(&c->sacks[i], &c->sacks[i + 1], (NSACKS - 1 - i) * sizeof(c->sacks)[i]);
1213                                 c->sacks[NSACKS - 1].len = 0;
1214                         } else {
1215                                 c->sacks[i].len = 0;
1216                                 break;
1217                         }
1218                 }
1219         }
1220
1221         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
1222                 debug(c, "SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
1223         }
1224 }
1225
1226 static void handle_out_of_order(struct utcp_connection *c, uint32_t offset, const void *data, size_t len) {
1227         debug(c, "out of order packet, offset %u\n", offset);
1228         // Packet loss or reordering occured. Store the data in the buffer.
1229         ssize_t rxd = buffer_put_at(&c->rcvbuf, offset, data, len);
1230
1231         if(rxd <= 0) {
1232                 debug(c, "packet outside receive buffer, dropping\n");
1233                 return;
1234         }
1235
1236         if((size_t)rxd < len) {
1237                 debug(c, "packet partially outside receive buffer\n");
1238                 len = rxd;
1239         }
1240
1241         // Make note of where we put it.
1242         for(int i = 0; i < NSACKS; i++) {
1243                 if(!c->sacks[i].len) { // nothing to merge, add new entry
1244                         debug(c, "new SACK entry %d\n", i);
1245                         c->sacks[i].offset = offset;
1246                         c->sacks[i].len = rxd;
1247                         break;
1248                 } else if(offset < c->sacks[i].offset) {
1249                         if(offset + rxd < c->sacks[i].offset) { // insert before
1250                                 if(!c->sacks[NSACKS - 1].len) { // only if room left
1251                                         debug(c, "insert SACK entry at %d\n", i);
1252                                         memmove(&c->sacks[i + 1], &c->sacks[i], (NSACKS - i - 1) * sizeof(c->sacks)[i]);
1253                                         c->sacks[i].offset = offset;
1254                                         c->sacks[i].len = rxd;
1255                                 } else {
1256                                         debug(c, "SACK entries full, dropping packet\n");
1257                                 }
1258
1259                                 break;
1260                         } else { // merge
1261                                 debug(c, "merge with start of SACK entry at %d\n", i);
1262                                 c->sacks[i].offset = offset;
1263                                 break;
1264                         }
1265                 } else if(offset <= c->sacks[i].offset + c->sacks[i].len) {
1266                         if(offset + rxd > c->sacks[i].offset + c->sacks[i].len) { // merge
1267                                 debug(c, "merge with end of SACK entry at %d\n", i);
1268                                 c->sacks[i].len = offset + rxd - c->sacks[i].offset;
1269                                 // TODO: handle potential merge with next entry
1270                         }
1271
1272                         break;
1273                 }
1274         }
1275
1276         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
1277                 debug(c, "SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
1278         }
1279 }
1280
1281 static void handle_out_of_order_framed(struct utcp_connection *c, uint32_t offset, const void *data, size_t len) {
1282         uint32_t in_order_offset = (c->sacks[0].len && c->sacks[0].offset == 0) ? c->sacks[0].len : 0;
1283
1284         // Put the data into the receive buffer
1285         handle_out_of_order(c, offset + in_order_offset, data, len);
1286 }
1287
1288 static void handle_in_order(struct utcp_connection *c, const void *data, size_t len) {
1289         if(c->recv) {
1290                 ssize_t rxd = c->recv(c, data, len);
1291
1292                 if(rxd != (ssize_t)len) {
1293                         // TODO: handle the application not accepting all data.
1294                         abort();
1295                 }
1296         }
1297
1298         // Check if we can process out-of-order data now.
1299         if(c->sacks[0].len && len >= c->sacks[0].offset) {
1300                 debug(c, "incoming packet len %lu connected with SACK at %u\n", (unsigned long)len, c->sacks[0].offset);
1301
1302                 if(len < c->sacks[0].offset + c->sacks[0].len) {
1303                         size_t offset = len;
1304                         len = c->sacks[0].offset + c->sacks[0].len;
1305                         size_t remainder = len - offset;
1306
1307                         ssize_t rxd = buffer_call(c, &c->rcvbuf, offset, remainder);
1308
1309                         if(rxd != (ssize_t)remainder) {
1310                                 // TODO: handle the application not accepting all data.
1311                                 abort();
1312                         }
1313                 }
1314         }
1315
1316         if(c->rcvbuf.used) {
1317                 sack_consume(c, len);
1318         }
1319
1320         c->rcv.nxt += len;
1321 }
1322
1323 static void handle_in_order_framed(struct utcp_connection *c, const void *data, size_t len) {
1324         // Treat it as out of order, since it is unlikely the start of this packet contains the start of a frame.
1325         uint32_t in_order_offset = (c->sacks[0].len && c->sacks[0].offset == 0) ? c->sacks[0].len : 0;
1326         handle_out_of_order(c, in_order_offset, data, len);
1327
1328         // While we have full frames at the start, give them to the application
1329         while(c->sacks[0].len >= 2 && c->sacks[0].offset == 0) {
1330                 uint16_t framelen;
1331                 buffer_copy(&c->rcvbuf, &framelen, 0, sizeof(&framelen));
1332
1333                 if(framelen > c->sacks[0].len - 2) {
1334                         break;
1335                 }
1336
1337                 if(c->recv) {
1338                         ssize_t rxd;
1339                         uint32_t realoffset = c->rcvbuf.offset + 2;
1340
1341                         if(c->rcvbuf.size - c->rcvbuf.offset <= 2) {
1342                                 realoffset -= c->rcvbuf.size;
1343                         }
1344
1345                         if(realoffset > c->rcvbuf.size - framelen) {
1346                                 // The buffer wraps, we need to copy
1347                                 uint8_t buf[framelen];
1348                                 buffer_copy(&c->rcvbuf, buf, 2, framelen);
1349                                 rxd = c->recv(c, buf, framelen);
1350                         } else {
1351                                 // The frame is contiguous in the receive buffer
1352                                 rxd = c->recv(c, c->rcvbuf.data + realoffset, framelen);
1353                         }
1354
1355                         if(rxd != (ssize_t)framelen) {
1356                                 // TODO: handle the application not accepting all data.
1357                                 abort();
1358                         }
1359                 }
1360
1361                 sack_consume(c, framelen + 2);
1362         }
1363
1364         c->rcv.nxt += len;
1365 }
1366
1367 static void handle_unreliable(struct utcp_connection *c, const struct hdr *hdr, const void *data, size_t len) {
1368         // Fast path for unfragmented packets
1369         if(!hdr->wnd && !(hdr->ctl & MF)) {
1370                 if(c->recv) {
1371                         c->recv(c, data, len);
1372                 }
1373
1374                 c->rcv.nxt = hdr->seq + len;
1375                 return;
1376         }
1377
1378         // Ensure reassembled packet are not larger than 64 kiB
1379         if(hdr->wnd > MAX_UNRELIABLE_SIZE || hdr->wnd + len > MAX_UNRELIABLE_SIZE) {
1380                 return;
1381         }
1382
1383         // Don't accept out of order fragments
1384         if(hdr->wnd && hdr->seq != c->rcv.nxt) {
1385                 return;
1386         }
1387
1388         // Reset the receive buffer for the first fragment
1389         if(!hdr->wnd) {
1390                 buffer_clear(&c->rcvbuf);
1391         }
1392
1393         ssize_t rxd = buffer_put_at(&c->rcvbuf, hdr->wnd, data, len);
1394
1395         if(rxd != (ssize_t)len) {
1396                 return;
1397         }
1398
1399         // Send the packet if it's the final fragment
1400         if(!(hdr->ctl & MF)) {
1401                 buffer_call(c, &c->rcvbuf, 0, hdr->wnd + len);
1402         }
1403
1404         c->rcv.nxt = hdr->seq + len;
1405 }
1406
1407 static void handle_unreliable_framed(struct utcp_connection *c, const struct hdr *hdr, const void *data, size_t len) {
1408         bool in_order = hdr->seq == c->rcv.nxt;
1409         c->rcv.nxt = hdr->seq + len;
1410
1411         const uint8_t *ptr = data;
1412         size_t left = len;
1413
1414         // Does it start with a partial frame?
1415         if(hdr->wnd) {
1416                 // Only accept the data if it is in order
1417                 if(in_order && c->rcvbuf.used) {
1418                         // In order, append it to the receive buffer
1419                         buffer_put(&c->rcvbuf, data, min(hdr->wnd, len));
1420
1421                         if(hdr->wnd <= len) {
1422                                 // We have a full frame
1423                                 c->recv(c, (uint8_t *)c->rcvbuf.data + 2, c->rcvbuf.used - 2);
1424                         }
1425                 }
1426
1427                 // Exit early if there is other data in this frame
1428                 if(hdr->wnd > len) {
1429                         if(!in_order) {
1430                                 buffer_clear(&c->rcvbuf);
1431                         }
1432
1433                         return;
1434                 }
1435
1436                 ptr += hdr->wnd;
1437                 left -= hdr->wnd;
1438         }
1439
1440         // We now start with new frames, so clear any data in the receive buffer
1441         buffer_clear(&c->rcvbuf);
1442
1443         // Handle whole frames
1444         while(left > 2) {
1445                 uint16_t framelen;
1446                 memcpy(&framelen, ptr, sizeof(framelen));
1447
1448                 if(left <= (size_t)framelen + 2) {
1449                         break;
1450                 }
1451
1452                 c->recv(c, ptr + 2, framelen);
1453                 ptr += framelen + 2;
1454                 left -= framelen + 2;
1455         }
1456
1457         // Handle partial last frame
1458         if(left) {
1459                 buffer_put(&c->rcvbuf, ptr, left);
1460         }
1461 }
1462
1463 static void handle_incoming_data(struct utcp_connection *c, const struct hdr *hdr, const void *data, size_t len) {
1464         if(!is_reliable(c)) {
1465                 if(is_framed(c)) {
1466                         handle_unreliable_framed(c, hdr, data, len);
1467                 } else {
1468                         handle_unreliable(c, hdr, data, len);
1469                 }
1470
1471                 return;
1472         }
1473
1474         uint32_t offset = seqdiff(hdr->seq, c->rcv.nxt);
1475
1476         if(is_framed(c)) {
1477                 if(offset) {
1478                         handle_out_of_order_framed(c, offset, data, len);
1479                 } else {
1480                         handle_in_order_framed(c, data, len);
1481                 }
1482         } else {
1483                 if(offset) {
1484                         handle_out_of_order(c, offset, data, len);
1485                 } else {
1486                         handle_in_order(c, data, len);
1487                 }
1488         }
1489 }
1490
1491
1492 ssize_t utcp_recv(struct utcp *utcp, const void *data, size_t len) {
1493         const uint8_t *ptr = data;
1494
1495         if(!utcp) {
1496                 errno = EFAULT;
1497                 return -1;
1498         }
1499
1500         if(!len) {
1501                 return 0;
1502         }
1503
1504         if(!data) {
1505                 errno = EFAULT;
1506                 return -1;
1507         }
1508
1509         // Drop packets smaller than the header
1510
1511         struct hdr hdr;
1512
1513         if(len < sizeof(hdr)) {
1514                 print_packet(NULL, "recv", data, len);
1515                 errno = EBADMSG;
1516                 return -1;
1517         }
1518
1519         // Make a copy from the potentially unaligned data to a struct hdr
1520
1521         memcpy(&hdr, ptr, sizeof(hdr));
1522
1523         // Try to match the packet to an existing connection
1524
1525         struct utcp_connection *c = find_connection(utcp, hdr.dst, hdr.src);
1526         print_packet(c, "recv", data, len);
1527
1528         // Process the header
1529
1530         ptr += sizeof(hdr);
1531         len -= sizeof(hdr);
1532
1533         // Drop packets with an unknown CTL flag
1534
1535         if(hdr.ctl & ~(SYN | ACK | RST | FIN | MF)) {
1536                 print_packet(NULL, "recv", data, len);
1537                 errno = EBADMSG;
1538                 return -1;
1539         }
1540
1541         // Check for auxiliary headers
1542
1543         const uint8_t *init = NULL;
1544
1545         uint16_t aux = hdr.aux;
1546
1547         while(aux) {
1548                 size_t auxlen = 4 * (aux >> 8) & 0xf;
1549                 uint8_t auxtype = aux & 0xff;
1550
1551                 if(len < auxlen) {
1552                         errno = EBADMSG;
1553                         return -1;
1554                 }
1555
1556                 switch(auxtype) {
1557                 case AUX_INIT:
1558                         if(!(hdr.ctl & SYN) || auxlen != 4) {
1559                                 errno = EBADMSG;
1560                                 return -1;
1561                         }
1562
1563                         init = ptr;
1564                         break;
1565
1566                 default:
1567                         errno = EBADMSG;
1568                         return -1;
1569                 }
1570
1571                 len -= auxlen;
1572                 ptr += auxlen;
1573
1574                 if(!(aux & 0x800)) {
1575                         break;
1576                 }
1577
1578                 if(len < 2) {
1579                         errno = EBADMSG;
1580                         return -1;
1581                 }
1582
1583                 memcpy(&aux, ptr, 2);
1584                 len -= 2;
1585                 ptr += 2;
1586         }
1587
1588         bool has_data = len || (hdr.ctl & (SYN | FIN));
1589
1590         // Is it for a new connection?
1591
1592         if(!c) {
1593                 // Ignore RST packets
1594
1595                 if(hdr.ctl & RST) {
1596                         return 0;
1597                 }
1598
1599                 // Is it a SYN packet and are we LISTENing?
1600
1601                 if(hdr.ctl & SYN && !(hdr.ctl & ACK) && utcp->accept) {
1602                         // If we don't want to accept it, send a RST back
1603                         if((utcp->pre_accept && !utcp->pre_accept(utcp, hdr.dst))) {
1604                                 len = 1;
1605                                 goto reset;
1606                         }
1607
1608                         // Try to allocate memory, otherwise send a RST back
1609                         c = allocate_connection(utcp, hdr.dst, hdr.src);
1610
1611                         if(!c) {
1612                                 len = 1;
1613                                 goto reset;
1614                         }
1615
1616                         // Parse auxilliary information
1617                         if(init) {
1618                                 if(init[0] < 1) {
1619                                         len = 1;
1620                                         goto reset;
1621                                 }
1622
1623                                 c->flags = init[3] & 0x7;
1624                         } else {
1625                                 c->flags = UTCP_TCP;
1626                         }
1627
1628 synack:
1629                         // Return SYN+ACK, go to SYN_RECEIVED state
1630                         c->snd.wnd = hdr.wnd;
1631                         c->rcv.irs = hdr.seq;
1632                         c->rcv.nxt = c->rcv.irs + 1;
1633                         set_state(c, SYN_RECEIVED);
1634
1635                         struct {
1636                                 struct hdr hdr;
1637                                 uint8_t data[4];
1638                         } pkt;
1639
1640                         pkt.hdr.src = c->src;
1641                         pkt.hdr.dst = c->dst;
1642                         pkt.hdr.ack = c->rcv.irs + 1;
1643                         pkt.hdr.seq = c->snd.iss;
1644                         pkt.hdr.wnd = c->rcvbuf.maxsize;
1645                         pkt.hdr.ctl = SYN | ACK;
1646
1647                         if(init) {
1648                                 pkt.hdr.aux = 0x0101;
1649                                 pkt.data[0] = 1;
1650                                 pkt.data[1] = 0;
1651                                 pkt.data[2] = 0;
1652                                 pkt.data[3] = c->flags & 0x7;
1653                                 print_packet(c, "send", &pkt, sizeof(hdr) + 4);
1654                                 utcp->send(utcp, &pkt, sizeof(hdr) + 4);
1655                         } else {
1656                                 pkt.hdr.aux = 0;
1657                                 print_packet(c, "send", &pkt, sizeof(hdr));
1658                                 utcp->send(utcp, &pkt, sizeof(hdr));
1659                         }
1660
1661                         start_retransmit_timer(c);
1662                 } else {
1663                         // No, we don't want your packets, send a RST back
1664                         len = 1;
1665                         goto reset;
1666                 }
1667
1668                 return 0;
1669         }
1670
1671         debug(c, "state %s\n", strstate[c->state]);
1672
1673         // In case this is for a CLOSED connection, ignore the packet.
1674         // TODO: make it so incoming packets can never match a CLOSED connection.
1675
1676         if(c->state == CLOSED) {
1677                 debug(c, "got packet for closed connection\n");
1678                 return 0;
1679         }
1680
1681         // It is for an existing connection.
1682
1683         // 1. Drop invalid packets.
1684
1685         // 1a. Drop packets that should not happen in our current state.
1686
1687         switch(c->state) {
1688         case SYN_SENT:
1689         case SYN_RECEIVED:
1690         case ESTABLISHED:
1691         case FIN_WAIT_1:
1692         case FIN_WAIT_2:
1693         case CLOSE_WAIT:
1694         case CLOSING:
1695         case LAST_ACK:
1696         case TIME_WAIT:
1697                 break;
1698
1699         default:
1700 #ifdef UTCP_DEBUG
1701                 abort();
1702 #endif
1703                 break;
1704         }
1705
1706         // 1b. Discard data that is not in our receive window.
1707
1708         if(is_reliable(c)) {
1709                 bool acceptable;
1710
1711                 if(c->state == SYN_SENT) {
1712                         acceptable = true;
1713                 } else if(len == 0) {
1714                         acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0;
1715                 } else {
1716                         int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);
1717
1718                         // cut already accepted front overlapping
1719                         if(rcv_offset < 0) {
1720                                 acceptable = len > (size_t) - rcv_offset;
1721
1722                                 if(acceptable) {
1723                                         ptr -= rcv_offset;
1724                                         len += rcv_offset;
1725                                         hdr.seq -= rcv_offset;
1726                                 }
1727                         } else {
1728                                 acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0 && seqdiff(hdr.seq, c->rcv.nxt) + len <= c->rcvbuf.maxsize;
1729                         }
1730                 }
1731
1732                 if(!acceptable) {
1733                         debug(c, "packet not acceptable, %u <= %u + %lu < %u\n", c->rcv.nxt, hdr.seq, (unsigned long)len, c->rcv.nxt + c->rcvbuf.maxsize);
1734
1735                         // Ignore unacceptable RST packets.
1736                         if(hdr.ctl & RST) {
1737                                 return 0;
1738                         }
1739
1740                         // Otherwise, continue processing.
1741                         len = 0;
1742                 }
1743         } else {
1744 #if UTCP_DEBUG
1745                 int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);
1746
1747                 if(rcv_offset) {
1748                         debug(c, "packet out of order, offset %u bytes\n", rcv_offset);
1749                 }
1750
1751 #endif
1752         }
1753
1754         c->snd.wnd = hdr.wnd; // TODO: move below
1755
1756         // 1c. Drop packets with an invalid ACK.
1757         // ackno should not roll back, and it should also not be bigger than what we ever could have sent
1758         // (= snd.una + c->sndbuf.used).
1759
1760         if(!is_reliable(c)) {
1761                 if(hdr.ack != c->snd.last && c->state >= ESTABLISHED) {
1762                         hdr.ack = c->snd.una;
1763                 }
1764         }
1765
1766         if(hdr.ctl & ACK && (seqdiff(hdr.ack, c->snd.last) > 0 || seqdiff(hdr.ack, c->snd.una) < 0)) {
1767                 debug(c, "packet ack seqno out of range, %u <= %u < %u\n", c->snd.una, hdr.ack, c->snd.una + c->sndbuf.used);
1768
1769                 // Ignore unacceptable RST packets.
1770                 if(hdr.ctl & RST) {
1771                         return 0;
1772                 }
1773
1774                 goto reset;
1775         }
1776
1777         // 2. Handle RST packets
1778
1779         if(hdr.ctl & RST) {
1780                 switch(c->state) {
1781                 case SYN_SENT:
1782                         if(!(hdr.ctl & ACK)) {
1783                                 return 0;
1784                         }
1785
1786                         // The peer has refused our connection.
1787                         set_state(c, CLOSED);
1788                         errno = ECONNREFUSED;
1789
1790                         if(c->recv) {
1791                                 c->recv(c, NULL, 0);
1792                         }
1793
1794                         if(c->poll && !c->reapable) {
1795                                 c->poll(c, 0);
1796                         }
1797
1798                         return 0;
1799
1800                 case SYN_RECEIVED:
1801                         if(hdr.ctl & ACK) {
1802                                 return 0;
1803                         }
1804
1805                         // We haven't told the application about this connection yet. Silently delete.
1806                         free_connection(c);
1807                         return 0;
1808
1809                 case ESTABLISHED:
1810                 case FIN_WAIT_1:
1811                 case FIN_WAIT_2:
1812                 case CLOSE_WAIT:
1813                         if(hdr.ctl & ACK) {
1814                                 return 0;
1815                         }
1816
1817                         // The peer has aborted our connection.
1818                         set_state(c, CLOSED);
1819                         errno = ECONNRESET;
1820
1821                         if(c->recv) {
1822                                 c->recv(c, NULL, 0);
1823                         }
1824
1825                         if(c->poll && !c->reapable) {
1826                                 c->poll(c, 0);
1827                         }
1828
1829                         return 0;
1830
1831                 case CLOSING:
1832                 case LAST_ACK:
1833                 case TIME_WAIT:
1834                         if(hdr.ctl & ACK) {
1835                                 return 0;
1836                         }
1837
1838                         // As far as the application is concerned, the connection has already been closed.
1839                         // If it has called utcp_close() already, we can immediately free this connection.
1840                         if(c->reapable) {
1841                                 free_connection(c);
1842                                 return 0;
1843                         }
1844
1845                         // Otherwise, immediately move to the CLOSED state.
1846                         set_state(c, CLOSED);
1847                         return 0;
1848
1849                 default:
1850 #ifdef UTCP_DEBUG
1851                         abort();
1852 #endif
1853                         break;
1854                 }
1855         }
1856
1857         uint32_t advanced;
1858
1859         if(!(hdr.ctl & ACK)) {
1860                 advanced = 0;
1861                 goto skip_ack;
1862         }
1863
1864         // 3. Advance snd.una
1865
1866         advanced = seqdiff(hdr.ack, c->snd.una);
1867
1868         if(advanced) {
1869                 // RTT measurement
1870                 if(c->rtt_start.tv_sec) {
1871                         if(c->rtt_seq == hdr.ack) {
1872                                 struct timespec now;
1873                                 clock_gettime(UTCP_CLOCK, &now);
1874                                 int32_t diff = timespec_diff_usec(&now, &c->rtt_start);
1875                                 update_rtt(c, diff);
1876                                 c->rtt_start.tv_sec = 0;
1877                         } else if(c->rtt_seq < hdr.ack) {
1878                                 debug(c, "cancelling RTT measurement: %u < %u\n", c->rtt_seq, hdr.ack);
1879                                 c->rtt_start.tv_sec = 0;
1880                         }
1881                 }
1882
1883                 int32_t data_acked = advanced;
1884
1885                 switch(c->state) {
1886                 case SYN_SENT:
1887                 case SYN_RECEIVED:
1888                         data_acked--;
1889                         break;
1890
1891                 // TODO: handle FIN as well.
1892                 default:
1893                         break;
1894                 }
1895
1896                 assert(data_acked >= 0);
1897
1898 #ifndef NDEBUG
1899                 int32_t bufused = seqdiff(c->snd.last, c->snd.una);
1900                 assert(data_acked <= bufused);
1901 #endif
1902
1903                 if(data_acked) {
1904                         buffer_discard(&c->sndbuf, data_acked);
1905
1906                         if(is_reliable(c)) {
1907                                 c->do_poll = true;
1908                         }
1909                 }
1910
1911                 // Also advance snd.nxt if possible
1912                 if(seqdiff(c->snd.nxt, hdr.ack) < 0) {
1913                         c->snd.nxt = hdr.ack;
1914                 }
1915
1916                 c->snd.una = hdr.ack;
1917
1918                 if(c->dupack) {
1919                         if(c->dupack >= 3) {
1920                                 debug(c, "fast recovery ended\n");
1921                                 c->snd.cwnd = c->snd.ssthresh;
1922                         }
1923
1924                         c->dupack = 0;
1925                 }
1926
1927                 // Increase the congestion window according to RFC 5681
1928                 if(c->snd.cwnd < c->snd.ssthresh) {
1929                         c->snd.cwnd += min(advanced, utcp->mss); // eq. 2
1930                 } else {
1931                         c->snd.cwnd += max(1, (utcp->mss * utcp->mss) / c->snd.cwnd); // eq. 3
1932                 }
1933
1934                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1935                         c->snd.cwnd = c->sndbuf.maxsize;
1936                 }
1937
1938                 debug_cwnd(c);
1939
1940                 // Check if we have sent a FIN that is now ACKed.
1941                 switch(c->state) {
1942                 case FIN_WAIT_1:
1943                         if(c->snd.una == c->snd.last) {
1944                                 set_state(c, FIN_WAIT_2);
1945                         }
1946
1947                         break;
1948
1949                 case CLOSING:
1950                         if(c->snd.una == c->snd.last) {
1951                                 clock_gettime(UTCP_CLOCK, &c->conn_timeout);
1952                                 c->conn_timeout.tv_sec += utcp->timeout;
1953                                 set_state(c, TIME_WAIT);
1954                         }
1955
1956                         break;
1957
1958                 default:
1959                         break;
1960                 }
1961         } else {
1962                 if(!len && is_reliable(c) && c->snd.una != c->snd.last) {
1963                         c->dupack++;
1964                         debug(c, "duplicate ACK %d\n", c->dupack);
1965
1966                         if(c->dupack == 3) {
1967                                 // RFC 5681 fast recovery
1968                                 debug(c, "fast recovery started\n", c->dupack);
1969                                 uint32_t flightsize = seqdiff(c->snd.nxt, c->snd.una);
1970                                 c->snd.ssthresh = max(flightsize / 2, utcp->mss * 2); // eq. 4
1971                                 c->snd.cwnd = min(c->snd.ssthresh + 3 * utcp->mss, c->sndbuf.maxsize);
1972
1973                                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1974                                         c->snd.cwnd = c->sndbuf.maxsize;
1975                                 }
1976
1977                                 debug_cwnd(c);
1978
1979                                 fast_retransmit(c);
1980                         } else if(c->dupack > 3) {
1981                                 c->snd.cwnd += utcp->mss;
1982
1983                                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1984                                         c->snd.cwnd = c->sndbuf.maxsize;
1985                                 }
1986
1987                                 debug_cwnd(c);
1988                         }
1989
1990                         // We got an ACK which indicates the other side did get one of our packets.
1991                         // Reset the retransmission timer to avoid going to slow start,
1992                         // but don't touch the connection timeout.
1993                         start_retransmit_timer(c);
1994                 }
1995         }
1996
1997         // 4. Update timers
1998
1999         if(advanced) {
2000                 if(c->snd.una == c->snd.last) {
2001                         stop_retransmit_timer(c);
2002                         timespec_clear(&c->conn_timeout);
2003                 } else if(is_reliable(c)) {
2004                         start_retransmit_timer(c);
2005                         clock_gettime(UTCP_CLOCK, &c->conn_timeout);
2006                         c->conn_timeout.tv_sec += utcp->timeout;
2007                 }
2008         }
2009
2010 skip_ack:
2011         // 5. Process SYN stuff
2012
2013         if(hdr.ctl & SYN) {
2014                 switch(c->state) {
2015                 case SYN_SENT:
2016
2017                         // This is a SYNACK. It should always have ACKed the SYN.
2018                         if(!advanced) {
2019                                 goto reset;
2020                         }
2021
2022                         c->rcv.irs = hdr.seq;
2023                         c->rcv.nxt = hdr.seq + 1;
2024
2025                         if(c->shut_wr) {
2026                                 c->snd.last++;
2027                                 set_state(c, FIN_WAIT_1);
2028                         } else {
2029                                 c->do_poll = true;
2030                                 set_state(c, ESTABLISHED);
2031                         }
2032
2033                         break;
2034
2035                 case SYN_RECEIVED:
2036                         // This is a retransmit of a SYN, send back the SYNACK.
2037                         goto synack;
2038
2039                 case ESTABLISHED:
2040                 case FIN_WAIT_1:
2041                 case FIN_WAIT_2:
2042                 case CLOSE_WAIT:
2043                 case CLOSING:
2044                 case LAST_ACK:
2045                 case TIME_WAIT:
2046                         // This could be a retransmission. Ignore the SYN flag, but send an ACK back.
2047                         break;
2048
2049                 default:
2050 #ifdef UTCP_DEBUG
2051                         abort();
2052 #endif
2053                         return 0;
2054                 }
2055         }
2056
2057         // 6. Process new data
2058
2059         if(c->state == SYN_RECEIVED) {
2060                 // This is the ACK after the SYNACK. It should always have ACKed the SYNACK.
2061                 if(!advanced) {
2062                         goto reset;
2063                 }
2064
2065                 // Are we still LISTENing?
2066                 if(utcp->accept) {
2067                         utcp->accept(c, c->src);
2068                 }
2069
2070                 if(c->state != ESTABLISHED) {
2071                         set_state(c, CLOSED);
2072                         c->reapable = true;
2073                         goto reset;
2074                 }
2075         }
2076
2077         if(len) {
2078                 switch(c->state) {
2079                 case SYN_SENT:
2080                 case SYN_RECEIVED:
2081                         // This should never happen.
2082 #ifdef UTCP_DEBUG
2083                         abort();
2084 #endif
2085                         return 0;
2086
2087                 case ESTABLISHED:
2088                 case FIN_WAIT_1:
2089                 case FIN_WAIT_2:
2090                         break;
2091
2092                 case CLOSE_WAIT:
2093                 case CLOSING:
2094                 case LAST_ACK:
2095                 case TIME_WAIT:
2096                         // Ehm no, We should never receive more data after a FIN.
2097                         goto reset;
2098
2099                 default:
2100 #ifdef UTCP_DEBUG
2101                         abort();
2102 #endif
2103                         return 0;
2104                 }
2105
2106                 handle_incoming_data(c, &hdr, ptr, len);
2107         }
2108
2109         // 7. Process FIN stuff
2110
2111         if((hdr.ctl & FIN) && (!is_reliable(c) || hdr.seq + len == c->rcv.nxt)) {
2112                 switch(c->state) {
2113                 case SYN_SENT:
2114                 case SYN_RECEIVED:
2115                         // This should never happen.
2116 #ifdef UTCP_DEBUG
2117                         abort();
2118 #endif
2119                         break;
2120
2121                 case ESTABLISHED:
2122                         set_state(c, CLOSE_WAIT);
2123                         break;
2124
2125                 case FIN_WAIT_1:
2126                         set_state(c, CLOSING);
2127                         break;
2128
2129                 case FIN_WAIT_2:
2130                         clock_gettime(UTCP_CLOCK, &c->conn_timeout);
2131                         c->conn_timeout.tv_sec += utcp->timeout;
2132                         set_state(c, TIME_WAIT);
2133                         break;
2134
2135                 case CLOSE_WAIT:
2136                 case CLOSING:
2137                 case LAST_ACK:
2138                 case TIME_WAIT:
2139                         // Ehm, no. We should never receive a second FIN.
2140                         goto reset;
2141
2142                 default:
2143 #ifdef UTCP_DEBUG
2144                         abort();
2145 #endif
2146                         break;
2147                 }
2148
2149                 // FIN counts as one sequence number
2150                 c->rcv.nxt++;
2151                 len++;
2152
2153                 // Inform the application that the peer closed its end of the connection.
2154                 if(c->recv) {
2155                         errno = 0;
2156                         c->recv(c, NULL, 0);
2157                 }
2158         }
2159
2160         // Now we send something back if:
2161         // - we received data, so we have to send back an ACK
2162         //   -> sendatleastone = true
2163         // - or we got an ack, so we should maybe send a bit more data
2164         //   -> sendatleastone = false
2165
2166         if(is_reliable(c) || hdr.ctl & SYN || hdr.ctl & FIN) {
2167                 ack(c, has_data);
2168         }
2169
2170         return 0;
2171
2172 reset:
2173         swap_ports(&hdr);
2174         hdr.wnd = 0;
2175         hdr.aux = 0;
2176
2177         if(hdr.ctl & ACK) {
2178                 hdr.seq = hdr.ack;
2179                 hdr.ctl = RST;
2180         } else {
2181                 hdr.ack = hdr.seq + len;
2182                 hdr.seq = 0;
2183                 hdr.ctl = RST | ACK;
2184         }
2185
2186         print_packet(c, "send", &hdr, sizeof(hdr));
2187         utcp->send(utcp, &hdr, sizeof(hdr));
2188         return 0;
2189
2190 }
2191
2192 int utcp_shutdown(struct utcp_connection *c, int dir) {
2193         debug(c, "shutdown %d at %u\n", dir, c ? c->snd.last : 0);
2194
2195         if(!c) {
2196                 errno = EFAULT;
2197                 return -1;
2198         }
2199
2200         if(c->reapable) {
2201                 debug(c, "shutdown() called on closed connection\n");
2202                 errno = EBADF;
2203                 return -1;
2204         }
2205
2206         if(!(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_WR || dir == UTCP_SHUT_RDWR)) {
2207                 errno = EINVAL;
2208                 return -1;
2209         }
2210
2211         // TCP does not have a provision for stopping incoming packets.
2212         // The best we can do is to just ignore them.
2213         if(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_RDWR) {
2214                 c->recv = NULL;
2215         }
2216
2217         // The rest of the code deals with shutting down writes.
2218         if(dir == UTCP_SHUT_RD) {
2219                 return 0;
2220         }
2221
2222         // Only process shutting down writes once.
2223         if(c->shut_wr) {
2224                 return 0;
2225         }
2226
2227         c->shut_wr = true;
2228
2229         switch(c->state) {
2230         case CLOSED:
2231         case LISTEN:
2232                 errno = ENOTCONN;
2233                 return -1;
2234
2235         case SYN_SENT:
2236                 return 0;
2237
2238         case SYN_RECEIVED:
2239         case ESTABLISHED:
2240                 if(!is_reliable(c) && is_framed(c)) {
2241                         flush_unreliable_framed(c);
2242                 }
2243
2244                 set_state(c, FIN_WAIT_1);
2245                 break;
2246
2247         case FIN_WAIT_1:
2248         case FIN_WAIT_2:
2249                 return 0;
2250
2251         case CLOSE_WAIT:
2252                 set_state(c, CLOSING);
2253                 break;
2254
2255         case CLOSING:
2256         case LAST_ACK:
2257         case TIME_WAIT:
2258                 return 0;
2259         }
2260
2261         c->snd.last++;
2262
2263         ack(c, !is_reliable(c));
2264
2265         if(!timespec_isset(&c->rtrx_timeout)) {
2266                 start_retransmit_timer(c);
2267         }
2268
2269         return 0;
2270 }
2271
2272 static bool reset_connection(struct utcp_connection *c) {
2273         if(!c) {
2274                 errno = EFAULT;
2275                 return false;
2276         }
2277
2278         if(c->reapable) {
2279                 debug(c, "abort() called on closed connection\n");
2280                 errno = EBADF;
2281                 return false;
2282         }
2283
2284         c->recv = NULL;
2285         c->poll = NULL;
2286
2287         switch(c->state) {
2288         case CLOSED:
2289                 return true;
2290
2291         case LISTEN:
2292         case SYN_SENT:
2293         case CLOSING:
2294         case LAST_ACK:
2295         case TIME_WAIT:
2296                 set_state(c, CLOSED);
2297                 return true;
2298
2299         case SYN_RECEIVED:
2300         case ESTABLISHED:
2301         case FIN_WAIT_1:
2302         case FIN_WAIT_2:
2303         case CLOSE_WAIT:
2304                 set_state(c, CLOSED);
2305                 break;
2306         }
2307
2308         // Send RST
2309
2310         struct hdr hdr;
2311
2312         hdr.src = c->src;
2313         hdr.dst = c->dst;
2314         hdr.seq = c->snd.nxt;
2315         hdr.ack = 0;
2316         hdr.wnd = 0;
2317         hdr.ctl = RST;
2318
2319         print_packet(c, "send", &hdr, sizeof(hdr));
2320         c->utcp->send(c->utcp, &hdr, sizeof(hdr));
2321         return true;
2322 }
2323
2324 // Closes all the opened connections
2325 void utcp_abort_all_connections(struct utcp *utcp) {
2326         if(!utcp) {
2327                 errno = EINVAL;
2328                 return;
2329         }
2330
2331         for(int i = 0; i < utcp->nconnections; i++) {
2332                 struct utcp_connection *c = utcp->connections[i];
2333
2334                 if(c->reapable || c->state == CLOSED) {
2335                         continue;
2336                 }
2337
2338                 utcp_recv_t old_recv = c->recv;
2339                 utcp_poll_t old_poll = c->poll;
2340
2341                 reset_connection(c);
2342
2343                 if(old_recv) {
2344                         errno = 0;
2345                         old_recv(c, NULL, 0);
2346                 }
2347
2348                 if(old_poll && !c->reapable) {
2349                         errno = 0;
2350                         old_poll(c, 0);
2351                 }
2352         }
2353
2354         return;
2355 }
2356
2357 int utcp_close(struct utcp_connection *c) {
2358         debug(c, "closing\n");
2359
2360         if(c->rcvbuf.used) {
2361                 debug(c, "receive buffer not empty, resetting\n");
2362                 return reset_connection(c) ? 0 : -1;
2363         }
2364
2365         if(utcp_shutdown(c, SHUT_RDWR) && errno != ENOTCONN) {
2366                 return -1;
2367         }
2368
2369         c->recv = NULL;
2370         c->poll = NULL;
2371         c->reapable = true;
2372         return 0;
2373 }
2374
2375 int utcp_abort(struct utcp_connection *c) {
2376         if(!reset_connection(c)) {
2377                 return -1;
2378         }
2379
2380         c->reapable = true;
2381         return 0;
2382 }
2383
2384 /* Handle timeouts.
2385  * One call to this function will loop through all connections,
2386  * checking if something needs to be resent or not.
2387  * The return value is the time to the next timeout in milliseconds,
2388  * or maybe a negative value if the timeout is infinite.
2389  */
2390 struct timespec utcp_timeout(struct utcp *utcp) {
2391         struct timespec now;
2392         clock_gettime(UTCP_CLOCK, &now);
2393         struct timespec next = {now.tv_sec + 3600, now.tv_nsec};
2394
2395         for(int i = 0; i < utcp->nconnections; i++) {
2396                 struct utcp_connection *c = utcp->connections[i];
2397
2398                 if(!c) {
2399                         continue;
2400                 }
2401
2402                 // delete connections that have been utcp_close()d.
2403                 if(c->state == CLOSED) {
2404                         if(c->reapable) {
2405                                 debug(c, "reaping\n");
2406                                 free_connection(c);
2407                                 i--;
2408                         }
2409
2410                         continue;
2411                 }
2412
2413                 if(timespec_isset(&c->conn_timeout) && timespec_lt(&c->conn_timeout, &now)) {
2414                         errno = ETIMEDOUT;
2415                         c->state = CLOSED;
2416
2417                         if(c->recv) {
2418                                 c->recv(c, NULL, 0);
2419                         }
2420
2421                         if(c->poll && !c->reapable) {
2422                                 c->poll(c, 0);
2423                         }
2424
2425                         continue;
2426                 }
2427
2428                 if(timespec_isset(&c->rtrx_timeout) && timespec_lt(&c->rtrx_timeout, &now)) {
2429                         debug(c, "retransmitting after timeout\n");
2430                         retransmit(c);
2431                 }
2432
2433                 if(c->poll) {
2434                         if((c->state == ESTABLISHED || c->state == CLOSE_WAIT) && c->do_poll) {
2435                                 c->do_poll = false;
2436                                 uint32_t len = is_framed(c) ? min(buffer_free(&c->sndbuf), MAX_UNRELIABLE_SIZE) : buffer_free(&c->sndbuf);
2437
2438                                 if(len) {
2439                                         c->poll(c, len);
2440                                 }
2441                         } else if(c->state == CLOSED) {
2442                                 c->poll(c, 0);
2443                         }
2444                 }
2445
2446                 if(timespec_isset(&c->conn_timeout) && timespec_lt(&c->conn_timeout, &next)) {
2447                         next = c->conn_timeout;
2448                 }
2449
2450                 if(timespec_isset(&c->rtrx_timeout) && timespec_lt(&c->rtrx_timeout, &next)) {
2451                         next = c->rtrx_timeout;
2452                 }
2453         }
2454
2455         struct timespec diff;
2456
2457         timespec_sub(&next, &now, &diff);
2458
2459         return diff;
2460 }
2461
2462 bool utcp_is_active(struct utcp *utcp) {
2463         if(!utcp) {
2464                 return false;
2465         }
2466
2467         for(int i = 0; i < utcp->nconnections; i++)
2468                 if(utcp->connections[i]->state != CLOSED && utcp->connections[i]->state != TIME_WAIT) {
2469                         return true;
2470                 }
2471
2472         return false;
2473 }
2474
2475 struct utcp *utcp_init(utcp_accept_t accept, utcp_pre_accept_t pre_accept, utcp_send_t send, void *priv) {
2476         if(!send) {
2477                 errno = EFAULT;
2478                 return NULL;
2479         }
2480
2481         struct utcp *utcp = calloc(1, sizeof(*utcp));
2482
2483         if(!utcp) {
2484                 return NULL;
2485         }
2486
2487         utcp_set_mtu(utcp, DEFAULT_MTU);
2488
2489         if(!utcp->pkt) {
2490                 free(utcp);
2491                 return NULL;
2492         }
2493
2494         if(!CLOCK_GRANULARITY) {
2495                 struct timespec res;
2496                 clock_getres(UTCP_CLOCK, &res);
2497                 CLOCK_GRANULARITY = res.tv_sec * USEC_PER_SEC + res.tv_nsec / 1000;
2498         }
2499
2500         utcp->accept = accept;
2501         utcp->pre_accept = pre_accept;
2502         utcp->send = send;
2503         utcp->priv = priv;
2504         utcp->timeout = DEFAULT_USER_TIMEOUT; // sec
2505
2506         return utcp;
2507 }
2508
2509 void utcp_exit(struct utcp *utcp) {
2510         if(!utcp) {
2511                 return;
2512         }
2513
2514         for(int i = 0; i < utcp->nconnections; i++) {
2515                 struct utcp_connection *c = utcp->connections[i];
2516
2517                 if(!c->reapable) {
2518                         if(c->recv) {
2519                                 c->recv(c, NULL, 0);
2520                         }
2521
2522                         if(c->poll && !c->reapable) {
2523                                 c->poll(c, 0);
2524                         }
2525                 }
2526
2527                 buffer_exit(&c->rcvbuf);
2528                 buffer_exit(&c->sndbuf);
2529                 free(c);
2530         }
2531
2532         free(utcp->connections);
2533         free(utcp->pkt);
2534         free(utcp);
2535 }
2536
2537 uint16_t utcp_get_mtu(struct utcp *utcp) {
2538         return utcp ? utcp->mtu : 0;
2539 }
2540
2541 uint16_t utcp_get_mss(struct utcp *utcp) {
2542         return utcp ? utcp->mss : 0;
2543 }
2544
2545 void utcp_set_mtu(struct utcp *utcp, uint16_t mtu) {
2546         if(!utcp) {
2547                 return;
2548         }
2549
2550         if(mtu <= sizeof(struct hdr)) {
2551                 return;
2552         }
2553
2554         if(mtu > utcp->mtu) {
2555                 char *new = realloc(utcp->pkt, mtu + sizeof(struct hdr));
2556
2557                 if(!new) {
2558                         return;
2559                 }
2560
2561                 utcp->pkt = new;
2562         }
2563
2564         utcp->mtu = mtu;
2565         utcp->mss = mtu - sizeof(struct hdr);
2566 }
2567
2568 void utcp_reset_timers(struct utcp *utcp) {
2569         if(!utcp) {
2570                 return;
2571         }
2572
2573         struct timespec now, then;
2574
2575         clock_gettime(UTCP_CLOCK, &now);
2576
2577         then = now;
2578
2579         then.tv_sec += utcp->timeout;
2580
2581         for(int i = 0; i < utcp->nconnections; i++) {
2582                 struct utcp_connection *c = utcp->connections[i];
2583
2584                 if(c->reapable) {
2585                         continue;
2586                 }
2587
2588                 if(timespec_isset(&c->rtrx_timeout)) {
2589                         c->rtrx_timeout = now;
2590                 }
2591
2592                 if(timespec_isset(&c->conn_timeout)) {
2593                         c->conn_timeout = then;
2594                 }
2595
2596                 c->rtt_start.tv_sec = 0;
2597
2598                 if(c->rto > START_RTO) {
2599                         c->rto = START_RTO;
2600                 }
2601         }
2602 }
2603
2604 int utcp_get_user_timeout(struct utcp *u) {
2605         return u ? u->timeout : 0;
2606 }
2607
2608 void utcp_set_user_timeout(struct utcp *u, int timeout) {
2609         if(u) {
2610                 u->timeout = timeout;
2611         }
2612 }
2613
2614 size_t utcp_get_sndbuf(struct utcp_connection *c) {
2615         return c ? c->sndbuf.maxsize : 0;
2616 }
2617
2618 size_t utcp_get_sndbuf_free(struct utcp_connection *c) {
2619         if(!c) {
2620                 return 0;
2621         }
2622
2623         switch(c->state) {
2624         case SYN_SENT:
2625         case SYN_RECEIVED:
2626         case ESTABLISHED:
2627         case CLOSE_WAIT:
2628                 return buffer_free(&c->sndbuf);
2629
2630         default:
2631                 return 0;
2632         }
2633 }
2634
2635 void utcp_set_sndbuf(struct utcp_connection *c, size_t size) {
2636         if(!c) {
2637                 return;
2638         }
2639
2640         c->sndbuf.maxsize = size;
2641
2642         if(c->sndbuf.maxsize != size) {
2643                 c->sndbuf.maxsize = -1;
2644         }
2645
2646         c->do_poll = is_reliable(c) && buffer_free(&c->sndbuf);
2647 }
2648
2649 size_t utcp_get_rcvbuf(struct utcp_connection *c) {
2650         return c ? c->rcvbuf.maxsize : 0;
2651 }
2652
2653 size_t utcp_get_rcvbuf_free(struct utcp_connection *c) {
2654         if(c && (c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
2655                 return buffer_free(&c->rcvbuf);
2656         } else {
2657                 return 0;
2658         }
2659 }
2660
2661 void utcp_set_rcvbuf(struct utcp_connection *c, size_t size) {
2662         if(!c) {
2663                 return;
2664         }
2665
2666         c->rcvbuf.maxsize = size;
2667
2668         if(c->rcvbuf.maxsize != size) {
2669                 c->rcvbuf.maxsize = -1;
2670         }
2671 }
2672
2673 size_t utcp_get_sendq(struct utcp_connection *c) {
2674         return c->sndbuf.used;
2675 }
2676
2677 size_t utcp_get_recvq(struct utcp_connection *c) {
2678         return c->rcvbuf.used;
2679 }
2680
2681 bool utcp_get_nodelay(struct utcp_connection *c) {
2682         return c ? c->nodelay : false;
2683 }
2684
2685 void utcp_set_nodelay(struct utcp_connection *c, bool nodelay) {
2686         if(c) {
2687                 c->nodelay = nodelay;
2688         }
2689 }
2690
2691 bool utcp_get_keepalive(struct utcp_connection *c) {
2692         return c ? c->keepalive : false;
2693 }
2694
2695 void utcp_set_keepalive(struct utcp_connection *c, bool keepalive) {
2696         if(c) {
2697                 c->keepalive = keepalive;
2698         }
2699 }
2700
2701 size_t utcp_get_outq(struct utcp_connection *c) {
2702         return c ? seqdiff(c->snd.nxt, c->snd.una) : 0;
2703 }
2704
2705 void utcp_set_recv_cb(struct utcp_connection *c, utcp_recv_t recv) {
2706         if(c) {
2707                 c->recv = recv;
2708         }
2709 }
2710
2711 void utcp_set_poll_cb(struct utcp_connection *c, utcp_poll_t poll) {
2712         if(c) {
2713                 c->poll = poll;
2714                 c->do_poll = is_reliable(c) && buffer_free(&c->sndbuf);
2715         }
2716 }
2717
2718 void utcp_set_accept_cb(struct utcp *utcp, utcp_accept_t accept, utcp_pre_accept_t pre_accept) {
2719         if(utcp) {
2720                 utcp->accept = accept;
2721                 utcp->pre_accept = pre_accept;
2722         }
2723 }
2724
2725 void utcp_expect_data(struct utcp_connection *c, bool expect) {
2726         if(!c || c->reapable) {
2727                 return;
2728         }
2729
2730         if(!(c->state == ESTABLISHED || c->state == FIN_WAIT_1 || c->state == FIN_WAIT_2)) {
2731                 return;
2732         }
2733
2734         if(expect) {
2735                 // If we expect data, start the connection timer.
2736                 if(!timespec_isset(&c->conn_timeout)) {
2737                         clock_gettime(UTCP_CLOCK, &c->conn_timeout);
2738                         c->conn_timeout.tv_sec += c->utcp->timeout;
2739                 }
2740         } else {
2741                 // If we want to cancel expecting data, only clear the timer when there is no unACKed data.
2742                 if(c->snd.una == c->snd.last) {
2743                         timespec_clear(&c->conn_timeout);
2744                 }
2745         }
2746 }
2747
2748 void utcp_offline(struct utcp *utcp, bool offline) {
2749         struct timespec now;
2750         clock_gettime(UTCP_CLOCK, &now);
2751
2752         for(int i = 0; i < utcp->nconnections; i++) {
2753                 struct utcp_connection *c = utcp->connections[i];
2754
2755                 if(c->reapable) {
2756                         continue;
2757                 }
2758
2759                 utcp_expect_data(c, offline);
2760
2761                 if(!offline) {
2762                         if(timespec_isset(&c->rtrx_timeout)) {
2763                                 c->rtrx_timeout = now;
2764                         }
2765
2766                         utcp->connections[i]->rtt_start.tv_sec = 0;
2767
2768                         if(c->rto > START_RTO) {
2769                                 c->rto = START_RTO;
2770                         }
2771                 }
2772         }
2773 }
2774
2775 void utcp_set_retransmit_cb(struct utcp *utcp, utcp_retransmit_t cb) {
2776         utcp->retransmit = cb;
2777 }
2778
2779 void utcp_set_clock_granularity(long granularity) {
2780         CLOCK_GRANULARITY = granularity;
2781 }
2782
2783 int utcp_get_flush_timeout(struct utcp *utcp) {
2784         return utcp->flush_timeout;
2785 }
2786
2787 void utcp_set_flush_timeout(struct utcp *utcp, int milliseconds) {
2788         utcp->flush_timeout = milliseconds;
2789 }