]> git.meshlink.io Git - utcp/blob - utcp.c
Add utcp_get_mss().
[utcp] / utcp.c
1 /*
2     utcp.c -- Userspace TCP
3     Copyright (C) 2014-2017 Guus Sliepen <guus@tinc-vpn.org>
4
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14
15     You should have received a copy of the GNU General Public License along
16     with this program; if not, write to the Free Software Foundation, Inc.,
17     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20 #define _GNU_SOURCE
21
22 #include <assert.h>
23 #include <errno.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <sys/time.h>
31 #include <sys/socket.h>
32 #include <time.h>
33
34 #include "utcp_priv.h"
35
36 #ifndef EBADMSG
37 #define EBADMSG         104
38 #endif
39
40 #ifndef SHUT_RDWR
41 #define SHUT_RDWR 2
42 #endif
43
44 #ifdef poll
45 #undef poll
46 #endif
47
// Fallback for platforms whose <sys/time.h> does not provide timersub():
// computes *r = *a - *b for struct timeval pointers, borrowing one second
// when the microsecond difference goes negative.
#ifndef timersub
#define timersub(a, b, r)\
        do {\
                (r)->tv_sec = (a)->tv_sec - (b)->tv_sec;\
                (r)->tv_usec = (a)->tv_usec - (b)->tv_usec;\
                if((r)->tv_usec < 0)\
                        (r)->tv_sec--, (r)->tv_usec += USEC_PER_SEC;\
        } while (0)
#endif
57
// Return the smaller of two sizes.
static inline size_t min(size_t a, size_t b) {
        if(b < a) {
                return b;
        }

        return a;
}
61
// Return the larger of two sizes.
static inline size_t max(size_t a, size_t b) {
        if(b > a) {
                return b;
        }

        return a;
}
65
66 #ifdef UTCP_DEBUG
67 #include <stdarg.h>
68
69 #ifndef UTCP_DEBUG_DATALEN
70 #define UTCP_DEBUG_DATALEN 20
71 #endif
72
73 static void debug(struct utcp_connection *c, const char *format, ...) {
74         struct timespec tv;
75         char buf[1024];
76         int len;
77
78         clock_gettime(CLOCK_REALTIME, &tv);
79         len = snprintf(buf, sizeof(buf), "%ld.%06lu %u:%u ", (long)tv.tv_sec, tv.tv_nsec / 1000, c ? c->src : 0, c ? c->dst : 0);
80         va_list ap;
81         va_start(ap, format);
82         len += vsnprintf(buf + len, sizeof(buf) - len, format, ap);
83         va_end(ap);
84
85         if(len > 0 && (size_t)len < sizeof(buf)) {
86                 fwrite(buf, len, 1, stderr);
87         }
88 }
89
90 static void print_packet(struct utcp_connection *c, const char *dir, const void *pkt, size_t len) {
91         struct hdr hdr;
92
93         if(len < sizeof(hdr)) {
94                 debug(c, "%s: short packet (%lu bytes)\n", dir, (unsigned long)len);
95                 return;
96         }
97
98         memcpy(&hdr, pkt, sizeof(hdr));
99
100         uint32_t datalen;
101
102         if(len > sizeof(hdr)) {
103                 datalen = min(len - sizeof(hdr), UTCP_DEBUG_DATALEN);
104         } else {
105                 datalen = 0;
106         }
107
108
109         const uint8_t *data = (uint8_t *)pkt + sizeof(hdr);
110         char str[datalen * 2 + 1];
111         char *p = str;
112
113         for(uint32_t i = 0; i < datalen; i++) {
114                 *p++ = "0123456789ABCDEF"[data[i] >> 4];
115                 *p++ = "0123456789ABCDEF"[data[i] & 15];
116         }
117
118         *p = 0;
119
120         debug(c, "%s: len %lu src %u dst %u seq %u ack %u wnd %u aux %x ctl %s%s%s%s data %s\n",
121               dir, (unsigned long)len, hdr.src, hdr.dst, hdr.seq, hdr.ack, hdr.wnd, hdr.aux,
122               hdr.ctl & SYN ? "SYN" : "",
123               hdr.ctl & RST ? "RST" : "",
124               hdr.ctl & FIN ? "FIN" : "",
125               hdr.ctl & ACK ? "ACK" : "",
126               str
127              );
128 }
129
130 static void debug_cwnd(struct utcp_connection *c) {
131         debug(c, "snd.cwnd %u snd.ssthresh %u\n", c->snd.cwnd, ~c->snd.ssthresh ? c->snd.ssthresh : 0);
132 }
133 #else
134 #define debug(...) do {} while(0)
135 #define print_packet(...) do {} while(0)
136 #define debug_cwnd(...) do {} while(0)
137 #endif
138
139 static void set_state(struct utcp_connection *c, enum state state) {
140         c->state = state;
141
142         if(state == ESTABLISHED) {
143                 timerclear(&c->conn_timeout);
144         }
145
146         debug(c, "state %s\n", strstate[state]);
147 }
148
149 static bool fin_wanted(struct utcp_connection *c, uint32_t seq) {
150         if(seq != c->snd.last) {
151                 return false;
152         }
153
154         switch(c->state) {
155         case FIN_WAIT_1:
156         case CLOSING:
157         case LAST_ACK:
158                 return true;
159
160         default:
161                 return false;
162         }
163 }
164
165 static bool is_reliable(struct utcp_connection *c) {
166         return c->flags & UTCP_RELIABLE;
167 }
168
// Signed distance from sequence number b to sequence number a.
// The subtraction is done modulo 2^32 and then reinterpreted as signed, so the
// result stays meaningful when the sequence space wraps around.
static int32_t seqdiff(uint32_t a, uint32_t b) {
        uint32_t delta = a - b;
        return (int32_t)delta;
}
172
173 // Buffer functions
174 static bool buffer_wraps(struct buffer *buf) {
175         return buf->size - buf->offset < buf->used;
176 }
177
178 static bool buffer_resize(struct buffer *buf, uint32_t newsize) {
179         char *newdata = realloc(buf->data, newsize);
180
181         if(!newdata) {
182                 return false;
183         }
184
185         buf->data = newdata;
186
187         if(buffer_wraps(buf)) {
188                 // Shift the right part of the buffer until it hits the end of the new buffer.
189                 // Old situation:
190                 // [345......012]
191                 // New situation:
192                 // [345.........|........012]
193                 uint32_t tailsize = buf->size - buf->offset;
194                 uint32_t newoffset = newsize - tailsize;
195                 memmove(buf + newoffset, buf + buf->offset, tailsize);
196                 buf->offset = newoffset;
197         }
198
199         buf->size = newsize;
200         return true;
201 }
202
// Store data into the buffer
//
// Copies len bytes from data into the ring buffer at logical position offset
// (counted from the first stored byte), growing the allocation when needed.
//
// Returns the number of bytes stored, which is less than len when the data
// would extend past maxsize, 0 when offset itself lies at or beyond maxsize,
// and -1 when growing the buffer failed.
static ssize_t buffer_put_at(struct buffer *buf, size_t offset, const void *data, size_t len) {
        debug(NULL, "buffer_put_at %lu %lu %lu\n", (unsigned long)buf->used, (unsigned long)offset, (unsigned long)len);

        // Ensure we don't store more than maxsize bytes in total
        size_t required = offset + len;

        if(required > buf->maxsize) {
                if(offset >= buf->maxsize) {
                        return 0;
                }

                // Truncate the write so it ends exactly at maxsize.
                len = buf->maxsize - offset;
                required = buf->maxsize;
        }

        // Check if we need to resize the buffer
        if(required > buf->size) {
                size_t newsize = buf->size;

                if(!newsize) {
                        newsize = 4096;
                }

                // Double until the data fits (at least once), then cap at maxsize.
                do {
                        newsize *= 2;
                } while(newsize < required);

                if(newsize > buf->maxsize) {
                        newsize = buf->maxsize;
                }

                if(!buffer_resize(buf, newsize)) {
                        return -1;
                }
        }

        // Translate the logical offset into a position in the allocation.
        uint32_t realoffset = buf->offset + offset;

        if(buf->size - buf->offset < offset) {
                // The offset wrapped
                realoffset -= buf->size;
        }

        if(buf->size - realoffset < len) {
                // The new chunk of data must be wrapped
                memcpy(buf->data + realoffset, data, buf->size - realoffset);
                memcpy(buf->data, (char *)data + buf->size - realoffset, len - (buf->size - realoffset));
        } else {
                memcpy(buf->data + realoffset, data, len);
        }

        // Writing inside already-used space does not shrink the used count.
        if(required > buf->used) {
                buf->used = required;
        }

        return len;
}
261
262 static ssize_t buffer_put(struct buffer *buf, const void *data, size_t len) {
263         return buffer_put_at(buf, buf->used, data, len);
264 }
265
// Copy data from the buffer without removing it.
//
// Copies up to len bytes starting at logical position offset into data.
// Returns the number of bytes copied: 0 when offset is at or past the end of
// the stored data, and fewer than len when less data is available.
static ssize_t buffer_copy(struct buffer *buf, void *data, size_t offset, size_t len) {
        // Ensure we don't copy more than is actually stored in the buffer
        if(offset >= buf->used) {
                return 0;
        }

        if(buf->used - offset < len) {
                len = buf->used - offset;
        }

        // Translate the logical offset into a position in the allocation.
        uint32_t realoffset = buf->offset + offset;

        if(buf->size - buf->offset < offset) {
                // The offset wrapped
                realoffset -= buf->size;
        }

        if(buf->size - realoffset < len) {
                // The data is wrapped
                memcpy(data, buf->data + realoffset, buf->size - realoffset);
                memcpy((char *)data + buf->size - realoffset, buf->data, len - (buf->size - realoffset));
        } else {
                memcpy(data, buf->data + realoffset, len);
        }

        return len;
}
294
// Discard data from the buffer.
//
// Drops up to len bytes from the front of the stored data and returns the
// number of bytes actually dropped (at most buf->used).
static ssize_t buffer_discard(struct buffer *buf, size_t len) {
        if(buf->used < len) {
                len = buf->used;
        }

        // If advancing would run past the end of the allocation, pre-subtract
        // size so that the addition below lands on the wrapped position.
        // NOTE(review): this relies on buf->offset being an unsigned type that
        // wraps around -- confirm against the struct buffer declaration.
        if(buf->size - buf->offset < len) {
                buf->offset -= buf->size;
        }

        buf->offset += len;
        buf->used -= len;

        return len;
}
310
311 static bool buffer_set_size(struct buffer *buf, uint32_t minsize, uint32_t maxsize) {
312         if(maxsize < minsize) {
313                 maxsize = minsize;
314         }
315
316         buf->maxsize = maxsize;
317
318         return buf->size >= minsize || buffer_resize(buf, minsize);
319 }
320
321 static void buffer_exit(struct buffer *buf) {
322         free(buf->data);
323         memset(buf, 0, sizeof(*buf));
324 }
325
326 static uint32_t buffer_free(const struct buffer *buf) {
327         return buf->maxsize - buf->used;
328 }
329
330 // Connections are stored in a sorted list.
331 // This gives O(log(N)) lookup time, O(N log(N)) insertion time and O(N) deletion time.
332
333 static int compare(const void *va, const void *vb) {
334         assert(va && vb);
335
336         const struct utcp_connection *a = *(struct utcp_connection **)va;
337         const struct utcp_connection *b = *(struct utcp_connection **)vb;
338
339         assert(a && b);
340         assert(a->src && b->src);
341
342         int c = (int)a->src - (int)b->src;
343
344         if(c) {
345                 return c;
346         }
347
348         c = (int)a->dst - (int)b->dst;
349         return c;
350 }
351
352 static struct utcp_connection *find_connection(const struct utcp *utcp, uint16_t src, uint16_t dst) {
353         if(!utcp->nconnections) {
354                 return NULL;
355         }
356
357         struct utcp_connection key = {
358                 .src = src,
359                 .dst = dst,
360         }, *keyp = &key;
361         struct utcp_connection **match = bsearch(&keyp, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
362         return match ? *match : NULL;
363 }
364
365 static void free_connection(struct utcp_connection *c) {
366         struct utcp *utcp = c->utcp;
367         struct utcp_connection **cp = bsearch(&c, utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
368
369         assert(cp);
370
371         int i = cp - utcp->connections;
372         memmove(cp, cp + 1, (utcp->nconnections - i - 1) * sizeof(*cp));
373         utcp->nconnections--;
374
375         buffer_exit(&c->rcvbuf);
376         buffer_exit(&c->sndbuf);
377         free(c);
378 }
379
380 static struct utcp_connection *allocate_connection(struct utcp *utcp, uint16_t src, uint16_t dst) {
381         // Check whether this combination of src and dst is free
382
383         if(src) {
384                 if(find_connection(utcp, src, dst)) {
385                         errno = EADDRINUSE;
386                         return NULL;
387                 }
388         } else { // If src == 0, generate a random port number with the high bit set
389                 if(utcp->nconnections >= 32767) {
390                         errno = ENOMEM;
391                         return NULL;
392                 }
393
394                 src = rand() | 0x8000;
395
396                 while(find_connection(utcp, src, dst)) {
397                         src++;
398                 }
399         }
400
401         // Allocate memory for the new connection
402
403         if(utcp->nconnections >= utcp->nallocated) {
404                 if(!utcp->nallocated) {
405                         utcp->nallocated = 4;
406                 } else {
407                         utcp->nallocated *= 2;
408                 }
409
410                 struct utcp_connection **new_array = realloc(utcp->connections, utcp->nallocated * sizeof(*utcp->connections));
411
412                 if(!new_array) {
413                         return NULL;
414                 }
415
416                 utcp->connections = new_array;
417         }
418
419         struct utcp_connection *c = calloc(1, sizeof(*c));
420
421         if(!c) {
422                 return NULL;
423         }
424
425         if(!buffer_set_size(&c->sndbuf, DEFAULT_SNDBUFSIZE, DEFAULT_MAXSNDBUFSIZE)) {
426                 free(c);
427                 return NULL;
428         }
429
430         if(!buffer_set_size(&c->rcvbuf, DEFAULT_RCVBUFSIZE, DEFAULT_MAXRCVBUFSIZE)) {
431                 buffer_exit(&c->sndbuf);
432                 free(c);
433                 return NULL;
434         }
435
436         // Fill in the details
437
438         c->src = src;
439         c->dst = dst;
440 #ifdef UTCP_DEBUG
441         c->snd.iss = 0;
442 #else
443         c->snd.iss = rand();
444 #endif
445         c->snd.una = c->snd.iss;
446         c->snd.nxt = c->snd.iss + 1;
447         c->snd.last = c->snd.nxt;
448         c->snd.cwnd = (utcp->mss > 2190 ? 2 : utcp->mss > 1095 ? 3 : 4) * utcp->mss;
449         c->snd.ssthresh = ~0;
450         debug_cwnd(c);
451         c->utcp = utcp;
452
453         // Add it to the sorted list of connections
454
455         utcp->connections[utcp->nconnections++] = c;
456         qsort(utcp->connections, utcp->nconnections, sizeof(*utcp->connections), compare);
457
458         return c;
459 }
460
// Absolute difference of two unsigned 32-bit values.
static inline uint32_t absdiff(uint32_t a, uint32_t b) {
        return a > b ? a - b : b - a;
}
468
469 // Update RTT variables. See RFC 6298.
470 static void update_rtt(struct utcp_connection *c, uint32_t rtt) {
471         if(!rtt) {
472                 debug(c, "invalid rtt\n");
473                 return;
474         }
475
476         struct utcp *utcp = c->utcp;
477
478         if(!utcp->srtt) {
479                 utcp->srtt = rtt;
480                 utcp->rttvar = rtt / 2;
481         } else {
482                 utcp->rttvar = (utcp->rttvar * 3 + absdiff(utcp->srtt, rtt)) / 4;
483                 utcp->srtt = (utcp->srtt * 7 + rtt) / 8;
484         }
485
486         utcp->rto = utcp->srtt + max(4 * utcp->rttvar, CLOCK_GRANULARITY);
487
488         if(utcp->rto > MAX_RTO) {
489                 utcp->rto = MAX_RTO;
490         }
491
492         debug(c, "rtt %u srtt %u rttvar %u rto %u\n", rtt, utcp->srtt, utcp->rttvar, utcp->rto);
493 }
494
495 static void start_retransmit_timer(struct utcp_connection *c) {
496         gettimeofday(&c->rtrx_timeout, NULL);
497         c->rtrx_timeout.tv_usec += c->utcp->rto;
498
499         while(c->rtrx_timeout.tv_usec >= 1000000) {
500                 c->rtrx_timeout.tv_usec -= 1000000;
501                 c->rtrx_timeout.tv_sec++;
502         }
503
504         debug(c, "rtrx_timeout %ld.%06lu\n", c->rtrx_timeout.tv_sec, c->rtrx_timeout.tv_usec);
505 }
506
// Disarm the retransmission timer by clearing its expiry time.
static void stop_retransmit_timer(struct utcp_connection *c) {
        timerclear(&c->rtrx_timeout);
        debug(c, "rtrx_timeout cleared\n");
}
511
// Open a new outgoing connection to port dst.
//
// Allocates a connection with an automatically chosen source port, stores the
// receive callback, its private pointer and the flags, sends the initial SYN
// and arms both the connection timeout and the retransmission timer.
//
// Returns the new connection in state SYN_SENT, or NULL if it could not be
// allocated. flags may only use the lowest five bits (checked by an assert).
struct utcp_connection *utcp_connect_ex(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv, uint32_t flags) {
        struct utcp_connection *c = allocate_connection(utcp, 0, dst);

        if(!c) {
                return NULL;
        }

        assert((flags & ~0x1f) == 0);

        c->flags = flags;
        c->recv = recv;
        c->priv = priv;

        // A SYN packet: the header followed by four bytes of init data.
        struct {
                struct hdr hdr;
                uint8_t init[4];
        } pkt;

        pkt.hdr.src = c->src;
        pkt.hdr.dst = c->dst;
        pkt.hdr.seq = c->snd.iss;
        pkt.hdr.ack = 0;
        pkt.hdr.wnd = c->rcvbuf.maxsize;
        pkt.hdr.ctl = SYN;
        // NOTE(review): 0x0101 presumably announces the init block below
        // (type and length) -- confirm against the protocol description.
        pkt.hdr.aux = 0x0101;
        pkt.init[0] = 1;
        pkt.init[1] = 0;
        pkt.init[2] = 0;
        // Only the lowest three flag bits are sent to the peer.
        pkt.init[3] = flags & 0x7;

        set_state(c, SYN_SENT);

        print_packet(c, "send", &pkt, sizeof(pkt));
        utcp->send(utcp, &pkt, sizeof(pkt));

        // Give up on the connection attempt after utcp->timeout seconds.
        gettimeofday(&c->conn_timeout, NULL);
        c->conn_timeout.tv_sec += utcp->timeout;

        start_retransmit_timer(c);

        return c;
}
554
// Open a new outgoing connection to port dst using the UTCP_TCP flag set.
// See utcp_connect_ex() for the meaning of the other arguments and the result.
struct utcp_connection *utcp_connect(struct utcp *utcp, uint16_t dst, utcp_recv_t recv, void *priv) {
        return utcp_connect_ex(utcp, dst, recv, priv, UTCP_TCP);
}
558
559 void utcp_accept(struct utcp_connection *c, utcp_recv_t recv, void *priv) {
560         if(c->reapable || c->state != SYN_RECEIVED) {
561                 debug(c, "accept() called on invalid connection in state %s\n", c, strstate[c->state]);
562                 return;
563         }
564
565         debug(c, "accepted %p %p\n", c, recv, priv);
566         c->recv = recv;
567         c->priv = priv;
568         set_state(c, ESTABLISHED);
569 }
570
// Send an ACK packet, carrying as much not yet sent data from the send buffer
// as the congestion window and the peer's window allow.
//
// When sendatleastone is true, one packet is sent even if there is no data to
// send; otherwise the function returns without sending in that case.
static void ack(struct utcp_connection *c, bool sendatleastone) {
        // left: bytes queued but not sent yet; cwndleft: room left in the
        // smaller of the congestion window and the peer's window.
        int32_t left = seqdiff(c->snd.last, c->snd.nxt);
        int32_t cwndleft = min(c->snd.cwnd, c->snd.wnd) - seqdiff(c->snd.nxt, c->snd.una);

        assert(left >= 0);

        if(cwndleft <= 0) {
                left = 0;
        } else if(cwndleft < left) {
                left = cwndleft;

                // Round down to whole segments, unless a packet has to be sent
                // anyway and the window holds at most one segment.
                if(!sendatleastone || cwndleft > c->utcp->mss) {
                        left -= left % c->utcp->mss;
                }
        }

        debug(c, "cwndleft %d left %d\n", cwndleft, left);

        if(!left && !sendatleastone) {
                return;
        }

        // Packets are built in the scratch buffer of the utcp context.
        struct {
                struct hdr hdr;
                uint8_t data[];
        } *pkt = c->utcp->pkt;

        pkt->hdr.src = c->src;
        pkt->hdr.dst = c->dst;
        pkt->hdr.ack = c->rcv.nxt;
        pkt->hdr.wnd = c->rcvbuf.maxsize;
        pkt->hdr.ctl = ACK;
        pkt->hdr.aux = 0;

        do {
                uint32_t seglen = left > c->utcp->mss ? c->utcp->mss : left;
                pkt->hdr.seq = c->snd.nxt;

                buffer_copy(&c->sndbuf, pkt->data, seqdiff(c->snd.nxt, c->snd.una), seglen);

                c->snd.nxt += seglen;
                left -= seglen;

                // The FIN takes up one sequence number but no payload byte.
                if(seglen && fin_wanted(c, c->snd.nxt)) {
                        seglen--;
                        pkt->hdr.ctl |= FIN;
                }

                // tv_sec == 0 means no RTT measurement is in progress.
                if(!c->rtt_start.tv_sec) {
                        // Start RTT measurement
                        gettimeofday(&c->rtt_start, NULL);
                        c->rtt_seq = pkt->hdr.seq + seglen;
                        debug(c, "starting RTT measurement, expecting ack %u\n", c->rtt_seq);
                }

                print_packet(c, "send", pkt, sizeof(pkt->hdr) + seglen);
                c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
        } while(left);
}
630
631 ssize_t utcp_send(struct utcp_connection *c, const void *data, size_t len) {
632         if(c->reapable) {
633                 debug(c, "send() called on closed connection\n");
634                 errno = EBADF;
635                 return -1;
636         }
637
638         switch(c->state) {
639         case CLOSED:
640         case LISTEN:
641                 debug(c, "send() called on unconnected connection\n");
642                 errno = ENOTCONN;
643                 return -1;
644
645         case SYN_SENT:
646         case SYN_RECEIVED:
647         case ESTABLISHED:
648         case CLOSE_WAIT:
649                 break;
650
651         case FIN_WAIT_1:
652         case FIN_WAIT_2:
653         case CLOSING:
654         case LAST_ACK:
655         case TIME_WAIT:
656                 debug(c, "send() called on closed connection\n");
657                 errno = EPIPE;
658                 return -1;
659         }
660
661         // Exit early if we have nothing to send.
662
663         if(!len) {
664                 return 0;
665         }
666
667         if(!data) {
668                 errno = EFAULT;
669                 return -1;
670         }
671
672         // Check if we need to be able to buffer all data
673
674         if(c->flags & UTCP_NO_PARTIAL) {
675                 if(len > buffer_free(&c->sndbuf)) {
676                         if(len > c->sndbuf.maxsize) {
677                                 errno = EMSGSIZE;
678                                 return -1;
679                         } else {
680                                 errno = EWOULDBLOCK;
681                                 return 0;
682                         }
683                 }
684         }
685
686         // Add data to send buffer.
687
688         if(is_reliable(c) || (c->state != SYN_SENT && c->state != SYN_RECEIVED)) {
689                 len = buffer_put(&c->sndbuf, data, len);
690         } else {
691                 return 0;
692         }
693
694         if(len <= 0) {
695                 if(is_reliable(c)) {
696                         errno = EWOULDBLOCK;
697                         return 0;
698                 } else {
699                         return len;
700                 }
701         }
702
703         c->snd.last += len;
704
705         // Don't send anything yet if the connection has not fully established yet
706
707         if(c->state == SYN_SENT || c->state == SYN_RECEIVED) {
708                 return len;
709         }
710
711         ack(c, false);
712
713         if(!is_reliable(c)) {
714                 c->snd.una = c->snd.nxt = c->snd.last;
715                 buffer_discard(&c->sndbuf, c->sndbuf.used);
716         }
717
718         if(is_reliable(c) && !timerisset(&c->rtrx_timeout)) {
719                 start_retransmit_timer(c);
720         }
721
722         if(is_reliable(c) && !timerisset(&c->conn_timeout)) {
723                 gettimeofday(&c->conn_timeout, NULL);
724                 c->conn_timeout.tv_sec += c->utcp->timeout;
725         }
726
727         return len;
728 }
729
730 static void swap_ports(struct hdr *hdr) {
731         uint16_t tmp = hdr->src;
732         hdr->src = hdr->dst;
733         hdr->dst = tmp;
734 }
735
// Resend the first unacknowledged segment right away.
//
// Unlike retransmit(), this leaves the congestion window, snd.nxt and the
// timers untouched. Nothing is sent when there is no unacknowledged data, in
// states other than the ones listed in the switch below, or when the temporary
// packet buffer cannot be allocated.
static void fast_retransmit(struct utcp_connection *c) {
        if(c->state == CLOSED || c->snd.last == c->snd.una) {
                debug(c, "fast_retransmit() called but nothing to retransmit!\n");
                return;
        }

        struct utcp *utcp = c->utcp;

        struct {
                struct hdr hdr;
                uint8_t data[];
        } *pkt;

        // The packet is built in a temporary buffer of MTU size, freed below.
        pkt = malloc(c->utcp->mtu);

        if(!pkt) {
                return;
        }

        pkt->hdr.src = c->src;
        pkt->hdr.dst = c->dst;
        pkt->hdr.wnd = c->rcvbuf.maxsize;
        pkt->hdr.aux = 0;

        switch(c->state) {
        case ESTABLISHED:
        case FIN_WAIT_1:
        case CLOSE_WAIT:
        case CLOSING:
        case LAST_ACK:
                // Send unacked data again.
                pkt->hdr.seq = c->snd.una;
                pkt->hdr.ack = c->rcv.nxt;
                pkt->hdr.ctl = ACK;
                uint32_t len = min(seqdiff(c->snd.last, c->snd.una), utcp->mss);

                // The FIN takes up one sequence number but no payload byte.
                if(fin_wanted(c, c->snd.una + len)) {
                        len--;
                        pkt->hdr.ctl |= FIN;
                }

                buffer_copy(&c->sndbuf, pkt->data, 0, len);
                print_packet(c, "rtrx", pkt, sizeof(pkt->hdr) + len);
                utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);
                break;

        default:
                break;
        }

        free(pkt);
}
788
// Retransmit after the retransmission timer expired.
//
// Depending on the state this resends the SYN, the SYN+ACK or the first
// unacknowledged data segment. Afterwards the timer is restarted, the RTO is
// doubled (capped at MAX_RTO), the running RTT measurement is invalidated and
// the duplicate ACK counter is reset.
// When there is nothing to retransmit the timer is stopped instead.
static void retransmit(struct utcp_connection *c) {
        if(c->state == CLOSED || c->snd.last == c->snd.una) {
                debug(c, "retransmit() called but nothing to retransmit!\n");
                stop_retransmit_timer(c);
                return;
        }

        struct utcp *utcp = c->utcp;

        // Packets are built in the scratch buffer of the utcp context.
        struct {
                struct hdr hdr;
                uint8_t data[];
        } *pkt = c->utcp->pkt;

        pkt->hdr.src = c->src;
        pkt->hdr.dst = c->dst;
        pkt->hdr.wnd = c->rcvbuf.maxsize;
        pkt->hdr.aux = 0;

        switch(c->state) {
        case SYN_SENT:
                // Send our SYN again
                pkt->hdr.seq = c->snd.iss;
                pkt->hdr.ack = 0;
                pkt->hdr.ctl = SYN;
                pkt->hdr.aux = 0x0101;
                pkt->data[0] = 1;
                pkt->data[1] = 0;
                pkt->data[2] = 0;
                pkt->data[3] = c->flags & 0x7;
                print_packet(c, "rtrx", pkt, sizeof(pkt->hdr) + 4);
                utcp->send(utcp, pkt, sizeof(pkt->hdr) + 4);
                break;

        case SYN_RECEIVED:
                // Send SYNACK again
                pkt->hdr.seq = c->snd.nxt;
                pkt->hdr.ack = c->rcv.nxt;
                pkt->hdr.ctl = SYN | ACK;
                print_packet(c, "rtrx", pkt, sizeof(pkt->hdr));
                utcp->send(utcp, pkt, sizeof(pkt->hdr));
                break;

        case ESTABLISHED:
        case FIN_WAIT_1:
        case CLOSE_WAIT:
        case CLOSING:
        case LAST_ACK:
                // Send unacked data again.
                pkt->hdr.seq = c->snd.una;
                pkt->hdr.ack = c->rcv.nxt;
                pkt->hdr.ctl = ACK;
                uint32_t len = min(seqdiff(c->snd.last, c->snd.una), utcp->mss);

                // The FIN takes up one sequence number but no payload byte.
                if(fin_wanted(c, c->snd.una + len)) {
                        len--;
                        pkt->hdr.ctl |= FIN;
                }

                // RFC 5681 slow start after timeout
                uint32_t flightsize = seqdiff(c->snd.nxt, c->snd.una);
                c->snd.ssthresh = max(flightsize / 2, utcp->mss * 2); // eq. 4
                c->snd.cwnd = utcp->mss;
                debug_cwnd(c);

                buffer_copy(&c->sndbuf, pkt->data, 0, len);
                print_packet(c, "rtrx", pkt, sizeof(pkt->hdr) + len);
                utcp->send(utcp, pkt, sizeof(pkt->hdr) + len);

                // Everything after this segment counts as not sent again.
                c->snd.nxt = c->snd.una + len;
                break;

        case CLOSED:
        case LISTEN:
        case TIME_WAIT:
        case FIN_WAIT_2:
                // We shouldn't need to retransmit anything in this state.
#ifdef UTCP_DEBUG
                abort();
#endif
                stop_retransmit_timer(c);
                goto cleanup;
        }

        // Note that the timer is armed with the current RTO; the doubled
        // value only takes effect the next time the timer is started.
        start_retransmit_timer(c);
        utcp->rto *= 2;

        if(utcp->rto > MAX_RTO) {
                utcp->rto = MAX_RTO;
        }

        c->rtt_start.tv_sec = 0; // invalidate RTT timer
        c->dupack = 0; // cancel any ongoing fast recovery

cleanup:
        return;
}
886
887 /* Update receive buffer and SACK entries after consuming data.
888  *
889  * Situation:
890  *
891  * |.....0000..1111111111.....22222......3333|
892  * |---------------^
893  *
894  * 0..3 represent the SACK entries. The ^ indicates up to which point we want
895  * to remove data from the receive buffer. The idea is to subtract "len"
896  * from the offset of all the SACK entries, and then remove/cut down entries
897  * that are shifted to before the start of the receive buffer.
898  *
899  * There are three cases:
900  * - the SACK entry is after ^, in that case just change the offset.
901  * - the SACK entry starts before and ends after ^, so we have to
902  *   change both its offset and size.
903  * - the SACK entry is completely before ^, in that case delete it.
904  */
// Drop len bytes from the front of the receive buffer and shift the SACK
// entries accordingly (see the description above).
// If len exceeds the amount of buffered data, the buffer is left alone and the
// SACK list is emptied by clearing the length of its first entry.
static void sack_consume(struct utcp_connection *c, size_t len) {
        debug(c, "sack_consume %lu\n", (unsigned long)len);

        if(len > c->rcvbuf.used) {
                debug(c, "all SACK entries consumed\n");
                c->sacks[0].len = 0;
                return;
        }

        buffer_discard(&c->rcvbuf, len);

        // An entry with length 0 marks the end of the list. The index is only
        // advanced for entries that are kept: after a deletion the following
        // entries have moved down into slot i.
        for(int i = 0; i < NSACKS && c->sacks[i].len;) {
                if(len < c->sacks[i].offset) {
                        // Entirely after the consumed range: just shift it.
                        c->sacks[i].offset -= len;
                        i++;
                } else if(len < c->sacks[i].offset + c->sacks[i].len) {
                        // Partly consumed: keep the remaining tail at offset 0.
                        c->sacks[i].len -= len - c->sacks[i].offset;
                        c->sacks[i].offset = 0;
                        i++;
                } else {
                        // Fully consumed: delete the entry.
                        if(i < NSACKS - 1) {
                                memmove(&c->sacks[i], &c->sacks[i + 1], (NSACKS - 1 - i) * sizeof(c->sacks)[i]);
                                c->sacks[NSACKS - 1].len = 0;
                        } else {
                                c->sacks[i].len = 0;
                                break;
                        }
                }
        }

        for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
                debug(c, "SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
        }
}
939
940 static void handle_out_of_order(struct utcp_connection *c, uint32_t offset, const void *data, size_t len) {
941         debug(c, "out of order packet, offset %u\n", offset);
942         // Packet loss or reordering occured. Store the data in the buffer.
943         ssize_t rxd = buffer_put_at(&c->rcvbuf, offset, data, len);
944
945         if(rxd < 0 || (size_t)rxd < len) {
946                 abort();
947         }
948
949         // Make note of where we put it.
950         for(int i = 0; i < NSACKS; i++) {
951                 if(!c->sacks[i].len) { // nothing to merge, add new entry
952                         debug(c, "new SACK entry %d\n", i);
953                         c->sacks[i].offset = offset;
954                         c->sacks[i].len = rxd;
955                         break;
956                 } else if(offset < c->sacks[i].offset) {
957                         if(offset + rxd < c->sacks[i].offset) { // insert before
958                                 if(!c->sacks[NSACKS - 1].len) { // only if room left
959                                         debug(c, "insert SACK entry at %d\n", i);
960                                         memmove(&c->sacks[i + 1], &c->sacks[i], (NSACKS - i - 1) * sizeof(c->sacks)[i]);
961                                         c->sacks[i].offset = offset;
962                                         c->sacks[i].len = rxd;
963                                 } else {
964                                         debug(c, "SACK entries full, dropping packet\n");
965                                 }
966
967                                 break;
968                         } else { // merge
969                                 debug(c, "merge with start of SACK entry at %d\n", i);
970                                 c->sacks[i].offset = offset;
971                                 break;
972                         }
973                 } else if(offset <= c->sacks[i].offset + c->sacks[i].len) {
974                         if(offset + rxd > c->sacks[i].offset + c->sacks[i].len) { // merge
975                                 debug(c, "merge with end of SACK entry at %d\n", i);
976                                 c->sacks[i].len = offset + rxd - c->sacks[i].offset;
977                                 // TODO: handle potential merge with next entry
978                         }
979
980                         break;
981                 }
982         }
983
984         for(int i = 0; i < NSACKS && c->sacks[i].len; i++) {
985                 debug(c, "SACK[%d] offset %u len %u\n", i, c->sacks[i].offset, c->sacks[i].len);
986         }
987 }
988
989 static void handle_in_order(struct utcp_connection *c, const void *data, size_t len) {
990         // Check if we can process out-of-order data now.
991         if(c->sacks[0].len && len >= c->sacks[0].offset) { // TODO: handle overlap with second SACK
992                 debug(c, "incoming packet len %lu connected with SACK at %u\n", (unsigned long)len, c->sacks[0].offset);
993                 buffer_put_at(&c->rcvbuf, 0, data, len); // TODO: handle return value
994                 len = max(len, c->sacks[0].offset + c->sacks[0].len);
995                 data = c->rcvbuf.data;
996         }
997
998         if(c->recv) {
999                 ssize_t rxd = c->recv(c, data, len);
1000
1001                 if(rxd < 0 || (size_t)rxd != len) {
1002                         // TODO: handle the application not accepting all data.
1003                         abort();
1004                 }
1005         }
1006
1007         if(c->rcvbuf.used) {
1008                 sack_consume(c, len);
1009         }
1010
1011         c->rcv.nxt += len;
1012 }
1013
1014
1015 static void handle_incoming_data(struct utcp_connection *c, uint32_t seq, const void *data, size_t len) {
1016         if(!is_reliable(c)) {
1017                 c->recv(c, data, len);
1018                 c->rcv.nxt = seq + len;
1019                 return;
1020         }
1021
1022         uint32_t offset = seqdiff(seq, c->rcv.nxt);
1023
1024         if(offset + len > c->rcvbuf.maxsize) {
1025                 abort();
1026         }
1027
1028         if(offset) {
1029                 handle_out_of_order(c, offset, data, len);
1030         } else {
1031                 handle_in_order(c, data, len);
1032         }
1033 }
1034
1035
1036 ssize_t utcp_recv(struct utcp *utcp, const void *data, size_t len) {
1037         const uint8_t *ptr = data;
1038
1039         if(!utcp) {
1040                 errno = EFAULT;
1041                 return -1;
1042         }
1043
1044         if(!len) {
1045                 return 0;
1046         }
1047
1048         if(!data) {
1049                 errno = EFAULT;
1050                 return -1;
1051         }
1052
1053         // Drop packets smaller than the header
1054
1055         struct hdr hdr;
1056
1057         if(len < sizeof(hdr)) {
1058                 print_packet(NULL, "recv", data, len);
1059                 errno = EBADMSG;
1060                 return -1;
1061         }
1062
1063         // Make a copy from the potentially unaligned data to a struct hdr
1064
1065         memcpy(&hdr, ptr, sizeof(hdr));
1066
1067         // Try to match the packet to an existing connection
1068
1069         struct utcp_connection *c = find_connection(utcp, hdr.dst, hdr.src);
1070         print_packet(c, "recv", data, len);
1071
1072         // Process the header
1073
1074         ptr += sizeof(hdr);
1075         len -= sizeof(hdr);
1076
1077         // Drop packets with an unknown CTL flag
1078
1079         if(hdr.ctl & ~(SYN | ACK | RST | FIN)) {
1080                 print_packet(NULL, "recv", data, len);
1081                 errno = EBADMSG;
1082                 return -1;
1083         }
1084
1085         // Check for auxiliary headers
1086
1087         const uint8_t *init = NULL;
1088
1089         uint16_t aux = hdr.aux;
1090
1091         while(aux) {
1092                 size_t auxlen = 4 * (aux >> 8) & 0xf;
1093                 uint8_t auxtype = aux & 0xff;
1094
1095                 if(len < auxlen) {
1096                         errno = EBADMSG;
1097                         return -1;
1098                 }
1099
1100                 switch(auxtype) {
1101                 case AUX_INIT:
1102                         if(!(hdr.ctl & SYN) || auxlen != 4) {
1103                                 errno = EBADMSG;
1104                                 return -1;
1105                         }
1106
1107                         init = ptr;
1108                         break;
1109
1110                 default:
1111                         errno = EBADMSG;
1112                         return -1;
1113                 }
1114
1115                 len -= auxlen;
1116                 ptr += auxlen;
1117
1118                 if(!(aux & 0x800)) {
1119                         break;
1120                 }
1121
1122                 if(len < 2) {
1123                         errno = EBADMSG;
1124                         return -1;
1125                 }
1126
1127                 memcpy(&aux, ptr, 2);
1128                 len -= 2;
1129                 ptr += 2;
1130         }
1131
1132         bool has_data = len || (hdr.ctl & (SYN | FIN));
1133
1134         // Is it for a new connection?
1135
1136         if(!c) {
1137                 // Ignore RST packets
1138
1139                 if(hdr.ctl & RST) {
1140                         return 0;
1141                 }
1142
1143                 // Is it a SYN packet and are we LISTENing?
1144
1145                 if(hdr.ctl & SYN && !(hdr.ctl & ACK) && utcp->accept) {
1146                         // If we don't want to accept it, send a RST back
1147                         if((utcp->pre_accept && !utcp->pre_accept(utcp, hdr.dst))) {
1148                                 len = 1;
1149                                 goto reset;
1150                         }
1151
1152                         // Try to allocate memory, otherwise send a RST back
1153                         c = allocate_connection(utcp, hdr.dst, hdr.src);
1154
1155                         if(!c) {
1156                                 len = 1;
1157                                 goto reset;
1158                         }
1159
1160                         // Parse auxilliary information
1161                         if(init) {
1162                                 if(init[0] < 1) {
1163                                         len = 1;
1164                                         goto reset;
1165                                 }
1166
1167                                 c->flags = init[3] & 0x7;
1168                         } else {
1169                                 c->flags = UTCP_TCP;
1170                         }
1171
1172 synack:
1173                         // Return SYN+ACK, go to SYN_RECEIVED state
1174                         c->snd.wnd = hdr.wnd;
1175                         c->rcv.irs = hdr.seq;
1176                         c->rcv.nxt = c->rcv.irs + 1;
1177                         set_state(c, SYN_RECEIVED);
1178
1179                         struct {
1180                                 struct hdr hdr;
1181                                 uint8_t data[4];
1182                         } pkt;
1183
1184                         pkt.hdr.src = c->src;
1185                         pkt.hdr.dst = c->dst;
1186                         pkt.hdr.ack = c->rcv.irs + 1;
1187                         pkt.hdr.seq = c->snd.iss;
1188                         pkt.hdr.wnd = c->rcvbuf.maxsize;
1189                         pkt.hdr.ctl = SYN | ACK;
1190
1191                         if(init) {
1192                                 pkt.hdr.aux = 0x0101;
1193                                 pkt.data[0] = 1;
1194                                 pkt.data[1] = 0;
1195                                 pkt.data[2] = 0;
1196                                 pkt.data[3] = c->flags & 0x7;
1197                                 print_packet(c, "send", &pkt, sizeof(hdr) + 4);
1198                                 utcp->send(utcp, &pkt, sizeof(hdr) + 4);
1199                         } else {
1200                                 pkt.hdr.aux = 0;
1201                                 print_packet(c, "send", &pkt, sizeof(hdr));
1202                                 utcp->send(utcp, &pkt, sizeof(hdr));
1203                         }
1204                 } else {
1205                         // No, we don't want your packets, send a RST back
1206                         len = 1;
1207                         goto reset;
1208                 }
1209
1210                 return 0;
1211         }
1212
1213         debug(c, "state %s\n", strstate[c->state]);
1214
1215         // In case this is for a CLOSED connection, ignore the packet.
1216         // TODO: make it so incoming packets can never match a CLOSED connection.
1217
1218         if(c->state == CLOSED) {
1219                 debug(c, "got packet for closed connection\n");
1220                 return 0;
1221         }
1222
1223         // It is for an existing connection.
1224
1225         // 1. Drop invalid packets.
1226
1227         // 1a. Drop packets that should not happen in our current state.
1228
1229         switch(c->state) {
1230         case SYN_SENT:
1231         case SYN_RECEIVED:
1232         case ESTABLISHED:
1233         case FIN_WAIT_1:
1234         case FIN_WAIT_2:
1235         case CLOSE_WAIT:
1236         case CLOSING:
1237         case LAST_ACK:
1238         case TIME_WAIT:
1239                 break;
1240
1241         default:
1242 #ifdef UTCP_DEBUG
1243                 abort();
1244 #endif
1245                 break;
1246         }
1247
1248         // 1b. Discard data that is not in our receive window.
1249
1250         if(is_reliable(c)) {
1251                 bool acceptable;
1252
1253                 if(c->state == SYN_SENT) {
1254                         acceptable = true;
1255                 } else if(len == 0) {
1256                         acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0;
1257                 } else {
1258                         int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);
1259
1260                         // cut already accepted front overlapping
1261                         if(rcv_offset < 0) {
1262                                 acceptable = len > (size_t) - rcv_offset;
1263
1264                                 if(acceptable) {
1265                                         ptr -= rcv_offset;
1266                                         len += rcv_offset;
1267                                         hdr.seq -= rcv_offset;
1268                                 }
1269                         } else {
1270                                 acceptable = seqdiff(hdr.seq, c->rcv.nxt) >= 0 && seqdiff(hdr.seq, c->rcv.nxt) + len <= c->rcvbuf.maxsize;
1271                         }
1272                 }
1273
1274                 if(!acceptable) {
1275                         debug(c, "packet not acceptable, %u <= %u + %lu < %u\n", c->rcv.nxt, hdr.seq, (unsigned long)len, c->rcv.nxt + c->rcvbuf.maxsize);
1276
1277                         // Ignore unacceptable RST packets.
1278                         if(hdr.ctl & RST) {
1279                                 return 0;
1280                         }
1281
1282                         // Otherwise, continue processing.
1283                         len = 0;
1284                 }
1285         } else {
1286 #if UTCP_DEBUG
1287                 int32_t rcv_offset = seqdiff(hdr.seq, c->rcv.nxt);
1288
1289                 if(rcv_offset) {
1290                         debug(c, "packet out of order, offset %u bytes", rcv_offset);
1291                 }
1292
1293                 if(rcv_offset >= 0) {
1294                         c->rcv.nxt = hdr.seq + len;
1295                 }
1296
1297 #endif
1298         }
1299
1300         c->snd.wnd = hdr.wnd; // TODO: move below
1301
1302         // 1c. Drop packets with an invalid ACK.
1303         // ackno should not roll back, and it should also not be bigger than what we ever could have sent
1304         // (= snd.una + c->sndbuf.used).
1305
1306         if(!is_reliable(c)) {
1307                 if(hdr.ack != c->snd.last && c->state >= ESTABLISHED) {
1308                         hdr.ack = c->snd.una;
1309                 }
1310         }
1311
1312         if(hdr.ctl & ACK && (seqdiff(hdr.ack, c->snd.last) > 0 || seqdiff(hdr.ack, c->snd.una) < 0)) {
1313                 debug(c, "packet ack seqno out of range, %u <= %u < %u\n", c->snd.una, hdr.ack, c->snd.una + c->sndbuf.used);
1314
1315                 // Ignore unacceptable RST packets.
1316                 if(hdr.ctl & RST) {
1317                         return 0;
1318                 }
1319
1320                 goto reset;
1321         }
1322
1323         // 2. Handle RST packets
1324
1325         if(hdr.ctl & RST) {
1326                 switch(c->state) {
1327                 case SYN_SENT:
1328                         if(!(hdr.ctl & ACK)) {
1329                                 return 0;
1330                         }
1331
1332                         // The peer has refused our connection.
1333                         set_state(c, CLOSED);
1334                         errno = ECONNREFUSED;
1335
1336                         if(c->recv) {
1337                                 c->recv(c, NULL, 0);
1338                         }
1339
1340                         if(c->poll && !c->reapable) {
1341                                 c->poll(c, 0);
1342                         }
1343
1344                         return 0;
1345
1346                 case SYN_RECEIVED:
1347                         if(hdr.ctl & ACK) {
1348                                 return 0;
1349                         }
1350
1351                         // We haven't told the application about this connection yet. Silently delete.
1352                         free_connection(c);
1353                         return 0;
1354
1355                 case ESTABLISHED:
1356                 case FIN_WAIT_1:
1357                 case FIN_WAIT_2:
1358                 case CLOSE_WAIT:
1359                         if(hdr.ctl & ACK) {
1360                                 return 0;
1361                         }
1362
1363                         // The peer has aborted our connection.
1364                         set_state(c, CLOSED);
1365                         errno = ECONNRESET;
1366
1367                         if(c->recv) {
1368                                 c->recv(c, NULL, 0);
1369                         }
1370
1371                         if(c->poll && !c->reapable) {
1372                                 c->poll(c, 0);
1373                         }
1374
1375                         return 0;
1376
1377                 case CLOSING:
1378                 case LAST_ACK:
1379                 case TIME_WAIT:
1380                         if(hdr.ctl & ACK) {
1381                                 return 0;
1382                         }
1383
1384                         // As far as the application is concerned, the connection has already been closed.
1385                         // If it has called utcp_close() already, we can immediately free this connection.
1386                         if(c->reapable) {
1387                                 free_connection(c);
1388                                 return 0;
1389                         }
1390
1391                         // Otherwise, immediately move to the CLOSED state.
1392                         set_state(c, CLOSED);
1393                         return 0;
1394
1395                 default:
1396 #ifdef UTCP_DEBUG
1397                         abort();
1398 #endif
1399                         break;
1400                 }
1401         }
1402
1403         uint32_t advanced;
1404
1405         if(!(hdr.ctl & ACK)) {
1406                 advanced = 0;
1407                 goto skip_ack;
1408         }
1409
1410         // 3. Advance snd.una
1411
1412         advanced = seqdiff(hdr.ack, c->snd.una);
1413
1414         if(advanced) {
1415                 // RTT measurement
1416                 if(c->rtt_start.tv_sec) {
1417                         if(c->rtt_seq == hdr.ack) {
1418                                 struct timeval now, diff;
1419                                 gettimeofday(&now, NULL);
1420                                 timersub(&now, &c->rtt_start, &diff);
1421                                 update_rtt(c, diff.tv_sec * 1000000 + diff.tv_usec);
1422                                 c->rtt_start.tv_sec = 0;
1423                         } else if(c->rtt_seq < hdr.ack) {
1424                                 debug(c, "cancelling RTT measurement: %u < %u\n", c->rtt_seq, hdr.ack);
1425                                 c->rtt_start.tv_sec = 0;
1426                         }
1427                 }
1428
1429                 int32_t data_acked = advanced;
1430
1431                 switch(c->state) {
1432                 case SYN_SENT:
1433                 case SYN_RECEIVED:
1434                         data_acked--;
1435                         break;
1436
1437                 // TODO: handle FIN as well.
1438                 default:
1439                         break;
1440                 }
1441
1442                 assert(data_acked >= 0);
1443
1444 #ifndef NDEBUG
1445                 int32_t bufused = seqdiff(c->snd.last, c->snd.una);
1446                 assert(data_acked <= bufused);
1447 #endif
1448
1449                 if(data_acked) {
1450                         buffer_discard(&c->sndbuf, data_acked);
1451                 }
1452
1453                 // Also advance snd.nxt if possible
1454                 if(seqdiff(c->snd.nxt, hdr.ack) < 0) {
1455                         c->snd.nxt = hdr.ack;
1456                 }
1457
1458                 c->snd.una = hdr.ack;
1459
1460                 if(c->dupack) {
1461                         if(c->dupack >= 3) {
1462                                 debug(c, "fast recovery ended\n");
1463                                 c->snd.cwnd = c->snd.ssthresh;
1464                         }
1465
1466                         c->dupack = 0;
1467                 }
1468
1469                 // Increase the congestion window according to RFC 5681
1470                 if(c->snd.cwnd < c->snd.ssthresh) {
1471                         c->snd.cwnd += min(advanced, utcp->mss); // eq. 2
1472                 } else {
1473                         c->snd.cwnd += max(1, (utcp->mss * utcp->mss) / c->snd.cwnd); // eq. 3
1474                 }
1475
1476                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1477                         c->snd.cwnd = c->sndbuf.maxsize;
1478                 }
1479
1480                 debug_cwnd(c);
1481
1482                 // Check if we have sent a FIN that is now ACKed.
1483                 switch(c->state) {
1484                 case FIN_WAIT_1:
1485                         if(c->snd.una == c->snd.last) {
1486                                 set_state(c, FIN_WAIT_2);
1487                         }
1488
1489                         break;
1490
1491                 case CLOSING:
1492                         if(c->snd.una == c->snd.last) {
1493                                 gettimeofday(&c->conn_timeout, NULL);
1494                                 c->conn_timeout.tv_sec += utcp->timeout;
1495                                 set_state(c, TIME_WAIT);
1496                         }
1497
1498                         break;
1499
1500                 default:
1501                         break;
1502                 }
1503         } else {
1504                 if(!len && is_reliable(c) && c->snd.una != c->snd.last) {
1505                         c->dupack++;
1506                         debug(c, "duplicate ACK %d\n", c->dupack);
1507
1508                         if(c->dupack == 3) {
1509                                 // RFC 5681 fast recovery
1510                                 debug(c, "fast recovery started\n", c->dupack);
1511                                 uint32_t flightsize = seqdiff(c->snd.nxt, c->snd.una);
1512                                 c->snd.ssthresh = max(flightsize / 2, utcp->mss * 2); // eq. 4
1513                                 c->snd.cwnd = min(c->snd.ssthresh + 3 * utcp->mss, c->sndbuf.maxsize);
1514
1515                                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1516                                         c->snd.cwnd = c->sndbuf.maxsize;
1517                                 }
1518
1519                                 debug_cwnd(c);
1520
1521                                 fast_retransmit(c);
1522                         } else if(c->dupack > 3) {
1523                                 c->snd.cwnd += utcp->mss;
1524
1525                                 if(c->snd.cwnd > c->sndbuf.maxsize) {
1526                                         c->snd.cwnd = c->sndbuf.maxsize;
1527                                 }
1528
1529                                 debug_cwnd(c);
1530                         }
1531
1532                         // We got an ACK which indicates the other side did get one of our packets.
1533                         // Reset the retransmission timer to avoid going to slow start,
1534                         // but don't touch the connection timeout.
1535                         start_retransmit_timer(c);
1536                 }
1537         }
1538
1539         // 4. Update timers
1540
1541         if(advanced) {
1542                 if(c->snd.una == c->snd.last) {
1543                         stop_retransmit_timer(c);
1544                         timerclear(&c->conn_timeout);
1545                 } else if(is_reliable(c)) {
1546                         start_retransmit_timer(c);
1547                         gettimeofday(&c->conn_timeout, NULL);
1548                         c->conn_timeout.tv_sec += utcp->timeout;
1549                 }
1550         }
1551
1552 skip_ack:
1553         // 5. Process SYN stuff
1554
1555         if(hdr.ctl & SYN) {
1556                 switch(c->state) {
1557                 case SYN_SENT:
1558
1559                         // This is a SYNACK. It should always have ACKed the SYN.
1560                         if(!advanced) {
1561                                 goto reset;
1562                         }
1563
1564                         c->rcv.irs = hdr.seq;
1565                         c->rcv.nxt = hdr.seq;
1566
1567                         if(c->shut_wr) {
1568                                 c->snd.last++;
1569                                 set_state(c, FIN_WAIT_1);
1570                         } else {
1571                                 set_state(c, ESTABLISHED);
1572                         }
1573
1574                         // TODO: notify application of this somehow.
1575                         break;
1576
1577                 case SYN_RECEIVED:
1578                         // This is a retransmit of a SYN, send back the SYNACK.
1579                         goto synack;
1580
1581                 case ESTABLISHED:
1582                 case FIN_WAIT_1:
1583                 case FIN_WAIT_2:
1584                 case CLOSE_WAIT:
1585                 case CLOSING:
1586                 case LAST_ACK:
1587                 case TIME_WAIT:
1588                         // Ehm, no. We should never receive a second SYN.
1589                         return 0;
1590
1591                 default:
1592 #ifdef UTCP_DEBUG
1593                         abort();
1594 #endif
1595                         return 0;
1596                 }
1597
1598                 // SYN counts as one sequence number
1599                 c->rcv.nxt++;
1600         }
1601
1602         // 6. Process new data
1603
1604         if(c->state == SYN_RECEIVED) {
1605                 // This is the ACK after the SYNACK. It should always have ACKed the SYNACK.
1606                 if(!advanced) {
1607                         goto reset;
1608                 }
1609
1610                 // Are we still LISTENing?
1611                 if(utcp->accept) {
1612                         utcp->accept(c, c->src);
1613                 }
1614
1615                 if(c->state != ESTABLISHED) {
1616                         set_state(c, CLOSED);
1617                         c->reapable = true;
1618                         goto reset;
1619                 }
1620         }
1621
1622         if(len) {
1623                 switch(c->state) {
1624                 case SYN_SENT:
1625                 case SYN_RECEIVED:
1626                         // This should never happen.
1627 #ifdef UTCP_DEBUG
1628                         abort();
1629 #endif
1630                         return 0;
1631
1632                 case ESTABLISHED:
1633                 case FIN_WAIT_1:
1634                 case FIN_WAIT_2:
1635                         break;
1636
1637                 case CLOSE_WAIT:
1638                 case CLOSING:
1639                 case LAST_ACK:
1640                 case TIME_WAIT:
1641                         // Ehm no, We should never receive more data after a FIN.
1642                         goto reset;
1643
1644                 default:
1645 #ifdef UTCP_DEBUG
1646                         abort();
1647 #endif
1648                         return 0;
1649                 }
1650
1651                 handle_incoming_data(c, hdr.seq, ptr, len);
1652         }
1653
1654         // 7. Process FIN stuff
1655
1656         if((hdr.ctl & FIN) && (!is_reliable(c) || hdr.seq + len == c->rcv.nxt)) {
1657                 switch(c->state) {
1658                 case SYN_SENT:
1659                 case SYN_RECEIVED:
1660                         // This should never happen.
1661 #ifdef UTCP_DEBUG
1662                         abort();
1663 #endif
1664                         break;
1665
1666                 case ESTABLISHED:
1667                         set_state(c, CLOSE_WAIT);
1668                         break;
1669
1670                 case FIN_WAIT_1:
1671                         set_state(c, CLOSING);
1672                         break;
1673
1674                 case FIN_WAIT_2:
1675                         gettimeofday(&c->conn_timeout, NULL);
1676                         c->conn_timeout.tv_sec += utcp->timeout;
1677                         set_state(c, TIME_WAIT);
1678                         break;
1679
1680                 case CLOSE_WAIT:
1681                 case CLOSING:
1682                 case LAST_ACK:
1683                 case TIME_WAIT:
1684                         // Ehm, no. We should never receive a second FIN.
1685                         goto reset;
1686
1687                 default:
1688 #ifdef UTCP_DEBUG
1689                         abort();
1690 #endif
1691                         break;
1692                 }
1693
1694                 // FIN counts as one sequence number
1695                 c->rcv.nxt++;
1696                 len++;
1697
1698                 // Inform the application that the peer closed its end of the connection.
1699                 if(c->recv) {
1700                         errno = 0;
1701                         c->recv(c, NULL, 0);
1702                 }
1703         }
1704
1705         // Now we send something back if:
1706         // - we received data, so we have to send back an ACK
1707         //   -> sendatleastone = true
1708         // - or we got an ack, so we should maybe send a bit more data
1709         //   -> sendatleastone = false
1710
1711         if(is_reliable(c) || hdr.ctl & SYN || hdr.ctl & FIN) {
1712                 ack(c, has_data);
1713         }
1714
1715         return 0;
1716
1717 reset:
1718         swap_ports(&hdr);
1719         hdr.wnd = 0;
1720         hdr.aux = 0;
1721
1722         if(hdr.ctl & ACK) {
1723                 hdr.seq = hdr.ack;
1724                 hdr.ctl = RST;
1725         } else {
1726                 hdr.ack = hdr.seq + len;
1727                 hdr.seq = 0;
1728                 hdr.ctl = RST | ACK;
1729         }
1730
1731         print_packet(c, "send", &hdr, sizeof(hdr));
1732         utcp->send(utcp, &hdr, sizeof(hdr));
1733         return 0;
1734
1735 }
1736
1737 int utcp_shutdown(struct utcp_connection *c, int dir) {
1738         debug(c, "shutdown %d at %u\n", dir, c ? c->snd.last : 0);
1739
1740         if(!c) {
1741                 errno = EFAULT;
1742                 return -1;
1743         }
1744
1745         if(c->reapable) {
1746                 debug(c, "shutdown() called on closed connection\n");
1747                 errno = EBADF;
1748                 return -1;
1749         }
1750
1751         if(!(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_WR || dir == UTCP_SHUT_RDWR)) {
1752                 errno = EINVAL;
1753                 return -1;
1754         }
1755
1756         // TCP does not have a provision for stopping incoming packets.
1757         // The best we can do is to just ignore them.
1758         if(dir == UTCP_SHUT_RD || dir == UTCP_SHUT_RDWR) {
1759                 c->recv = NULL;
1760         }
1761
1762         // The rest of the code deals with shutting down writes.
1763         if(dir == UTCP_SHUT_RD) {
1764                 return 0;
1765         }
1766
1767         // Only process shutting down writes once.
1768         if(c->shut_wr) {
1769                 return 0;
1770         }
1771
1772         c->shut_wr = true;
1773
1774         switch(c->state) {
1775         case CLOSED:
1776         case LISTEN:
1777                 errno = ENOTCONN;
1778                 return -1;
1779
1780         case SYN_SENT:
1781                 return 0;
1782
1783         case SYN_RECEIVED:
1784         case ESTABLISHED:
1785                 set_state(c, FIN_WAIT_1);
1786                 break;
1787
1788         case FIN_WAIT_1:
1789         case FIN_WAIT_2:
1790                 return 0;
1791
1792         case CLOSE_WAIT:
1793                 set_state(c, CLOSING);
1794                 break;
1795
1796         case CLOSING:
1797         case LAST_ACK:
1798         case TIME_WAIT:
1799                 return 0;
1800         }
1801
1802         c->snd.last++;
1803
1804         ack(c, false);
1805
1806         if(!timerisset(&c->rtrx_timeout)) {
1807                 start_retransmit_timer(c);
1808         }
1809
1810         return 0;
1811 }
1812
1813 static bool reset_connection(struct utcp_connection *c) {
1814         if(!c) {
1815                 errno = EFAULT;
1816                 return false;
1817         }
1818
1819         if(c->reapable) {
1820                 debug(c, "abort() called on closed connection\n");
1821                 errno = EBADF;
1822                 return false;
1823         }
1824
1825         c->recv = NULL;
1826         c->poll = NULL;
1827
1828         switch(c->state) {
1829         case CLOSED:
1830                 return true;
1831
1832         case LISTEN:
1833         case SYN_SENT:
1834         case CLOSING:
1835         case LAST_ACK:
1836         case TIME_WAIT:
1837                 set_state(c, CLOSED);
1838                 return true;
1839
1840         case SYN_RECEIVED:
1841         case ESTABLISHED:
1842         case FIN_WAIT_1:
1843         case FIN_WAIT_2:
1844         case CLOSE_WAIT:
1845                 set_state(c, CLOSED);
1846                 break;
1847         }
1848
1849         // Send RST
1850
1851         struct hdr hdr;
1852
1853         hdr.src = c->src;
1854         hdr.dst = c->dst;
1855         hdr.seq = c->snd.nxt;
1856         hdr.ack = 0;
1857         hdr.wnd = 0;
1858         hdr.ctl = RST;
1859
1860         print_packet(c, "send", &hdr, sizeof(hdr));
1861         c->utcp->send(c->utcp, &hdr, sizeof(hdr));
1862         return true;
1863 }
1864
1865 // Closes all the opened connections
1866 void utcp_abort_all_connections(struct utcp *utcp) {
1867         if(!utcp) {
1868                 errno = EINVAL;
1869                 return;
1870         }
1871
1872         for(int i = 0; i < utcp->nconnections; i++) {
1873                 struct utcp_connection *c = utcp->connections[i];
1874
1875                 if(c->reapable || c->state == CLOSED) {
1876                         continue;
1877                 }
1878
1879                 utcp_recv_t old_recv = c->recv;
1880                 utcp_poll_t old_poll = c->poll;
1881
1882                 reset_connection(c);
1883
1884                 if(old_recv) {
1885                         errno = 0;
1886                         old_recv(c, NULL, 0);
1887                 }
1888
1889                 if(old_poll && !c->reapable) {
1890                         errno = 0;
1891                         old_poll(c, 0);
1892                 }
1893         }
1894
1895         return;
1896 }
1897
1898 int utcp_close(struct utcp_connection *c) {
1899         if(utcp_shutdown(c, SHUT_RDWR) && errno != ENOTCONN) {
1900                 return -1;
1901         }
1902
1903         c->recv = NULL;
1904         c->poll = NULL;
1905         c->reapable = true;
1906         return 0;
1907 }
1908
1909 int utcp_abort(struct utcp_connection *c) {
1910         if(!reset_connection(c)) {
1911                 return -1;
1912         }
1913
1914         c->reapable = true;
1915         return 0;
1916 }
1917
1918 /* Handle timeouts.
1919  * One call to this function will loop through all connections,
1920  * checking if something needs to be resent or not.
1921  * The return value is the time to the next timeout in milliseconds,
1922  * or maybe a negative value if the timeout is infinite.
1923  */
1924 struct timeval utcp_timeout(struct utcp *utcp) {
1925         struct timeval now;
1926         gettimeofday(&now, NULL);
1927         struct timeval next = {now.tv_sec + 3600, now.tv_usec};
1928
1929         for(int i = 0; i < utcp->nconnections; i++) {
1930                 struct utcp_connection *c = utcp->connections[i];
1931
1932                 if(!c) {
1933                         continue;
1934                 }
1935
1936                 // delete connections that have been utcp_close()d.
1937                 if(c->state == CLOSED) {
1938                         if(c->reapable) {
1939                                 debug(c, "reaping\n");
1940                                 free_connection(c);
1941                                 i--;
1942                         }
1943
1944                         continue;
1945                 }
1946
1947                 if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &now, <)) {
1948                         errno = ETIMEDOUT;
1949                         c->state = CLOSED;
1950
1951                         if(c->recv) {
1952                                 c->recv(c, NULL, 0);
1953                         }
1954
1955                         if(c->poll && !c->reapable) {
1956                                 c->poll(c, 0);
1957                         }
1958
1959                         continue;
1960                 }
1961
1962                 if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &now, <)) {
1963                         debug(c, "retransmitting after timeout\n");
1964                         retransmit(c);
1965                 }
1966
1967                 if(c->poll) {
1968                         if((c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
1969                                 uint32_t len =  buffer_free(&c->sndbuf);
1970
1971                                 if(len) {
1972                                         c->poll(c, len);
1973                                 }
1974                         } else if(c->state == CLOSED) {
1975                                 c->poll(c, 0);
1976                         }
1977                 }
1978
1979                 if(timerisset(&c->conn_timeout) && timercmp(&c->conn_timeout, &next, <)) {
1980                         next = c->conn_timeout;
1981                 }
1982
1983                 if(timerisset(&c->rtrx_timeout) && timercmp(&c->rtrx_timeout, &next, <)) {
1984                         next = c->rtrx_timeout;
1985                 }
1986         }
1987
1988         struct timeval diff;
1989
1990         timersub(&next, &now, &diff);
1991
1992         return diff;
1993 }
1994
1995 bool utcp_is_active(struct utcp *utcp) {
1996         if(!utcp) {
1997                 return false;
1998         }
1999
2000         for(int i = 0; i < utcp->nconnections; i++)
2001                 if(utcp->connections[i]->state != CLOSED && utcp->connections[i]->state != TIME_WAIT) {
2002                         return true;
2003                 }
2004
2005         return false;
2006 }
2007
2008 struct utcp *utcp_init(utcp_accept_t accept, utcp_pre_accept_t pre_accept, utcp_send_t send, void *priv) {
2009         if(!send) {
2010                 errno = EFAULT;
2011                 return NULL;
2012         }
2013
2014         struct utcp *utcp = calloc(1, sizeof(*utcp));
2015
2016         if(!utcp) {
2017                 return NULL;
2018         }
2019
2020         utcp->accept = accept;
2021         utcp->pre_accept = pre_accept;
2022         utcp->send = send;
2023         utcp->priv = priv;
2024         utcp_set_mtu(utcp, DEFAULT_MTU);
2025         utcp->timeout = DEFAULT_USER_TIMEOUT; // sec
2026         utcp->rto = START_RTO; // usec
2027
2028         return utcp;
2029 }
2030
2031 void utcp_exit(struct utcp *utcp) {
2032         if(!utcp) {
2033                 return;
2034         }
2035
2036         for(int i = 0; i < utcp->nconnections; i++) {
2037                 struct utcp_connection *c = utcp->connections[i];
2038
2039                 if(!c->reapable) {
2040                         if(c->recv) {
2041                                 c->recv(c, NULL, 0);
2042                         }
2043
2044                         if(c->poll && !c->reapable) {
2045                                 c->poll(c, 0);
2046                         }
2047                 }
2048
2049                 buffer_exit(&c->rcvbuf);
2050                 buffer_exit(&c->sndbuf);
2051                 free(c);
2052         }
2053
2054         free(utcp->connections);
2055         free(utcp);
2056 }
2057
2058 uint16_t utcp_get_mtu(struct utcp *utcp) {
2059         return utcp ? utcp->mtu : 0;
2060 }
2061
2062 uint16_t utcp_get_mss(struct utcp *utcp) {
2063         return utcp ? utcp->mss : 0;
2064 }
2065
2066 void utcp_set_mtu(struct utcp *utcp, uint16_t mtu) {
2067         if(!utcp) {
2068                 return;
2069         }
2070
2071         if(mtu <= sizeof(struct hdr)) {
2072                 return;
2073         }
2074
2075         if(mtu > utcp->mtu) {
2076                 char *new = realloc(utcp->pkt, mtu + sizeof(struct hdr));
2077
2078                 if(!new) {
2079                         return;
2080                 }
2081
2082                 utcp->pkt = new;
2083         }
2084
2085         utcp->mtu = mtu;
2086         utcp->mss = mtu - sizeof(struct hdr);
2087 }
2088
2089 void utcp_reset_timers(struct utcp *utcp) {
2090         if(!utcp) {
2091                 return;
2092         }
2093
2094         struct timeval now, then;
2095
2096         gettimeofday(&now, NULL);
2097
2098         then = now;
2099
2100         then.tv_sec += utcp->timeout;
2101
2102         for(int i = 0; i < utcp->nconnections; i++) {
2103                 struct utcp_connection *c = utcp->connections[i];
2104
2105                 if(c->reapable) {
2106                         continue;
2107                 }
2108
2109                 if(timerisset(&c->rtrx_timeout)) {
2110                         c->rtrx_timeout = now;
2111                 }
2112
2113                 if(timerisset(&c->conn_timeout)) {
2114                         c->conn_timeout = then;
2115                 }
2116
2117                 c->rtt_start.tv_sec = 0;
2118         }
2119
2120         if(utcp->rto > START_RTO) {
2121                 utcp->rto = START_RTO;
2122         }
2123 }
2124
2125 int utcp_get_user_timeout(struct utcp *u) {
2126         return u ? u->timeout : 0;
2127 }
2128
2129 void utcp_set_user_timeout(struct utcp *u, int timeout) {
2130         if(u) {
2131                 u->timeout = timeout;
2132         }
2133 }
2134
2135 size_t utcp_get_sndbuf(struct utcp_connection *c) {
2136         return c ? c->sndbuf.maxsize : 0;
2137 }
2138
2139 size_t utcp_get_sndbuf_free(struct utcp_connection *c) {
2140         if(!c) {
2141                 return 0;
2142         }
2143
2144         switch(c->state) {
2145         case SYN_SENT:
2146         case SYN_RECEIVED:
2147         case ESTABLISHED:
2148         case CLOSE_WAIT:
2149                 return buffer_free(&c->sndbuf);
2150
2151         default:
2152                 return 0;
2153         }
2154 }
2155
2156 void utcp_set_sndbuf(struct utcp_connection *c, size_t size) {
2157         if(!c) {
2158                 return;
2159         }
2160
2161         c->sndbuf.maxsize = size;
2162
2163         if(c->sndbuf.maxsize != size) {
2164                 c->sndbuf.maxsize = -1;
2165         }
2166 }
2167
2168 size_t utcp_get_rcvbuf(struct utcp_connection *c) {
2169         return c ? c->rcvbuf.maxsize : 0;
2170 }
2171
2172 size_t utcp_get_rcvbuf_free(struct utcp_connection *c) {
2173         if(c && (c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
2174                 return buffer_free(&c->rcvbuf);
2175         } else {
2176                 return 0;
2177         }
2178 }
2179
2180 void utcp_set_rcvbuf(struct utcp_connection *c, size_t size) {
2181         if(!c) {
2182                 return;
2183         }
2184
2185         c->rcvbuf.maxsize = size;
2186
2187         if(c->rcvbuf.maxsize != size) {
2188                 c->rcvbuf.maxsize = -1;
2189         }
2190 }
2191
2192 size_t utcp_get_sendq(struct utcp_connection *c) {
2193         return c->sndbuf.used;
2194 }
2195
2196 size_t utcp_get_recvq(struct utcp_connection *c) {
2197         return c->rcvbuf.used;
2198 }
2199
2200 bool utcp_get_nodelay(struct utcp_connection *c) {
2201         return c ? c->nodelay : false;
2202 }
2203
2204 void utcp_set_nodelay(struct utcp_connection *c, bool nodelay) {
2205         if(c) {
2206                 c->nodelay = nodelay;
2207         }
2208 }
2209
2210 bool utcp_get_keepalive(struct utcp_connection *c) {
2211         return c ? c->keepalive : false;
2212 }
2213
2214 void utcp_set_keepalive(struct utcp_connection *c, bool keepalive) {
2215         if(c) {
2216                 c->keepalive = keepalive;
2217         }
2218 }
2219
2220 size_t utcp_get_outq(struct utcp_connection *c) {
2221         return c ? seqdiff(c->snd.nxt, c->snd.una) : 0;
2222 }
2223
2224 void utcp_set_recv_cb(struct utcp_connection *c, utcp_recv_t recv) {
2225         if(c) {
2226                 c->recv = recv;
2227         }
2228 }
2229
2230 void utcp_set_poll_cb(struct utcp_connection *c, utcp_poll_t poll) {
2231         if(c) {
2232                 c->poll = poll;
2233         }
2234 }
2235
2236 void utcp_set_accept_cb(struct utcp *utcp, utcp_accept_t accept, utcp_pre_accept_t pre_accept) {
2237         if(utcp) {
2238                 utcp->accept = accept;
2239                 utcp->pre_accept = pre_accept;
2240         }
2241 }
2242
2243 void utcp_expect_data(struct utcp_connection *c, bool expect) {
2244         if(!c || c->reapable) {
2245                 return;
2246         }
2247
2248         if(!(c->state == ESTABLISHED || c->state == FIN_WAIT_1 || c->state == FIN_WAIT_2)) {
2249                 return;
2250         }
2251
2252         if(expect) {
2253                 // If we expect data, start the connection timer.
2254                 if(!timerisset(&c->conn_timeout)) {
2255                         gettimeofday(&c->conn_timeout, NULL);
2256                         c->conn_timeout.tv_sec += c->utcp->timeout;
2257                 }
2258         } else {
2259                 // If we want to cancel expecting data, only clear the timer when there is no unACKed data.
2260                 if(c->snd.una == c->snd.last) {
2261                         timerclear(&c->conn_timeout);
2262                 }
2263         }
2264 }
2265
2266 void utcp_offline(struct utcp *utcp, bool offline) {
2267         struct timeval now;
2268         gettimeofday(&now, NULL);
2269
2270         for(int i = 0; i < utcp->nconnections; i++) {
2271                 struct utcp_connection *c = utcp->connections[i];
2272
2273                 if(c->reapable) {
2274                         continue;
2275                 }
2276
2277                 utcp_expect_data(c, offline);
2278
2279                 if(!offline) {
2280                         if(timerisset(&c->rtrx_timeout)) {
2281                                 c->rtrx_timeout = now;
2282                         }
2283
2284                         utcp->connections[i]->rtt_start.tv_sec = 0;
2285                 }
2286         }
2287
2288         if(!offline && utcp->rto > START_RTO) {
2289                 utcp->rto = START_RTO;
2290         }
2291 }