#undef poll
#endif
+#ifndef UTCP_CLOCK
+#if defined(CLOCK_MONOTONIC_RAW) && defined(__x86_64__)
+#define UTCP_CLOCK CLOCK_MONOTONIC_RAW
+#else
+#define UTCP_CLOCK CLOCK_MONOTONIC
+#endif
+#endif
+
// Compute r = a - b, normalizing the result so that 0 <= tv_nsec < 1e9.
// Without the borrow step, a->tv_nsec < b->tv_nsec yields a denormalized
// timespec with a negative tv_nsec, which breaks subsequent comparisons.
static void timespec_sub(const struct timespec *a, const struct timespec *b, struct timespec *r) {
	r->tv_sec = a->tv_sec - b->tv_sec;
	r->tv_nsec = a->tv_nsec - b->tv_nsec;

	if(r->tv_nsec < 0) {
		r->tv_sec--;
		r->tv_nsec += 1000000000L; // nanoseconds per second
	}
}
// Return (a - b) in microseconds.
// The previous version multiplied the seconds difference by 1e9 and then
// re-added the seconds difference instead of the nanoseconds difference.
static int32_t timespec_diff_usec(const struct timespec *a, const struct timespec *b) {
	return (a->tv_sec - b->tv_sec) * 1000000 + (a->tv_nsec - b->tv_nsec) / 1000;
}
static bool timespec_lt(const struct timespec *a, const struct timespec *b) {
// Reset a timer to the "unset" state (see timespec_isset()).
// Both fields are cleared so the struct is fully zeroed, not just tv_sec.
static void timespec_clear(struct timespec *a) {
	a->tv_sec = 0;
	a->tv_nsec = 0;
}
// A timer counts as "set" when its seconds field is nonzero;
// timespec_clear() zeroes it. tv_nsec alone never marks a timer as set.
static bool timespec_isset(const struct timespec *a) {
	return a->tv_sec != 0;
}
-static long CLOCK_GRANULARITY;
+static long CLOCK_GRANULARITY; // usec
static inline size_t min(size_t a, size_t b) {
return a < b ? a : b;
#define UTCP_DEBUG_DATALEN 20
#endif
-#ifndef UTCP_CLOCK
-#if defined(CLOCK_MONOTONIC_RAW) && defined(__x86_64__)
-#define UTCP_CLOCK CLOCK_MONOTONIC_RAW
-#else
-#define UTCP_CLOCK CLOCK_MONOTONIC
-#endif
-#endif
-
static void debug(struct utcp_connection *c, const char *format, ...) {
struct timespec tv;
char buf[1024];
*p = 0;
- debug(c, "%s: len %lu src %u dst %u seq %u ack %u wnd %u aux %x ctl %s%s%s%s data %s\n",
+ debug(c, "%s: len %lu src %u dst %u seq %u ack %u wnd %u aux %x ctl %s%s%s%s%s data %s\n",
dir, (unsigned long)len, hdr.src, hdr.dst, hdr.seq, hdr.ack, hdr.wnd, hdr.aux,
hdr.ctl & SYN ? "SYN" : "",
hdr.ctl & RST ? "RST" : "",
hdr.ctl & FIN ? "FIN" : "",
hdr.ctl & ACK ? "ACK" : "",
+ hdr.ctl & MF ? "MF" : "",
str
);
}
// [345.........|........012]
uint32_t tailsize = buf->size - buf->offset;
uint32_t newoffset = newsize - tailsize;
- memmove(buf + newoffset, buf + buf->offset, tailsize);
+ memmove(buf->data + newoffset, buf->data + buf->offset, tailsize);
buf->offset = newoffset;
}
uint32_t realoffset = buf->offset + offset;
- if(buf->size - buf->offset < offset) {
+ if(buf->size - buf->offset <= offset) {
// The offset wrapped
realoffset -= buf->size;
}
uint32_t realoffset = buf->offset + offset;
- if(buf->size - buf->offset < offset) {
+ if(buf->size - buf->offset <= offset) {
// The offset wrapped
realoffset -= buf->size;
}
return len;
}
+// Copy data from the buffer without removing it.
+static ssize_t buffer_call(struct utcp_connection *c, struct buffer *buf, size_t offset, size_t len) {
+ if(!c->recv) {
+ return len;
+ }
+
+ // Ensure we don't copy more than is actually stored in the buffer
+ if(offset >= buf->used) {
+ return 0;
+ }
+
+ if(buf->used - offset < len) {
+ len = buf->used - offset;
+ }
+
+ uint32_t realoffset = buf->offset + offset;
+
+ if(buf->size - buf->offset <= offset) {
+ // The offset wrapped
+ realoffset -= buf->size;
+ }
+
+ if(buf->size - realoffset < len) {
+ // The data is wrapped
+ ssize_t rx1 = c->recv(c, buf->data + realoffset, buf->size - realoffset);
+
+ if(rx1 < buf->size - realoffset) {
+ return rx1;
+ }
+
+ // The channel might have been closed by the previous callback
+ if(!c->recv) {
+ return len;
+ }
+
+ ssize_t rx2 = c->recv(c, buf->data, len - (buf->size - realoffset));
+
+ if(rx2 < 0) {
+ return rx2;
+ } else {
+ return rx1 + rx2;
+ }
+ } else {
+ return c->recv(c, buf->data + realoffset, len);
+ }
+}
+
// Discard data from the head of the circular buffer.
// Returns the number of bytes actually discarded (clamped to buf->used).
static ssize_t buffer_discard(struct buffer *buf, size_t len) {
	if(buf->used < len) {
		len = buf->used;
	}

	if(buf->size - buf->offset <= len) {
		// The discarded range wraps past the end of the backing store;
		// pre-subtract the size so the += below lands at the wrapped
		// position (unsigned arithmetic makes this well-defined).
		buf->offset -= buf->size;
	}

	if(buf->used == len) {
		// Buffer is now empty; reset the head to avoid needless wrapping
		buf->offset = 0;
	} else {
		buf->offset += len;
	}

	buf->used -= len;

	return len;
}
+static void buffer_clear(struct buffer *buf) {
+ buf->used = 0;
+ buf->offset = 0;
+}
+
static bool buffer_set_size(struct buffer *buf, uint32_t minsize, uint32_t maxsize) {
if(maxsize < minsize) {
maxsize = minsize;
}
static uint32_t buffer_free(const struct buffer *buf) {
- return buf->maxsize - buf->used;
+ return buf->maxsize > buf->used ? buf->maxsize - buf->used : 0;
}
// Connections are stored in a sorted list.
c->snd.cwnd = (utcp->mss > 2190 ? 2 : utcp->mss > 1095 ? 3 : 4) * utcp->mss;
c->snd.ssthresh = ~0;
debug_cwnd(c);
+ c->srtt = 0;
+ c->rttvar = 0;
+ c->rto = START_RTO;
c->utcp = utcp;
// Add it to the sorted list of connections
return;
}
- struct utcp *utcp = c->utcp;
-
- if(!utcp->srtt) {
- utcp->srtt = rtt;
- utcp->rttvar = rtt / 2;
+ if(!c->srtt) {
+ c->srtt = rtt;
+ c->rttvar = rtt / 2;
} else {
- utcp->rttvar = (utcp->rttvar * 3 + absdiff(utcp->srtt, rtt)) / 4;
- utcp->srtt = (utcp->srtt * 7 + rtt) / 8;
+ c->rttvar = (c->rttvar * 3 + absdiff(c->srtt, rtt)) / 4;
+ c->srtt = (c->srtt * 7 + rtt) / 8;
}
- utcp->rto = utcp->srtt + max(4 * utcp->rttvar, CLOCK_GRANULARITY);
+ c->rto = c->srtt + max(4 * c->rttvar, CLOCK_GRANULARITY);
- if(utcp->rto > MAX_RTO) {
- utcp->rto = MAX_RTO;
+ if(c->rto > MAX_RTO) {
+ c->rto = MAX_RTO;
}
- debug(c, "rtt %u srtt %u rttvar %u rto %u\n", rtt, utcp->srtt, utcp->rttvar, utcp->rto);
+ debug(c, "rtt %u srtt %u rttvar %u rto %u\n", rtt, c->srtt, c->rttvar, c->rto);
}
static void start_retransmit_timer(struct utcp_connection *c) {
clock_gettime(UTCP_CLOCK, &c->rtrx_timeout);
- uint32_t rto = c->utcp->rto;
+ uint32_t rto = c->rto;
while(rto > USEC_PER_SEC) {
c->rtrx_timeout.tv_sec++;
rto -= USEC_PER_SEC;
}
- c->rtrx_timeout.tv_nsec += c->utcp->rto * 1000;
+ c->rtrx_timeout.tv_nsec += rto * 1000;
if(c->rtrx_timeout.tv_nsec >= NSEC_PER_SEC) {
c->rtrx_timeout.tv_nsec -= NSEC_PER_SEC;
static void ack(struct utcp_connection *c, bool sendatleastone) {
int32_t left = seqdiff(c->snd.last, c->snd.nxt);
- int32_t cwndleft = min(c->snd.cwnd, c->snd.wnd) - seqdiff(c->snd.nxt, c->snd.una);
+ int32_t cwndleft = is_reliable(c) ? min(c->snd.cwnd, c->snd.wnd) - seqdiff(c->snd.nxt, c->snd.una) : MAX_UNRELIABLE_SIZE;
assert(left >= 0);
pkt->hdr.src = c->src;
pkt->hdr.dst = c->dst;
pkt->hdr.ack = c->rcv.nxt;
- pkt->hdr.wnd = c->rcvbuf.maxsize;
+ pkt->hdr.wnd = is_reliable(c) ? c->rcvbuf.maxsize : 0;
pkt->hdr.ctl = ACK;
pkt->hdr.aux = 0;
c->snd.nxt += seglen;
left -= seglen;
+ if(!is_reliable(c)) {
+ if(left) {
+ pkt->hdr.ctl |= MF;
+ } else {
+ pkt->hdr.ctl &= ~MF;
+ }
+ }
+
if(seglen && fin_wanted(c, c->snd.nxt)) {
seglen--;
pkt->hdr.ctl |= FIN;
print_packet(c, "send", pkt, sizeof(pkt->hdr) + seglen);
c->utcp->send(c->utcp, pkt, sizeof(pkt->hdr) + seglen);
+
+ if(left && !is_reliable(c)) {
+ pkt->hdr.wnd += seglen;
+ }
} while(left);
}
// Add data to send buffer.
- if(is_reliable(c) || (c->state != SYN_SENT && c->state != SYN_RECEIVED)) {
+ if(is_reliable(c)) {
len = buffer_put(&c->sndbuf, data, len);
+ } else if(c->state != SYN_SENT && c->state != SYN_RECEIVED) {
+ if(len > MAX_UNRELIABLE_SIZE || buffer_put(&c->sndbuf, data, len) != (ssize_t)len) {
+ errno = EMSGSIZE;
+ return -1;
+ }
} else {
return 0;
}
struct {
struct hdr hdr;
uint8_t data[];
- } *pkt;
-
- pkt = malloc(c->utcp->mtu);
-
- if(!pkt) {
- return;
- }
+ } *pkt = c->utcp->pkt;
pkt->hdr.src = c->src;
pkt->hdr.dst = c->dst;
default:
break;
}
-
- free(pkt);
}
static void retransmit(struct utcp_connection *c) {
struct utcp *utcp = c->utcp;
+ if(utcp->retransmit) {
+ utcp->retransmit(c);
+ }
+
struct {
struct hdr hdr;
uint8_t data[];
}
start_retransmit_timer(c);
- utcp->rto *= 2;
+ c->rto *= 2;
- if(utcp->rto > MAX_RTO) {
- utcp->rto = MAX_RTO;
+ if(c->rto > MAX_RTO) {
+ c->rto = MAX_RTO;
}
c->rtt_start.tv_sec = 0; // invalidate RTT timer
// Packet loss or reordering occured. Store the data in the buffer.
ssize_t rxd = buffer_put_at(&c->rcvbuf, offset, data, len);
- if(rxd < 0 || (size_t)rxd < len) {
- abort();
+ if(rxd <= 0) {
+ debug(c, "packet outside receive buffer, dropping\n");
+ return;
+ }
+
+ if((size_t)rxd < len) {
+ debug(c, "packet partially outside receive buffer\n");
+ len = rxd;
}
// Make note of where we put it.
}
static void handle_in_order(struct utcp_connection *c, const void *data, size_t len) {
- // Check if we can process out-of-order data now.
- if(c->sacks[0].len && len >= c->sacks[0].offset) { // TODO: handle overlap with second SACK
- debug(c, "incoming packet len %lu connected with SACK at %u\n", (unsigned long)len, c->sacks[0].offset);
- buffer_put_at(&c->rcvbuf, 0, data, len); // TODO: handle return value
- len = max(len, c->sacks[0].offset + c->sacks[0].len);
- data = c->rcvbuf.data;
- }
-
if(c->recv) {
ssize_t rxd = c->recv(c, data, len);
- if(rxd < 0 || (size_t)rxd != len) {
+ if(rxd != (ssize_t)len) {
// TODO: handle the application not accepting all data.
abort();
}
}
+ // Check if we can process out-of-order data now.
+ if(c->sacks[0].len && len >= c->sacks[0].offset) {
+ debug(c, "incoming packet len %lu connected with SACK at %u\n", (unsigned long)len, c->sacks[0].offset);
+
+ if(len < c->sacks[0].offset + c->sacks[0].len) {
+ size_t offset = len;
+ len = c->sacks[0].offset + c->sacks[0].len;
+ size_t remainder = len - offset;
+
+ ssize_t rxd = buffer_call(c, &c->rcvbuf, offset, remainder);
+
+ if(rxd != (ssize_t)remainder) {
+ // TODO: handle the application not accepting all data.
+ abort();
+ }
+ }
+ }
+
if(c->rcvbuf.used) {
sack_consume(c, len);
}
c->rcv.nxt += len;
}
+static void handle_unreliable(struct utcp_connection *c, const struct hdr *hdr, const void *data, size_t len) {
+ // Fast path for unfragmented packets
+ if(!hdr->wnd && !(hdr->ctl & MF)) {
+ if(c->recv) {
+ c->recv(c, data, len);
+ }
-static void handle_incoming_data(struct utcp_connection *c, uint32_t seq, const void *data, size_t len) {
- if(!is_reliable(c)) {
- c->recv(c, data, len);
- c->rcv.nxt = seq + len;
+ c->rcv.nxt = hdr->seq + len;
return;
}
- uint32_t offset = seqdiff(seq, c->rcv.nxt);
+ // Ensure reassembled packet are not larger than 64 kiB
+ if(hdr->wnd >= MAX_UNRELIABLE_SIZE || hdr->wnd + len > MAX_UNRELIABLE_SIZE) {
+ return;
+ }
- if(offset + len > c->rcvbuf.maxsize) {
- abort();
+ // Don't accept out of order fragments
+ if(hdr->wnd && hdr->seq != c->rcv.nxt) {
+ return;
+ }
+
+ // Reset the receive buffer for the first fragment
+ if(!hdr->wnd) {
+ buffer_clear(&c->rcvbuf);
+ }
+
+ ssize_t rxd = buffer_put_at(&c->rcvbuf, hdr->wnd, data, len);
+
+ if(rxd != (ssize_t)len) {
+ return;
}
+ // Send the packet if it's the final fragment
+ if(!(hdr->ctl & MF)) {
+ buffer_call(c, &c->rcvbuf, 0, hdr->wnd + len);
+ }
+
+ c->rcv.nxt = hdr->seq + len;
+}
+
+static void handle_incoming_data(struct utcp_connection *c, const struct hdr *hdr, const void *data, size_t len) {
+ if(!is_reliable(c)) {
+ handle_unreliable(c, hdr, data, len);
+ return;
+ }
+
+ uint32_t offset = seqdiff(hdr->seq, c->rcv.nxt);
+
if(offset) {
handle_out_of_order(c, offset, data, len);
} else {
// Drop packets with an unknown CTL flag
- if(hdr.ctl & ~(SYN | ACK | RST | FIN)) {
+ if(hdr.ctl & ~(SYN | ACK | RST | FIN | MF)) {
print_packet(NULL, "recv", data, len);
errno = EBADMSG;
return -1;
print_packet(c, "send", &pkt, sizeof(hdr));
utcp->send(utcp, &pkt, sizeof(hdr));
}
+
+ start_retransmit_timer(c);
} else {
// No, we don't want your packets, send a RST back
len = 1;
debug(c, "packet out of order, offset %u bytes", rcv_offset);
}
- if(rcv_offset >= 0) {
- c->rcv.nxt = hdr.seq + len;
- }
-
#endif
}
if(data_acked) {
buffer_discard(&c->sndbuf, data_acked);
+
+ if(is_reliable(c)) {
+ c->do_poll = true;
+ }
}
// Also advance snd.nxt if possible
}
c->rcv.irs = hdr.seq;
- c->rcv.nxt = hdr.seq;
+ c->rcv.nxt = hdr.seq + 1;
if(c->shut_wr) {
c->snd.last++;
set_state(c, ESTABLISHED);
}
- // TODO: notify application of this somehow.
break;
case SYN_RECEIVED:
case CLOSING:
case LAST_ACK:
case TIME_WAIT:
- // Ehm, no. We should never receive a second SYN.
- return 0;
+ // This could be a retransmission. Ignore the SYN flag, but send an ACK back.
+ break;
default:
#ifdef UTCP_DEBUG
#endif
return 0;
}
-
- // SYN counts as one sequence number
- c->rcv.nxt++;
}
// 6. Process new data
return 0;
}
- handle_incoming_data(c, hdr.seq, ptr, len);
+ handle_incoming_data(c, &hdr, ptr, len);
}
// 7. Process FIN stuff
}
if(c->poll) {
- if((c->state == ESTABLISHED || c->state == CLOSE_WAIT)) {
- uint32_t len = buffer_free(&c->sndbuf);
+ if((c->state == ESTABLISHED || c->state == CLOSE_WAIT) && c->do_poll) {
+ c->do_poll = false;
+ uint32_t len = buffer_free(&c->sndbuf);
if(len) {
c->poll(c, len);
return NULL;
}
+ utcp_set_mtu(utcp, DEFAULT_MTU);
+
+ if(!utcp->pkt) {
+ free(utcp);
+ return NULL;
+ }
+
if(!CLOCK_GRANULARITY) {
struct timespec res;
clock_getres(UTCP_CLOCK, &res);
- CLOCK_GRANULARITY = res.tv_sec * NSEC_PER_SEC + res.tv_nsec;
+ CLOCK_GRANULARITY = res.tv_sec * USEC_PER_SEC + res.tv_nsec / 1000;
}
utcp->accept = accept;
utcp->pre_accept = pre_accept;
utcp->send = send;
utcp->priv = priv;
- utcp_set_mtu(utcp, DEFAULT_MTU);
utcp->timeout = DEFAULT_USER_TIMEOUT; // sec
- utcp->rto = START_RTO; // usec
return utcp;
}
}
free(utcp->connections);
+ free(utcp->pkt);
free(utcp);
}
}
c->rtt_start.tv_sec = 0;
- }
- if(utcp->rto > START_RTO) {
- utcp->rto = START_RTO;
+ if(c->rto > START_RTO) {
+ c->rto = START_RTO;
+ }
}
}
if(c->sndbuf.maxsize != size) {
c->sndbuf.maxsize = -1;
}
+
+ c->do_poll = is_reliable(c) && buffer_free(&c->sndbuf);
}
size_t utcp_get_rcvbuf(struct utcp_connection *c) {
void utcp_set_poll_cb(struct utcp_connection *c, utcp_poll_t poll) {
if(c) {
c->poll = poll;
+ c->do_poll = is_reliable(c) && buffer_free(&c->sndbuf);
}
}
}
utcp->connections[i]->rtt_start.tv_sec = 0;
+
+ if(c->rto > START_RTO) {
+ c->rto = START_RTO;
+ }
}
}
+}
- if(!offline && utcp->rto > START_RTO) {
- utcp->rto = START_RTO;
- }
+void utcp_set_retransmit_cb(struct utcp *utcp, utcp_retransmit_t retransmit) {
+ utcp->retransmit = retransmit;
+}
+
+void utcp_set_clock_granularity(long granularity) {
+ CLOCK_GRANULARITY = granularity;
}