From ed2ba5b07afb3f480abd3103ffc284ee085a0432 Mon Sep 17 00:00:00 2001 From: Simon Kelley Date: Mon, 26 Jan 2026 21:44:27 +0000 Subject: [PATCH] Optimise TCP send. In the DNS TCP code, there are a couple of places where we have a buffer containing a message which we need to send via TCP. The DNS protocol is that this is sent as <16-bit length in network order> Making two write calls, one for the length and one for the message causes the TCP stack to send two packets, one for each. A single packet containing both is preferable from a performance POV. Implement a scatter-gather version of our read_write() wrapper and use it where necessary to send TCP DNS messages. --- src/dnsmasq.h | 1 + src/forward.c | 32 ++++++++++++++------------ src/util.c | 63 ++++++++++++++++++++++++++++++++------------------- 3 files changed, 58 insertions(+), 38 deletions(-) diff --git a/src/dnsmasq.h b/src/dnsmasq.h index 0a5064c..d93ad16 100644 --- a/src/dnsmasq.h +++ b/src/dnsmasq.h @@ -1527,6 +1527,7 @@ int memcmp_masked(unsigned char *a, unsigned char *b, int len, int expand_buf(struct iovec *iov, size_t size); char *print_mac(char *buff, unsigned char *mac, int len); int read_write(int fd, unsigned char *packet, int size, int rw); +int read_writev(int fd, struct iovec *iov, int iovcnt, int rw); void close_fds(long max_fd, int spare1, int spare2, int spare3); int wildcard_match(const char* wildcard, const char* match); int wildcard_matchn(const char* wildcard, const char* match, int num); diff --git a/src/forward.c b/src/forward.c index 65bd080..272bee9 100644 --- a/src/forward.c +++ b/src/forward.c @@ -2029,9 +2029,9 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade int class, rclass, type, rtype; unsigned char *p; struct timeval tv; + struct iovec sendio[2]; #ifdef MSG_FASTOPEN struct msghdr msg; - struct iovec sendio[2]; #endif (void)mark; @@ -2044,6 +2044,12 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade GETSHORT(type, p); GETSHORT(class, p); + length = htons(qsize); + sendio[0].iov_base = &length; + sendio[0].iov_len = sizeof(length); + sendio[1].iov_base = header; + sendio[1].iov_len = qsize; + while (1) { int data_sent = 0, fatal = 0; @@ -2065,8 +2071,6 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade *servp = serv = daemon->serverarray[start]; retry: - length = htons(qsize); - if (serv->tcpfd == -1) { if ((serv->tcpfd = socket(serv->addr.sa.sa_family, SOCK_STREAM, 0)) == -1) @@ -2097,12 +2101,7 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade tv.tv_sec += TCP_TIMEOUT; setsockopt(serv->tcpfd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); #endif - #ifdef MSG_FASTOPEN - sendio[0].iov_base = (unsigned char *)&length; - sendio[0].iov_len = sizeof(length); - sendio[1].iov_base = (unsigned char *)header; - sendio[1].iov_len = qsize; msg.msg_name = &serv->addr.sa; msg.msg_namelen = sa_len(&serv->addr); msg.msg_iov = sendio; @@ -2138,11 +2137,9 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade serv->flags &= ~SERV_GOT_TCP; } - /* We us the _ONCE variant of read_write() here because we've set a timeout on the tcp socket + /* We use the _ONCE variant of read_write() here because we've set a timeout on the tcp socket and wish to abort if the whole data is not read/written within the timeout. */ - if ((!data_sent && - (!read_write(serv->tcpfd, (unsigned char *)&length, sizeof(length), RW_WRITE_ONCE) || - !read_write(serv->tcpfd, (unsigned char *)header, qsize, RW_WRITE_ONCE))) || + if ((!data_sent && !read_writev(serv->tcpfd, sendio, 2, RW_WRITE_ONCE)) || !read_write(serv->tcpfd, (unsigned char *)&length, sizeof(length), RW_READ_ONCE) || !expand_buf(recvbuff, (rsize = ntohs(length))) || !read_write(serv->tcpfd, recvbuff->iov_base, rsize, RW_READ_ONCE)) @@ -2376,7 +2373,8 @@ void tcp_request(int confd, time_t now, struct iovec *bigbuff, unsigned int mark = 0; int have_mark = 0; int first, last, filtered, do_stale = 0; - + struct iovec out_iov[2]; + bigbuff->iov_base = NULL; bigbuff->iov_len = 0; @@ -2746,9 +2744,13 @@ void tcp_request(int confd, time_t now, struct iovec *bigbuff, report_addresses(header, m, mark); #endif + /* use scatter-gather IO so that length doesn't end up in separate packet. */ out_len = htons(m); - if (!read_write(confd, (unsigned char *)&out_len, sizeof(out_len), RW_WRITE) | - !read_write(confd, bigbuff->iov_base, m, RW_WRITE)) + out_iov[0].iov_len = sizeof(out_len); + out_iov[0].iov_base = &out_len; + out_iov[1].iov_len = m; + out_iov[1].iov_base = bigbuff->iov_base; + if (!read_writev(confd, out_iov, 2, RW_WRITE)) break; /* If we answered with stale data, this process will now try and get fresh data into diff --git a/src/util.c b/src/util.c index b5c961b..75971d3 100644 --- a/src/util.c +++ b/src/util.c @@ -757,43 +757,60 @@ int retry_send(ssize_t rc) "once" fails on EAGAIN, as this a timeout. This indicates a timeout of a TCP socket. */ -int read_write(int fd, unsigned char *packet, int size, int rw) +int read_writev(int fd, struct iovec *iov, int iovcnt, int rw) { - ssize_t n, done; - - for (done = 0; done < size; done += n) - { - if (rw & 1) - n = read(fd, &packet[done], (size_t)(size - done)); - else - n = write(fd, &packet[done], (size_t)(size - done)); - - if (n == 0) - return 0; + int cur = 0; + ssize_t n, done = 0; + while (cur < iovcnt) + { + iov[cur].iov_len -= done; + iov[cur].iov_base = ((char *)iov[cur].iov_base) + done; + + if (rw & 1) + n = readv(fd, &iov[cur], iovcnt - cur); + else + n = writev(fd, &iov[cur], iovcnt - cur); + + iov[cur].iov_len += done; + iov[cur].iov_base = ((char *)iov[cur].iov_base) - done; + if (n == -1) { - n = 0; /* don't mess with counter when we loop. */ - if (errno == EINTR || errno == ENOMEM || errno == ENOBUFS) continue; - - if (errno == EAGAIN || errno == EWOULDBLOCK) - { - /* "once" variant */ - if (rw & 2) - return 0; - - continue; - } + + if (!(rw & 2) && (errno == EAGAIN || errno == EWOULDBLOCK)) + continue; return 0; } + + if (n == 0 && (rw & 1)) + return 0; + + done += n; + while ((size_t)done >= iov[cur].iov_len) + done -= iov[cur++].iov_len; } return 1; } +int read_write(int fd, unsigned char *packet, int size, int rw) +{ + struct iovec iov; + + /* size == 0 is not an error, just a NOOP. */ + if (size == 0) + return 1; + + iov.iov_len = (size_t)size; + iov.iov_base = packet; + + return read_writev(fd, &iov, 1, rw); +} + /* close all fds except STDIN, STDOUT and STDERR, spare1, spare2 and spare3 */ void close_fds(long max_fd, int spare1, int spare2, int spare3) {