Optimise TCP send.

In the DNS TCP code, there are a couple of places where
we have a buffer containing a message which we need to
send via TCP. The DNS protocol is that this is sent as
<16-bit length in network order><message>

Making two write calls, one for the length and one
for the message causes the TCP stack to send two packets,
one for each. A single packet containing both is
preferable from a performance POV.

Implement a scatter-gather version of our read_write()
wrapper and use it where necessary to send TCP DNS messages.
This commit is contained in:
Simon Kelley
2026-01-26 21:44:27 +00:00
parent 729c16a8ac
commit ed2ba5b07a
3 changed files with 58 additions and 38 deletions

View File

@@ -1527,6 +1527,7 @@ int memcmp_masked(unsigned char *a, unsigned char *b, int len,
int expand_buf(struct iovec *iov, size_t size);
char *print_mac(char *buff, unsigned char *mac, int len);
int read_write(int fd, unsigned char *packet, int size, int rw);
int read_writev(int fd, struct iovec *iov, int iovcnt, int rw);
void close_fds(long max_fd, int spare1, int spare2, int spare3);
int wildcard_match(const char* wildcard, const char* match);
int wildcard_matchn(const char* wildcard, const char* match, int num);

View File

@@ -2029,9 +2029,9 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade
int class, rclass, type, rtype;
unsigned char *p;
struct timeval tv;
struct iovec sendio[2];
#ifdef MSG_FASTOPEN
struct msghdr msg;
struct iovec sendio[2];
#endif
(void)mark;
@@ -2044,6 +2044,12 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade
GETSHORT(type, p);
GETSHORT(class, p);
length = htons(qsize);
sendio[0].iov_base = &length;
sendio[0].iov_len = sizeof(length);
sendio[1].iov_base = header;
sendio[1].iov_len = qsize;
while (1)
{
int data_sent = 0, fatal = 0;
@@ -2065,8 +2071,6 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade
*servp = serv = daemon->serverarray[start];
retry:
length = htons(qsize);
if (serv->tcpfd == -1)
{
if ((serv->tcpfd = socket(serv->addr.sa.sa_family, SOCK_STREAM, 0)) == -1)
@@ -2097,12 +2101,7 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade
tv.tv_sec += TCP_TIMEOUT;
setsockopt(serv->tcpfd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
#endif
#ifdef MSG_FASTOPEN
sendio[0].iov_base = (unsigned char *)&length;
sendio[0].iov_len = sizeof(length);
sendio[1].iov_base = (unsigned char *)header;
sendio[1].iov_len = qsize;
msg.msg_name = &serv->addr.sa;
msg.msg_namelen = sa_len(&serv->addr);
msg.msg_iov = sendio;
@@ -2138,11 +2137,9 @@ static ssize_t tcp_talk(int first, int last, int start, struct dns_header *heade
serv->flags &= ~SERV_GOT_TCP;
}
/* We us the _ONCE variant of read_write() here because we've set a timeout on the tcp socket
/* We use the _ONCE variant of read_write() here because we've set a timeout on the tcp socket
and wish to abort if the whole data is not read/written within the timeout. */
if ((!data_sent &&
(!read_write(serv->tcpfd, (unsigned char *)&length, sizeof(length), RW_WRITE_ONCE) ||
!read_write(serv->tcpfd, (unsigned char *)header, qsize, RW_WRITE_ONCE))) ||
if ((!data_sent && !read_writev(serv->tcpfd, sendio, 2, RW_WRITE_ONCE)) ||
!read_write(serv->tcpfd, (unsigned char *)&length, sizeof(length), RW_READ_ONCE) ||
!expand_buf(recvbuff, (rsize = ntohs(length))) ||
!read_write(serv->tcpfd, recvbuff->iov_base, rsize, RW_READ_ONCE))
@@ -2376,7 +2373,8 @@ void tcp_request(int confd, time_t now, struct iovec *bigbuff,
unsigned int mark = 0;
int have_mark = 0;
int first, last, filtered, do_stale = 0;
struct iovec out_iov[2];
bigbuff->iov_base = NULL;
bigbuff->iov_len = 0;
@@ -2746,9 +2744,13 @@ void tcp_request(int confd, time_t now, struct iovec *bigbuff,
report_addresses(header, m, mark);
#endif
/* use scatter-gather IO so that length doesn't end up in separate packet. */
out_len = htons(m);
if (!read_write(confd, (unsigned char *)&out_len, sizeof(out_len), RW_WRITE) |
!read_write(confd, bigbuff->iov_base, m, RW_WRITE))
out_iov[0].iov_len = sizeof(out_len);
out_iov[0].iov_base = &out_len;
out_iov[1].iov_len = m;
out_iov[1].iov_base = bigbuff->iov_base;
if (!read_writev(confd, out_iov, 2, RW_WRITE))
break;
/* If we answered with stale data, this process will now try and get fresh data into

View File

@@ -757,43 +757,60 @@ int retry_send(ssize_t rc)
"once" fails on EAGAIN, as this a timeout.
This indicates a timeout of a TCP socket.
*/
int read_write(int fd, unsigned char *packet, int size, int rw)
int read_writev(int fd, struct iovec *iov, int iovcnt, int rw)
{
ssize_t n, done;
for (done = 0; done < size; done += n)
{
if (rw & 1)
n = read(fd, &packet[done], (size_t)(size - done));
else
n = write(fd, &packet[done], (size_t)(size - done));
if (n == 0)
return 0;
int cur = 0;
ssize_t n, done = 0;
while (cur < iovcnt)
{
iov[cur].iov_len -= done;
iov[cur].iov_base = ((char *)iov[cur].iov_base) + done;
if (rw & 1)
n = readv(fd, &iov[cur], iovcnt - cur);
else
n = writev(fd, &iov[cur], iovcnt - cur);
iov[cur].iov_len += done;
iov[cur].iov_base = ((char *)iov[cur].iov_base) - done;
if (n == -1)
{
n = 0; /* don't mess with counter when we loop. */
if (errno == EINTR || errno == ENOMEM || errno == ENOBUFS)
continue;
if (errno == EAGAIN || errno == EWOULDBLOCK)
{
/* "once" variant */
if (rw & 2)
return 0;
continue;
}
if (!(rw & 2) && (errno == EAGAIN || errno == EWOULDBLOCK))
continue;
return 0;
}
if (n == 0 && (rw & 1))
return 0;
done += n;
while ((size_t)done >= iov[cur].iov_len)
done -= iov[cur++].iov_len;
}
return 1;
}
int read_write(int fd, unsigned char *packet, int size, int rw)
{
struct iovec iov;
/* size == 0 is not an error, just a NOOP. */
if (size == 0)
return 1;
iov.iov_len = (size_t)size;
iov.iov_base = packet;
return read_writev(fd, &iov, 1, rw);
}
/* close all fds except STDIN, STDOUT and STDERR, spare1, spare2 and spare3 */
void close_fds(long max_fd, int spare1, int spare2, int spare3)
{