diff --git a/CHANGELOG b/CHANGELOG index f526acb..e05b8db 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,17 @@ +version 2.81 + Impove cache behaviour for TCP connections. For ease of + implementaion, dnsmasq has always forked a new process to handle + each incoming TCP connection. A side-effect of this is that + any DNS queries answered from TCP connections are not cached: + when TCP connections were rare, this was not a problem. + With the coming of DNSSEC, it's now the case that some + DNSSEC queries have answers which spill to TCP, and if, + for instance, this applies to the keys for the root then + those never get cached, and performance is very bad. + This fix passes cache entries back from the TCP child process to + the main server process, and fixes the problem. + + version 2.80 Add support for RFC 4039 DHCP rapid commit. Thanks to Ashram Method for the initial patch and motivation. diff --git a/src/blockdata.c b/src/blockdata.c index 72f0575..4aacfa1 100644 --- a/src/blockdata.c +++ b/src/blockdata.c @@ -61,7 +61,7 @@ void blockdata_report(void) blockdata_alloced * sizeof(struct blockdata)); } -struct blockdata *blockdata_alloc(char *data, size_t len) +static struct blockdata *blockdata_alloc_real(int fd, char *data, size_t len) { struct blockdata *block, *ret = NULL; struct blockdata **prev = &ret; @@ -89,8 +89,17 @@ struct blockdata *blockdata_alloc(char *data, size_t len) blockdata_hwm = blockdata_count; blen = len > KEYBLOCK_LEN ? KEYBLOCK_LEN : len; - memcpy(block->key, data, blen); - data += blen; + if (data) + { + memcpy(block->key, data, blen); + data += blen; + } + else if (!read_write(fd, block->key, blen, 1)) + { + /* failed read free partial chain */ + blockdata_free(ret); + return NULL; + } len -= blen; *prev = block; prev = &block->next; @@ -100,6 +109,10 @@ struct blockdata *blockdata_alloc(char *data, size_t len) return ret; } +struct blockdata *blockdata_alloc(char *data, size_t len) +{ + return blockdata_alloc_real(0, data, len); +} void blockdata_free(struct blockdata *blocks) { @@ -148,5 +161,21 @@ void *blockdata_retrieve(struct blockdata *block, size_t len, void *data) return data; } - + + +void blockdata_write(struct blockdata *block, size_t len, int fd) +{ + for (; len > 0 && block; block = block->next) + { + size_t blen = len > KEYBLOCK_LEN ? KEYBLOCK_LEN : len; + read_write(fd, block->key, blen, 0); + len -= blen; + } +} + +struct blockdata *blockdata_read(int fd, size_t len) +{ + return blockdata_alloc_real(fd, NULL, len); +} + #endif diff --git a/src/cache.c b/src/cache.c index 6daaf3e..39f6dbe 100644 --- a/src/cache.c +++ b/src/cache.c @@ -26,6 +26,8 @@ static union bigname *big_free = NULL; static int bignames_left, hash_size; static void make_non_terminals(struct crec *source); +static struct crec *really_insert(char *name, struct all_addr *addr, + time_t now, unsigned long ttl, unsigned short flags); /* type->string mapping: this is also used by the name-hash function as a mixing table. */ static const struct { @@ -464,16 +466,10 @@ void cache_start_insert(void) new_chain = NULL; insert_error = 0; } - + struct crec *cache_insert(char *name, struct all_addr *addr, time_t now, unsigned long ttl, unsigned short flags) { - struct crec *new, *target_crec = NULL; - union bigname *big_name = NULL; - int freed_all = flags & F_REVERSE; - int free_avail = 0; - unsigned int target_uid; - /* Don't log DNSSEC records here, done elsewhere */ if (flags & (F_IPV4 | F_IPV6 | F_CNAME)) { @@ -484,7 +480,20 @@ struct crec *cache_insert(char *name, struct all_addr *addr, if (daemon->min_cache_ttl != 0 && daemon->min_cache_ttl > ttl) ttl = daemon->min_cache_ttl; } + + return really_insert(name, addr, now, ttl, flags); +} + +static struct crec *really_insert(char *name, struct all_addr *addr, + time_t now, unsigned long ttl, unsigned short flags) +{ + struct crec *new, *target_crec = NULL; + union bigname *big_name = NULL; + int freed_all = flags & F_REVERSE; + int free_avail = 0; + unsigned int target_uid; + /* if previous insertion failed give up now. */ if (insert_error) return NULL; @@ -645,12 +654,185 @@ void cache_end_insert(void) cache_hash(new_chain); cache_link(new_chain); daemon->metrics[METRIC_DNS_CACHE_INSERTED]++; + + /* If we're a child process, send this cache entry up the pipe to the master. + The marshalling process is rather nasty. */ + if (daemon->pipe_to_parent != -1) + { + char *name = cache_get_name(new_chain); + ssize_t m = strlen(name); + unsigned short flags = new_chain->flags; +#ifdef HAVE_DNSSEC + u16 class = new_chain->uid; +#endif + + read_write(daemon->pipe_to_parent, (unsigned char *)&m, sizeof(m), 0); + read_write(daemon->pipe_to_parent, (unsigned char *)name, m, 0); + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->ttd, sizeof(new_chain->ttd), 0); + read_write(daemon->pipe_to_parent, (unsigned char *)&flags, sizeof(flags), 0); + + if (flags & (F_IPV4 | F_IPV6)) + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->addr, sizeof(new_chain->addr), 0); +#ifdef HAVE_DNSSEC + else if (flags & F_DNSKEY) + { + read_write(daemon->pipe_to_parent, (unsigned char *)&class, sizeof(class), 0); + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->addr.key.algo, sizeof(new_chain->addr.key.algo), 0); + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->addr.key.keytag, sizeof(new_chain->addr.key.keytag), 0); + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->addr.key.flags, sizeof(new_chain->addr.key.flags), 0); + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->addr.key.keylen, sizeof(new_chain->addr.key.keylen), 0); + blockdata_write(new_chain->addr.key.keydata, new_chain->addr.key.keylen, daemon->pipe_to_parent); + } + else if (flags & F_DS) + { + read_write(daemon->pipe_to_parent, (unsigned char *)&class, sizeof(class), 0); + /* A negative DS entry is possible and has no data, obviously. */ + if (!(flags & F_NEG)) + { + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->addr.ds.algo, sizeof(new_chain->addr.ds.algo), 0); + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->addr.ds.keytag, sizeof(new_chain->addr.ds.keytag), 0); + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->addr.ds.digest, sizeof(new_chain->addr.ds.digest), 0); + read_write(daemon->pipe_to_parent, (unsigned char *)&new_chain->addr.ds.keylen, sizeof(new_chain->addr.ds.keylen), 0); + blockdata_write(new_chain->addr.ds.keydata, new_chain->addr.ds.keylen, daemon->pipe_to_parent); + } + } +#endif + + } } + new_chain = tmp; } + + /* signal end of cache insert in master process */ + if (daemon->pipe_to_parent != -1) + { + ssize_t m = -1; + read_write(daemon->pipe_to_parent, (unsigned char *)&m, sizeof(m), 0); + } + new_chain = NULL; } + +/* A marshalled cache entry arrives on fd, read, unmarshall and insert into cache of master process. */ +int cache_recv_insert(time_t now, int fd) +{ + ssize_t m; + struct all_addr addr; + unsigned long ttl; + time_t ttd; + unsigned short flags; + struct crec *crecp = NULL; + + cache_start_insert(); + + while(1) + { + + if (!read_write(fd, (unsigned char *)&m, sizeof(m), 1)) + return 0; + + if (m == -1) + { + cache_end_insert(); + return 1; + } + + if (!read_write(fd, (unsigned char *)daemon->namebuff, m, 1) || + !read_write(fd, (unsigned char *)&ttd, sizeof(ttd), 1) || + !read_write(fd, (unsigned char *)&flags, sizeof(flags), 1)) + return 0; + + daemon->namebuff[m] = 0; + + ttl = difftime(ttd, now); + + if (flags & (F_IPV4 | F_IPV6)) + { + if (!read_write(fd, (unsigned char *)&addr, sizeof(addr), 1)) + return 0; + crecp = really_insert(daemon->namebuff, &addr, now, ttl, flags); + } + else if (flags & F_CNAME) + { + struct crec *newc = really_insert(daemon->namebuff, NULL, now, ttl, flags); + /* This relies on the fact the the target of a CNAME immediately preceeds + it because of the order of extraction in extract_addresses, and + the order reversal on the new_chain. */ + if (newc) + { + if (!crecp) + { + newc->addr.cname.target.cache = NULL; + /* anything other than zero, to avoid being mistaken for CNAME to interface-name */ + newc->addr.cname.uid = 1; + } + else + { + next_uid(crecp); + newc->addr.cname.target.cache = crecp; + newc->addr.cname.uid = crecp->uid; + } + } + } +#ifdef HAVE_DNSSEC + else if (flags & (F_DNSKEY | F_DS)) + { + unsigned short class, keylen, keyflags, keytag; + unsigned char algo, digest; + struct blockdata *keydata; + + if (!read_write(fd, (unsigned char *)&class, sizeof(class), 1)) + return 0; + /* Cache needs to known class for DNSSEC stuff */ + addr.addr.dnssec.class = class; + + crecp = really_insert(daemon->namebuff, &addr, now, ttl, flags); + + if (flags & F_DNSKEY) + { + if (!read_write(fd, (unsigned char *)&algo, sizeof(algo), 1) || + !read_write(fd, (unsigned char *)&keytag, sizeof(keytag), 1) || + !read_write(fd, (unsigned char *)&keyflags, sizeof(keyflags), 1) || + !read_write(fd, (unsigned char *)&keylen, sizeof(keylen), 1) || + !(keydata = blockdata_read(fd, keylen))) + return 0; + } + else if (!(flags & F_NEG)) + { + if (!read_write(fd, (unsigned char *)&algo, sizeof(algo), 1) || + !read_write(fd, (unsigned char *)&keytag, sizeof(keytag), 1) || + !read_write(fd, (unsigned char *)&digest, sizeof(digest), 1) || + !read_write(fd, (unsigned char *)&keylen, sizeof(keylen), 1) || + !(keydata = blockdata_read(fd, keylen))) + return 0; + } + + if (crecp) + { + if (flags & F_DNSKEY) + { + crecp->addr.key.algo = algo; + crecp->addr.key.keytag = keytag; + crecp->addr.key.flags = flags; + crecp->addr.key.keylen = keylen; + crecp->addr.key.keydata = keydata; + } + else if (!(flags & F_NEG)) + { + crecp->addr.ds.algo = algo; + crecp->addr.ds.keytag = keytag; + crecp->addr.ds.digest = digest; + crecp->addr.ds.keylen = keylen; + crecp->addr.ds.keydata = keydata; + } + } + } +#endif + } +} + int cache_find_non_terminal(char *name, time_t now) { struct crec *crecp; diff --git a/src/dnsmasq.c b/src/dnsmasq.c index 7fd33af..cd0194d 100644 --- a/src/dnsmasq.c +++ b/src/dnsmasq.c @@ -930,6 +930,10 @@ int main (int argc, char **argv) check_servers(); pid = getpid(); + + daemon->pipe_to_parent = -1; + for (i = 0; i < MAX_PROCS; i++) + daemon->tcp_pipes[i] = -1; #ifdef HAVE_INOTIFY /* Using inotify, have to select a resolv file at startup */ @@ -1611,7 +1615,7 @@ static int set_dns_listeners(time_t now) we don't need to explicitly arrange to wake up here */ if (listener->tcpfd != -1) for (i = 0; i < MAX_PROCS; i++) - if (daemon->tcp_pids[i] == 0) + if (daemon->tcp_pids[i] == 0 && daemon->tcp_pipes[i] == -1) { poll_listen(listener->tcpfd, POLLIN); break; @@ -1624,6 +1628,13 @@ static int set_dns_listeners(time_t now) } +#ifndef NO_FORK + if (!option_bool(OPT_DEBUG)) + for (i = 0; i < MAX_PROCS; i++) + if (daemon->tcp_pipes[i] != -1) + poll_listen(daemon->tcp_pipes[i], POLLIN); +#endif + return wait; } @@ -1632,7 +1643,10 @@ static void check_dns_listeners(time_t now) struct serverfd *serverfdp; struct listener *listener; int i; - +#ifndef NO_FORK + int pipefd[2]; +#endif + for (serverfdp = daemon->sfds; serverfdp; serverfdp = serverfdp->next) if (poll_check(serverfdp->fd, POLLIN)) reply_query(serverfdp->fd, serverfdp->source_addr.sa.sa_family, now); @@ -1642,7 +1656,26 @@ static void check_dns_listeners(time_t now) if (daemon->randomsocks[i].refcount != 0 && poll_check(daemon->randomsocks[i].fd, POLLIN)) reply_query(daemon->randomsocks[i].fd, daemon->randomsocks[i].family, now); - + +#ifndef NO_FORK + /* Races. The child process can die before we read all of the data from the + pipe, or vice versa. Therefore send tcp_pids to zero when we wait() the + process, and tcp_pipes to -1 and close the FD when we read the last + of the data - indicated by cache_recv_insert returning zero. + The order of these events is indeterminate, and both are needed + to free the process slot. Once the child process has gone, poll() + returns POLLHUP, not POLLIN, so have to check for both here. */ + if (!option_bool(OPT_DEBUG)) + for (i = 0; i < MAX_PROCS; i++) + if (daemon->tcp_pipes[i] != -1 && + poll_check(daemon->tcp_pipes[i], POLLIN | POLLHUP) && + !cache_recv_insert(now, daemon->tcp_pipes[i])) + { + close(daemon->tcp_pipes[i]); + daemon->tcp_pipes[i] = -1; + } +#endif + for (listener = daemon->listeners; listener; listener = listener->next) { if (listener->fd != -1 && poll_check(listener->fd, POLLIN)) @@ -1736,15 +1769,20 @@ static void check_dns_listeners(time_t now) while (retry_send(close(confd))); } #ifndef NO_FORK - else if (!option_bool(OPT_DEBUG) && (p = fork()) != 0) + else if (!option_bool(OPT_DEBUG) && pipe(pipefd) == 0 && (p = fork()) != 0) { - if (p != -1) + close(pipefd[1]); /* parent needs read pipe end. */ + if (p == -1) + close(pipefd[0]); + else { int i; + for (i = 0; i < MAX_PROCS; i++) - if (daemon->tcp_pids[i] == 0) + if (daemon->tcp_pids[i] == 0 && daemon->tcp_pipes[i] == -1) { daemon->tcp_pids[i] = p; + daemon->tcp_pipes[i] = pipefd[0]; break; } } @@ -1761,7 +1799,7 @@ static void check_dns_listeners(time_t now) int flags; struct in_addr netmask; int auth_dns; - + if (iface) { netmask = iface->netmask; @@ -1777,7 +1815,11 @@ static void check_dns_listeners(time_t now) /* Arrange for SIGALRM after CHILD_LIFETIME seconds to terminate the process. */ if (!option_bool(OPT_DEBUG)) - alarm(CHILD_LIFETIME); + { + alarm(CHILD_LIFETIME); + close(pipefd[0]); /* close read end in child. */ + daemon->pipe_to_parent = pipefd[1]; + } #endif /* start with no upstream connections. */ diff --git a/src/dnsmasq.h b/src/dnsmasq.h index f53e9a5..bc7921e 100644 --- a/src/dnsmasq.h +++ b/src/dnsmasq.h @@ -1091,6 +1091,8 @@ extern struct daemon { size_t packet_len; /* " " */ struct randfd *rfd_save; /* " " */ pid_t tcp_pids[MAX_PROCS]; + int tcp_pipes[MAX_PROCS]; + int pipe_to_parent; struct randfd randomsocks[RANDOM_SOCKS]; int v6pktinfo; struct addrlist *interface_addrs; /* list of all addresses/prefix lengths associated with all local interfaces */ @@ -1152,6 +1154,7 @@ struct crec *cache_find_by_name(struct crec *crecp, char *name, time_t now, unsigned int prot); void cache_end_insert(void); void cache_start_insert(void); +int cache_recv_insert(time_t now, int fd); struct crec *cache_insert(char *name, struct all_addr *addr, time_t now, unsigned long ttl, unsigned short flags); void cache_reload(void); @@ -1174,6 +1177,8 @@ void blockdata_init(void); void blockdata_report(void); struct blockdata *blockdata_alloc(char *data, size_t len); void *blockdata_retrieve(struct blockdata *block, size_t len, void *data); +struct blockdata *blockdata_read(int fd, size_t len); +void blockdata_write(struct blockdata *block, size_t len, int fd); void blockdata_free(struct blockdata *blocks); #endif