Improve performance when reading large hostfiles.

This commit is contained in:
Simon Kelley
2012-01-11 21:31:51 +00:00
parent 8ecfaa4adf
commit 205fafa577
2 changed files with 48 additions and 39 deletions

View File

@@ -635,11 +635,12 @@ struct crec *cache_find_by_addr(struct crec *crecp, struct all_addr *addr,
} }
static void add_hosts_entry(struct crec *cache, struct all_addr *addr, int addrlen, static void add_hosts_entry(struct crec *cache, struct all_addr *addr, int addrlen,
unsigned short flags, int index, int addr_dup) unsigned short flags, int index, struct crec **rhash)
{ {
struct crec *lookup = cache_find_by_name(NULL, cache->name.sname, 0, flags & (F_IPV4 | F_IPV6)); struct crec *lookup = cache_find_by_name(NULL, cache->name.sname, 0, flags & (F_IPV4 | F_IPV6));
int i, nameexists = 0; int i, nameexists = 0;
struct cname *a; struct cname *a;
unsigned int j;
/* Remove duplicates in hosts files. */ /* Remove duplicates in hosts files. */
if (lookup && (lookup->flags & F_HOSTS)) if (lookup && (lookup->flags & F_HOSTS))
@@ -653,33 +654,41 @@ static void add_hosts_entry(struct crec *cache, struct all_addr *addr, int addrl
} }
/* Ensure there is only one address -> name mapping (first one trumps) /* Ensure there is only one address -> name mapping (first one trumps)
We do this by steam here, first we see if the address is the same as We do this by steam here. The entries are kept in hash chains, linked
the last one we saw, which eliminates most in the case of an ad-block by ->next (which is unused at this point) held in hash buckets in
file with thousands of entries for the same address. the array rhash. Note that rhash and the values in ->next are only valid
Then we search and bail at the first matching address that came from whilst reading hosts files: the buckets are then freed, and the
a HOSTS file. Since the first host entry gets reverse, we know ->next pointer used for other things.
then that it must exist without searching exhaustively for it. */
if (addr_dup) We search and bail at the first matching address that came from
flags &= ~F_REVERSE; a HOSTS file. Since the first host entry gets reverse, we know
else then that it must exist without searching exhaustively for it.
for (i=0; i<hash_size; i++)
This complexity avoids O(n^2) divergent CPU use whilst reading
large (10000 entry) hosts files. */
/* hash address */
for (j = 0, i = 0; i < addrlen; i++)
j += ((unsigned char *)addr)[i] + (j << 6) + (j << 16) - j;
for (lookup = rhash[j % RHASHSIZE]; lookup; lookup = lookup->next)
if ((lookup->flags & F_HOSTS) &&
(lookup->flags & flags & (F_IPV4 | F_IPV6)) &&
memcmp(&lookup->addr.addr, addr, addrlen) == 0)
{ {
for (lookup = hash_table[i]; lookup; lookup = lookup->hash_next) flags &= ~F_REVERSE;
if ((lookup->flags & F_HOSTS) && break;
(lookup->flags & flags & (F_IPV4 | F_IPV6)) &&
memcmp(&lookup->addr.addr, addr, addrlen) == 0)
{
flags &= ~F_REVERSE;
break;
}
if (lookup)
break;
} }
cache->flags = flags; cache->flags = flags;
cache->uid = index; cache->uid = index;
/* maintain address hash chain */
cache->next = rhash[j % RHASHSIZE];
rhash[j % RHASHSIZE] = cache;
memcpy(&cache->addr.addr, addr, addrlen); memcpy(&cache->addr.addr, addr, addrlen);
cache_hash(cache); cache_hash(cache);
/* don't need to do alias stuff for second and subsequent addresses. */ /* don't need to do alias stuff for second and subsequent addresses. */
@@ -743,14 +752,14 @@ static int gettok(FILE *f, char *token)
} }
} }
static int read_hostsfile(char *filename, int index, int cache_size) static int read_hostsfile(char *filename, int index, int cache_size, struct crec **rhash)
{ {
FILE *f = fopen(filename, "r"); FILE *f = fopen(filename, "r");
char *token = daemon->namebuff, *domain_suffix = NULL; char *token = daemon->namebuff, *domain_suffix = NULL;
int addr_count = 0, name_count = cache_size, lineno = 0; int addr_count = 0, name_count = cache_size, lineno = 0;
unsigned short flags = 0, saved_flags = 0; unsigned short flags = 0;
struct all_addr addr, saved_addr; struct all_addr addr;
int atnl, addrlen = 0, addr_dup; int atnl, addrlen = 0;
if (!f) if (!f)
{ {
@@ -762,7 +771,6 @@ static int read_hostsfile(char *filename, int index, int cache_size)
while ((atnl = gettok(f, token)) != EOF) while ((atnl = gettok(f, token)) != EOF)
{ {
addr_dup = 0;
lineno++; lineno++;
#ifdef HAVE_IPV6 #ifdef HAVE_IPV6
@@ -794,14 +802,6 @@ static int read_hostsfile(char *filename, int index, int cache_size)
continue; continue;
} }
if (saved_flags == flags && memcmp(&addr, &saved_addr, addrlen) == 0)
addr_dup = 1;
else
{
saved_flags = flags;
saved_addr = addr;
}
addr_count++; addr_count++;
/* rehash every 1000 names. */ /* rehash every 1000 names. */
@@ -832,14 +832,13 @@ static int read_hostsfile(char *filename, int index, int cache_size)
strcpy(cache->name.sname, canon); strcpy(cache->name.sname, canon);
strcat(cache->name.sname, "."); strcat(cache->name.sname, ".");
strcat(cache->name.sname, domain_suffix); strcat(cache->name.sname, domain_suffix);
add_hosts_entry(cache, &addr, addrlen, flags, index, addr_dup); add_hosts_entry(cache, &addr, addrlen, flags, index, rhash);
addr_dup = 1;
name_count++; name_count++;
} }
if ((cache = whine_malloc(sizeof(struct crec) + strlen(canon)+1-SMALLDNAME))) if ((cache = whine_malloc(sizeof(struct crec) + strlen(canon)+1-SMALLDNAME)))
{ {
strcpy(cache->name.sname, canon); strcpy(cache->name.sname, canon);
add_hosts_entry(cache, &addr, addrlen, flags, index, addr_dup); add_hosts_entry(cache, &addr, addrlen, flags, index, rhash);
name_count++; name_count++;
} }
free(canon); free(canon);
@@ -863,6 +862,7 @@ void cache_reload(void)
struct crec *cache, **up, *tmp; struct crec *cache, **up, *tmp;
int i, total_size = daemon->cachesize; int i, total_size = daemon->cachesize;
struct hostsfile *ah; struct hostsfile *ah;
struct crec **reverse_hash;
cache_inserted = cache_live_freed = 0; cache_inserted = cache_live_freed = 0;
@@ -896,13 +896,21 @@ void cache_reload(void)
return; return;
} }
if (!(reverse_hash = whine_malloc(sizeof(struct crec *) * RHASHSIZE)))
return;
for (i = 0; i < RHASHSIZE; i++)
reverse_hash[i] = NULL;
if (!option_bool(OPT_NO_HOSTS)) if (!option_bool(OPT_NO_HOSTS))
total_size = read_hostsfile(HOSTSFILE, 0, total_size); total_size = read_hostsfile(HOSTSFILE, 0, total_size, reverse_hash);
daemon->addn_hosts = expand_filelist(daemon->addn_hosts); daemon->addn_hosts = expand_filelist(daemon->addn_hosts);
for (ah = daemon->addn_hosts; ah; ah = ah->next) for (ah = daemon->addn_hosts; ah; ah = ah->next)
if (!(ah->flags & AH_INACTIVE)) if (!(ah->flags & AH_INACTIVE))
total_size = read_hostsfile(ah->fname, ah->index, total_size); total_size = read_hostsfile(ah->fname, ah->index, total_size, reverse_hash);
free(reverse_hash);
} }
char *get_domain(struct in_addr addr) char *get_domain(struct in_addr addr)

View File

@@ -15,6 +15,7 @@
*/ */
#define FTABSIZ 150 /* max number of outstanding requests (default) */ #define FTABSIZ 150 /* max number of outstanding requests (default) */
#define RHASHSIZE 1024 /* hash buckets for address lookup during hostfile read */
#define MAX_PROCS 20 /* max no children for TCP requests */ #define MAX_PROCS 20 /* max no children for TCP requests */
#define CHILD_LIFETIME 150 /* secs 'till terminated (RFC1035 suggests > 120s) */ #define CHILD_LIFETIME 150 /* secs 'till terminated (RFC1035 suggests > 120s) */
#define EDNS_PKTSZ 4096 /* default max EDNS.0 UDP packet from RFC5625 */ #define EDNS_PKTSZ 4096 /* default max EDNS.0 UDP packet from RFC5625 */