Rework utf-8 string concat function a little and make it more widely available.

This commit is contained in:
Josh Elsasser
2007-08-15 23:02:56 +00:00
parent 4b97e604b1
commit 7301b39c8b
3 changed files with 132 additions and 86 deletions

View File

@@ -729,3 +729,115 @@ tr_wait( uint64_t delay_milliseconds )
usleep( 1000 * delay_milliseconds );
#endif
}
/**
 * Classify the multi-byte sequence starting at p.
 *
 * Returns 2, 3 or 4 when p[0] is a UTF-8 lead byte followed by the matching
 * number of continuation bytes (10xxxxxx), and 0 otherwise.  Only the bit
 * patterns are checked; overlong or out-of-range code points are not
 * rejected.
 *
 * Continuation bytes are tested left to right with short-circuit &&, so the
 * terminating nul of the string stops the scan and nothing beyond it is read.
 */
static size_t
strlcat_utf8_seqlen( const char * p )
{
    if( ( p[0] & 0xE0 ) == 0xC0 && ( p[1] & 0xC0 ) == 0x80 )
    {
        return 2;
    }
    if( ( p[0] & 0xF0 ) == 0xE0 && ( p[1] & 0xC0 ) == 0x80 &&
        ( p[2] & 0xC0 ) == 0x80 )
    {
        return 3;
    }
    if( ( p[0] & 0xF8 ) == 0xF0 && ( p[1] & 0xC0 ) == 0x80 &&
        ( p[2] & 0xC0 ) == 0x80 && ( p[3] & 0xC0 ) == 0x80 )
    {
        return 4;
    }
    return 0;
}

/**
 * Append src to the nul-terminated string held in dest, producing UTF-8.
 *
 * ASCII bytes and multi-byte sequences that look like UTF-8 (a lead byte
 * followed by the right number of continuation bytes) are copied unchanged.
 * Any other byte with the high bit set is treated as ISO 8859-1 and expanded
 * to its 2-byte UTF-8 encoding.  Bytes of src equal to skip are dropped; pass
 * '\0' to drop nothing, since the terminating nul is never examined.
 *
 * dest  buffer of len bytes containing a nul-terminated string
 * src   nul-terminated string to append
 * len   total size of dest in bytes, including room for the terminating nul
 * skip  character to omit from src
 *
 * Output that does not fit is truncated on a whole-character boundary:
 * appending stops at the first character that would not fit completely.
 * The result is always nul-terminated.  If len is 0, or dest contains no nul
 * within its first len bytes, dest is left untouched.
 */
void
strlcat_utf8( void * dest, const void * src, size_t len, char skip )
{
    char       * s = dest;
    const char * p = src;
    size_t       seqlen;

    /* a zero-sized buffer cannot even hold the terminator */
    if( 0 == len )
    {
        return;
    }

    /* reserve one byte for the nul at the end */
    len--;

    /* Go to the end of the destination string, never walking past the
       buffer: an unterminated dest must not make len wrap around */
    while( s[0] )
    {
        if( 0 == len )
        {
            return;
        }
        s++;
        len--;
    }

    /* Now start appending, converting on the fly if necessary */
    while( p[0] )
    {
        /* skip over the requested character */
        if( skip == p[0] )
        {
            p++;
            continue;
        }

        /* ASCII is a 1-byte sequence; otherwise ask the classifier */
        seqlen = ( p[0] & 0x80 ) ? strlcat_utf8_seqlen( p ) : 1;

        if( 0 == seqlen )
        {
            /* ISO 8859-1 -> UTF-8 conversion: one input byte, two output */
            if( len < 2 )
            {
                break;
            }
            *(s++) = (char)( 0xC0 | ( ( *p & 0xFF ) >> 6 ) );
            *(s++) = (char)( 0x80 | ( *p & 0x3F ) );
            p++;
            len -= 2;
            continue;
        }

        /* never emit a partial character */
        if( len < seqlen )
        {
            break;
        }
        len -= seqlen;
        while( seqlen-- > 0 )
        {
            *(s++) = *(p++);
        }
    }

    /* the byte reserved above guarantees there is room for this */
    *s = '\0';
}
/**
 * Compute how many bytes a buffer needs to hold vstr once every byte that is
 * not part of a UTF-8 sequence has been expanded from ISO 8859-1 to UTF-8.
 *
 * The string is scanned character by character.  ASCII bytes and multi-byte
 * sequences consisting of a UTF-8 lead byte followed by the matching number
 * of continuation bytes are counted as they are.  Every other byte counts
 * twice, because its UTF-8 form takes two bytes.
 *
 * vstr     nul-terminated input string
 * changed  optional (may be NULL); set to 1 if at least one byte needs
 *          conversion, 0 otherwise
 *
 * Returns the required size in bytes, including the terminating nul.
 */
size_t
bufsize_utf8( const void * vstr, int * changed )
{
    const char * const base      = vstr;
    const char *       cur       = base;
    size_t             converted = 0;   /* bytes needing one extra byte */

    if( changed != NULL )
    {
        *changed = 0;
    }

    for( ; cur[0] != '\0'; )
    {
        size_t step;

        if( 0 == ( cur[0] & 0x80 ) )
        {
            /* ASCII character */
            step = 1;
        }
        else if( 0xC0 == ( cur[0] & 0xE0 ) && 0x80 == ( cur[1] & 0xC0 ) )
        {
            /* 2-bytes UTF-8 character */
            step = 2;
        }
        else if( 0xE0 == ( cur[0] & 0xF0 ) && 0x80 == ( cur[1] & 0xC0 ) &&
                 0x80 == ( cur[2] & 0xC0 ) )
        {
            /* 3-bytes UTF-8 character */
            step = 3;
        }
        else if( 0xF0 == ( cur[0] & 0xF8 ) && 0x80 == ( cur[1] & 0xC0 ) &&
                 0x80 == ( cur[2] & 0xC0 ) && 0x80 == ( cur[3] & 0xC0 ) )
        {
            /* 4-bytes UTF-8 character */
            step = 4;
        }
        else
        {
            /* ISO 8859-1 byte: one byte in, two bytes out */
            step = 1;
            converted++;
            if( changed != NULL )
            {
                *changed = 1;
            }
        }

        cur += step;
    }

    /* bytes scanned + one extra per converted byte + the terminating nul */
    return (size_t)( cur - base ) + converted + 1;
}