UTF-8 Decoding & Encoding | v6ktw2 | sa.yona.la help | tags | register | sign in

UTF-8 Decoding & Encoding

reply

// U-00000000 - U-0000007F: 0xxxxxxx

// U-00000080 - U-000007FF: 110xxxxx 10xxxxxx

// U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx

// U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

// U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

// U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx


// Reports how many bytes a UTF-8 sequence occupies, judged solely from the
// value of its first byte.
//
// first - the first byte of the sequence.
//
// Returns a length from 1 to 6. The classification is a plain threshold
// comparison: every value in 0x80..0xDF yields 2, so a continuation byte
// (10xxxxxx) is not told apart from a 2-byte lead byte, and 0xFE/0xFF are
// reported as 6. No validation is performed.
byte GetUTF8ByteCount(byte first)
{
	// Test the longest forms first; each threshold is the smallest lead-byte
	// value of that sequence length.
	if (first >= 0xFC) return 6;
	if (first >= 0xF8) return 5;
	if (first >= 0xF0) return 4;
	if (first >= 0xE0) return 3;
	if (first >= 0x80) return 2;

	// Anything below 0x80 is a single-byte (ASCII) character.
	return 1;
}


// Decodes a UTF-8 sequence of at most three bytes into one wide character.
//
// first  - lead byte; the sequence length is obtained from GetUTF8ByteCount.
// second - first continuation byte (consulted for 2- and 3-byte sequences).
// third  - second continuation byte (consulted for 3-byte sequences only).
//
// Returns the decoded character, or 0 when the sequence is longer than three
// bytes or is malformed (a lead byte that is really a continuation byte, or a
// continuation byte that does not have the form 10xxxxxx). Because 0 is also
// the failure value, a decoded U+0000 cannot be distinguished from an error.
// Overlong encodings (e.g. lead bytes 0xC0/0xC1) are not rejected.
wchar_t DecodeUTF8(byte first, byte second, byte third)
{
	// Failure value; a character constant rather than the pointer macro NULL.
	const wchar_t invalid = L'\0';

	switch( GetUTF8ByteCount(first) )
	{
	case 1:
		return first;

	case 2:
		// The lead byte must be 110xxxxx (this rejects a stray continuation
		// byte, which the length lookup also reports as 2) and the trailing
		// byte must be 10xxxxxx.
		if ((first & 0xE0) != 0xC0 || (second & 0xC0) != 0x80) break;
		return static_cast<wchar_t>( ((first & 0x1F) << 6) | (second & 0x3F) );

	case 3:
		// Both trailing bytes must be 10xxxxxx.
		if ((second & 0xC0) != 0x80 || (third & 0xC0) != 0x80) break;
		return static_cast<wchar_t>(
			((first & 0x0F) << 12) | ((second & 0x3F) << 6) | (third & 0x3F) );

	default:
		// Sequences of four or more bytes cannot be expressed with three inputs.
		break;
	}

	return invalid;
}

std::vector<byte> EncodeUTF8(wchar_t wideChar)
{
	std::vector<byte> result;
	result.reserve(3);

	if (InRange(wideChar, 0x00, 0x7F)) result.push_back( wideChar );
	else if (InRange(wideChar, 0x80, 0x07FF)) {
		result.push_back( (wideChar >> 5) & 0x1F | 0xC0 );
		result.push_back( wideChar & 0x1F | 0x80 );
	}
	else if (InRange(wideChar, 0x0800, 0xFFFF)) {
		result.push_back( (wideChar >> 10) & 0x0F | 0xE0 );
		result.push_back( (wideChar >>  5) & 0x1F | 0x80 );
		result.push_back( wideChar & 0x1F | 0x80 );
	}

	return result;
}



-- by Sizuha

posted by v6ktw2 | reply (0) | trackback (0)

Trackback URL:
api | terms of service | privacy policy | support Copyright (C) 2017 HeartRails Inc. All Rights Reserved.