| /****************************************************************************** | 
 |  | 
 |  @File         PVRTUnicode.cpp | 
 |  | 
 |  @Title        PVRTUnicode | 
 |  | 
 |  @Version       @Version       | 
 |  | 
 |  @Copyright    Copyright (c) Imagination Technologies Limited. | 
 |  | 
 |  @Platform     All | 
 |  | 
 |  @Description  A small collection of functions used to decode Unicode formats to | 
 |                individual code points. | 
 |  | 
 | ******************************************************************************/ | 
 | #include "PVRTUnicode.h" | 
 | #include <string.h> | 
 |  | 
 | /**************************************************************************** | 
 | ** Constants | 
 | ****************************************************************************/ | 
 | const PVRTuint32 c_u32ReplChar = 0xFFFD; | 
 |  | 
 | #define VALID_ASCII 0x80 | 
 | #define TAIL_MASK 0x3F | 
 | #define BYTES_PER_TAIL 6 | 
 |  | 
 | #define UTF16_SURG_H_MARK 0xD800 | 
 | #define UTF16_SURG_H_END  0xDBFF | 
 | #define UTF16_SURG_L_MARK 0xDC00 | 
 | #define UTF16_SURG_L_END  0xDFFF | 
 |  | 
 | #define UNICODE_NONCHAR_MARK 0xFDD0 | 
 | #define UNICODE_NONCHAR_END  0xFDEF | 
 | #define UNICODE_RESERVED	 0xFFFE | 
 | #define UNICODE_MAX			 0x10FFFF | 
 |  | 
 | #define MAX_LEN 0x8FFF | 
 |  | 
 | /**************************************************************************** | 
 | ** A table which allows quick lookup to determine the number of bytes of a  | 
 | ** UTF8 code point. | 
 | ****************************************************************************/ | 
 | const PVRTuint8 c_u8UTF8Lengths[256] =  | 
 | { | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | 
 | 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | 
 | 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | 
 | 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | 
 | 	3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0, | 
 | }; | 
 |  | 
 | /**************************************************************************** | 
 | ** A table which allows quick lookup to determine whether a UTF8 sequence | 
 | ** is 'overlong'. | 
 | ****************************************************************************/ | 
 | const PVRTuint32 c_u32MinVals[4] = | 
 | { | 
 | 	0x00000000,		// 0 tail bytes | 
 | 	0x00000080,		// 1 tail bytes | 
 | 	0x00000800,		// 2 tail bytes | 
 | 	0x00010000,		// 3 tail bytes | 
 | }; | 
 |  | 
 | /*!*************************************************************************** | 
 |  @Function			CheckGenericUnicode | 
 |  @Input				c32			A UTF32 character/Unicode code point | 
 |  @Returns			Success or failure.  | 
 |  @Description		Checks that the decoded code point is valid. | 
 | *****************************************************************************/ | 
 | static bool CheckGenericUnicode(PVRTuint32 c32) | 
 | { | 
 | 	// Check that this value isn't a UTF16 surrogate mask. | 
 | 	if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END) | 
 | 		return false; | 
 | 	// Check non-char values | 
 | 	if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END) | 
 | 		return false; | 
 | 	// Check reserved values | 
 | 	if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED) | 
 | 		return false; | 
 | 	// Check max value. | 
 | 	if(c32 > UNICODE_MAX) | 
 | 		return false; | 
 |  | 
 | 	return true; | 
 | } | 
 |  | 
 | /*!*************************************************************************** | 
 |  @Function			PVRTUnicodeUTF8ToUTF32 | 
 |  @Input				pUTF8			A UTF8 string, which is null terminated. | 
 |  @Output			aUTF32			An array of Unicode code points. | 
 |  @Returns			Success or failure.  | 
 |  @Description		Decodes a UTF8-encoded string in to Unicode code points | 
 | 					(UTF32). If pUTF8 is not null terminated, the results are  | 
 | 					undefined. | 
 | *****************************************************************************/ | 
 | EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)						 | 
 | { | 
 | 	unsigned int uiTailLen, uiIndex; | 
 | 	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8); | 
 | 	PVRTuint32 c32; | 
 |  | 
 | 	const PVRTuint8* pC = pUTF8; | 
 | 	while(*pC) | 
 | 	{ | 
 | 		// Quick optimisation for ASCII characters | 
 | 		while(*pC && *pC < VALID_ASCII) | 
 | 		{ | 
 | 			aUTF32.Append(*pC++); | 
 | 		} | 
 | 		// Done | 
 | 		if(!*pC)			 | 
 | 			break; | 
 |  | 
 | 		c32 = *pC++; | 
 | 		uiTailLen = c_u8UTF8Lengths[c32]; | 
 |  | 
 | 		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character. | 
 | 		// Also check to make sure the tail length is inside the provided buffer. | 
 | 		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes)) | 
 | 			return PVR_OVERFLOW; | 
 |  | 
 | 		c32 &= (TAIL_MASK >> uiTailLen);	// Get the data out of the first byte. This depends on the length of the tail. | 
 |  | 
 | 		// Get the data out of each tail byte | 
 | 		uiIndex = 0; | 
 | 		while(uiIndex < uiTailLen) | 
 | 		{ | 
 | 			if((pC[uiIndex] & 0xC0) != 0x80) | 
 | 				return PVR_FAIL;		// Invalid tail byte! | 
 |  | 
 | 			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK); | 
 | 			uiIndex++; | 
 | 		} | 
 |  | 
 | 		pC += uiIndex; | 
 |  | 
 | 		// Check overlong values. | 
 | 		if(c32 < c_u32MinVals[uiTailLen]) | 
 | 			return PVR_FAIL;		 | 
 | 		 | 
 | 		if(!CheckGenericUnicode(c32)) | 
 | 			return PVR_FAIL; | 
 |  | 
 | 		// OK | 
 | 		aUTF32.Append(c32); | 
 | 	} | 
 |  | 
 | 	return PVR_SUCCESS; | 
 | } | 
 |  | 
 | /*!*************************************************************************** | 
 |  @Function			PVRTUnicodeUTF16ToUTF32 | 
 |  @Input				pUTF16			A UTF16 string, which is null terminated. | 
 |  @Output			aUTF32			An array of Unicode code points. | 
 |  @Returns			Success or failure.  | 
 |  @Description		Decodes a UTF16-encoded string in to Unicode code points | 
 | 					(UTF32). If pUTF16 is not null terminated, the results are  | 
 | 					undefined. | 
 | *****************************************************************************/ | 
 | EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32) | 
 | { | 
 | 	const PVRTuint16* pC = pUTF16; | 
 |  | 
 | 	// Determine the number of shorts | 
 | 	while(*++pC && (pC - pUTF16) < MAX_LEN); | 
 | 	unsigned int uiBufferLen = (unsigned int) (pC - pUTF16); | 
 |  | 
 | 	if(uiBufferLen == MAX_LEN) | 
 | 		return PVR_OVERFLOW;		// Probably not NULL terminated.	 | 
 |  | 
 | 	// Reset to start. | 
 | 	pC = pUTF16; | 
 |  | 
 | 	PVRTuint32 c32; | 
 | 	while(*pC) | 
 | 	{ | 
 | 		// Straight copy. We'll check for surrogate pairs next... | 
 | 		c32 = *pC++; | 
 |  | 
 | 		// Check surrogate pair | 
 | 		if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END) | 
 | 		{ | 
 | 			// Make sure the next 2 bytes are in range... | 
 | 			if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0) | 
 | 				return PVR_OVERFLOW; | 
 |  | 
 | 			// Check that the next value is in the low surrogate range | 
 | 			if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END) | 
 | 				return PVR_FAIL; | 
 |  | 
 | 			// Decode | 
 | 			c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000; | 
 | 			pC++; | 
 | 		} | 
 |  | 
 | 		if(!CheckGenericUnicode(c32)) | 
 | 			return PVR_FAIL; | 
 |  | 
 | 		// OK | 
 | 		aUTF32.Append(c32); | 
 | 	} | 
 |  | 
 | 	return PVR_SUCCESS; | 
 | } | 
 |  | 
 | /*!*************************************************************************** | 
 |  @Function			PVRTUnicodeUTF8Length | 
 |  @Input				pUTF8			A UTF8 string, which is null terminated. | 
 |  @Returns			The length of the string, in Unicode code points. | 
 |  @Description		Calculates the length of a UTF8 string. If pUTF8 is  | 
 | 					not null terminated, the results are undefined. | 
 | *****************************************************************************/ | 
 | unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8) | 
 | { | 
 | 	const PVRTuint8* pC = pUTF8; | 
 |  | 
 | 	unsigned int charCount = 0; | 
 | 	unsigned int mask; | 
 | 	while(*pC) | 
 | 	{ | 
 | 		// Quick optimisation for ASCII characters | 
 | 		const PVRTuint8* pStart = pC; | 
 | 		while(*pC && *pC < VALID_ASCII) | 
 | 			pC++; | 
 |  | 
 | 		charCount += (unsigned int) (pC - pStart); | 
 |  | 
 | 		// Done | 
 | 		if(!*pC)	 | 
 | 			break; | 
 | 		 | 
 | 		mask = *pC & 0xF0; | 
 | 		switch(mask) | 
 | 		{ | 
 | 		case 0xF0: pC++; | 
 | 		case 0xE0: pC++; | 
 | 		case 0xC0: pC++; | 
 | 			break; | 
 | 		default: | 
 | 			_ASSERT(!"Invalid tail byte!"); | 
 | 			return 0; | 
 | 		} | 
 |  | 
 | 		pC++; | 
 | 		charCount++; | 
 | 	} | 
 |  | 
 | 	return charCount; | 
 | } | 
 |  | 
 | /*!*************************************************************************** | 
 |  @Function			PVRTUnicodeUTF16Length | 
 |  @Input				pUTF16			A UTF16 string, which is null terminated. | 
 |  @Returns			The length of the string, in Unicode code points. | 
 |  @Description		Calculates the length of a UTF16 string. | 
 | 					If pUTF16 is not null terminated, the results are  | 
 | 					undefined. | 
 | *****************************************************************************/ | 
 | unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16) | 
 | { | 
 | 	const PVRTuint16* pC = pUTF16;	 | 
 | 	unsigned int charCount = 0; | 
 | 	while(*pC && (pC - pUTF16) < MAX_LEN) | 
 | 	{ | 
 | 		if(	pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END | 
 | 		 && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END) | 
 | 		{ | 
 | 			pC += 2; | 
 | 		} | 
 | 		else | 
 | 		{ | 
 | 			pC += 1; | 
 | 		} | 
 |  | 
 | 		charCount++; | 
 | 	} | 
 |  | 
 | 	return charCount; | 
 | } | 
 |  | 
 | /*!*************************************************************************** | 
 |  @Function			PVRTUnicodeValidUTF8 | 
 |  @Input				pUTF8			A UTF8 string, which is null terminated. | 
 |  @Returns			true or false | 
 |  @Description		Checks whether the encoding of a UTF8 string is valid. | 
 | 					If pUTF8 is not null terminated, the results are undefined. | 
 | *****************************************************************************/ | 
 | bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8) | 
 | { | 
 | 	unsigned int uiTailLen, uiIndex; | 
 | 	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8); | 
 | 	const PVRTuint8* pC = pUTF8; | 
 | 	while(*pC) | 
 | 	{ | 
 | 		// Quick optimisation for ASCII characters | 
 | 		while(*pC && *pC < VALID_ASCII)	pC++; | 
 | 		// Done? | 
 | 		if(!*pC)			 | 
 | 			break; | 
 |  | 
 | 		PVRTuint32 c32 = *pC++; | 
 | 		uiTailLen = c_u8UTF8Lengths[c32]; | 
 |  | 
 | 		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character. | 
 | 		// Also check to make sure the tail length is inside the provided buffer. | 
 | 		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes)) | 
 | 			return false; | 
 |  | 
 | 		// Get the data out of each tail byte | 
 | 		uiIndex = 0; | 
 | 		while(uiIndex < uiTailLen) | 
 | 		{ | 
 | 			if((pC[uiIndex] & 0xC0) != 0x80) | 
 | 				return false;		// Invalid tail byte! | 
 | 			 | 
 | 			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK); | 
 | 			uiIndex++; | 
 | 		} | 
 | 		 | 
 | 		pC += uiIndex; | 
 |  | 
 | 		// Check overlong values. | 
 | 		if(c32 < c_u32MinVals[uiTailLen]) | 
 | 			return false;		 | 
 | 		if(!CheckGenericUnicode(c32)) | 
 | 			return false; | 
 | 	} | 
 |  | 
 | 	return true; | 
 | } | 
 |  | 
 | /***************************************************************************** | 
 |  End of file (PVRTUnicode.cpp) | 
 | *****************************************************************************/ | 
 |  |