third_party/PowerVR_SDK/Tools/PVRTUnicode.cpp - SwiftShader - Git at Google

 /******************************************************************************

  @File         PVRTUnicode.cpp

  @Title        PVRTUnicode

  @Version       @Version

  @Copyright    Copyright (c) Imagination Technologies Limited.

  @Platform     All

  @Description  A small collection of functions used to decode Unicode formats to
                individual code points.

 ******************************************************************************/
 #include "PVRTUnicode.h"
 #include <string.h>

 /****************************************************************************
 ** Constants
 ****************************************************************************/
 // U+FFFD, the Unicode replacement character. None of the functions visible in
 // this file reference it.
 const PVRTuint32 c_u32ReplChar = 0xFFFD;

 // UTF8 bytes below this value are handled as single-byte (ASCII) characters.
 #define VALID_ASCII 0x80
 // Selects the six payload bits of a UTF8 tail byte. Shifted right by the tail
 // length it also selects the payload bits of a lead byte.
 #define TAIL_MASK 0x3F
 // Shift applied per tail byte while decoding. Despite the name this is a
 // count of bits, not bytes.
 #define BYTES_PER_TAIL 6

 // UTF16 high surrogate range (inclusive).
 #define UTF16_SURG_H_MARK 0xD800
 #define UTF16_SURG_H_END  0xDBFF
 // UTF16 low surrogate range (inclusive).
 #define UTF16_SURG_L_MARK 0xDC00
 #define UTF16_SURG_L_END  0xDFFF

 // Inclusive range of values that CheckGenericUnicode rejects as non-characters.
 #define UNICODE_NONCHAR_MARK 0xFDD0
 #define UNICODE_NONCHAR_END  0xFDEF
 // Used as a bit mask by CheckGenericUnicode: a value is rejected when all of
 // these bits are set, i.e. when its low 16 bits are 0xFFFE or 0xFFFF.
 #define UNICODE_RESERVED	 0xFFFE
 // Largest value accepted as a code point.
 #define UNICODE_MAX			 0x10FFFF

 // Upper bound on the number of 16-bit units the UTF16 functions will scan.
 #define MAX_LEN 0x8FFF

 /****************************************************************************
 ** A table, indexed by the first byte of a UTF8 sequence, giving the number
 ** of tail (continuation) bytes that must follow that byte. A value of 0
 ** marks a byte that cannot start a multi-byte sequence: 0x00-0x7F (ASCII,
 ** which callers handle before consulting the table), 0x80-0xBF (tail bytes)
 ** and 0xF8-0xFF.
 ****************************************************************************/
 const PVRTuint8 c_u8UTF8Lengths[256] =
 {
 	// 0x00 - 0xBF: not a multi-byte lead byte
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	// 0xC0 - 0xDF: one tail byte
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 	// 0xE0 - 0xEF: two tail bytes
 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 	// 0xF0 - 0xF7: three tail bytes; 0xF8 - 0xFF: invalid
 	3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
 };

 /****************************************************************************
 ** A table, indexed by the number of tail bytes in a UTF8 sequence, giving
 ** the smallest code point that legitimately needs that many tail bytes.
 ** A decoded value below the entry for its tail length is 'overlong'.
 ****************************************************************************/
 const PVRTuint32 c_u32MinVals[4] =
 {
 	0x00000000,		// 0 tail bytes
 	0x00000080,		// 1 tail bytes
 	0x00000800,		// 2 tail bytes
 	0x00010000,		// 3 tail bytes
 };

 /*!***************************************************************************
  @Function			CheckGenericUnicode
  @Input				c32			A decoded value to be treated as a Unicode code point
  @Returns			true if the value is acceptable, false otherwise.
  @Description		Rejects values that fall in the UTF16 surrogate range, in
 					the U+FDD0..U+FDEF non-character range, whose low 16 bits
 					are 0xFFFE or 0xFFFF, or that exceed UNICODE_MAX.
 *****************************************************************************/
 static bool CheckGenericUnicode(PVRTuint32 c32)
 {
 	// Anywhere from the first high surrogate to the last low surrogate.
 	const bool bIsSurrogate = (c32 >= UTF16_SURG_H_MARK) && (c32 <= UTF16_SURG_L_END);

 	// The contiguous block of non-characters.
 	const bool bIsNonChar = (c32 >= UNICODE_NONCHAR_MARK) && (c32 <= UNICODE_NONCHAR_END);

 	// All bits of the mask set means the low 16 bits are 0xFFFE or 0xFFFF.
 	const bool bIsReserved = (c32 & UNICODE_RESERVED) == UNICODE_RESERVED;

 	// Beyond the last code point.
 	const bool bIsTooLarge = c32 > UNICODE_MAX;

 	return !(bIsSurrogate || bIsNonChar || bIsReserved || bIsTooLarge);
 }

 /*!***************************************************************************
  @Function			PVRTUnicodeUTF8ToUTF32
  @Input				pUTF8			A UTF8 string, which is null terminated.
  @Output			aUTF32			Receives the decoded code points (appended).
  @Returns			PVR_SUCCESS when the whole string was decoded.
 					PVR_OVERFLOW when a byte cannot start a sequence or the
 					sequence it announces would run past the terminator.
 					PVR_FAIL when a tail byte is malformed, the sequence is
 					overlong, or CheckGenericUnicode rejects the value.
  @Description		Decodes a UTF8-encoded string in to Unicode code points
 					(UTF32). Code points decoded before a failure remain in
 					aUTF32. If pUTF8 is not null terminated, the results are
 					undefined.
 *****************************************************************************/
 EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
 {
 	const unsigned int uiByteCount = (unsigned int) strlen((const char*)pUTF8);
 	const PVRTuint8* const pEnd = pUTF8 + uiByteCount;
 	const PVRTuint8* pCursor = pUTF8;

 	for(;;)
 	{
 		// Fast path: ASCII bytes map directly on to code points.
 		for(; *pCursor != 0 && *pCursor < VALID_ASCII; ++pCursor)
 			aUTF32.Append(*pCursor);

 		// Reached the terminator: everything decoded.
 		if(*pCursor == 0)
 			return PVR_SUCCESS;

 		// Lead byte of a multi-byte sequence.
 		PVRTuint32 u32Code = *pCursor;
 		++pCursor;
 		const unsigned int uiTail = c_u8UTF8Lengths[u32Code];

 		// The byte must be a valid lead byte and its tail must fit in the string.
 		if(uiTail == 0 || (pCursor + uiTail > pEnd))
 			return PVR_OVERFLOW;

 		// Keep only the payload bits of the lead byte; how many depends on the tail length.
 		u32Code &= (TAIL_MASK >> uiTail);

 		// Fold in six payload bits from every tail byte.
 		for(unsigned int uiTailIdx = 0; uiTailIdx < uiTail; ++uiTailIdx)
 		{
 			const PVRTuint8 u8Tail = pCursor[uiTailIdx];
 			if((u8Tail & 0xC0) != 0x80)
 				return PVR_FAIL;		// Invalid tail byte!

 			u32Code = (u32Code << BYTES_PER_TAIL) + (u8Tail & TAIL_MASK);
 		}
 		pCursor += uiTail;

 		// Reject overlong encodings and values that are not usable code points.
 		if(u32Code < c_u32MinVals[uiTail])
 			return PVR_FAIL;
 		if(!CheckGenericUnicode(u32Code))
 			return PVR_FAIL;

 		aUTF32.Append(u32Code);
 	}
 }

 /*!***************************************************************************
  @Function			PVRTUnicodeUTF16ToUTF32
  @Input				pUTF16			A UTF16 string, which is null terminated.
  @Output			aUTF32			Receives the decoded code points (appended).
  @Returns			PVR_SUCCESS when the whole string was decoded.
 					PVR_OVERFLOW when no terminator is found within MAX_LEN
 					units, or a high surrogate is the last unit of the string.
 					PVR_FAIL when a high surrogate is not followed by a low
 					surrogate, or CheckGenericUnicode rejects the value.
  @Description		Decodes a UTF16-encoded string in to Unicode code points
 					(UTF32). Code points decoded before a failure remain in
 					aUTF32. If pUTF16 is not null terminated, the results are
 					undefined.
 *****************************************************************************/
 EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
 {
 	const PVRTuint16* pC = pUTF16;

 	// Determine the number of shorts. The current unit is tested before the
 	// pointer is advanced, so an empty string is never read past its
 	// terminator, and the bound is tested first so no unit beyond MAX_LEN is read.
 	while((pC - pUTF16) < MAX_LEN && *pC)
 		++pC;
 	const unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);

 	if(uiBufferLen == MAX_LEN)
 		return PVR_OVERFLOW;		// Probably not NULL terminated.

 	// Reset to start.
 	pC = pUTF16;

 	while(*pC)
 	{
 		// Straight copy. We'll check for surrogate pairs next...
 		PVRTuint32 c32 = *pC++;

 		// Check surrogate pair
 		if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
 		{
 			// A high surrogate must be followed by another unit inside the string.
 			if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
 				return PVR_OVERFLOW;

 			// Check that the next value is in the low surrogate range
 			if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
 				return PVR_FAIL;

 			// Decode: ten bits from each surrogate, offset past the 16-bit range.
 			c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
 			pC++;
 		}

 		// An unpaired low surrogate is rejected here along with other unusable values.
 		if(!CheckGenericUnicode(c32))
 			return PVR_FAIL;

 		// OK
 		aUTF32.Append(c32);
 	}

 	return PVR_SUCCESS;
 }

 /*!***************************************************************************
  @Function			PVRTUnicodeUTF8Length
  @Input				pUTF8			A UTF8 string, which is null terminated.
  @Returns			The length of the string, in Unicode code points.
  @Description		Calculates the length of a UTF8 string. If pUTF8 is
 					not null terminated, the results are undefined.
 *****************************************************************************/
 unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
 {
 	const PVRTuint8* pC = pUTF8;

 	unsigned int charCount = 0;
 	unsigned int mask;
 	while(*pC)
 	{
 		// Quick optimisation for ASCII characters
 		const PVRTuint8* pStart = pC;
 		while(*pC && *pC < VALID_ASCII)
 			pC++;

 		charCount += (unsigned int) (pC - pStart);

 		// Done
 		if(!*pC)
 			break;

 		mask = *pC & 0xF0;
 		switch(mask)
 		{
 		case 0xF0: pC++;
 		case 0xE0: pC++;
 		case 0xC0: pC++;
 			break;
 		default:
 			_ASSERT(!"Invalid tail byte!");
 			return 0;
 		}

 		pC++;
 		charCount++;
 	}

 	return charCount;
 }

 /*!***************************************************************************
  @Function			PVRTUnicodeUTF16Length
  @Input				pUTF16			A UTF16 string, which is null terminated.
  @Returns			The length of the string, in Unicode code points.
  @Description		Calculates the length of a UTF16 string. A high surrogate
 					immediately followed by a low surrogate counts as one
 					code point; every other unit counts as one on its own.
 					Scanning stops after MAX_LEN units. If pUTF16 is not null
 					terminated, the results are undefined.
 *****************************************************************************/
 unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
 {
 	const PVRTuint16* pC = pUTF16;
 	unsigned int charCount = 0;
 	while(*pC && (pC - pUTF16) < MAX_LEN)
 	{
 		// pC[0] is non-zero here, so pC[1] is at worst the terminator.
 		// Both bounds of the low surrogate range are tested against pC[1].
 		if(	pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
 		 && pC[1] >= UTF16_SURG_L_MARK && pC[1] <= UTF16_SURG_L_END)
 		{
 			pC += 2;
 		}
 		else
 		{
 			pC += 1;
 		}

 		charCount++;
 	}

 	return charCount;
 }

 /*!***************************************************************************
  @Function			PVRTUnicodeValidUTF8
  @Input				pUTF8			A UTF8 string, which is null terminated.
  @Returns			true or false
  @Description		Checks whether the encoding of a UTF8 string is valid:
 					every multi-byte sequence must start with a valid lead
 					byte, be followed by the announced number of tail bytes,
 					not be overlong, and decode to a value accepted by
 					CheckGenericUnicode. The lead byte's length-marker bits
 					are masked off before the tail bits are folded in, the
 					same way PVRTUnicodeUTF8ToUTF32 decodes, so the overlong
 					and range checks see the real code point.
 					If pUTF8 is not null terminated, the results are undefined.
 *****************************************************************************/
 bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
 {
 	unsigned int uiTailLen, uiIndex;
 	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
 	const PVRTuint8* pC = pUTF8;
 	while(*pC)
 	{
 		// Quick optimisation for ASCII characters
 		while(*pC && *pC < VALID_ASCII)	pC++;
 		// Done?
 		if(!*pC)
 			break;

 		PVRTuint32 c32 = *pC++;
 		uiTailLen = c_u8UTF8Lengths[c32];

 		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
 		// Also check to make sure the tail length is inside the provided buffer.
 		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
 			return false;

 		// Get the data out of the first byte. Without this the marker bits
 		// would be shifted in to the decoded value and corrupt the checks below.
 		c32 &= (TAIL_MASK >> uiTailLen);

 		// Get the data out of each tail byte
 		uiIndex = 0;
 		while(uiIndex < uiTailLen)
 		{
 			if((pC[uiIndex] & 0xC0) != 0x80)
 				return false;		// Invalid tail byte!

 			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
 			uiIndex++;
 		}

 		pC += uiIndex;

 		// Check overlong values.
 		if(c32 < c_u32MinVals[uiTailLen])
 			return false;
 		if(!CheckGenericUnicode(c32))
 			return false;
 	}

 	return true;
 }

 /*****************************************************************************
  End of file (PVRTUnicode.cpp)
 *****************************************************************************/
	/******************************************************************************

	@File PVRTUnicode.cpp

	@Title PVRTUnicode

	@Version @Version

	@Copyright Copyright (c) Imagination Technologies Limited.

	@Platform All

	@Description A small collection of functions used to decode Unicode formats to
	individual code points.

	******************************************************************************/
	#include "PVRTUnicode.h"
	#include <string.h>

	/****************************************************************************
	** Constants
	****************************************************************************/
	// U+FFFD, the Unicode replacement character. None of the functions visible in
	// this file reference it.
	const PVRTuint32 c_u32ReplChar = 0xFFFD;

	// UTF8 bytes below this value are handled as single-byte (ASCII) characters.
	#define VALID_ASCII 0x80
	// Selects the six payload bits of a UTF8 tail byte. Shifted right by the tail
	// length it also selects the payload bits of a lead byte.
	#define TAIL_MASK 0x3F
	// Shift applied per tail byte while decoding. Despite the name this is a
	// count of bits, not bytes.
	#define BYTES_PER_TAIL 6

	// UTF16 high surrogate range (inclusive).
	#define UTF16_SURG_H_MARK 0xD800
	#define UTF16_SURG_H_END 0xDBFF
	// UTF16 low surrogate range (inclusive).
	#define UTF16_SURG_L_MARK 0xDC00
	#define UTF16_SURG_L_END 0xDFFF

	// Inclusive range of values that CheckGenericUnicode rejects as non-characters.
	#define UNICODE_NONCHAR_MARK 0xFDD0
	#define UNICODE_NONCHAR_END 0xFDEF
	// Used as a bit mask by CheckGenericUnicode: a value is rejected when all of
	// these bits are set, i.e. when its low 16 bits are 0xFFFE or 0xFFFF.
	#define UNICODE_RESERVED 0xFFFE
	// Largest value accepted as a code point.
	#define UNICODE_MAX 0x10FFFF

	// Upper bound on the number of 16-bit units the UTF16 functions will scan.
	#define MAX_LEN 0x8FFF

	/****************************************************************************
	** A table, indexed by the first byte of a UTF8 sequence, giving the number
	** of tail (continuation) bytes that must follow that byte. A value of 0
	** marks a byte that cannot start a multi-byte sequence: 0x00-0x7F (ASCII,
	** which callers handle before consulting the table), 0x80-0xBF (tail bytes)
	** and 0xF8-0xFF.
	****************************************************************************/
	const PVRTuint8 c_u8UTF8Lengths[256] =
	{
	// 0x00 - 0xBF: not a multi-byte lead byte
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	// 0xC0 - 0xDF: one tail byte
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	// 0xE0 - 0xEF: two tail bytes
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	// 0xF0 - 0xF7: three tail bytes; 0xF8 - 0xFF: invalid
	3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
	};

	/****************************************************************************
	** A table, indexed by the number of tail bytes in a UTF8 sequence, giving
	** the smallest code point that legitimately needs that many tail bytes.
	** A decoded value below the entry for its tail length is 'overlong'.
	****************************************************************************/
	const PVRTuint32 c_u32MinVals[4] =
	{
	0x00000000, // 0 tail bytes
	0x00000080, // 1 tail bytes
	0x00000800, // 2 tail bytes
	0x00010000, // 3 tail bytes
	};

	/!**************************************************************************
	@Function CheckGenericUnicode
	@Input c32 A UTF32 character/Unicode code point
	@Returns Success or failure.
	@Description Checks that the decoded code point is valid.
	*****************************************************************************/
	static bool CheckGenericUnicode(PVRTuint32 c32)
	{
	// Check that this value isn't a UTF16 surrogate mask.
	if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END)
	return false;
	// Check non-char values
	if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END)
	return false;
	// Check reserved values
	if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED)
	return false;
	// Check max value.
	if(c32 > UNICODE_MAX)
	return false;

	return true;
	}

	/!**************************************************************************
	@Function PVRTUnicodeUTF8ToUTF32
	@Input pUTF8 A UTF8 string, which is null terminated.
	@Output aUTF32 An array of Unicode code points.
	@Returns Success or failure.
	@Description Decodes a UTF8-encoded string in to Unicode code points
	(UTF32). If pUTF8 is not null terminated, the results are
	undefined.
	*****************************************************************************/
	EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
	{
	unsigned int uiTailLen, uiIndex;
	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
	PVRTuint32 c32;

	const PVRTuint8* pC = pUTF8;
	while(*pC)
	{
	// Quick optimisation for ASCII characters
	while(pC && pC < VALID_ASCII)
	{
	aUTF32.Append(*pC++);
	}
	// Done
	if(!*pC)
	break;

	c32 = *pC++;
	uiTailLen = c_u8UTF8Lengths[c32];

	// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
	// Also check to make sure the tail length is inside the provided buffer.
	if(uiTailLen == 0 \|\| (pC + uiTailLen > pUTF8 + uiBytes))
	return PVR_OVERFLOW;

	c32 &= (TAIL_MASK >> uiTailLen); // Get the data out of the first byte. This depends on the length of the tail.

	// Get the data out of each tail byte
	uiIndex = 0;
	while(uiIndex < uiTailLen)
	{
	if((pC[uiIndex] & 0xC0) != 0x80)
	return PVR_FAIL; // Invalid tail byte!

	c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
	uiIndex++;
	}

	pC += uiIndex;

	// Check overlong values.
	if(c32 < c_u32MinVals[uiTailLen])
	return PVR_FAIL;

	if(!CheckGenericUnicode(c32))
	return PVR_FAIL;

	// OK
	aUTF32.Append(c32);
	}

	return PVR_SUCCESS;
	}

	/!**************************************************************************
	@Function PVRTUnicodeUTF16ToUTF32
	@Input pUTF16 A UTF16 string, which is null terminated.
	@Output aUTF32 An array of Unicode code points.
	@Returns Success or failure.
	@Description Decodes a UTF16-encoded string in to Unicode code points
	(UTF32). If pUTF16 is not null terminated, the results are
	undefined.
	*****************************************************************************/
	EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
	{
	const PVRTuint16* pC = pUTF16;

	// Determine the number of shorts
	while(*++pC && (pC - pUTF16) < MAX_LEN);
	unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);

	if(uiBufferLen == MAX_LEN)
	return PVR_OVERFLOW; // Probably not NULL terminated.

	// Reset to start.
	pC = pUTF16;

	PVRTuint32 c32;
	while(*pC)
	{
	// Straight copy. We'll check for surrogate pairs next...
	c32 = *pC++;

	// Check surrogate pair
	if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
	{
	// Make sure the next 2 bytes are in range...
	if(pC + 1 > pUTF16 + uiBufferLen \|\| *pC == 0)
	return PVR_OVERFLOW;

	// Check that the next value is in the low surrogate range
	if(pC < UTF16_SURG_L_MARK \|\| pC > UTF16_SURG_L_END)
	return PVR_FAIL;

	// Decode
	c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
	pC++;
	}

	if(!CheckGenericUnicode(c32))
	return PVR_FAIL;

	// OK
	aUTF32.Append(c32);
	}

	return PVR_SUCCESS;
	}

	/!**************************************************************************
	@Function PVRTUnicodeUTF8Length
	@Input pUTF8 A UTF8 string, which is null terminated.
	@Returns The length of the string, in Unicode code points.
	@Description Calculates the length of a UTF8 string. If pUTF8 is
	not null terminated, the results are undefined.
	*****************************************************************************/
	unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
	{
	const PVRTuint8* pC = pUTF8;

	unsigned int charCount = 0;
	unsigned int mask;
	while(*pC)
	{
	// Quick optimisation for ASCII characters
	const PVRTuint8* pStart = pC;
	while(pC && pC < VALID_ASCII)
	pC++;

	charCount += (unsigned int) (pC - pStart);

	// Done
	if(!*pC)
	break;

	mask = *pC & 0xF0;
	switch(mask)
	{
	case 0xF0: pC++;
	case 0xE0: pC++;
	case 0xC0: pC++;
	break;
	default:
	_ASSERT(!"Invalid tail byte!");
	return 0;
	}

	pC++;
	charCount++;
	}

	return charCount;
	}

	/!**************************************************************************
	@Function PVRTUnicodeUTF16Length
	@Input pUTF16 A UTF16 string, which is null terminated.
	@Returns The length of the string, in Unicode code points.
	@Description Calculates the length of a UTF16 string.
	If pUTF16 is not null terminated, the results are
	undefined.
	*****************************************************************************/
	unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
	{
	const PVRTuint16* pC = pUTF16;
	unsigned int charCount = 0;
	while(*pC && (pC - pUTF16) < MAX_LEN)
	{
	if( pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
	&& pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END)
	{
	pC += 2;
	}
	else
	{
	pC += 1;
	}

	charCount++;
	}

	return charCount;
	}

	/!**************************************************************************
	@Function PVRTUnicodeValidUTF8
	@Input pUTF8 A UTF8 string, which is null terminated.
	@Returns true or false
	@Description Checks whether the encoding of a UTF8 string is valid.
	If pUTF8 is not null terminated, the results are undefined.
	*****************************************************************************/
	bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
	{
	unsigned int uiTailLen, uiIndex;
	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
	const PVRTuint8* pC = pUTF8;
	while(*pC)
	{
	// Quick optimisation for ASCII characters
	while(pC && pC < VALID_ASCII) pC++;
	// Done?
	if(!*pC)
	break;

	PVRTuint32 c32 = *pC++;
	uiTailLen = c_u8UTF8Lengths[c32];

	// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
	// Also check to make sure the tail length is inside the provided buffer.
	if(uiTailLen == 0 \|\| (pC + uiTailLen > pUTF8 + uiBytes))
	return false;

	// Get the data out of each tail byte
	uiIndex = 0;
	while(uiIndex < uiTailLen)
	{
	if((pC[uiIndex] & 0xC0) != 0x80)
	return false; // Invalid tail byte!

	c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
	uiIndex++;
	}

	pC += uiIndex;

	// Check overlong values.
	if(c32 < c_u32MinVals[uiTailLen])
	return false;
	if(!CheckGenericUnicode(c32))
	return false;
	}

	return true;
	}

	/*****************************************************************************
	End of file (PVRTUnicode.cpp)
	*****************************************************************************/