/******************************************************************************

 @File         PVRTUnicode.cpp

 @Title        PVRTUnicode

 @Version      @Version

 @Copyright    Copyright (c) Imagination Technologies Limited.

 @Platform     All

 @Description  A small collection of functions used to decode Unicode formats to
               individual code points.

******************************************************************************/
| #include "PVRTUnicode.h" |
| #include <string.h> |
| |
| /**************************************************************************** |
| ** Constants |
| ****************************************************************************/ |
| const PVRTuint32 c_u32ReplChar = 0xFFFD; |
| |
// ---- UTF8 decoding ----------------------------------------------------------
// Bytes below this value are single-byte (ASCII) characters.
#define VALID_ASCII				0x80
// Selects the payload bits of a tail byte; shifted right by the tail count it
// also selects the payload bits of a lead byte.
#define TAIL_MASK				0x3F
// Shift applied to the accumulated value for each tail byte that is appended.
#define BYTES_PER_TAIL			6

// ---- UTF16 surrogate ranges -------------------------------------------------
// High (leading) surrogates.
#define UTF16_SURG_H_MARK		0xD800
#define UTF16_SURG_H_END		0xDBFF
// Low (trailing) surrogates.
#define UTF16_SURG_L_MARK		0xDC00
#define UTF16_SURG_L_END		0xDFFF

// ---- Code points rejected by CheckGenericUnicode ----------------------------
// Inclusive range of non-character code points.
#define UNICODE_NONCHAR_MARK	0xFDD0
#define UNICODE_NONCHAR_END		0xFDEF
// Used as a bit mask: values whose low 16 bits are 0xFFFE or 0xFFFF match it.
#define UNICODE_RESERVED		0xFFFE
// Largest valid code point.
#define UNICODE_MAX				0x10FFFF

// ---- Limits -----------------------------------------------------------------
// Maximum number of UTF16 elements examined while looking for a terminator.
#define MAX_LEN					0x8FFF
| |
| /**************************************************************************** |
| ** A table which allows quick lookup to determine the number of bytes of a |
| ** UTF8 code point. |
| ****************************************************************************/ |
| const PVRTuint8 c_u8UTF8Lengths[256] = |
| { |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
| 3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0, |
| }; |
| |
| /**************************************************************************** |
| ** A table which allows quick lookup to determine whether a UTF8 sequence |
| ** is 'overlong'. |
| ****************************************************************************/ |
| const PVRTuint32 c_u32MinVals[4] = |
| { |
| 0x00000000, // 0 tail bytes |
| 0x00000080, // 1 tail bytes |
| 0x00000800, // 2 tail bytes |
| 0x00010000, // 3 tail bytes |
| }; |
| |
| /*!*************************************************************************** |
| @Function CheckGenericUnicode |
| @Input c32 A UTF32 character/Unicode code point |
| @Returns Success or failure. |
| @Description Checks that the decoded code point is valid. |
| *****************************************************************************/ |
| static bool CheckGenericUnicode(PVRTuint32 c32) |
| { |
| // Check that this value isn't a UTF16 surrogate mask. |
| if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END) |
| return false; |
| // Check non-char values |
| if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END) |
| return false; |
| // Check reserved values |
| if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED) |
| return false; |
| // Check max value. |
| if(c32 > UNICODE_MAX) |
| return false; |
| |
| return true; |
| } |
| |
| /*!*************************************************************************** |
| @Function PVRTUnicodeUTF8ToUTF32 |
| @Input pUTF8 A UTF8 string, which is null terminated. |
| @Output aUTF32 An array of Unicode code points. |
| @Returns Success or failure. |
| @Description Decodes a UTF8-encoded string in to Unicode code points |
| (UTF32). If pUTF8 is not null terminated, the results are |
| undefined. |
| *****************************************************************************/ |
| EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32) |
| { |
| unsigned int uiTailLen, uiIndex; |
| unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8); |
| PVRTuint32 c32; |
| |
| const PVRTuint8* pC = pUTF8; |
| while(*pC) |
| { |
| // Quick optimisation for ASCII characters |
| while(*pC && *pC < VALID_ASCII) |
| { |
| aUTF32.Append(*pC++); |
| } |
| // Done |
| if(!*pC) |
| break; |
| |
| c32 = *pC++; |
| uiTailLen = c_u8UTF8Lengths[c32]; |
| |
| // Check for invalid tail length. Maximum 4 bytes for each UTF8 character. |
| // Also check to make sure the tail length is inside the provided buffer. |
| if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes)) |
| return PVR_OVERFLOW; |
| |
| c32 &= (TAIL_MASK >> uiTailLen); // Get the data out of the first byte. This depends on the length of the tail. |
| |
| // Get the data out of each tail byte |
| uiIndex = 0; |
| while(uiIndex < uiTailLen) |
| { |
| if((pC[uiIndex] & 0xC0) != 0x80) |
| return PVR_FAIL; // Invalid tail byte! |
| |
| c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK); |
| uiIndex++; |
| } |
| |
| pC += uiIndex; |
| |
| // Check overlong values. |
| if(c32 < c_u32MinVals[uiTailLen]) |
| return PVR_FAIL; |
| |
| if(!CheckGenericUnicode(c32)) |
| return PVR_FAIL; |
| |
| // OK |
| aUTF32.Append(c32); |
| } |
| |
| return PVR_SUCCESS; |
| } |
| |
| /*!*************************************************************************** |
| @Function PVRTUnicodeUTF16ToUTF32 |
| @Input pUTF16 A UTF16 string, which is null terminated. |
| @Output aUTF32 An array of Unicode code points. |
| @Returns Success or failure. |
| @Description Decodes a UTF16-encoded string in to Unicode code points |
| (UTF32). If pUTF16 is not null terminated, the results are |
| undefined. |
| *****************************************************************************/ |
| EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32) |
| { |
| const PVRTuint16* pC = pUTF16; |
| |
| // Determine the number of shorts |
| while(*++pC && (pC - pUTF16) < MAX_LEN); |
| unsigned int uiBufferLen = (unsigned int) (pC - pUTF16); |
| |
| if(uiBufferLen == MAX_LEN) |
| return PVR_OVERFLOW; // Probably not NULL terminated. |
| |
| // Reset to start. |
| pC = pUTF16; |
| |
| PVRTuint32 c32; |
| while(*pC) |
| { |
| // Straight copy. We'll check for surrogate pairs next... |
| c32 = *pC++; |
| |
| // Check surrogate pair |
| if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END) |
| { |
| // Make sure the next 2 bytes are in range... |
| if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0) |
| return PVR_OVERFLOW; |
| |
| // Check that the next value is in the low surrogate range |
| if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END) |
| return PVR_FAIL; |
| |
| // Decode |
| c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000; |
| pC++; |
| } |
| |
| if(!CheckGenericUnicode(c32)) |
| return PVR_FAIL; |
| |
| // OK |
| aUTF32.Append(c32); |
| } |
| |
| return PVR_SUCCESS; |
| } |
| |
| /*!*************************************************************************** |
| @Function PVRTUnicodeUTF8Length |
| @Input pUTF8 A UTF8 string, which is null terminated. |
| @Returns The length of the string, in Unicode code points. |
| @Description Calculates the length of a UTF8 string. If pUTF8 is |
| not null terminated, the results are undefined. |
| *****************************************************************************/ |
| unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8) |
| { |
| const PVRTuint8* pC = pUTF8; |
| |
| unsigned int charCount = 0; |
| unsigned int mask; |
| while(*pC) |
| { |
| // Quick optimisation for ASCII characters |
| const PVRTuint8* pStart = pC; |
| while(*pC && *pC < VALID_ASCII) |
| pC++; |
| |
| charCount += (unsigned int) (pC - pStart); |
| |
| // Done |
| if(!*pC) |
| break; |
| |
| mask = *pC & 0xF0; |
| switch(mask) |
| { |
| case 0xF0: pC++; |
| case 0xE0: pC++; |
| case 0xC0: pC++; |
| break; |
| default: |
| _ASSERT(!"Invalid tail byte!"); |
| return 0; |
| } |
| |
| pC++; |
| charCount++; |
| } |
| |
| return charCount; |
| } |
| |
| /*!*************************************************************************** |
| @Function PVRTUnicodeUTF16Length |
| @Input pUTF16 A UTF16 string, which is null terminated. |
| @Returns The length of the string, in Unicode code points. |
| @Description Calculates the length of a UTF16 string. |
| If pUTF16 is not null terminated, the results are |
| undefined. |
| *****************************************************************************/ |
| unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16) |
| { |
| const PVRTuint16* pC = pUTF16; |
| unsigned int charCount = 0; |
| while(*pC && (pC - pUTF16) < MAX_LEN) |
| { |
| if( pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END |
| && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END) |
| { |
| pC += 2; |
| } |
| else |
| { |
| pC += 1; |
| } |
| |
| charCount++; |
| } |
| |
| return charCount; |
| } |
| |
| /*!*************************************************************************** |
| @Function PVRTUnicodeValidUTF8 |
| @Input pUTF8 A UTF8 string, which is null terminated. |
| @Returns true or false |
| @Description Checks whether the encoding of a UTF8 string is valid. |
| If pUTF8 is not null terminated, the results are undefined. |
| *****************************************************************************/ |
| bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8) |
| { |
| unsigned int uiTailLen, uiIndex; |
| unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8); |
| const PVRTuint8* pC = pUTF8; |
| while(*pC) |
| { |
| // Quick optimisation for ASCII characters |
| while(*pC && *pC < VALID_ASCII) pC++; |
| // Done? |
| if(!*pC) |
| break; |
| |
| PVRTuint32 c32 = *pC++; |
| uiTailLen = c_u8UTF8Lengths[c32]; |
| |
| // Check for invalid tail length. Maximum 4 bytes for each UTF8 character. |
| // Also check to make sure the tail length is inside the provided buffer. |
| if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes)) |
| return false; |
| |
| // Get the data out of each tail byte |
| uiIndex = 0; |
| while(uiIndex < uiTailLen) |
| { |
| if((pC[uiIndex] & 0xC0) != 0x80) |
| return false; // Invalid tail byte! |
| |
| c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK); |
| uiIndex++; |
| } |
| |
| pC += uiIndex; |
| |
| // Check overlong values. |
| if(c32 < c_u32MinVals[uiTailLen]) |
| return false; |
| if(!CheckGenericUnicode(c32)) |
| return false; |
| } |
| |
| return true; |
| } |
| |
/*****************************************************************************
 End of file (PVRTUnicode.cpp)
*****************************************************************************/
| |