|         |      1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). | 
|         |      2 // All rights reserved. | 
|         |      3 // This component and the accompanying materials are made available | 
|         |      4 // under the terms of the License "Eclipse Public License v1.0" | 
|         |      5 // which accompanies this distribution, and is available | 
|         |      6 // at the URL "http://www.eclipse.org/legal/epl-v10.html". | 
|         |      7 // | 
|         |      8 // Initial Contributors: | 
|         |      9 // Nokia Corporation - initial contribution. | 
|         |     10 // | 
|         |     11 // Contributors: | 
|         |     12 // | 
|         |     13 // Description: | 
|         |     14 // e32\euser\unicode\unicode.cpp | 
|         |     15 // The implementation of the base-level Unicode character classification functions. These are members of | 
|         |     16 // a class called TUnicode that contains a Unicode value. | 
|         |     17 //  | 
|         |     18 // | 
|         |     19  | 
|         |     20 #include <unicode.h> | 
|         |     21 #include "CompareImp.h" | 
|         |     22  | 
|         |     23 static const TUnicodeData TheDefaultUnicodeData = | 
|         |     24 	{ TChar::ECnCategory, TChar::EOtherNeutral, 0, 0, 0, TUnicodeData::ENonNumeric }; | 
|         |     25  | 
|         |     26  | 
|         |     27 // Declarations for tables held in unitable.cpp and used by unicode.cpp. | 
|         |     28 #ifndef __KERNEL_MODE__ | 
|         |     29 extern const TStandardUnicodeDataSet TheStandardUnicodeDataSet[]; | 
|         |     30 extern const TUnicodePlane ThePlanes[17]; | 
|         |     31 #endif | 
|         |     32  | 
|         |     33  | 
|         |     34 // Fill in a TChar::TCharInfo structure with category information about the character. | 
|         |     35 void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const | 
|         |     36 	{ | 
|         |     37 	const TUnicodeData& data = GetData(aOverridingDataSet); | 
|         |     38 	aInfo.iCategory = (TChar::TCategory)data.iCategory; | 
|         |     39 	aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory; | 
|         |     40 	aInfo.iCombiningClass = data.iCombiningClass; | 
|         |     41 	aInfo.iLowerCase = iCode; | 
|         |     42 	aInfo.iUpperCase = iCode; | 
|         |     43 	aInfo.iTitleCase = iCode; | 
|         |     44 	if (data.iFlags & TUnicodeData::EHasLowerCase) | 
|         |     45 		aInfo.iLowerCase = GetLowerCase(data); | 
|         |     46 	if (data.iFlags & TUnicodeData::EHasUpperCase) | 
|         |     47 		aInfo.iUpperCase = GetUpperCase(data); | 
|         |     48 	if (data.iFlags & TUnicodeData::EHasTitleCase) | 
|         |     49 		aInfo.iTitleCase = GetTitleCase(data); | 
|         |     50 	aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored; | 
|         |     51 	if (data.iFlags & TUnicodeData::ENumericFlags) | 
|         |     52 		aInfo.iNumericValue = GetNumericValue(data); | 
|         |     53 	else | 
|         |     54 		aInfo.iNumericValue = -1; | 
|         |     55 	} | 
|         |     56  | 
|         |     57 /* | 
|         |     58 Get the data describing a character. If "aOverridingDataSet" is non-null, look in that | 
|         |     59 data set before searching the standard data set. | 
|         |     60 */ | 
|         |     61 const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const | 
|         |     62 	{ | 
|         |     63 	const TUnicodeData *result = NULL; | 
|         |     64 	if (aOverridingDataSet) | 
|         |     65 		result = GetDataFromDataSet(*aOverridingDataSet); | 
|         |     66 	if (result == NULL) | 
|         |     67 		{ | 
|         |     68 		if (0xFFFF >= iCode) | 
|         |     69 			{ | 
|         |     70 			// optimize for BMP characters (plane 0) | 
|         |     71 			TInt index = TheStandardUnicodeDataSet[0].iIndex1[iCode >> 4]; | 
|         |     72 			if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index | 
|         |     73 				index &= ~0x8000; | 
|         |     74 			else | 
|         |     75 				index = TheStandardUnicodeDataSet[0].iIndex2[index + (iCode & 0x000F)]; | 
|         |     76 			return TheStandardUnicodeDataSet[0].iData[index]; | 
|         |     77 			} | 
|         |     78 		else | 
|         |     79 			{ | 
|         |     80 			// for non-BMP characters (plane 1-16) | 
|         |     81 			TInt plane = (iCode >> 16); | 
|         |     82 			if (plane > 16) | 
|         |     83 				{ | 
|         |     84 				// for now we have no data for values above U+10FFFF | 
|         |     85 				return TheDefaultUnicodeData; | 
|         |     86 				} | 
|         |     87 			TInt codesPerBlock = ThePlanes[plane].iCodesPerBlock; | 
|         |     88 			TInt maskForCodePoint = ThePlanes[plane].iMaskForCodePoint; | 
|         |     89 			 | 
|         |     90 			TInt low16bit = (iCode & 0xFFFF); | 
|         |     91 			TInt index = TheStandardUnicodeDataSet[plane].iIndex1[low16bit >> codesPerBlock]; | 
|         |     92 			if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index | 
|         |     93 				index &= ~0x8000; | 
|         |     94 			else | 
|         |     95 				index = TheStandardUnicodeDataSet[plane].iIndex2[index + (low16bit & maskForCodePoint)]; | 
|         |     96 			return TheStandardUnicodeDataSet[plane].iData[index]; | 
|         |     97 			} | 
|         |     98 		} | 
|         |     99  | 
|         |    100 	return *result; | 
|         |    101 	} | 
|         |    102  | 
|         |    103 /* | 
|         |    104 Given a character data set, get the data referring to this character. | 
|         |    105 Return NULL if no data is available in this data set. | 
|         |    106 */ | 
|         |    107 const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const | 
|         |    108 	{ | 
|         |    109 	// Perform a binary chop to find the range containing this character. | 
|         |    110 	TInt n = aDataSet.iRanges; | 
|         |    111 	const TUnicodeDataRange *base = aDataSet.iRange; | 
|         |    112 	const TUnicodeDataRange *last = base + n - 1; | 
|         |    113 	const TUnicodeDataRange *r = base; | 
|         |    114  | 
|         |    115 	while (n > 1) | 
|         |    116 		{ | 
|         |    117 		TInt pivot = n / 2; | 
|         |    118 		r += pivot; | 
|         |    119 		if (iCode < r->iRangeStart)									// it's before this range | 
|         |    120 			n = pivot; | 
|         |    121 		else if (r < last && iCode >= r[1].iRangeStart)				// it's after this range | 
|         |    122 			{ | 
|         |    123 			base = r + 1; | 
|         |    124 			n -= pivot + 1; | 
|         |    125 			} | 
|         |    126 		else														// it's in this range | 
|         |    127 			break; | 
|         |    128 		r = base; | 
|         |    129 		} | 
|         |    130  | 
|         |    131 	if (r->iIndex >= 0) | 
|         |    132 		return &aDataSet.iData[r->iIndex];		// index >= 0: data available | 
|         |    133 	else | 
|         |    134 		return NULL;							// index < 0: no data available | 
|         |    135 	} | 
|         |    136  | 
|         |    137 EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const | 
|         |    138 	{ | 
|         |    139 	return (TChar::TCategory)GetData(aOverridingDataSet).iCategory; | 
|         |    140 	} | 
|         |    141  | 
|         |    142 TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const | 
|         |    143 	{ | 
|         |    144 	return (TChar::TBdCategory)GetData(aOverridingDataSet).iBdCategory; | 
|         |    145 	} | 
|         |    146  | 
|         |    147 TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const | 
|         |    148 	{ | 
|         |    149 	return GetData(aOverridingDataSet).iCombiningClass; | 
|         |    150 	} | 
|         |    151  | 
|         |    152 EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const | 
|         |    153 	{ | 
|         |    154 	return GetLowerCase(GetData(aOverridingDataSet)); | 
|         |    155 	} | 
|         |    156  | 
|         |    157 EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const | 
|         |    158 	{ | 
|         |    159 	return GetUpperCase(GetData(aOverridingDataSet)); | 
|         |    160 	} | 
|         |    161  | 
|         |    162 TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const | 
|         |    163 	{ | 
|         |    164 	if (aData.iFlags & TUnicodeData::EHasLowerCase) | 
|         |    165 		return iCode + aData.iCaseOffset; | 
|         |    166 	else | 
|         |    167 		return iCode; | 
|         |    168 	} | 
|         |    169  | 
|         |    170 TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const | 
|         |    171 	{ | 
|         |    172 	if (aData.iFlags & TUnicodeData::EHasUpperCase) | 
|         |    173 		return iCode - aData.iCaseOffset; | 
|         |    174 	else | 
|         |    175 		return iCode; | 
|         |    176 	} | 
|         |    177  | 
|         |    178 TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const | 
|         |    179 	{ | 
|         |    180 	return GetTitleCase(GetData(aOverridingDataSet)); | 
|         |    181 	} | 
|         |    182  | 
|         |    183 TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const | 
|         |    184 	{ | 
|         |    185 	// Handle the very few characters with distinct title case variants. | 
|         |    186 	if (aData.iFlags & TUnicodeData::EHasTitleCase) | 
|         |    187 		{ | 
|         |    188 		// If the character has no upper case variant add one to get the title case form. | 
|         |    189 		if (!(aData.iFlags & TUnicodeData::EHasUpperCase)) | 
|         |    190 			return iCode + 1; | 
|         |    191 		// If the character has no lower case variant subtract one to get the title case form. | 
|         |    192 		if (!(aData.iFlags & TUnicodeData::EHasLowerCase)) | 
|         |    193 			return iCode - 1; | 
|         |    194 		// Both upper and lower case forms exist so the character itself must be title case. | 
|         |    195 		return iCode; | 
|         |    196 		} | 
|         |    197  | 
|         |    198 	// All other characters have title case forms that are the same as their upper case forms. | 
|         |    199 	return GetUpperCase(aData); | 
|         |    200 	} | 
|         |    201  | 
|         |    202 TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const | 
|         |    203 	{ | 
|         |    204 	return GetData(aOverridingDataSet).iFlags & TUnicodeData::EMirrored; | 
|         |    205 	} | 
|         |    206  | 
|         |    207 TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const | 
|         |    208 	{ | 
|         |    209 	return GetNumericValue(GetData(aOverridingDataSet)); | 
|         |    210 	} | 
|         |    211  | 
|         |    212 /* | 
|         |    213 Return the integer numeric value of this character. | 
|         |    214 Return -1 if the character is not numeric, or -2 if it has a fractional value. | 
|         |    215 */ | 
|         |    216 TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const | 
|         |    217 	{ | 
|         |    218 	switch (aData.iFlags & TUnicodeData::ENumericFlags) | 
|         |    219 		{ | 
|         |    220 		case TUnicodeData::ENonNumeric: return -1; | 
|         |    221 		case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF; | 
|         |    222 		case TUnicodeData::EFiveHundred: return 500; | 
|         |    223 		case TUnicodeData::EOneThousand: return 1000; | 
|         |    224 		case TUnicodeData::EFiveThousand: return 5000; | 
|         |    225 		case TUnicodeData::ETenThousand: return 10000; | 
|         |    226 		case TUnicodeData::EHundredThousand: return 100000; | 
|         |    227 		case TUnicodeData::EFraction: return -2; | 
|         |    228 		default: return -1; // we should never come here | 
|         |    229 		} | 
|         |    230 	} | 
|         |    231  | 
|         |    232 struct TWidthInfo | 
|         |    233 	{ | 
|         |    234 	TUint iStart; | 
|         |    235 	TUint iEnd; | 
|         |    236 	TChar::TCjkWidth iWidth; | 
|         |    237 	}; | 
|         |    238  | 
|         |    239 static const TWidthInfo TheWidthInfoTable[] = | 
|         |    240 	{ | 
|         |    241 	{ 0x0020, 0x007F, TChar::ENarrow }, | 
|         |    242 	{ 0x00A2, 0x00A4, TChar::ENarrow }, | 
|         |    243 	{ 0x00A5, 0x00A7, TChar::ENarrow }, | 
|         |    244 	{ 0x00AF, 0x00B0, TChar::ENarrow }, | 
|         |    245 	{ 0x00B1, 0x1100, TChar::ENeutralWidth }, | 
|         |    246 	{ 0x1100, 0x1160, TChar::EWide }, | 
|         |    247 	{ 0x1160, 0x2E80, TChar::ENeutralWidth }, | 
|         |    248 	{ 0x2E80, 0xD7A4, TChar::EWide }, | 
|         |    249 	{ 0xF900, 0xFA2E, TChar::EWide }, | 
|         |    250 	{ 0xFE30, 0xFE6C, TChar::EWide }, | 
|         |    251 	{ 0xFF01, 0xFF5F, TChar::EFullWidth }, | 
|         |    252 	{ 0xFF61, 0xFFDD, TChar::EHalfWidth }, | 
|         |    253 	{ 0xFFE0, 0xFFE7, TChar::EFullWidth }, | 
|         |    254 	{ 0xFFE8, 0xFFEF, TChar::EHalfWidth }, | 
|         |    255 	{ 0x20000, 0x2A6DF, TChar::EWide },		// CJK Unified Ideographs Extension B | 
|         |    256 	{ 0x2F800, 0x2FA1F, TChar::EWide },		// CJK Unified Ideographs Supplement | 
|         |    257 	}; | 
|         |    258  | 
|         |    259 const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]); | 
|         |    260  | 
|         |    261 /* | 
|         |    262 Get the notional width used by East Asian encoding systems. No check is made that the character is assigned. | 
|         |    263 No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those | 
|         |    264 in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table | 
|         |    265 is too great to justify at the moment. | 
|         |    266 */ | 
|         |    267 TChar::TCjkWidth TUnicode::GetCjkWidth() const | 
|         |    268 	{ | 
|         |    269 	const TWidthInfo* w = TheWidthInfoTable; | 
|         |    270 	for (TInt i = 0; i < TheWidthInfos; i++, w++) | 
|         |    271 		if (iCode >= w->iStart && iCode < w->iEnd) | 
|         |    272 			return w->iWidth; | 
|         |    273 	return TChar::ENeutralWidth; | 
|         |    274 	} | 
|         |    275  | 
|         |    276 /* | 
|         |    277 Convert a Unicode character into a form most likely to be equal to another character, while | 
|         |    278 still preserving the essential meaning of the character. Possible folding operations include | 
|         |    279 converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others. | 
|         |    280 The flag value has a default, TChar::EFoldStandard, which performs the folding operations done | 
|         |    281 by calling Fold functions with no flags argument, and there is also TChar::EFoldAll, | 
|         |    282 which performs all possible folding operations. | 
|         |    283  | 
|         |    284 Note that the difference between folding and collation is that folding is | 
|         |    285 	*	character-based | 
|         |    286 	*	biased towards yielding equality where possible | 
|         |    287 while collation is | 
|         |    288 	*	string-based | 
|         |    289 	*	designed to yield a non-equal ordering | 
|         |    290  | 
|         |    291 Typically, folding will be used when searching for a match, while collation will be used when | 
|         |    292 sorting a list. | 
|         |    293 */ | 
|         |    294 EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const | 
|         |    295 	{ | 
|         |    296 	TUint result = iCode; | 
|         |    297  | 
|         |    298 	/* | 
|         |    299 	Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use | 
|         |    300 	a built-in table. | 
|         |    301 	*/ | 
|         |    302 	if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth)) | 
|         |    303 		result = CjkWidthFoldTable[result & 0xFF]; | 
|         |    304  | 
|         |    305 	/* | 
|         |    306 	If the character is <= 0x00FF and the flags include folding case and stripping accents, | 
|         |    307 	and there is no overriding character data, we can use the built-in fold table. | 
|         |    308 	*/ | 
|         |    309 	const TUnicodeData* data = NULL; | 
|         |    310 	if (aOverridingDataSet) | 
|         |    311 		data = GetDataFromDataSet(*aOverridingDataSet); | 
|         |    312 	if (data == NULL && result < 256 && | 
|         |    313 		(aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents)) | 
|         |    314 		return FoldTable[result]; | 
|         |    315  | 
|         |    316 	/* | 
|         |    317 	Other characters have to be dealt with laboriously. | 
|         |    318 	The first operations are those that, if successful, tell us that nothing more | 
|         |    319 	need be done. If a value is folded to a space or a digit or converted to Katakana | 
|         |    320 	it cannot have anything else done to it. | 
|         |    321 	*/ | 
|         |    322 	if (aFlags & TChar::EFoldKana) | 
|         |    323 		{ | 
|         |    324 		if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E) | 
|         |    325 			return result += 0x0060; | 
|         |    326 		} | 
|         |    327 	if (data == NULL) | 
|         |    328 		data = &GetData(NULL); | 
|         |    329 	if (aFlags & TChar::EFoldSpaces) | 
|         |    330 		{ | 
|         |    331 		if (data->iCategory == TChar::EZsCategory) | 
|         |    332 			return 0x0020; | 
|         |    333 		} | 
|         |    334 	if (aFlags & TChar::EFoldDigits) | 
|         |    335 		{ | 
|         |    336 		TInt n = GetNumericValue(*data); | 
|         |    337 		if (n >= 0 && n <= 9) | 
|         |    338 			return 0x0030 + n; | 
|         |    339 		} | 
|         |    340  | 
|         |    341 	/* | 
|         |    342 	The final operations are the relatively rare and expensive ones (after the special | 
|         |    343 	case dealt with above) of accent removal and case conversion. | 
|         |    344 	*/ | 
|         |    345 	if ((aFlags & TChar::EFoldAccents) && (result < 0x2000)) | 
|         |    346 		{ | 
|         |    347 		/* | 
|         |    348 		Throw away characters other than the first if all are accents. For the moment these | 
|         |    349 		are defined as characters in the range 0x0300..0x0361. This definition may need | 
|         |    350 		to be modified; or I may decide to store a flag in the decomposition table indicating | 
|         |    351 		whether or not the decomposition consists of base + accent(s). | 
|         |    352 		*/ | 
|         |    353 		TPtrC16 decomposition; | 
|         |    354 		if (::DecomposeChar(iCode, decomposition)) | 
|         |    355 			{ | 
|         |    356 			TBool all_accents = TRUE;			 | 
|         |    357 			for (TInt i = 1; all_accents && i < decomposition.Length(); ++i) | 
|         |    358 				{ | 
|         |    359 				if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361) | 
|         |    360 					all_accents = FALSE; | 
|         |    361 				} | 
|         |    362 			if (all_accents) | 
|         |    363 				result = decomposition[0]; | 
|         |    364 			} | 
|         |    365 		} | 
|         |    366  | 
|         |    367 	if (aFlags & TChar::EFoldCase) | 
|         |    368 		{ | 
|         |    369 		if (aOverridingDataSet == NULL && result < 256) | 
|         |    370 			result = FoldTable[result]; | 
|         |    371 		else | 
|         |    372 			result = TUnicode(result).GetLowerCase(aOverridingDataSet); | 
|         |    373 		} | 
|         |    374 	 | 
|         |    375 	return result; | 
|         |    376 	} | 
|         |    377  | 
|         |    378 /* | 
|         |    379 Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison | 
|         |    380 of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they | 
|         |    381 compare greater than no character. This means that the string { 0x0001 0x0000 } always comes | 
|         |    382 after the string { 0x0001 }. | 
|         |    383  | 
|         |    384 This function exists to make it easier to search tables of Unicode strings (like the composition | 
|         |    385 buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table. | 
|         |    386  | 
|         |    387 The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2. | 
|         |    388 */ | 
|         |    389 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2) | 
|         |    390 	{ | 
|         |    391 	for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++) | 
|         |    392 		{ | 
|         |    393 		TInt x = i < aLength1 ? *aString1 : -1; | 
|         |    394 		TInt y = i < aLength2 ? *aString2 : -1; | 
|         |    395 		if (x != y) | 
|         |    396 			return x - y; | 
|         |    397 		} | 
|         |    398 	return 0; | 
|         |    399 	} | 
|         |    400  |