symport/e32/include/collate.h
changeset 1 0a7b44b10206
child 2 806186ab5e14
equal deleted inserted replaced
0:c55016431358 1:0a7b44b10206
       
     1 // Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 // All rights reserved.
       
     3 // This component and the accompanying materials are made available
       
     4 // under the terms of the License "Symbian Foundation License v1.0"
       
     5 // which accompanies this distribution, and is available
       
     6 // at the URL "http://www.symbianfoundation.org/legal/sfl-v10.html".
       
     7 //
       
     8 // Initial Contributors:
       
     9 // Nokia Corporation - initial contribution.
       
    10 //
       
    11 // Contributors:
       
    12 //
       
    13 // Description:
       
    14 // e32\include\collate.h
       
    15 // Definitions needed for Unicode collation.
       
    16 // Collation is the comparison of two Unicode strings to produce an ordering
       
    17 // that may be used in a dictionary or other list.
       
    18 // Collation is implemented using the Standard Unicode Collation algorithm. There
       
    19 // are four levels of comparison:
       
    20 // primary: basic character identity
       
    21 // secondary: accents and diacritics
       
    22 // tertiary: upper and lower case, and other minor attributes
       
    23 // quaternary: Unicode character value
       
    24 // Punctuation is normally ignored but can optionally be taken into account.
       
    25 // Strings are fully expanded using the standard Unicode canonical expansions before
       
    26 // they are compared. Thai and Lao vowels are swapped with the following character
       
    27 // if any.
       
    28 // EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
       
    29 // to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
       
    30 // the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
       
    31 // all the characters for which keys are defined, and ordered by their Unicode values.
       
    32 // Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
       
    33 // method. This is done by using the standard table as the main key table (signalled by placing NULL in
       
    34 // TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable).
       
    35 // Locale-specific collation data resides in ELOCL.
       
    36 // 
       
    37 //
       
    38 
       
    39 #ifndef __COLLATE_H__
       
    40 #define __COLLATE_H__
       
    41 
       
    42 #ifdef __KERNEL_MODE__
       
    43 #include <e32cmn.h>
       
    44 #else
       
    45 #include <e32std.h>
       
    46 #endif
       
    47 
       
    48 //This material is used in the Unicode build only.
       
    49 #ifdef _UNICODE
       
    50 
       
    51 /**
       
    52 Collation key table structure.
       
    53 @publishedPartner
       
    54 @released
       
    55 */
       
    56 struct TCollationKeyTable
       
    57 	{
       
    58 public:
       
    59 	/**
       
    60 	Masks for the various parts of the elements of the iKey array.
       
    61 	*/
       
    62 	enum
       
    63 		{
       
    64 		ELevel0Mask = 0xFFFF0000,	// primary key (bits 31-16) - basic character identity
       
    65 		ELevel1Mask = 0x0000FF00,	// secondary key (bits 15-8) - accents and diacritics
       
    66 		ELevel2Mask = 0x000000FC,	// tertiary key (bits 7-2) - case, etc.
       
    67 		EIgnoreFlag = 0x2,			// bit 1: if set, this key is normally ignored
       
    68 		EStopFlag = 0x1				// bit 0: if set, this key is the last in a sequence representing a Unicode value or values
       
    69 		};
       
    70 
       
    71 	/**
       
    72 	An array containing all of the keys and strings of keys concatenated
       
    73 	together. Each key has EStopFlag set only if it is the last key in its
       
    74 	string. Each key contains the keys for levels 0, 1 and 2, and a flag
       
    75 	EIgnoreFlag if the key is usually ignored (for punctuation & spaces
       
    76 	etc.).
       
    77 	*/
       
    78 	const TUint32* iKey;
       
    79 	/**
       
    80 	An array of indices into the iKey array. Each element has its high 16
       
    81 	bits indicating a Unicode value and its low 16 bits indicating an index
       
    82 	into the iKey array at which its key starts. The elements are sorted by
       
    83 	Unicode value.
       
    84 	*/
       
    85 	const TUint32* iIndex;
       
    86 	/**
       
    87 	The size of the iIndex array.
       
    88 	*/
       
    89 	TInt iIndices;
       
    90 	/**
       
    91 	Concatenated Unicode strings. Each is a string that is to be converted
       
    92 	to keys differently from how it would be if each letter were converted
       
    93 	independently. An example is "ch" in Spanish, which sorts as though it
       
    94 	were a single letter. Each Unicode string is preceded by a 16-bit value
       
    95 	indicating the string's length. The end of the string is not delimited.
       
    96 	*/
       
    97 	const TUint16* iString;
       
    98 	/**
       
    99 	An array of elements mapping elements of iString to elements of iIndex.
       
   100 	Each element has its high 16 bits indicating the index of the start of
       
   101 	an element of iString, and its low 16 bits indicating the corresponding
       
   102 	element in iIndex. This array is sorted on the string index.
       
   103 	*/
       
   104 	const TUint32* iStringIndex;
       
   105 	/**
       
   106 	The size of the iStringIndex array.
       
   107 	*/
       
   108 	TInt iStringIndices;
       
   109 	};
       
   110 
       
   111 /**
       
   112 Defines a collation method. 
       
   113 
       
   114 Collation means sorting pieces of text. It needs to take into account characters, 
       
   115 accents and case; spaces and punctuation are usually ignored. It differs from 
       
   116 ordinary methods of sorting in that it is locale-dependent - different 
       
   117 languages use different ordering methods. Additionally, multiple collation 
       
   118 methods may exist within the same locale.
       
   119 
       
   120 A collation method provides the collation keys and other data needed to customise 
       
   121 collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC()) 
       
   122 perform the collation. Note that these functions use the standard collation 
       
   123 method for the current locale - you only need to specify an object of class 
       
   124 TCollationMethod to customise this collation scheme. Collation methods can 
       
   125 be retrieved using member functions of the Mem class. Each one has a unique 
       
   126 identifier.
       
   127 
       
   128 A collation method specifies a main table of collation keys, and optionally 
       
   129 an overriding table that contains keys for which the values in the main table 
       
   130 are overridden. A collation key table (TCollationKeyTable) is the set of collation 
       
   131 keys: primary (basic character identity), secondary (accents and diacritics) 
       
   132 and tertiary (case). The quaternary key is the Unicode character values themselves.
       
   133 
       
   134 The simplest way to customise a collation method is to create a local copy 
       
   135 of the standard collation method and change it. For example, you could use 
       
   136 the standard method, but not ignore punctuation and spaces:
       
   137 
       
   138 @code
       
   139 TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method
       
   140 m.iFlags |= TCollationMethod::EIgnoreNone; // don't ignore punctuation and spaces
       
   141 @endcode
       
   142 
       
   143 @publishedPartner
       
   144 @released
       
   145 */
       
   146 struct TCollationMethod
       
   147 	{
       
   148 	public:
       
   149 	/**
       
   150 	The UID of this collation method.
       
   151 	*/
       
   152 	TUint iId;
       
   153 	
       
   154 	/**
       
   155 	The main collation key table; if NULL, use the standard table.
       
   156 	*/
       
   157 	const TCollationKeyTable* iMainTable;
       
   158 	
       
   159 	/**
       
   160 	If non-NULL, a tailoring table: keys that override those in the main table.
       
   161 	*/
       
   162 	const TCollationKeyTable* iOverrideTable;
       
   163 	enum
       
   164 		{
       
   165 		/**
       
   166 		Don't ignore any keys (punctuation, etc. is normally ignored).
       
   167 		*/
       
   168 		EIgnoreNone = 1,
       
   169 		
       
   170 		/**
       
   171 		Reverse the normal order for characters differing only in case.
       
   172 		*/
       
   173 		ESwapCase = 2,
       
   174 		
       
   175 		/**
       
   176 		Compare secondary keys which represent accents in reverse
       
   177 		order (from right to left); this is needed for French when comparing
       
   178 		words that differ only in accents.
       
   179 		*/
       
   180 		EAccentsBackwards = 4,	
       
   181 		
       
   182 		/**
       
   183 		Reverse the normal order for characters differing only in whether they
       
   184 		are katakana or hiragana.
       
   185 		*/
       
   186 		ESwapKana = 8,
       
   187 		
       
   188 		/**
       
   189 		Fold all characters to lower case before extracting keys; needed for
       
   190 		comparison of filenames, for which case is ignored but other
       
   191 		tertiary (level-2) distinctions are not.
       
   192 		*/
       
   193 		EFoldCase = 16,
       
   194 		
       
   195 		/** Flag to indicate a collation method for matching purposes.
       
   196 		This flag is only needed if we wish to specify a particular collation method
       
   197 		to be used for matching purposes.
       
   198 		*/
       
   199 		EMatchingTable = 32,
       
   200 		
       
   201 		/** Ignore the check for adjacent combining characters.  A combining
       
   202 		character effectively changes the character it combines with to something
       
   203 		else and so a match doesn't occur.  Setting this flag will allow character
       
   204 		matching regardless of any combining characters.
       
   205 		*/
       
   206 		EIgnoreCombining = 64
       
   207 		};
       
   208 		
       
   209 	/**
       
   210 	Flags: a bitwise OR of any of the seven enum values declared above (not only those listed under @see below).
       
   211 	
       
   212 	@see TCollationMethod::EIgnoreNone
       
   213 	@see TCollationMethod::ESwapCase
       
   214 	@see TCollationMethod::EAccentsBackwards
       
   215 	@see TCollationMethod::ESwapKana
       
   216 	@see TCollationMethod::EFoldCase
       
   217 	*/
       
   218 	TUint iFlags;
       
   219 	};
       
   220 
       
   221 /**
       
   222 A collation data set provides any collation methods needed by a locale.
       
   223 @publishedPartner
       
   224 @released
       
   225 */
       
   226 struct TCollationDataSet
       
   227 	{
       
   228 	public:
       
   229 	const TCollationMethod* iMethod;	// the collation method(s) this data set provides; NOTE(review): presumably an array of iMethods entries - confirm
       
   230 	TInt iMethods;	// NOTE(review): presumably the number of entries at iMethod - confirm
       
   231 	};
       
   232 
       
   233 // Collation method IDs
       
   234 
       
   235 /**
       
   236 UID identifying the basic collation method.
       
   237 @internalTechnology
       
   238 @released
       
   239 */
       
   240 const TUint KUidBasicCollationMethod = 0x10004F4E;
       
   241 
       
   242 /**
       
   243 UID identifying the standard Unicode collation method.
       
   244 @internalTechnology
       
   245 @released
       
   246 */
       
   247 const TUint KUidStandardUnicodeCollationMethod = 0x10004E96;
       
   248 
       
   249 #ifndef __KERNEL_MODE__
       
   250 
       
   251 //Forward declarations
       
   252 class TUTF32Iterator;
       
   253 struct LCharSet;
       
   254 
       
   255 /**
       
   256 Provides low-level collation functions.
       
   257 @internalComponent
       
   258 @released
       
   259 */
       
   260 class TCollate
       
   261 	{
       
   262 public:
       
   263 	/**
       
   264 	Construct a TCollate object based on the collation method specified
       
   265 	within aCharSet, if any. If there is none, or aCharSet is null, the
       
   266 	standard collation method will be used. aMask and aFlags provide a
       
   267 	method for overriding the flags in the collation method: Each flag set
       
   268 	to 1 in aMask is a flag that will be overridden and set to the
       
   269 	corresponding flag value in aFlags. Ownership of aCharSet is not passed.
       
   270 	*/
       
   271 	TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF);
       
   272 	/**
       
   273 	Construct a TCollate object based on an already constructed
       
   274 	TCollationMethod specified in aMethod. Ownership is not passed.
       
   275 	*/
       
   276 	TCollate(const TCollationMethod& aMethod);
       
   277 
       
   278 	enum TComparisonResult	// returned by Compare(); negative when the left string compares less or is a prefix of the right, positive for the converse
       
   279 		{
       
   280 		ELeftComparesLessAndIsNotPrefix = -2,
       
   281 		ELeftIsPrefixOfRight = -1,
       
   282 		EStringsIdentical = 0,
       
   283 		ERightIsPrefixOfLeft = 1,
       
   284 		ERightComparesLessAndIsNotPrefix = 2
       
   285 		};
       
   286 
       
   287 	/**
       
   288 	Compare the string beginning at aString1 of length aLength1 against the
       
   289 	string beginning at aString2 of length aLength2.
       
   290 	aMaxLevel determines the tightness of the collation. At level 0, only
       
   291 	character identities are distinguished. At level 1 accents are
       
   292 	distinguished as well. At level 2 case is distinguished as well. At
       
   293 	level 3 all valid different Unicode characters are considered different.
       
   294 	*/
       
   295 	TComparisonResult Compare(const TUint16* aString1,TInt aLength1,
       
   296 							  const TUint16* aString2,TInt aLength2,
       
   297 							  TInt aMaxLevel = 3) const;
       
   298 	/**
       
   299 	Find the string beginning at aString2 of length aLength2 in the string
       
   300 	beginning at aString1 of length aLength1. aMaxLevel determines
       
   301 	the tightness of the collation, see Compare for details.
       
   302 	*/
       
   303 	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
       
   304 			  TInt aMaxLevel,TUint aString2WildChar = 0) const;
       
   305 			  
       
   306 	TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2,
       
   307 		      TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const;	// overload of Find with an extra out-parameter; NOTE(review): aLengthFound presumably receives the length of the match - confirm
       
   308 		      
       
   309 	/**
       
   310 	Test if the string beginning at aSearchTerm of length aSearchTermLength
       
   311 	matches the string beginning at aCandidate of length aCandidateLength.
       
   312 	aMaxLevel determines the tightness of the collation, see
       
   313 	Compare for details. The search term may have wild card characters as
       
   314 	specified by aWildChar (for matching a single grapheme- i.e. character
       
   315 	and any characters that combine with it, such as accents) and
       
   316 	aWildSequenceChar (for matching any sequence of whole graphemes). The
       
   317 	return value is KErrNotFound iff the search term does not match the
       
   318 	candidate string exactly. To find a match within the candidate string,
       
   319 	the search term must begin and end with a wild sequence character. If
       
   320 	the search term does match the candidate string, 0 will be returned,
       
   321 	unless the first character of the search term is a wild sequence
       
   322 	character in which case the value returned will be the index into
       
   323 	aCandidate at which the first non-wild sequence character matched.
       
   324 	aWildSequenceChar must be a valid (non-surrogate) Unicode character
       
   325 	below FFFE. NOTE(review): aEscapeChar is not described here; presumably it escapes the wild characters in the search term - confirm.
       
   326 	*/
       
   327 	TInt Match(const TUint16 *aCandidate, TInt aCandidateLength,
       
   328 			   const TUint16 *aSearchTerm,TInt aSearchTermLength,
       
   329 			   TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const;
       
   330 
       
   331 private:
       
   332 	/**
       
   333 	Compare values output from the iterators. After the comparison, if
       
   334 	ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and
       
   335 	aRight will be pointing at the next key (at MaxLevel) after the match.
       
   336 	If right is shown to be a prefix of left, this means that it has been
       
   337 	checked at all requested levels. If it is reported that the right is a
       
   338 	prefix of the left, then this will mean also that there are no unmatched
       
   339 	combining characters on the left.
       
   340 	*/
       
   341 	TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight,
       
   342 										  TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const;
       
   343 	/**
       
   344 	Finds search term inside candidate string. Returns KErrNotFound if there
       
   345 	is no match, returns the offset into the candidate string at which the
       
   346 	search term was found (note that this is the offset from the start of
       
   347 	the iteration, not from where the iteration was when the function was
       
   348 	called). If a string was found, the search term iterator is left
       
   349 	pointing at the end of the search term, and the candidate iterator is
       
   350 	left pointing just after the matched keys. aMatchPos returns where in
       
   351 	the candidate string the match was found. NOTE(review): the declaration below has no aMatchPos parameter (its out-parameter is aLengthFound); this sentence looks stale - confirm.
       
   352 	*/
       
   353 	TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm,
       
   354 						 TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const;
       
   355 
       
   356 private:
       
   357 	TCollationMethod iMethod;
       
   358 	};
       
   359 
       
   360 #endif	// __KERNEL_MODE__
       
   361 
       
   362 #endif // _UNICODE
       
   363 
       
   364 #endif // __COLLATE_H__