WebCore/icu/unicode/ucsdet.h
changeset 0 4f2f89ce4247
equal deleted inserted replaced
-1:000000000000 0:4f2f89ce4247
       
     1 /*
       
     2  **********************************************************************
       
     3  *   Copyright (C) 2005-2006, International Business Machines
       
     4  *   Corporation and others.  All Rights Reserved.
       
     5  **********************************************************************
       
     6  *   file name:  ucsdet.h
       
     7  *   encoding:   US-ASCII
       
     8  *   indentation:4
       
     9  *
       
    10  *   created on: 2005Aug04
       
    11  *   created by: Andy Heninger
       
    12  *
       
    13  *   ICU Character Set Detection, API for C
       
    14  *
       
    15  *   Draft version 18 Oct 2005
       
    16  *
       
    17  */
       
    18 
       
    19 #ifndef __UCSDET_H
       
    20 #define __UCSDET_H
       
    21 
       
    22 #include "unicode/utypes.h"
       
    23 
       
    24 #if !UCONFIG_NO_CONVERSION
       
    25 #include "unicode/uenum.h"
       
    26 
       
    27 /**
       
    28  * \file 
       
    29  * \brief C API: Charset Detection API
       
    30  *
       
    31  * This API provides a facility for detecting the
       
    32  * charset or encoding of character data in an unknown text format.
       
    33  * The input data can be from an array of bytes.
       
    34  * <p>
       
    35  * Character set detection is at best an imprecise operation.  The detection
       
    36  * process will attempt to identify the charset that best matches the characteristics
       
    37  * of the byte data, but the process is partly statistical in nature, and
       
    38  * the results can not be guaranteed to always be correct.
       
    39  * <p>
       
    40  * For best accuracy in charset detection, the input data should be primarily
       
    41  * in a single language, and a minimum of a few hundred bytes worth of plain text
       
    42  * in the language are needed.  The detection process will attempt to
       
    43  * ignore html or xml style markup that could otherwise obscure the content.
       
    44  */
       
    45  
       
    46 
       
    47 struct UCharsetDetector;
       
    48 /**
       
    49   * Structure representing a charset detector
       
    50   * @draft ICU 3.6
       
    51   */
       
    52 typedef struct UCharsetDetector UCharsetDetector;
       
    53 
       
    54 struct UCharsetMatch;
       
    55 /**
       
    56   *  Opaque structure representing a match that was identified
       
    57   *  from a charset detection operation.
       
    58   *  @draft ICU 3.6
       
    59   */
       
    60 typedef struct UCharsetMatch UCharsetMatch;
       
    61 
       
    62 /**
       
    63   *  Open a charset detector.
       
    64   *
       
    65   *  @param status Any error conditions occurring during the open
       
    66   *                operation are reported back in this variable.
       
    67   *  @return the newly opened charset detector.
       
    68   *  @draft ICU 3.6
       
    69   */
       
    70 U_DRAFT UCharsetDetector * U_EXPORT2
       
    71 ucsdet_open(UErrorCode   *status);
       
    72 
       
    73 /**
       
    74   * Close a charset detector.  All storage and any other resources
       
    75   *   owned by this charset detector will be released.  Failure to
       
    76   *   close a charset detector when finished with it can result in
       
    77   *   memory leaks in the application.
       
    78   *
       
    79   *  @param ucsd  The charset detector to be closed.
       
    80   *  @draft ICU 3.6
       
    81   */
       
    82 U_DRAFT void U_EXPORT2
       
    83 ucsdet_close(UCharsetDetector *ucsd);
       
    84 
       
    85 /**
       
    86   * Set the input byte data whose charset is to be detected.
       
    87   *
       
    88   * Ownership of the input  text byte array remains with the caller.
       
    89   * The input string must not be altered or deleted until the charset
       
    90   * detector is either closed or reset to refer to different input text.
       
    91   *
       
    92   * @param ucsd   the charset detector to be used.
       
    93   * @param textIn the input text of unknown encoding.
       
    94   * @param len    the length of the input text, or -1 if the text
       
    95   *               is NUL terminated.
       
    96   * @param status any error conditions are reported back in this variable.
       
    97   *
       
    98   * @draft ICU 3.6
       
    99   */
       
   100 U_DRAFT void U_EXPORT2
       
   101 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
       
   102 
       
   103 
       
   104 /** Set the declared encoding for charset detection.
       
   105  *  The declared encoding of an input text is an encoding obtained
       
   106  *  by the user from an http header or xml declaration or similar source that
       
   107  *  can be provided as an additional hint to the charset detector.
       
   108  *
       
   109  *  How and whether the declared encoding will be used during the
       
   110  *  detection process is TBD.
       
   111  *
       
   112  * @param ucsd      the charset detector to be used.
       
   113  * @param encoding  an encoding for the current data obtained from
       
   114  *                  a header or declaration or other source outside
       
   115  *                  of the byte data itself.
       
   116  * @param length    the length of the encoding name, or -1 if the name string
       
   117  *                  is NUL terminated.
       
   118  * @param status    any error conditions are reported back in this variable.
       
   119  *
       
   120  * @draft ICU 3.6
       
   121  */
       
   122 U_DRAFT void U_EXPORT2
       
   123 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
       
   124 
       
   125 
       
   126 /**
       
   127  * Return the charset that best matches the supplied input data.
       
   128  * 
       
   129  * Note though, that because the detection 
       
   130  * only looks at the start of the input data,
       
   131  * there is a possibility that the returned charset will fail to handle
       
   132  * the full set of input data.
       
   133  * <p>
       
   134  * The returned UCharsetMatch object is owned by the UCharsetDetector.
       
   135  * It will remain valid until the detector input is reset, or until
       
   136  * the detector is closed.
       
   137  * <p>
       
   138  * The function will fail if
       
   139  *  <ul>
       
   140  *    <li>no charset appears to match the data.</li>
       
   141  *    <li>no input text has been provided</li>
       
   142  *  </ul>
       
   143  *
       
   144  * @param ucsd      the charset detector to be used.
       
   145  * @param status    any error conditions are reported back in this variable.
       
   146  * @return          a UCharsetMatch  representing the best matching charset,
       
   147  *                  or NULL if no charset matches the byte data.
       
   148  *
       
   149  * @draft ICU 3.6
       
   150  */
       
   151 U_DRAFT const UCharsetMatch * U_EXPORT2
       
   152 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
       
   153     
       
   154 
       
   155 /**
       
   156  *  Find all charset matches that appear to be consistent with the input,
       
   157  *  returning an array of results.  The results are ordered with the
       
   158  *  best quality match first.
       
   159  *
       
   160  *  Because the detection only looks at a limited amount of the
       
   161  *  input byte data, some of the returned charsets may fail to handle
       
   162  *  all of the input data.
       
   163  *  <p>
       
   164  *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
       
   165  *  They will remain valid until the detector is closed or modified.
       
   166  *  
       
   167  * <p>
       
   168  * Return an error if 
       
   169  *  <ul>
       
   170  *    <li>no charsets appear to match the input data.</li>
       
   171  *    <li>no input text has been provided</li>
       
   172  *  </ul>
       
   173  * 
       
   174  * @param ucsd          the charset detector to be used.
       
   175  * @param matchesFound  pointer to a variable that will be set to the
       
   176  *                      number of charsets identified that are consistent with
       
   177  *                      the input data.  Output only.
       
   178  * @param status        any error conditions are reported back in this variable.
       
   179  * @return              A pointer to an array of pointers to UCharsetMatch objects.
       
   180  *                      This array, and the UCharsetMatch instances to which it refers,
       
   181  *                      are owned by the UCharsetDetector, and will remain valid until
       
   182  *                      the detector is closed or modified.
       
   183  * @draft ICU 3.6
       
   184  */
       
   185 U_DRAFT const UCharsetMatch ** U_EXPORT2
       
   186 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
       
   187 
       
   188 
       
   189 
       
   190 /**
       
   191  *  Get the name of the charset represented by a UCharsetMatch.
       
   192  *
       
   193  *  The storage for the returned name string is owned by the
       
   194  *  UCharsetMatch, and will remain valid while the UCharsetMatch
       
   195  *  is valid.
       
   196  *
       
   197  *  The name returned is suitable for use with the ICU conversion APIs.
       
   198  *
       
   199  *  @param ucsm    The charset match object.
       
   200  *  @param status  Any error conditions are reported back in this variable.
       
   201  *  @return        The name of the matching charset.
       
   202  *
       
   203  *  @draft ICU 3.6
       
   204  */
       
   205 U_DRAFT const char * U_EXPORT2
       
   206 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
       
   207 
       
   208 /**
       
   209  *  Get a confidence number for the quality of the match of the byte
       
   210  *  data with the charset.  Confidence numbers range from zero to 100,
       
   211  *  with 100 representing complete confidence and zero representing
       
   212  *  no confidence.
       
   213  *
       
   214  *  The confidence values are somewhat arbitrary.  They define an
       
   215  *  ordering within the results for any single detection operation
       
   216  *  but are not generally comparable between the results for different input.
       
   217  *
       
   218  *  A confidence value of ten does have a general meaning - it is used
       
   219  *  for charsets that can represent the input data, but for which there
       
   220  *  is no other indication that suggests that the charset is the correct one.
       
   221  *  Pure 7 bit ASCII data, for example, is compatible with a
       
   222  *  great many charsets, most of which will appear as possible matches
       
   223  *  with a confidence of 10.
       
   224  *
       
   225  *  @param ucsm    The charset match object.
       
   226  *  @param status  Any error conditions are reported back in this variable.
       
   227  *  @return        A confidence number for the charset match.
       
   228  *
       
   229  *  @draft ICU 3.6
       
   230  */
       
   231 U_DRAFT int32_t U_EXPORT2
       
   232 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
       
   233 
       
   234 /**
       
   235  *  Get the RFC 3066 code for the language of the input data.
       
   236  *
       
   237  *  The Charset Detection service is intended primarily for detecting
       
   238  *  charsets, not language.  For some, but not all, charsets, a language is
       
   239  *  identified as a byproduct of the detection process, and that is what
       
   240  *  is returned by this function.
       
   241  *
       
   242  *  CAUTION:
       
   243  *    1.  Language information is not available for input data encoded in
       
   244  *        all charsets. In particular, no language is identified
       
   245  *        for UTF-8 input data.
       
   246  *
       
   247  *    2.  Closely related languages may sometimes be confused.
       
   248  *
       
   249  *  If more accurate language detection is required, a linguistic
       
   250  *  analysis package should be used.
       
   251  *
       
   252  *  The storage for the returned name string is owned by the
       
   253  *  UCharsetMatch, and will remain valid while the UCharsetMatch
       
   254  *  is valid.
       
   255  *
       
   256  *  @param ucsm    The charset match object.
       
   257  *  @param status  Any error conditions are reported back in this variable.
       
   258  *  @return        The RFC 3066 code for the language of the input data, or
       
   259  *                 an empty string if the language could not be determined.
       
   260  *
       
   261  *  @draft ICU 3.6
       
   262  */
       
   263 U_DRAFT const char * U_EXPORT2
       
   264 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
       
   265 
       
   266 
       
   267 /**
       
   268   *  Get the entire input text as a UChar string, placing it into
       
   269   *  a caller-supplied buffer.  A terminating
       
   270   *  NUL character will be appended to the buffer if space is available.
       
   271   *
       
   272   *  The number of UChars in the output string, not including the terminating
       
   273   *  NUL, is returned. 
       
   274   *
       
   275   *  If the supplied buffer is smaller than required to hold the output,
       
   276   *  the contents of the buffer are undefined.  The full output string length
       
   277   *  (in UChars) is returned as always, and can be used to allocate a buffer
       
   278   *  of the correct size.
       
   279   *
       
   280   *
       
   281   * @param ucsm    The charset match object.
       
   282   * @param buf     A UChar buffer to be filled with the converted text data.
       
   283   * @param cap     The capacity of the buffer in UChars.
       
   284   * @param status  Any error conditions are reported back in this variable.
       
   285   * @return        The number of UChars in the output string.
       
   286   *
       
   287   * @draft ICU 3.6
       
   288   */
       
   289 U_DRAFT  int32_t U_EXPORT2
       
   290 ucsdet_getUChars(const UCharsetMatch *ucsm,
       
   291                  UChar *buf, int32_t cap, UErrorCode *status);
       
   292 
       
   293 
       
   294 
       
   295 /**
       
   296   *  Get an iterator over the set of all detectable charsets - 
       
   297   *  over the charsets that are known to the charset detection
       
   298   *  service.
       
   299   *
       
   300   *  The returned UEnumeration provides access to the names of
       
   301   *  the charsets.
       
   302   *
       
   303   *  The state of the Charset detector that is passed in does not
       
   304   *  affect the result of this function, but requiring a valid, open
       
   305   *  charset detector as a parameter ensures that the charset detection
       
   306   *  service has been safely initialized and that the required detection
       
   307   *  data is available.
       
   308   *
       
   309   *  @param ucsd a Charset detector.
       
   310   *  @param status  Any error conditions are reported back in this variable.
       
   311   *  @return an iterator providing access to the detectable charset names.
       
   312   *  @draft ICU 3.6
       
   313   */
       
   314 
       
   315 U_DRAFT  UEnumeration * U_EXPORT2
       
   316 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
       
   317 
       
   318 
       
   319 /**
       
   320   *  Test whether input filtering is enabled for this charset detector.
       
   321   *  Input filtering removes text that appears to be HTML or xml
       
   322   *  markup from the input before applying the code page detection
       
   323   *  heuristics.
       
   324   *
       
   325   *  @param ucsd  The charset detector to check.
       
   326   *  @return TRUE if filtering is enabled.
       
   327   *  @draft ICU 3.6
       
   328   */
       
   329 U_DRAFT  UBool U_EXPORT2
       
   330 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
       
   331 
       
   332 
       
   333 /**
       
   334  * Enable filtering of input text. If filtering is enabled,
       
   335  * text within angle brackets ("<" and ">") will be removed
       
   336  * before detection, which will remove most HTML or xml markup.
       
   337  *
       
   338  * @param ucsd   the charset detector to be modified.
       
   339  * @param filter <code>TRUE</code> to enable input text filtering, <code>FALSE</code> to disable it.
       
   340  * @return The previous setting.
       
   341  *
       
   342  * @draft ICU 3.6
       
   343  */
       
   344 U_DRAFT  UBool U_EXPORT2
       
   345 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
       
   346 
       
   347 #endif
       
   348 #endif   /* __UCSDET_H */
       
   349 
       
   350