webengine/wmlengine/src/htmlp/src/htmlp_lexer.c
changeset 0 dd21522fd290
child 26 cb62a4f66ebe
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/webengine/wmlengine/src/htmlp/src/htmlp_lexer.c	Mon Mar 30 12:54:55 2009 +0300
@@ -0,0 +1,476 @@
+/*
+* Copyright (c) 2000 - 2001 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of the License "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: 
+*
+*/
+
+
+/*
+
+Lexer for HTML parser.
+
+*/
+#ifndef FEA_RME_NOHTMLPARSER
+
+#include "nwx_defs.h"
+#include "nw_htmlp_lexer.h"
+#include "nw_string_char.h"
+#include "BrsrStatusCodes.h"
+
+/* Does not copy the buffer */
+TBrowserStatusCode NW_HTMLP_Lexer_InitFromBuffer(NW_HTMLP_Lexer_t* pL,
+                                          NW_Uint32 byteCount,
+                                          NW_Uint8* pBuf,
+                                          NW_HTMLP_ElementTableIndex_t elementCount,
+                                          NW_HTMLP_ElementDescriptionConst_t* pElementDictionary)
+{
+  if ((byteCount == 0) || (pBuf == NULL)) {
+    return KBrsrFailure;
+  }
+  pL->encoding = 0;
+  pL->endianness = NW_NATIVE_ENDIAN;
+  pL->readPosition = 0;
+  pL->charPosition = 0;
+  pL->lineColumn.crCount = 0;
+  pL->lineColumn.lfCount = 0;
+  pL->lineColumn.charsSinceLastCR = 0;
+  pL->lineColumn.charsSinceLastLF = 0;
+  pL->end = NW_FALSE;
+  pL->byteCount = byteCount;
+  pL->pBuf = pBuf;
+  pL->elementCount = elementCount;
+  pL->pElementDictionary = pElementDictionary;
+  return KBrsrSuccess;
+}
+
+/* Returns a pointer in *ppData into pBuf at some byte position
+and byte count, byte count is truncated to fit in pBuf if required. */
+TBrowserStatusCode NW_HTMLP_Lexer_DataAddressFromBuffer(NW_HTMLP_Lexer_t* pL,
+                                                 NW_Uint32 byteIndex,
+                                                 NW_Uint32* pByteCount,
+                                                 NW_Uint8** ppData)
+{
+  NW_ASSERT(byteIndex < pL->byteCount);
+  NW_ASSERT(*pByteCount <= pL->byteCount);
+  NW_ASSERT((byteIndex + *pByteCount) <= pL->byteCount);
+  *ppData = NULL;
+  if (byteIndex < pL->byteCount) {
+    *ppData = pL->pBuf + byteIndex;
+    if ((byteIndex + *pByteCount) > pL->byteCount) {
+      *pByteCount = pL->byteCount - byteIndex;
+    }
+    return KBrsrSuccess;
+  }
+  return KBrsrFailure;
+}
+
+/* peekOrAdvance: first arg "advance": peek = NW_FALSE, advance = NW_TRUE */
+static
+TBrowserStatusCode NW_HTMLP_Lexer_PeekOrAdvanceOffset(NW_Bool advance,
+                                               NW_HTMLP_Lexer_t* pL,
+                                               NW_Uint32 offsetCharCount,
+                                               NW_Uint32* pC,
+                                               NW_Bool* pEOF)
+{
+  NW_Uint32 i;
+  NW_Uint32 charCount = 0;
+  NW_Int32 byteCount = 0;
+  NW_Uint32 crCount = 0;
+  NW_Uint32 lfCount = 0;
+  NW_Uint32 charsPastCR = 0;
+  NW_Uint32 charsPastLF = 0;
+  NW_Ucs2 c_ucs2;
+  NW_Bool resetPastCR = 0;
+  NW_Bool resetPastLF = 0;
+
+  /* it makes no sense to advance by 0 */
+  NW_ASSERT((advance == NW_FALSE) || (offsetCharCount > 0));
+
+  if (NW_HTMLP_Lexer_AtEnd(pL)) {
+    *pEOF = NW_TRUE;
+    return KBrsrSuccess;
+  }
+  *pEOF = NW_FALSE;
+  i = pL->readPosition;
+  do {
+    /* It is assumed that this func returns UNICODE code points. */
+    byteCount = NW_String_readChar(&(pL->pBuf[i]),
+                                   &c_ucs2, pL->encoding);
+    /* This catches NW_String_readChar() reading past end of buffer
+    and can be removed when the readChar function does proper
+    error checking (requires passing in buf length). */
+    if ((i + byteCount) > pL->byteCount) {
+      return KBrsrFailure;
+    }
+    *pC = c_ucs2;
+    if (byteCount == -1) {
+      return KBrsrFailure;
+    }
+    if (charCount == offsetCharCount) {
+      break;
+    }
+    charCount++;
+    charsPastCR++;
+    charsPastLF++;
+    if (c_ucs2 == 0xd /* CR */) {
+      crCount++;
+      resetPastCR = 1;
+      charsPastCR = 0;
+    } else if (c_ucs2 == 0xa /* LF */) {
+      lfCount++;
+      resetPastLF = 1;
+      charsPastLF = 0;
+    }
+    i += byteCount;
+    if (pL->encoding == HTTP_iso_10646_ucs_2 &&
+      i == (pL->byteCount - 1))
+    {
+      *pEOF = NW_TRUE;
+      break;
+    }
+    if (pL->encoding == HTTP_utf_8 &&
+        (i + 3) > pL->byteCount)
+      {
+        *pEOF = NW_TRUE;
+        break;
+      }
+  } while (i < pL->byteCount);
+  if (i >= pL->byteCount) {
+    *pEOF = NW_TRUE;
+  }
+  if (advance == NW_TRUE) {
+    if (*pEOF == NW_TRUE) {
+      pL->readPosition = pL->byteCount;
+      pL->end = NW_TRUE;
+    } else {
+      pL->readPosition = i;
+    }
+    pL->charPosition += charCount;
+    pL->lineColumn.crCount += crCount;
+    pL->lineColumn.lfCount += lfCount;
+    if (resetPastCR) {
+      pL->lineColumn.charsSinceLastCR = charsPastCR;
+    } else {
+      pL->lineColumn.charsSinceLastCR += charsPastCR;
+    }
+    if (resetPastLF) {
+      pL->lineColumn.charsSinceLastLF = charsPastLF;
+    } else {
+      pL->lineColumn.charsSinceLastLF += charsPastLF;
+    }
+  }
+  return KBrsrSuccess;
+}
+
+TBrowserStatusCode NW_HTMLP_Lexer_PeekOffset(NW_HTMLP_Lexer_t* pL,
+                                      NW_Uint32 offsetCharCount,
+                                      NW_Uint32* pC,
+                                      NW_Bool* pEOF)
+{
+  return NW_HTMLP_Lexer_PeekOrAdvanceOffset(NW_FALSE, pL,
+                                            offsetCharCount, pC, pEOF);
+}
+
+TBrowserStatusCode NW_HTMLP_Lexer_AdvanceOffset(NW_HTMLP_Lexer_t* pL,
+                                         NW_Uint32 offsetCharCount)
+{
+  NW_Uint32 c;
+  NW_Bool eof;
+  return NW_HTMLP_Lexer_PeekOrAdvanceOffset(NW_TRUE, pL,
+                                            offsetCharCount, &c, &eof);
+}
+
+void NW_HTMLP_Lexer_GetPosition(NW_HTMLP_Lexer_t* pL, NW_HTMLP_Lexer_Position_t* pPosition)
+{
+  pPosition->readPosition = pL->readPosition;
+  pPosition->charPosition = pL->charPosition;
+  pPosition->crCount = pL->lineColumn.crCount;
+  pPosition->lfCount = pL->lineColumn.lfCount;
+  pPosition->charsSinceLastCR = pL->lineColumn.charsSinceLastCR;
+  pPosition->charsSinceLastLF = pL->lineColumn.charsSinceLastLF;
+  pPosition->end = pL->end;
+}
+
+/* Note: Setting the position (similar to seeking in a file) is in general
+not possible without reading the characters (usually reading forward) because
+character encoding may use a variable numbers of bytes per character. This is
+here so that if you have defined a valid interval, then you can reposition to
+the beginning of the interval. Setting to the position to a bad value will
+not always be caught immediately. Don't forget to also save and set line
+and column with position. */
+TBrowserStatusCode NW_HTMLP_Lexer_SetPosition(NW_HTMLP_Lexer_t* pL, NW_HTMLP_Lexer_Position_t* pPosition)
+{
+  if ((pPosition->readPosition > pL->byteCount)
+      || (pPosition->end && (pPosition->readPosition != pL->byteCount))) {
+    return KBrsrFailure;
+  }
+  pL->readPosition = pPosition->readPosition;
+  pL->charPosition = pPosition->charPosition;
+  pL->lineColumn.crCount = pPosition->crCount;
+  pL->lineColumn.lfCount = pPosition->lfCount;
+  pL->lineColumn.charsSinceLastCR = pPosition->charsSinceLastCR;
+  pL->lineColumn.charsSinceLastLF = pPosition->charsSinceLastLF;
+  pL->end = pPosition->end;
+
+  return KBrsrSuccess;
+}
+
+TBrowserStatusCode NW_HTMLP_Lexer_IsSpace(NW_HTMLP_Lexer_t* pL, NW_Bool* pMatch)
+{
+  NW_Uint32 c;
+  TBrowserStatusCode e;
+  NW_Bool eof;
+
+  *pMatch = NW_FALSE;
+  e = NW_HTMLP_Lexer_Peek(pL, &c, &eof);
+  if (BRSR_STAT_IS_FAILURE(e)) {
+    return e;
+  }
+  if (eof == NW_TRUE) {
+    return KBrsrSuccess;
+  }
+  /* "space" is defined in HTML to be the following codepoints:
+  0x20 (space), 0x9 (tab), 0xc (form feed), 0x200b (zero-width space),
+  0xd (cr), 0xa (lf) */
+  if ((c == 0x20U) || (c == 0x9U) || (c == 0xcU)
+      || (c== 0x200bU) || (c == 0xdU) || (c == 0xaU)) {
+    *pMatch = NW_TRUE;
+  }
+  return KBrsrSuccess;
+}
+
+TBrowserStatusCode NW_HTMLP_Lexer_IsCRLF(NW_HTMLP_Lexer_t* pL, NW_Bool* pMatch)
+{
+  NW_Uint32 c;
+  TBrowserStatusCode e;
+  NW_Bool eof;
+
+  *pMatch = NW_FALSE;
+  e = NW_HTMLP_Lexer_Peek(pL, &c, &eof);
+  if (BRSR_STAT_IS_FAILURE(e)) {
+    return e;
+  }
+  if (eof == NW_TRUE) {
+    return KBrsrSuccess;
+  }
+  /* 0xd (CR), 0xa (LF) */
+  if ((c == 0xdU) || (c == 0xaU)) {
+    *pMatch = NW_TRUE;
+  }
+  return KBrsrSuccess;
+}
+
+/* on return: *pMatch == NW_TRUE if character is in [a-zA-Z] */
+TBrowserStatusCode NW_HTMLP_Lexer_IsAsciiLetter(NW_HTMLP_Lexer_t* pL, NW_Bool* pMatch)
+{
+  NW_Uint32 c;
+  TBrowserStatusCode e;
+  NW_Bool eof;
+
+  *pMatch = NW_FALSE;
+  e = NW_HTMLP_Lexer_Peek(pL, &c, &eof);
+  if (BRSR_STAT_IS_FAILURE(e)) {
+    return e;
+  }
+  if (eof == NW_TRUE) {
+    return KBrsrSuccess;
+  }
+  if ( ( (c >= (NW_Uint32)'a') && (c <= (NW_Uint32)'z') )
+      || ( (c >= (NW_Uint32)'A') && (c <= (NW_Uint32)'Z') ) ) {
+    *pMatch = NW_TRUE;
+  }
+  return KBrsrSuccess;
+}
+
+/* on return: *pMatch == NW_TRUE if character is in [0-9] */
+TBrowserStatusCode NW_HTMLP_Lexer_IsAsciiDigit(NW_HTMLP_Lexer_t* pL, NW_Bool* pMatch)
+{
+  NW_Uint32 c;
+  TBrowserStatusCode e;
+  NW_Bool eof;
+
+  *pMatch = NW_FALSE;
+  e = NW_HTMLP_Lexer_Peek(pL, &c, &eof);
+  if (BRSR_STAT_IS_FAILURE(e)) {
+    return e;
+  }
+  if (eof == NW_TRUE) {
+    return KBrsrSuccess;
+  }
+  if ((c >= (NW_Uint32)'0') && (c <= (NW_Uint32)'9')) {
+    *pMatch = NW_TRUE;
+  }
+  return KBrsrSuccess;
+}
+
+/* *pMatch is NW_TRUE iff ASCII string matches the text in its encoding */
+TBrowserStatusCode NW_HTMLP_Lexer_AsciiCharCompare(NW_HTMLP_Lexer_t* pL,
+                                            NW_Uint8 asciiChar,
+                                            NW_Bool* pMatch)
+{
+  NW_Uint32 c_text;
+  TBrowserStatusCode e;
+  NW_Bool eof;
+
+  *pMatch = NW_FALSE;
+  e = NW_HTMLP_Lexer_Peek(pL, &c_text, &eof);
+  if (BRSR_STAT_IS_FAILURE(e)) {
+    return e;
+  }
+  if (eof == NW_TRUE) {
+    return KBrsrSuccess;
+  }
+  if (c_text == asciiChar) {
+    *pMatch = NW_TRUE;
+  }
+  return KBrsrSuccess;
+}
+
+/*
+on entry: no assumptions
+on return: If matched string, then *pMatch == NW_TRUE.
+.........: If did not match string, then *pMatch == NW_FALSE.
+.........: In either case, lexer read position is unchanged
+eof handling: if encounters EOF while attempting operation then returns
+............: *pMatch == NW_FALSE and KBrsrSuccess, and
+............: lexer read position is unchanged
+on error return: return value is not KBrsrSuccess, *pMatch == NW_FALSE
+...............: and lexer read position is unchanged
+*/
+TBrowserStatusCode NW_HTMLP_Lexer_AsciiStringCompare(NW_HTMLP_Lexer_t* pL,
+                                              NW_Uint32 asciiCharCount,
+                                              const NW_Uint8* pString,
+                                              NW_Bool* pMatch)
+{
+  return NW_HTMLP_Lexer_AsciiStringCompareCase(pL,
+                                              asciiCharCount,
+                                              pString,
+                                              NW_TRUE,
+                                              pMatch);
+}
+
+/* perform the same functionality as NW_HTMLP_Lexer_AsciiStringCompareCase
+   except for the ability to perform both case insensitive check and 
+   case sensitive check
+*/
+TBrowserStatusCode NW_HTMLP_Lexer_AsciiStringCompareCase(NW_HTMLP_Lexer_t* pL,
+                                              NW_Uint32 asciiCharCount,
+                                              const NW_Uint8* pString,
+                                              NW_Bool CaseSensitive,
+                                              NW_Bool* pMatch)
+{
+  NW_Uint32 c_text;
+  NW_Uint32 i;
+  TBrowserStatusCode e = KBrsrSuccess;
+  NW_Bool eof;
+
+  *pMatch = NW_FALSE;
+  NW_ASSERT(asciiCharCount);
+  for (i = 0; i < asciiCharCount; i++) {
+
+    e = NW_HTMLP_Lexer_PeekOffset(pL, i, &c_text, &eof);
+    if (BRSR_STAT_IS_FAILURE(e)) {
+      break;
+    }
+    if (eof == NW_TRUE) {
+      break;
+    }
+    if (c_text != pString[i]) {
+      if (!CaseSensitive)
+      {
+        if (c_text + 'A' - 'a' == pString[i]) //Small case to Upper
+          continue;
+		if(c_text + 'a' - 'A' == pString[i]) //Upper case to small 
+          continue;
+      }
+      break;
+    }
+  }
+  if (i == asciiCharCount) {
+    *pMatch = NW_TRUE;
+  }
+  return e;
+}
+
+/* Sets start, stop, charStart, charStop to current read position. */
+void NW_HTMLP_Interval_Start(NW_HTMLP_Interval_t* pI, NW_HTMLP_Lexer_t* pL)
+{
+  /* set both start and stop for safety in later use */
+  pI->start = pI->stop = pL->readPosition;
+  pI->charStart = pI->charStop = pL->charPosition;
+}
+
+/* Sets stop to current reader position. */
+void NW_HTMLP_Interval_Stop(NW_HTMLP_Interval_t* pI, NW_HTMLP_Lexer_t* pL)
+{
+  pI->stop = pL->readPosition;
+  pI->charStop = pL->charPosition;
+}
+
+/* Returns an estimate of the current line and column position in the text.
+It is an estimate because it has to guess at what the intended line ending
+sequence is using a count of CR and LF characters.  Line and Column indices
+are 1-based not 0-based. */
+void
+NW_HTMLP_Lexer_GetLineColumn(NW_HTMLP_Lexer_t* pT, NW_Uint32* pLine,
+                             NW_Uint32* pColumn)
+{
+    NW_Uint32 crCount, lfCount, charsSinceCR, charsSinceLF;
+    crCount = pT->lineColumn.crCount;
+    lfCount = pT->lineColumn.lfCount;
+    charsSinceCR = pT->lineColumn.charsSinceLastCR;
+    charsSinceLF = pT->lineColumn.charsSinceLastLF;
+    if (crCount == lfCount) {
+        /* assume CR, LF, DOS style */
+        /* use a bias in favor of CR followed by LF
+        which will give the correct column for DOS */
+        *pLine = lfCount + 1;
+        *pColumn = charsSinceLF + 1;
+    } else if (lfCount == 0) {
+        /* assume CR only, Unix style */
+        *pLine = crCount + 1;
+        *pColumn = charsSinceCR + 1;
+    } else if (crCount == 0) {
+        /* assume LF only, Mac style */
+        *pLine = lfCount + 1;
+        *pColumn = charsSinceLF + 1;
+    } else {
+        /* an unclear situation so use
+        thresholds on the ratio to guess */
+        NW_Uint32 ratio;
+        ratio = ((crCount * 100) / lfCount);
+        if (ratio > 300) {/* more than 3 to 1 crCount to lfCount */
+            /* assume CR only, Unix style */
+            *pLine = crCount + 1;
+            *pColumn = charsSinceCR + 1;
+        } else if (ratio < 33) {/* less than 1 to 3 crCount to lfCount */
+            /* assume LF only, Mac style */
+            *pLine = lfCount + 1;
+            *pColumn = charsSinceLF + 1;
+        } else {
+            /* assume CR, LF, DOS style */
+            /* use a bias in favor of CR, LF sequence (DOS style)
+            which will give the correct column */
+            *pLine = lfCount + 1;
+            *pColumn = charsSinceLF + 1;
+        }
+    }
+}
+#else
+
+void FeaRmeNoHTMLParser_htmlp_lexer(){
+   int i = 0;
+   i+=1;
+}
+#endif /* FEA_RME_NOHTMLPARSER */