--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/cpix/tsrc/cpixunittest/src/analysiswhitebox.cpp Mon Apr 19 14:40:16 2010 +0300
@@ -0,0 +1,346 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+
+#include <wchar.h>
+#include <stddef.h>
+
+#include <iostream>
+#include <memory> // std::auto_ptr
+
+#include "cpixidxdb.h"
+
+#include "itk.h"
+
+#include "config.h"
+#include "testutils.h"
+
+// For testing custom analyzer
+#include "CLucene.h"
+#include "CLucene\analysis\AnalysisHeader.h"
+#include "CLucene\util\stringreader.h"
+#include "analyzer.h"
+#include "analyzerexp.h"
+
+using namespace Cpt::Lex;
+using namespace Cpt::Parser;
+using namespace Cpix::AnalyzerExp;
+
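+// Prints a single lexer token as "<type>('<text>')" to stdout, so the
+// tokenization tests below can dump the token stream they produce.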
+void PrintToken(Cpt::Lex::Token token) {
+ switch (token.type()) {
+ case TOKEN_WS: printf("space"); break;
+ case TOKEN_ID: printf("id"); break;
+ case TOKEN_LIT: printf("lit"); break;
+ case TOKEN_STRLIT: printf("str-lit"); break;
+ case TOKEN_REALLIT: printf("real-lit"); break;
+ case TOKEN_INTLIT: printf("int-lit"); break;
+ case TOKEN_LEFT_BRACKET: printf("lbr"); break;
+ case TOKEN_RIGHT_BRACKET: printf("rbr"); break;
+ case TOKEN_COMMA: printf("comma"); break;
+ case TOKEN_PIPE: printf("pipe"); break;
+ case TOKEN_SWITCH: printf("sw"); break;
+ case TOKEN_CASE: printf("case"); break;
+ case TOKEN_DEFAULT: printf("default"); break;
+ case TOKEN_LEFT_BRACE: printf("lbc"); break;
+ case TOKEN_RIGHT_BRACE: printf("rbc"); break;
+ case TOKEN_COLON: printf("cl"); break;
+ case TOKEN_TERMINATOR: printf("tr"); break;
+
+ default: printf("unknown"); break;
+ }
+ printf("('%S')", (token.text()).c_str());
+}
+
+
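+// Feeds a complete switch-style analyzer definition to the tokenizer,
+// drops the whitespace tokens and prints everything that remains. Only
+// tokenization is exercised here; no parsing takes place.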
+void TestTokenization6(Itk::TestMgr * )
+{
+ Cpix::AnalyzerExp::Tokenizer tokenizer;
+ Tokens source(tokenizer,
+ L"switch { "
+ L"case '_docuid', '_mimetype': keywords;"
+ L"case '_baseappclass': whitespace>lowercase;"
+ L"default: natural(en); "
+ L"}");
+ WhiteSpaceFilter tokens(source);
+
+ while (tokens) PrintToken(tokens++);
+}
+
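+// Parses three analyzer expressions - a plain invokation, a filter piping
+// and an invokation with numeric literal arguments - and prints the
+// identifiers and argument/filter counts the parser recovered.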
+void TestParsing(Itk::TestMgr* )
+{
+ Cpix::AnalyzerExp::Tokenizer tokenizer;
+
+ Tokens source(tokenizer, L"foobar(zap, foo, 'bar', 'a', raboof)");
+ WhiteSpaceFilter tokens(source);
+ Lexer lexer(tokens);
+
+ Tokens source2(tokenizer, L" stdtokens >lowercase>stopwords(fin)>stopwords('a', 'an','the')>stem(fin) ");
+ WhiteSpaceFilter tokens2(source2);
+ Lexer lexer2(tokens2);
+
+ Tokens source3(tokenizer, L"foobar(zap, 0, 0.0045, 4, 'a', 9223.031)");
+ WhiteSpaceFilter tokens3(source3);
+ Lexer lexer3(tokens3);
+
+ try {
+ std::auto_ptr<Invokation> invoke = ParseInvokation(lexer);
+ lexer.eatEof();
+ printf("Invoke identifier: %S\n", (invoke->id()).c_str());
+ printf("%d parameters\n", (int)invoke->params().size());
+ std::auto_ptr<Piping> piping = ParsePiping(lexer2);
+ lexer2.eatEof();
+ printf("piping done.\n");
+ if (const Invokation* inv = dynamic_cast<const Invokation*>(&piping->tokenizer())) {
+ printf("Tokenizer: %S\n", inv->id().c_str());
+ }
+ printf("%d filters\n", (int)piping->filters().size());
+ invoke = ParseInvokation(lexer3);
+ lexer3.eatEof();
+ printf("Invoke identifier: %S\n", (invoke->id()).c_str());
+ printf("%d parameters\n", (int)invoke->params().size());
+ } catch (ParseException& e) {
+ printf("ParseException: %S\n", e.wWhat());
+ } catch (LexException& e) {
+ printf("LexException: %S\n", e.wWhat());
+ }
+}
+
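+// Parses a full switch definition mapping fields to analyzer pipings and
+// prints the field lists of the parsed cases plus the default branch.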
+void TestSwitch(Itk::TestMgr* )
+{
+ Cpix::AnalyzerExp::Tokenizer tokenizer;
+
+ const wchar_t* text;
+ Tokens source(tokenizer, text =
+ L"switch { "
+ L"case '_docuid', '_mimetype': keywords;"
+ L"case '_baseappclass': whitespace>lowercase;"
+ L"default: natural(en); "
+ L"}");
+ WhiteSpaceFilter tokens(source);
+ Lexer lexer(tokens);
+
+ try {
+ std::auto_ptr<Piping> sw = ParsePiping(lexer);
+ lexer.eatEof();
+ if (const Switch* s = dynamic_cast<const Switch*>(&sw->tokenizer())) {
+ for (size_t i = 0; i < s->cases().size(); i++) {
+ const Case* c = s->cases()[i];
+ printf("case ");
+ for (size_t j = 0; j < c->fields().size(); j++) {
+ printf("%S", (c->fields()[j]).c_str());
+ }
+ printf(": ...\n");
+// wcout<<L":"<<s->def().tokenizer().id();
+ }
+ printf("default: ...\n");//<<s->def().tokenizer().id()<<"...;";
+ }
+ } catch (ParseException& e) {
+ // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl;
+ e.setContext(text);
+ printf("ParseException: %S\n", e.wWhat());
+ } catch (LexException& e) {
+ // OBS wcout<<L"LexException: "<<e.describe(text)<<endl;
+ e.setContext(text);
+ printf("LexException: %S\n", e.wWhat());
+ }
+}
+
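+// Exercises the parser error paths with four malformed definitions:
+// truncated input, an unterminated string literal, an illegal character
+// and a missing comma. Each one should be reported as a LexException or
+// ParseException instead of crashing or being silently accepted.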
+void TestParsingErrors(Itk::TestMgr* )
+{
+ Cpix::AnalyzerExp::Tokenizer tokenizer;
+ // Unexpected end of input
+ const wchar_t* text;
+ StdLexer eof(tokenizer, text = L"foobar(zap, foo, 'bar', 'raf', do, ");
+ try {
+ ParsePiping(eof);
+ eof.eatEof();
+ } catch (ParseException& e) {
+ // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl;
+ e.setContext(text);
+ printf("ParseException: %S\n", e.wWhat());
+ }
+
+
+ // Unfinished literal
+ StdLexer lit(tokenizer, text = L"foobar(zap, foo, 'bar', 'a, raboof)");
+ try {
+ ParsePiping(lit);
+ lit.eatEof();
+ } catch (LexException& e) { // lexical error
+ // OBS wcout<<L"LexException: "<<e.describe(text)<<endl;
+ e.setContext(text);
+ printf("LexException: %S\n", e.wWhat());
+ } catch (ParseException& e) { // syntax error
+ // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl;
+ e.setContext(text);
+ printf("ParseException: %S\n", e.wWhat());
+ }
+
+ // Unknown token
+ StdLexer unknown(tokenizer, text = L"foobar(!zap, foo, 'bar', 'a', raboof)");
+ try {
+ ParsePiping(unknown);
+ unknown.eatEof();
+ } catch (LexException& e) { // lexical error
+ // OBS wcout<<L"LexException: "<<e.describe(text)<<endl;
+ e.setContext(text);
+ printf("LexException: %S\n", e.wWhat());
+ }
+
+ // Missing comma
+ StdLexer comma(tokenizer, text = L"foobar(zap, foo, 'bar', 'a' raboof)");
+ try {
+ ParsePiping(comma);
+ comma.eatEof();
+ } catch (ParseException& e) {
+ // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl;
+ e.setContext(text);
+ printf("ParseException: %S\n", e.wWhat());
+ }
+
+}
+
+
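+// NULL-terminated list of corpus files (three English, two Finnish) used
+// to feed text through the analyzers under test.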
+const char * CustomAnalyzerTestDocs[] = {
+ FILE_TEST_CORPUS_PATH "\\en\\1.txt",
+ FILE_TEST_CORPUS_PATH "\\en\\2.txt",
+ FILE_TEST_CORPUS_PATH "\\en\\3.txt",
+ FILE_TEST_CORPUS_PATH "\\fi\\1.txt",
+ FILE_TEST_CORPUS_PATH "\\fi\\2.txt",
+ NULL
+};
+
+const char DEFAULT_ENCODING[] = "UTF-8";
+
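+// Dumps a CLucene token stream to stdout on a single line. A token with a
+// position increment of zero (stacked on top of the previous token) is
+// prefixed with '|'; otherwise one space is printed per position
+// increment.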
+void PrintTokenStream(lucene::analysis::TokenStream* stream)
+{
+ using namespace lucene::analysis;
+ lucene::analysis::Token token;
+ while (stream->next(&token)) {
+ int pos = token.getPositionIncrement();
+ if (pos == 0) {
+ printf("|");
+ } else {
+ for (int i = 0; i < pos; i++) printf(" ");
+ }
+ printf("'%S'", token.termText());
+ }
+ printf("\n");
+}
+
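+// Constructs a Cpix::CustomAnalyzer from the given definition string and
+// prints the token stream it yields for each corpus file.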
+void TestCustomAnalyzer(Itk::TestMgr * , const wchar_t* definition)
+{
+ using namespace lucene::analysis;
+ using namespace lucene::util;
+ using namespace Cpix;
+ using namespace std;
+ CustomAnalyzer analyzer(definition);
+
+ printf("Analyzer \"%S\":\n", definition);
+ for (int i = 0; CustomAnalyzerTestDocs[i]; i++)
+ {
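+ // Print the path with its first character replaced by '!' - presumably
+ // to mask the drive letter so the output does not depend on the drive
+ // the tests run from.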
+ printf("File !%s tokenized:\n", (CustomAnalyzerTestDocs[i]+1));
+ FileReader file( CustomAnalyzerTestDocs[i], DEFAULT_ENCODING );
+
+ TokenStream* stream = analyzer.tokenStream( L"field", &file );
+ PrintTokenStream( stream );
+ stream->close();
+ _CLDELETE( stream );
+ }
+}
+
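+// Dumps analyzer output for a range of definitions covering the supported
+// tokenizers (stdtokens, whitespace, letter, keyword) and filters
+// (lowercase, accent, stem, stop, length).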
+void TestCustomAnalyzers(Itk::TestMgr * testMgr)
+{
+ TestCustomAnalyzer(testMgr, L"stdtokens");
+ TestCustomAnalyzer(testMgr, L"whitespace");
+ TestCustomAnalyzer(testMgr, L"whitespace>lowercase");
+ TestCustomAnalyzer(testMgr, L"whitespace>accent");
+ TestCustomAnalyzer(testMgr, L"letter");
+ TestCustomAnalyzer(testMgr, L"letter>lowercase");
+ TestCustomAnalyzer(testMgr, L"keyword");
+ TestCustomAnalyzer(testMgr, L"keyword>lowercase");
+ TestCustomAnalyzer(testMgr, L"stdtokens>lowercase>accent>stem(en)");
+ TestCustomAnalyzer(testMgr, L"letter>lowercase>accent>stop(en)");
+ TestCustomAnalyzer(testMgr, L"letter>lowercase>stop('i', 'oh', 'nyt', 'näin')");
+ TestCustomAnalyzer(testMgr, L"letter>length(2, 4)");
+}
+
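+// Like TestCustomAnalyzer, but tokenizes only the first corpus file and
+// lets the caller pick the document field, so that field-sensitive
+// (switch) analyzer definitions can be exercised.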
+void TestAnalyzerWithField(Itk::TestMgr * , const wchar_t* definition, const wchar_t* field)
+{
+ using namespace lucene::analysis;
+ using namespace lucene::util;
+ using namespace Cpix;
+ using namespace std;
+ CustomAnalyzer analyzer(definition);
+
+ printf("File !%s tokenized for field %S:\n", (CustomAnalyzerTestDocs[0]+1), field);
+ FileReader file( CustomAnalyzerTestDocs[0], DEFAULT_ENCODING );
+
+ TokenStream* stream = analyzer.tokenStream( field, &file );
+ PrintTokenStream( stream );
+ stream->close();
+ _CLDELETE( stream );
+}
+
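+// Exercises a field-dependent switch analyzer: each field named in a case
+// should be routed to that case's piping, while any other field (here
+// 'field') falls through to the default branch.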
+void TestSwitchAnalyzers(Itk::TestMgr * testMgr)
+{
+ const wchar_t* sw = L"\n"
+ L"switch {\n"
+ L" case '_docuid': keyword;\n"
+ L" case '_appclass': whitespace>lowercase;\n"
+ L" case 'title', 'message': stdtokens>accent>lowercase>stem(en)>stop(en);\n"
+ L" default: letter>lowercase>stop('i');\n"
+ L"}";
+ TestAnalyzerWithField(testMgr, sw, L"_docuid");
+ TestAnalyzerWithField(testMgr, sw, L"_appclass");
+ TestAnalyzerWithField(testMgr, sw, L"Title");
+ TestAnalyzerWithField(testMgr, sw, L"message");
+ TestAnalyzerWithField(testMgr, sw, L"field");
+}
+
+
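+// Assembles the white box analysis test suite registered as "whitebox".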
+Itk::TesterBase * CreateAnalysisWhiteBoxTests()
+{
+ using namespace Itk;
+
+ SuiteTester
+ * analysisTests = new SuiteTester("whitebox");
+
+ analysisTests->add("analyzer",
+ &TestCustomAnalyzers,
+ "analyzer");
+ analysisTests->add("switchAnalyzer",
+ &TestSwitchAnalyzers,
+ "switchAnalyzer");
+ analysisTests->add("tokenization",
+ TestTokenization6,
+ "tokenization");
+ analysisTests->add("parsing",
+ TestParsing,
+ "parsing");
+ analysisTests->add("parsing2",
+ TestSwitch,
+ "parsing2");
+ analysisTests->add("parsingerrors",
+ TestParsingErrors,
+ "parsingerrors");
+
+ return analysisTests;
+}
+
+
+