searchengine/cpix/tsrc/cpixunittest/src/analysiswhitebox.cpp
changeset 0 671dee74050a
child 3 ae3f1779f6da
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/cpix/tsrc/cpixunittest/src/analysiswhitebox.cpp	Mon Apr 19 14:40:16 2010 +0300
@@ -0,0 +1,346 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: Whitebox tests for CPix analyzer definitions and custom analyzers.
+*
+*/
+
+#include <wchar.h>
+#include <stddef.h>
+
+#include <iostream>
+
+#include "cpixidxdb.h"
+
+#include "itk.h"
+
+#include "config.h"
+#include "testutils.h"
+
+// For testing custom analyzer
+#include "CLucene.h"
+#include "CLucene\analysis\AnalysisHeader.h"
+#include "CLucene\util\stringreader.h"
+#include "analyzer.h"
+#include "analyzerexp.h"
+
+using namespace Cpt::Lex; 
+using namespace Cpt::Parser; 
+using namespace Cpix::AnalyzerExp; 
+
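+// Whitebox tests for the CPix analyzer-definition language: tokenizing the
+// definition syntax, parsing it into Invokation/Piping/Switch structures,
+// and running CustomAnalyzer instances built from such definitions against
+// sample corpus files.
+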
+void PrintToken(Cpt::Lex::Token token) {
+	switch (token.type()) {
+		case TOKEN_WS: printf("space"); break; 
+		case TOKEN_ID: printf("id"); break;
+		case TOKEN_LIT: printf("lit"); break;
+		case TOKEN_STRLIT: printf("str-lit"); break;
+		case TOKEN_REALLIT: printf("real-lit"); break;
+		case TOKEN_INTLIT: printf("int-lit"); break;
+		case TOKEN_LEFT_BRACKET: printf("lbr"); break;
+		case TOKEN_RIGHT_BRACKET: printf("rbr"); break;
+		case TOKEN_COMMA: printf("comma"); break;
+		case TOKEN_PIPE: printf("pipe"); break;
+		case TOKEN_SWITCH: printf("sw"); break;
+		case TOKEN_CASE: printf("case"); break;
+		case TOKEN_DEFAULT: printf("default"); break;
+		case TOKEN_LEFT_BRACE: printf("lbc"); break;
+		case TOKEN_RIGHT_BRACE: printf("rbc"); break;
+		case TOKEN_COLON: printf("cl"); break;
+		case TOKEN_TERMINATOR: printf("tr"); break;
+
+		default: printf("unknown"); break;
+	}
+	printf("('%S')", (token.text()).c_str());  
+}
+
+
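+// Prints the token stream of a sample switch-definition. Illustrative
+// output: sw('switch')lbc('{')case('case')str-lit('_docuid')comma(',')...
+// (assumed; the exact token text depends on the Tokenizer implementation).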
+void TestTokenization6(Itk::TestMgr * )
+{
+	Cpix::AnalyzerExp::Tokenizer tokenizer; 
+	Tokens source(tokenizer, 
+		L"switch { "
+		  L"case '_docuid', '_mimetype': keywords;"
+		  L"case '_baseappclass':        whitespace>lowercase;"
+		  L"default:                     natural(en); "
+		L"}");
+	WhiteSpaceFilter tokens(source); 
+
+	while (tokens) PrintToken(tokens++); 
+}
+
+void TestParsing(Itk::TestMgr* )
+{ 
+	Cpix::AnalyzerExp::Tokenizer tokenizer; 
+	
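+	// Three inputs: a plain invocation, a filter pipeline ("piping"), and
+	// an invocation taking integer and real literal arguments.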
+	Tokens source(tokenizer, L"foobar(zap, foo, 'bar', 'a', raboof)");
+	WhiteSpaceFilter tokens(source);
+	Lexer lexer(tokens);
+
+	Tokens source2(tokenizer, L" stdtokens >lowercase>stopwords(fin)>stopwords('a', 'an','the')>stem(fin)  ");
+	WhiteSpaceFilter tokens2(source2);
+	Lexer lexer2(tokens2);
+	
+	Tokens source3(tokenizer, L"foobar(zap, 0, 0.0045, 4, 'a', 9223.031)");
+	WhiteSpaceFilter tokens3(source3);
+	Lexer lexer3(tokens3);
+
+	try {
+		std::auto_ptr<Invokation> invoke = ParseInvokation(lexer); 
+		lexer.eatEof(); 
+		printf("Invoke identifier: %S\n", invoke->id().c_str()); 
+		printf("%d parameters\n", (int)invoke->params().size()); 
+		std::auto_ptr<Piping> piping = ParsePiping(lexer2); 
+		lexer2.eatEof(); 
+		printf("piping done.\n"); 
+		if (const Invokation* inv = dynamic_cast<const Invokation*>(&piping->tokenizer())) {
+			printf("Tokenizer: %S\n", inv->id().c_str()); 
+		}
+		printf("%d filters\n", (int)piping->filters().size()); 
+		invoke = ParseInvokation(lexer3);
+		lexer3.eatEof(); 
+		printf("Invoke identifier: %S\n", invoke->id().c_str()); 
+		printf("%d parameters\n", (int)invoke->params().size()); 
+	} catch (ParseException& e) {
+		printf("ParseException: %S\n", e.wWhat()); 
+	} catch (LexException& e) {
+		printf("LexException: %S\n", e.wWhat()); 
+	}
+}
+
+void TestSwitch(Itk::TestMgr* )
+{ 
+	Cpix::AnalyzerExp::Tokenizer tokenizer; 
+	
+	const wchar_t* text; 
+	Tokens source(tokenizer, text = 
+		L"switch { "
+		  L"case '_docuid', '_mimetype': keywords;"
+		  L"case '_baseappclass':        whitespace>lowercase;"
+		  L"default:                     natural(en); "
+		L"}");
+	WhiteSpaceFilter tokens(source);
+	Lexer lexer(tokens);
+
+	try {
+		std::auto_ptr<Piping> sw = ParsePiping(lexer); 
+		lexer.eatEof(); 
+		if (const Switch* s = dynamic_cast<const Switch*>(&sw->tokenizer())) {
+			for (size_t i = 0; i < s->cases().size(); i++) {
+				const Case* c = s->cases()[i]; 
+				printf("case "); 
+				for (size_t j = 0; j < c->fields().size(); j++) {
+					printf("%S", c->fields()[j].c_str());
+				}
+				printf(": ...\n"); 
+			}
+			printf("default: ...\n");
+		}
+	} catch (ParseException& e) {
+		e.setContext(text);
+		printf("ParseException: %S\n", e.wWhat()); 
+	} catch (LexException& e) {
+		e.setContext(text);
+		printf("LexException: %S\n", e.wWhat()); 
+	}
+}
+
+void TestParsingErrors(Itk::TestMgr* )
+{
+	Cpix::AnalyzerExp::Tokenizer tokenizer; 
+	// Premature end of input
+	const wchar_t* text; 
+	StdLexer eof(tokenizer, text = L"foobar(zap, foo, 'bar', 'raf', do, ");
+	try {
+		ParsePiping(eof); 
+		eof.eatEof(); 
+	} catch (ParseException& e) {
+		e.setContext(text);
+		printf("ParseException: %S\n", e.wWhat()); 
+	}
+
+	// Unfinished literal
+	StdLexer lit(tokenizer, text = L"foobar(zap, foo, 'bar', 'a, raboof)");
+	try {
+		ParsePiping(lit); 
+		lit.eatEof(); 
+	} catch (LexException& e) { // syntax error
+		e.setContext(text);
+		printf("LexException: %S\n", e.wWhat()); 
+	} catch (ParseException& e) { // syntax error
+		e.setContext(text);
+		printf("ParseException: %S\n", e.wWhat()); 
+	} 
+
+	// Unknown token
+	StdLexer unknown(tokenizer, text = L"foobar(!zap, foo, 'bar', 'a', raboof)");
+	try {
+		ParsePiping(unknown); 
+		unknown.eatEof(); 
+	} catch (LexException& e) { // syntax error
+		e.setContext(text);
+		printf("LexException: %S\n", e.wWhat()); 
+	} 
+	
+	// Missing comma
+	StdLexer comma(tokenizer, text = L"foobar(zap, foo, 'bar', 'a' raboof)");
+	try {
+		ParsePiping(comma); 
+		comma.eatEof(); 
+	} catch (ParseException& e) {
+		e.setContext(text);
+		printf("ParseException: %S\n", e.wWhat()); 
+	} 
+
+}
+
+
+const char * CustomAnalyzerTestDocs[] = {
+    FILE_TEST_CORPUS_PATH "\\en\\1.txt",
+    FILE_TEST_CORPUS_PATH "\\en\\2.txt",
+    FILE_TEST_CORPUS_PATH "\\en\\3.txt",
+    FILE_TEST_CORPUS_PATH "\\fi\\1.txt",
+    FILE_TEST_CORPUS_PATH "\\fi\\2.txt",
+    NULL
+};
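+// NULL-terminated list of corpus files fed to each analyzer under test.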
+
+const char DEFAULT_ENCODING[] = "UTF-8";
+
+void PrintTokenStream(lucene::analysis::TokenStream* stream) 
+{
+	using namespace lucene::analysis; 
+	Token token; 
+	while (stream->next(&token)) {
+		int pos = token.getPositionIncrement(); 
+		if (pos == 0) {
+			// Zero position increment: the token occupies the same position
+			// as the previous one (e.g. a stacked synonym); mark it with '|'.
+			printf("|"); 
+		} else {
+			for (int i = 0; i < pos; i++) printf(" "); 
+		}
+		printf("'%S'", token.termText());
+	}
+	printf("\n");
+}
+
+void TestCustomAnalyzer(Itk::TestMgr * , const wchar_t* definition)
+{
+	using namespace lucene::analysis; 
+	using namespace lucene::util; 
+	using namespace Cpix; 
+	using namespace std; 
+	CustomAnalyzer analyzer(definition);
+	
+	printf("Analyzer \"%S\":\n", definition); 
+	for (int i = 0; CustomAnalyzerTestDocs[i]; i++) 
+	{
+		// Skip the first path character and print '!' in its place
+		// (Symbian-style drive placeholder).
+		printf("File !%s tokenized:\n", CustomAnalyzerTestDocs[i] + 1);
+		FileReader file( CustomAnalyzerTestDocs[i], DEFAULT_ENCODING ); 
+		
+		TokenStream* stream = analyzer.tokenStream( L"field", &file ); 
+		PrintTokenStream( stream ); 
+		stream->close(); 
+		_CLDELETE( stream ); 
+	}
+}
+
+void TestCustomAnalyzers(Itk::TestMgr * testMgr)
+{
+	TestCustomAnalyzer(testMgr, L"stdtokens");
+	TestCustomAnalyzer(testMgr, L"whitespace");
+	TestCustomAnalyzer(testMgr, L"whitespace>lowercase");
+	TestCustomAnalyzer(testMgr, L"whitespace>accent");
+	TestCustomAnalyzer(testMgr, L"letter");
+	TestCustomAnalyzer(testMgr, L"letter>lowercase");
+	TestCustomAnalyzer(testMgr, L"keyword");
+	TestCustomAnalyzer(testMgr, L"keyword>lowercase");
+	TestCustomAnalyzer(testMgr, L"stdtokens>lowercase>accent>stem(en)"); 
+	TestCustomAnalyzer(testMgr, L"letter>lowercase>accent>stop(en)"); 
+	TestCustomAnalyzer(testMgr, L"letter>lowercase>stop('i', 'oh', 'nyt', 'näin')"); 
+	TestCustomAnalyzer(testMgr, L"letter>length(2, 4)");
+}
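+
+// The definitions above exercise the grammar: tokenizer ( '>' filter )*,
+// with tokenizers such as stdtokens/whitespace/letter/keyword and filters
+// such as lowercase, accent, stem(lang), stop(...) and length(min, max).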
+
+void TestAnalyzerWithField(Itk::TestMgr * , const wchar_t* definition, const wchar_t* field)
+{
+	using namespace lucene::analysis; 
+	using namespace lucene::util; 
+	using namespace Cpix; 
+	using namespace std; 
+	CustomAnalyzer analyzer(definition);
+	
+	// As above, skip the first path character and print '!' in its place.
+	printf("File !%s tokenized for field %S:\n", CustomAnalyzerTestDocs[0] + 1, field);
+	FileReader file( CustomAnalyzerTestDocs[0], DEFAULT_ENCODING ); 
+	
+	TokenStream* stream = analyzer.tokenStream( field, &file ); 
+	PrintTokenStream( stream ); 
+	stream->close(); 
+	_CLDELETE( stream ); 
+}
+
+void TestSwitchAnalyzers(Itk::TestMgr * testMgr)
+{
+	const wchar_t* sw = L"\n"
+		L"switch {\n"
+		L"    case '_docuid':          keyword;\n"
+		L"    case '_appclass':        whitespace>lowercase;\n"
+		L"    case 'title', 'message': stdtokens>accent>lowercase>stem(en)>stop(en);\n"
+		L"    default:                 letter>lowercase>stop('i');\n"
+		L"}";
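+	// '_docuid' and '_appclass' hit explicit cases; 'Title' (capitalized,
+	// unlike the 'title' case label) and 'field' presumably fall through to
+	// the default branch, depending on how field names are matched.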
+	TestAnalyzerWithField(testMgr, sw, L"_docuid");
+	TestAnalyzerWithField(testMgr, sw, L"_appclass");
+	TestAnalyzerWithField(testMgr, sw, L"Title"); 
+	TestAnalyzerWithField(testMgr, sw, L"message"); 
+	TestAnalyzerWithField(testMgr, sw, L"field"); 
+}
+
+
+Itk::TesterBase * CreateAnalysisWhiteBoxTests()
+{
+    using namespace Itk;
+
+    SuiteTester
+        * analysisTests = new SuiteTester("whitebox");
+    
+    analysisTests->add("analyzer",
+                       &TestCustomAnalyzers,
+                       "analyzer");
+    analysisTests->add("switchAnalyzer",
+                       &TestSwitchAnalyzers,
+                       "switchAnalyzer");
+    analysisTests->add("tokenization",
+                       &TestTokenization6,
+                       "tokenization");
+    analysisTests->add("parsing",
+                       &TestParsing,
+                       "parsing");
+    analysisTests->add("parsing2",
+                       &TestSwitch,
+                       "parsing2");
+    analysisTests->add("parsingerrors",
+                       &TestParsingErrors,
+                       "parsingerrors");
+    
+    return analysisTests;
+}
+
+
+