util/tools/assistant/lib/qhelpsearchindexreader_clucene.cpp
changeset 7 f7bc934e204c
equal deleted inserted replaced
3:41300fa6a67c 7:f7bc934e204c
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the Qt Assistant of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "qhelpenginecore.h"
       
    43 #include "fulltextsearch/qsearchable_p.h"
       
    44 #include "fulltextsearch/qqueryparser_p.h"
       
    45 #include "fulltextsearch/qindexreader_p.h"
       
    46 #include "qhelpsearchindexreader_clucene_p.h"
       
    47 
       
    48 #include <QtCore/QDir>
       
    49 #include <QtCore/QSet>
       
    50 #include <QtCore/QString>
       
    51 #include <QtCore/QFileInfo>
       
    52 #include <QtCore/QStringList>
       
    53 #include <QtCore/QTextStream>
       
    54 #include <QtCore/QMutexLocker>
       
    55 
       
    56 QT_BEGIN_NAMESPACE
       
    57 
       
    58 namespace fulltextsearch {
       
    59 namespace clucene {
       
    60 
       
    61 QHelpSearchIndexReaderClucene::QHelpSearchIndexReaderClucene()
       
    62     : QHelpSearchIndexReader()
       
    63 {
       
    64     // nothing todo
       
    65 }
       
    66 
       
    67 QHelpSearchIndexReaderClucene::~QHelpSearchIndexReaderClucene()
       
    68 {
       
    69 }
       
    70 
       
    71 
       
    72 void QHelpSearchIndexReaderClucene::run()
       
    73 {
       
    74     mutex.lock();
       
    75 
       
    76     if (m_cancel) {
       
    77         mutex.unlock();
       
    78         return;
       
    79     }
       
    80 
       
    81     const QString collectionFile(this->m_collectionFile);
       
    82     const QList<QHelpSearchQuery> &queryList = this->m_query;
       
    83     const QString indexPath(m_indexFilesFolder);
       
    84 
       
    85     mutex.unlock();
       
    86 
       
    87     QHelpEngineCore engine(collectionFile, 0);
       
    88     if (!engine.setupData())
       
    89         return;
       
    90 
       
    91     QFileInfo fInfo(indexPath);
       
    92     if (fInfo.exists() && !fInfo.isWritable()) {
       
    93         qWarning("Full Text Search, could not read index (missing permissions).");
       
    94         return;
       
    95     }
       
    96 
       
    97     if(QCLuceneIndexReader::indexExists(indexPath)) {
       
    98         mutex.lock();
       
    99         if (m_cancel) {
       
   100             mutex.unlock();
       
   101             return;
       
   102         }
       
   103         mutex.unlock();
       
   104 
       
   105         emit searchingStarted();
       
   106 
       
   107 #if !defined(QT_NO_EXCEPTIONS)
       
   108         try {
       
   109 #endif
       
   110             QCLuceneBooleanQuery booleanQuery;
       
   111             QCLuceneStandardAnalyzer analyzer;
       
   112             if (!buildQuery(booleanQuery, queryList, analyzer)) {
       
   113                 emit searchingFinished(0);
       
   114                 return;
       
   115             }
       
   116 
       
   117             const QStringList attribList = engine.filterAttributes(engine.currentFilter());
       
   118             if (!attribList.isEmpty()) {
       
   119                 QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+")
       
   120                     + attribList.join(QLatin1String(" +")), QLatin1String("attribute"), analyzer);
       
   121 
       
   122                 if (!query) {
       
   123                     emit searchingFinished(0);
       
   124                     return;
       
   125                 }
       
   126                 booleanQuery.add(query, true, true, false);
       
   127             }
       
   128 
       
   129             QCLuceneIndexSearcher indexSearcher(indexPath);
       
   130             QCLuceneHits hits = indexSearcher.search(booleanQuery);
       
   131 
       
   132             bool boost = true;
       
   133             QCLuceneBooleanQuery tryHarderQuery;
       
   134             if (hits.length() == 0) {
       
   135                 if (buildTryHarderQuery(tryHarderQuery, queryList, analyzer)) {
       
   136                     if (!attribList.isEmpty()) {
       
   137                         QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+")
       
   138                             + attribList.join(QLatin1String(" +")), QLatin1String("attribute"),
       
   139                             analyzer);
       
   140                         tryHarderQuery.add(query, true, true, false);
       
   141                     }
       
   142                     hits = indexSearcher.search(tryHarderQuery);
       
   143                     boost = (hits.length() == 0);
       
   144                 }
       
   145             }
       
   146 
       
   147             QSet<QString> pathSet;
       
   148             QCLuceneDocument document;
       
   149             const QStringList namespaceList = engine.registeredDocumentations();
       
   150 
       
   151             for (qint32 i = 0; i < hits.length(); i++) {
       
   152                 document = hits.document(i);
       
   153                 const QString path = document.get(QLatin1String("path"));
       
   154                 if (!pathSet.contains(path) && namespaceList.contains(
       
   155                     document.get(QLatin1String("namespace")), Qt::CaseInsensitive)) {
       
   156                     pathSet.insert(path);
       
   157                     hitList.append(qMakePair(path, document.get(QLatin1String("title"))));
       
   158                 }
       
   159                 document.clear();
       
   160 
       
   161                 mutex.lock();
       
   162                 if (m_cancel) {
       
   163                     mutex.unlock();
       
   164                     emit searchingFinished(0);
       
   165                     return;
       
   166                 }
       
   167                 mutex.unlock();
       
   168             }
       
   169 
       
   170             indexSearcher.close();
       
   171             const int count = hitList.count();
       
   172             if ((count > 0) && boost)
       
   173                 boostSearchHits(engine, hitList, queryList);
       
   174             emit searchingFinished(hitList.count());
       
   175 
       
   176 #if !defined(QT_NO_EXCEPTIONS)
       
   177         } catch(...) {
       
   178             mutex.lock();
       
   179             hitList.clear();
       
   180             mutex.unlock();
       
   181             emit searchingFinished(0);
       
   182         }
       
   183 #endif
       
   184     }
       
   185 }
       
   186 
       
   187 bool QHelpSearchIndexReaderClucene::defaultQuery(const QString &term, QCLuceneBooleanQuery &booleanQuery,
       
   188     QCLuceneStandardAnalyzer &analyzer)
       
   189 {
       
   190     const QLatin1String c("content");
       
   191     const QLatin1String t("titleTokenized");
       
   192 
       
   193     QCLuceneQuery *query = QCLuceneQueryParser::parse(term, c, analyzer);
       
   194     QCLuceneQuery *query2 = QCLuceneQueryParser::parse(term, t, analyzer);
       
   195     if (query && query2) {
       
   196         booleanQuery.add(query, true, false, false);
       
   197         booleanQuery.add(query2, true, false, false);
       
   198         return true;
       
   199     }
       
   200 
       
   201     return false;
       
   202 }
       
   203 
       
   204 bool QHelpSearchIndexReaderClucene::buildQuery(QCLuceneBooleanQuery &booleanQuery,
       
   205     const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer)
       
   206 {
       
   207     foreach (const QHelpSearchQuery query, queryList) {
       
   208         switch (query.fieldName) {
       
   209             case QHelpSearchQuery::FUZZY: {
       
   210                 const QLatin1String fuzzy("~");
       
   211                 foreach (const QString &term, query.wordList) {
       
   212                     if (term.isEmpty()
       
   213                         || !defaultQuery(term.toLower() + fuzzy, booleanQuery, analyzer)) {
       
   214                         return false;
       
   215                     }
       
   216                 }
       
   217             }   break;
       
   218 
       
   219             case QHelpSearchQuery::WITHOUT: {
       
   220                 QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
       
   221                 foreach (const QString &term, query.wordList) {
       
   222                     if (stopWords.contains(term, Qt::CaseInsensitive))
       
   223                         continue;
       
   224 
       
   225                     QCLuceneQuery *query = new QCLuceneTermQuery(QCLuceneTerm(
       
   226                         QLatin1String("content"), term.toLower()));
       
   227                     QCLuceneQuery *query2 = new QCLuceneTermQuery(QCLuceneTerm(
       
   228                         QLatin1String("titleTokenized"), term.toLower()));
       
   229 
       
   230                     if (query && query2) {
       
   231                         booleanQuery.add(query, true, false, true);
       
   232                         booleanQuery.add(query2, true, false, true);
       
   233                     } else {
       
   234                         return false;
       
   235                     }
       
   236                 }
       
   237             }   break;
       
   238 
       
   239             case QHelpSearchQuery::PHRASE: {
       
   240                 const QString &term = query.wordList.at(0).toLower();
       
   241                 if (term.contains(QLatin1Char(' '))) {
       
   242                     QStringList termList = term.split(QLatin1String(" "));
       
   243                     QCLucenePhraseQuery *q = new QCLucenePhraseQuery();
       
   244                     QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
       
   245                     foreach (const QString &term, termList) {
       
   246                         if (!stopWords.contains(term, Qt::CaseInsensitive))
       
   247                             q->addTerm(QCLuceneTerm(QLatin1String("content"), term.toLower()));
       
   248                     }
       
   249                     booleanQuery.add(q, true, true, false);
       
   250                 } else {
       
   251                     QCLuceneQuery *query = new QCLuceneTermQuery(QCLuceneTerm(
       
   252                         QLatin1String("content"), term.toLower()));
       
   253                     QCLuceneQuery *query2 = new QCLuceneTermQuery(QCLuceneTerm(
       
   254                         QLatin1String("titleTokenized"), term.toLower()));
       
   255 
       
   256                     if (query && query2) {
       
   257                         booleanQuery.add(query, true, true, false);
       
   258                         booleanQuery.add(query2, true, false, false);
       
   259                     } else {
       
   260                         return false;
       
   261                     }
       
   262                 }
       
   263             }   break;
       
   264 
       
   265             case QHelpSearchQuery::ALL: {
       
   266                 QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
       
   267                 foreach (const QString &term, query.wordList) {
       
   268                     if (stopWords.contains(term, Qt::CaseInsensitive))
       
   269                         continue;
       
   270 
       
   271                     QCLuceneQuery *query = new QCLuceneTermQuery(QCLuceneTerm(
       
   272                         QLatin1String("content"), term.toLower()));
       
   273 
       
   274                     if (query) {
       
   275                         booleanQuery.add(query, true, true, false);
       
   276                     } else {
       
   277                         return false;
       
   278                     }
       
   279                 }
       
   280             }   break;
       
   281 
       
   282             case QHelpSearchQuery::DEFAULT: {
       
   283                 foreach (const QString &term, query.wordList) {
       
   284                     QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(),
       
   285                         QLatin1String("content"), analyzer);
       
   286 
       
   287                     if (query)
       
   288                         booleanQuery.add(query, true, true, false);
       
   289                 }
       
   290             }   break;
       
   291 
       
   292             case QHelpSearchQuery::ATLEAST: {
       
   293                 foreach (const QString &term, query.wordList) {
       
   294                     if (term.isEmpty() || !defaultQuery(term.toLower(), booleanQuery, analyzer))
       
   295                         return false;
       
   296                 }
       
   297             }
       
   298         }
       
   299     }
       
   300 
       
   301     return true;
       
   302 }
       
   303 
       
   304 bool QHelpSearchIndexReaderClucene::buildTryHarderQuery(QCLuceneBooleanQuery &booleanQuery,
       
   305     const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer)
       
   306 {
       
   307     bool retVal = false;
       
   308     foreach (const QHelpSearchQuery query, queryList) {
       
   309         switch (query.fieldName) {
       
   310             default:    break;
       
   311             case QHelpSearchQuery::DEFAULT: {
       
   312                 foreach (const QString &term, query.wordList) {
       
   313                     QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(),
       
   314                         QLatin1String("content"), analyzer);
       
   315 
       
   316                     if (query) {
       
   317                         retVal = true;
       
   318                         booleanQuery.add(query, true, false, false);
       
   319                     }
       
   320                 }
       
   321             }   break;
       
   322         }
       
   323     }
       
   324     return retVal;
       
   325 }
       
   326 
       
   327 void QHelpSearchIndexReaderClucene::boostSearchHits(const QHelpEngineCore &engine,
       
   328     QList<QHelpSearchEngine::SearchHit> &hitList, const QList<QHelpSearchQuery> &queryList)
       
   329 {
       
   330     foreach (const QHelpSearchQuery query, queryList) {
       
   331         if (query.fieldName != QHelpSearchQuery::DEFAULT)
       
   332             continue;
       
   333 
       
   334         QString joinedQuery = query.wordList.join(QLatin1String(" "));
       
   335 
       
   336         QCLuceneStandardAnalyzer analyzer;
       
   337         QCLuceneQuery *parsedQuery = QCLuceneQueryParser::parse(
       
   338             joinedQuery, QLatin1String("content"), analyzer);
       
   339 
       
   340         if (parsedQuery) {
       
   341             joinedQuery = parsedQuery->toString();
       
   342             delete parsedQuery;
       
   343         }
       
   344 
       
   345         int length = QString(QLatin1String("content:")).length();
       
   346         int index = joinedQuery.indexOf(QLatin1String("content:"));
       
   347 
       
   348         QString term;
       
   349         int nextIndex = 0;
       
   350         QStringList searchTerms;
       
   351         while (index != -1) {
       
   352             nextIndex = joinedQuery.indexOf(QLatin1String("content:"), index + 1);
       
   353             term = joinedQuery.mid(index + length, nextIndex - (length + index)).simplified();
       
   354             if (term.startsWith(QLatin1String("\""))
       
   355                 && term.endsWith(QLatin1String("\""))) {
       
   356                 searchTerms.append(term.remove(QLatin1String("\"")));
       
   357             } else {
       
   358                 searchTerms += term.split(QLatin1Char(' '));
       
   359             }
       
   360             index = nextIndex;
       
   361         }
       
   362         searchTerms.removeDuplicates();
       
   363 
       
   364         int count = qMin(75, hitList.count());
       
   365         QMap<int, QHelpSearchEngine::SearchHit> hitMap;
       
   366         for (int i = 0; i < count; ++i) {
       
   367             const QHelpSearchEngine::SearchHit &hit = hitList.at(i);
       
   368             QString data = QString::fromUtf8(engine.fileData(hit.first));
       
   369 
       
   370             int counter = 0;
       
   371             foreach (const QString &term, searchTerms)
       
   372                 counter += data.count(term, Qt::CaseInsensitive);
       
   373             hitMap.insertMulti(counter, hit);
       
   374         }
       
   375 
       
   376         QList<QHelpSearchEngine::SearchHit> boostedList;
       
   377         QMap<int, QHelpSearchEngine::SearchHit>::const_iterator it = hitMap.constEnd();
       
   378         do {
       
   379             --it;
       
   380             boostedList.append(it.value());
       
   381         } while (it != hitMap.constBegin());
       
   382         boostedList += hitList.mid(count, hitList.count());
       
   383         mutex.lock();
       
   384         hitList = boostedList;
       
   385         mutex.unlock();
       
   386     }
       
   387 }
       
   388 
       
   389 }   // namespace clucene
       
   390 }   // namespace fulltextsearch
       
   391 
       
   392 QT_END_NAMESPACE