util/tools/assistant/lib/qhelpsearchindexreader_default.cpp
changeset 7 f7bc934e204c
equal deleted inserted replaced
3:41300fa6a67c 7:f7bc934e204c
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the Qt Assistant of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "qhelpenginecore.h"
       
    43 #include "qhelpsearchindexreader_default_p.h"
       
    44 
       
    45 #include <QtCore/QDir>
       
    46 #include <QtCore/QUrl>
       
    47 #include <QtCore/QFile>
       
    48 #include <QtCore/QVariant>
       
    49 #include <QtCore/QFileInfo>
       
    50 #include <QtCore/QDataStream>
       
    51 #include <QtCore/QTextStream>
       
    52 
       
    53 QT_BEGIN_NAMESPACE
       
    54 
       
    55 namespace fulltextsearch {
       
    56 namespace std {
       
    57 
       
    58 namespace {
       
    59     QStringList split( const QString &str )
       
    60     {
       
    61         QStringList lst;
       
    62         int j = 0;
       
    63         int i = str.indexOf(QLatin1Char('*'), j );
       
    64 
       
    65         if (str.startsWith(QLatin1String("*")))
       
    66             lst << QLatin1String("*");
       
    67 
       
    68         while ( i != -1 ) {
       
    69             if ( i > j && i <= (int)str.length() ) {
       
    70                 lst << str.mid( j, i - j );
       
    71                 lst << QLatin1String("*");
       
    72             }
       
    73             j = i + 1;
       
    74             i = str.indexOf(QLatin1Char('*'), j );
       
    75         }
       
    76 
       
    77         int l = str.length() - 1;
       
    78         if ( str.mid( j, l - j + 1 ).length() > 0 )
       
    79             lst << str.mid( j, l - j + 1 );
       
    80 
       
    81         return lst;
       
    82     }
       
    83 }
       
    84 
       
    85 
       
    86 Reader::Reader()
       
    87     : indexPath(QString())
       
    88     , indexFile(QString())
       
    89     , documentFile(QString())
       
    90 {
       
    91     termList.clear();
       
    92     indexTable.clear();
       
    93     searchIndexTable.clear();
       
    94 }
       
    95 
       
    96 Reader::~Reader()
       
    97 {
       
    98     reset();
       
    99     searchIndexTable.clear();
       
   100 }
       
   101 
       
   102 bool Reader::readIndex()
       
   103 {
       
   104     if (indexTable.contains(indexFile))
       
   105         return true;
       
   106 
       
   107     QFile idxFile(indexFile);
       
   108     if (!idxFile.open(QFile::ReadOnly))
       
   109         return false;
       
   110 
       
   111     QString key;
       
   112     int numOfDocs;
       
   113     EntryTable entryTable;
       
   114     QVector<Document> docs;
       
   115     QDataStream dictStream(&idxFile);
       
   116     while (!dictStream.atEnd()) {
       
   117         dictStream >> key;
       
   118         dictStream >> numOfDocs;
       
   119         docs.resize(numOfDocs);
       
   120         dictStream >> docs;
       
   121         entryTable.insert(key, new Entry(docs));
       
   122     }
       
   123     idxFile.close();
       
   124 
       
   125     if (entryTable.isEmpty())
       
   126         return false;
       
   127 
       
   128     QFile docFile(documentFile);
       
   129     if (!docFile.open(QFile::ReadOnly))
       
   130         return false;
       
   131 
       
   132     QString title, url;
       
   133     DocumentList documentList;
       
   134     QDataStream docStream(&docFile);
       
   135     while (!docStream.atEnd()) {
       
   136         docStream >> title;
       
   137         docStream >> url;
       
   138         documentList.append(QStringList(title) << url);
       
   139     }
       
   140     docFile.close();
       
   141 
       
   142     if (documentList.isEmpty()) {
       
   143         cleanupIndex(entryTable);
       
   144         return false;
       
   145     }
       
   146 
       
   147     indexTable.insert(indexFile, Index(entryTable, documentList));
       
   148     return true;
       
   149 }
       
   150 
       
   151 bool Reader::initCheck() const
       
   152 {
       
   153     return !searchIndexTable.isEmpty();
       
   154 }
       
   155 
       
   156 void Reader::setIndexPath(const QString &path)
       
   157 {
       
   158     indexPath = path;
       
   159 }
       
   160 
       
   161 void Reader::filterFilesForAttributes(const QStringList &attributes)
       
   162 {
       
   163     searchIndexTable.clear();
       
   164     for(IndexTable::ConstIterator it = indexTable.begin(); it != indexTable.end(); ++it) {
       
   165         const QString fileName = it.key();
       
   166         bool containsAll = true;
       
   167         QStringList split = fileName.split(QLatin1String("@"));
       
   168         foreach (const QString &attribute, attributes) {
       
   169             if (!split.contains(attribute, Qt::CaseInsensitive)) {
       
   170                 containsAll = false;
       
   171                 break;
       
   172             }
       
   173         }
       
   174 
       
   175         if (containsAll)
       
   176             searchIndexTable.insert(fileName, it.value());
       
   177     }
       
   178 }
       
   179 
       
   180 void Reader::setIndexFile(const QString &namespaceName, const QString &attributes)
       
   181 {
       
   182     QString extension = namespaceName + QLatin1String("@") + attributes;
       
   183     indexFile = indexPath + QLatin1String("/indexdb40.") + extension;
       
   184     documentFile = indexPath + QLatin1String("/indexdoc40.") + extension;
       
   185 }
       
   186 
       
   187 bool Reader::splitSearchTerm(const QString &searchTerm, QStringList *terms,
       
   188                                   QStringList *termSeq, QStringList *seqWords)
       
   189 {
       
   190     QString term = searchTerm;
       
   191 
       
   192     term = term.simplified();
       
   193     term = term.replace(QLatin1String("\'"), QLatin1String("\""));
       
   194     term = term.replace(QLatin1String("`"), QLatin1String("\""));
       
   195     term = term.replace(QLatin1String("-"), QLatin1String(" "));
       
   196     term = term.replace(QRegExp(QLatin1String("\\s[\\S]?\\s")), QLatin1String(" "));
       
   197 
       
   198     *terms = term.split(QLatin1Char(' '));
       
   199     QStringList::iterator it = terms->begin();
       
   200     for (; it != terms->end(); ++it) {
       
   201         (*it) = (*it).simplified();
       
   202         (*it) = (*it).toLower();
       
   203         (*it) = (*it).replace(QLatin1String("\""), QLatin1String(""));
       
   204     }
       
   205 
       
   206     if (term.contains(QLatin1Char('\"'))) {
       
   207         if ((term.count(QLatin1Char('\"')))%2 == 0) {
       
   208             int beg = 0;
       
   209             int end = 0;
       
   210             QString s;
       
   211             beg = term.indexOf(QLatin1Char('\"'), beg);
       
   212             while (beg != -1) {
       
   213                 beg++;
       
   214                 end = term.indexOf(QLatin1Char('\"'), beg);
       
   215                 s = term.mid(beg, end - beg);
       
   216                 s = s.toLower();
       
   217                 s = s.simplified();
       
   218                 if (s.contains(QLatin1Char('*'))) {
       
   219                     qWarning("Full Text Search, using a wildcard within phrases is not allowed.");
       
   220                     return false;
       
   221                 }
       
   222                 *seqWords += s.split(QLatin1Char(' '));
       
   223                 *termSeq << s;
       
   224                 beg = term.indexOf(QLatin1Char('\"'), end + 1);
       
   225             }
       
   226         } else {
       
   227             qWarning("Full Text Search, the closing quotation mark is missing.");
       
   228             return false;
       
   229         }
       
   230     }
       
   231 
       
   232     return true;
       
   233 }
       
   234 
       
   235 void Reader::searchInIndex(const QStringList &terms)
       
   236 {
       
   237     foreach (const QString &term, terms) {
       
   238         QVector<Document> documents;
       
   239 
       
   240         for(IndexTable::ConstIterator it = searchIndexTable.begin();
       
   241             it != searchIndexTable.end(); ++it) {
       
   242             EntryTable entryTable = it.value().first;
       
   243             DocumentList documentList = it.value().second;
       
   244 
       
   245             if (term.contains(QLatin1Char('*')))
       
   246                 documents = setupDummyTerm(getWildcardTerms(term, entryTable), entryTable);
       
   247             else if (entryTable.value(term))
       
   248                 documents = entryTable.value(term)->documents;
       
   249             else
       
   250                 continue;
       
   251 
       
   252             if (!documents.isEmpty()) {
       
   253                 DocumentInfo info;
       
   254                 QString title, url;
       
   255                 QVector<DocumentInfo> documentsInfo;
       
   256                 foreach(const Document &doc, documents) {
       
   257                     info.docNumber = doc.docNumber;
       
   258                     info.frequency = doc.frequency;
       
   259                     info.documentUrl = documentList.at(doc.docNumber).at(1);
       
   260                     info.documentTitle = documentList.at(doc.docNumber).at(0);
       
   261                     documentsInfo.append(info);
       
   262                 }
       
   263 
       
   264                 bool found = false;
       
   265                 for(QList<TermInfo>::Iterator tit = termList.begin();
       
   266                     tit != termList.end(); ++tit) {
       
   267                     TermInfo *t = &(*tit);
       
   268                     if(t->term == term) {
       
   269                         t->documents += documentsInfo;
       
   270                         t->frequency += documentsInfo.count();
       
   271                         found = true; break;
       
   272                     }
       
   273                 }
       
   274                 if (!found)
       
   275                     termList.append(TermInfo(term, documentsInfo.count(), documentsInfo));
       
   276             }
       
   277         }
       
   278     }
       
   279     qSort(termList);
       
   280 }
       
   281 
       
   282 QVector<DocumentInfo> Reader::hits()
       
   283 {
       
   284     QVector<DocumentInfo> documents;
       
   285     if (!termList.count())
       
   286         return documents;
       
   287 
       
   288     documents = termList.takeFirst().documents;
       
   289     for(QList<TermInfo>::Iterator it = termList.begin(); it != termList.end(); ++it) {
       
   290         TermInfo *t = &(*it);
       
   291         QVector<DocumentInfo> docs = t->documents;
       
   292         for(QVector<DocumentInfo>::Iterator minDoc_it = documents.begin();
       
   293             minDoc_it != documents.end(); ) {
       
   294             bool found = false;
       
   295             for (QVector<DocumentInfo>::ConstIterator doc_it = docs.constBegin();
       
   296                 doc_it != docs.constEnd(); ++doc_it ) {
       
   297                 if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
       
   298                     (*minDoc_it).frequency += (*doc_it).frequency;
       
   299                     found = true;
       
   300                     break;
       
   301                 }
       
   302             }
       
   303             if (!found)
       
   304                 minDoc_it = documents.erase(minDoc_it);
       
   305             else
       
   306                 ++minDoc_it;
       
   307         }
       
   308     }
       
   309 
       
   310     qSort(documents);
       
   311     return documents;
       
   312 }
       
   313 
       
   314 bool Reader::searchForPattern(const QStringList &patterns, const QStringList &words,
       
   315                                    const QByteArray &data)
       
   316 {
       
   317     if (data.isEmpty())
       
   318         return false;
       
   319 
       
   320     for(QHash<QString, PosEntry*>::ConstIterator mit =
       
   321         miniIndex.begin(); mit != miniIndex.end(); ++mit) {
       
   322             delete mit.value();
       
   323     }
       
   324     miniIndex.clear();
       
   325 
       
   326     wordNum = 3;
       
   327     QStringList::ConstIterator cIt = words.begin();
       
   328     for ( ; cIt != words.end(); ++cIt )
       
   329         miniIndex.insert(*cIt, new PosEntry(0));
       
   330 
       
   331     QTextStream s(data);
       
   332     QString text = s.readAll();
       
   333     bool valid = true;
       
   334     const QChar *buf = text.unicode();
       
   335     QChar str[64];
       
   336     QChar c = buf[0];
       
   337     int j = 0;
       
   338     int i = 0;
       
   339     while ( j < text.length() ) {
       
   340         if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
       
   341             valid = false;
       
   342             if ( i > 1 )
       
   343                 buildMiniIndex( QString(str,i) );
       
   344             i = 0;
       
   345             c = buf[++j];
       
   346             continue;
       
   347         }
       
   348         if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
       
   349             valid = true;
       
   350             c = buf[++j];
       
   351             continue;
       
   352         }
       
   353         if ( !valid ) {
       
   354             c = buf[++j];
       
   355             continue;
       
   356         }
       
   357         if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
       
   358             str[i] = c.toLower();
       
   359             ++i;
       
   360         } else {
       
   361             if ( i > 1 )
       
   362                 buildMiniIndex( QString(str,i) );
       
   363             i = 0;
       
   364         }
       
   365         c = buf[++j];
       
   366     }
       
   367     if ( i > 1 )
       
   368         buildMiniIndex( QString(str,i) );
       
   369 
       
   370     QStringList::ConstIterator patIt = patterns.begin();
       
   371     QStringList wordLst;
       
   372     QList<uint> a, b;
       
   373     QList<uint>::iterator aIt;
       
   374     for ( ; patIt != patterns.end(); ++patIt ) {
       
   375         wordLst = (*patIt).split(QLatin1Char(' '));
       
   376         a = miniIndex[ wordLst[0] ]->positions;
       
   377         for ( int j = 1; j < (int)wordLst.count(); ++j ) {
       
   378             b = miniIndex[ wordLst[j] ]->positions;
       
   379             aIt = a.begin();
       
   380             while ( aIt != a.end() ) {
       
   381                 if ( b.contains( *aIt + 1 )) {
       
   382                     (*aIt)++;
       
   383                     ++aIt;
       
   384                 } else {
       
   385                     aIt = a.erase( aIt );
       
   386                 }
       
   387             }
       
   388         }
       
   389     }
       
   390     if ( a.count() )
       
   391         return true;
       
   392     return false;
       
   393 }
       
   394 
       
   395 QVector<Document> Reader::setupDummyTerm(const QStringList &terms,
       
   396                                               const EntryTable &entryTable)
       
   397 {
       
   398     QList<Term> termList;
       
   399     for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it) {
       
   400         if (entryTable.value(*it)) {
       
   401             Entry *e = entryTable.value(*it);
       
   402             termList.append(Term(*it, e->documents.count(), e->documents ) );
       
   403         }
       
   404     }
       
   405     QVector<Document> maxList(0);
       
   406     if ( !termList.count() )
       
   407         return maxList;
       
   408     qSort(termList);
       
   409 
       
   410     maxList = termList.takeLast().documents;
       
   411     for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
       
   412         Term *t = &(*it);
       
   413         QVector<Document> docs = t->documents;
       
   414         for (QVector<Document>::iterator docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
       
   415             if ( maxList.indexOf( *docIt ) == -1 )
       
   416                 maxList.append( *docIt );
       
   417         }
       
   418     }
       
   419     return maxList;
       
   420 }
       
   421 
       
   422 QStringList Reader::getWildcardTerms(const QString &term,
       
   423                                           const EntryTable &entryTable)
       
   424 {
       
   425     QStringList lst;
       
   426     QStringList terms = split(term);
       
   427     QStringList::Iterator iter;
       
   428 
       
   429     for(EntryTable::ConstIterator it = entryTable.begin();
       
   430         it != entryTable.end(); ++it) {
       
   431         int index = 0;
       
   432         bool found = false;
       
   433         QString text( it.key() );
       
   434         for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
       
   435             if ( *iter == QLatin1String("*") ) {
       
   436                 found = true;
       
   437                 continue;
       
   438             }
       
   439             if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
       
   440                 found = false;
       
   441                 break;
       
   442             }
       
   443             index = text.indexOf( *iter, index );
       
   444             if ( *iter == terms.last() && index != (int)text.length()-1 ) {
       
   445                 index = text.lastIndexOf( *iter );
       
   446                 if ( index != (int)text.length() - (int)(*iter).length() ) {
       
   447                     found = false;
       
   448                     break;
       
   449                 }
       
   450             }
       
   451             if ( index != -1 ) {
       
   452                 found = true;
       
   453                 index += (*iter).length();
       
   454                 continue;
       
   455             } else {
       
   456                 found = false;
       
   457                 break;
       
   458             }
       
   459         }
       
   460         if (found)
       
   461             lst << text;
       
   462     }
       
   463 
       
   464     return lst;
       
   465 }
       
   466 
       
   467 void Reader::buildMiniIndex(const QString &string)
       
   468 {
       
   469     if (miniIndex[string])
       
   470         miniIndex[string]->positions.append(wordNum);
       
   471     ++wordNum;
       
   472 }
       
   473 
       
   474 void Reader::reset()
       
   475 {
       
   476     for(IndexTable::Iterator it = indexTable.begin();
       
   477         it != indexTable.end(); ++it) {
       
   478         cleanupIndex(it.value().first);
       
   479         it.value().second.clear();
       
   480     }
       
   481 }
       
   482 
       
   483 void Reader::cleanupIndex(EntryTable &entryTable)
       
   484 {
       
   485     for(EntryTable::ConstIterator it =
       
   486         entryTable.begin(); it != entryTable.end(); ++it) {
       
   487             delete it.value();
       
   488     }
       
   489 
       
   490     entryTable.clear();
       
   491 }
       
   492 
       
   493 
       
   494 QHelpSearchIndexReaderDefault::QHelpSearchIndexReaderDefault()
       
   495     : QHelpSearchIndexReader()
       
   496 {
       
   497     // nothing todo
       
   498 }
       
   499 
       
   500 QHelpSearchIndexReaderDefault::~QHelpSearchIndexReaderDefault()
       
   501 {
       
   502 }
       
   503 
       
   504 void QHelpSearchIndexReaderDefault::run()
       
   505 {
       
   506     mutex.lock();
       
   507 
       
   508     if (m_cancel) {
       
   509         mutex.unlock();
       
   510         return;
       
   511     }
       
   512 
       
   513     const QList<QHelpSearchQuery> &queryList = this->m_query;
       
   514     const QLatin1String key("DefaultSearchNamespaces");
       
   515     const QString collectionFile(this->m_collectionFile);
       
   516     const QString indexPath = m_indexFilesFolder;
       
   517 
       
   518     mutex.unlock();
       
   519 
       
   520     QString queryTerm;
       
   521     foreach (const QHelpSearchQuery &query, queryList) {
       
   522         if (query.fieldName == QHelpSearchQuery::DEFAULT) {
       
   523             queryTerm = query.wordList.at(0);
       
   524             break;
       
   525         }
       
   526     }
       
   527 
       
   528     if (queryTerm.isEmpty())
       
   529         return;
       
   530 
       
   531     QHelpEngineCore engine(collectionFile, 0);
       
   532     if (!engine.setupData())
       
   533         return;
       
   534 
       
   535     const QStringList registeredDocs = engine.registeredDocumentations();
       
   536     const QStringList indexedNamespaces = engine.customValue(key).toString().
       
   537         split(QLatin1String("|"), QString::SkipEmptyParts);
       
   538 
       
   539     emit searchingStarted();
       
   540 
       
   541     // setup the reader
       
   542     m_reader.setIndexPath(indexPath);
       
   543     foreach(const QString &namespaceName, registeredDocs) {
       
   544         mutex.lock();
       
   545         if (m_cancel) {
       
   546             mutex.unlock();
       
   547             searchingFinished(0);   // TODO: check this ???
       
   548             return;
       
   549         }
       
   550         mutex.unlock();
       
   551 
       
   552         const QList<QStringList> attributeSets =
       
   553             engine.filterAttributeSets(namespaceName);
       
   554 
       
   555         foreach (const QStringList &attributes, attributeSets) {
       
   556             // read all index files
       
   557             m_reader.setIndexFile(namespaceName, attributes.join(QLatin1String("@")));
       
   558             if (!m_reader.readIndex()) {
       
   559                 qWarning("Full Text Search, could not read file for namespace: %s.",
       
   560                     namespaceName.toUtf8().constData());
       
   561             }
       
   562         }
       
   563     }
       
   564 
       
   565     // get the current filter attributes and minimize the index files table
       
   566     m_reader.filterFilesForAttributes(engine.filterAttributes(engine.currentFilter()));
       
   567 
       
   568     hitList.clear();
       
   569     QStringList terms, termSeq, seqWords;
       
   570     if (m_reader.initCheck() && // check if we could read anything
       
   571         m_reader.splitSearchTerm(queryTerm, &terms, &termSeq, &seqWords) ) {
       
   572 
       
   573         // search for term(s)
       
   574         m_reader.searchInIndex(terms);    // TODO: should this be interruptible as well ???
       
   575 
       
   576         QVector<DocumentInfo> hits = m_reader.hits();
       
   577         if (!hits.isEmpty()) {
       
   578             if (termSeq.isEmpty()) {
       
   579                 foreach (const DocumentInfo &docInfo, hits) {
       
   580                     mutex.lock();
       
   581                     if (m_cancel) {
       
   582                         mutex.unlock();
       
   583                         searchingFinished(0);   // TODO: check this, speed issue while locking???
       
   584                         return;
       
   585                     }
       
   586                     mutex.unlock();
       
   587                     hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
       
   588                 }
       
   589             } else {
       
   590                 foreach (const DocumentInfo &docInfo, hits) {
       
   591                     mutex.lock();
       
   592                     if (m_cancel) {
       
   593                         mutex.unlock();
       
   594                         searchingFinished(0);   // TODO: check this, speed issue while locking???
       
   595                         return;
       
   596                     }
       
   597                     mutex.unlock();
       
   598 
       
   599                     if (m_reader.searchForPattern(termSeq, seqWords, engine.fileData(docInfo.documentUrl))) // TODO: should this be interruptible as well ???
       
   600                         hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
       
   601                 }
       
   602             }
       
   603         }
       
   604     }
       
   605 
       
   606     emit searchingFinished(hitList.count());
       
   607 }
       
   608 
       
   609 }   // namespace std
       
   610 }   // namespace fulltextsearch
       
   611 
       
   612 QT_END_NAMESPACE