|
1 /**************************************************************************** |
|
2 ** |
|
3 ** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
4 ** All rights reserved. |
|
5 ** Contact: Nokia Corporation (qt-info@nokia.com) |
|
6 ** |
|
7 ** This file is part of the Qt Assistant of the Qt Toolkit. |
|
8 ** |
|
9 ** $QT_BEGIN_LICENSE:LGPL$ |
|
10 ** No Commercial Usage |
|
11 ** This file contains pre-release code and may not be distributed. |
|
12 ** You may use this file in accordance with the terms and conditions |
|
13 ** contained in the Technology Preview License Agreement accompanying |
|
14 ** this package. |
|
15 ** |
|
16 ** GNU Lesser General Public License Usage |
|
17 ** Alternatively, this file may be used under the terms of the GNU Lesser |
|
18 ** General Public License version 2.1 as published by the Free Software |
|
19 ** Foundation and appearing in the file LICENSE.LGPL included in the |
|
20 ** packaging of this file. Please review the following information to |
|
21 ** ensure the GNU Lesser General Public License version 2.1 requirements |
|
22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
|
23 ** |
|
24 ** In addition, as a special exception, Nokia gives you certain additional |
|
25 ** rights. These rights are described in the Nokia Qt LGPL Exception |
|
26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
|
27 ** |
|
28 ** If you have questions regarding the use of this file, please contact |
|
29 ** Nokia at qt-info@nokia.com. |
|
30 ** |
|
31 ** |
|
32 ** |
|
33 ** |
|
34 ** |
|
35 ** |
|
36 ** |
|
37 ** |
|
38 ** $QT_END_LICENSE$ |
|
39 ** |
|
40 ****************************************************************************/ |
|
41 |
|
42 #include "qhelpenginecore.h" |
|
43 #include "qhelpsearchindexreader_default_p.h" |
|
44 |
|
45 #include <QtCore/QDir> |
|
46 #include <QtCore/QUrl> |
|
47 #include <QtCore/QFile> |
|
48 #include <QtCore/QVariant> |
|
49 #include <QtCore/QFileInfo> |
|
50 #include <QtCore/QDataStream> |
|
51 #include <QtCore/QTextStream> |
|
52 |
|
53 QT_BEGIN_NAMESPACE |
|
54 |
|
55 namespace fulltextsearch { |
|
56 namespace std { |
|
57 |
|
58 namespace { |
|
59 QStringList split( const QString &str ) |
|
60 { |
|
61 QStringList lst; |
|
62 int j = 0; |
|
63 int i = str.indexOf(QLatin1Char('*'), j ); |
|
64 |
|
65 if (str.startsWith(QLatin1String("*"))) |
|
66 lst << QLatin1String("*"); |
|
67 |
|
68 while ( i != -1 ) { |
|
69 if ( i > j && i <= (int)str.length() ) { |
|
70 lst << str.mid( j, i - j ); |
|
71 lst << QLatin1String("*"); |
|
72 } |
|
73 j = i + 1; |
|
74 i = str.indexOf(QLatin1Char('*'), j ); |
|
75 } |
|
76 |
|
77 int l = str.length() - 1; |
|
78 if ( str.mid( j, l - j + 1 ).length() > 0 ) |
|
79 lst << str.mid( j, l - j + 1 ); |
|
80 |
|
81 return lst; |
|
82 } |
|
83 } |
|
84 |
|
85 |
|
86 Reader::Reader() |
|
87 : indexPath(QString()) |
|
88 , indexFile(QString()) |
|
89 , documentFile(QString()) |
|
90 { |
|
91 termList.clear(); |
|
92 indexTable.clear(); |
|
93 searchIndexTable.clear(); |
|
94 } |
|
95 |
|
96 Reader::~Reader() |
|
97 { |
|
98 reset(); |
|
99 searchIndexTable.clear(); |
|
100 } |
|
101 |
|
102 bool Reader::readIndex() |
|
103 { |
|
104 if (indexTable.contains(indexFile)) |
|
105 return true; |
|
106 |
|
107 QFile idxFile(indexFile); |
|
108 if (!idxFile.open(QFile::ReadOnly)) |
|
109 return false; |
|
110 |
|
111 QString key; |
|
112 int numOfDocs; |
|
113 EntryTable entryTable; |
|
114 QVector<Document> docs; |
|
115 QDataStream dictStream(&idxFile); |
|
116 while (!dictStream.atEnd()) { |
|
117 dictStream >> key; |
|
118 dictStream >> numOfDocs; |
|
119 docs.resize(numOfDocs); |
|
120 dictStream >> docs; |
|
121 entryTable.insert(key, new Entry(docs)); |
|
122 } |
|
123 idxFile.close(); |
|
124 |
|
125 if (entryTable.isEmpty()) |
|
126 return false; |
|
127 |
|
128 QFile docFile(documentFile); |
|
129 if (!docFile.open(QFile::ReadOnly)) |
|
130 return false; |
|
131 |
|
132 QString title, url; |
|
133 DocumentList documentList; |
|
134 QDataStream docStream(&docFile); |
|
135 while (!docStream.atEnd()) { |
|
136 docStream >> title; |
|
137 docStream >> url; |
|
138 documentList.append(QStringList(title) << url); |
|
139 } |
|
140 docFile.close(); |
|
141 |
|
142 if (documentList.isEmpty()) { |
|
143 cleanupIndex(entryTable); |
|
144 return false; |
|
145 } |
|
146 |
|
147 indexTable.insert(indexFile, Index(entryTable, documentList)); |
|
148 return true; |
|
149 } |
|
150 |
|
151 bool Reader::initCheck() const |
|
152 { |
|
153 return !searchIndexTable.isEmpty(); |
|
154 } |
|
155 |
|
156 void Reader::setIndexPath(const QString &path) |
|
157 { |
|
158 indexPath = path; |
|
159 } |
|
160 |
|
161 void Reader::filterFilesForAttributes(const QStringList &attributes) |
|
162 { |
|
163 searchIndexTable.clear(); |
|
164 for(IndexTable::ConstIterator it = indexTable.begin(); it != indexTable.end(); ++it) { |
|
165 const QString fileName = it.key(); |
|
166 bool containsAll = true; |
|
167 QStringList split = fileName.split(QLatin1String("@")); |
|
168 foreach (const QString &attribute, attributes) { |
|
169 if (!split.contains(attribute, Qt::CaseInsensitive)) { |
|
170 containsAll = false; |
|
171 break; |
|
172 } |
|
173 } |
|
174 |
|
175 if (containsAll) |
|
176 searchIndexTable.insert(fileName, it.value()); |
|
177 } |
|
178 } |
|
179 |
|
180 void Reader::setIndexFile(const QString &namespaceName, const QString &attributes) |
|
181 { |
|
182 QString extension = namespaceName + QLatin1String("@") + attributes; |
|
183 indexFile = indexPath + QLatin1String("/indexdb40.") + extension; |
|
184 documentFile = indexPath + QLatin1String("/indexdoc40.") + extension; |
|
185 } |
|
186 |
|
187 bool Reader::splitSearchTerm(const QString &searchTerm, QStringList *terms, |
|
188 QStringList *termSeq, QStringList *seqWords) |
|
189 { |
|
190 QString term = searchTerm; |
|
191 |
|
192 term = term.simplified(); |
|
193 term = term.replace(QLatin1String("\'"), QLatin1String("\"")); |
|
194 term = term.replace(QLatin1String("`"), QLatin1String("\"")); |
|
195 term = term.replace(QLatin1String("-"), QLatin1String(" ")); |
|
196 term = term.replace(QRegExp(QLatin1String("\\s[\\S]?\\s")), QLatin1String(" ")); |
|
197 |
|
198 *terms = term.split(QLatin1Char(' ')); |
|
199 QStringList::iterator it = terms->begin(); |
|
200 for (; it != terms->end(); ++it) { |
|
201 (*it) = (*it).simplified(); |
|
202 (*it) = (*it).toLower(); |
|
203 (*it) = (*it).replace(QLatin1String("\""), QLatin1String("")); |
|
204 } |
|
205 |
|
206 if (term.contains(QLatin1Char('\"'))) { |
|
207 if ((term.count(QLatin1Char('\"')))%2 == 0) { |
|
208 int beg = 0; |
|
209 int end = 0; |
|
210 QString s; |
|
211 beg = term.indexOf(QLatin1Char('\"'), beg); |
|
212 while (beg != -1) { |
|
213 beg++; |
|
214 end = term.indexOf(QLatin1Char('\"'), beg); |
|
215 s = term.mid(beg, end - beg); |
|
216 s = s.toLower(); |
|
217 s = s.simplified(); |
|
218 if (s.contains(QLatin1Char('*'))) { |
|
219 qWarning("Full Text Search, using a wildcard within phrases is not allowed."); |
|
220 return false; |
|
221 } |
|
222 *seqWords += s.split(QLatin1Char(' ')); |
|
223 *termSeq << s; |
|
224 beg = term.indexOf(QLatin1Char('\"'), end + 1); |
|
225 } |
|
226 } else { |
|
227 qWarning("Full Text Search, the closing quotation mark is missing."); |
|
228 return false; |
|
229 } |
|
230 } |
|
231 |
|
232 return true; |
|
233 } |
|
234 |
|
235 void Reader::searchInIndex(const QStringList &terms) |
|
236 { |
|
237 foreach (const QString &term, terms) { |
|
238 QVector<Document> documents; |
|
239 |
|
240 for(IndexTable::ConstIterator it = searchIndexTable.begin(); |
|
241 it != searchIndexTable.end(); ++it) { |
|
242 EntryTable entryTable = it.value().first; |
|
243 DocumentList documentList = it.value().second; |
|
244 |
|
245 if (term.contains(QLatin1Char('*'))) |
|
246 documents = setupDummyTerm(getWildcardTerms(term, entryTable), entryTable); |
|
247 else if (entryTable.value(term)) |
|
248 documents = entryTable.value(term)->documents; |
|
249 else |
|
250 continue; |
|
251 |
|
252 if (!documents.isEmpty()) { |
|
253 DocumentInfo info; |
|
254 QString title, url; |
|
255 QVector<DocumentInfo> documentsInfo; |
|
256 foreach(const Document &doc, documents) { |
|
257 info.docNumber = doc.docNumber; |
|
258 info.frequency = doc.frequency; |
|
259 info.documentUrl = documentList.at(doc.docNumber).at(1); |
|
260 info.documentTitle = documentList.at(doc.docNumber).at(0); |
|
261 documentsInfo.append(info); |
|
262 } |
|
263 |
|
264 bool found = false; |
|
265 for(QList<TermInfo>::Iterator tit = termList.begin(); |
|
266 tit != termList.end(); ++tit) { |
|
267 TermInfo *t = &(*tit); |
|
268 if(t->term == term) { |
|
269 t->documents += documentsInfo; |
|
270 t->frequency += documentsInfo.count(); |
|
271 found = true; break; |
|
272 } |
|
273 } |
|
274 if (!found) |
|
275 termList.append(TermInfo(term, documentsInfo.count(), documentsInfo)); |
|
276 } |
|
277 } |
|
278 } |
|
279 qSort(termList); |
|
280 } |
|
281 |
|
282 QVector<DocumentInfo> Reader::hits() |
|
283 { |
|
284 QVector<DocumentInfo> documents; |
|
285 if (!termList.count()) |
|
286 return documents; |
|
287 |
|
288 documents = termList.takeFirst().documents; |
|
289 for(QList<TermInfo>::Iterator it = termList.begin(); it != termList.end(); ++it) { |
|
290 TermInfo *t = &(*it); |
|
291 QVector<DocumentInfo> docs = t->documents; |
|
292 for(QVector<DocumentInfo>::Iterator minDoc_it = documents.begin(); |
|
293 minDoc_it != documents.end(); ) { |
|
294 bool found = false; |
|
295 for (QVector<DocumentInfo>::ConstIterator doc_it = docs.constBegin(); |
|
296 doc_it != docs.constEnd(); ++doc_it ) { |
|
297 if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) { |
|
298 (*minDoc_it).frequency += (*doc_it).frequency; |
|
299 found = true; |
|
300 break; |
|
301 } |
|
302 } |
|
303 if (!found) |
|
304 minDoc_it = documents.erase(minDoc_it); |
|
305 else |
|
306 ++minDoc_it; |
|
307 } |
|
308 } |
|
309 |
|
310 qSort(documents); |
|
311 return documents; |
|
312 } |
|
313 |
|
314 bool Reader::searchForPattern(const QStringList &patterns, const QStringList &words, |
|
315 const QByteArray &data) |
|
316 { |
|
317 if (data.isEmpty()) |
|
318 return false; |
|
319 |
|
320 for(QHash<QString, PosEntry*>::ConstIterator mit = |
|
321 miniIndex.begin(); mit != miniIndex.end(); ++mit) { |
|
322 delete mit.value(); |
|
323 } |
|
324 miniIndex.clear(); |
|
325 |
|
326 wordNum = 3; |
|
327 QStringList::ConstIterator cIt = words.begin(); |
|
328 for ( ; cIt != words.end(); ++cIt ) |
|
329 miniIndex.insert(*cIt, new PosEntry(0)); |
|
330 |
|
331 QTextStream s(data); |
|
332 QString text = s.readAll(); |
|
333 bool valid = true; |
|
334 const QChar *buf = text.unicode(); |
|
335 QChar str[64]; |
|
336 QChar c = buf[0]; |
|
337 int j = 0; |
|
338 int i = 0; |
|
339 while ( j < text.length() ) { |
|
340 if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) { |
|
341 valid = false; |
|
342 if ( i > 1 ) |
|
343 buildMiniIndex( QString(str,i) ); |
|
344 i = 0; |
|
345 c = buf[++j]; |
|
346 continue; |
|
347 } |
|
348 if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) { |
|
349 valid = true; |
|
350 c = buf[++j]; |
|
351 continue; |
|
352 } |
|
353 if ( !valid ) { |
|
354 c = buf[++j]; |
|
355 continue; |
|
356 } |
|
357 if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) { |
|
358 str[i] = c.toLower(); |
|
359 ++i; |
|
360 } else { |
|
361 if ( i > 1 ) |
|
362 buildMiniIndex( QString(str,i) ); |
|
363 i = 0; |
|
364 } |
|
365 c = buf[++j]; |
|
366 } |
|
367 if ( i > 1 ) |
|
368 buildMiniIndex( QString(str,i) ); |
|
369 |
|
370 QStringList::ConstIterator patIt = patterns.begin(); |
|
371 QStringList wordLst; |
|
372 QList<uint> a, b; |
|
373 QList<uint>::iterator aIt; |
|
374 for ( ; patIt != patterns.end(); ++patIt ) { |
|
375 wordLst = (*patIt).split(QLatin1Char(' ')); |
|
376 a = miniIndex[ wordLst[0] ]->positions; |
|
377 for ( int j = 1; j < (int)wordLst.count(); ++j ) { |
|
378 b = miniIndex[ wordLst[j] ]->positions; |
|
379 aIt = a.begin(); |
|
380 while ( aIt != a.end() ) { |
|
381 if ( b.contains( *aIt + 1 )) { |
|
382 (*aIt)++; |
|
383 ++aIt; |
|
384 } else { |
|
385 aIt = a.erase( aIt ); |
|
386 } |
|
387 } |
|
388 } |
|
389 } |
|
390 if ( a.count() ) |
|
391 return true; |
|
392 return false; |
|
393 } |
|
394 |
|
395 QVector<Document> Reader::setupDummyTerm(const QStringList &terms, |
|
396 const EntryTable &entryTable) |
|
397 { |
|
398 QList<Term> termList; |
|
399 for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it) { |
|
400 if (entryTable.value(*it)) { |
|
401 Entry *e = entryTable.value(*it); |
|
402 termList.append(Term(*it, e->documents.count(), e->documents ) ); |
|
403 } |
|
404 } |
|
405 QVector<Document> maxList(0); |
|
406 if ( !termList.count() ) |
|
407 return maxList; |
|
408 qSort(termList); |
|
409 |
|
410 maxList = termList.takeLast().documents; |
|
411 for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) { |
|
412 Term *t = &(*it); |
|
413 QVector<Document> docs = t->documents; |
|
414 for (QVector<Document>::iterator docIt = docs.begin(); docIt != docs.end(); ++docIt ) { |
|
415 if ( maxList.indexOf( *docIt ) == -1 ) |
|
416 maxList.append( *docIt ); |
|
417 } |
|
418 } |
|
419 return maxList; |
|
420 } |
|
421 |
|
422 QStringList Reader::getWildcardTerms(const QString &term, |
|
423 const EntryTable &entryTable) |
|
424 { |
|
425 QStringList lst; |
|
426 QStringList terms = split(term); |
|
427 QStringList::Iterator iter; |
|
428 |
|
429 for(EntryTable::ConstIterator it = entryTable.begin(); |
|
430 it != entryTable.end(); ++it) { |
|
431 int index = 0; |
|
432 bool found = false; |
|
433 QString text( it.key() ); |
|
434 for ( iter = terms.begin(); iter != terms.end(); ++iter ) { |
|
435 if ( *iter == QLatin1String("*") ) { |
|
436 found = true; |
|
437 continue; |
|
438 } |
|
439 if ( iter == terms.begin() && (*iter)[0] != text[0] ) { |
|
440 found = false; |
|
441 break; |
|
442 } |
|
443 index = text.indexOf( *iter, index ); |
|
444 if ( *iter == terms.last() && index != (int)text.length()-1 ) { |
|
445 index = text.lastIndexOf( *iter ); |
|
446 if ( index != (int)text.length() - (int)(*iter).length() ) { |
|
447 found = false; |
|
448 break; |
|
449 } |
|
450 } |
|
451 if ( index != -1 ) { |
|
452 found = true; |
|
453 index += (*iter).length(); |
|
454 continue; |
|
455 } else { |
|
456 found = false; |
|
457 break; |
|
458 } |
|
459 } |
|
460 if (found) |
|
461 lst << text; |
|
462 } |
|
463 |
|
464 return lst; |
|
465 } |
|
466 |
|
467 void Reader::buildMiniIndex(const QString &string) |
|
468 { |
|
469 if (miniIndex[string]) |
|
470 miniIndex[string]->positions.append(wordNum); |
|
471 ++wordNum; |
|
472 } |
|
473 |
|
474 void Reader::reset() |
|
475 { |
|
476 for(IndexTable::Iterator it = indexTable.begin(); |
|
477 it != indexTable.end(); ++it) { |
|
478 cleanupIndex(it.value().first); |
|
479 it.value().second.clear(); |
|
480 } |
|
481 } |
|
482 |
|
483 void Reader::cleanupIndex(EntryTable &entryTable) |
|
484 { |
|
485 for(EntryTable::ConstIterator it = |
|
486 entryTable.begin(); it != entryTable.end(); ++it) { |
|
487 delete it.value(); |
|
488 } |
|
489 |
|
490 entryTable.clear(); |
|
491 } |
|
492 |
|
493 |
|
494 QHelpSearchIndexReaderDefault::QHelpSearchIndexReaderDefault() |
|
495 : QHelpSearchIndexReader() |
|
496 { |
|
497 // nothing todo |
|
498 } |
|
499 |
|
500 QHelpSearchIndexReaderDefault::~QHelpSearchIndexReaderDefault() |
|
501 { |
|
502 } |
|
503 |
|
504 void QHelpSearchIndexReaderDefault::run() |
|
505 { |
|
506 mutex.lock(); |
|
507 |
|
508 if (m_cancel) { |
|
509 mutex.unlock(); |
|
510 return; |
|
511 } |
|
512 |
|
513 const QList<QHelpSearchQuery> &queryList = this->m_query; |
|
514 const QLatin1String key("DefaultSearchNamespaces"); |
|
515 const QString collectionFile(this->m_collectionFile); |
|
516 const QString indexPath = m_indexFilesFolder; |
|
517 |
|
518 mutex.unlock(); |
|
519 |
|
520 QString queryTerm; |
|
521 foreach (const QHelpSearchQuery &query, queryList) { |
|
522 if (query.fieldName == QHelpSearchQuery::DEFAULT) { |
|
523 queryTerm = query.wordList.at(0); |
|
524 break; |
|
525 } |
|
526 } |
|
527 |
|
528 if (queryTerm.isEmpty()) |
|
529 return; |
|
530 |
|
531 QHelpEngineCore engine(collectionFile, 0); |
|
532 if (!engine.setupData()) |
|
533 return; |
|
534 |
|
535 const QStringList registeredDocs = engine.registeredDocumentations(); |
|
536 const QStringList indexedNamespaces = engine.customValue(key).toString(). |
|
537 split(QLatin1String("|"), QString::SkipEmptyParts); |
|
538 |
|
539 emit searchingStarted(); |
|
540 |
|
541 // setup the reader |
|
542 m_reader.setIndexPath(indexPath); |
|
543 foreach(const QString &namespaceName, registeredDocs) { |
|
544 mutex.lock(); |
|
545 if (m_cancel) { |
|
546 mutex.unlock(); |
|
547 searchingFinished(0); // TODO: check this ??? |
|
548 return; |
|
549 } |
|
550 mutex.unlock(); |
|
551 |
|
552 const QList<QStringList> attributeSets = |
|
553 engine.filterAttributeSets(namespaceName); |
|
554 |
|
555 foreach (const QStringList &attributes, attributeSets) { |
|
556 // read all index files |
|
557 m_reader.setIndexFile(namespaceName, attributes.join(QLatin1String("@"))); |
|
558 if (!m_reader.readIndex()) { |
|
559 qWarning("Full Text Search, could not read file for namespace: %s.", |
|
560 namespaceName.toUtf8().constData()); |
|
561 } |
|
562 } |
|
563 } |
|
564 |
|
565 // get the current filter attributes and minimize the index files table |
|
566 m_reader.filterFilesForAttributes(engine.filterAttributes(engine.currentFilter())); |
|
567 |
|
568 hitList.clear(); |
|
569 QStringList terms, termSeq, seqWords; |
|
570 if (m_reader.initCheck() && // check if we could read anything |
|
571 m_reader.splitSearchTerm(queryTerm, &terms, &termSeq, &seqWords) ) { |
|
572 |
|
573 // search for term(s) |
|
574 m_reader.searchInIndex(terms); // TODO: should this be interruptible as well ??? |
|
575 |
|
576 QVector<DocumentInfo> hits = m_reader.hits(); |
|
577 if (!hits.isEmpty()) { |
|
578 if (termSeq.isEmpty()) { |
|
579 foreach (const DocumentInfo &docInfo, hits) { |
|
580 mutex.lock(); |
|
581 if (m_cancel) { |
|
582 mutex.unlock(); |
|
583 searchingFinished(0); // TODO: check this, speed issue while locking??? |
|
584 return; |
|
585 } |
|
586 mutex.unlock(); |
|
587 hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl)); |
|
588 } |
|
589 } else { |
|
590 foreach (const DocumentInfo &docInfo, hits) { |
|
591 mutex.lock(); |
|
592 if (m_cancel) { |
|
593 mutex.unlock(); |
|
594 searchingFinished(0); // TODO: check this, speed issue while locking??? |
|
595 return; |
|
596 } |
|
597 mutex.unlock(); |
|
598 |
|
599 if (m_reader.searchForPattern(termSeq, seqWords, engine.fileData(docInfo.documentUrl))) // TODO: should this be interruptible as well ??? |
|
600 hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl)); |
|
601 } |
|
602 } |
|
603 } |
|
604 } |
|
605 |
|
606 emit searchingFinished(hitList.count()); |
|
607 } |
|
608 |
|
609 } // namespace std |
|
610 } // namespace fulltextsearch |
|
611 |
|
612 QT_END_NAMESPACE |