This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit f90edbef1de138249656cf0f778863fe2ffeb654 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 11:16:32 2017 +0200 Upgrade lucene + revue de l'indexation des données de Question --- .../indexation/DocumentsIndexationService.java | 22 +++--- .../indexation/QuestionsIndexationService.java | 88 +++++++++++++--------- .../indexation/TransverseIndexationService.java | 6 +- .../indexation/QuestionsIndexationServiceTest.java | 22 +++--- pom.xml | 2 +- 5 files changed, 81 insertions(+), 59 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 92402fb..87c4b68 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -24,12 +24,6 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - import fr.ifremer.coselmar.beans.DocumentBean; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; @@ -50,6 +44,12 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.tika.exception.TikaException; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + /** * This Services provides operation about {@link fr.ifremer.coselmar.persistence.entity.Document} * or more exactly {@link fr.ifremer.coselmar.beans.DocumentBean} indexation : @@ -134,7 +134,7 @@ 
public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } public List<String> searchDocuments(String text) throws IOException, ParseException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); String[] words = text.split(" "); @@ -183,7 +183,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } public List<String> searchDocuments(List<String> texts) throws IOException, ParseException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); @@ -240,7 +240,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } public void updateDocument(DocumentBean document, String filepath) throws IOException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document @@ -287,7 +287,9 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { try { File documentFile = new File(filepath); String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + if (StringUtils.isNotBlank(parsedDocumentFile)) { + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + } } catch (TikaException te) { if (log.isErrorEnabled()) { String message = String.format("Unable to index document '%s'", filepath); diff --git 
a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index 2c97773..b5e92d3 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -26,15 +26,19 @@ package fr.ifremer.coselmar.services.indexation; import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; -import fr.ifremer.coselmar.beans.QuestionSearchExample; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.misc.HighFreqTerms; import org.apache.lucene.misc.HighFreqTermsMultiFields; import org.apache.lucene.misc.TermStats; @@ -46,6 +50,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; import java.io.IOException; import java.util.ArrayList; @@ -81,10 +86,23 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_CLOUD_TAG_PROPERTY = "questionCloudTagTheme"; protected static final String DOCUMENT_TYPE = "questionindextype"; + public static final FieldType TYPE_STORED = new 
FieldType(); + static { + TYPE_STORED.setOmitNorms(true); + TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + TYPE_STORED.setStored(true); + TYPE_STORED.setStoreTermVectors(true); + TYPE_STORED.setStoreTermVectorPositions(true); + TYPE_STORED.setStoreTermVectorOffsets(true); + TYPE_STORED.setStoreTermVectorPayloads(true); + TYPE_STORED.setTokenized(true); + TYPE_STORED.freeze(); + } + public void indexQuestion(QuestionBean question) throws IOException { // First : try to find if already exist to update it - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document @@ -102,11 +120,11 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new TextField(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, Field.Store.YES)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -119,7 +137,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new 
Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); } } } @@ -139,14 +157,14 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new TextField(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, Field.Store.YES)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); doc.add(new TextField(QUESTION_STATUS_INDEX_PROPERTY, question.getStatus(), Field.Store.YES)); doc.add(new TextField(QUESTION_PRIVACY_INDEX_PROPERTY, question.getPrivacy(), Field.Store.YES)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -159,7 +177,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); } } } @@ -177,7 +195,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { } public List<String> searchQuestion(QuestionSearchBean searchBean) throws IOException, ParseException { - DirectoryReader 
ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Combine that with the type @@ -272,7 +290,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { public Map<String, Long> getTopTerms() throws IOException, ParseException { - DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); Map<String, Long> result = new LinkedHashMap<>(); try { @@ -297,46 +315,48 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } - public Map<String, Long> getTopDocumentsTerms(List<String> questionIds) throws IOException, ParseException { + public Map<String, Long> getTopQuestionsTerms(List<String> questionIds) throws IOException, ParseException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Combine that with the type BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder questionIdBuilder = new BooleanQuery.Builder(); for (String questionId : questionIds) { if(StringUtils.isNotBlank(questionId)) { - queryBuilder.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); + questionIdBuilder.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); } } + queryBuilder.add(questionIdBuilder.build(), BooleanClause.Occur.MUST); TopDocs hits = isearcher.search(queryBuilder.build(), 100); ScoreDoc[] scoreDocs = 
hits.scoreDocs; - System.out.println("hits=" + scoreDocs.length); - System.out.println("Hits (rank,score,docId)"); - for (int n = 0; n < scoreDocs.length; ++n) { - ScoreDoc sd = scoreDocs[n]; - float score = sd.score; - int docId = sd.doc; - } - -// TopFieldCollector topFieldCollector = TopFieldCollector.create(new Sort(), 100, true, true, false); -// isearcher.search(queryBuilder.build(), topFieldCollector); Map<String, Long> result = new LinkedHashMap<>(); -// TopFieldDocs topField = topFieldCollector.topDocs(); -// for (SortField sortField : topField.fields) { -// String field = sortField.getField(); -// long sumDocFreq = ireader.getSumDocFreq(field); -// -// if (result.containsKey(field)) { -// result.put(field, result.get(field) + sumDocFreq); -// } else { -// result.put(field, sumDocFreq); -// } -// } + + for (ScoreDoc scoreDoc : scoreDocs) { + Fields termVectors = ireader.getTermVectors(scoreDoc.doc); + for (String termVector : termVectors) { + System.out.println("Vector: " + termVector); + Terms vector = ireader.getTermVector(scoreDoc.doc, termVector); + TermsEnum termsEnum = vector.iterator(); + BytesRef bytesRef = termsEnum.next(); + while(bytesRef != null){ + String term = bytesRef.utf8ToString(); + long totalTermFreq = termsEnum.totalTermFreq(); + + if (result.containsKey(term)) { + result.put(term, result.get(term) + totalTermFreq); + } else { + result.put(term, totalTermFreq); + } + bytesRef = termsEnum.next(); + } + } + } ireader.close(); return result; diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java index db6ae9b..2eceac3 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java @@ -38,7 +38,6 @@ import 
org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.TermQuery; import java.io.IOException; -import java.util.Comparator; import java.util.LinkedHashMap; import java.util.Map; import java.util.TreeMap; @@ -73,7 +72,7 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { public Map<String, Long> getTopTerms() throws IOException, ParseException { - DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); Map<String, Long> topWords = new LinkedHashMap<>(); try { @@ -81,7 +80,8 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { QuestionsIndexationService.QUESTION_TITLE_CLOUD_TAG_PROPERTY, QuestionsIndexationService.QUESTION_THEME_CLOUD_TAG_PROPERTY, DocumentsIndexationService.DOCUMENT_NAME_CLOUD_TAG_PROPERTY, - DocumentsIndexationService.DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY + DocumentsIndexationService.DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, + DocumentsIndexationService.DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, }; TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(indexReader, 40, searchedFields, new HighFreqTerms.TotalTermFreqComparator()); diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java index df99838..2d2d154 100644 --- a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java @@ -24,12 +24,6 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ -import java.util.Arrays; -import java.util.Date; -import java.util.List; -import java.util.Locale; -import java.util.Map; - import com.google.common.collect.Sets; 
import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; @@ -43,6 +37,12 @@ import org.junit.Before; import org.junit.Test; import org.nuiton.util.DateUtil; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.Locale; +import java.util.Map; + /** * @author ymartel <martel@codelutin.com> */ @@ -463,12 +463,12 @@ public class QuestionsIndexationServiceTest extends AbstractCoselmarServiceTest questionsIndexationService.indexQuestion(questionTwo); // Ok, let's search now ! - Map<String, Long> topTerms = questionsIndexationService.getTopDocumentsTerms(Arrays.asList(questionOneId)); + Map<String, Long> topTerms = questionsIndexationService.getTopQuestionsTerms(Arrays.asList(questionOneId)); Assert.assertNotNull(topTerms); -// Assert.assertEquals(1, topTerms.get("question").longValue()); -// Assert.assertEquals(2, topTerms.get("tardis").longValue()); -// Assert.assertEquals(2, topTerms.get("time").longValue()); -// Assert.assertEquals(1, topTerms.get("space").longValue()); + Assert.assertEquals(1, topTerms.get("question").longValue()); + Assert.assertEquals(2, topTerms.get("tardis").longValue()); + Assert.assertEquals(2, topTerms.get("time").longValue()); + Assert.assertEquals(1, topTerms.get("space").longValue()); } } diff --git a/pom.xml b/pom.xml index f4bd15e..20fb3bd 100644 --- a/pom.xml +++ b/pom.xml @@ -138,7 +138,7 @@ <postgresqlVersion>9.4.1212.jre7</postgresqlVersion> <h2Version>1.4.190</h2Version> - <luceneVersion>5.4.0</luceneVersion> + <luceneVersion>6.5.1</luceneVersion> <tikaVersion>1.14</tikaVersion> <tomcatEmbedVersion>7.0.50</tomcatEmbedVersion> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.