This is an automated email from the git hooks/post-receive script. New commit to branch feature/R7776-cloudTags-in-homepage in repository coselmar. See http://git.codelutin.com/coselmar.git commit d50fb02e7473a52cedf06c7e37b16a5acc5209b0 Author: Yannick Martel <martel@codelutin.com> Date: Tue Dec 15 16:41:03 2015 +0100 refs-20 #7776 Recherche lucene des mots les plus fréquents dans l'ensemble des projets --- .../indexation/DocumentsIndexationService.java | 9 ++- .../indexation/QuestionsIndexationService.java | 51 +++++++++++---- .../lucene/misc/HighFreqTermsMultiFields.java | 75 ++++++++++++++++++++++ .../indexation/QuestionsIndexationServiceTest.java | 6 +- 4 files changed, 126 insertions(+), 15 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index af3b62a..144f4a2 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -66,7 +66,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { protected static final String DOCUMENT_AUTHORS_INDEX_PROPERTY = "documentAuthors"; protected static final String DOCUMENT_SUMMARY_INDEX_PROPERTY = "documentSummary"; protected static final String DOCUMENT_KEYWORD_INDEX_PROPERTY = "documentKeyword"; - protected static final String DOCUMENT_TYPE = "document"; + protected static final String DOCUMENT_TYPE = "documentindextype"; public void indexDocument(DocumentBean document) throws IOException { @@ -237,8 +237,11 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } protected void cleanIndex() throws IOException { - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), 
BooleanClause.Occur.MUST); + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST) + //XXX ymartel 20151215 : Clean older DOCUMENT_TYPE value too (less or equals V1.0.1), should be removed after V2.0 + .add(new TermQuery(new Term("type", "document")), BooleanClause.Occur.SHOULD) + .build(); getLuceneUtils().getIndexWriter().deleteDocuments(query); getLuceneUtils().getIndexWriter().commit(); } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index cc8c0d0..d7fc8fc 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -33,16 +33,14 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermsEnum; import org.apache.lucene.misc.HighFreqTerms; +import org.apache.lucene.misc.HighFreqTermsMultiFields; import org.apache.lucene.misc.TermStats; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MultiTermQueryWrapperFilter; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; @@ -76,7 +74,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_INDEX_PROPERTY = "questionTheme"; protected static final 
String QUESTION_STATUS_INDEX_PROPERTY = "questionStatus"; protected static final String QUESTION_PRIVACY_INDEX_PROPERTY = "questionPrivacy"; - protected static final String DOCUMENT_TYPE = "questionIndexType"; + protected static final String DOCUMENT_TYPE = "questionindextype"; public void indexQuestion(QuestionBean question) throws IOException { @@ -130,7 +128,6 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { if (themes != null) { for (String theme : themes) { doc.add(new Field(QUESTION_THEME_INDEX_PROPERTY, theme, TextField.TYPE_STORED)); - doc.add(new TextField(QUESTION_THEME_INDEX_PROPERTY, theme, Field.Store.YES)); } } @@ -220,9 +217,10 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { public void deleteQuestion(String documentId) throws IOException { // Retrieve document - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, documentId)), BooleanClause.Occur.MUST); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, documentId)), BooleanClause.Occur.MUST) + .add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST) + .build(); getLuceneUtils().getIndexWriter().deleteDocuments(query); getLuceneUtils().getIndexWriter().commit(); @@ -230,21 +228,51 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { } protected void cleanIndex() throws IOException { - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.SHOULD) + //XXX ymartel 20151215 : Clean older DOCUMENT_TYPE value too (less or equals V1.0.1), should be removed after V2.0 + .add(new TermQuery(new 
Term("type", "question")), BooleanClause.Occur.SHOULD) + .build(); getLuceneUtils().getIndexWriter().deleteDocuments(query); getLuceneUtils().getIndexWriter().commit(); } public Map<String, Long> getTopTerms() throws IOException, ParseException { + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + + Map<String, Long> result = new LinkedHashMap<>(); + try { + String[] searchedFields = {QUESTION_TITLE_INDEX_PROPERTY, QUESTION_SUMMARY_INDEX_PROPERTY, QUESTION_THEME_INDEX_PROPERTY}; + TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(indexReader, 20, searchedFields, new HighFreqTerms.TotalTermFreqComparator()); + for (TermStats termStats : highFreqTerms) { + long totalTermFreq = termStats.totalTermFreq; + String value = termStats.termtext.utf8ToString(); + + if (result.containsKey(value)) { + result.put(value, result.get(value) + totalTermFreq); + } else { + result.put(value, totalTermFreq); + } + } + + } catch (Exception e) { + e.printStackTrace(); + } + + indexReader.close(); + return result; + } + + public Map<String, Long> getTopTerms(String questionId) throws IOException, ParseException { + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); IndexSearcher isearcher = new IndexSearcher(ireader); Map<String, Long> result = new LinkedHashMap<>(); try { - TermStats[] highFreqTerms = HighFreqTerms.getHighFreqTerms(ireader, 20, null, new HighFreqTerms.TotalTermFreqComparator()); + TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(ireader, 20, null, new HighFreqTerms.TotalTermFreqComparator()); for (TermStats termStats : highFreqTerms) { long totalTermFreq = termStats.totalTermFreq; String value = termStats.termtext.utf8ToString(); @@ -264,4 +292,5 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } + } diff --git 
a/coselmar-rest/src/main/java/org/apache/lucene/misc/HighFreqTermsMultiFields.java b/coselmar-rest/src/main/java/org/apache/lucene/misc/HighFreqTermsMultiFields.java new file mode 100644 index 0000000..525cc6a --- /dev/null +++ b/coselmar-rest/src/main/java/org/apache/lucene/misc/HighFreqTermsMultiFields.java @@ -0,0 +1,75 @@ +package org.apache.lucene.misc; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; + +import java.util.Comparator; + +/** + * + * <code>HighFreqTermsMultiField</code> class extends {@link HighFreqTerms} to allow extracts the top n most frequent terms + * (by document frequency ) from an existing Lucene index and reports their document frequency for several fields + * + * @see HighFreqTerms + */ +public class HighFreqTermsMultiFields extends HighFreqTerms { + + public static TermStats[] getHighFreqTermsMultiFields(IndexReader reader, int numTerms, String[] fieldNames, Comparator<TermStats> comparator) throws Exception { + + TermStatsQueue tiq = new TermStatsQueue(numTerms, comparator); + TermsEnum te; + + if (fieldNames != null) { + Fields fields = MultiFields.getFields(reader); + for (String field : fieldNames) { + Terms terms = fields.terms(field); + if (terms != null) { + te = terms.iterator(); + tiq.fill(field, te); + } + } + } else { + Fields fields = MultiFields.getFields(reader); + if (fields.size() == 0) { + throw new RuntimeException("no fields found for this index"); + } + for (String fieldName : fields) { + Terms terms = fields.terms(fieldName); + if (terms != null) { + tiq.fill(fieldName, terms.iterator()); + } + } + } + + TermStats[] result = new TermStats[tiq.size()]; + // we want highest first so we read the queue and populate the array + // starting at the end and work backwards + int count = tiq.size() - 1; + while (tiq.size() != 0) { + result[count] = tiq.pop(); + count--; + } + return result; + } + +} diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java index a85dad8..0f377ea 100644 --- 
a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java @@ -420,7 +420,11 @@ public class QuestionsIndexationServiceTest extends AbstractCoselmarServiceTest // Ok, let's search now ! Map<String, Long> topTerms = questionsIndexationService.getTopTerms(); Assert.assertNotNull(topTerms); - Assert.assertEquals(4, topTerms.get("question").longValue()); + Assert.assertEquals(3, topTerms.get("question").longValue()); + Assert.assertEquals(2, topTerms.get("universe").longValue()); + Assert.assertEquals(1, topTerms.get("river").longValue()); + Assert.assertEquals(1, topTerms.get("test").longValue()); + Assert.assertEquals(4, topTerms.get("something").longValue()); } } -- To stop receiving notification emails like this one, please contact the codelutin.com SCM administrator <admin+scm@codelutin.com>.