This is an automated email from the git hooks/post-receive script. New change to branch feature/R7776-cloudTags-in-homepage in repository coselmar. See http://git.codelutin.com/coselmar.git from 70270ab refs-10 #7776 Prepare la recherche des mots les plus pertinents sur les projets new d50fb02 refs-20 #7776 Recherche lucene des mots les plus fréquents dans l'ensemble des projets The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "adds" were already present in the repository and have only been added to this reference. Detailed log of new commits: commit d50fb02e7473a52cedf06c7e37b16a5acc5209b0 Author: Yannick Martel <martel@codelutin.com> Date: Tue Dec 15 16:41:03 2015 +0100 refs-20 #7776 Recherche lucene des mots les plus fréquents dans l'ensemble des projets Summary of changes: .../indexation/DocumentsIndexationService.java | 9 ++- .../indexation/QuestionsIndexationService.java | 51 +++++++++++---- .../lucene/misc/HighFreqTermsMultiFields.java | 75 ++++++++++++++++++++++ .../indexation/QuestionsIndexationServiceTest.java | 6 +- 4 files changed, 126 insertions(+), 15 deletions(-) create mode 100644 coselmar-rest/src/main/java/org/apache/lucene/misc/HighFreqTermsMultiFields.java -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch feature/R7776-cloudTags-in-homepage in repository coselmar. See http://git.codelutin.com/coselmar.git commit d50fb02e7473a52cedf06c7e37b16a5acc5209b0 Author: Yannick Martel <martel@codelutin.com> Date: Tue Dec 15 16:41:03 2015 +0100 refs-20 #7776 Recherche lucene des mots les plus fréquents dans l'ensemble des projets --- .../indexation/DocumentsIndexationService.java | 9 ++- .../indexation/QuestionsIndexationService.java | 51 +++++++++++---- .../lucene/misc/HighFreqTermsMultiFields.java | 75 ++++++++++++++++++++++ .../indexation/QuestionsIndexationServiceTest.java | 6 +- 4 files changed, 126 insertions(+), 15 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index af3b62a..144f4a2 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -66,7 +66,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { protected static final String DOCUMENT_AUTHORS_INDEX_PROPERTY = "documentAuthors"; protected static final String DOCUMENT_SUMMARY_INDEX_PROPERTY = "documentSummary"; protected static final String DOCUMENT_KEYWORD_INDEX_PROPERTY = "documentKeyword"; - protected static final String DOCUMENT_TYPE = "document"; + protected static final String DOCUMENT_TYPE = "documentindextype"; public void indexDocument(DocumentBean document) throws IOException { @@ -237,8 +237,11 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } protected void cleanIndex() throws IOException { - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), 
BooleanClause.Occur.MUST); + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST) + //XXX ymartel 20151215 : Clean older DOCUMENT_TYPE value too (less or equals V1.0.1), should be removed after V2.0 + .add(new TermQuery(new Term("type", "document")), BooleanClause.Occur.SHOULD) + .build(); getLuceneUtils().getIndexWriter().deleteDocuments(query); getLuceneUtils().getIndexWriter().commit(); } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index cc8c0d0..d7fc8fc 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -33,16 +33,14 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermsEnum; import org.apache.lucene.misc.HighFreqTerms; +import org.apache.lucene.misc.HighFreqTermsMultiFields; import org.apache.lucene.misc.TermStats; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MultiTermQueryWrapperFilter; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; @@ -76,7 +74,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_INDEX_PROPERTY = "questionTheme"; protected static final 
String QUESTION_STATUS_INDEX_PROPERTY = "questionStatus"; protected static final String QUESTION_PRIVACY_INDEX_PROPERTY = "questionPrivacy"; - protected static final String DOCUMENT_TYPE = "questionIndexType"; + protected static final String DOCUMENT_TYPE = "questionindextype"; public void indexQuestion(QuestionBean question) throws IOException { @@ -130,7 +128,6 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { if (themes != null) { for (String theme : themes) { doc.add(new Field(QUESTION_THEME_INDEX_PROPERTY, theme, TextField.TYPE_STORED)); - doc.add(new TextField(QUESTION_THEME_INDEX_PROPERTY, theme, Field.Store.YES)); } } @@ -220,9 +217,10 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { public void deleteQuestion(String documentId) throws IOException { // Retrieve document - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, documentId)), BooleanClause.Occur.MUST); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, documentId)), BooleanClause.Occur.MUST) + .add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST) + .build(); getLuceneUtils().getIndexWriter().deleteDocuments(query); getLuceneUtils().getIndexWriter().commit(); @@ -230,21 +228,51 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { } protected void cleanIndex() throws IOException { - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.SHOULD) + //XXX ymartel 20151215 : Clean older DOCUMENT_TYPE value too (less or equals V1.0.1), should be removed after V2.0 + .add(new TermQuery(new 
Term("type", "question")), BooleanClause.Occur.SHOULD) + .build(); getLuceneUtils().getIndexWriter().deleteDocuments(query); getLuceneUtils().getIndexWriter().commit(); } public Map<String, Long> getTopTerms() throws IOException, ParseException { + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + + Map<String, Long> result = new LinkedHashMap<>(); + try { + String[] searchedFields = {QUESTION_TITLE_INDEX_PROPERTY, QUESTION_SUMMARY_INDEX_PROPERTY, QUESTION_THEME_INDEX_PROPERTY}; + TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(indexReader, 20, searchedFields, new HighFreqTerms.TotalTermFreqComparator()); + for (TermStats termStats : highFreqTerms) { + long totalTermFreq = termStats.totalTermFreq; + String value = termStats.termtext.utf8ToString(); + + if (result.containsKey(value)) { + result.put(value, result.get(value) + totalTermFreq); + } else { + result.put(value, totalTermFreq); + } + } + + } catch (Exception e) { + e.printStackTrace(); + } + + indexReader.close(); + return result; + } + + public Map<String, Long> getTopTerms(String questionId) throws IOException, ParseException { + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); IndexSearcher isearcher = new IndexSearcher(ireader); Map<String, Long> result = new LinkedHashMap<>(); try { - TermStats[] highFreqTerms = HighFreqTerms.getHighFreqTerms(ireader, 20, null, new HighFreqTerms.TotalTermFreqComparator()); + TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(ireader, 20, null, new HighFreqTerms.TotalTermFreqComparator()); for (TermStats termStats : highFreqTerms) { long totalTermFreq = termStats.totalTermFreq; String value = termStats.termtext.utf8ToString(); @@ -264,4 +292,5 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } + } diff --git 
a/coselmar-rest/src/main/java/org/apache/lucene/misc/HighFreqTermsMultiFields.java b/coselmar-rest/src/main/java/org/apache/lucene/misc/HighFreqTermsMultiFields.java new file mode 100644 index 0000000..525cc6a --- /dev/null +++ b/coselmar-rest/src/main/java/org/apache/lucene/misc/HighFreqTermsMultiFields.java @@ -0,0 +1,75 @@ +package org.apache.lucene.misc; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; + +import java.util.Comparator; + +/** + * + * <code>HighFreqTermsMultiField</code> class extends {@link HighFreqTerms} to allow extracts the top n most frequent terms + * (by document frequency ) from an existing Lucene index and reports their document frequency for several fields + * + * @see HighFreqTerms + */ +public class HighFreqTermsMultiFields extends HighFreqTerms { + + public static TermStats[] getHighFreqTermsMultiFields(IndexReader reader, int numTerms, String[] fieldNames, Comparator<TermStats> comparator) throws Exception { + + TermStatsQueue tiq = new TermStatsQueue(numTerms, comparator); + TermsEnum te; + + if (fieldNames != null) { + Fields fields = MultiFields.getFields(reader); + for (String field : fieldNames) { + Terms terms = fields.terms(field); + if (terms != null) { + te = terms.iterator(); + tiq.fill(field, te); + } + } + } else { + Fields fields = MultiFields.getFields(reader); + if (fields.size() == 0) { + throw new RuntimeException("no fields found for this index"); + } + for (String fieldName : fields) { + Terms terms = fields.terms(fieldName); + if (terms != null) { + tiq.fill(fieldName, terms.iterator()); + } + } + } + + TermStats[] result = new TermStats[tiq.size()]; + // we want highest first so we read the queue and populate the array + // starting at the end and work backwards + int count = tiq.size() - 1; + while (tiq.size() != 0) { + result[count] = tiq.pop(); + count--; + } + return result; + } + +} diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java index a85dad8..0f377ea 100644 --- 
a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java @@ -420,7 +420,11 @@ public class QuestionsIndexationServiceTest extends AbstractCoselmarServiceTest // Ok, let's search now ! Map<String, Long> topTerms = questionsIndexationService.getTopTerms(); Assert.assertNotNull(topTerms); - Assert.assertEquals(4, topTerms.get("question").longValue()); + Assert.assertEquals(3, topTerms.get("question").longValue()); + Assert.assertEquals(2, topTerms.get("universe").longValue()); + Assert.assertEquals(1, topTerms.get("river").longValue()); + Assert.assertEquals(1, topTerms.get("test").longValue()); + Assert.assertEquals(4, topTerms.get("something").longValue()); } } -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
participants (1)
-
codelutin.com scm