This is an automated email from the git hooks/post-receive script. New commit to branch feature/R7776-cloudTags-in-homepage in repository coselmar. See http://git.codelutin.com/coselmar.git commit 70270ab3832eaa95241c4c7b95f764f1af8a8f02 Author: Yannick Martel <martel@codelutin.com> Date: Mon Dec 14 17:56:16 2015 +0100 refs-10 #7776 Prepare la recherche des mots les plus pertinents sur les projets --- coselmar-rest/pom.xml | 4 + .../coselmar/services/indexation/LuceneUtils.java | 5 +- .../indexation/QuestionsIndexationService.java | 97 +++++++++++++++------- .../indexation/QuestionsIndexationServiceTest.java | 54 ++++++++++++ pom.xml | 8 +- 5 files changed, 136 insertions(+), 32 deletions(-) diff --git a/coselmar-rest/pom.xml b/coselmar-rest/pom.xml index c9bcc0a..e31bfdb 100644 --- a/coselmar-rest/pom.xml +++ b/coselmar-rest/pom.xml @@ -156,6 +156,10 @@ <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-common</artifactId> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-misc</artifactId> + </dependency> <!-- Others --> <dependency> diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java index e668303..c471fac 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java @@ -36,7 +36,6 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.NIOFSDirectory; -import org.apache.lucene.util.Version; /** * @author ymartel <martel@codelutin.com> @@ -46,7 +45,7 @@ public class LuceneUtils { private static final Log log = LogFactory.getLog(LuceneUtils.class); public Analyzer analyzer; - public final IndexWriterConfig indexationConfig = new 
IndexWriterConfig(Version.LUCENE_4_10_2, getAnalyzer()); + public final IndexWriterConfig indexationConfig = new IndexWriterConfig(getAnalyzer()); public IndexWriter indexWriter; protected CoselmarServicesConfig servicesConfig; @@ -68,7 +67,7 @@ public class LuceneUtils { public IndexWriter getIndexWriter() throws IOException { if (indexWriter == null) { File indexDirectory = servicesConfig.getIndexDirectory(); - Directory index = NIOFSDirectory.open(indexDirectory); + Directory index = NIOFSDirectory.open(indexDirectory.toPath()); indexationConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexWriter = new IndexWriter(index, indexationConfig); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index 70d2469..cc8c0d0 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -24,11 +24,6 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; @@ -38,15 +33,27 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.misc.HighFreqTerms; +import org.apache.lucene.misc.TermStats; import org.apache.lucene.queryparser.classic.ParseException; import 
org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiTermQueryWrapperFilter; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + /** * This Services provides operation about {@link fr.ifremer.coselmar.persistence.entity.Document} * or more exactly {@link fr.ifremer.coselmar.beans.DocumentBean} indexation : @@ -69,7 +76,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_INDEX_PROPERTY = "questionTheme"; protected static final String QUESTION_STATUS_INDEX_PROPERTY = "questionStatus"; protected static final String QUESTION_PRIVACY_INDEX_PROPERTY = "questionPrivacy"; - protected static final String DOCUMENT_TYPE = "question"; + protected static final String DOCUMENT_TYPE = "questionIndexType"; public void indexQuestion(QuestionBean question) throws IOException { @@ -78,11 +85,12 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, question.getId())), BooleanClause.Occur.MUST); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, question.getId())), BooleanClause.Occur.MUST) + .add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST) + .build(); - ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(query, 1000).scoreDocs; if 
(hits.length > 0) { Document doc = new Document(); @@ -94,7 +102,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { Set<String> themes = question.getThemes(); if (themes != null) { for (String theme : themes) { - doc.add(new Field(QUESTION_THEME_INDEX_PROPERTY, theme, TextField.TYPE_STORED)); + doc.add(new TextField(QUESTION_THEME_INDEX_PROPERTY, theme, Field.Store.YES)); } } @@ -122,6 +130,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { if (themes != null) { for (String theme : themes) { doc.add(new Field(QUESTION_THEME_INDEX_PROPERTY, theme, TextField.TYPE_STORED)); + doc.add(new TextField(QUESTION_THEME_INDEX_PROPERTY, theme, Field.Store.YES)); } } @@ -142,54 +151,59 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { IndexSearcher isearcher = new IndexSearcher(ireader); // Combine that with the type - BooleanQuery fullQuery = new BooleanQuery(); - fullQuery.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); String searchPrivacy = searchBean.getPrivacy(); if(StringUtils.isNotBlank(searchPrivacy)) { - fullQuery.add(new TermQuery(new Term(QUESTION_PRIVACY_INDEX_PROPERTY, searchPrivacy.toLowerCase())), BooleanClause.Occur.MUST); + queryBuilder.add(new TermQuery(new Term(QUESTION_PRIVACY_INDEX_PROPERTY, searchPrivacy.toLowerCase())), BooleanClause.Occur.MUST); } String searchStatus = searchBean.getStatus(); if(StringUtils.isNotBlank(searchStatus)) { - fullQuery.add(new TermQuery(new Term(QUESTION_STATUS_INDEX_PROPERTY, searchStatus.toLowerCase())), BooleanClause.Occur.MUST); + queryBuilder.add(new TermQuery(new Term(QUESTION_STATUS_INDEX_PROPERTY, searchStatus.toLowerCase())), BooleanClause.Occur.MUST); } // Keywords part List<String> keywords = searchBean.getFullTextSearch(); if 
(keywords != null && !keywords.isEmpty()) { - BooleanQuery keywordsQuery = new BooleanQuery(); + BooleanQuery.Builder keywordsQueryBuilder = new BooleanQuery.Builder(); for (String text : keywords) { String[] words = text.replaceAll("[^a-zA-Z ]", "").toLowerCase().split(" "); // Parse a simple query that searches for the "text": - BooleanQuery query = new BooleanQuery(); - BooleanQuery nameQuery = new BooleanQuery(); - BooleanQuery summaryQuery = new BooleanQuery(); + BooleanQuery.Builder nameQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder summaryQueryBuilder = new BooleanQuery.Builder(); for (String word : words) { String wildWord = String.format("*%s*", word.toLowerCase()); - nameQuery.add(new WildcardQuery(new Term(QUESTION_TITLE_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - summaryQuery.add(new WildcardQuery(new Term(QUESTION_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + nameQueryBuilder.add(new WildcardQuery(new Term(QUESTION_TITLE_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + summaryQueryBuilder.add(new WildcardQuery(new Term(QUESTION_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); } - query.add(nameQuery, BooleanClause.Occur.SHOULD); - query.add(summaryQuery, BooleanClause.Occur.SHOULD); + BooleanQuery nameQuery = nameQueryBuilder.build(); + BooleanQuery summaryQuery = summaryQueryBuilder.build(); + - query.add(new TermQuery(new Term(QUESTION_THEME_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); + BooleanQuery query = new BooleanQuery.Builder() + .add(nameQuery, BooleanClause.Occur.SHOULD) + .add(summaryQuery, BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term(QUESTION_THEME_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD) + .build(); - keywordsQuery.add(query, BooleanClause.Occur.MUST); + keywordsQueryBuilder.add(query, BooleanClause.Occur.MUST); } + BooleanQuery keywordsQuery = keywordsQueryBuilder.build(); // add to complete query - 
fullQuery.add(keywordsQuery, BooleanClause.Occur.MUST); + queryBuilder.add(keywordsQuery, BooleanClause.Occur.MUST); } - - ScoreDoc[] hits = isearcher.search(fullQuery, null, 1000).scoreDocs; + BooleanQuery fullQuery = queryBuilder.build(); + ScoreDoc[] hits = isearcher.search(fullQuery, 1000).scoreDocs; List<String> documentIds = new ArrayList(hits.length); @@ -222,5 +236,32 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { getLuceneUtils().getIndexWriter().commit(); } + public Map<String, Long> getTopTerms() throws IOException, ParseException { + + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + + IndexSearcher isearcher = new IndexSearcher(ireader); + + Map<String, Long> result = new LinkedHashMap<>(); + try { + TermStats[] highFreqTerms = HighFreqTerms.getHighFreqTerms(ireader, 20, null, new HighFreqTerms.TotalTermFreqComparator()); + for (TermStats termStats : highFreqTerms) { + long totalTermFreq = termStats.totalTermFreq; + String value = termStats.termtext.utf8ToString(); + + if (result.containsKey(value)) { + result.put(value, result.get(value) + totalTermFreq); + } else { + result.put(value, totalTermFreq); + } + } + + } catch (Exception e) { + e.printStackTrace(); + } + + ireader.close(); + return result; + } } diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java index b71bb04..a85dad8 100644 --- a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java @@ -28,6 +28,7 @@ import java.util.Arrays; import java.util.Date; import java.util.List; import java.util.Locale; +import java.util.Map; import com.google.common.collect.Sets; import 
fr.ifremer.coselmar.beans.QuestionBean; @@ -369,4 +370,57 @@ public class QuestionsIndexationServiceTest extends AbstractCoselmarServiceTest questionsIndexationService.cleanIndex(); } + + @Test + public void testGetTopTerms() throws Exception { + + QuestionBean questionOne = new QuestionBean(); + String questionOneId = "question_1_test_search" + System.currentTimeMillis(); + questionOne.setId(questionOneId); + questionOne.setTitle("Awesome question"); + questionOne.setSummary("Can we, just once, ask about it ?"); + questionOne.setDeadline(DateUtil.createDateAfterToday(1, 0, 1)); + questionOne.setExternalExperts(Sets.newHashSet("Amelia", "Rory", "River")); + questionOne.setSubmissionDate(new Date()); + questionOne.setStatus(Status.OPEN.name()); + questionOne.setPrivacy(Privacy.PUBLIC.name()); + questionOne.setThemes(Sets.newHashSet("TARDIS", "Universe", "Time", "Space")); + + QuestionBean questionTwo = new QuestionBean(); + String questionTwoId = "question_2_test_search" + System.currentTimeMillis(); + questionTwo.setId(questionTwoId); + questionTwo.setTitle("The ultimate"); + questionTwo.setSummary("We need some question"); + questionTwo.setDeadline(DateUtil.createDateAfterToday(16, 0, 0)); + questionTwo.setSubmissionDate(new Date()); + questionTwo.setStatus(Status.OPEN.name()); + questionTwo.setPrivacy(Privacy.PUBLIC.name()); + questionTwo.setThemes(Sets.newHashSet("test", "question")); + + QuestionBean questionThree = new QuestionBean(); + String questionThreeId = "question_3_test_search" + System.currentTimeMillis(); + questionThree.setId(questionThreeId); + questionThree.setTitle("There's someone missing. 
The question's Who?"); + questionThree.setSummary("Something old, Something new, Something borrowed, Something blue."); + questionThree.setDeadline(DateUtil.createDateAfterToday(16, 0, 0)); + questionThree.setSubmissionDate(new Date()); + questionThree.setStatus(Status.OPEN.name()); + questionThree.setPrivacy(Privacy.PRIVATE.name()); + questionThree.setThemes(Sets.newHashSet("big bang two", "Pandorica", "River", "Universe")); + + + CoselmarServicesContext serviceContext = getServiceContext(); + QuestionsIndexationService questionsIndexationService = + serviceContext.newService(QuestionsIndexationService.class); + + questionsIndexationService.indexQuestion(questionOne); + questionsIndexationService.indexQuestion(questionTwo); + questionsIndexationService.indexQuestion(questionThree); + + // Ok, let's search now ! + Map<String, Long> topTerms = questionsIndexationService.getTopTerms(); + Assert.assertNotNull(topTerms); + Assert.assertEquals(4, topTerms.get("question").longValue()); + + } } diff --git a/pom.xml b/pom.xml index 99aef73..f36a085 100644 --- a/pom.xml +++ b/pom.xml @@ -138,7 +138,7 @@ <postgresqlVersion>9.1-901-1.jdbc4</postgresqlVersion> <h2Version>1.4.190</h2Version> - <luceneVersion>4.10.3</luceneVersion> + <luceneVersion>5.4.0</luceneVersion> <tomcatEmbedVersion>7.0.50</tomcatEmbedVersion> @@ -306,6 +306,12 @@ <version>${luceneVersion}</version> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-misc</artifactId> + <version>${luceneVersion}</version> + </dependency> + <!-- Commons --> <dependency> <groupId>org.apache.commons</groupId> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.