This is an automated email from the git hooks/post-receive script. New change to branch feature/R7776-cloudTags-in-homepage in repository coselmar. See http://git.codelutin.com/coselmar.git at 70270ab refs-10 #7776 Prepare la recherche des mots les plus pertinents sur les projets This branch includes the following new commits: new 70270ab refs-10 #7776 Prepare la recherche des mots les plus pertinents sur les projets The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "adds" were already present in the repository and have only been added to this reference. Detailed log of new commits: commit 70270ab3832eaa95241c4c7b95f764f1af8a8f02 Author: Yannick Martel <martel@codelutin.com> Date: Mon Dec 14 17:56:16 2015 +0100 refs-10 #7776 Prepare la recherche des mots les plus pertinents sur les projets -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch feature/R7776-cloudTags-in-homepage in repository coselmar. See http://git.codelutin.com/coselmar.git commit 70270ab3832eaa95241c4c7b95f764f1af8a8f02 Author: Yannick Martel <martel@©odelutin.com> Date: Mon Dec 14 17:56:16 2015 +0100 refs-10 #7776 Prepare la recherche des mots les plus pertinents sur les projets --- coselmar-rest/pom.xml | 4 + .../coselmar/services/indexation/LuceneUtils.java | 5 +- .../indexation/QuestionsIndexationService.java | 97 +++++++++++++++------- .../indexation/QuestionsIndexationServiceTest.java | 54 ++++++++++++ pom.xml | 8 +- 5 files changed, 136 insertions(+), 32 deletions(-) diff --git a/coselmar-rest/pom.xml b/coselmar-rest/pom.xml index c9bcc0a..e31bfdb 100644 --- a/coselmar-rest/pom.xml +++ b/coselmar-rest/pom.xml @@ -156,6 +156,10 @@ <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-common</artifactId> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-misc</artifactId> + </dependency> <!-- Others --> <dependency> diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java index e668303..c471fac 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java @@ -36,7 +36,6 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.NIOFSDirectory; -import org.apache.lucene.util.Version; /** * @author ymartel <martel@codelutin.com> @@ -46,7 +45,7 @@ public class LuceneUtils { private static final Log log = LogFactory.getLog(LuceneUtils.class); public Analyzer analyzer; - public final IndexWriterConfig indexationConfig = new 
IndexWriterConfig(Version.LUCENE_4_10_2, getAnalyzer()); + public final IndexWriterConfig indexationConfig = new IndexWriterConfig(getAnalyzer()); public IndexWriter indexWriter; protected CoselmarServicesConfig servicesConfig; @@ -68,7 +67,7 @@ public class LuceneUtils { public IndexWriter getIndexWriter() throws IOException { if (indexWriter == null) { File indexDirectory = servicesConfig.getIndexDirectory(); - Directory index = NIOFSDirectory.open(indexDirectory); + Directory index = NIOFSDirectory.open(indexDirectory.toPath()); indexationConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); indexWriter = new IndexWriter(index, indexationConfig); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index 70d2469..cc8c0d0 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -24,11 +24,6 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; @@ -38,15 +33,27 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.misc.HighFreqTerms; +import org.apache.lucene.misc.TermStats; import org.apache.lucene.queryparser.classic.ParseException; import 
org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiTermQueryWrapperFilter; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + /** * This Services provides operation about {@link fr.ifremer.coselmar.persistence.entity.Document} * or more exactly {@link fr.ifremer.coselmar.beans.DocumentBean} indexation : @@ -69,7 +76,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_INDEX_PROPERTY = "questionTheme"; protected static final String QUESTION_STATUS_INDEX_PROPERTY = "questionStatus"; protected static final String QUESTION_PRIVACY_INDEX_PROPERTY = "questionPrivacy"; - protected static final String DOCUMENT_TYPE = "question"; + protected static final String DOCUMENT_TYPE = "questionIndexType"; public void indexQuestion(QuestionBean question) throws IOException { @@ -78,11 +85,12 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, question.getId())), BooleanClause.Occur.MUST); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, question.getId())), BooleanClause.Occur.MUST) + .add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST) + .build(); - ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(query, 1000).scoreDocs; if 
(hits.length > 0) { Document doc = new Document(); @@ -94,7 +102,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { Set<String> themes = question.getThemes(); if (themes != null) { for (String theme : themes) { - doc.add(new Field(QUESTION_THEME_INDEX_PROPERTY, theme, TextField.TYPE_STORED)); + doc.add(new TextField(QUESTION_THEME_INDEX_PROPERTY, theme, Field.Store.YES)); } } @@ -122,6 +130,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { if (themes != null) { for (String theme : themes) { doc.add(new Field(QUESTION_THEME_INDEX_PROPERTY, theme, TextField.TYPE_STORED)); + doc.add(new TextField(QUESTION_THEME_INDEX_PROPERTY, theme, Field.Store.YES)); } } @@ -142,54 +151,59 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { IndexSearcher isearcher = new IndexSearcher(ireader); // Combine that with the type - BooleanQuery fullQuery = new BooleanQuery(); - fullQuery.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); String searchPrivacy = searchBean.getPrivacy(); if(StringUtils.isNotBlank(searchPrivacy)) { - fullQuery.add(new TermQuery(new Term(QUESTION_PRIVACY_INDEX_PROPERTY, searchPrivacy.toLowerCase())), BooleanClause.Occur.MUST); + queryBuilder.add(new TermQuery(new Term(QUESTION_PRIVACY_INDEX_PROPERTY, searchPrivacy.toLowerCase())), BooleanClause.Occur.MUST); } String searchStatus = searchBean.getStatus(); if(StringUtils.isNotBlank(searchStatus)) { - fullQuery.add(new TermQuery(new Term(QUESTION_STATUS_INDEX_PROPERTY, searchStatus.toLowerCase())), BooleanClause.Occur.MUST); + queryBuilder.add(new TermQuery(new Term(QUESTION_STATUS_INDEX_PROPERTY, searchStatus.toLowerCase())), BooleanClause.Occur.MUST); } // Keywords part List<String> keywords = searchBean.getFullTextSearch(); if 
(keywords != null && !keywords.isEmpty()) { - BooleanQuery keywordsQuery = new BooleanQuery(); + BooleanQuery.Builder keywordsQueryBuilder = new BooleanQuery.Builder(); for (String text : keywords) { String[] words = text.replaceAll("[^a-zA-Z ]", "").toLowerCase().split(" "); // Parse a simple query that searches for the "text": - BooleanQuery query = new BooleanQuery(); - BooleanQuery nameQuery = new BooleanQuery(); - BooleanQuery summaryQuery = new BooleanQuery(); + BooleanQuery.Builder nameQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder summaryQueryBuilder = new BooleanQuery.Builder(); for (String word : words) { String wildWord = String.format("*%s*", word.toLowerCase()); - nameQuery.add(new WildcardQuery(new Term(QUESTION_TITLE_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - summaryQuery.add(new WildcardQuery(new Term(QUESTION_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + nameQueryBuilder.add(new WildcardQuery(new Term(QUESTION_TITLE_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + summaryQueryBuilder.add(new WildcardQuery(new Term(QUESTION_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); } - query.add(nameQuery, BooleanClause.Occur.SHOULD); - query.add(summaryQuery, BooleanClause.Occur.SHOULD); + BooleanQuery nameQuery = nameQueryBuilder.build(); + BooleanQuery summaryQuery = summaryQueryBuilder.build(); + - query.add(new TermQuery(new Term(QUESTION_THEME_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); + BooleanQuery query = new BooleanQuery.Builder() + .add(nameQuery, BooleanClause.Occur.SHOULD) + .add(summaryQuery, BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term(QUESTION_THEME_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD) + .build(); - keywordsQuery.add(query, BooleanClause.Occur.MUST); + keywordsQueryBuilder.add(query, BooleanClause.Occur.MUST); } + BooleanQuery keywordsQuery = keywordsQueryBuilder.build(); // add to complete query - 
fullQuery.add(keywordsQuery, BooleanClause.Occur.MUST); + queryBuilder.add(keywordsQuery, BooleanClause.Occur.MUST); } - - ScoreDoc[] hits = isearcher.search(fullQuery, null, 1000).scoreDocs; + BooleanQuery fullQuery = queryBuilder.build(); + ScoreDoc[] hits = isearcher.search(fullQuery, 1000).scoreDocs; List<String> documentIds = new ArrayList(hits.length); @@ -222,5 +236,32 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { getLuceneUtils().getIndexWriter().commit(); } + public Map<String, Long> getTopTerms() throws IOException, ParseException { + + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + + IndexSearcher isearcher = new IndexSearcher(ireader); + + Map<String, Long> result = new LinkedHashMap<>(); + try { + TermStats[] highFreqTerms = HighFreqTerms.getHighFreqTerms(ireader, 20, null, new HighFreqTerms.TotalTermFreqComparator()); + for (TermStats termStats : highFreqTerms) { + long totalTermFreq = termStats.totalTermFreq; + String value = termStats.termtext.utf8ToString(); + + if (result.containsKey(value)) { + result.put(value, result.get(value) + totalTermFreq); + } else { + result.put(value, totalTermFreq); + } + } + + } catch (Exception e) { + e.printStackTrace(); + } + + ireader.close(); + return result; + } } diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java index b71bb04..a85dad8 100644 --- a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java @@ -28,6 +28,7 @@ import java.util.Arrays; import java.util.Date; import java.util.List; import java.util.Locale; +import java.util.Map; import com.google.common.collect.Sets; import 
fr.ifremer.coselmar.beans.QuestionBean; @@ -369,4 +370,57 @@ public class QuestionsIndexationServiceTest extends AbstractCoselmarServiceTest questionsIndexationService.cleanIndex(); } + + @Test + public void testGetTopTerms() throws Exception { + + QuestionBean questionOne = new QuestionBean(); + String questionOneId = "question_1_test_search" + System.currentTimeMillis(); + questionOne.setId(questionOneId); + questionOne.setTitle("Awesome question"); + questionOne.setSummary("Can we, just once, ask about it ?"); + questionOne.setDeadline(DateUtil.createDateAfterToday(1, 0, 1)); + questionOne.setExternalExperts(Sets.newHashSet("Amelia", "Rory", "River")); + questionOne.setSubmissionDate(new Date()); + questionOne.setStatus(Status.OPEN.name()); + questionOne.setPrivacy(Privacy.PUBLIC.name()); + questionOne.setThemes(Sets.newHashSet("TARDIS", "Universe", "Time", "Space")); + + QuestionBean questionTwo = new QuestionBean(); + String questionTwoId = "question_2_test_search" + System.currentTimeMillis(); + questionTwo.setId(questionTwoId); + questionTwo.setTitle("The ultimate"); + questionTwo.setSummary("We need some question"); + questionTwo.setDeadline(DateUtil.createDateAfterToday(16, 0, 0)); + questionTwo.setSubmissionDate(new Date()); + questionTwo.setStatus(Status.OPEN.name()); + questionTwo.setPrivacy(Privacy.PUBLIC.name()); + questionTwo.setThemes(Sets.newHashSet("test", "question")); + + QuestionBean questionThree = new QuestionBean(); + String questionThreeId = "question_3_test_search" + System.currentTimeMillis(); + questionThree.setId(questionThreeId); + questionThree.setTitle("There's someone missing. 
The question's Who?"); + questionThree.setSummary("Something old, Something new, Something borrowed, Something blue."); + questionThree.setDeadline(DateUtil.createDateAfterToday(16, 0, 0)); + questionThree.setSubmissionDate(new Date()); + questionThree.setStatus(Status.OPEN.name()); + questionThree.setPrivacy(Privacy.PRIVATE.name()); + questionThree.setThemes(Sets.newHashSet("big bang two", "Pandorica", "River", "Universe")); + + + CoselmarServicesContext serviceContext = getServiceContext(); + QuestionsIndexationService questionsIndexationService = + serviceContext.newService(QuestionsIndexationService.class); + + questionsIndexationService.indexQuestion(questionOne); + questionsIndexationService.indexQuestion(questionTwo); + questionsIndexationService.indexQuestion(questionThree); + + // Ok, let's search now ! + Map<String, Long> topTerms = questionsIndexationService.getTopTerms(); + Assert.assertNotNull(topTerms); + Assert.assertEquals(4, topTerms.get("question").longValue()); + + } } diff --git a/pom.xml b/pom.xml index 99aef73..f36a085 100644 --- a/pom.xml +++ b/pom.xml @@ -138,7 +138,7 @@ <postgresqlVersion>9.1-901-1.jdbc4</postgresqlVersion> <h2Version>1.4.190</h2Version> - <luceneVersion>4.10.3</luceneVersion> + <luceneVersion>5.4.0</luceneVersion> <tomcatEmbedVersion>7.0.50</tomcatEmbedVersion> @@ -306,6 +306,12 @@ <version>${luceneVersion}</version> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-misc</artifactId> + <version>${luceneVersion}</version> + </dependency> + <!-- Commons --> <dependency> <groupId>org.apache.commons</groupId> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
participants (1)
-
codelutin.com scm