This is an automated email from the git hooks/post-receive script. New change to branch feature/R7776-cloudTags-in-homepage in repository coselmar. See http://git.codelutin.com/coselmar.git from d50fb02 refs-20 #7776 Recherche lucene des mots les plus fréquents dans l'ensemble des projets new 19f5785 refs-40 #7776 mise en place d'un service rest renvoyant les mots les plus courrant de l'application The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "adds" were already present in the repository and have only been added to this reference. Detailed log of new commits: commit 19f578564c7717efe5bbfa816eef048d5c0451cc Author: Yannick Martel <martel@codelutin.com> Date: Wed Dec 16 12:26:32 2015 +0100 refs-40 #7776 mise en place d'un service rest renvoyant les mots les plus courrant de l'application Summary of changes: coselmar-rest/pom.xml | 4 + .../java/fr/ifremer/coselmar/beans/CloudWord.java | 33 ++++++ .../indexation/QuestionsIndexationService.java | 48 ++++++--- .../indexation/TransverseIndexationService.java | 116 +++++++++++++++++++++ .../coselmar/services/v1/GeneralWebService.java | 74 +++++++++++++ coselmar-rest/src/main/resources/mapping | 4 + .../indexation/QuestionsIndexationServiceTest.java | 44 ++++++++ pom.xml | 7 ++ 8 files changed, 315 insertions(+), 15 deletions(-) create mode 100644 coselmar-rest/src/main/java/fr/ifremer/coselmar/beans/CloudWord.java create mode 100644 coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java create mode 100644 coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/GeneralWebService.java -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch feature/R7776-cloudTags-in-homepage in repository coselmar. See http://git.codelutin.com/coselmar.git commit 19f578564c7717efe5bbfa816eef048d5c0451cc Author: Yannick Martel <martel@©odelutin.com> Date: Wed Dec 16 12:26:32 2015 +0100 refs-40 #7776 mise en place d'un service rest renvoyant les mots les plus courrant de l'application --- coselmar-rest/pom.xml | 4 + .../java/fr/ifremer/coselmar/beans/CloudWord.java | 33 ++++++ .../indexation/QuestionsIndexationService.java | 48 ++++++--- .../indexation/TransverseIndexationService.java | 116 +++++++++++++++++++++ .../coselmar/services/v1/GeneralWebService.java | 74 +++++++++++++ coselmar-rest/src/main/resources/mapping | 4 + .../indexation/QuestionsIndexationServiceTest.java | 44 ++++++++ pom.xml | 7 ++ 8 files changed, 315 insertions(+), 15 deletions(-) diff --git a/coselmar-rest/pom.xml b/coselmar-rest/pom.xml index e31bfdb..0c3c03c 100644 --- a/coselmar-rest/pom.xml +++ b/coselmar-rest/pom.xml @@ -160,6 +160,10 @@ <groupId>org.apache.lucene</groupId> <artifactId>lucene-misc</artifactId> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-backward-codecs</artifactId> + </dependency> <!-- Others --> <dependency> diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/beans/CloudWord.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/beans/CloudWord.java new file mode 100644 index 0000000..8e5523f --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/beans/CloudWord.java @@ -0,0 +1,33 @@ +package fr.ifremer.coselmar.beans; + +import java.io.Serializable; + +/** + * @author ymartel (martel@codelutin.com) + */ +public class CloudWord implements Serializable { + + protected String text; + protected long weight; + + public CloudWord(String text, long weight) { + this.text = text; + this.weight = weight; + } + + public String getText() { + return text; + } + + public void 
setText(String text) { + this.text = text; + } + + public long getWeight() { + return weight; + } + + public void setWeight(long weight) { + this.weight = weight; + } +} diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index d7fc8fc..65756af 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -43,6 +43,7 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import java.io.IOException; @@ -264,30 +265,47 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } - public Map<String, Long> getTopTerms(String questionId) throws IOException, ParseException { + public Map<String, Long> getTopDocumentsTerms(List<String> questionIds) throws IOException, ParseException { DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); - IndexSearcher isearcher = new IndexSearcher(ireader); - Map<String, Long> result = new LinkedHashMap<>(); - try { - TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(ireader, 20, null, new HighFreqTerms.TotalTermFreqComparator()); - for (TermStats termStats : highFreqTerms) { - long totalTermFreq = termStats.totalTermFreq; - String value = termStats.termtext.utf8ToString(); + // Combine that with the type + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - if (result.containsKey(value)) { - 
result.put(value, result.get(value) + totalTermFreq); - } else { - result.put(value, totalTermFreq); - } + for (String questionId : questionIds) { + if(StringUtils.isNotBlank(questionId)) { + queryBuilder.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); } + } - } catch (Exception e) { - e.printStackTrace(); + TopDocs hits = isearcher.search(queryBuilder.build(), 100); + ScoreDoc[] scoreDocs = hits.scoreDocs; + System.out.println("hits=" + scoreDocs.length); + System.out.println("Hits (rank,score,docId)"); + for (int n = 0; n < scoreDocs.length; ++n) { + ScoreDoc sd = scoreDocs[n]; + float score = sd.score; + int docId = sd.doc; } +// TopFieldCollector topFieldCollector = TopFieldCollector.create(new Sort(), 100, true, true, false); +// isearcher.search(queryBuilder.build(), topFieldCollector); + + Map<String, Long> result = new LinkedHashMap<>(); +// TopFieldDocs topField = topFieldCollector.topDocs(); +// for (SortField sortField : topField.fields) { +// String field = sortField.getField(); +// long sumDocFreq = ireader.getSumDocFreq(field); +// +// if (result.containsKey(field)) { +// result.put(field, result.get(field) + sumDocFreq); +// } else { +// result.put(field, sumDocFreq); +// } +// } + ireader.close(); return result; } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java new file mode 100644 index 0000000..ab15112 --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java @@ -0,0 +1,116 @@ +package fr.ifremer.coselmar.services.indexation; + +/* + * #%L + * Coselmar :: Rest Services + * $Id:$ + * $HeadURL:$ + * %% + * Copyright (C) 2014 - 2015 Ifremer, Code Lutin + * %% + * This program is free software: you can redistribute it and/or modify + * it under the terms of the 
GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. If not, see + * <http://www.gnu.org/licenses/gpl-3.0.html>. + * #L% + */ + +import fr.ifremer.coselmar.beans.QuestionBean; +import fr.ifremer.coselmar.beans.QuestionSearchBean; +import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; +import org.apache.commons.lang3.StringUtils; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.misc.HighFreqTerms; +import org.apache.lucene.misc.HighFreqTermsMultiFields; +import org.apache.lucene.misc.TermStats; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.WildcardQuery; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * This Services provides operations about indexed Objects. 
+ * <ul> + * <li>cleanning of the indexation db</li> + * <li>top word from the indexation db about specific {@link QuestionBean} and {@link fr.ifremer.coselmar.beans.DocumentBean} attributes</li> + * </ul> + * + * The purpose is to use power of a indexation db (lucene) to increase search on + * document text field, and make easier fulltext search + * + * @author ymartel <martel@codelutin.com> + */ +public class TransverseIndexationService extends CoselmarSimpleServiceSupport { + + protected void cleanAllIndex() throws IOException { + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("type", QuestionsIndexationService.DOCUMENT_TYPE)), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("type", DocumentsIndexationService.DOCUMENT_TYPE)), BooleanClause.Occur.SHOULD) + //XXX ymartel 20151215 : Clean older DOCUMENT_TYPE value too (less or equals V1.0.1), should be removed after V2.0 + .add(new TermQuery(new Term("type", "question")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("type", "document")), BooleanClause.Occur.SHOULD) + .build(); + getLuceneUtils().getIndexWriter().deleteDocuments(query); + getLuceneUtils().getIndexWriter().commit(); + } + + public Map<String, Long> getTopTerms() throws IOException, ParseException { + + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + + Map<String, Long> result = new LinkedHashMap<>(); + try { + String[] searchedFields = { + QuestionsIndexationService.QUESTION_TITLE_INDEX_PROPERTY, + QuestionsIndexationService.QUESTION_SUMMARY_INDEX_PROPERTY, + QuestionsIndexationService.QUESTION_THEME_INDEX_PROPERTY, + DocumentsIndexationService.DOCUMENT_NAME_INDEX_PROPERTY, + DocumentsIndexationService.DOCUMENT_SUMMARY_INDEX_PROPERTY, + DocumentsIndexationService.DOCUMENT_KEYWORD_INDEX_PROPERTY + }; + TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(indexReader, 20, searchedFields, new 
HighFreqTerms.TotalTermFreqComparator()); + for (TermStats termStats : highFreqTerms) { + long totalTermFreq = termStats.totalTermFreq; + String value = termStats.termtext.utf8ToString(); + + if (result.containsKey(value)) { + result.put(value, result.get(value) + totalTermFreq); + } else { + result.put(value, totalTermFreq); + } + } + + } catch (Exception e) { + e.printStackTrace(); + } + + indexReader.close(); + return result; + } + +} diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/GeneralWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/GeneralWebService.java new file mode 100644 index 0000000..cc1eaf4 --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/GeneralWebService.java @@ -0,0 +1,74 @@ +package fr.ifremer.coselmar.services.v1; + +/* + * #%L + * Coselmar :: Rest Services + * $Id:$ + * $HeadURL:$ + * %% + * Copyright (C) 2014 Ifremer, Code Lutin + * %% + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. If not, see + * <http://www.gnu.org/licenses/gpl-3.0.html>. 
+ * #L% + */ + +import com.google.common.collect.Lists; +import fr.ifremer.coselmar.beans.CloudWord; +import fr.ifremer.coselmar.exceptions.CoselmarTechnicalException; +import fr.ifremer.coselmar.persistence.entity.CoselmarUserRole; +import fr.ifremer.coselmar.services.CoselmarWebServiceSupport; +import fr.ifremer.coselmar.services.indexation.TransverseIndexationService; +import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.lucene.queryparser.classic.ParseException; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * @author ymartel <martel@codelutin.com> + */ +public class GeneralWebService extends CoselmarWebServiceSupport { + + private static final Log log = LogFactory.getLog(GeneralWebService.class); + + protected static final List<String> RESTRICTED_ACCESS_USERS = Lists.newArrayList(CoselmarUserRole.CLIENT.name(), CoselmarUserRole.MEMBER.name()); + + public List<CloudWord> getTopWords() { + + TransverseIndexationService questionsIndexationService = getServicesContext().newService(TransverseIndexationService.class); + + Map<String, Long> topTerms = null; + try { + topTerms = questionsIndexationService.getTopTerms(); + } catch (IOException|ParseException e) { + if (log.isErrorEnabled()) { + log.error("Unable to search by lucene, make search directly in database", e); + throw new CoselmarTechnicalException("Unable to get most frequecy words"); + } + } + + List<CloudWord> cloudWords = new ArrayList<>(topTerms.size()); + for (Map.Entry<String, Long> wordFrequency : topTerms.entrySet()) { + CloudWord cloudWord = new CloudWord(wordFrequency.getKey(), wordFrequency.getValue()); + cloudWords.add(cloudWord); + } + + return cloudWords; + } +} diff --git a/coselmar-rest/src/main/resources/mapping b/coselmar-rest/src/main/resources/mapping index e31d444..2209eeb 100644 --- 
a/coselmar-rest/src/main/resources/mapping +++ b/coselmar-rest/src/main/resources/mapping @@ -64,6 +64,10 @@ POST /v1/questions/{questionId}/documents QuestionsWebService.addDocuments POST /v1/questions QuestionsWebService.addQuestion DELETE /v1/questions/{questionId} QuestionsWebService.deleteQuestion +# Transverse Api + +GET /v1/general/topwords GeneralWebService.getTopWords + # Admin API POST /v1/admin/lucene/index AdminWebService.refreshLuceneIndex diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java index 0f377ea..b3c6334 100644 --- a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java @@ -427,4 +427,48 @@ public class QuestionsIndexationServiceTest extends AbstractCoselmarServiceTest Assert.assertEquals(4, topTerms.get("something").longValue()); } + + @Test + public void testGetTopDocumentsTerms() throws Exception { + + QuestionBean questionOne = new QuestionBean(); + String questionOneId = "question_1_test_search" + System.currentTimeMillis(); + questionOne.setId(questionOneId); + questionOne.setTitle("Awesome question"); + questionOne.setSummary("Where is the tardis in time ?"); + questionOne.setDeadline(DateUtil.createDateAfterToday(1, 0, 1)); + questionOne.setExternalExperts(Sets.newHashSet("Amelia", "Rory", "River")); + questionOne.setSubmissionDate(new Date()); + questionOne.setStatus(Status.OPEN.name()); + questionOne.setPrivacy(Privacy.PUBLIC.name()); + questionOne.setThemes(Sets.newHashSet("TARDIS", "Universe", "Time", "Space")); + + QuestionBean questionTwo = new QuestionBean(); + String questionThreeId = "question_3_test_search" + System.currentTimeMillis(); + questionTwo.setId(questionThreeId); + 
questionTwo.setTitle("There's someone missing. The question's Who?"); + questionTwo.setSummary("Something old, Something new, Something borrowed, Something blue."); + questionTwo.setDeadline(DateUtil.createDateAfterToday(16, 0, 0)); + questionTwo.setSubmissionDate(new Date()); + questionTwo.setStatus(Status.OPEN.name()); + questionTwo.setPrivacy(Privacy.PRIVATE.name()); + questionTwo.setThemes(Sets.newHashSet("big bang two", "Pandorica", "River", "Universe")); + + + CoselmarServicesContext serviceContext = getServiceContext(); + QuestionsIndexationService questionsIndexationService = + serviceContext.newService(QuestionsIndexationService.class); + + questionsIndexationService.indexQuestion(questionOne); + questionsIndexationService.indexQuestion(questionTwo); + + // Ok, let's search now ! + Map<String, Long> topTerms = questionsIndexationService.getTopDocumentsTerms(Arrays.asList(questionOneId)); + Assert.assertNotNull(topTerms); +// Assert.assertEquals(1, topTerms.get("question").longValue()); +// Assert.assertEquals(2, topTerms.get("tardis").longValue()); +// Assert.assertEquals(2, topTerms.get("time").longValue()); +// Assert.assertEquals(1, topTerms.get("space").longValue()); + + } } diff --git a/pom.xml b/pom.xml index f36a085..f8b4d12 100644 --- a/pom.xml +++ b/pom.xml @@ -312,6 +312,13 @@ <version>${luceneVersion}</version> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-backward-codecs</artifactId> + <version>${luceneVersion}</version> + <scope>runtime</scope> + </dependency> + <!-- Commons --> <dependency> <groupId>org.apache.commons</groupId> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
participants (1)
-
codelutin.com scm