This is an automated email from the git hooks/post-receive script. New commit to branch feature/R7776-cloudTags-in-homepage in repository coselmar. See http://git.codelutin.com/coselmar.git commit 19f578564c7717efe5bbfa816eef048d5c0451cc Author: Yannick Martel <martel@codelutin.com> Date: Wed Dec 16 12:26:32 2015 +0100 refs-40 #7776 mise en place d'un service rest renvoyant les mots les plus courants de l'application --- coselmar-rest/pom.xml | 4 + .../java/fr/ifremer/coselmar/beans/CloudWord.java | 33 ++++++ .../indexation/QuestionsIndexationService.java | 48 ++++++--- .../indexation/TransverseIndexationService.java | 116 +++++++++++++++++++++ .../coselmar/services/v1/GeneralWebService.java | 74 +++++++++++++ coselmar-rest/src/main/resources/mapping | 4 + .../indexation/QuestionsIndexationServiceTest.java | 44 ++++++++ pom.xml | 7 ++ 8 files changed, 315 insertions(+), 15 deletions(-) diff --git a/coselmar-rest/pom.xml b/coselmar-rest/pom.xml index e31bfdb..0c3c03c 100644 --- a/coselmar-rest/pom.xml +++ b/coselmar-rest/pom.xml @@ -160,6 +160,10 @@ <groupId>org.apache.lucene</groupId> <artifactId>lucene-misc</artifactId> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-backward-codecs</artifactId> + </dependency> <!-- Others --> <dependency> diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/beans/CloudWord.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/beans/CloudWord.java new file mode 100644 index 0000000..8e5523f --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/beans/CloudWord.java @@ -0,0 +1,33 @@ +package fr.ifremer.coselmar.beans; + +import java.io.Serializable; + +/** + * @author ymartel (martel@codelutin.com) + */ +public class CloudWord implements Serializable { + + protected String text; + protected long weight; + + public CloudWord(String text, long weight) { + this.text = text; + this.weight = weight; + } + + public String getText() { + return text; + } + + public void
setText(String text) { + this.text = text; + } + + public long getWeight() { + return weight; + } + + public void setWeight(long weight) { + this.weight = weight; + } +} diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index d7fc8fc..65756af 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -43,6 +43,7 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import java.io.IOException; @@ -264,30 +265,47 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } - public Map<String, Long> getTopTerms(String questionId) throws IOException, ParseException { + public Map<String, Long> getTopDocumentsTerms(List<String> questionIds) throws IOException, ParseException { DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); - IndexSearcher isearcher = new IndexSearcher(ireader); - Map<String, Long> result = new LinkedHashMap<>(); - try { - TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(ireader, 20, null, new HighFreqTerms.TotalTermFreqComparator()); - for (TermStats termStats : highFreqTerms) { - long totalTermFreq = termStats.totalTermFreq; - String value = termStats.termtext.utf8ToString(); + // Combine that with the type + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - if (result.containsKey(value)) { - 
result.put(value, result.get(value) + totalTermFreq); - } else { - result.put(value, totalTermFreq); - } + for (String questionId : questionIds) { + if(StringUtils.isNotBlank(questionId)) { + queryBuilder.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); } + } - } catch (Exception e) { - e.printStackTrace(); + TopDocs hits = isearcher.search(queryBuilder.build(), 100); + ScoreDoc[] scoreDocs = hits.scoreDocs; + System.out.println("hits=" + scoreDocs.length); + System.out.println("Hits (rank,score,docId)"); + for (int n = 0; n < scoreDocs.length; ++n) { + ScoreDoc sd = scoreDocs[n]; + float score = sd.score; + int docId = sd.doc; } +// TopFieldCollector topFieldCollector = TopFieldCollector.create(new Sort(), 100, true, true, false); +// isearcher.search(queryBuilder.build(), topFieldCollector); + + Map<String, Long> result = new LinkedHashMap<>(); +// TopFieldDocs topField = topFieldCollector.topDocs(); +// for (SortField sortField : topField.fields) { +// String field = sortField.getField(); +// long sumDocFreq = ireader.getSumDocFreq(field); +// +// if (result.containsKey(field)) { +// result.put(field, result.get(field) + sumDocFreq); +// } else { +// result.put(field, sumDocFreq); +// } +// } + ireader.close(); return result; } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java new file mode 100644 index 0000000..ab15112 --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java @@ -0,0 +1,116 @@ +package fr.ifremer.coselmar.services.indexation; + +/* + * #%L + * Coselmar :: Rest Services + * $Id:$ + * $HeadURL:$ + * %% + * Copyright (C) 2014 - 2015 Ifremer, Code Lutin + * %% + * This program is free software: you can redistribute it and/or modify + * it under the terms of the 
GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. If not, see + * <http://www.gnu.org/licenses/gpl-3.0.html>. + * #L% + */ + +import fr.ifremer.coselmar.beans.QuestionBean; +import fr.ifremer.coselmar.beans.QuestionSearchBean; +import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; +import org.apache.commons.lang3.StringUtils; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.misc.HighFreqTerms; +import org.apache.lucene.misc.HighFreqTermsMultiFields; +import org.apache.lucene.misc.TermStats; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.WildcardQuery; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * This Services provides operations about indexed Objects. 
+ * <ul> + * <li>cleanning of the indexation db</li> + * <li>top word from the indexation db about specific {@link QuestionBean} and {@link fr.ifremer.coselmar.beans.DocumentBean} attributes</li> + * </ul> + * + * The purpose is to use power of a indexation db (lucene) to increase search on + * document text field, and make easier fulltext search + * + * @author ymartel <martel@codelutin.com> + */ +public class TransverseIndexationService extends CoselmarSimpleServiceSupport { + + protected void cleanAllIndex() throws IOException { + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("type", QuestionsIndexationService.DOCUMENT_TYPE)), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("type", DocumentsIndexationService.DOCUMENT_TYPE)), BooleanClause.Occur.SHOULD) + //XXX ymartel 20151215 : Clean older DOCUMENT_TYPE value too (less or equals V1.0.1), should be removed after V2.0 + .add(new TermQuery(new Term("type", "question")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("type", "document")), BooleanClause.Occur.SHOULD) + .build(); + getLuceneUtils().getIndexWriter().deleteDocuments(query); + getLuceneUtils().getIndexWriter().commit(); + } + + public Map<String, Long> getTopTerms() throws IOException, ParseException { + + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + + Map<String, Long> result = new LinkedHashMap<>(); + try { + String[] searchedFields = { + QuestionsIndexationService.QUESTION_TITLE_INDEX_PROPERTY, + QuestionsIndexationService.QUESTION_SUMMARY_INDEX_PROPERTY, + QuestionsIndexationService.QUESTION_THEME_INDEX_PROPERTY, + DocumentsIndexationService.DOCUMENT_NAME_INDEX_PROPERTY, + DocumentsIndexationService.DOCUMENT_SUMMARY_INDEX_PROPERTY, + DocumentsIndexationService.DOCUMENT_KEYWORD_INDEX_PROPERTY + }; + TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(indexReader, 20, searchedFields, new 
HighFreqTerms.TotalTermFreqComparator()); + for (TermStats termStats : highFreqTerms) { + long totalTermFreq = termStats.totalTermFreq; + String value = termStats.termtext.utf8ToString(); + + if (result.containsKey(value)) { + result.put(value, result.get(value) + totalTermFreq); + } else { + result.put(value, totalTermFreq); + } + } + + } catch (Exception e) { + e.printStackTrace(); + } + + indexReader.close(); + return result; + } + +} diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/GeneralWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/GeneralWebService.java new file mode 100644 index 0000000..cc1eaf4 --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/GeneralWebService.java @@ -0,0 +1,74 @@ +package fr.ifremer.coselmar.services.v1; + +/* + * #%L + * Coselmar :: Rest Services + * $Id:$ + * $HeadURL:$ + * %% + * Copyright (C) 2014 Ifremer, Code Lutin + * %% + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. If not, see + * <http://www.gnu.org/licenses/gpl-3.0.html>. 
+ * #L% + */ + +import com.google.common.collect.Lists; +import fr.ifremer.coselmar.beans.CloudWord; +import fr.ifremer.coselmar.exceptions.CoselmarTechnicalException; +import fr.ifremer.coselmar.persistence.entity.CoselmarUserRole; +import fr.ifremer.coselmar.services.CoselmarWebServiceSupport; +import fr.ifremer.coselmar.services.indexation.TransverseIndexationService; +import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.lucene.queryparser.classic.ParseException; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * @author ymartel <martel@codelutin.com> + */ +public class GeneralWebService extends CoselmarWebServiceSupport { + + private static final Log log = LogFactory.getLog(GeneralWebService.class); + + protected static final List<String> RESTRICTED_ACCESS_USERS = Lists.newArrayList(CoselmarUserRole.CLIENT.name(), CoselmarUserRole.MEMBER.name()); + + public List<CloudWord> getTopWords() { + + TransverseIndexationService questionsIndexationService = getServicesContext().newService(TransverseIndexationService.class); + + Map<String, Long> topTerms = null; + try { + topTerms = questionsIndexationService.getTopTerms(); + } catch (IOException|ParseException e) { + if (log.isErrorEnabled()) { + log.error("Unable to search by lucene, make search directly in database", e); + throw new CoselmarTechnicalException("Unable to get most frequecy words"); + } + } + + List<CloudWord> cloudWords = new ArrayList<>(topTerms.size()); + for (Map.Entry<String, Long> wordFrequency : topTerms.entrySet()) { + CloudWord cloudWord = new CloudWord(wordFrequency.getKey(), wordFrequency.getValue()); + cloudWords.add(cloudWord); + } + + return cloudWords; + } +} diff --git a/coselmar-rest/src/main/resources/mapping b/coselmar-rest/src/main/resources/mapping index e31d444..2209eeb 100644 --- 
a/coselmar-rest/src/main/resources/mapping +++ b/coselmar-rest/src/main/resources/mapping @@ -64,6 +64,10 @@ POST /v1/questions/{questionId}/documents QuestionsWebService.addDocuments POST /v1/questions QuestionsWebService.addQuestion DELETE /v1/questions/{questionId} QuestionsWebService.deleteQuestion +# Transverse Api + +GET /v1/general/topwords GeneralWebService.getTopWords + # Admin API POST /v1/admin/lucene/index AdminWebService.refreshLuceneIndex diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java index 0f377ea..b3c6334 100644 --- a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java @@ -427,4 +427,48 @@ public class QuestionsIndexationServiceTest extends AbstractCoselmarServiceTest Assert.assertEquals(4, topTerms.get("something").longValue()); } + + @Test + public void testGetTopDocumentsTerms() throws Exception { + + QuestionBean questionOne = new QuestionBean(); + String questionOneId = "question_1_test_search" + System.currentTimeMillis(); + questionOne.setId(questionOneId); + questionOne.setTitle("Awesome question"); + questionOne.setSummary("Where is the tardis in time ?"); + questionOne.setDeadline(DateUtil.createDateAfterToday(1, 0, 1)); + questionOne.setExternalExperts(Sets.newHashSet("Amelia", "Rory", "River")); + questionOne.setSubmissionDate(new Date()); + questionOne.setStatus(Status.OPEN.name()); + questionOne.setPrivacy(Privacy.PUBLIC.name()); + questionOne.setThemes(Sets.newHashSet("TARDIS", "Universe", "Time", "Space")); + + QuestionBean questionTwo = new QuestionBean(); + String questionThreeId = "question_3_test_search" + System.currentTimeMillis(); + questionTwo.setId(questionThreeId); + 
questionTwo.setTitle("There's someone missing. The question's Who?"); + questionTwo.setSummary("Something old, Something new, Something borrowed, Something blue."); + questionTwo.setDeadline(DateUtil.createDateAfterToday(16, 0, 0)); + questionTwo.setSubmissionDate(new Date()); + questionTwo.setStatus(Status.OPEN.name()); + questionTwo.setPrivacy(Privacy.PRIVATE.name()); + questionTwo.setThemes(Sets.newHashSet("big bang two", "Pandorica", "River", "Universe")); + + + CoselmarServicesContext serviceContext = getServiceContext(); + QuestionsIndexationService questionsIndexationService = + serviceContext.newService(QuestionsIndexationService.class); + + questionsIndexationService.indexQuestion(questionOne); + questionsIndexationService.indexQuestion(questionTwo); + + // Ok, let's search now ! + Map<String, Long> topTerms = questionsIndexationService.getTopDocumentsTerms(Arrays.asList(questionOneId)); + Assert.assertNotNull(topTerms); +// Assert.assertEquals(1, topTerms.get("question").longValue()); +// Assert.assertEquals(2, topTerms.get("tardis").longValue()); +// Assert.assertEquals(2, topTerms.get("time").longValue()); +// Assert.assertEquals(1, topTerms.get("space").longValue()); + + } } diff --git a/pom.xml b/pom.xml index f36a085..f8b4d12 100644 --- a/pom.xml +++ b/pom.xml @@ -312,6 +312,13 @@ <version>${luceneVersion}</version> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-backward-codecs</artifactId> + <version>${luceneVersion}</version> + <scope>runtime</scope> + </dependency> + <!-- Commons --> <dependency> <groupId>org.apache.commons</groupId> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.