This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit 1e43d560da4e23e5d45163f59ae9f9573e936ab5 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 18:05:10 2017 +0200 refs #9197 use lucene to make cloud tag on question page : data from question and its documents including file content --- .../indexation/DocumentsIndexationService.java | 56 ++++++++++++++ .../indexation/QuestionsIndexationService.java | 15 ++-- .../coselmar/services/v1/QuestionsWebService.java | 85 ++++++++++++++++------ 3 files changed, 127 insertions(+), 29 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index e218577..224311d 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -34,20 +34,27 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; import org.apache.tika.exception.TikaException; import java.io.File; import 
java.io.IOException; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Set; /** @@ -329,4 +336,53 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } + public Map<String, Long> getTopDocumentsTerms(List<String> questionIds) throws IOException { + + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); + IndexSearcher isearcher = new IndexSearcher(ireader); + + // Combine that with the type + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + + BooleanQuery.Builder questionIdBuilder = new BooleanQuery.Builder(); + for (String questionId : questionIds) { + if(StringUtils.isNotBlank(questionId)) { + questionIdBuilder.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); + } + } + queryBuilder.add(questionIdBuilder.build(), BooleanClause.Occur.MUST); + + TopDocs hits = isearcher.search(queryBuilder.build(), 100); + ScoreDoc[] scoreDocs = hits.scoreDocs; + + Map<String, Long> result = new LinkedHashMap<>(); + + for (ScoreDoc scoreDoc : scoreDocs) { + Fields termVectors = ireader.getTermVectors(scoreDoc.doc); + for (String termVector : termVectors) { + Terms vector = ireader.getTermVector(scoreDoc.doc, termVector); + TermsEnum termsEnum = vector.iterator(); + BytesRef bytesRef = termsEnum.next(); + while(bytesRef != null){ + String term = bytesRef.utf8ToString().toLowerCase(); + long totalTermFreq = termsEnum.totalTermFreq(); + + if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (result.containsKey(term)) { + result.put(term, result.get(term) + totalTermFreq); + } else { + result.put(term, totalTermFreq); + } + } + bytesRef = termsEnum.next(); + } + } + } + + ireader.close(); + return result; + } + + } diff --git 
a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index f60ce70..32bb35b 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -302,7 +302,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } - public Map<String, Long> getTopQuestionsTerms(List<String> questionIds) throws IOException, ParseException { + public Map<String, Long> getTopQuestionsTerms(List<String> questionIds) throws IOException { DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); @@ -327,18 +327,19 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { for (ScoreDoc scoreDoc : scoreDocs) { Fields termVectors = ireader.getTermVectors(scoreDoc.doc); for (String termVector : termVectors) { - System.out.println("Vector: " + termVector); Terms vector = ireader.getTermVector(scoreDoc.doc, termVector); TermsEnum termsEnum = vector.iterator(); BytesRef bytesRef = termsEnum.next(); while(bytesRef != null){ - String term = bytesRef.utf8ToString(); + String term = bytesRef.utf8ToString().toLowerCase(); long totalTermFreq = termsEnum.totalTermFreq(); - if (result.containsKey(term)) { - result.put(term, result.get(term) + totalTermFreq); - } else { - result.put(term, totalTermFreq); + if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (result.containsKey(term)) { + result.put(term, result.get(term) + totalTermFreq); + } else { + result.put(term, totalTermFreq); + } } bytesRef = termsEnum.next(); } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java index 384aa7b..e7a9d48 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java @@ -24,15 +24,6 @@ package fr.ifremer.coselmar.services.v1; * #L% */ -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Date; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.collect.Collections2; @@ -65,6 +56,7 @@ import fr.ifremer.coselmar.services.CoselmarWebServiceSupport; import fr.ifremer.coselmar.services.errors.InvalidCredentialException; import fr.ifremer.coselmar.services.errors.NoResultException; import fr.ifremer.coselmar.services.errors.UnauthorizedException; +import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -79,6 +71,16 @@ import org.nuiton.util.DateUtil; import org.nuiton.util.pagination.PaginationParameter; import org.nuiton.util.pagination.PaginationResult; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + /** * @author ymartel <martel@codelutin.com> */ @@ -1069,25 +1071,52 @@ public class QuestionsWebService extends CoselmarWebServiceSupport { // Check authentication String authorization = getContext().getHeader("Authorization"); - UserWebToken userWebToken = checkAuthentication(authorization); + CoselmarUser user = checkUserAuthentication(authorization); - // 
Check current user - String fullCurrentUserId = getFullUserIdFromShort(userWebToken.getUserId()); - getCoselmarUserDao().forTopiaIdEquals(fullCurrentUserId).findAny(); + // Retrieve Question + String fullQuestionId = getFullIdFromShort(Question.class, questionId); + Question question = getQuestionDao().forTopiaIdEquals(fullQuestionId).findUnique(); - List<CloudWord> topWords; - if (getCoselmarServicesConfig().isPostgresqlDatabase()) { + List<CloudWord> topWords = new ArrayList<>(); +// if (getCoselmarServicesConfig().isPostgresqlDatabase()) { +// try { +// topWords = getQuestionDao().findTopWords(getFullIdFromShort(Question.class, questionId)); +// } catch (TopiaNoResultException e) { +// if (log.isErrorEnabled()) { +// log.error("Try to find top words for non existing questionId" + questionId, e); +// } +// throw new NoResultException("Question does not exist"); +// } +// } else { +// topWords = Collections.EMPTY_LIST; + + QuestionsIndexationService questionsIndexationService = getServicesContext().newService(QuestionsIndexationService.class); + DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - topWords = getQuestionDao().findTopWords(getFullIdFromShort(Question.class, questionId)); - } catch (TopiaNoResultException e) { + Map<String, Long> topQuestionsTerms = questionsIndexationService.getTopQuestionsTerms(Lists.newArrayList(questionId)); + List<String> shortDocumentIds = getShortDocumentIds(question); + Map<String, Long> topDocumentsTerms = documentsIndexationService.getTopDocumentsTerms(shortDocumentIds); + for (Map.Entry<String, Long> documentTermFreq : topDocumentsTerms.entrySet()) { + String term = documentTermFreq.getKey(); + Long frequence = documentTermFreq.getValue(); + if (topQuestionsTerms.containsKey(term)) { + } else { + topQuestionsTerms.put(term, frequence); + } + } + + for (Map.Entry<String, Long> termFreq : topQuestionsTerms.entrySet()) { + String term = 
termFreq.getKey(); + CloudWord cloudWord = new CloudWord(term, termFreq.getValue()); + topWords.add(cloudWord); + } + + } catch (IOException e) { if (log.isErrorEnabled()) { - log.error("Try to find top words for non existing questionId" + questionId, e); + log.error("Unable to index new question", e); } - throw new NoResultException("Question does not exist"); } - } else { - topWords = Collections.EMPTY_LIST; - } +// } return topWords; } @@ -1542,4 +1571,16 @@ public class QuestionsWebService extends CoselmarWebServiceSupport { return result; } + protected List<String> getShortDocumentIds(Question question) { + List<String> shortDocumentIds = new ArrayList<>(); + for (String relatedDocumentId : question.getRelatedDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(relatedDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + for (String closingDocumentId : question.getClosingDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(closingDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + return shortDocumentIds; + } } -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.