This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit 8ade892c539efdfa408e81173a98c87ec9d24692 Author: Yannick Martel <martel@codelutin.com> Date: Tue Jun 13 13:24:20 2017 +0200 fixes #9197 add way to exclude http from cloud words --- .../fr/ifremer/coselmar/config/CloudWordUtils.java | 26 ++++++++++++++++++++++ .../persistence/entity/QuestionTopiaDao.java | 6 ++++- .../indexation/DocumentsIndexationService.java | 15 +++++++------ .../indexation/QuestionsIndexationService.java | 15 +++++++------ .../indexation/TransverseIndexationService.java | 6 ++--- 5 files changed, 49 insertions(+), 19 deletions(-) diff --git a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/config/CloudWordUtils.java b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/config/CloudWordUtils.java new file mode 100644 index 0000000..c09a5a6 --- /dev/null +++ b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/config/CloudWordUtils.java @@ -0,0 +1,26 @@ +package fr.ifremer.coselmar.config; + +import com.google.common.base.Function; + +import java.util.Arrays; +import java.util.List; + +/** + * @author ymartel (martel@codelutin.com) + */ +public class CloudWordUtils { + + public static final int CLOUD_TAG_WORD_MIN_SIZE = 3; + public static final List<String> MANUAL_EXCLUDED_TERMS_IN_CLOUD = Arrays.asList("http"); + + public static final boolean isCloudableTerm(String term) { + return term.length() > CLOUD_TAG_WORD_MIN_SIZE && !MANUAL_EXCLUDED_TERMS_IN_CLOUD.contains(term.toLowerCase()); + } + + public static final Function<String, String> SQLIFY_STRING = new Function<String, String>() { + @Override + public String apply(String input) { + return "'" + input + "'"; + } + }; +} diff --git a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java 
b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java index fd95f15..54ef0ae 100644 --- a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java +++ b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java @@ -24,8 +24,11 @@ package fr.ifremer.coselmar.persistence.entity; * #L% */ +import com.google.common.base.Joiner; +import com.google.common.collect.Lists; import fr.ifremer.coselmar.beans.CloudWord; import fr.ifremer.coselmar.beans.QuestionSearchExample; +import fr.ifremer.coselmar.config.CloudWordUtils; import fr.ifremer.coselmar.persistence.DaoUtils; import org.apache.commons.lang3.StringUtils; import org.nuiton.topia.persistence.TopiaQueryBuilderAddCriteriaOrRunQueryStep; @@ -385,7 +388,8 @@ public class QuestionTopiaDao extends AbstractQuestionTopiaDao<Question> { " || to_tsvector(''public.simple_english_conf'', (SELECT string_agg(keywords, '' '') FROM documents) ) " + " || to_tsvector(''public.simple_english_conf'', (SELECT string_agg(fileContent, '' '') FROM documents)) " + " FROM question q where q.topiaid = ''" + questionId + "'' ') " + - " WHERE char_length(word) > 3 " + + " WHERE char_length(word) > " + CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE + + " AND word NOT IN ( " + Joiner.on(',').join(Lists.transform(CloudWordUtils.MANUAL_EXCLUDED_TERMS_IN_CLOUD, CloudWordUtils.SQLIFY_STRING)) + " ) " + " ORDER BY nentry DESC "; } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index ce4770c..137c228 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -25,6 +25,7 @@ package 
fr.ifremer.coselmar.services.indexation; */ import fr.ifremer.coselmar.beans.DocumentBean; +import fr.ifremer.coselmar.config.CloudWordUtils; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.document.Document; @@ -95,10 +96,10 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); // Cloud Tag management - if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (documentName.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } -// if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { +// if (documentSummary.length() >= CloudTagUtils.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); // } @@ -108,7 +109,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(DOCUMENT_KEYWORD_INDEX_PROPERTY, keyword, TextField.TYPE_STORED)); // Cloud Tag management - if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (keyword.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } @@ -252,10 +253,10 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management - if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (documentName.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } 
-// if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { +// if (documentSummary.length() >= CloudTagUtils.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); // } @@ -265,7 +266,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(DOCUMENT_KEYWORD_INDEX_PROPERTY, keyword, TextField.TYPE_STORED)); // Cloud Tag management - if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (keyword.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } @@ -340,7 +341,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String term = bytesRef.utf8ToString().toLowerCase(); long totalTermFreq = termsEnum.totalTermFreq(); - if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (CloudWordUtils.isCloudableTerm(term)) { if (result.containsKey(term)) { result.put(term, result.get(term) + totalTermFreq); } else { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index 98d93b0..1528f37 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -26,6 +26,7 @@ package fr.ifremer.coselmar.services.indexation; import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; +import fr.ifremer.coselmar.config.CloudWordUtils; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.document.Document; @@ 
-108,10 +109,10 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management - if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (questionTitle.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } -// if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { +// if (questionSummary.length() >= CloudTagUTil.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); // } @@ -121,7 +122,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new TextField(QUESTION_THEME_INDEX_PROPERTY, theme, Field.Store.YES)); // Cloud Tag management - if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (theme.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } @@ -148,10 +149,10 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new TextField(QUESTION_PRIVACY_INDEX_PROPERTY, question.getPrivacy(), Field.Store.YES)); // Cloud Tag management - if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (questionTitle.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } -// if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { +// if (questionSummary.length() >= CloudTagUTil.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); // } @@ 
-161,7 +162,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(QUESTION_THEME_INDEX_PROPERTY, theme, TextField.TYPE_STORED)); // Cloud Tag management - if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (theme.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } @@ -333,7 +334,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { String term = bytesRef.utf8ToString().toLowerCase(); long totalTermFreq = termsEnum.totalTermFreq(); - if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (CloudWordUtils.isCloudableTerm(term)) { if (result.containsKey(term)) { result.put(term, result.get(term) + totalTermFreq); } else { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java index 875e957..b3dac78 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java @@ -25,6 +25,7 @@ package fr.ifremer.coselmar.services.indexation; */ import fr.ifremer.coselmar.beans.QuestionBean; +import fr.ifremer.coselmar.config.CloudWordUtils; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import fr.ifremer.coselmar.services.StringLongMapValueComparator; import org.apache.lucene.index.DirectoryReader; @@ -56,8 +57,6 @@ import java.util.TreeMap; */ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { - public static final int CLOUD_TAG_WORD_MIN_SIZE = 3; - protected void cleanAllIndex() throws IOException { BooleanQuery query = new BooleanQuery.Builder() .add(new TermQuery(new 
Term("type", QuestionsIndexationService.DOCUMENT_TYPE)), BooleanClause.Occur.SHOULD) @@ -89,7 +88,7 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { long totalTermFreq = termStats.totalTermFreq; String value = termStats.termtext.utf8ToString(); - if (value.length() > CLOUD_TAG_WORD_MIN_SIZE) { + if (CloudWordUtils.isCloudableTerm(value)) { if (topWords.containsKey(value)) { topWords.put(value, topWords.get(value) + totalTermFreq); @@ -119,5 +118,4 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { } return result; } - } \ No newline at end of file -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.