branch develop updated (092b48d -> 8ade892)
This is an automated email from the git hooks/post-receive script. New change to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git from 092b48d fix third party licenses new 8ade892 fixes #9197 add way to exclude http from cloud words The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "adds" were already present in the repository and have only been added to this reference. Detailed log of new commits: commit 8ade892c539efdfa408e81173a98c87ec9d24692 Author: Yannick Martel <martel@codelutin.com> Date: Tue Jun 13 13:24:20 2017 +0200 fixes #9197 add way to exclude http from cloud words Summary of changes: .../fr/ifremer/coselmar/config/CloudWordUtils.java | 26 ++++++++++++++++++++++ .../persistence/entity/QuestionTopiaDao.java | 6 ++++- .../indexation/DocumentsIndexationService.java | 15 +++++++------ .../indexation/QuestionsIndexationService.java | 15 +++++++------ .../indexation/TransverseIndexationService.java | 6 ++--- 5 files changed, 49 insertions(+), 19 deletions(-) create mode 100644 coselmar-persistence/src/main/java/fr/ifremer/coselmar/config/CloudWordUtils.java -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit 8ade892c539efdfa408e81173a98c87ec9d24692 Author: Yannick Martel <martel@codelutin.com> Date: Tue Jun 13 13:24:20 2017 +0200 fixes #9197 add way to exclude http from cloud words --- .../fr/ifremer/coselmar/config/CloudWordUtils.java | 26 ++++++++++++++++++++++ .../persistence/entity/QuestionTopiaDao.java | 6 ++++- .../indexation/DocumentsIndexationService.java | 15 +++++++------ .../indexation/QuestionsIndexationService.java | 15 +++++++------ .../indexation/TransverseIndexationService.java | 6 ++--- 5 files changed, 49 insertions(+), 19 deletions(-) diff --git a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/config/CloudWordUtils.java b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/config/CloudWordUtils.java new file mode 100644 index 0000000..c09a5a6 --- /dev/null +++ b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/config/CloudWordUtils.java @@ -0,0 +1,26 @@ +package fr.ifremer.coselmar.config; + +import com.google.common.base.Function; + +import java.util.Arrays; +import java.util.List; + +/** + * @author ymartel (martel@codelutin.com) + */ +public class CloudWordUtils { + + public static final int CLOUD_TAG_WORD_MIN_SIZE = 3; + public static final List<String> MANUAL_EXCLUDED_TERMS_IN_CLOUD = Arrays.asList("http"); + + public static final boolean isCloudableTerm(String term) { + return term.length() > CLOUD_TAG_WORD_MIN_SIZE && !MANUAL_EXCLUDED_TERMS_IN_CLOUD.contains(term.toLowerCase()); + } + + public static final Function<String, String> SQLIFY_STRING = new Function<String, String>() { + @Override + public String apply(String input) { + return "'" + input + "'"; + } + }; +} diff --git a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java 
b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java index fd95f15..54ef0ae 100644 --- a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java +++ b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java @@ -24,8 +24,11 @@ package fr.ifremer.coselmar.persistence.entity; * #L% */ +import com.google.common.base.Joiner; +import com.google.common.collect.Lists; import fr.ifremer.coselmar.beans.CloudWord; import fr.ifremer.coselmar.beans.QuestionSearchExample; +import fr.ifremer.coselmar.config.CloudWordUtils; import fr.ifremer.coselmar.persistence.DaoUtils; import org.apache.commons.lang3.StringUtils; import org.nuiton.topia.persistence.TopiaQueryBuilderAddCriteriaOrRunQueryStep; @@ -385,7 +388,8 @@ public class QuestionTopiaDao extends AbstractQuestionTopiaDao<Question> { " || to_tsvector(''public.simple_english_conf'', (SELECT string_agg(keywords, '' '') FROM documents) ) " + " || to_tsvector(''public.simple_english_conf'', (SELECT string_agg(fileContent, '' '') FROM documents)) " + " FROM question q where q.topiaid = ''" + questionId + "'' ') " + - " WHERE char_length(word) > 3 " + + " WHERE char_length(word) > " + CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE + + " AND word NOT IN ( " + Joiner.on(',').join(Lists.transform(CloudWordUtils.MANUAL_EXCLUDED_TERMS_IN_CLOUD, CloudWordUtils.SQLIFY_STRING)) + " ) " + " ORDER BY nentry DESC "; } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index ce4770c..137c228 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -25,6 +25,7 @@ package 
fr.ifremer.coselmar.services.indexation; */ import fr.ifremer.coselmar.beans.DocumentBean; +import fr.ifremer.coselmar.config.CloudWordUtils; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.document.Document; @@ -95,10 +96,10 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); // Cloud Tag management - if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (documentName.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } -// if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { +// if (documentSummary.length() >= CloudTagUtils.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); // } @@ -108,7 +109,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(DOCUMENT_KEYWORD_INDEX_PROPERTY, keyword, TextField.TYPE_STORED)); // Cloud Tag management - if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (keyword.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } @@ -252,10 +253,10 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management - if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (documentName.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } 
-// if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { +// if (documentSummary.length() >= CloudTagUtils.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); // } @@ -265,7 +266,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(DOCUMENT_KEYWORD_INDEX_PROPERTY, keyword, TextField.TYPE_STORED)); // Cloud Tag management - if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (keyword.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } @@ -340,7 +341,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String term = bytesRef.utf8ToString().toLowerCase(); long totalTermFreq = termsEnum.totalTermFreq(); - if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (CloudWordUtils.isCloudableTerm(term)) { if (result.containsKey(term)) { result.put(term, result.get(term) + totalTermFreq); } else { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index 98d93b0..1528f37 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -26,6 +26,7 @@ package fr.ifremer.coselmar.services.indexation; import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; +import fr.ifremer.coselmar.config.CloudWordUtils; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.document.Document; @@ 
-108,10 +109,10 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management - if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (questionTitle.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } -// if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { +// if (questionSummary.length() >= CloudTagUTil.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); // } @@ -121,7 +122,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new TextField(QUESTION_THEME_INDEX_PROPERTY, theme, Field.Store.YES)); // Cloud Tag management - if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (theme.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } @@ -148,10 +149,10 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new TextField(QUESTION_PRIVACY_INDEX_PROPERTY, question.getPrivacy(), Field.Store.YES)); // Cloud Tag management - if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (questionTitle.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } -// if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { +// if (questionSummary.length() >= CloudTagUTil.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); // } @@ 
-161,7 +162,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field(QUESTION_THEME_INDEX_PROPERTY, theme, TextField.TYPE_STORED)); // Cloud Tag management - if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (theme.length() >= CloudWordUtils.CLOUD_TAG_WORD_MIN_SIZE) { doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } @@ -333,7 +334,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { String term = bytesRef.utf8ToString().toLowerCase(); long totalTermFreq = termsEnum.totalTermFreq(); - if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (CloudWordUtils.isCloudableTerm(term)) { if (result.containsKey(term)) { result.put(term, result.get(term) + totalTermFreq); } else { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java index 875e957..b3dac78 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java @@ -25,6 +25,7 @@ package fr.ifremer.coselmar.services.indexation; */ import fr.ifremer.coselmar.beans.QuestionBean; +import fr.ifremer.coselmar.config.CloudWordUtils; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import fr.ifremer.coselmar.services.StringLongMapValueComparator; import org.apache.lucene.index.DirectoryReader; @@ -56,8 +57,6 @@ import java.util.TreeMap; */ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { - public static final int CLOUD_TAG_WORD_MIN_SIZE = 3; - protected void cleanAllIndex() throws IOException { BooleanQuery query = new BooleanQuery.Builder() .add(new TermQuery(new 
Term("type", QuestionsIndexationService.DOCUMENT_TYPE)), BooleanClause.Occur.SHOULD) @@ -89,7 +88,7 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { long totalTermFreq = termStats.totalTermFreq; String value = termStats.termtext.utf8ToString(); - if (value.length() > CLOUD_TAG_WORD_MIN_SIZE) { + if (CloudWordUtils.isCloudableTerm(value)) { if (topWords.containsKey(value)) { topWords.put(value, topWords.get(value) + totalTermFreq); @@ -119,5 +118,4 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { } return result; } - } \ No newline at end of file -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
participants (1)
-
codelutin.com scm