This is an automated email from the git hooks/post-receive script. New change to branch feature/9197-Indexation_documents in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git from 8517e8f refs #9197 Utilisation de Apache Tika pour indexer les documents new f90edbe Upgrade lucene + revue de l'indexation des données de Question new c616ba1 refs #9197 use vectors on document fields indexation new 1e43d56 refs #9197 use lucene to make cloud tag on question page : data from question and its documents including file content The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "adds" were already present in the repository and have only been added to this reference. Detailed log of new commits: commit 1e43d560da4e23e5d45163f59ae9f9573e936ab5 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 18:05:10 2017 +0200 refs #9197 use lucene to make cloud tag on question page : data from question and its documents including file content commit c616ba1fb803d57b19b2c1dd0c5f1e0e62108b2b Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 16:08:11 2017 +0200 refs #9197 use vectors on document fields indexation commit f90edbef1de138249656cf0f778863fe2ffeb654 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 11:16:32 2017 +0200 Upgrade lucene + revue de l'indexation des données de Question Summary of changes: .../indexation/DocumentsIndexationService.java | 96 +++++++++++++++++----- .../coselmar/services/indexation/LuceneUtils.java | 15 ++++ .../indexation/QuestionsIndexationService.java | 76 +++++++++-------- .../indexation/TransverseIndexationService.java | 6 +- .../coselmar/services/v1/QuestionsWebService.java | 85 ++++++++++++++----- .../indexation/QuestionsIndexationServiceTest.java | 22 ++--- pom.xml | 2 +- 7 files changed, 212 insertions(+), 90 deletions(-) -- To stop receiving notification emails like this one, please contact 
codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch feature/9197-Indexation_documents in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit f90edbef1de138249656cf0f778863fe2ffeb654 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 11:16:32 2017 +0200 Upgrade lucene + revue de l'indexation des données de Question --- .../indexation/DocumentsIndexationService.java | 22 +++--- .../indexation/QuestionsIndexationService.java | 88 +++++++++++++--------- .../indexation/TransverseIndexationService.java | 6 +- .../indexation/QuestionsIndexationServiceTest.java | 22 +++--- pom.xml | 2 +- 5 files changed, 81 insertions(+), 59 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 92402fb..87c4b68 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -24,12 +24,6 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - import fr.ifremer.coselmar.beans.DocumentBean; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; @@ -50,6 +44,12 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.tika.exception.TikaException; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + /** * This Services provides operation about {@link fr.ifremer.coselmar.persistence.entity.Document} * or more exactly {@link fr.ifremer.coselmar.beans.DocumentBean} indexation 
: @@ -134,7 +134,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } public List<String> searchDocuments(String text) throws IOException, ParseException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); String[] words = text.split(" "); @@ -183,7 +183,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } public List<String> searchDocuments(List<String> texts) throws IOException, ParseException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); @@ -240,7 +240,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } public void updateDocument(DocumentBean document, String filepath) throws IOException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document @@ -287,7 +287,9 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { try { File documentFile = new File(filepath); String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + if (StringUtils.isNotBlank(parsedDocumentFile)) { + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + } } catch (TikaException te) { if (log.isErrorEnabled()) { String message = String.format("Unable to index document '%s'", filepath); diff --git 
a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index 2c97773..b5e92d3 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -26,15 +26,19 @@ package fr.ifremer.coselmar.services.indexation; import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; -import fr.ifremer.coselmar.beans.QuestionSearchExample; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.misc.HighFreqTerms; import org.apache.lucene.misc.HighFreqTermsMultiFields; import org.apache.lucene.misc.TermStats; @@ -46,6 +50,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; import java.io.IOException; import java.util.ArrayList; @@ -81,10 +86,23 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_CLOUD_TAG_PROPERTY = "questionCloudTagTheme"; protected static final String DOCUMENT_TYPE = "questionindextype"; + public static final FieldType TYPE_STORED = new 
FieldType(); + static { + TYPE_STORED.setOmitNorms(true); + TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + TYPE_STORED.setStored(true); + TYPE_STORED.setStoreTermVectors(true); + TYPE_STORED.setStoreTermVectorPositions(true); + TYPE_STORED.setStoreTermVectorOffsets(true); + TYPE_STORED.setStoreTermVectorPayloads(true); + TYPE_STORED.setTokenized(true); + TYPE_STORED.freeze(); + } + public void indexQuestion(QuestionBean question) throws IOException { // First : try to find if already exist to update it - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document @@ -102,11 +120,11 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new TextField(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, Field.Store.YES)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -119,7 +137,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new 
Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); } } } @@ -139,14 +157,14 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new TextField(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, Field.Store.YES)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); doc.add(new TextField(QUESTION_STATUS_INDEX_PROPERTY, question.getStatus(), Field.Store.YES)); doc.add(new TextField(QUESTION_PRIVACY_INDEX_PROPERTY, question.getPrivacy(), Field.Store.YES)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -159,7 +177,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); } } } @@ -177,7 +195,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { } public List<String> searchQuestion(QuestionSearchBean searchBean) throws IOException, ParseException { - DirectoryReader 
ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Combine that with the type @@ -272,7 +290,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { public Map<String, Long> getTopTerms() throws IOException, ParseException { - DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); Map<String, Long> result = new LinkedHashMap<>(); try { @@ -297,46 +315,48 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } - public Map<String, Long> getTopDocumentsTerms(List<String> questionIds) throws IOException, ParseException { + public Map<String, Long> getTopQuestionsTerms(List<String> questionIds) throws IOException, ParseException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Combine that with the type BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder questionIdBuilder = new BooleanQuery.Builder(); for (String questionId : questionIds) { if(StringUtils.isNotBlank(questionId)) { - queryBuilder.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); + questionIdBuilder.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); } } + queryBuilder.add(questionIdBuilder.build(), BooleanClause.Occur.MUST); TopDocs hits = isearcher.search(queryBuilder.build(), 100); ScoreDoc[] scoreDocs = 
hits.scoreDocs; - System.out.println("hits=" + scoreDocs.length); - System.out.println("Hits (rank,score,docId)"); - for (int n = 0; n < scoreDocs.length; ++n) { - ScoreDoc sd = scoreDocs[n]; - float score = sd.score; - int docId = sd.doc; - } - -// TopFieldCollector topFieldCollector = TopFieldCollector.create(new Sort(), 100, true, true, false); -// isearcher.search(queryBuilder.build(), topFieldCollector); Map<String, Long> result = new LinkedHashMap<>(); -// TopFieldDocs topField = topFieldCollector.topDocs(); -// for (SortField sortField : topField.fields) { -// String field = sortField.getField(); -// long sumDocFreq = ireader.getSumDocFreq(field); -// -// if (result.containsKey(field)) { -// result.put(field, result.get(field) + sumDocFreq); -// } else { -// result.put(field, sumDocFreq); -// } -// } + + for (ScoreDoc scoreDoc : scoreDocs) { + Fields termVectors = ireader.getTermVectors(scoreDoc.doc); + for (String termVector : termVectors) { + System.out.println("Vector: " + termVector); + Terms vector = ireader.getTermVector(scoreDoc.doc, termVector); + TermsEnum termsEnum = vector.iterator(); + BytesRef bytesRef = termsEnum.next(); + while(bytesRef != null){ + String term = bytesRef.utf8ToString(); + long totalTermFreq = termsEnum.totalTermFreq(); + + if (result.containsKey(term)) { + result.put(term, result.get(term) + totalTermFreq); + } else { + result.put(term, totalTermFreq); + } + bytesRef = termsEnum.next(); + } + } + } ireader.close(); return result; diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java index db6ae9b..2eceac3 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java @@ -38,7 +38,6 @@ import 
org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.TermQuery; import java.io.IOException; -import java.util.Comparator; import java.util.LinkedHashMap; import java.util.Map; import java.util.TreeMap; @@ -73,7 +72,7 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { public Map<String, Long> getTopTerms() throws IOException, ParseException { - DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); Map<String, Long> topWords = new LinkedHashMap<>(); try { @@ -81,7 +80,8 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { QuestionsIndexationService.QUESTION_TITLE_CLOUD_TAG_PROPERTY, QuestionsIndexationService.QUESTION_THEME_CLOUD_TAG_PROPERTY, DocumentsIndexationService.DOCUMENT_NAME_CLOUD_TAG_PROPERTY, - DocumentsIndexationService.DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY + DocumentsIndexationService.DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, + DocumentsIndexationService.DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, }; TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(indexReader, 40, searchedFields, new HighFreqTerms.TotalTermFreqComparator()); diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java index df99838..2d2d154 100644 --- a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java @@ -24,12 +24,6 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ -import java.util.Arrays; -import java.util.Date; -import java.util.List; -import java.util.Locale; -import java.util.Map; - import com.google.common.collect.Sets; 
import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; @@ -43,6 +37,12 @@ import org.junit.Before; import org.junit.Test; import org.nuiton.util.DateUtil; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.Locale; +import java.util.Map; + /** * @author ymartel <martel@codelutin.com> */ @@ -463,12 +463,12 @@ public class QuestionsIndexationServiceTest extends AbstractCoselmarServiceTest questionsIndexationService.indexQuestion(questionTwo); // Ok, let's search now ! - Map<String, Long> topTerms = questionsIndexationService.getTopDocumentsTerms(Arrays.asList(questionOneId)); + Map<String, Long> topTerms = questionsIndexationService.getTopQuestionsTerms(Arrays.asList(questionOneId)); Assert.assertNotNull(topTerms); -// Assert.assertEquals(1, topTerms.get("question").longValue()); -// Assert.assertEquals(2, topTerms.get("tardis").longValue()); -// Assert.assertEquals(2, topTerms.get("time").longValue()); -// Assert.assertEquals(1, topTerms.get("space").longValue()); + Assert.assertEquals(1, topTerms.get("question").longValue()); + Assert.assertEquals(2, topTerms.get("tardis").longValue()); + Assert.assertEquals(2, topTerms.get("time").longValue()); + Assert.assertEquals(1, topTerms.get("space").longValue()); } } diff --git a/pom.xml b/pom.xml index f4bd15e..20fb3bd 100644 --- a/pom.xml +++ b/pom.xml @@ -138,7 +138,7 @@ <postgresqlVersion>9.4.1212.jre7</postgresqlVersion> <h2Version>1.4.190</h2Version> - <luceneVersion>5.4.0</luceneVersion> + <luceneVersion>6.5.1</luceneVersion> <tikaVersion>1.14</tikaVersion> <tomcatEmbedVersion>7.0.50</tomcatEmbedVersion> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch feature/9197-Indexation_documents in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit c616ba1fb803d57b19b2c1dd0c5f1e0e62108b2b Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 16:08:11 2017 +0200 refs #9197 use vectors on document fields indexation --- .../indexation/DocumentsIndexationService.java | 20 ++++++++--------- .../coselmar/services/indexation/LuceneUtils.java | 15 +++++++++++++ .../indexation/QuestionsIndexationService.java | 25 ++++++---------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 87c4b68..e218577 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -87,16 +87,16 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String documentName = document.getName(); String documentSummary = document.getSummary(); - doc.add(new TextField(DOCUMENT_NAME_INDEX_PROPERTY, documentName, Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_INDEX_PROPERTY, documentName, LuceneUtils.TYPE_STORED)); if (StringUtils.isNotBlank(document.getAuthors())) { doc.add(new TextField(DOCUMENT_AUTHORS_INDEX_PROPERTY, document.getAuthors(), Field.Store.YES)); } - doc.add(new TextField(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, Field.Store.YES)); + doc.add(new Field(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, LuceneUtils.TYPE_STORED)); doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); // Cloud Tag management if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - 
doc.add(new TextField(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); @@ -109,7 +109,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } @@ -118,7 +118,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { try { File documentFile = new File(filepath); String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); } catch (TikaException te) { if (log.isErrorEnabled()) { String message = String.format("Unable to index document '%s'", filepath); @@ -255,15 +255,15 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String documentName = document.getName(); String documentSummary = document.getSummary(); - doc.add(new TextField(DOCUMENT_NAME_INDEX_PROPERTY, documentName, Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_INDEX_PROPERTY, documentName, LuceneUtils.TYPE_STORED)); if (StringUtils.isNotBlank(document.getAuthors())) { doc.add(new TextField(DOCUMENT_AUTHORS_INDEX_PROPERTY, document.getAuthors(), Field.Store.YES)); } - doc.add(new TextField(DOCUMENT_SUMMARY_INDEX_PROPERTY, 
documentSummary, Field.Store.YES)); + doc.add(new Field(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); @@ -276,7 +276,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } @@ -288,7 +288,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { File documentFile = new File(filepath); String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); if (StringUtils.isNotBlank(parsedDocumentFile)) { - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); } } catch (TikaException te) { if (log.isErrorEnabled()) { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java index 43a3c43..26224cf 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java +++ 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java @@ -33,6 +33,8 @@ import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; @@ -51,6 +53,19 @@ public class LuceneUtils { public IndexWriter indexWriter; protected Tika tika; + public static final FieldType TYPE_STORED = new FieldType(); + static { + TYPE_STORED.setOmitNorms(true); + TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + TYPE_STORED.setStored(true); + TYPE_STORED.setStoreTermVectors(true); + TYPE_STORED.setStoreTermVectorPositions(true); + TYPE_STORED.setStoreTermVectorOffsets(true); + TYPE_STORED.setStoreTermVectorPayloads(true); + TYPE_STORED.setTokenized(true); + TYPE_STORED.freeze(); + } + protected CoselmarServicesConfig servicesConfig; public LuceneUtils(CoselmarServicesConfig servicesConfig) { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index b5e92d3..f60ce70 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -86,19 +86,6 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_CLOUD_TAG_PROPERTY = "questionCloudTagTheme"; protected static final String DOCUMENT_TYPE = "questionindextype"; - public static final FieldType TYPE_STORED = new FieldType(); - 
static { - TYPE_STORED.setOmitNorms(true); - TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - TYPE_STORED.setStored(true); - TYPE_STORED.setStoreTermVectors(true); - TYPE_STORED.setStoreTermVectorPositions(true); - TYPE_STORED.setStoreTermVectorOffsets(true); - TYPE_STORED.setStoreTermVectorPayloads(true); - TYPE_STORED.setTokenized(true); - TYPE_STORED.freeze(); - } - public void indexQuestion(QuestionBean question) throws IOException { // First : try to find if already exist to update it @@ -120,11 +107,11 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -137,7 +124,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } @@ -157,14 +144,14 @@ public class QuestionsIndexationService extends 
CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, LuceneUtils.TYPE_STORED)); doc.add(new TextField(QUESTION_STATUS_INDEX_PROPERTY, question.getStatus(), Field.Store.YES)); doc.add(new TextField(QUESTION_PRIVACY_INDEX_PROPERTY, question.getPrivacy(), Field.Store.YES)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -177,7 +164,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch feature/9197-Indexation_documents in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit 1e43d560da4e23e5d45163f59ae9f9573e936ab5 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 18:05:10 2017 +0200 refs #9197 use lucene to make cloud tag on question page : data from question and its documents including file content --- .../indexation/DocumentsIndexationService.java | 56 ++++++++++++++ .../indexation/QuestionsIndexationService.java | 15 ++-- .../coselmar/services/v1/QuestionsWebService.java | 85 ++++++++++++++++------ 3 files changed, 127 insertions(+), 29 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index e218577..224311d 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -34,20 +34,27 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; import org.apache.tika.exception.TikaException; import 
java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Set; /** @@ -329,4 +336,53 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } + public Map<String, Long> getTopDocumentsTerms(List<String> questionIds) throws IOException { + + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); + IndexSearcher isearcher = new IndexSearcher(ireader); + + // Combine that with the type + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + + BooleanQuery.Builder questionIdBuilder = new BooleanQuery.Builder(); + for (String questionId : questionIds) { + if(StringUtils.isNotBlank(questionId)) { + questionIdBuilder.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); + } + } + queryBuilder.add(questionIdBuilder.build(), BooleanClause.Occur.MUST); + + TopDocs hits = isearcher.search(queryBuilder.build(), 100); + ScoreDoc[] scoreDocs = hits.scoreDocs; + + Map<String, Long> result = new LinkedHashMap<>(); + + for (ScoreDoc scoreDoc : scoreDocs) { + Fields termVectors = ireader.getTermVectors(scoreDoc.doc); + for (String termVector : termVectors) { + Terms vector = ireader.getTermVector(scoreDoc.doc, termVector); + TermsEnum termsEnum = vector.iterator(); + BytesRef bytesRef = termsEnum.next(); + while(bytesRef != null){ + String term = bytesRef.utf8ToString().toLowerCase(); + long totalTermFreq = termsEnum.totalTermFreq(); + + if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (result.containsKey(term)) { + result.put(term, result.get(term) + totalTermFreq); + } else { + result.put(term, totalTermFreq); + } + } + bytesRef = termsEnum.next(); + } + } + } + + ireader.close(); + return result; + } + + } diff --git 
a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index f60ce70..32bb35b 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -302,7 +302,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } - public Map<String, Long> getTopQuestionsTerms(List<String> questionIds) throws IOException, ParseException { + public Map<String, Long> getTopQuestionsTerms(List<String> questionIds) throws IOException { DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); @@ -327,18 +327,19 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { for (ScoreDoc scoreDoc : scoreDocs) { Fields termVectors = ireader.getTermVectors(scoreDoc.doc); for (String termVector : termVectors) { - System.out.println("Vector: " + termVector); Terms vector = ireader.getTermVector(scoreDoc.doc, termVector); TermsEnum termsEnum = vector.iterator(); BytesRef bytesRef = termsEnum.next(); while(bytesRef != null){ - String term = bytesRef.utf8ToString(); + String term = bytesRef.utf8ToString().toLowerCase(); long totalTermFreq = termsEnum.totalTermFreq(); - if (result.containsKey(term)) { - result.put(term, result.get(term) + totalTermFreq); - } else { - result.put(term, totalTermFreq); + if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (result.containsKey(term)) { + result.put(term, result.get(term) + totalTermFreq); + } else { + result.put(term, totalTermFreq); + } } bytesRef = termsEnum.next(); } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java index 384aa7b..e7a9d48 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java @@ -24,15 +24,6 @@ package fr.ifremer.coselmar.services.v1; * #L% */ -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Date; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.collect.Collections2; @@ -65,6 +56,7 @@ import fr.ifremer.coselmar.services.CoselmarWebServiceSupport; import fr.ifremer.coselmar.services.errors.InvalidCredentialException; import fr.ifremer.coselmar.services.errors.NoResultException; import fr.ifremer.coselmar.services.errors.UnauthorizedException; +import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -79,6 +71,16 @@ import org.nuiton.util.DateUtil; import org.nuiton.util.pagination.PaginationParameter; import org.nuiton.util.pagination.PaginationResult; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + /** * @author ymartel <martel@codelutin.com> */ @@ -1069,25 +1071,52 @@ public class QuestionsWebService extends CoselmarWebServiceSupport { // Check authentication String authorization = getContext().getHeader("Authorization"); - UserWebToken userWebToken = checkAuthentication(authorization); + CoselmarUser user = checkUserAuthentication(authorization); - // 
Check current user - String fullCurrentUserId = getFullUserIdFromShort(userWebToken.getUserId()); - getCoselmarUserDao().forTopiaIdEquals(fullCurrentUserId).findAny(); + // Retrieve Question + String fullQuestionId = getFullIdFromShort(Question.class, questionId); + Question question = getQuestionDao().forTopiaIdEquals(fullQuestionId).findUnique(); - List<CloudWord> topWords; - if (getCoselmarServicesConfig().isPostgresqlDatabase()) { + List<CloudWord> topWords = new ArrayList<>(); +// if (getCoselmarServicesConfig().isPostgresqlDatabase()) { +// try { +// topWords = getQuestionDao().findTopWords(getFullIdFromShort(Question.class, questionId)); +// } catch (TopiaNoResultException e) { +// if (log.isErrorEnabled()) { +// log.error("Try to find top words for non existing questionId" + questionId, e); +// } +// throw new NoResultException("Question does not exist"); +// } +// } else { +// topWords = Collections.EMPTY_LIST; + + QuestionsIndexationService questionsIndexationService = getServicesContext().newService(QuestionsIndexationService.class); + DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - topWords = getQuestionDao().findTopWords(getFullIdFromShort(Question.class, questionId)); - } catch (TopiaNoResultException e) { + Map<String, Long> topQuestionsTerms = questionsIndexationService.getTopQuestionsTerms(Lists.newArrayList(questionId)); + List<String> shortDocumentIds = getShortDocumentIds(question); + Map<String, Long> topDocumentsTerms = documentsIndexationService.getTopDocumentsTerms(shortDocumentIds); + for (Map.Entry<String, Long> documentTermFreq : topDocumentsTerms.entrySet()) { + String term = documentTermFreq.getKey(); + Long frequence = documentTermFreq.getValue(); + if (topQuestionsTerms.containsKey(term)) { + } else { + topQuestionsTerms.put(term, frequence); + } + } + + for (Map.Entry<String, Long> termFreq : topQuestionsTerms.entrySet()) { + String term = 
termFreq.getKey(); + CloudWord cloudWord = new CloudWord(term, termFreq.getValue()); + topWords.add(cloudWord); + } + + } catch (IOException e) { if (log.isErrorEnabled()) { - log.error("Try to find top words for non existing questionId" + questionId, e); + log.error("Unable to index new question", e); } - throw new NoResultException("Question does not exist"); } - } else { - topWords = Collections.EMPTY_LIST; - } +// } return topWords; } @@ -1542,4 +1571,16 @@ public class QuestionsWebService extends CoselmarWebServiceSupport { return result; } + protected List<String> getShortDocumentIds(Question question) { + List<String> shortDocumentIds = new ArrayList<>(); + for (String relatedDocumentId : question.getRelatedDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(relatedDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + for (String closingDocumentId : question.getClosingDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(closingDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + return shortDocumentIds; + } } -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
participants (1)
-
codelutin.com scm