This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See http://git.codelutin.com/coselmar.git commit 993a5ee7906d1c815d4f52de3025c0c3fb232639 Author: Yannick Martel <martel@codelutin.com> Date: Mon Dec 29 11:53:28 2014 +0100 prepare document indexation with lucene --- coselmar-rest/pom.xml | 18 +++ .../services/config/CoselmarServicesConfig.java | 9 ++ .../config/CoselmarServicesConfigOption.java | 6 + .../indexation/DocumentsIndexationService.java | 150 +++++++++++++++++++++ .../services/AbstractCoselmarServiceTest.java | 42 ++++++ .../services/DocumentsIndexationServiceTest.java | 103 ++++++++++++++ pom.xml | 27 ++++ 7 files changed, 355 insertions(+) diff --git a/coselmar-rest/pom.xml b/coselmar-rest/pom.xml index 9793232..ef22ee3 100644 --- a/coselmar-rest/pom.xml +++ b/coselmar-rest/pom.xml @@ -148,6 +148,24 @@ <scope>runtime</scope> </dependency> + <!-- Indexation - Lucence --> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-queryparser</artifactId> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-queries</artifactId> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + </dependency> + <!-- Others --> <dependency> <groupId>com.github.spullara.mustache.java</groupId> diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/config/CoselmarServicesConfig.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/config/CoselmarServicesConfig.java index 2ced2d9..81538ed 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/config/CoselmarServicesConfig.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/config/CoselmarServicesConfig.java @@ -100,6 +100,15 @@ public class CoselmarServicesConfig { 
CoselmarServicesConfigOption.DATA_DIRECTORY.key); } + public File getIndexDirectory() { + File indexFile = applicationConfig.getOptionAsFile( + CoselmarServicesConfigOption.INDEX_DIRECTORY.key); + if (indexFile == null) { + indexFile = applicationConfig.getOptionAsFile(CoselmarServicesConfigOption.DATA_DIRECTORY.key); + } + return indexFile; + } + /** * @return Le nom d'hôte du serveur SMTP. */ diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/config/CoselmarServicesConfigOption.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/config/CoselmarServicesConfigOption.java index 720f302..c729e6b 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/config/CoselmarServicesConfigOption.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/config/CoselmarServicesConfigOption.java @@ -40,6 +40,12 @@ public enum CoselmarServicesConfigOption implements ConfigOptionDef { "${java.io.tmpdir}/coselmar", File.class), + INDEX_DIRECTORY( + "coselmar.index.directory", + I18n.n("coselmar.configuration.index.directory"), + null, + File.class), + SMTP_HOST( "coselmar.smtp.host", "Nom d'hôte du serveur SMTP", diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java new file mode 100644 index 0000000..3d5c573 --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -0,0 +1,150 @@ +package fr.ifremer.coselmar.services.indexation; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import fr.ifremer.coselmar.beans.DocumentBean; +import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.SimpleAnalyzer; +import 
org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.NIOFSDirectory; +import org.apache.lucene.util.Version; + +/** + * This Services provides operation about {@link fr.ifremer.coselmar.persistence.entity.Document} + * or more exactly {@link fr.ifremer.coselmar.beans.DocumentBean} indexation : + * <ul> + * <li>registration of a document in the indexation db</li> + * <li>modification of a document in the indexation db</li> + * <li>documents search from the indexation db</li> + * </ul> + * + * The purpose is to use power of a indexation db (lucene) to increase search on + * document text field, and make easier fulltext search + * + * @author ymartel <martel@codelutin.com> + */ +public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { + + public static Analyzer analyzer; + public static final IndexWriterConfig indexationConfig = new IndexWriterConfig(Version.LUCENE_4_10_2, getAnalyzer()); + public static IndexWriter indexWriter; + + protected static final String DOCUMENT_ID_INDEX_PROPERTY = "documentId"; + protected static final String DOCUMENT_NAME_INDEX_PROPERTY = "documentName"; + protected static final String DOCUMENT_AUTHORS_INDEX_PROPERTY = "documentAuthors"; + protected static final String DOCUMENT_SUMMARY_INDEX_PROPERTY = 
"documentSummary"; + protected static final String DOCUMENT_KEYWORD_INDEX_PROPERTY = "documentKeyword"; + protected static final String DOCUMENT_TYPE = "document"; + + public void indexDocument(DocumentBean document) throws IOException { + + Document doc = new Document(); + doc.add(new StringField(DOCUMENT_ID_INDEX_PROPERTY, document.getId(), Field.Store.YES)); + doc.add(new TextField(DOCUMENT_NAME_INDEX_PROPERTY, document.getName(), Field.Store.YES)); + doc.add(new TextField(DOCUMENT_AUTHORS_INDEX_PROPERTY, document.getAuthors(), Field.Store.YES)); + doc.add(new TextField(DOCUMENT_SUMMARY_INDEX_PROPERTY, document.getSummary(), Field.Store.YES)); + doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); + + Set<String> keywords = document.getKeywords(); + for (String keyword : keywords) { + doc.add(new Field(DOCUMENT_KEYWORD_INDEX_PROPERTY, keyword, TextField.TYPE_STORED)); + } + + + getIndexWriter().addDocument(doc); + getIndexWriter().commit(); + + } + + public void updateDocument(DocumentBean document) { + //TODO + } + + public List<String> searchDocuments(String text) throws IOException, ParseException { + DirectoryReader ireader = DirectoryReader.open(getIndexWriter(), false); + IndexSearcher isearcher = new IndexSearcher(ireader); + + String[] words = text.split(" "); + + // Parse a simple query that searches for the "text": + BooleanQuery query = new BooleanQuery(); + + PhraseQuery nameQuery = new PhraseQuery(); + PhraseQuery summaryQuery = new PhraseQuery(); + PhraseQuery authorsQuery = new PhraseQuery(); + + for (String word : words) { + nameQuery.add(new Term(DOCUMENT_NAME_INDEX_PROPERTY, word.toLowerCase())); + summaryQuery.add(new Term(DOCUMENT_SUMMARY_INDEX_PROPERTY, word.toLowerCase())); + authorsQuery.add(new Term(DOCUMENT_AUTHORS_INDEX_PROPERTY, word.toLowerCase())); + } + + query.add(nameQuery, BooleanClause.Occur.SHOULD); + query.add(summaryQuery, BooleanClause.Occur.SHOULD); + query.add(authorsQuery, BooleanClause.Occur.SHOULD); + + 
query.add(new TermQuery(new Term(DOCUMENT_KEYWORD_INDEX_PROPERTY, text)), BooleanClause.Occur.SHOULD); + + + // Combine that with the type + BooleanQuery fullQuery = new BooleanQuery(); + fullQuery.add(query, BooleanClause.Occur.MUST); + fullQuery.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + + ScoreDoc[] hits = isearcher.search(fullQuery, null, 1000).scoreDocs; + + List<String> documentIds = new ArrayList(hits.length); + + for (ScoreDoc hit : hits) { + Document doc = isearcher.doc(hit.doc); + String documentId = doc.get(DOCUMENT_ID_INDEX_PROPERTY); + documentIds.add(documentId); + } + + ireader.close(); + return documentIds; + } + + + protected static Analyzer getAnalyzer() { + if (analyzer == null) { +// analyzer = new StandardAnalyzer(); + //Use simple analyzer to index all words and be able to search with "close word" classified in StandardAnalyzer + analyzer = new SimpleAnalyzer(); + } + return analyzer; + + } + + protected IndexWriter getIndexWriter() throws IOException { + if (indexWriter == null) { + File indexDirectory = getCoselmarServicesConfig().getIndexDirectory(); + Directory index = NIOFSDirectory.open(indexDirectory); + + indexationConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); + indexWriter = new IndexWriter(index, indexationConfig); + } + return indexWriter; + } + +} diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/AbstractCoselmarServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/AbstractCoselmarServiceTest.java new file mode 100644 index 0000000..e187c0a --- /dev/null +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/AbstractCoselmarServiceTest.java @@ -0,0 +1,42 @@ +package fr.ifremer.coselmar.services; + +/* + * #%L + * Coselmar :: Rest Services + * $Id:$ + * $HeadURL:$ + * %% + * Copyright (C) 2014 Ifremer, Code Lutin + * %% + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU 
General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. If not, see + * <http://www.gnu.org/licenses/gpl-3.0.html>. + * #L% + */ + +import org.apache.commons.logging.Log; +import org.junit.Rule; + +import static org.apache.commons.logging.LogFactory.getLog; + +/** + * @author ymartel <martel@codelutin.com> + */ +public class AbstractCoselmarServiceTest { + + private static final Log log = getLog(AbstractCoselmarServiceTest.class); + + @Rule + public final FakeCoselmarApplicationContext application = new FakeCoselmarApplicationContext("coselmar-test.properties"); + +} diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/DocumentsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/DocumentsIndexationServiceTest.java new file mode 100644 index 0000000..c201e4c --- /dev/null +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/DocumentsIndexationServiceTest.java @@ -0,0 +1,103 @@ +package fr.ifremer.coselmar.services; + +import java.util.Date; +import java.util.List; +import java.util.Locale; + +import com.google.common.collect.Lists; +import fr.ifremer.coselmar.beans.DocumentBean; +import fr.ifremer.coselmar.persistence.entity.Privacy; +import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; +import org.junit.Assert; +import org.junit.Test; + +/** + * @author ymartel <martel@codelutin.com> + */ +public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest { + + protected FakeCoselmarServicesContext serviceContext; + + protected 
FakeCoselmarServicesContext getServiceContext() { + + if (serviceContext == null) { + serviceContext = application.newServiceContext(application.newPersistenceContext(), Locale.FRANCE); + } + + return serviceContext; + } + + @Test + public void testAddDocument() throws Exception { + CoselmarServicesContext serviceContext = getServiceContext(); + DocumentsIndexationService documentsIndexationService = + serviceContext.newService(DocumentsIndexationService.class); + + DocumentBean documentOne = new DocumentBean("document1", + "Ceci n'est pas un document", "John Doe", Privacy.PUBLIC.name(), + new Date(), Lists.newArrayList("document", "test"), "testDocument", + "This is not a fake document used for test", "fr", null, "Jack, Jane", + null, null, false, null, "http://somewhere", "no comment"); + + documentsIndexationService.indexDocument(documentOne); + + } + + @Test + public void testSearchDocument() throws Exception { + populate(); + + CoselmarServicesContext serviceContext = getServiceContext(); + DocumentsIndexationService documentsIndexationService = + serviceContext.newService(DocumentsIndexationService.class); + + List<String> documentMatchingDoctorIds = documentsIndexationService.searchDocuments("doctor"); + Assert.assertEquals(1, documentMatchingDoctorIds.size()); + Assert.assertEquals("document3", documentMatchingDoctorIds.get(0)); + + List<String> documentMatchingAmyIds = documentsIndexationService.searchDocuments("amy"); + Assert.assertEquals(2, documentMatchingAmyIds.size()); + Assert.assertTrue(documentMatchingAmyIds.contains("document2")); + Assert.assertTrue(documentMatchingAmyIds.contains("document3")); + + List<String> documentMatchingTheMasterIds = documentsIndexationService.searchDocuments("the Master"); + Assert.assertTrue(documentMatchingTheMasterIds.isEmpty()); + + List<String> documentMatchingPartOfSummaryIds = documentsIndexationService.searchDocuments("This is part of"); + Assert.assertEquals(1, documentMatchingPartOfSummaryIds.size()); + 
Assert.assertEquals("document3", documentMatchingPartOfSummaryIds.get(0)); + + } + + public void populate() throws Exception { + CoselmarServicesContext serviceContext = getServiceContext(); + DocumentsIndexationService documentsIndexationService = + serviceContext.newService(DocumentsIndexationService.class); + + DocumentBean documentOne = new DocumentBean("document1", + "Ceci n'est pas un document", "John Doe", Privacy.PUBLIC.name(), + new Date(), Lists.newArrayList("document", "test"), "testDocument", + "This is not a fake document used for test", "fr", null, "Jack, Jane", + null, null, false, null, "http://somewhere", "no comment"); + + documentsIndexationService.indexDocument(documentOne); + + DocumentBean documentTwo = new DocumentBean("document2", + "Another document", "Amy Pond", Privacy.PUBLIC.name(), + new Date(), Lists.newArrayList("document", "test", "fish"), "testDocument", + "This is just an other document used for test", "fr", null, "Amy, Rory", + null, null, false, null, "http://somewhere", "no comment"); + + documentsIndexationService.indexDocument(documentTwo); + + DocumentBean documentThree = new DocumentBean("document3", + "Tardis documentation", "The Doctor", Privacy.PUBLIC.name(), + new Date(), Lists.newArrayList("tardis", "documentation", "old", "new", "borrowed", "blue"), "testDocument", + "This is part of documentation about the TARDIS", "fr", null, "The Doctor, Rose, Amy, River, Clara", + null, null, false, null, "http://tardis.wikia.com/wiki/TARDIS", "no comment"); + + documentsIndexationService.indexDocument(documentThree); + + } + +} diff --git a/pom.xml b/pom.xml index 4585df2..5f6a966 100644 --- a/pom.xml +++ b/pom.xml @@ -140,6 +140,8 @@ <postgresqlVersion>9.1-901-1.jdbc4</postgresqlVersion> <h2Version>1.4.178</h2Version> + <luceneVersion>4.10.2</luceneVersion> + <tomcatEmbedVersion>7.0.50</tomcatEmbedVersion> <angularUiSelectVersion>0.9.0</angularUiSelectVersion> @@ -247,6 +249,31 @@ <version>${h2Version}</version> </dependency> + 
<!-- Indexation - Lucence --> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + <version>${luceneVersion}</version> + </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-queryparser</artifactId> + <version>${luceneVersion}</version> + </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-queries</artifactId> + <version>${luceneVersion}</version> + </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <version>${luceneVersion}</version> + </dependency> + <!-- Commons --> <dependency> <groupId>org.apache.commons</groupId> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.