This is an automated email from the git hooks/post-receive script. New commit to branch feature/comparatif-lunce-pg in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit a763b05e739a1366c9060d9acdd14b90e3cbb057 Author: Yannick Martel <martel@codelutin.com> Date: Wed May 24 12:03:05 2017 +0200 Ajout du fileContent dans la table Document et comparatif de recherche entre lucene et pg pour les topterms --- .../persistence/entity/QuestionTopiaDao.java | 15 +- ...V2_1_0_1__9197_add_fileContent_in_documents.sql | 23 +++ .../src/main/xmi/coselmar-model.properties | 1 + .../src/main/xmi/coselmar-model.zargo | Bin 11007 -> 11150 bytes .../indexation/DocumentsIndexationService.java | 16 +- .../coselmar/services/v1/AdminWebService.java | 21 ++- .../coselmar/services/v1/DocumentsWebService.java | 33 +++- .../services/v1/ExperimentationService.java | 180 +++++++++++++++++++++ coselmar-rest/src/main/resources/mapping | 2 + 9 files changed, 267 insertions(+), 24 deletions(-) diff --git a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java index 8c66d18..8faba2f 100644 --- a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java +++ b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java @@ -369,12 +369,13 @@ public class QuestionTopiaDao extends AbstractQuestionTopiaDao<Question> { private final String sql; private final String getSql(String questionId) { - return "SELECT word, nentry FROM ts_stat( ' select to_tsvector(''public.simple_english_conf'', q.title)" + - " || to_tsvector(''public.simple_english_conf'', q.summary)" + - " || to_tsvector(''public.simple_english_conf'', qt.theme)" + - " || COALESCE(to_tsvector(''public.simple_english_conf'', d.name),'''')" + - " || COALESCE(to_tsvector(''public.simple_english_conf'', 
dk.keywords),'''')" + - " || COALESCE(to_tsvector(''public.simple_english_conf'', d.summary),'''') FROM question q" + + return "SELECT word, nentry FROM ts_stat( ' select to_tsvector(''public.simple_english_conf'', q." + Question.PROPERTY_TITLE + ")" + + " || to_tsvector(''public.simple_english_conf'', q." + Question.PROPERTY_SUMMARY + ")" + + " || to_tsvector(''public.simple_english_conf'', qt." + Question.PROPERTY_THEME + ")" + + " || COALESCE(to_tsvector(''public.simple_english_conf'', d." + Document.PROPERTY_NAME + "),'''')" + + " || COALESCE(to_tsvector(''public.simple_english_conf'', dk." + Document.PROPERTY_KEYWORDS + "),'''')" + + " || COALESCE(to_tsvector(''public.simple_english_conf'', d." + Document.PROPERTY_SUMMARY + "),'''')" + + " || COALESCE(to_tsvector(''public.simple_english_conf'', d." + Document.PROPERTY_FILE_CONTENT + "),'''') FROM question q" + " LEFT JOIN relateddocuments_relatedquestion ON" + " relateddocuments_relatedquestion.relatedquestion = q.topiaid" + " LEFT JOIN closingdocuments_relatedquestion ON" + @@ -390,7 +391,7 @@ public class QuestionTopiaDao extends AbstractQuestionTopiaDao<Question> { " ')" + " WHERE char_length(word) > 3 " + " ORDER BY nentry DESC " + - " LIMIT 30"; + " "; } QuestionTermStatSqlQuery(String questionId) { diff --git a/coselmar-persistence/src/main/resources/db/migration/V2_1_0_1__9197_add_fileContent_in_documents.sql b/coselmar-persistence/src/main/resources/db/migration/V2_1_0_1__9197_add_fileContent_in_documents.sql new file mode 100644 index 0000000..f8a64c7 --- /dev/null +++ b/coselmar-persistence/src/main/resources/db/migration/V2_1_0_1__9197_add_fileContent_in_documents.sql @@ -0,0 +1,23 @@ +--- +-- #%L +-- Coselmar :: Persistence +-- %% +-- Copyright (C) 2014 - 2016 Ifremer, Code Lutin +-- %% +-- This program is free software: you can redistribute it and/or modify +-- it under the terms of the GNU General Public License as +-- published by the Free Software Foundation, either version 3 of the +-- 
License, or (at your option) any later version. +-- +-- This program is distributed in the hope that it will be useful, +-- but WITHOUT ANY WARRANTY; without even the implied warranty of +-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-- GNU General Public License for more details. +-- +-- You should have received a copy of the GNU General Public +-- License along with this program. If not, see +-- <http://www.gnu.org/licenses/gpl-3.0.html>. +-- #L% +--- + +ALTER TABLE document ADD fileContent TEXT; diff --git a/coselmar-persistence/src/main/xmi/coselmar-model.properties b/coselmar-persistence/src/main/xmi/coselmar-model.properties index 1ee6952..5419513 100644 --- a/coselmar-persistence/src/main/xmi/coselmar-model.properties +++ b/coselmar-persistence/src/main/xmi/coselmar-model.properties @@ -30,6 +30,7 @@ model.tagvalue.useEnumerationName=true fr.ifremer.coselmar.persistence.entity.Document.attribute.summary.tagValue.hibernateAttributeType=text fr.ifremer.coselmar.persistence.entity.Document.attribute.comment.tagValue.hibernateAttributeType=text fr.ifremer.coselmar.persistence.entity.Document.attribute.citation.tagValue.hibernateAttributeType=text +fr.ifremer.coselmar.persistence.entity.Document.attribute.fileContent.tagValue.hibernateAttributeType=text fr.ifremer.coselmar.persistence.entity.Question.attribute.summary.tagValue.hibernateAttributeType=text fr.ifremer.coselmar.persistence.entity.Question.attribute.conclusion.tagValue.hibernateAttributeType=text \ No newline at end of file diff --git a/coselmar-persistence/src/main/xmi/coselmar-model.zargo b/coselmar-persistence/src/main/xmi/coselmar-model.zargo index 85a95b3..c2eac82 100644 Binary files a/coselmar-persistence/src/main/xmi/coselmar-model.zargo and b/coselmar-persistence/src/main/xmi/coselmar-model.zargo differ diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 8efe04d..700e1a0 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -86,7 +86,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { protected static final String DOCUMENT_FILE_CONTENT_INDEX_PROPERTY = "documentFileContent"; protected static final String DOCUMENT_TYPE = "documentindextype"; - public void indexDocument(DocumentBean document, String filepath) throws IOException { + public void indexDocument(DocumentBean document, String fileContent) throws IOException { Document doc = new Document(); doc.add(new StringField(DOCUMENT_ID_INDEX_PROPERTY, document.getId(), Field.Store.YES)); @@ -121,18 +121,8 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } } - if (StringUtils.isNotBlank(filepath)) { - try { - File documentFile = new File(filepath); - String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); - } catch (TikaException te) { - if (log.isErrorEnabled()) { - String message = String.format("Unable to index document '%s'", filepath); - log.error(message); - } - } - + if (StringUtils.isNotBlank(fileContent)) { + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, fileContent, LuceneUtils.TYPE_STORED)); } getLuceneUtils().getIndexWriter().addDocument(doc); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java index 33b3e57..98ad677 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java +++ 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java @@ -24,6 +24,7 @@ package fr.ifremer.coselmar.services.v1; * #L% */ +import java.io.File; import java.io.IOException; import java.util.List; @@ -42,6 +43,7 @@ import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; +import org.apache.tika.exception.TikaException; import static org.apache.commons.logging.LogFactory.getLog; @@ -76,10 +78,25 @@ public class AdminWebService extends CoselmarWebServiceSupport { // get All documents List<Document> documents = getDocumentDao().findAll(); for (Document document : documents) { - String lightId = getPersistenceContext().getTopiaIdFactory().getRandomPart(document.getTopiaId()); DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), document); - documentsIndexationService.indexDocument(documentBean, document.getFilePath()); + String filePath = document.getFilePath(); + String fileContent = null; + try { + fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); + document.setFileContent(fileContent); + getDocumentDao().update(document); + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to read uploaded file " + filePath, e); + } + } catch (TikaException e) { + if (log.isErrorEnabled()) { + log.error("Unable to get file content from Tika : " + filePath, e); + } + } + documentsIndexationService.indexDocument(documentBean, fileContent); } + commit(); // Get all questions List<Question> questions = getQuestionDao().findAll(); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java index 9a1d010..d8bda0c 100644 --- 
a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java @@ -54,6 +54,7 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.logging.Log; import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.tika.exception.TikaException; import org.debux.webmotion.server.call.UploadFile; import org.debux.webmotion.server.render.Render; import org.nuiton.topia.persistence.TopiaNoResultException; @@ -65,6 +66,7 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; +import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.Date; @@ -376,12 +378,24 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { String documentName = document.getName(); String contentType = null; String filePath = null; + String fileContent = null; // If document has a file, manager it ! 
if (uploadFile != null) { Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); filePath = pathAndContentType.getLeft(); contentType = pathAndContentType.getRight(); + try { + fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to read uploaded file " + filePath, e); + } + } catch (TikaException e) { + if (log.isErrorEnabled()) { + log.error("Unable to get file content from Tika : " + filePath, e); + } + } } // Document Metadata @@ -431,6 +445,7 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { documentEntity.setWithFile(true); documentEntity.setMimeType(contentType); documentEntity.setFilePath(filePath); + documentEntity.setFileContent(fileContent); } else { documentEntity.setWithFile(false); } @@ -445,7 +460,7 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - documentsIndexationService.indexDocument(result, filePath); + documentsIndexationService.indexDocument(result, fileContent); if (log.isDebugEnabled()) { String message = String.format("Document '%s' added to index", documentName); log.debug(message); @@ -489,6 +504,19 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); String filePath = pathAndContentType.getLeft(); String contentType = pathAndContentType.getRight(); + String fileContent = null; + // Read file content + try { + fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to read uploaded file " + filePath, e); + } + } catch (TikaException e) { + if (log.isErrorEnabled()) { + log.error("Unable to get file 
content from Tika : " + filePath, e); + } + } // If document has already a file, remove it if (StringUtils.isNotBlank(documentEntity.getFilePath())) { @@ -500,13 +528,14 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { documentEntity.setMimeType(contentType); documentEntity.setFilePath(filePath); documentEntity.setFileName(uploadFile.getName()); + documentEntity.setFileContent(fileContent); // Should update document index information to put the file DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), documentEntity); DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - documentsIndexationService.updateDocument(documentBean, filePath); // no document file for the moment here + documentsIndexationService.updateDocument(documentBean, fileContent); if (log.isDebugEnabled()) { String message = String.format("Document '%s' was updated in index", documentEntity.getName()); log.debug(message); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/ExperimentationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/ExperimentationService.java new file mode 100644 index 0000000..48cba7f --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/ExperimentationService.java @@ -0,0 +1,180 @@ +package fr.ifremer.coselmar.services.v1; + +/* + * #%L + * Coselmar :: Rest Services + * $Id:$ + * $HeadURL:$ + * %% + * Copyright (C) 2014 Ifremer, Code Lutin + * %% + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. If not, see + * <http://www.gnu.org/licenses/gpl-3.0.html>. + * #L% + */ + +import com.google.common.base.Function; +import com.google.common.base.Preconditions; +import com.google.common.collect.Collections2; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import fr.ifremer.coselmar.beans.CloudWord; +import fr.ifremer.coselmar.beans.DocumentBean; +import fr.ifremer.coselmar.beans.LinkBean; +import fr.ifremer.coselmar.beans.QuestionBean; +import fr.ifremer.coselmar.beans.QuestionExportModel; +import fr.ifremer.coselmar.beans.QuestionSearchBean; +import fr.ifremer.coselmar.beans.QuestionSearchExample; +import fr.ifremer.coselmar.beans.QuestionTreeNode; +import fr.ifremer.coselmar.beans.QuestionUserRole; +import fr.ifremer.coselmar.beans.UserBean; +import fr.ifremer.coselmar.beans.UserWebToken; +import fr.ifremer.coselmar.converter.BeanEntityConverter; +import fr.ifremer.coselmar.exceptions.CoselmarTechnicalException; +import fr.ifremer.coselmar.persistence.entity.CoselmarUser; +import fr.ifremer.coselmar.persistence.entity.CoselmarUserGroup; +import fr.ifremer.coselmar.persistence.entity.CoselmarUserRole; +import fr.ifremer.coselmar.persistence.entity.Document; +import fr.ifremer.coselmar.persistence.entity.Link; +import fr.ifremer.coselmar.persistence.entity.LinkImpl; +import fr.ifremer.coselmar.persistence.entity.Privacy; +import fr.ifremer.coselmar.persistence.entity.Question; +import fr.ifremer.coselmar.persistence.entity.QuestionImpl; +import fr.ifremer.coselmar.persistence.entity.Status; +import fr.ifremer.coselmar.services.CoselmarWebServiceSupport; +import 
fr.ifremer.coselmar.services.errors.InvalidCredentialException; +import fr.ifremer.coselmar.services.errors.NoResultException; +import fr.ifremer.coselmar.services.errors.UnauthorizedException; +import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; +import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.lucene.queryparser.classic.ParseException; +import org.debux.webmotion.server.render.Render; +import org.nuiton.csv.Export; +import org.nuiton.topia.persistence.TopiaIdFactory; +import org.nuiton.topia.persistence.TopiaNoResultException; +import org.nuiton.util.DateUtil; +import org.nuiton.util.pagination.PaginationParameter; +import org.nuiton.util.pagination.PaginationResult; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * @author ymartel <martel@codelutin.com> + */ +public class ExperimentationService extends CoselmarWebServiceSupport { + + private static final Log log = LogFactory.getLog(ExperimentationService.class); + + public List<String> evaluateTopWordsGeneration() { + Question question = getQuestionDao().forStatusEquals(Status.IN_PROGRESS).findAny(); + long start = System.currentTimeMillis(); + String questionId = getShortIdFromFull(question.getTopiaId()); + System.out.println("Question : " + questionId); + List<CloudWord> luceneTopWords = getLuceneTopWords(questionId); + long stop = System.currentTimeMillis(); + String luceneTiming = String.format("Recherche par Lucene : %d termes en %d ms", luceneTopWords.size(), stop - start); + start = System.currentTimeMillis(); + List<CloudWord> postgresTopWords = 
getPostgresTopWords(questionId); + stop = System.currentTimeMillis(); + String pgTiming = String.format("Recherche par Postgresql : %d termes en %d ms", postgresTopWords.size(), stop - start); + + return Lists.newArrayList(luceneTiming, pgTiming); + } + + public List<CloudWord> getLuceneTopWords(String questionId) { + + // Retrieve Question + String fullQuestionId = getFullIdFromShort(Question.class, questionId); + Question question = getQuestionDao().forTopiaIdEquals(fullQuestionId).findUnique(); + + List<CloudWord> topWords = new ArrayList<>(); + + QuestionsIndexationService questionsIndexationService = getServicesContext().newService(QuestionsIndexationService.class); + DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); + try { + Map<String, Long> topQuestionsTerms = questionsIndexationService.getTopQuestionsTerms(Lists.newArrayList(questionId)); + List<String> shortDocumentIds = getShortDocumentIds(question); + Map<String, Long> topDocumentsTerms = documentsIndexationService.getTopDocumentsTerms(shortDocumentIds); + for (Map.Entry<String, Long> documentTermFreq : topDocumentsTerms.entrySet()) { + String term = documentTermFreq.getKey(); + Long frequence = documentTermFreq.getValue(); + if (topQuestionsTerms.containsKey(term)) { + } else { + topQuestionsTerms.put(term, frequence); + } + } + + for (Map.Entry<String, Long> termFreq : topQuestionsTerms.entrySet()) { + String term = termFreq.getKey(); + CloudWord cloudWord = new CloudWord(term, termFreq.getValue()); + topWords.add(cloudWord); + } + + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to index new question", e); + } + } + + return topWords; + } + + public List<CloudWord> getPostgresTopWords(String questionId) { + + List<CloudWord> topWords; + if (getCoselmarServicesConfig().isPostgresqlDatabase()) { + try { + topWords = getQuestionDao().findTopWords(getFullIdFromShort(Question.class, questionId)); + } 
catch (TopiaNoResultException e) { + if (log.isErrorEnabled()) { + log.error("Try to find top words for non existing questionId" + questionId, e); + } + topWords = Collections.EMPTY_LIST; + } + } else { + topWords = Collections.EMPTY_LIST; + } + + return topWords; + } + + //////////////////////////////////////////////////////////////////////////// + /////////////////////// Internal Parts ///////////////////////////// + //////////////////////////////////////////////////////////////////////////// + + protected List<String> getShortDocumentIds(Question question) { + List<String> shortDocumentIds = new ArrayList<>(); + for (String relatedDocumentId : question.getRelatedDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(relatedDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + for (String closingDocumentId : question.getClosingDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(closingDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + return shortDocumentIds; + } +} diff --git a/coselmar-rest/src/main/resources/mapping b/coselmar-rest/src/main/resources/mapping index b498a97..147d53d 100644 --- a/coselmar-rest/src/main/resources/mapping +++ b/coselmar-rest/src/main/resources/mapping @@ -76,6 +76,8 @@ GET /v1/questions/{questionId}/topwords QuestionsWebService.getTopWords GET /v1/general/topwords GeneralWebService.getTopWords +GET /v1/experimentation/topwords ExperimentationService.evaluateTopWordsGeneration + # Admin API POST /v1/admin/lucene/index AdminWebService.refreshLuceneIndex -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.