This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit f37fe4f513d6bd4c8f8e52ae2086873baba477c2 Author: Yannick Martel <martel@codelutin.com> Date: Tue May 30 17:08:16 2017 +0200 refs #9197 review way to use Tika, filter file content indexation for pdf, opendocument text/presentation and ms word/powerpoint --- .../indexation/DocumentsIndexationService.java | 21 ++------- .../coselmar/services/indexation/LuceneUtils.java | 7 --- .../coselmar/services/indexation/TikaUtils.java | 54 ++++++++++++++++++++++ .../coselmar/services/v1/AdminWebService.java | 21 +++------ .../coselmar/services/v1/DocumentsWebService.java | 28 ++--------- 5 files changed, 69 insertions(+), 62 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 158c10f..f820531 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -48,6 +48,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.tika.exception.TikaException; +import org.apache.tika.mime.MimeType; import java.io.File; import java.io.IOException; @@ -122,7 +123,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } if (StringUtils.isNotBlank(fileContent)) { - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, fileContent, LuceneUtils.TYPE_STORED)); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, fileContent, LuceneUtils.TYPE_STORED)); } getLuceneUtils().getIndexWriter().addDocument(doc); @@ -236,7 +237,7 @@ public class 
DocumentsIndexationService extends CoselmarSimpleServiceSupport { return documentIds; } - public void updateDocument(DocumentBean document, String filepath) throws IOException { + public void updateDocument(DocumentBean document, String fileContent) throws IOException { DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); @@ -280,20 +281,8 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); - if (StringUtils.isNotBlank(filepath)) { - try { - File documentFile = new File(filepath); - String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - if (StringUtils.isNotBlank(parsedDocumentFile)) { - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); - } - } catch (TikaException te) { - if (log.isErrorEnabled()) { - String message = String.format("Unable to index document '%s'", filepath); - log.error(message); - } - } - + if (StringUtils.isNotBlank(fileContent)) { + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, fileContent, LuceneUtils.TYPE_STORED)); } getLuceneUtils().getIndexWriter().updateDocument(new Term(DOCUMENT_ID_INDEX_PROPERTY, document.getId()), doc); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java index 26224cf..6e6b0ef 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java @@ -93,13 +93,6 @@ public class LuceneUtils { return indexWriter; } - public Tika getTika() { - if (tika == null) { - this.tika = new Tika(); - } - return tika; - } - public void closeWriter() { if (indexWriter != null) { try { diff --git 
a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TikaUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TikaUtils.java new file mode 100644 index 0000000..eb05bfe --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TikaUtils.java @@ -0,0 +1,54 @@ +package fr.ifremer.coselmar.services.indexation; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +/** + * @author ymartel (martel@codelutin.com) + */ +public class TikaUtils { + + private static final Log log = LogFactory.getLog(TikaUtils.class); + + public static final List<String> READABLE_TEXT_MIMETYPES = Arrays.asList("text/plain", + "application/pdt", + "application/vnd.oasis.opendocument.text", + "application/vnd.oasis.opendocument.presentation", + "application/msword", + "application/mspowerpoint", + "application/powerpoint", + "application/vnd.ms-powerpoint", + "text/html" + ); + + private static final Tika tika = new Tika(); + + public static String getFileContent(String filePath) { + String fileContent = ""; + File file = new File(filePath); + try { + String mimeType = tika.detect(file); + // Can we read it ? 
+ if (StringUtils.isNotBlank(mimeType) && READABLE_TEXT_MIMETYPES.contains(mimeType.toLowerCase())) { + fileContent = tika.parseToString(file); + } + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to read file " + filePath, e); + } + } catch (TikaException e) { + if (log.isErrorEnabled()) { + log.error("Unable to get file content from Tika : " + filePath, e); + } + } + return fileContent; + } +} diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java index 98ad677..f39ab29 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java @@ -41,6 +41,7 @@ import fr.ifremer.coselmar.services.errors.InvalidCredentialException; import fr.ifremer.coselmar.services.errors.UnauthorizedException; import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; +import fr.ifremer.coselmar.services.indexation.TikaUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.tika.exception.TikaException; @@ -79,22 +80,12 @@ public class AdminWebService extends CoselmarWebServiceSupport { List<Document> documents = getDocumentDao().findAll(); for (Document document : documents) { DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), document); - String filePath = document.getFilePath(); - String fileContent = null; - try { - fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); - document.setFileContent(fileContent); - getDocumentDao().update(document); - } catch (IOException e) { - if (log.isErrorEnabled()) { - log.error("Unable to read uploaded file " + filePath, e); - } - } catch (TikaException 
e) { - if (log.isErrorEnabled()) { - log.error("Unable to get file content from Tika : " + filePath, e); - } - } + // Refresh file information + String fileContent = TikaUtils.getFileContent(document.getFilePath()); documentsIndexationService.indexDocument(documentBean, fileContent); + // Refresh database content + document.setFileContent(fileContent); + getDocumentDao().update(document); } commit(); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java index d8bda0c..63e133d 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java @@ -49,6 +49,7 @@ import fr.ifremer.coselmar.services.errors.InvalidCredentialException; import fr.ifremer.coselmar.services.errors.NoResultException; import fr.ifremer.coselmar.services.errors.UnauthorizedException; import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; +import fr.ifremer.coselmar.services.indexation.TikaUtils; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; @@ -380,22 +381,12 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { String filePath = null; String fileContent = null; - // If document has a file, manager it ! + // If document has a file, manage it ! 
if (uploadFile != null) { Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); filePath = pathAndContentType.getLeft(); contentType = pathAndContentType.getRight(); - try { - fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); - } catch (IOException e) { - if (log.isErrorEnabled()) { - log.error("Unable to read uploaded file " + filePath, e); - } - } catch (TikaException e) { - if (log.isErrorEnabled()) { - log.error("Unable to get file content from Tika : " + filePath, e); - } - } + fileContent = TikaUtils.getFileContent(filePath); } // Document Metadata @@ -504,19 +495,8 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); String filePath = pathAndContentType.getLeft(); String contentType = pathAndContentType.getRight(); - String fileContent = null; // Read file content - try { - fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); - } catch (IOException e) { - if (log.isErrorEnabled()) { - log.error("Unable to read uploaded file " + filePath, e); - } - } catch (TikaException e) { - if (log.isErrorEnabled()) { - log.error("Unable to get file content from Tika : " + filePath, e); - } - } + String fileContent = TikaUtils.getFileContent(filePath); // If document has already a file, remove it if (StringUtils.isNotBlank(documentEntity.getFilePath())) { -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.